/**
 * Append the tokens derived from one matched word to the token buffer.
 *
 * Words whose UTF-8 length does not exceed $this->_minWordSize, and numeric
 * strings, are buffered as a single token. Every other word is wrapped in
 * underscores and sliced into n-grams of 1..$this->_maxNGramSize characters;
 * each n-gram of at least $this->_minNGramSize characters is buffered.
 * All generated tokens carry the same $startPos / $endPos.
 *
 * @param Analysis\Token $token    token whose term text is processed
 * @param mixed          $startPos start offset passed to every generated token
 * @param mixed          $endPos   end offset passed to every generated token
 */
protected function _fillTokenBuffer(Analysis\Token $token, $startPos, $endPos)
 {
     $word = $token->getTermText();

     // Short words and numeric strings are kept whole instead of being split.
     if (iconv_strlen($word, 'UTF-8') <= $this->_minWordSize || is_numeric($word)) {
         $this->_tokenBuffer[] = new Analysis\Token($word, $startPos, $endPos);
         return;
     }

     // Wrap the word in '_' on both sides before slicing, so n-grams that
     // touch either end of the word contain an underscore.
     $padded = '_' . $word . '_';
     $total = iconv_strlen($padded, 'UTF-8');

     for ($offset = 0; $offset < $total; $offset++) {
         for ($extra = 0; $extra < $this->_maxNGramSize; $extra++) {
             // Once the slice would run past the end, longer slices would too.
             if ($offset + $extra >= $total) {
                 break;
             }
             $gram = mb_substr($padded, $offset, $extra + 1, 'UTF-8');
             if (iconv_strlen($gram, 'UTF-8') < $this->_minNGramSize) {
                 continue;
             }
             $this->_tokenBuffer[] = new Analysis\Token($gram, $startPos, $endPos);
         }
     }
 }
示例#2
0
 /**
  * Remove stop words from the token stream.
  *
  * A token whose term text is a key of $this->_stopSet is dropped (null is
  * returned); any other token is handed back unchanged.
  *
  * @param \ZendSearch\Lucene\Analysis\Token $srcToken
  * @return \ZendSearch\Lucene\Analysis\Token|null
  */
 public function normalize(Token $srcToken)
 {
     $isStopWord = array_key_exists($srcToken->getTermText(), $this->_stopSet);

     return $isStopWord ? null : $srcToken;
 }
示例#3
0
 /**
  * Remove tokens that are shorter than the configured length.
  *
  * The term text is measured with strlen(), i.e. in bytes. Tokens shorter
  * than $this->length are dropped (null is returned); all others are
  * returned unchanged.
  *
  * @param \ZendSearch\Lucene\Analysis\Token $srcToken
  * @return \ZendSearch\Lucene\Analysis\Token|null
  */
 public function normalize(Token $srcToken)
 {
     $termText = $srcToken->getTermText();

     if (strlen($termText) < $this->length) {
         return null;
     }

     return $srcToken;
 }
 /**
  * The filter under test must turn $source into a token equal to one built
  * from $normalized, keeping offsets (0, 100) and position increment (50).
  *
  * @dataProvider getNormalizeDataProvider
  */
 public function testNormalize($source, $normalized)
 {
     $expected = new Token($normalized, 0, 100);
     $expected->setPositionIncrement(50);

     $input = new Token($source, 0, 100);
     $input->setPositionIncrement(50);

     $this->assertEquals($expected, $this->filter->normalize($input));
 }
 /**
  * With the phpmorphy mock stubbed to report $encoding and $pseudoRoots,
  * TokenFilterEnRu must turn $source into a token equal to one built from
  * $expected, keeping offsets (0, 100) and position increment (50).
  *
  * @dataProvider getNormalizeDataProvider
  */
 public function testNormalize($source, $expected, $pseudoRoots, $encoding)
 {
     // Stub the morphology mock before the filter is constructed.
     $this->phpmorphy->shouldReceive('getEncoding')->andReturn($encoding);
     $this->phpmorphy->shouldReceive('getPseudoRoot')->andReturn($pseudoRoots);
     $filterUnderTest = new TokenFilterEnRu($this->phpmorphyFactory);

     $input = new Token($source, 0, 100);
     $input->setPositionIncrement(50);
     $result = $filterUnderTest->normalize($input);

     $wanted = new Token($expected, 0, 100);
     $wanted->setPositionIncrement(50);

     $this->assertEquals($wanted, $result);
 }
示例#6
0
文件: Morphy.php 项目: jarick/bx
 /**
  * Replace a token's text with the pseudo root of its upper-cased form.
  *
  * The term text is upper-cased and passed to $this->morphy->getPseudoRoot().
  * When that call returns false the upper-cased text itself is used;
  * otherwise the first returned element is used. Results shorter than
  * 3 bytes are dropped (null is returned). The new token keeps the source
  * token's offsets and position increment.
  *
  * @param \ZendSearch\Lucene\Analysis\Token $token
  * @return null|\ZendSearch\Lucene\Analysis\Token
  */
 public function normalize(Token $token)
 {
     // Upper-case once: the result is both the lookup key and the fallback term.
     $upper = $this->string()->toUpper($token->getTermText());
     $pseudo_root = $this->morphy->getPseudoRoot($upper);
     $new_str = $pseudo_root === false ? $upper : $pseudo_root[0];

     // NOTE(review): strlen() counts bytes, so in a multi-byte encoding this
     // threshold is not a character count - confirm that is intended.
     if (strlen($new_str) < 3) {
         return null;
     }

     $new_token = new Token($new_str, $token->getStartOffset(), $token->getEndOffset());
     $new_token->setPositionIncrement($token->getPositionIncrement());
     return $new_token;
 }
示例#7
0
 /**
  * Lower-case the token's term text (UTF-8 aware).
  *
  * Always returns a new token with the lower-cased text and the source
  * token's offsets and position increment; this filter never removes a token.
  *
  * @param \ZendSearch\Lucene\Analysis\Token $srcToken
  * @return \ZendSearch\Lucene\Analysis\Token
  */
 public function normalize(Token $srcToken)
 {
     $lowered = mb_strtolower($srcToken->getTermText(), 'UTF-8');

     $result = new Token($lowered, $srcToken->getStartOffset(), $srcToken->getEndOffset());
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
 /**
  * Replace the token's term text with the result of SearchHelper::utf8_to_ascii().
  *
  * Always returns a new token carrying the converted text and the source
  * token's offsets and position increment; this filter never removes a token.
  *
  * @param \ZendSearch\Lucene\Analysis\Token $srcToken
  * @return \ZendSearch\Lucene\Analysis\Token
  */
 public function normalize(\ZendSearch\Lucene\Analysis\Token $srcToken)
 {
     $asciiText = SearchHelper::utf8_to_ascii($srcToken->getTermText());

     $result = new \ZendSearch\Lucene\Analysis\Token(
         $asciiText,
         $srcToken->getStartOffset(),
         $srcToken->getEndOffset()
     );
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
 /**
  * {@inheritdoc}
  *
  * Numeric term texts are kept as they are; every other term text is
  * replaced by the result of $this->getPseudoRoot(). The returned token
  * keeps the source token's offsets and position increment.
  */
 public function normalize(Token $srcToken)
 {
     $text = $srcToken->getTermText();

     if (is_numeric($text)) {
         $normalizedText = $text;
     } else {
         $normalizedText = $this->getPseudoRoot($text);
     }

     $result = new Token($normalizedText, $srcToken->getStartOffset(), $srcToken->getEndOffset());
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
示例#10
0
 /**
  * {@inheritdoc}
  *
  * Upper-cases the term, converts it to the dictionary encoding and then
  * repeatedly applies $this->getPseudoRoot() until the result stops changing.
  * The most deeply derived pseudo root that is at least MIN_TOKEN_LENGTH
  * characters long becomes the new term text; if there is none, the
  * converted source word is used. Terms shorter than MIN_TOKEN_LENGTH
  * (before or after conversion) are removed by returning null.
  *
  * @return Token|null new token with the source offsets and position
  *                    increment, or null when the term is too short
  */
 public function normalize(Token $srcToken)
 {
     $toSearch = mb_strtoupper($srcToken->getTermText(), 'utf-8');
     $encoding = $this->getDictionaryEncoding($toSearch);

     // Lexemes shorter than MIN_TOKEN_LENGTH characters are not indexed.
     if (mb_strlen($toSearch, 'utf-8') < self::MIN_TOKEN_LENGTH) {
         return null;
     }

     // Conversion may drop characters (//IGNORE) or fail outright (false),
     // so the length has to be checked again in the dictionary encoding.
     $toSearch = iconv('utf-8', "{$encoding}//IGNORE", $toSearch);
     if ($toSearch === false || mb_strlen($toSearch, $encoding) < self::MIN_TOKEN_LENGTH) {
         return null;
     }

     // Repeatedly extract the pseudo root of the previous result. Each new
     // root is pushed to the FRONT of the list, so the list ends up ordered
     // from the most deeply derived root to the first one.
     $pseudoRootList = array();
     $current = $toSearch;
     do {
         $pseudoRootResult = $this->getPseudoRoot($current);

         // When several candidates are returned, put the shortest one first.
         // The comparator must return an integer: returning a bool is
         // deprecated since PHP 8.0 and cannot express "less than".
         if (is_array($pseudoRootResult)) {
             usort($pseudoRootResult, function ($a, $b) use ($encoding) {
                 return mb_strlen($a, $encoding) - mb_strlen($b, $encoding);
             });
         }

         // Continue only while a root was found and it differs from its input.
         $flag = $pseudoRootResult !== false && $pseudoRootResult[0] != $current;
         if ($flag) {
             $current = $pseudoRootResult[0];
             array_unshift($pseudoRootList, $current);
         }
     } while ($flag);

     // Take the first collected root that is long enough; when nothing was
     // collected or every root is too short, keep the whole source word.
     $newTokenString = $toSearch;
     foreach ($pseudoRootList as $pseudoRoot) {
         if (mb_strlen($pseudoRoot, $encoding) >= self::MIN_TOKEN_LENGTH) {
             $newTokenString = $pseudoRoot;
             break;
         }
     }

     // Convert back from the dictionary encoding to UTF-8 for the index.
     $newTokenString = iconv($encoding, 'utf-8//IGNORE', $newTokenString);
     $newToken = new Token($newTokenString, $srcToken->getStartOffset(), $srcToken->getEndOffset());
     $newToken->setPositionIncrement($srcToken->getPositionIncrement());
     return $newToken;
 }