/**
 * Push either the whole word or its character n-grams onto the token buffer.
 *
 * A word whose UTF-8 length is at most $this->_minWordSize, or which is
 * numeric, is buffered unchanged. Any other word is wrapped in '_' markers and
 * sliced: for every start position, substrings of up to
 * $this->_maxNGramSize characters are taken, and those that are at least
 * $this->_minNGramSize characters long are buffered.
 *
 * @param Analysis\Token $token    token whose term text is processed
 * @param mixed          $startPos start position forwarded to every buffered token
 *                                 (presumably an integer offset)
 * @param mixed          $endPos   end position forwarded to every buffered token
 *                                 (presumably an integer offset)
 */
protected function _fillTokenBuffer(Analysis\Token $token, $startPos, $endPos)
{
    $word = $token->getTermText();

    // Short words and numbers are not split into n-grams.
    if (iconv_strlen($word, 'UTF-8') <= $this->_minWordSize || is_numeric($word)) {
        $this->_tokenBuffer[] = new Analysis\Token($word, $startPos, $endPos);
        return;
    }

    // Wrap the word in '_' markers before slicing it into n-grams.
    $padded = '_' . $word . '_';
    $paddedLength = iconv_strlen($padded, 'UTF-8');

    for ($start = 0; $start < $paddedLength; $start++) {
        for ($extra = 0; $extra < $this->_maxNGramSize; $extra++) {
            // Skip slices that would run past the end of the padded word.
            if ($start + $extra >= $paddedLength) {
                continue;
            }

            // Switch to 'substr' should 'mb_substr' ever stop working.
            $gram = mb_substr($padded, $start, $extra + 1, 'UTF-8');
            if (iconv_strlen($gram, 'UTF-8') < $this->_minNGramSize) {
                continue;
            }

            $this->_tokenBuffer[] = new Analysis\Token($gram, $startPos, $endPos);
        }
    }
}
/**
 * Drop tokens whose term text is a key of the stop set.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $srcToken token to check
 * @return \ZendSearch\Lucene\Analysis\Token|null the unchanged token, or null
 *         when its term text is present as a key in $this->_stopSet
 */
public function normalize(Token $srcToken)
{
    $isStopWord = array_key_exists($srcToken->getTermText(), $this->_stopSet);

    return $isStopWord ? null : $srcToken;
}
/**
 * Drop tokens whose term text is shorter than the configured length.
 *
 * NOTE(review): the length is measured with strlen(), i.e. in bytes, so
 * multi-byte characters count more than once - confirm this is intended for
 * the encodings this filter is used with.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $srcToken token to check
 * @return \ZendSearch\Lucene\Analysis\Token|null the unchanged token, or null
 *         when its term text is shorter than $this->length bytes
 */
public function normalize(Token $srcToken)
{
    $byteLength = strlen($srcToken->getTermText());

    return ($byteLength < $this->length) ? null : $srcToken;
}
/**
 * The filter must turn $source into $normalized while keeping the token's
 * offsets (0..100) and position increment (50) intact.
 *
 * @dataProvider getNormalizeDataProvider
 */
public function testNormalize($source, $normalized)
{
    $input = new Token($source, 0, 100);
    $input->setPositionIncrement(50);

    $result = $this->filter->normalize($input);

    $expected = new Token($normalized, 0, 100);
    $expected->setPositionIncrement(50);

    $this->assertEquals($expected, $result);
}
/**
 * With the phpmorphy mock reporting $encoding and returning $pseudoRoots,
 * TokenFilterEnRu must turn $source into $expected while keeping the token's
 * offsets (0..100) and position increment (50) intact.
 *
 * @dataProvider getNormalizeDataProvider
 */
public function testNormalize($source, $expected, $pseudoRoots, $encoding)
{
    // Stub the morphology backend before the filter is built on top of it.
    $this->phpmorphy->shouldReceive('getEncoding')->andReturn($encoding);
    $this->phpmorphy->shouldReceive('getPseudoRoot')->andReturn($pseudoRoots);
    $filter = new TokenFilterEnRu($this->phpmorphyFactory);

    $input = new Token($source, 0, 100);
    $input->setPositionIncrement(50);

    $result = $filter->normalize($input);

    $wanted = new Token($expected, 0, 100);
    $wanted->setPositionIncrement(50);

    $this->assertEquals($wanted, $result);
}
/**
 * Replace the token text with its first pseudo-root, or with the upper-cased
 * original text when the morphology backend returns false.
 *
 * The resulting text is dropped when strlen() reports fewer than 3 bytes.
 * Offsets and position increment are copied from the source token.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $token token to normalize
 * @return null|\ZendSearch\Lucene\Analysis\Token new token, or null when the
 *         normalized text is too short
 */
public function normalize(Token $token)
{
    $roots = $this->morphy->getPseudoRoot(
        $this->string()->toUpper($token->getTermText())
    );

    // No pseudo-root available: fall back to the upper-cased original term.
    $normalized = ($roots === false)
        ? $this->string()->toUpper($token->getTermText())
        : $roots[0];

    // Byte-wise length check (strlen), not a character count.
    if (strlen($normalized) < 3) {
        return null;
    }

    $result = new Token($normalized, $token->getStartOffset(), $token->getEndOffset());
    $result->setPositionIncrement($token->getPositionIncrement());

    return $result;
}
/**
 * Build a copy of the token whose term text is lower-cased as UTF-8.
 *
 * Offsets and position increment are copied from the source token.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $srcToken token to normalize
 * @return \ZendSearch\Lucene\Analysis\Token new token with lower-cased text
 */
public function normalize(Token $srcToken)
{
    $lowered = mb_strtolower($srcToken->getTermText(), 'UTF-8');
    $start = $srcToken->getStartOffset();
    $end = $srcToken->getEndOffset();

    $result = new Token($lowered, $start, $end);
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
/**
 * Build a copy of the token whose term text has been passed through
 * SearchHelper::utf8_to_ascii().
 *
 * Offsets and position increment are copied from the source token.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $srcToken token to normalize
 * @return \ZendSearch\Lucene\Analysis\Token new token with the converted text
 */
public function normalize(\ZendSearch\Lucene\Analysis\Token $srcToken)
{
    $asciiText = SearchHelper::utf8_to_ascii($srcToken->getTermText());
    $start = $srcToken->getStartOffset();
    $end = $srcToken->getEndOffset();

    $result = new \ZendSearch\Lucene\Analysis\Token($asciiText, $start, $end);
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
/**
 * {@inheritdoc}
 *
 * Numeric term text is kept as is; any other text is replaced by the result
 * of $this->getPseudoRoot(). Offsets and position increment are copied from
 * the source token.
 */
public function normalize(Token $srcToken)
{
    $termText = $srcToken->getTermText();

    if (is_numeric($termText)) {
        $normalizedText = $termText;
    } else {
        $normalizedText = $this->getPseudoRoot($termText);
    }

    $start = $srcToken->getStartOffset();
    $end = $srcToken->getEndOffset();

    $result = new Token($normalizedText, $start, $end);
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
/**
 * {@inheritdoc}
 *
 * Upper-cases the term text, converts it to the dictionary encoding and
 * repeatedly asks $this->getPseudoRoot() for a "pseudo-root" until the result
 * stops changing. The most recently obtained pseudo-root that is at least
 * MIN_TOKEN_LENGTH characters long becomes the new term text; when none
 * qualifies, the (upper-cased, converted) original word is used instead.
 *
 * Returns null when the word is shorter than MIN_TOKEN_LENGTH characters,
 * either before or after conversion to the dictionary encoding. Offsets and
 * position increment are copied from the source token.
 */
public function normalize(Token $srcToken)
{
    $pseudoRootList = array();
    $toSearch = mb_strtoupper($srcToken->getTermText(), 'utf-8');
    $encoding = $this->getDictionaryEncoding($toSearch);

    // A lexeme shorter than MIN_TOKEN_LENGTH characters is not used.
    if (mb_strlen($toSearch, 'utf-8') < self::MIN_TOKEN_LENGTH) {
        return null;
    }

    // Conversion uses //IGNORE, so the converted word is length-checked again.
    $toSearch = iconv('utf-8', "{$encoding}//IGNORE", $toSearch);
    if (mb_strlen($toSearch, $encoding) < self::MIN_TOKEN_LENGTH) {
        return null;
    }

    // Extract the word's "pseudo-root" the hard way: keep reducing until the
    // result no longer changes.
    $pseudoRootResult = array($toSearch);
    do {
        $temp = $pseudoRootResult[0];
        $pseudoRootResult = $this->getPseudoRoot($temp);

        // When several candidates come back, prefer the shortest one.
        if (is_array($pseudoRootResult)) {
            usort($pseudoRootResult, function ($a, $b) use ($encoding) {
                // The comparator must return an integer (<0, 0, >0);
                // returning a bool is deprecated since PHP 8.0.
                return mb_strlen($a, $encoding) - mb_strlen($b, $encoding);
            });
        }

        $flag = $pseudoRootResult !== false && $pseudoRootResult[0] != $temp;
        if ($flag) {
            // Newest (most reduced) pseudo-root goes to the front of the list.
            array_unshift($pseudoRootList, $pseudoRootResult[0]);
        }
    } while ($flag);

    // Take the first collected pseudo-root that is at least MIN_TOKEN_LENGTH
    // characters long. If none was collected or none is long enough, fall
    // back to the whole original word.
    $newTokenString = $toSearch;
    foreach ($pseudoRootList as $pseudoRoot) {
        if (mb_strlen($pseudoRoot, $encoding) >= self::MIN_TOKEN_LENGTH) {
            $newTokenString = $pseudoRoot;
            break;
        }
    }

    $newTokenString = iconv($encoding, 'utf-8//IGNORE', $newTokenString);

    $newToken = new Token($newTokenString, $srcToken->getStartOffset(), $srcToken->getEndOffset());
    $newToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $newToken;
}