public function testGetPrefixUtf8() { // UTF-8 string with non-ascii symbols (Russian alphabet) $this->assertEquals(Index\Term::getPrefix('абвгдеёжзийклмнопрстуфхцчшщьыъэюя', 64), 'абвгдеёжзийклмнопрстуфхцчшщьыъэюя'); $this->assertEquals(Index\Term::getPrefix('абвгдеёжзийклмнопрстуфхцчшщьыъэюя', 33), 'абвгдеёжзийклмнопрстуфхцчшщьыъэюя'); $this->assertEquals(Index\Term::getPrefix('абвгдеёжзийклмнопрстуфхцчшщьыъэюя', 4), 'абвг'); $this->assertEquals(Index\Term::getPrefix('абвгдеёжзийклмнопрстуфхцчшщьыъэюя', 0), ''); }
/** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $words = array(); $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength); $prefixByteLength = strlen($prefix); $prefixUtf8Length = Index\Term::getLength($prefix); $termLength = Index\Term::getLength($this->_term->text); $termRest = substr($this->_term->text, $prefixByteLength); // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible $termRestLength = strlen($termRest); $scaleFactor = 1 / (1 - $this->_minimumSimilarity); $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); foreach ($tokens as $token) { $termText = $token->getTermText(); if (substr($termText, 0, $prefixByteLength) == $prefix) { // Calculate similarity $target = substr($termText, $prefixByteLength); $maxDistance = isset($this->_maxDistances[strlen($target)]) ? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target)); if ($termRestLength == 0) { // we don't have anything to compare. That means if we just add // the letters for current term we get the new word $similarity = $prefixUtf8Length == 0 ? 0 : 1 - strlen($target) / $prefixUtf8Length; } elseif (strlen($target) == 0) { $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length; } elseif ($maxDistance < abs($termRestLength - strlen($target))) { //just adding the characters of term to target or vice-versa results in too many edits //for example "pre" length is 3 and "prefixes" length is 8. We can see that //given this optimal circumstance, the edit distance cannot be less than 5. //which is 8-3 or more precisesly abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. $similarity = 0; } else { $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, strlen($target))); } if ($similarity > $this->_minimumSimilarity) { $words[] = $termText; } } } $highlighter->highlight($words); }
/** * Scans terms dictionary and returns next term * * @return \ZendSearch\Lucene\Index\Term|null */ public function nextTerm() { if ($this->_tisFile === null || $this->_termCount == 0) { $this->_lastTerm = null; $this->_lastTermInfo = null; $this->_lastTermPositions = null; $this->_docMap = null; // may be necessary for "empty" segment $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; return null; } $termPrefixLength = $this->_tisFile->readVInt(); $termSuffix = $this->_tisFile->readString(); $termFieldNum = $this->_tisFile->readVInt(); $termValue = Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix; $this->_lastTerm = new Term($termValue, $this->_fields[$termFieldNum]->name); $docFreq = $this->_tisFile->readVInt(); $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt(); $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt(); if ($docFreq >= $this->_skipInterval) { $skipOffset = $this->_tisFile->readVInt(); } else { $skipOffset = 0; } $this->_lastTermInfo = new TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) { $this->_lastTermPositions = array(); $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET); $freqs = array(); $docId = 0; for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) { $docDelta = $this->_frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta - 1) / 2; $freqs[$docId] = 1; } else { $docId += $docDelta / 2; $freqs[$docId] = $this->_frqFile->readVInt(); } } $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); for ($count = 0; $count < $freq; $count++) { $termPosition += $this->_prxFile->readVInt(); $positions[] = $termPosition; } if (isset($this->_docMap[$docId])) { $this->_lastTermPositions[$this->_docMap[$docId]] = $positions; } } } $this->_termCount--; if ($this->_termCount == 0) { $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; } return $this->_lastTerm; }