/**
 * Advances the terms stream by one dictionary entry and returns that term.
 *
 * Each call decodes one entry from the terms dictionary file: the term text
 * is rebuilt from a prefix of the previously returned term plus a stored
 * suffix, and the frequency/proximity pointers are stored as deltas against
 * the previous term info.
 *
 * When the scan mode is SM_FULL_INFO or SM_MERGE_INFO, the positions of the
 * term are also loaded into $this->_lastTermPositions, keyed by the mapped
 * document id. Documents that have no entry in $this->_docMap are skipped.
 *
 * Once the stream is exhausted (or was not open) all stream state is cleared
 * and null is returned.
 *
 * @return \Zend\Search\Lucene\Index\Term|null
 */
public function nextTerm()
{
    if ($this->_tisFile === null || $this->_termCount == 0) {
        // Nothing left to read: drop the whole scan state.
        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;
        $this->_docMap            = null;

        // Releasing the file handles as well may be required for an "empty" segment.
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term part of the entry: shared prefix length, suffix, field number.
    $sharedPrefixLength = $this->_tisFile->readVInt();
    $suffix             = $this->_tisFile->readString();
    $fieldNumber        = $this->_tisFile->readVInt();

    $text = Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;
    $this->_lastTerm = new Term($text, $this->_fields[$fieldNumber]->name);

    // Term info part of the entry: pointers are deltas against the previous term info.
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();

    // The skip offset is only read when the term occurs in enough documents.
    $skipOffset = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $loadPositions = $this->_termsScanMode == self::SM_FULL_INFO
                  || $this->_termsScanMode == self::SM_MERGE_INFO;

    if ($loadPositions) {
        $this->_lastTermPositions = array();

        // Pass 1: collect the term frequency for every document containing the term.
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);

        $termFreqs      = array();
        $currentDocId   = 0;
        $documentsCount = $this->_lastTermInfo->docFreq;

        for ($docIndex = 0; $docIndex < $documentsCount; $docIndex++) {
            $docCode = $this->_frqFile->readVInt();

            if ($docCode % 2 == 1) {
                // Odd code: frequency is exactly one and is not stored separately.
                $currentDocId += ($docCode - 1) / 2;
                $termFreqs[$currentDocId] = 1;
            } else {
                // Even code: the frequency follows as its own value.
                $currentDocId += $docCode / 2;
                $termFreqs[$currentDocId] = $this->_frqFile->readVInt();
            }
        }

        // Pass 2: read the delta-coded positions, document by document.
        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);

        foreach ($termFreqs as $segmentDocId => $occurrences) {
            $position     = 0;
            $docPositions = array();

            for ($occurrence = 0; $occurrence < $occurrences; $occurrence++) {
                $position      += $this->_prxFile->readVInt();
                $docPositions[] = $position;
            }

            // Positions are always consumed from the file, but only kept
            // for documents present in the document map.
            if (isset($this->_docMap[$segmentDocId])) {
                $this->_lastTermPositions[$this->_docMap[$segmentDocId]] = $docPositions;
            }
        }
    }

    // Release the file handles as soon as the last term has been consumed.
    if (--$this->_termCount == 0) {
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Writes one Term Dictionary entry to a segment file.
 * Used to write entries to .tis or .tii files.
 *
 * The term text is prefix-compressed against the previous term when both
 * belong to the same field, and the frequency/proximity pointers are written
 * as deltas against the previous term info when one is available.
 *
 * $prevTerm and $prevTermInfo are passed by reference and are updated to
 * $term and $termInfo, so the next call encodes against this entry.
 *
 * @param \Zend\Search\Lucene\Storage\File   $dicFile
 * @param \Zend\Search\Lucene\Index\Term     $prevTerm
 * @param \Zend\Search\Lucene\Index\Term     $term
 * @param \Zend\Search\Lucene\Index\TermInfo $prevTermInfo
 * @param \Zend\Search\Lucene\Index\TermInfo $termInfo
 */
protected function _dumpTermDictEntry(File $dicFile, &$prevTerm, Index\Term $term, &$prevTermInfo, Index\TermInfo $termInfo)
{
    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        $previousText = $prevTerm->text;
        $currentText  = $term->text;

        // Count the leading bytes shared by both term texts.
        $byteLimit = min(strlen($previousText), strlen($currentText));
        for ($sharedBytes = 0; $sharedBytes < $byteLimit; $sharedBytes++) {
            if ($previousText[$sharedBytes] != $currentText[$sharedBytes]) {
                break;
            }
        }

        // Shrink the byte match to whole UTF-8 characters: the prefix length
        // is written in characters, while the suffix is cut at a byte offset.
        $prefixChars = 0;
        $prefixBytes = 0;
        while ($prefixBytes < $sharedBytes) {
            $leadByte = ord($currentText[$prefixBytes]);

            // Sequence length is derived from the high bits of the lead byte.
            if (($leadByte & 0xc0) != 0xc0) {
                $charBytes = 1;
            } elseif (!($leadByte & 0x20)) {
                $charBytes = 2;
            } elseif (!($leadByte & 0x10)) {
                $charBytes = 3;
            } else {
                $charBytes = 4;
            }

            if ($prefixBytes + $charBytes > $sharedBytes) {
                // Character crosses the matched bytes boundary, so it
                // belongs to the suffix.
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        // Write prefix length
        $dicFile->writeVInt($prefixChars);
        // Write suffix
        $dicFile->writeString(substr($term->text, $prefixBytes));
    } else {
        // No previous term in the same field: the whole text is the suffix.
        // Write prefix length
        $dicFile->writeVInt(0);
        // Write suffix
        $dicFile->writeString($term->text);
    }

    // Write field number
    $dicFile->writeVInt($term->field);
    // Write DocFreq (the count of documents which contain the term)
    $dicFile->writeVInt($termInfo->docFreq);

    $prevTerm = $term;

    // Pointers are absolute for the first entry and deltas afterwards.
    if (isset($prevTermInfo)) {
        $freqDelta = $termInfo->freqPointer - $prevTermInfo->freqPointer;
        $proxDelta = $termInfo->proxPointer - $prevTermInfo->proxPointer;
    } else {
        $freqDelta = $termInfo->freqPointer;
        $proxDelta = $termInfo->proxPointer;
    }

    // Write FreqDelta
    $dicFile->writeVInt($freqDelta);
    // Write ProxDelta
    $dicFile->writeVInt($proxDelta);

    // Write SkipOffset only when it is set. Per the original note it is
    // expected to be non-zero only when $termInfo->docFreq exceeds the
    // skip interval.
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}