/**
 * Reset terms stream.
 *
 * The method is declared without formal parameters; its two optional
 * arguments are picked up through func_get_args():
 *   $startId - id given to the first document of this segment (default 0)
 *   $mode    - terms scan mode, one of the self::SM_* constants
 *              (default self::SM_TERMS_ONLY)
 *
 * In SM_FULL_INFO and SM_MERGE_INFO modes the frequency and position files
 * are opened as well and a map of non-deleted documents is built. Only
 * SM_MERGE_INFO renumbers documents so that deleted ones leave no gaps.
 *
 * Returns start document id for the next segment.
 *
 * @param integer $startId
 * @param integer $mode
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
{
    $arguments     = func_get_args();
    $argumentCount = count($arguments);

    if ($argumentCount > 2) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
    }

    // Missing arguments fall back to their documented defaults.
    $startId = ($argumentCount >= 1) ? $arguments[0] : 0;
    $mode    = ($argumentCount == 2) ? $arguments[1] : self::SM_TERMS_ONLY;

    // Let go of a previously opened dictionary handle before reopening it.
    $this->_tisFile       = null;
    $this->_tisFile       = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $formatVersion = $this->_tisFile->readInt();
    $isPre21Format = ($formatVersion == (int)0xFFFFFFFE);
    $is21Format    = ($formatVersion == (int)0xFFFFFFFD);
    if (!$isPre21Format && !$is21Format) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    // Header: terms count, index interval, skip interval.
    $termTotal            = $this->_tisFile->readLong();
    $this->_termNum       = $termTotal;
    $this->_termCount     = $termTotal;
    $this->_indexInterval = $this->_tisFile->readInt();
    $this->_skipInterval  = $this->_tisFile->readInt();
    if ($is21Format) {
        // The 2.1+ header carries one more int (max skip levels). Its value
        // is not used here, but it has to be consumed.
        $this->_tisFile->readInt();
    }

    // Frequency/position handles are reopened below only when the mode needs them.
    $this->_frqFile = null;
    $this->_prxFile = null;

    $this->_docMap            = array();
    $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;
    $this->_termsScanMode     = $mode;

    $compactIds = ($mode == self::SM_MERGE_INFO);

    if ($mode == self::SM_TERMS_ONLY) {
        // Nothing besides the dictionary is required.
    } else if ($mode == self::SM_FULL_INFO || $compactIds) {
        $this->_frqFile       = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        $this->_prxFile       = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        for ($docNum = 0; $docNum < $this->_docCount; $docNum++) {
            if ($this->isDeleted($docNum)) {
                continue;
            }

            // Merge mode packs ids densely; full-info mode keeps the
            // original numbering shifted by $startId.
            $offset = $compactIds ? count($this->_docMap) : $docNum;
            $this->_docMap[$docNum] = $startId + $offset;
        }
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
    }

    $this->nextTerm();

    $consumedIds = $compactIds ? count($this->_docMap) : $this->_docCount;

    return $startId + $consumedIds;
}
/**
 * Add term
 *
 * Writes the term's document/frequency data to the frequency file, its
 * position deltas to the positions file and a dictionary entry to the
 * term infos file; every self::$indexInterval-th term is also written to
 * the term infos index file.
 *
 * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * @param Zend_Search_Lucene_Index_Term $termEntry
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $frqStart = $this->_frqFile->tell();
    $prxStart = $this->_prxFile->tell();

    $previousDocId = 0;
    foreach ($termDocs as $docId => $termPositions) {
        // The document delta is stored doubled; the low bit is set when
        // no explicit frequency value follows it.
        $shiftedDelta  = ($docId - $previousDocId) * 2;
        $previousDocId = $docId;

        $frequency = count($termPositions);
        if ($frequency > 1) {
            $this->_frqFile->writeVInt($shiftedDelta);
            $this->_frqFile->writeVInt($frequency);
        } else {
            $this->_frqFile->writeVInt($shiftedDelta + 1);
        }

        // Positions are delta-encoded, restarting from zero for each document.
        $previousPosition = 0;
        foreach ($termPositions as $position) {
            $this->_prxFile->writeVInt($position - $previousPosition);
            $previousPosition = $position;
        }
    }

    $docFrequency = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFrequency >= self::$skipInterval)
                ? $this->_frqFile->tell() - $frqStart
                : 0;

    $fieldNumber = $this->_fields[$termEntry->field]->number;
    $term        = new Zend_Search_Lucene_Index_Term($termEntry->text, $fieldNumber);
    $termInfo    = new Zend_Search_Lucene_Index_TermInfo($docFrequency, $frqStart, $prxStart, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

    $isIndexedTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
    if ($isIndexedTerm) {
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

        // The index entry stores the dictionary file position as a delta
        // from the previous index entry.
        $tisPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $tisPosition;
    }

    $this->_termCount++;
}
/**
 * Reset terms stream
 *
 * $startId - id for the first document
 * $mode    - terms scan mode (one of the self::SM_* constants)
 *
 * With self::SM_TERMS_ONLY only the term dictionary is opened. With
 * self::SM_FULL_INFO or self::SM_MERGE_INFO the frequency and position
 * files are opened too and non-deleted documents are mapped to new ids;
 * self::SM_MERGE_INFO assigns those ids without gaps.
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId
 * @param integer $mode
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function reset($startId = 0, $mode = self::SM_TERMS_ONLY)
{
    // Release the old dictionary handle, then open a fresh one.
    $this->_tisFile       = null;
    $this->_tisFile       = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $version      = $this->_tisFile->readInt();
    $hasSkipLevel = ($version == (int) 0xfffffffd);   // 2.1+ format
    if ($version != (int) 0xfffffffe && !$hasSkipLevel) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $totalTerms           = $this->_tisFile->readLong(); // Terms count
    $this->_termNum       = $totalTerms;
    $this->_termCount     = $totalTerms;
    $this->_indexInterval = $this->_tisFile->readInt();  // Index interval
    $this->_skipInterval  = $this->_tisFile->readInt();  // Skip interval
    if ($hasSkipLevel) {
        // Max skip levels: read to advance past it, value is not kept.
        $this->_tisFile->readInt();
    }

    $this->_frqFile = null;
    $this->_prxFile = null;

    $this->_docMap            = array();
    $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;
    $this->_termsScanMode     = $mode;

    $mergeMode = ($mode == self::SM_MERGE_INFO);

    // Terms-only mode is tested first, exactly as the mode dispatch requires.
    if (!($mode == self::SM_TERMS_ONLY)) {
        if ($mode != self::SM_FULL_INFO && !$mergeMode) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
        }

        $this->_frqFile       = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        $this->_prxFile       = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        for ($docIndex = 0; $docIndex < $this->_docCount; $docIndex++) {
            if (!$this->isDeleted($docIndex)) {
                if ($mergeMode) {
                    // Next free id: number of documents mapped so far.
                    $this->_docMap[$docIndex] = $startId + count($this->_docMap);
                } else {
                    $this->_docMap[$docIndex] = $startId + $docIndex;
                }
            }
        }
    }

    $this->nextTerm();

    if ($mergeMode) {
        return $startId + count($this->_docMap);
    }

    return $startId + $this->_docCount;
}
/**
 * Reset terms stream
 *
 * Reopens the term dictionary, frequency and position files, rebuilds the
 * map of non-deleted documents and positions the stream on the first term.
 *
 * $startId - id for the first document
 * $compact - remove deleted documents (surviving documents get ids
 *            without gaps)
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId
 * @param boolean $compact
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function reset($startId = 0, $compact = false)
{
    if ($this->_tisFile !== null) {
        $this->_tisFile = null;
    }
    $this->_tisFile = $this->openCompoundFile('.tis', false);

    // A term infos file starts with a format marker. This method reads the
    // header layout without the extra "max skip levels" field, so only the
    // pre-2.1 marker 0xFFFFFFFE is accepted. Comparing against 0 would
    // reject every valid file.
    $tiVersion = $this->_tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termCount    = $this->_tisFile->readLong(); // Read terms count
                           $this->_tisFile->readInt();  // Read Index interval (value not kept)
    $this->_skipInterval = $this->_tisFile->readInt();  // Read skip interval

    if ($this->_frqFile !== null) {
        $this->_frqFile = null;
    }
    $this->_frqFile       = $this->openCompoundFile('.frq', false);
    $this->_frqFileOffset = $this->_frqFile->tell();

    if ($this->_prxFile !== null) {
        $this->_prxFile = null;
    }
    $this->_prxFile       = $this->openCompoundFile('.prx', false);
    $this->_prxFileOffset = $this->_prxFile->tell();

    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

    // Map every non-deleted document to its id in the target numbering:
    // densely packed when compacting, otherwise the original position
    // shifted by $startId.
    $this->_docMap = array();
    for ($count = 0; $count < $this->_docCount; $count++) {
        if (!$this->isDeleted($count)) {
            $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
        }
    }

    $this->nextTerm();

    return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
}
/**
 * Adds a document to this segment.
 *
 * Indexed fields are tokenized (or wrapped into a single token when not
 * tokenized) and their term positions are collected in memory. Stored
 * fields are written to the .fdt file, and one offset entry per document
 * is written to the .fdx file.
 *
 * The .fdx entry is written for every document, including documents with
 * no stored fields. The document counter advances for every document, so
 * skipping the entry would shift the stored-field offsets of all later
 * documents in the segment.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $storedFields = array();

    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);
        $this->_addFieldInfo($field);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // An untokenized field is indexed as one token spanning the whole value.
                $tokenList   = array();
                $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
            }

            $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);

            $position = 0;
            foreach ($tokenList as $token) {
                $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $termKey = $term->key();

                if (!isset($this->_termDictionary[$termKey])) {
                    // New term
                    $this->_termDictionary[$termKey] = $term;
                    $this->_termDocs[$termKey]       = array();
                }
                if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                    // First occurrence of the term in this document
                    $this->_termDocs[$termKey][$this->_docCount] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$termKey][$this->_docCount][] = $position;
            }
        }

        if ($field->isStored) {
            $storedFields[] = $field;
        }
    }

    // The stored-field files are created lazily on the first document.
    if (!isset($this->_fdxFile)) {
        $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
        $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

        $this->_files[] = $this->_name . '.fdx';
        $this->_files[] = $this->_name . '.fdt';
    }

    // One index entry per document: where its stored-field data starts in
    // the .fdt file. Written even when the field count is zero.
    $this->_fdxFile->writeLong($this->_fdtFile->tell());
    $this->_fdtFile->writeVInt(count($storedFields));

    foreach ($storedFields as $field) {
        $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

        // 0x01 - tokenized, 0x02 - binary; 0x04 (compressed, ZLIB) is never set here.
        $fieldBits = ($field->isTokenized ? 0x1 : 0x0) |
                     ($field->isBinary    ? 0x2 : 0x0) |
                     0x0; /* 0x04 - third bit, compressed (ZLIB) */
        $this->_fdtFile->writeByte($fieldBits);

        if ($field->isBinary) {
            $this->_fdtFile->writeVInt(strlen($field->stringValue));
            $this->_fdtFile->writeBytes($field->stringValue);
        } else {
            $this->_fdtFile->writeString($field->stringValue);
        }
    }

    $this->_docCount++;
}