public function testKey() { $term1_1 = new Zend_Search_Lucene_Index_Term('term_text1', 'field_name1'); $term2_1 = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name1'); $term2_2 = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name2'); $term2_1Dup = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name1'); $this->assertEquals($term1_1->text > $term2_1->text, $term1_1->key() > $term2_1->key()); $this->assertEquals($term1_1->text >= $term2_1->text, $term1_1->key() >= $term2_1->key()); $this->assertEquals($term1_1->field > $term2_2->field, $term1_1->key() > $term2_2->key()); $this->assertEquals($term1_1->field >= $term2_2->field, $term1_1->key() >= $term2_2->key()); $this->assertEquals($term2_1->key(), $term2_1Dup->key()); }
/** * Scans terms dictionary and returns term info * * @param Zend_Search_Lucene_Index_Term $term * @return Zend_Search_Lucene_Index_TermInfo */ public function getTermInfo(Zend_Search_Lucene_Index_Term $term) { $termKey = $term->key(); if (isset($this->_termInfoCache[$termKey])) { $termInfo = $this->_termInfoCache[$termKey]; // Move termInfo to the end of cache unset($this->_termInfoCache[$termKey]); $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } if ($this->_termDictionary === null) { $this->_loadDictionaryIndex(); } $searchField = $this->getFieldNum($term->field); if ($searchField == -1) { return null; } $searchDicField = $this->_getFieldPosition($searchField); // search for appropriate value in dictionary $lowIndex = 0; $highIndex = count($this->_termDictionary)-1; while ($highIndex >= $lowIndex) { // $mid = ($highIndex - $lowIndex)/2; $mid = ($highIndex + $lowIndex) >> 1; $midTerm = $this->_termDictionary[$mid]; $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */); $delta = $searchDicField - $fieldNum; if ($delta == 0) { $delta = strcmp($term->text, $midTerm[1] /* text */); } if ($delta < 0) { $highIndex = $mid-1; } elseif ($delta > 0) { $lowIndex = $mid+1; } else { // return $this->_termDictionaryInfos[$mid]; // We got it! $a = $this->_termDictionaryInfos[$mid]; $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]); // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } } if ($highIndex == -1) { // Term is out of the dictionary range return null; } $prevPosition = $highIndex; $prevTerm = $this->_termDictionary[$prevPosition]; $prevTermInfo = $this->_termDictionaryInfos[$prevPosition]; $tisFile = $this->openCompoundFile('.tis'); $tiVersion = $tisFile->readInt(); if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ && $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); } $termCount = $tisFile->readLong(); $indexInterval = $tisFile->readInt(); $skipInterval = $tisFile->readInt(); if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) { $maxSkipLevels = $tisFile->readInt(); } $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR); $termValue = $prevTerm[1] /* text */; $termFieldNum = $prevTerm[0] /* field */; $freqPointer = $prevTermInfo[1] /* freqPointer */; $proxPointer = $prevTermInfo[2] /* proxPointer */; for ($count = $prevPosition*$indexInterval + 1; $count <= $termCount && ( $this->_getFieldPosition($termFieldNum) < $searchDicField || ($this->_getFieldPosition($termFieldNum) == $searchDicField && strcmp($termValue, $term->text) < 0) ); $count++) { $termPrefixLength = $tisFile->readVInt(); $termSuffix = $tisFile->readString(); $termFieldNum = $tisFile->readVInt(); $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix; $docFreq = $tisFile->readVInt(); $freqPointer += $tisFile->readVInt(); $proxPointer += $tisFile->readVInt(); if( $docFreq >= $skipInterval ) { $skipOffset = $tisFile->readVInt(); } else { $skipOffset = 0; } } if ($termFieldNum == $searchField && $termValue == $term->text) { $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); } else { $termInfo = null; } // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; if (count($this->_termInfoCache) == 1024) { $this->_cleanUpTermInfoCache(); } return $termInfo; }
/** * Scans terms dictionary and returns term info * * @param Zend_Search_Lucene_Index_Term $term * @return Zend_Search_Lucene_Index_TermInfo */ public function getTermInfo(Zend_Search_Lucene_Index_Term $term) { $termKey = $term->key(); if (isset($this->_termInfoCache[$termKey])) { $termInfo = $this->_termInfoCache[$termKey]; // Move termInfo to the end of cache unset($this->_termInfoCache[$termKey]); $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } if ($this->_termDictionary === null) { // Check, if index is already serialized if ($this->_directory->fileExists($this->_name . '.sti')) { // Prefetch dictionary index data $stiFile = $this->_directory->getFileObject($this->_name . '.sti'); $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti')); // Load dictionary index data list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData); } else { // Prefetch dictionary index data $tiiFile = $this->openCompoundFile('.tii'); $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii')); // Load dictionary index data list($this->_termDictionary, $this->_termDictionaryInfos) = Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData); $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos)); $stiFile = $this->_directory->createFile($this->_name . '.sti'); $stiFile->writeBytes($stiFileData); } } $searchField = $this->getFieldNum($term->field); if ($searchField == -1) { return null; } $searchDicField = $this->_getFieldPosition($searchField); // search for appropriate value in dictionary $lowIndex = 0; $highIndex = count($this->_termDictionary) - 1; while ($highIndex >= $lowIndex) { // $mid = ($highIndex - $lowIndex)/2; $mid = $highIndex + $lowIndex >> 1; $midTerm = $this->_termDictionary[$mid]; $fieldNum = $this->_getFieldPosition($midTerm[0]); $delta = $searchDicField - $fieldNum; if ($delta == 0) { $delta = strcmp($term->text, $midTerm[1]); } if ($delta < 0) { $highIndex = $mid - 1; } elseif ($delta > 0) { $lowIndex = $mid + 1; } else { // return $this->_termDictionaryInfos[$mid]; // We got it! $a = $this->_termDictionaryInfos[$mid]; $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]); // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } } if ($highIndex == -1) { // Term is out of the dictionary range return null; } $prevPosition = $highIndex; $prevTerm = $this->_termDictionary[$prevPosition]; $prevTermInfo = $this->_termDictionaryInfos[$prevPosition]; $tisFile = $this->openCompoundFile('.tis'); $tiVersion = $tisFile->readInt(); if ($tiVersion != (int) 0.0) { throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); } $termCount = $tisFile->readLong(); $indexInterval = $tisFile->readInt(); $skipInterval = $tisFile->readInt(); $tisFile->seek($prevTermInfo[4] - 20, SEEK_CUR); $termValue = $prevTerm[1]; $termFieldNum = $prevTerm[0]; $freqPointer = $prevTermInfo[1]; $proxPointer = $prevTermInfo[2]; for ($count = $prevPosition * $indexInterval + 1; $count <= $termCount && ($this->_getFieldPosition($termFieldNum) < $searchDicField || $this->_getFieldPosition($termFieldNum) == $searchDicField && strcmp($termValue, $term->text) < 0); $count++) { $termPrefixLength = $tisFile->readVInt(); $termSuffix = $tisFile->readString(); $termFieldNum = $tisFile->readVInt(); $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix; $docFreq = $tisFile->readVInt(); $freqPointer += $tisFile->readVInt(); $proxPointer += $tisFile->readVInt(); if ($docFreq >= $skipInterval) { $skipOffset = $tisFile->readVInt(); } else { $skipOffset = 0; } } if ($termFieldNum == $searchField && $termValue == $term->text) { $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); } else { $termInfo = null; } // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; if (count($this->_termInfoCache) == 1024) { $this->_cleanUpTermInfoCache(); } return $termInfo; }
/** * Adds a document to this segment. * * @param Zend_Search_Lucene_Document $document * @throws Zend_Search_Lucene_Exception */ public function addDocument(Zend_Search_Lucene_Document $document) { /** Zend_Search_Lucene_Search_Similarity */ // require_once 'Zend/Search/Lucene/Search/Similarity.php'; $storedFields = array(); $docNorms = array(); $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); foreach ($document->getFieldNames() as $fieldName) { $field = $document->getField($fieldName); if ($field->storeTermVector) { /** * @todo term vector storing support */ // require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); } if ($field->isIndexed) { if ($field->isTokenized) { /** Zend_Search_Lucene_Analysis_Analyzer */ // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault(); $analyzer->setInput($field->value, $field->encoding); $position = 0; $tokenCounter = 0; while (($token = $analyzer->nextToken()) !== null) { $tokenCounter++; $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); $termKey = $term->key(); if (!isset($this->_termDictionary[$termKey])) { // New term $this->_termDictionary[$termKey] = $term; $this->_termDocs[$termKey] = array(); $this->_termDocs[$termKey][$this->_docCount] = array(); } else { if (!isset($this->_termDocs[$termKey][$this->_docCount])) { // Existing term, but new term entry $this->_termDocs[$termKey][$this->_docCount] = array(); } } $position += $token->getPositionIncrement(); $this->_termDocs[$termKey][$this->_docCount][] = $position; } if ($tokenCounter == 0) { // Field contains empty value. Treat it as non-indexed and non-tokenized $field = clone $field; $field->isIndexed = $field->isTokenized = false; } else { $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, $tokenCounter) * $document->boost * $field->boost)); } } else { if (($fieldUtf8Value = $field->getUtf8Value()) == '') { // Field contains empty value. Treat it as non-indexed and non-tokenized $field = clone $field; $field->isIndexed = $field->isTokenized = false; } else { $term = new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name); $termKey = $term->key(); if (!isset($this->_termDictionary[$termKey])) { // New term $this->_termDictionary[$termKey] = $term; $this->_termDocs[$termKey] = array(); $this->_termDocs[$termKey][$this->_docCount] = array(); } else { if (!isset($this->_termDocs[$termKey][$this->_docCount])) { // Existing term, but new term entry $this->_termDocs[$termKey][$this->_docCount] = array(); } } $this->_termDocs[$termKey][$this->_docCount][] = 0; // position $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, 1) * $document->boost * $field->boost)); } } } if ($field->isStored) { $storedFields[] = $field; } $this->addField($field); } foreach ($this->_fields as $fieldName => $field) { if (!$field->isIndexed) { continue; } if (!isset($this->_norms[$fieldName])) { $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount); } if (isset($docNorms[$fieldName])) { $this->_norms[$fieldName] .= $docNorms[$fieldName]; } else { $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))); } } $this->addStoredFields($storedFields); }
/** * Adds a document to this segment. * * @param Zend_Search_Lucene_Document $document * @throws Zend_Search_Lucene_Exception */ public function addDocument(Zend_Search_Lucene_Document $document) { $storedFields = array(); $docNorms = array(); $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); foreach ($document->getFieldNames() as $fieldName) { $field = $document->getField($fieldName); $this->addField($field); if ($field->storeTermVector) { /** * @todo term vector storing support */ throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); } if ($field->isIndexed) { if ($field->isTokenized) { $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); } else { $tokenList = array(); $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); } $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, count($tokenList)))); $position = 0; foreach ($tokenList as $token) { $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); $termKey = $term->key(); if (!isset($this->_termDictionary[$termKey])) { // New term $this->_termDictionary[$termKey] = $term; $this->_termDocs[$termKey] = array(); $this->_termDocs[$termKey][$this->_docCount] = array(); } else { if (!isset($this->_termDocs[$termKey][$this->_docCount])) { // Existing term, but new term entry $this->_termDocs[$termKey][$this->_docCount] = array(); } } $position += $token->getPositionIncrement(); $this->_termDocs[$termKey][$this->_docCount][] = $position; } } if ($field->isStored) { $storedFields[] = $field; } } foreach ($this->_fields as $fieldName => $field) { if (!$field->isIndexed) { continue; } if (!isset($this->_norms[$fieldName])) { $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount); } if (isset($docNorms[$fieldName])) { $this->_norms[$fieldName] .= $docNorms[$fieldName]; } else { $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))); } } $this->addStoredFields($storedFields); }
/** * Adds a document to this segment. * * @param Zend_Search_Lucene_Document $document * @throws Zend_Search_Lucene_Exception */ public function addDocument(Zend_Search_Lucene_Document $document) { $storedFields = array(); foreach ($document->getFieldNames() as $fieldName) { $field = $document->getField($fieldName); $this->_addFieldInfo($field); if ($field->storeTermVector) { /** * @todo term vector storing support */ throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); } if ($field->isIndexed) { if ($field->isTokenized) { $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); } else { $tokenList = array(); $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); } $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList); $position = 0; foreach ($tokenList as $token) { $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); $termKey = $term->key(); if (!isset($this->_termDictionary[$termKey])) { // New term $this->_termDictionary[$termKey] = $term; $this->_termDocs[$termKey] = array(); $this->_termDocs[$termKey][$this->_docCount] = array(); } else { if (!isset($this->_termDocs[$termKey][$this->_docCount])) { // Existing term, but new term entry $this->_termDocs[$termKey][$this->_docCount] = array(); } } $position += $token->getPositionIncrement(); $this->_termDocs[$termKey][$this->_docCount][] = $position; } } if ($field->isStored) { $storedFields[] = $field; } } if (count($storedFields) != 0) { if (!isset($this->_fdxFile)) { $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); $this->_files[] = $this->_name . '.fdx'; $this->_files[] = $this->_name . '.fdt'; } $this->_fdxFile->writeLong($this->_fdtFile->tell()); $this->_fdtFile->writeVInt(count($storedFields)); foreach ($storedFields as $field) { $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); $fieldBits = ($field->isTokenized ? 0x1 : 0x0) | ($field->isBinary ? 0x2 : 0x0) | 0x0; /* 0x04 - third bit, compressed (ZLIB) */ $this->_fdtFile->writeByte($fieldBits); if ($field->isBinary) { $this->_fdtFile->writeVInt(strlen($field->stringValue)); $this->_fdtFile->writeBytes($field->stringValue); } else { $this->_fdtFile->writeString($field->stringValue); } } } $this->_docCount++; }