/**
 * Checks that Term::key() sorts the same way as the underlying term data
 * and that equal terms produce equal keys.
 */
public function testKey()
{
    $textOneFieldOne = new Index\Term('term_text1', 'field_name1');
    $textTwoFieldOne = new Index\Term('term_text2', 'field_name1');
    $textTwoFieldTwo = new Index\Term('term_text2', 'field_name2');
    $textTwoFieldOneCopy = new Index\Term('term_text2', 'field_name1');

    // Same field, different text: key comparison must agree with text comparison.
    $this->assertEquals(
        $textOneFieldOne->text > $textTwoFieldOne->text,
        $textOneFieldOne->key() > $textTwoFieldOne->key()
    );
    $this->assertEquals(
        $textOneFieldOne->text >= $textTwoFieldOne->text,
        $textOneFieldOne->key() >= $textTwoFieldOne->key()
    );

    // Different field: key comparison must agree with field comparison.
    $this->assertEquals(
        $textOneFieldOne->field > $textTwoFieldTwo->field,
        $textOneFieldOne->key() > $textTwoFieldTwo->key()
    );
    $this->assertEquals(
        $textOneFieldOne->field >= $textTwoFieldTwo->field,
        $textOneFieldOne->key() >= $textTwoFieldTwo->key()
    );

    // Two terms built from identical text and field share one key.
    $this->assertEquals($textTwoFieldOne->key(), $textTwoFieldOneCopy->key());
}
/**
 * Indexes one document into this segment.
 *
 * Each field of the document is visited once. Indexed fields contribute
 * postings (term key => document number => list of positions) and a one-byte
 * norm; stored fields are collected and passed to addStoredFields() at the end.
 * An indexed field that yields no terms is replaced by a clone flagged as
 * neither indexed nor tokenized before it is registered.
 *
 * @param \ZendSearch\Lucene\Document $document
 * @throws LuceneException\UnsupportedMethodCallException if a field asks for term vector storage
 */
public function addDocument(Document $document)
{
    $similarity    = AbstractSimilarity::getDefault();
    $fieldsToStore = array();
    $normsForDoc   = array();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            // Number of terms this field contributed to the postings.
            $termTotal = 0;

            if ($field->isTokenized) {
                $analyzer = Analyzer\Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                $offset = 0;
                for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
                    $termTotal++;

                    $term = new Index\Term($token->getTermText(), $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // First sighting of this term in the segment
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence within this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $offset += $token->getPositionIncrement();
                    $this->_termDocs[$key][$this->_docCount][] = $offset;
                }
            } else {
                $wholeValue = $field->getUtf8Value();

                if ($wholeValue != '') {
                    // The untokenized value is indexed as a single term
                    $term = new Index\Term($wholeValue, $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // First sighting of this term in the segment
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence within this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $this->_termDocs[$key][$this->_docCount][] = 0; // position
                    $termTotal = 1;
                }
            }

            if ($termTotal == 0) {
                // Nothing to index: work on a copy flagged as non-indexed and
                // non-tokenized so the document's own field object is left untouched.
                $field = clone $field;
                $field->isIndexed = $field->isTokenized = false;
            } else {
                $normsForDoc[$field->name] = chr($similarity->encodeNorm(
                    $similarity->lengthNorm($field->name, $termTotal) * $document->boost * $field->boost
                ));
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }

        $this->addField($field);
    }

    // Append one norm byte for this document to every indexed field known to the segment.
    foreach ($this->_fields as $knownName => $knownField) {
        if ($knownField->isIndexed) {
            if (!isset($this->_norms[$knownName])) {
                // No norms string for this field yet: start it with $this->_docCount
                // default norm bytes (presumably one per previously added document).
                $this->_norms[$knownName] = str_repeat(
                    chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0))),
                    $this->_docCount
                );
            }

            // Use the norm computed above, or the default when this document
            // did not index the field.
            $this->_norms[$knownName] .= isset($normsForDoc[$knownName])
                ? $normsForDoc[$knownName]
                : chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0)));
        }
    }

    $this->addStoredFields($fieldsToStore);
}
/**
 * Scans terms dictionary and returns term info
 *
 * Lookup proceeds in three stages: the in-memory term info cache, a binary
 * search over the loaded dictionary index, and finally a sequential scan of
 * the '.tis' file starting at the closest preceding index entry.
 *
 * @param \ZendSearch\Lucene\Index\Term $term
 * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException if the '.tis' file starts with an unknown version marker
 * @return \ZendSearch\Lucene\Index\TermInfo|null null when the field or the term is not found
 */
public function getTermInfo(Term $term)
{
    $termKey = $term->key();

    // NOTE(review): misses are stored in the cache as null, but isset() does not
    // report null entries, so a cached miss is looked up again on the next call.
    if (isset($this->_termInfoCache[$termKey])) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }

    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // Binary search of the dictionary index, ordered by field position and then term text
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary) - 1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0]);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1]);
        }

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            // We got it!
            $a        = $this->_termDictionaryInfos[$mid];
            $termInfo = new TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now points at the last index entry that sorts before the searched term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int) 0xfffffffe && $tiVersion != (int) 0xfffffffd) {
        throw new InvalidFileFormatException('Wrong TermInfoFile file format');
    }
    $hasExtraHeaderInt = ($tiVersion == (int) 0xfffffffd);

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($hasExtraHeaderInt) {
        // The value is not used here, but it has to be consumed to advance the file position
        $tisFile->readInt();
    }

    // The seek is relative to the current position, so the bytes already
    // consumed by the header reads above (20, or 24 with the extra int) are subtracted.
    $tisFile->seek($prevTermInfo[4] - ($hasExtraHeaderInt ? 24 : 20), SEEK_CUR);

    $termValue    = $prevTerm[1];
    $termFieldNum = $prevTerm[0];
    // Seed the running values from the index entry so that they are defined
    // even if the scan loop below does not execute.
    $docFreq      = $prevTermInfo[0];
    $freqPointer  = $prevTermInfo[1];
    $proxPointer  = $prevTermInfo[2];
    $skipOffset   = $prevTermInfo[3];

    // Read entries sequentially while the current one still sorts before the searched term
    for ($count = $prevPosition * $indexInterval + 1;
         $count <= $termCount
         && ($this->_getFieldPosition($termFieldNum) < $searchDicField
             || ($this->_getFieldPosition($termFieldNum) == $searchDicField
                 && strcmp($termValue, $term->text) < 0));
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        // Each entry stores only the suffix that differs from the previous term text
        $termValue        = Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        // Pointers are stored as increments over the previous entry
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if ($docFreq >= $skipInterval) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Byte-wise comparison: a loose == would treat numeric strings such as
    // '1e3' and '1000' (or '01' and '1') as equal and return the wrong term's info.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) === 0) {
        $termInfo = new TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    // >= rather than ==: the binary-search hit path above also adds entries without
    // this check, so the count can step past 1024 and an equality test would never fire again.
    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}