/**
 * Checks skipTo() on the terms stream of an index built from one document
 * whose 'contents' field holds the text 'someterm word'.
 *
 * After skipping to the term 'term' (which is not part of the indexed text),
 * the stream's current term is expected to be 'word' in the 'contents' field.
 */
public function testTermsStreamInterfaceSkipToTermsRetrievingTwoTermsCase()
{
    $indexDir = __DIR__ . '/_index/_files';

    $index = Lucene\Lucene::create($indexDir);

    // Two terms ('someterm' and 'word'), assuming the default analyzer splits on whitespace
    $doc = new Document\Document();
    $doc->addField(Document\Field::Text('contents', 'someterm word'));
    $index->addDocument($doc);

    // Release the first index instance before reopening the same directory
    // NOTE(review): presumably this is what flushes the added document to disk - confirm
    unset($index);

    $index = Lucene\Lucene::open($indexDir);

    $index->resetTermsStream();
    $index->skipTo(new Index\Term('term', 'contents'));

    // assertEquals reports both terms on failure, unlike assertTrue($a == $b)
    $this->assertEquals(new Index\Term('word', 'contents'), $index->currentTerm());

    $index->closeTermsStream();

    $this->_clearDirectory($indexDir);
}
/**
 * Returns a \Zend\Search\Lucene\Document\Document object for the document
 * number $id in this index.
 *
 * The document is rebuilt from the owning segment's '.fdx' and '.fdt' files,
 * so it contains only the field values that were stored there.
 *
 * @param integer|\Zend\Search\Lucene\Search\QueryHit $id
 * @return \Zend\Search\Lucene\Document\Document
 * @throws \Zend\Search\Lucene\Exception Exception is thrown if $id is out of the range
 */
public function getDocument($id)
{
    if ($id instanceof Search\QueryHit) {
        /* @var $id \Zend\Search\Lucene\Search\QueryHit */
        $id = $id->id;
    }

    // Reject negative ids as well: they would otherwise pass the upper-bound
    // check and produce a negative seek offset into the '.fdx' file below.
    if ($id < 0 || $id >= $this->_docCount) {
        throw new Exception('Document id is out of the range.');
    }

    // Find the segment containing the document; $segmentStartId ends up as
    // the index-wide id of that segment's first document.
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentSize = $segmentInfo->count();

        if ($segmentStartId + $segmentSize > $id) {
            break;
        }

        $segmentStartId += $segmentSize;
    }

    // '.fdx' is read as a table of 8-byte entries, one per document in the
    // segment; the entry gives the position of the document's data in '.fdt'.
    // NOTE(review): SEEK_CUR presumably accounts for the handle not starting at
    // offset 0 when the segment is packed into a compound file - confirm.
    $fdxFile = $segmentInfo->openCompoundFile('.fdx');
    $fdxFile->seek(($id - $segmentStartId) * 8, SEEK_CUR);
    $fieldValuesPosition = $fdxFile->readLong();

    $fdtFile = $segmentInfo->openCompoundFile('.fdt');
    $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
    $fieldCount = $fdtFile->readVInt();

    $doc = new Document\Document();
    for ($count = 0; $count < $fieldCount; $count++) {
        $fieldNum = $fdtFile->readVInt();
        $bits     = $fdtFile->readByte();

        $fieldInfo = $segmentInfo->getField($fieldNum);

        // Flag value 2 marks a binary value. Flag value 1 is handed to the Field
        // constructor unchanged (presumably the "tokenized" flag - confirm).
        if (!($bits & 2)) {
            // Text data
            $field = new Document\Field($fieldInfo->name,
                                        $fdtFile->readString(),
                                        'UTF-8',
                                        true,
                                        $fieldInfo->isIndexed,
                                        $bits & 1);
        } else {
            // Binary data
            $field = new Document\Field($fieldInfo->name,
                                        $fdtFile->readBinary(),
                                        '',
                                        true,
                                        $fieldInfo->isIndexed,
                                        $bits & 1,
                                        true);
        }

        $doc->addField($field);
    }

    return $doc;
}
/**
 * Checks that addField() returns a Document instance, and that three
 * addField() calls can be chained on that return value.
 */
public function testAddFieldMethodChaining()
{
    // A single call must hand back a Document
    $single   = new Document\Document();
    $returned = $single->addField(Document\Field::Text('title', 'Title'));
    $this->assertTrue($returned instanceof Document\Document);

    // Three chained calls; there is no assertion here, the chain only has to run
    $chained        = new Document\Document();
    $withTitle      = $chained->addField(Document\Field::Text('title', 'Title'));
    $withAnnotation = $withTitle->addField(Document\Field::Text('annotation', 'Annotation'));
    $withAnnotation->addField(Document\Field::Text('body', 'Document body, document body, document body...'));
}
/**
 * Adds a document to this segment.
 *
 * For every field of the document:
 *  - indexed + tokenized fields are run through the default analyzer and each
 *    token position is recorded in the term dictionary / term-docs tables;
 *  - indexed but untokenized fields are recorded as a single term at position 0;
 *  - an indexed field that yields no terms is cloned and downgraded to
 *    non-indexed, non-tokenized before being registered;
 *  - stored fields are collected and handed to addStoredFields().
 *
 * Afterwards one norm byte is appended for every indexed field known to the
 * segment, using the zero-length norm when this document has no value for it.
 *
 * @param \Zend\Search\Lucene\Document\Document $document
 * @throws \Zend\Search\Lucene\Exception
 */
public function addDocument(Document\Document $document)
{
    $fieldsToStore = array();
    $normsForDoc   = array();
    $similarity    = Similarity\Similarity::getDefault();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Lucene\Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $analyzer = Analyzer\Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                $position   = 0;
                $tokenCount = 0;
                for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
                    ++$tokenCount;

                    $term = new Index\Term($token->getTermText(), $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term not seen before: register it with an empty entry for this document
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence within this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $position += $token->getPositionIncrement();
                    $this->_termDocs[$key][$this->_docCount][] = $position;
                }

                if ($tokenCount > 0) {
                    $rawNorm = $similarity->lengthNorm($field->name, $tokenCount)
                             * $document->boost
                             * $field->boost;
                    $normsForDoc[$field->name] = chr($similarity->encodeNorm($rawNorm));
                } else {
                    // Nothing was tokenized: work on a copy and treat it as non-indexed, non-tokenized
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                }
            } else {
                $utf8Value = $field->getUtf8Value();

                if ($utf8Value != '') {
                    $term = new Index\Term($utf8Value, $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term not seen before: register it with an empty entry for this document
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence within this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    // The whole value is one term, recorded at position 0
                    $this->_termDocs[$key][$this->_docCount][] = 0;

                    $rawNorm = $similarity->lengthNorm($field->name, 1)
                             * $document->boost
                             * $field->boost;
                    $normsForDoc[$field->name] = chr($similarity->encodeNorm($rawNorm));
                } else {
                    // Empty value: work on a copy and treat it as non-indexed, non-tokenized
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                }
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }

        $this->addField($field);
    }

    foreach ($this->_fields as $name => $segmentField) {
        if (!$segmentField->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$name])) {
            // First norm for this field: pad with the zero-length norm for the
            // documents already counted in the segment
            $this->_norms[$name] = str_repeat(
                chr($similarity->encodeNorm($similarity->lengthNorm($name, 0))),
                $this->_docCount
            );
        }

        // Append this document's norm, or the zero-length norm if it has none for the field
        $this->_norms[$name] .= isset($normsForDoc[$name])
            ? $normsForDoc[$name]
            : chr($similarity->encodeNorm($similarity->lengthNorm($name, 0)));
    }

    $this->addStoredFields($fieldsToStore);
}