/**
 * Checks that fields added to a document can be read back through every
 * accessor used here: getFieldNames(), property access, getField()->value,
 * getFieldValue() and, for a field declared as ISO-8859-1,
 * getFieldUtf8Value().
 */
public function testFields()
{
    $document = new Zend_Search_Lucene_Document();

    $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
    $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
    $document->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));

    // Compare the name sets in both directions: a single array_diff() only
    // detects unexpected names, not missing ones.
    $expectedNames = array('title', 'annotation', 'body');
    $actualNames   = $document->getFieldNames();
    $this->assertTrue(is_array($actualNames));
    $this->assertEquals(array(), array_diff($actualNames, $expectedNames));
    $this->assertEquals(array(), array_diff($expectedNames, $actualNames));

    // assertEquals() takes the expected value first, so failure messages
    // report "expected"/"actual" the right way round.
    $this->assertEquals('Title', $document->title);
    $this->assertEquals('Annotation', $document->annotation);
    $this->assertEquals('Document body, document body, document body...', $document->body);

    $this->assertEquals('Title', $document->getField('title')->value);
    $this->assertEquals('Annotation', $document->getField('annotation')->value);
    $this->assertEquals('Document body, document body, document body...', $document->getField('body')->value);

    $this->assertEquals('Title', $document->getFieldValue('title'));
    $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
    $this->assertEquals('Document body, document body, document body...', $document->getFieldValue('body'));

    if (PHP_OS === 'AIX') {
        return; // tests below here not valid on AIX
    }

    // Build the ISO-8859-1 bytes from a UTF-8 literal so the test does not
    // depend on the encoding this source file is saved in.
    $wordsWithUmlautsUtf8     = 'Words with umlauts: åãü...';
    $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', $wordsWithUmlautsUtf8);
    $this->assertTrue($wordsWithUmlautsIso88591 !== false, 'iconv() could not produce the ISO-8859-1 fixture');

    $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
    $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
    $this->assertEquals($wordsWithUmlautsUtf8, $document->getFieldUtf8Value('description'));
}
/**
 * Adds a document to this segment.
 *
 * Every field of the document is registered through addField(). For
 * indexed fields the term positions are recorded under the current
 * document number and a one-byte norm is computed. Afterwards the norm
 * string of every indexed field known to the segment is extended by one
 * byte, and the stored fields are handed to addStoredFields().
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $similarity    = Zend_Search_Lucene_Search_Similarity::getDefault();
    $fieldsToStore = array();
    $normBytes     = array();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);

        $this->addField($field);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // An untokenized field is indexed as one token covering the whole value
                $tokens = array(
                    new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                );
            }

            $lengthNorm               = $similarity->lengthNorm($field->name, count($tokens));
            $normBytes[$field->name] = chr($similarity->encodeNorm($lengthNorm));

            $position = 0;
            foreach ($tokens as $token) {
                $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $key  = $term->key();

                if (!isset($this->_termDictionary[$key])) {
                    // Term not seen in this segment yet
                    $this->_termDictionary[$key] = $term;
                    $this->_termDocs[$key]       = array($this->_docCount => array());
                } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                    // Known term, first occurrence in this document
                    $this->_termDocs[$key][$this->_docCount] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$key][$this->_docCount][] = $position;
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }
    }

    foreach ($this->_fields as $name => $fieldInfo) {
        if (!$fieldInfo->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$name])) {
            // First time this field is indexed: pad the norm string with the
            // zero-length norm for all documents added so far
            $this->_norms[$name] = str_repeat(
                chr($similarity->encodeNorm($similarity->lengthNorm($name, 0))),
                $this->_docCount
            );
        }

        $this->_norms[$name] .= isset($normBytes[$name])
            ? $normBytes[$name]
            : chr($similarity->encodeNorm($similarity->lengthNorm($name, 0)));
    }

    $this->addStoredFields($fieldsToStore);
}
/**
 * Checks that fields added to a document can be read back through every
 * accessor used here: getFieldNames(), property access, getField()->value,
 * getFieldValue() and, for a field declared as ISO-8859-1,
 * getFieldUtf8Value().
 */
public function testFields()
{
    $document = new Zend_Search_Lucene_Document();

    $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
    $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
    $document->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));

    // Compare the name sets in both directions: a single array_diff() only
    // detects unexpected names, not missing ones.
    $expectedNames = array('title', 'annotation', 'body');
    $actualNames   = $document->getFieldNames();
    $this->assertTrue(is_array($actualNames));
    $this->assertEquals(array(), array_diff($actualNames, $expectedNames));
    $this->assertEquals(array(), array_diff($expectedNames, $actualNames));

    // assertEquals() takes the expected value first, so failure messages
    // report "expected"/"actual" the right way round.
    $this->assertEquals('Title', $document->title);
    $this->assertEquals('Annotation', $document->annotation);
    $this->assertEquals('Document body, document body, document body...', $document->body);

    $this->assertEquals('Title', $document->getField('title')->value);
    $this->assertEquals('Annotation', $document->getField('annotation')->value);
    $this->assertEquals('Document body, document body, document body...', $document->getField('body')->value);

    $this->assertEquals('Title', $document->getFieldValue('title'));
    $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
    $this->assertEquals('Document body, document body, document body...', $document->getFieldValue('body'));

    if (PHP_OS === 'AIX') {
        // NOTE(review): assumes the ISO-8859-1 conversion below is not
        // usable on AIX - confirm against the target platforms.
        return;
    }

    // The ISO-8859-1 fixture is derived from a UTF-8 literal with iconv().
    // Raw single-byte literals get corrupted as soon as the source file is
    // saved or transcoded in another encoding.
    $wordsWithUmlautsUtf8     = 'Words with umlauts: åãü...';
    $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', $wordsWithUmlautsUtf8);
    $this->assertTrue($wordsWithUmlautsIso88591 !== false, 'iconv() could not produce the ISO-8859-1 fixture');

    $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
    $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
    $this->assertEquals($wordsWithUmlautsUtf8, $document->getFieldUtf8Value('description'));
}
/**
 * Adds a document to this segment.
 *
 * Indexed fields are fed either through the default analyzer (tokenized
 * fields) or indexed as a single term at position 0 (untokenized fields).
 * A field that yields no terms is cloned and downgraded to non-indexed,
 * non-tokenized before it is registered. Norm bytes combine the length
 * norm with the document and field boosts.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $similarity    = Zend_Search_Lucene_Search_Similarity::getDefault();
    $fieldsToStore = array();
    $normBytes     = array();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            // Number of terms this field contributed to the index
            $termCount = 0;

            if ($field->isTokenized) {
                $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                $position = 0;
                while (($token = $analyzer->nextToken()) !== null) {
                    $termCount++;

                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term not seen in this segment yet
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence in this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $position += $token->getPositionIncrement();
                    $this->_termDocs[$key][$this->_docCount][] = $position;
                }
            } else {
                $utf8Value = $field->getUtf8Value();

                if ($utf8Value == '') {
                    // Nothing to index; handled by the empty-field branch below
                } else {
                    $termCount = 1;

                    $term = new Zend_Search_Lucene_Index_Term($utf8Value, $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term not seen in this segment yet
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence in this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    // The whole value is a single term at position 0
                    $this->_termDocs[$key][$this->_docCount][] = 0;
                }
            }

            if ($termCount == 0) {
                // Field contains empty value. Treat it as non-indexed and non-tokenized.
                // Work on a clone so the caller's field object is left untouched.
                $field              = clone $field;
                $field->isTokenized = false;
                $field->isIndexed   = false;
            } else {
                $boostedNorm = $similarity->lengthNorm($field->name, $termCount)
                             * $document->boost
                             * $field->boost;
                $normBytes[$field->name] = chr($similarity->encodeNorm($boostedNorm));
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }

        $this->addField($field);
    }

    foreach ($this->_fields as $name => $fieldInfo) {
        if (!$fieldInfo->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$name])) {
            // First time this field is indexed: pad the norm string with the
            // zero-length norm for all documents added so far
            $this->_norms[$name] = str_repeat(
                chr($similarity->encodeNorm($similarity->lengthNorm($name, 0))),
                $this->_docCount
            );
        }

        $this->_norms[$name] .= isset($normBytes[$name])
            ? $normBytes[$name]
            : chr($similarity->encodeNorm($similarity->lengthNorm($name, 0)));
    }

    $this->addStoredFields($fieldsToStore);
}
/**
 * Collects the field objects of a document into a plain list.
 *
 * The fields are returned in the order reported by getFieldNames().
 *
 * @param Zend_Search_Lucene_Document $doc
 * @return array list of the objects returned by $doc->getField()
 */
private function __getFieldInfo(Zend_Search_Lucene_Document $doc)
{
    $collected = array();

    foreach ($doc->getFieldNames() as $name) {
        array_push($collected, $doc->getField($name));
    }

    return $collected;
}
/**
 * Adds a document to this index.
 *
 * The document is kept under the current document id. Each of its fields
 * is turned into tokens (through the default analyzer, or as one token
 * holding the whole value when the field is not tokenized). For every token
 * the method updates the per-field ordered term list, the term positions
 * and the term frequency of this document. Finally the field's norm is
 * stored and the document id is advanced.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $this->_documents[$this->_docID] = $document;

    // parse document
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);

        // tokenize if requested
        if ($field->isTokenized) {
            $tokens = $analyzer->tokenize($field->getUtf8Value(), 'UTF-8');
        } else {
            $utf8Value = $field->getUtf8Value();
            // utf8_decode() maps every UTF-8 character to one byte, so
            // strlen() of the result is used as the character count
            $tokens = array(
                new Zend_Search_Lucene_Analysis_Token($utf8Value, 0, strlen(utf8_decode($utf8Value)))
            );
        }

        // store tokens in "index"
        $position = -1;
        foreach ($tokens as $token) {
            $text = $token->getTermText();
            $term = new Zend_Search_Lucene_Index_Term($text, $fieldName);
            $position += $token->getPositionIncrement();

            // build an ordered array (list) of terms for each field
            if (!isset($this->_terms[$fieldName])) {
                // first terms in each field are just stored
                $this->_terms[$fieldName][$text] = $term;
            } elseif (!isset($this->_terms[$fieldName][$text])) {
                // Insert the new term in front of the first existing term it
                // is not greater than. The list is rebuilt with a plain
                // foreach because array_shift()/array_merge() renumber
                // integer keys, which would drop the key of any
                // numeric-looking term text (e.g. "2024").
                $sorted   = array();
                $inserted = false;
                foreach ($this->_terms[$fieldName] as $key => $existing) {
                    if (!$inserted && !($text > $existing->text)) {
                        $sorted[$text] = $term;
                        $inserted      = true;
                    }
                    $sorted[$key] = $existing;
                }
                if (!$inserted) {
                    $sorted[$text] = $term;
                }
                $this->_terms[$fieldName] = $sorted;
            }

            // store termPosition for this term
            $this->_termPositions[$fieldName][$text][$this->_docID][] = $position;

            // store or increase term freq for this document
            if (!isset($this->_termDocs[$fieldName][$text][$this->_docID])) {
                $this->_termDocs[$fieldName][$text][$this->_docID] = 1;
            } else {
                $this->_termDocs[$fieldName][$text][$this->_docID]++;
            }
        }

        // remember fieldname and document
        $this->_fields[$fieldName][$this->_docID] = 1;

        // calculate and store normalisation vector
        $this->_norms[$fieldName][$this->_docID] =
            $this->getSimilarity()->lengthNorm($fieldName, count($tokens))
            * $document->boost
            * $field->boost;
    }

    // increase docID
    $this->_docID++;
}
/**
 * Adds a document to this segment.
 *
 * Registers the field infos, records term positions and field lengths for
 * indexed fields, then writes the stored fields of the document to the
 * .fdt file and its start offset to the .fdx file.
 *
 * An .fdx entry (and a field count in .fdt) is written for EVERY document,
 * including documents without stored fields. The stored-fields index is
 * addressed by document number, so skipping an entry would shift the
 * pointers of all following documents.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $storedFields = array();

    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);
        $this->_addFieldInfo($field);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // An untokenized field is indexed as one token covering the whole value
                $tokenList = array(
                    new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                );
            }
            $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);

            $position = 0;
            foreach ($tokenList as $token) {
                $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $termKey = $term->key();

                if (!isset($this->_termDictionary[$termKey])) {
                    // New term
                    $this->_termDictionary[$termKey] = $term;
                    $this->_termDocs[$termKey]       = array($this->_docCount => array());
                } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                    // Existing term, but new term entry
                    $this->_termDocs[$termKey][$this->_docCount] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$termKey][$this->_docCount][] = $position;
            }
        }

        if ($field->isStored) {
            $storedFields[] = $field;
        }
    }

    // Create the stored-fields files on first use
    if (!isset($this->_fdxFile)) {
        $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
        $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

        $this->_files[] = $this->_name . '.fdx';
        $this->_files[] = $this->_name . '.fdt';
    }

    // One pointer per document, even when the document stores nothing
    $this->_fdxFile->writeLong($this->_fdtFile->tell());
    $this->_fdtFile->writeVInt(count($storedFields));

    foreach ($storedFields as $field) {
        $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

        $fieldBits = ($field->isTokenized ? 0x1 : 0x0) |
                     ($field->isBinary ?    0x2 : 0x0) |
                     0x0; /* 0x04 - third bit, compressed (ZLIB) */
        $this->_fdtFile->writeByte($fieldBits);

        if ($field->isBinary) {
            // Binary values are written as a byte count followed by the raw bytes
            $this->_fdtFile->writeVInt(strlen($field->stringValue));
            $this->_fdtFile->writeBytes($field->stringValue);
        } else {
            $this->_fdtFile->writeString($field->stringValue);
        }
    }

    $this->_docCount++;
}
/**
 * Converts a Zend_Search_Lucene document back into an xfDocument.
 *
 * The guid, the per-field boosts and the list of child guids are read from
 * the internal '__guid', '__boosts' and '__sub_documents' fields. Every
 * other field whose name does not start with '__' is copied over with its
 * type flags, boost and value. Children are resolved through findGuid().
 *
 * @param Zend_Search_Lucene_Document $zdoc
 * @return xfDocument
 */
public function unwriteDocument(Zend_Search_Lucene_Document $zdoc)
{
    $doc    = new xfDocument($zdoc->getFieldValue('__guid'));
    $boosts = unserialize($zdoc->getFieldValue('__boosts'));

    foreach ($zdoc->getFieldNames() as $name) {
        // names starting with a double underscore are internal bookkeeping fields
        if (substr($name, 0, 2) == '__') {
            continue;
        }

        $zfield = $zdoc->getField($name);

        // translate the Zend field flags into the xfField type bit mask
        $type = 0
              | ($zfield->isStored    ? xfField::STORED    : 0)
              | ($zfield->isIndexed   ? xfField::INDEXED   : 0)
              | ($zfield->isTokenized ? xfField::TOKENIZED : 0)
              | ($zfield->isBinary    ? xfField::BINARY    : 0);

        $field = new xfField($name, $type);
        $field->setBoost($boosts[$name]);

        $doc->addField(new xfFieldValue($field, $zfield->value));
    }

    $childGuids = unserialize($zdoc->getFieldValue('__sub_documents'));
    foreach ($childGuids as $guid) {
        $doc->addChild($this->findGuid($guid));
    }

    return $doc;
}