/**
  * Checks that fields added to a document can be read back by name.
  *
  * Each text field is read through property access, getField()->value and
  * getFieldValue(). A field supplied as ISO-8859-1 must be returned
  * unchanged through property access and as UTF-8 by getFieldUtf8Value().
  */
 public function testFields()
 {
     $expectedValues = array(
         'title'      => 'Title',
         'annotation' => 'Annotation',
         'body'       => 'Document body, document body, document body...',
     );

     $document = new Zend_Search_Lucene_Document();
     foreach ($expectedValues as $fieldName => $fieldValue) {
         $document->addField(Zend_Search_Lucene_Field::Text($fieldName, $fieldValue));
     }

     // array_diff() lists names reported by the document that are not expected.
     $fieldnamesDiffArray = array_diff($document->getFieldNames(), array_keys($expectedValues));
     $this->assertTrue(is_array($fieldnamesDiffArray));
     $this->assertEquals(0, count($fieldnamesDiffArray));

     // PHPUnit expects (expected, actual); the message identifies the failing field.
     foreach ($expectedValues as $fieldName => $fieldValue) {
         $this->assertEquals($fieldValue, $document->$fieldName, "property access for '$fieldName'");
         $this->assertEquals($fieldValue, $document->getField($fieldName)->value, "getField() for '$fieldName'");
         $this->assertEquals($fieldValue, $document->getFieldValue($fieldName), "getFieldValue() for '$fieldName'");
     }

     if (PHP_OS == 'AIX') {
         // tests below here not valid on AIX
         return;
     }

     $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', 'Words with umlauts: åãü...');
     $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
     // Property access returns the value in its original encoding...
     $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
     // ...while getFieldUtf8Value() returns it as UTF-8.
     $this->assertEquals('Words with umlauts: åãü...', $document->getFieldUtf8Value('description'));
 }
 /**
  * Adds a document to this segment.
  *
  * Every field is registered with the segment. Indexed fields are turned
  * into terms whose positions are recorded under the current document
  * number, one norm byte per indexed field is appended, and stored fields
  * are handed to addStoredFields().
  *
  * All fields are checked for unsupported options before anything is
  * recorded, so a rejected document leaves no partial postings behind.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception if any field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     // Validate first: throwing half-way through would leave field infos and
     // postings registered under a document number that has not been completed.
     $fields = array();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }
         $fields[] = $field;
     }

     $storedFields = array();
     $docNorms = array();
     $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
     foreach ($fields as $field) {
         $this->addField($field);
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
             } else {
                 // An untokenized field is indexed as one token covering the whole value.
                 $tokenList = array();
                 $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
             }
             $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, count($tokenList))));
             $position = 0;
             foreach ($tokenList as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $termKey = $term->key();
                 if (!isset($this->_termDictionary[$termKey])) {
                     // New term
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey] = array();
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 } else {
                     if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                 }
                 $position += $token->getPositionIncrement();
                 $this->_termDocs[$termKey][$this->_docCount][] = $position;
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
     }

     // Append one norm byte for this document to every indexed field of the segment.
     foreach ($this->_fields as $fieldName => $field) {
         if (!$field->isIndexed) {
             continue;
         }
         if (!isset($this->_norms[$fieldName])) {
             // First occurrence of the field: back-fill zero-length norms for earlier documents.
             $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount);
         }
         if (isset($docNorms[$fieldName])) {
             $this->_norms[$fieldName] .= $docNorms[$fieldName];
         } else {
             // This document has no such field; use the zero-length norm.
             $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
         }
     }
     $this->addStoredFields($storedFields);
 }
 /**
  * Checks that fields added to a document can be read back by name.
  *
  * Each text field is read through property access, getField()->value and
  * getFieldValue(). A field supplied as ISO-8859-1 must be returned
  * unchanged through property access and as UTF-8 by getFieldUtf8Value().
  */
 public function testFields()
 {
     $document = new Zend_Search_Lucene_Document();
     $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
     $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
     $document->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));

     // array_diff() lists names reported by the document that are not expected.
     $fieldnamesDiffArray = array_diff($document->getFieldNames(), array('title', 'annotation', 'body'));
     $this->assertTrue(is_array($fieldnamesDiffArray));
     $this->assertEquals(0, count($fieldnamesDiffArray));

     // PHPUnit expects (expected, actual).
     $this->assertEquals('Title', $document->title);
     $this->assertEquals('Annotation', $document->annotation);
     $this->assertEquals('Document body, document body, document body...', $document->body);
     $this->assertEquals('Title', $document->getField('title')->value);
     $this->assertEquals('Annotation', $document->getField('annotation')->value);
     $this->assertEquals('Document body, document body, document body...', $document->getField('body')->value);
     $this->assertEquals('Title', $document->getFieldValue('title'));
     $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
     $this->assertEquals('Document body, document body, document body...', $document->getFieldValue('body'));

     // The umlauts (a-ring, a-tilde, u-diaeresis) are spelled as byte escapes so
     // the fixture stays correct whatever encoding this source file is saved in.
     $descriptionIso88591 = "Words with umlauts: \xE5\xE3\xFC...";
     $descriptionUtf8 = "Words with umlauts: \xC3\xA5\xC3\xA3\xC3\xBC...";
     $document->addField(Zend_Search_Lucene_Field::Text('description', $descriptionIso88591, 'ISO-8859-1'));
     // Property access returns the value in its original encoding...
     $this->assertEquals($descriptionIso88591, $document->description);
     // ...while getFieldUtf8Value() returns it as UTF-8.
     $this->assertEquals($descriptionUtf8, $document->getFieldUtf8Value('description'));
 }
Beispiel #4
0
 /**
  * Adds a document to this segment.
  *
  * Indexed fields are turned into terms whose positions are recorded under
  * the current document number. A tokenized field is streamed through the
  * default analyzer; an untokenized field becomes a single term at
  * position 0. An indexed field that yields no terms is replaced by a clone
  * flagged as neither indexed nor tokenized. One norm byte per indexed
  * segment field is appended, and stored fields are handed to
  * addStoredFields().
  *
  * All fields are checked for unsupported options before anything is
  * recorded, so a rejected document leaves no partial postings behind.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception if any field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     // Validate first: throwing half-way through would leave postings of the
     // earlier fields registered under a document number that was not completed.
     $fields = array();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }
         $fields[] = $field;
     }

     $storedFields = array();
     $docNorms = array();
     $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
     foreach ($fields as $field) {
         if ($field->isIndexed) {
             // Number of terms this field contributed; 0 means the value was empty.
             $termCount = 0;
             if ($field->isTokenized) {
                 $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                 $analyzer->setInput($field->value, $field->encoding);
                 $position = 0;
                 while (($token = $analyzer->nextToken()) !== null) {
                     $termCount++;
                     $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                     $termKey = $term->key();
                     if (!isset($this->_termDictionary[$termKey])) {
                         // New term
                         $this->_termDictionary[$termKey] = $term;
                         $this->_termDocs[$termKey] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                     $position += $token->getPositionIncrement();
                     $this->_termDocs[$termKey][$this->_docCount][] = $position;
                 }
             } else {
                 $fieldUtf8Value = $field->getUtf8Value();
                 if ($fieldUtf8Value != '') {
                     // The whole value is one term at position 0.
                     $termCount = 1;
                     $term = new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name);
                     $termKey = $term->key();
                     if (!isset($this->_termDictionary[$termKey])) {
                         // New term
                         $this->_termDictionary[$termKey] = $term;
                         $this->_termDocs[$termKey] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                     $this->_termDocs[$termKey][$this->_docCount][] = 0;
                 }
             }

             if ($termCount == 0) {
                 // Field contains empty value. Treat it as non-indexed and non-tokenized.
                 // Work on a clone so the caller's field object is left untouched.
                 $field = clone $field;
                 $field->isIndexed = $field->isTokenized = false;
             } else {
                 $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, $termCount) * $document->boost * $field->boost));
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
         // Registered last so that an emptied field is registered with its cleared flags.
         $this->addField($field);
     }

     // Append one norm byte for this document to every indexed field of the segment.
     foreach ($this->_fields as $fieldName => $field) {
         if (!$field->isIndexed) {
             continue;
         }
         if (!isset($this->_norms[$fieldName])) {
             // First occurrence of the field: back-fill zero-length norms for earlier documents.
             $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount);
         }
         if (isset($docNorms[$fieldName])) {
             $this->_norms[$fieldName] .= $docNorms[$fieldName];
         } else {
             // This document has no indexed terms for the field; use the zero-length norm.
             $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
         }
     }
     $this->addStoredFields($storedFields);
 }
 /**
  * Returns the field objects of a document as a list.
  *
  * The fields appear in the order reported by getFieldNames() and the
  * result is indexed from 0.
  *
  * @param Zend_Search_Lucene_Document $doc
  * @return array field objects as returned by $doc->getField()
  */
 private function __getFieldInfo(Zend_Search_Lucene_Document $doc)
 {
     // array_map() keeps the input keys, so re-index to get a plain list.
     return array_values(array_map(array($doc, 'getField'), $doc->getFieldNames()));
 }
 /**
  * Adds a document to this index.
  *
  * The document is stored under the current document id. For every field
  * the value is tokenized (or taken as a single token when the field is
  * not tokenized) and, per token, the method maintains:
  *  - $_terms: per field, the terms keyed by text and kept in ascending order,
  *  - $_termPositions: the positions of the term inside this document,
  *  - $_termDocs: the frequency of the term inside this document.
  * It also records which documents contain the field and the field's norm,
  * then advances the document id.
  *
  * @param Zend_Search_Lucene_Document $document
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $this->_documents[$this->_docID] = $document;
     // parse document
     $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $fieldNames = $document->getFieldnames();
     foreach ($fieldNames as $fieldName) {
         $field = $document->getField($fieldName);
         // tokenize if requested
         if ($field->isTokenized) {
             $tokens = $analyzer->tokenize($field->getUtf8Value(), 'UTF-8');
         } else {
             // strlen(utf8_decode()) yields the length in characters rather than bytes
             $tokens = array(new Zend_Search_Lucene_Analysis_Token($field->getUtf8Value(), 0, strlen(utf8_decode($field->getUtf8Value()))));
         }
         // store tokens in "index"
         $position = -1;
         foreach ($tokens as $token) {
             $text = $token->getTermText();
             $term = new Zend_Search_Lucene_Index_Term($text, $fieldName);
             $position += $token->getPositionIncrement();
             // build an ordered array (list) of terms for each field
             if (isset($this->_terms[$fieldName])) {
                 // if the term is not set already, sort it in
                 if (!isset($this->_terms[$fieldName][$text])) {
                     // Rebuild the list by hand. array_shift()/array_merge() would
                     // renumber integer-like keys (a term such as "2008"), losing
                     // the text => term mapping the isset() check above relies on.
                     $sorted = array();
                     $inserted = false;
                     foreach ($this->_terms[$fieldName] as $existing) {
                         if (!$inserted && !($text > $existing->text)) {
                             // first existing term that does not sort before the new one
                             $sorted[$text] = $term;
                             $inserted = true;
                         }
                         $sorted[$existing->text] = $existing;
                     }
                     if (!$inserted) {
                         // the new term sorts after every existing term
                         $sorted[$text] = $term;
                     }
                     $this->_terms[$fieldName] = $sorted;
                 }
             } else {
                 // first terms in each field are just stored
                 $this->_terms[$fieldName][$text] = $term;
             }
             // store termPosition for this term
             $this->_termPositions[$fieldName][$text][$this->_docID][] = $position;
             // store or increase term freq for this document
             if (!isset($this->_termDocs[$fieldName][$text][$this->_docID])) {
                 $this->_termDocs[$fieldName][$text][$this->_docID] = 1;
             } else {
                 $this->_termDocs[$fieldName][$text][$this->_docID]++;
             }
         }
         // remember fieldname and document
         $this->_fields[$fieldName][$this->_docID] = 1;
         // calculate and store normalisation vector
         $this->_norms[$fieldName][$this->_docID] = $this->getSimilarity()->lengthNorm($fieldName, count($tokens)) * $document->boost * $field->boost;
     }
     // increase docID
     $this->_docID++;
 }
 /**
  * Adds a document to this segment.
  *
  * Every field is registered through _addFieldInfo(). Indexed fields are
  * turned into terms whose positions are recorded under the current
  * document number, together with the field's token count. If the document
  * has stored fields, an entry is written to the .fdx/.fdt files, which are
  * created on first use. Finally the document counter is advanced.
  *
  * All fields are checked for unsupported options before anything is
  * recorded, so a rejected document leaves no partial postings behind for
  * the next document, which would receive the same document number.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception if any field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     // Validate first: $this->_docCount only advances at the very end, so
     // throwing half-way through would leave postings that the next document
     // would inherit.
     $fields = array();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }
         $fields[] = $field;
     }

     $storedFields = array();
     foreach ($fields as $field) {
         $this->_addFieldInfo($field);
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
             } else {
                 // An untokenized field is indexed as one token covering the whole value.
                 $tokenList = array();
                 $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
             }
             $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
             $position = 0;
             foreach ($tokenList as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $termKey = $term->key();
                 if (!isset($this->_termDictionary[$termKey])) {
                     // New term
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey] = array();
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 } else {
                     if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                 }
                 $position += $token->getPositionIncrement();
                 $this->_termDocs[$termKey][$this->_docCount][] = $position;
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
     }

     if (count($storedFields) != 0) {
         if (!isset($this->_fdxFile)) {
             // Stored-field files are created lazily, on the first document that needs them.
             $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
             $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
             $this->_files[] = $this->_name . '.fdx';
             $this->_files[] = $this->_name . '.fdt';
         }
         // The .fdx entry is the offset at which this document's data starts in .fdt.
         $this->_fdxFile->writeLong($this->_fdtFile->tell());
         $this->_fdtFile->writeVInt(count($storedFields));
         foreach ($storedFields as $field) {
             $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
             $fieldBits = ($field->isTokenized ? 0x1 : 0x0) | ($field->isBinary ? 0x2 : 0x0) | 0x0;
             /* 0x04 - third bit, compressed (ZLIB) */
             $this->_fdtFile->writeByte($fieldBits);
             if ($field->isBinary) {
                 // Binary values are written as a byte count followed by the raw bytes.
                 $this->_fdtFile->writeVInt(strlen($field->stringValue));
                 $this->_fdtFile->writeBytes($field->stringValue);
             } else {
                 $this->_fdtFile->writeString($field->stringValue);
             }
         }
     }
     $this->_docCount++;
 }
 /**
  * Converts a Zend_Search_Lucene document back into an xfDocument.
  *
  * The xfDocument is created from the '__guid' field. Every field whose
  * name does not start with '__' is copied over with type flags derived
  * from the Lucene field and the boost stored for it in the serialized
  * '__boosts' field. Each guid listed in the serialized '__sub_documents'
  * field is resolved through findGuid() and attached as a child.
  *
  * NOTE(review): both serialized fields are passed to unserialize();
  * confirm that the index contents are trusted.
  *
  * @param Zend_Search_Lucene_Document $zdoc
  * @return xfDocument
  */
 public function unwriteDocument(Zend_Search_Lucene_Document $zdoc)
 {
     $doc = new xfDocument($zdoc->getFieldValue('__guid'));
     $boosts = unserialize($zdoc->getFieldValue('__boosts'));

     foreach ($zdoc->getFieldNames() as $name) {
         // names starting with a double underscore are internal bookkeeping fields
         if (substr($name, 0, 2) == '__') {
             continue;
         }

         $zfield = $zdoc->getField($name);

         // translate the Lucene field flags into the xfField type bitmask
         $type = 0;
         $type |= $zfield->isStored ? xfField::STORED : 0;
         $type |= $zfield->isIndexed ? xfField::INDEXED : 0;
         $type |= $zfield->isTokenized ? xfField::TOKENIZED : 0;
         $type |= $zfield->isBinary ? xfField::BINARY : 0;

         $field = new xfField($name, $type);
         $field->setBoost($boosts[$name]);
         $doc->addField(new xfFieldValue($field, $zfield->value));
     }

     $childGuids = unserialize($zdoc->getFieldValue('__sub_documents'));
     foreach ($childGuids as $guid) {
         $doc->addChild($this->findGuid($guid));
     }

     return $doc;
 }