/**
 * Checks that Zend_Search_Lucene_Index_Term::key() orders terms the same way
 * as their text (within one field) and their field name (across fields), and
 * that two terms built from the same text and field produce the same key.
 */
public function testKey()
 {
     // Fixtures: two texts in the first field, one text in the second field,
     // plus an exact copy of the second term.
     $firstText      = new Zend_Search_Lucene_Index_Term('term_text1', 'field_name1');
     $secondText     = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name1');
     $otherField     = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name2');
     $secondTextCopy = new Zend_Search_Lucene_Index_Term('term_text2', 'field_name1');

     // Same field: comparing keys must agree with comparing texts.
     $textGreater = $firstText->text > $secondText->text;
     $keyGreater  = $firstText->key() > $secondText->key();
     $this->assertEquals($textGreater, $keyGreater);

     $textGreaterOrEqual = $firstText->text >= $secondText->text;
     $keyGreaterOrEqual  = $firstText->key() >= $secondText->key();
     $this->assertEquals($textGreaterOrEqual, $keyGreaterOrEqual);

     // Different fields: comparing keys must agree with comparing field names.
     $fieldGreater = $firstText->field > $otherField->field;
     $keyGreater   = $firstText->key() > $otherField->key();
     $this->assertEquals($fieldGreater, $keyGreater);

     $fieldGreaterOrEqual = $firstText->field >= $otherField->field;
     $keyGreaterOrEqual   = $firstText->key() >= $otherField->key();
     $this->assertEquals($fieldGreaterOrEqual, $keyGreaterOrEqual);

     // Identical text and field: identical key.
     $this->assertEquals($secondText->key(), $secondTextCopy->key());
 }
Exemplo n.º 2
0
    /**
     * Scans terms dictionary and returns term info
     *
     * Lookup order: the in-memory term info cache, then a binary search over
     * the loaded dictionary index, and finally a sequential scan of the
     * '.tis' file starting from the closest preceding dictionary index entry.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return Zend_Search_Lucene_Index_TermInfo|null  null when the field or the term is not found
     * @throws Zend_Search_Lucene_Exception  when the '.tis' file has an unrecognized format version
     */
    public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
    {
        $termKey = $term->key();
        if (isset($this->_termInfoCache[$termKey])) {
            $termInfo = $this->_termInfoCache[$termKey];

            // Move termInfo to the end of cache
            unset($this->_termInfoCache[$termKey]);
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }


        // Lazily load the dictionary index on first use
        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($term->field);

        if ($searchField == -1) {
            // Field is not known to this segment
            return null;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            // $mid = ($highIndex - $lowIndex)/2;
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            // Order by field position first, then by term text
            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // return $this->_termDictionaryInfos[$mid]; // We got it!
                $a = $this->_termDictionaryInfos[$mid];
                $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

                // Put loaded termInfo into cache
                $this->_termInfoCache[$termKey] = $termInfo;

                return $termInfo;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        // $highIndex now points at the last index entry that sorts before the
        // requested term; the sequential scan starts from there.
        $prevPosition = $highIndex;
        $prevTerm = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // Value is not used here, but it has to be consumed so that the
            // header size subtracted in the seek below stays correct.
            $maxSkipLevels = $tisFile->readInt();
        }

        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm[1] /* text */;
        $termFieldNum = $prevTerm[0] /* field */;
        $freqPointer = $prevTermInfo[1] /* freqPointer */;
        $proxPointer = $prevTermInfo[2] /* proxPointer */;
        // Read entries while the current one still sorts before the requested term
        for ($count = $prevPosition*$indexInterval + 1;
             $count <= $termCount &&
             ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
              ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            // Term text is stored as a shared prefix length plus a suffix
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

            // Pointers are stored as deltas against the previous entry
            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        // Compare the text with strcmp(), exactly as the scan above orders it.
        // A loose '==' would treat numeric-looking strings such as '1e3' and
        // '1000' as equal and return the info of a different term.
        if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            $termInfo = null;
        }

        // Put loaded termInfo into cache
        $this->_termInfoCache[$termKey] = $termInfo;

        // '>=' rather than '==': the dictionary-index hit path above also
        // inserts into the cache, so the size can pass 1024 without ever
        // being exactly 1024 at this point.
        if (count($this->_termInfoCache) >= 1024) {
            $this->_cleanUpTermInfoCache();
        }

        return $termInfo;
    }
Exemplo n.º 3
0
 /**
  * Scans terms dictionary and returns term info
  *
  * Lookup order: the in-memory term info cache, then a binary search over
  * the dictionary index (loaded from a serialized '.sti' file when present,
  * otherwise built from '.tii' and saved as '.sti'), and finally a
  * sequential scan of the '.tis' file.
  *
  * @param Zend_Search_Lucene_Index_Term $term
  * @return Zend_Search_Lucene_Index_TermInfo|null  null when the field or the term is not found
  * @throws Zend_Search_Lucene_Exception  when the '.tis' file has an unrecognized format version
  */
 public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
 {
     $termKey = $term->key();
     if (isset($this->_termInfoCache[$termKey])) {
         $termInfo = $this->_termInfoCache[$termKey];
         // Move termInfo to the end of cache
         unset($this->_termInfoCache[$termKey]);
         $this->_termInfoCache[$termKey] = $termInfo;
         return $termInfo;
     }
     if ($this->_termDictionary === null) {
         // Check, if index is already serialized
         if ($this->_directory->fileExists($this->_name . '.sti')) {
             // Prefetch dictionary index data
             $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
             $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));
             // Load dictionary index data
             // NOTE(review): unserialize() trusts the '.sti' contents; confirm the
             // index directory cannot be written by untrusted parties.
             list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
         } else {
             // Prefetch dictionary index data
             $tiiFile = $this->openCompoundFile('.tii');
             $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
             // Load dictionary index data
             list($this->_termDictionary, $this->_termDictionaryInfos) = Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
             // Save the parsed index so later loads can take the '.sti' branch
             $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
             $stiFile = $this->_directory->createFile($this->_name . '.sti');
             $stiFile->writeBytes($stiFileData);
         }
     }
     $searchField = $this->getFieldNum($term->field);
     if ($searchField == -1) {
         // Field is not known to this segment
         return null;
     }
     $searchDicField = $this->_getFieldPosition($searchField);
     // search for appropriate value in dictionary
     $lowIndex = 0;
     $highIndex = count($this->_termDictionary) - 1;
     while ($highIndex >= $lowIndex) {
         // $mid = ($highIndex - $lowIndex)/2;
         $mid = ($highIndex + $lowIndex) >> 1;
         $midTerm = $this->_termDictionary[$mid];
         // Order by field position first, then by term text
         $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
         $delta = $searchDicField - $fieldNum;
         if ($delta == 0) {
             $delta = strcmp($term->text, $midTerm[1] /* text */);
         }
         if ($delta < 0) {
             $highIndex = $mid - 1;
         } elseif ($delta > 0) {
             $lowIndex = $mid + 1;
         } else {
             // return $this->_termDictionaryInfos[$mid]; // We got it!
             $a = $this->_termDictionaryInfos[$mid];
             $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
             // Put loaded termInfo into cache
             $this->_termInfoCache[$termKey] = $termInfo;
             return $termInfo;
         }
     }
     if ($highIndex == -1) {
         // Term is out of the dictionary range
         return null;
     }
     // $highIndex now points at the last index entry that sorts before the
     // requested term; the sequential scan starts from there.
     $prevPosition = $highIndex;
     $prevTerm = $this->_termDictionary[$prevPosition];
     $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
     $tisFile = $this->openCompoundFile('.tis');
     $tiVersion = $tisFile->readInt();
     // The format marker is 0xFFFFFFFE; comparing against 0 would reject
     // every file that carries that marker.
     if ($tiVersion != (int)0xFFFFFFFE) {
         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
     }
     $termCount = $tisFile->readLong();
     $indexInterval = $tisFile->readInt();
     $skipInterval = $tisFile->readInt();
     // 20 = header size consumed by the reads above
     $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20, SEEK_CUR);
     $termValue = $prevTerm[1] /* text */;
     $termFieldNum = $prevTerm[0] /* field */;
     $freqPointer = $prevTermInfo[1] /* freqPointer */;
     $proxPointer = $prevTermInfo[2] /* proxPointer */;
     // Read entries while the current one still sorts before the requested term
     for ($count = $prevPosition * $indexInterval + 1;
          $count <= $termCount &&
          ($this->_getFieldPosition($termFieldNum) < $searchDicField ||
           ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
            strcmp($termValue, $term->text) < 0));
          $count++) {
         // Term text is stored as a shared prefix length plus a suffix
         $termPrefixLength = $tisFile->readVInt();
         $termSuffix = $tisFile->readString();
         $termFieldNum = $tisFile->readVInt();
         $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
         // Pointers are stored as deltas against the previous entry
         $docFreq = $tisFile->readVInt();
         $freqPointer += $tisFile->readVInt();
         $proxPointer += $tisFile->readVInt();
         if ($docFreq >= $skipInterval) {
             $skipOffset = $tisFile->readVInt();
         } else {
             $skipOffset = 0;
         }
     }
     // Compare the text with strcmp(), exactly as the scan above orders it.
     // A loose '==' would treat numeric-looking strings such as '1e3' and
     // '1000' as equal and return the info of a different term.
     if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
         $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
     } else {
         $termInfo = null;
     }
     // Put loaded termInfo into cache
     $this->_termInfoCache[$termKey] = $termInfo;
     // '>=' rather than '==': the dictionary-index hit path above also
     // inserts into the cache, so the size can pass 1024 without ever
     // being exactly 1024 at this point.
     if (count($this->_termInfoCache) >= 1024) {
         $this->_cleanUpTermInfoCache();
     }
     return $termInfo;
 }
Exemplo n.º 4
0
 /**
  * Adds a document to this segment.
  *
  * For every field of the document: records term positions for indexed
  * fields (running the default analyzer over tokenized ones), computes the
  * field's norm byte, collects stored fields and registers the field. Then
  * appends one norm byte per indexed segment field and hands the stored
  * fields to addStoredFields().
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception  when a field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $fieldsToStore = array();
     $normByField   = array();
     $sim = Zend_Search_Lucene_Search_Similarity::getDefault();

     foreach ($document->getFieldNames() as $name) {
         $field = $document->getField($name);

         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }

         if ($field->isIndexed) {
             if (!$field->isTokenized) {
                 // Untokenized field: the whole value is a single term at position 0
                 $wholeValue = $field->getUtf8Value();
                 if ($wholeValue == '') {
                     // Field contains empty value. Treat it as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isTokenized = false;
                     $field->isIndexed = false;
                 } else {
                     $term = new Zend_Search_Lucene_Index_Term($wholeValue, $field->name);
                     $key = $term->key();

                     if (!isset($this->_termDictionary[$key])) {
                         // First occurrence of this term in the segment
                         $this->_termDictionary[$key] = $term;
                         $this->_termDocs[$key] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                         // Known term, first occurrence in this document
                         $this->_termDocs[$key][$this->_docCount] = array();
                     }

                     $this->_termDocs[$key][$this->_docCount][] = 0;

                     $lengthNorm = $sim->lengthNorm($field->name, 1);
                     $normByField[$field->name] = chr($sim->encodeNorm($lengthNorm * $document->boost * $field->boost));
                 }
             } else {
                 // Tokenized field: pull tokens from the default analyzer one by one
                 $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                 $analyzer->setInput($field->value, $field->encoding);

                 $tokensSeen = 0;
                 $pos = 0;
                 while (($token = $analyzer->nextToken()) !== null) {
                     $tokensSeen++;

                     $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                     $key = $term->key();

                     if (!isset($this->_termDictionary[$key])) {
                         // First occurrence of this term in the segment
                         $this->_termDictionary[$key] = $term;
                         $this->_termDocs[$key] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                         // Known term, first occurrence in this document
                         $this->_termDocs[$key][$this->_docCount] = array();
                     }

                     $pos += $token->getPositionIncrement();
                     $this->_termDocs[$key][$this->_docCount][] = $pos;
                 }

                 if ($tokensSeen === 0) {
                     // Field contains empty value. Treat it as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isTokenized = false;
                     $field->isIndexed = false;
                 } else {
                     $lengthNorm = $sim->lengthNorm($field->name, $tokensSeen);
                     $normByField[$field->name] = chr($sim->encodeNorm($lengthNorm * $document->boost * $field->boost));
                 }
             }
         }

         if ($field->isStored) {
             $fieldsToStore[] = $field;
         }

         $this->addField($field);
     }

     // Append one norm byte for this document to every indexed field of the segment
     foreach ($this->_fields as $name => $fieldInfo) {
         if ($fieldInfo->isIndexed) {
             if (!isset($this->_norms[$name])) {
                 // Field is new to the segment: back-fill the earlier documents
                 // with the norm of a zero-length field
                 $this->_norms[$name] = str_repeat(chr($sim->encodeNorm($sim->lengthNorm($name, 0))), $this->_docCount);
             }

             $this->_norms[$name] .= isset($normByField[$name])
                 ? $normByField[$name]
                 : chr($sim->encodeNorm($sim->lengthNorm($name, 0)));
         }
     }

     $this->addStoredFields($fieldsToStore);
 }
 /**
  * Adds a document to this segment.
  *
  * Registers each field, records term positions and a norm byte for indexed
  * fields, then appends one norm byte per indexed segment field and passes
  * the stored fields to addStoredFields().
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception  when a field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $fieldsToStore = array();
     $normByField   = array();
     $sim = Zend_Search_Lucene_Search_Similarity::getDefault();

     foreach ($document->getFieldNames() as $name) {
         $field = $document->getField($name);
         $this->addField($field);

         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }

         if ($field->isIndexed) {
             // Tokenized fields go through the default analyzer; otherwise the
             // whole value becomes a single token
             $tokens = $field->isTokenized
                 ? Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue)
                 : array(new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)));

             $lengthNorm = $sim->lengthNorm($field->name, count($tokens));
             $normByField[$field->name] = chr($sim->encodeNorm($lengthNorm));

             $pos = 0;
             foreach ($tokens as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $key = $term->key();

                 if (!isset($this->_termDictionary[$key])) {
                     // First occurrence of this term in the segment
                     $this->_termDictionary[$key] = $term;
                     $this->_termDocs[$key] = array($this->_docCount => array());
                 } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                     // Known term, first occurrence in this document
                     $this->_termDocs[$key][$this->_docCount] = array();
                 }

                 $pos += $token->getPositionIncrement();
                 $this->_termDocs[$key][$this->_docCount][] = $pos;
             }
         }

         if ($field->isStored) {
             $fieldsToStore[] = $field;
         }
     }

     // Append one norm byte for this document to every indexed field of the segment
     foreach ($this->_fields as $name => $fieldInfo) {
         if ($fieldInfo->isIndexed) {
             if (!isset($this->_norms[$name])) {
                 // Field is new to the segment: back-fill the earlier documents
                 // with the norm of a zero-length field
                 $this->_norms[$name] = str_repeat(chr($sim->encodeNorm($sim->lengthNorm($name, 0))), $this->_docCount);
             }

             $this->_norms[$name] .= isset($normByField[$name])
                 ? $normByField[$name]
                 : chr($sim->encodeNorm($sim->lengthNorm($name, 0)));
         }
     }

     $this->addStoredFields($fieldsToStore);
 }
Exemplo n.º 6
0
 /**
  * Adds a document to this segment.
  *
  * Registers each field, records the token count and term positions of
  * indexed fields, writes the document's stored fields to the '.fdt' file
  * with a matching pointer in '.fdx', and increments the document counter.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception  when a field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $storedFields = array();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         $this->_addFieldInfo($field);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
             } else {
                 // Untokenized field: the whole value is a single token
                 $tokenList = array();
                 $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
             }
             $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
             $position = 0;
             foreach ($tokenList as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $termKey = $term->key();
                 if (!isset($this->_termDictionary[$termKey])) {
                     // New term
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey] = array();
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 } else {
                     if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                 }
                 $position += $token->getPositionIncrement();
                 $this->_termDocs[$termKey][$this->_docCount][] = $position;
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
     }

     // Write the stored-field entry for EVERY document, even one that stores
     // nothing. '.fdx' is addressed by document number, so skipping a document
     // here would shift the entries of all documents added after it.
     if (!isset($this->_fdxFile)) {
         $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
         $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
         $this->_files[] = $this->_name . '.fdx';
         $this->_files[] = $this->_name . '.fdt';
     }
     // Pointer to where this document's field data starts in '.fdt'
     $this->_fdxFile->writeLong($this->_fdtFile->tell());
     $this->_fdtFile->writeVInt(count($storedFields));
     foreach ($storedFields as $field) {
         $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
         $fieldBits = ($field->isTokenized ? 0x1 : 0x0) | ($field->isBinary ? 0x2 : 0x0) | 0x0;
         /* 0x04 - third bit, compressed (ZLIB) */
         $this->_fdtFile->writeByte($fieldBits);
         if ($field->isBinary) {
             // Binary values are written as a byte length followed by raw bytes
             $this->_fdtFile->writeVInt(strlen($field->stringValue));
             $this->_fdtFile->writeBytes($field->stringValue);
         } else {
             $this->_fdtFile->writeString($field->stringValue);
         }
     }

     $this->_docCount++;
 }