Exemple #1
0
    /**
     * Reset terms stream
     *
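     * $startId - id for the first document
     * $mode    - terms scan mode (one of the self::SM_* constants); with
     *            self::SM_MERGE_INFO document ids are assigned without gaps
     *            for deleted documents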
     *
     * Returns start document id for the next segment
     *
     * @param integer $startId
     * @param integer $mode
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
    {
        /**
         * The declared parameter list is empty, so the two optional arguments
         * are picked up dynamically:
         *   $startId (default value is 0)
         *   $mode    (default value is self::SM_TERMS_ONLY)
         */
        $argList  = func_get_args();
        $argCount = count($argList);
        if ($argCount > 2) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
        }
        $startId = ($argCount >= 1) ? $argList[0] : 0;
        $mode    = ($argCount == 2) ? $argList[1] : self::SM_TERMS_ONLY;

        // Reject an unknown scan mode up front, before any file is reopened or
        // any stream state is overwritten, so a bad call cannot leave the object
        // half reset. The comparison is intentionally loose (==), exactly like
        // the comparison a switch statement performs.
        $knownModes = array(self::SM_TERMS_ONLY, self::SM_FULL_INFO, self::SM_MERGE_INFO);
        if (!in_array($mode, $knownModes)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
        }
        $isMergeMode = ($mode == self::SM_MERGE_INFO);

        // Drop the previous handle first, then reopen the term dictionary and
        // remember the position it starts at.
        $this->_tisFile       = null;
        $this->_tisFile       = $this->openCompoundFile('.tis', false);
        $this->_tisFileOffset = $this->_tisFile->tell();

        $tiVersion = $this->_tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $this->_termNum       = $this->_tisFile->readLong(); // Read terms count
        $this->_termCount     = $this->_termNum;
        $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
        $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // Max skip levels: the value is not used here, but it has to be
            // consumed so that the file position moves past the header.
            $this->_tisFile->readInt();
        }

        // Forget everything that belonged to the previous scan.
        $this->_frqFile = null;
        $this->_prxFile = null;
        $this->_docMap  = array();

        $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
        $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
        $this->_lastTermPositions = null;

        $this->_termsScanMode = $mode;

        // SM_TERMS_ONLY needs nothing else. The two remaining (already
        // validated) modes also need the frequency and proximity files and a
        // document id map.
        if ($mode != self::SM_TERMS_ONLY) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_frqFileOffset = $this->_frqFile->tell();

            $this->_prxFile = $this->openCompoundFile('.prx', false);
            $this->_prxFileOffset = $this->_prxFile->tell();

            // Deleted documents get no map entry. In merge mode the surviving
            // documents are numbered consecutively from $startId; otherwise
            // each one keeps its own offset within the segment.
            for ($count = 0; $count < $this->_docCount; $count++) {
                if (!$this->isDeleted($count)) {
                    $this->_docMap[$count] = $startId + ($isMergeMode ? count($this->_docMap) : $count);
                }
            }
        }

        $this->nextTerm();

        // First document id available to the segment that follows this one.
        return $startId + ($isMergeMode ? count($this->_docMap) : $this->_docCount);
    }
Exemple #2
0
 /**
  * Add term
  *
  * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
  *
  * @param Zend_Search_Lucene_Index_Term $termEntry
  * @param array $termDocs
  */
 public function addTerm($termEntry, $termDocs)
 {
     // Remember where this term's data starts in the frequency and proximity files.
     $freqPointer = $this->_frqFile->tell();
     $proxPointer = $this->_prxFile->tell();

     $previousDocId = 0;
     foreach ($termDocs as $docId => $termPositions) {
         $occurrences   = count($termPositions);
         $shiftedDelta  = 2 * ($docId - $previousDocId);
         $previousDocId = $docId;

         if ($occurrences <= 1) {
             // One value only: the doubled doc delta with its low bit set,
             // and no separate occurrence count.
             $this->_frqFile->writeVInt($shiftedDelta + 1);
         } else {
             // Doubled doc delta (low bit clear) followed by the occurrence count.
             $this->_frqFile->writeVInt($shiftedDelta);
             $this->_frqFile->writeVInt($occurrences);
         }

         // Positions are stored as differences from the preceding position.
         $lastPosition = 0;
         foreach ($termPositions as $position) {
             $this->_prxFile->writeVInt($position - $lastPosition);
             $lastPosition = $position;
         }
     }

     $docFreq    = count($termDocs);
     $skipOffset = 0;
     if ($docFreq >= self::$skipInterval) {
         /**
          * @todo Write Skip Data to a freq file.
          * It's not used now, but make index more optimal
          */
         $skipOffset = $this->_frqFile->tell() - $freqPointer;
     }

     // The dictionary entry refers to the field by its number, not by its name.
     $fieldNumber = $this->_fields[$termEntry->field]->number;
     $term        = new Zend_Search_Lucene_Index_Term($termEntry->text, $fieldNumber);
     $termInfo    = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

     $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

     // Every self::$indexInterval-th term is also written to the .tii file,
     // followed by the distance the .tis file advanced since the last such entry.
     $isIndexedTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
     if ($isIndexedTerm) {
         $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

         $indexPosition = $this->_tisFile->tell();
         $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
         $this->_lastIndexPosition = $indexPosition;
     }

     $this->_termCount++;
 }
 /**
  * Reset terms stream
  *
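  * $startId - id for the first document
  * $mode    - terms scan mode (one of the self::SM_* constants); with
  *            self::SM_MERGE_INFO document ids are assigned without gaps
  *            for deleted documents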
  *
  * Returns start document id for the next segment
  *
  * @param integer $startId
  * @param integer $mode
  * @throws Zend_Search_Lucene_Exception
  * @return integer
  */
 public function reset($startId = 0, $mode = self::SM_TERMS_ONLY)
 {
     // Reject an unknown scan mode up front, before any file is reopened or
     // any stream state is overwritten, so a bad call cannot leave the object
     // half reset. The comparison is intentionally loose (==), exactly like
     // the comparison a switch statement performs.
     $knownModes = array(self::SM_TERMS_ONLY, self::SM_FULL_INFO, self::SM_MERGE_INFO);
     if (!in_array($mode, $knownModes)) {
         require_once 'Zend/Search/Lucene/Exception.php';
         throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
     }
     $isMergeMode = ($mode == self::SM_MERGE_INFO);

     // Drop the previous handle first, then reopen the term dictionary and
     // remember the position it starts at.
     $this->_tisFile = null;
     $this->_tisFile = $this->openCompoundFile('.tis', false);
     $this->_tisFileOffset = $this->_tisFile->tell();

     // Two header markers are accepted; the second one carries one extra
     // header field (see below).
     $tiVersion = $this->_tisFile->readInt();
     if ($tiVersion != (int) 0xfffffffe && $tiVersion != (int) 0xfffffffd) {
         require_once 'Zend/Search/Lucene/Exception.php';
         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
     }

     // Read terms count
     $this->_termNum = $this->_tisFile->readLong();
     $this->_termCount = $this->_termNum;
     // Read Index interval
     $this->_indexInterval = $this->_tisFile->readInt();
     // Read skip interval
     $this->_skipInterval = $this->_tisFile->readInt();
     if ($tiVersion == (int) 0xfffffffd) {
         // Max skip levels: the value is not used here, but it has to be
         // consumed so that the file position moves past the header.
         $this->_tisFile->readInt();
     }

     // Forget everything that belonged to the previous scan.
     $this->_frqFile = null;
     $this->_prxFile = null;
     $this->_docMap = array();
     $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1);
     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
     $this->_lastTermPositions = null;
     $this->_termsScanMode = $mode;

     // SM_TERMS_ONLY needs nothing else. The two remaining (already
     // validated) modes also need the frequency and proximity files and a
     // document id map.
     if ($mode != self::SM_TERMS_ONLY) {
         $this->_frqFile = $this->openCompoundFile('.frq', false);
         $this->_frqFileOffset = $this->_frqFile->tell();
         $this->_prxFile = $this->openCompoundFile('.prx', false);
         $this->_prxFileOffset = $this->_prxFile->tell();

         // Deleted documents get no map entry. In merge mode the surviving
         // documents are numbered consecutively from $startId; otherwise
         // each one keeps its own offset within the segment.
         for ($count = 0; $count < $this->_docCount; $count++) {
             if (!$this->isDeleted($count)) {
                 $this->_docMap[$count] = $startId + ($isMergeMode ? count($this->_docMap) : $count);
             }
         }
     }

     $this->nextTerm();

     // First document id available to the segment that follows this one.
     return $startId + ($isMergeMode ? count($this->_docMap) : $this->_docCount);
 }
 /**
  * Reset terms stream
  *
  * $startId - id for the first document
  * $compact - remove deleted documents
  *
  * Returns start document id for the next segment
  *
  * @param integer $startId
  * @param boolean $compact
  * @throws Zend_Search_Lucene_Exception
  * @return integer
  */
 public function reset($startId = 0, $compact = false)
 {
     // Drop the previous handle first, then reopen the term dictionary.
     $this->_tisFile = null;
     $this->_tisFile = $this->openCompoundFile('.tis', false);

     // The header must carry the 0xFFFFFFFE format marker. The previous
     // comparison against "(int) 0.0" (i.e. 0) could never match that marker,
     // so files carrying it were rejected. Only this marker is accepted because
     // the header parsing below reads exactly three fields after it.
     // NOTE(review): the (int) cast is presumably what makes the literal
     // comparable with the value readInt() returns - confirm against readInt().
     $tiVersion = $this->_tisFile->readInt();
     if ($tiVersion != (int) 0xFFFFFFFE) {
         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
     }

     // Read terms count
     $this->_termCount = $this->_tisFile->readLong();
     // Read Index interval (consumed to advance the file position; the value is not kept)
     $this->_tisFile->readInt();
     // Read skip interval
     $this->_skipInterval = $this->_tisFile->readInt();

     // Reopen the frequency file and remember the position it starts at.
     $this->_frqFile = null;
     $this->_frqFile = $this->openCompoundFile('.frq', false);
     $this->_frqFileOffset = $this->_frqFile->tell();

     // Same for the proximity file.
     $this->_prxFile = null;
     $this->_prxFile = $this->openCompoundFile('.prx', false);
     $this->_prxFileOffset = $this->_prxFile->tell();

     $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1);
     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

     // Deleted documents get no map entry. When compacting, the surviving
     // documents are numbered consecutively from $startId; otherwise each one
     // keeps its own offset within the segment.
     $this->_docMap = array();
     for ($count = 0; $count < $this->_docCount; $count++) {
         if (!$this->isDeleted($count)) {
             $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
         }
     }

     $this->nextTerm();

     // First document id available to the segment that follows this one.
     return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
 }
 /**
  * Adds a document to this segment.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     // Fields whose values have to be written to the stored-fields files.
     $storedFields = array();

     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         $this->_addFieldInfo($field);

         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }

         if ($field->isIndexed) {
             // A tokenized field goes through the default analyzer; any other
             // indexed field becomes a single token spanning the whole value.
             $tokenList = $field->isTokenized
                 ? Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue)
                 : array(new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)));

             $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);

             $position = 0;
             foreach ($tokenList as $token) {
                 $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $termKey = $term->key();

                 if (!isset($this->_termDictionary[$termKey])) {
                     // Term seen for the first time: register it with an empty postings list.
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey]       = array();
                 }
                 if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                     // First occurrence of the term in the current document.
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 }

                 $position += $token->getPositionIncrement();
                 $this->_termDocs[$termKey][$this->_docCount][] = $position;
             }
         }

         if ($field->isStored) {
             $storedFields[] = $field;
         }
     }

     if (count($storedFields) > 0) {
         if (!isset($this->_fdxFile)) {
             // The stored-fields files are created only once a document needs them.
             $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
             $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
             $this->_files[] = $this->_name . '.fdx';
             $this->_files[] = $this->_name . '.fdt';
         }

         // .fdx receives the position in .fdt at which this document's record begins.
         $this->_fdxFile->writeLong($this->_fdtFile->tell());
         $this->_fdtFile->writeVInt(count($storedFields));

         foreach ($storedFields as $field) {
             $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

             // Flag byte: 0x01 - tokenized, 0x02 - binary.
             /* 0x04 - third bit, compressed (ZLIB) - never set here */
             $fieldBits = 0x0;
             if ($field->isTokenized) {
                 $fieldBits |= 0x1;
             }
             if ($field->isBinary) {
                 $fieldBits |= 0x2;
             }
             $this->_fdtFile->writeByte($fieldBits);

             if ($field->isBinary) {
                 // Binary values are written as a byte count followed by the raw bytes.
                 $this->_fdtFile->writeVInt(strlen($field->stringValue));
                 $this->_fdtFile->writeBytes($field->stringValue);
             } else {
                 $this->_fdtFile->writeString($field->stringValue);
             }
         }
     }

     $this->_docCount++;
 }