Ejemplo n.º 1
0
 /**
  * Class constructor.  Create a new multi-term query object.
  *
  * When $terms is not an array the object is left untouched (no terms, no
  * signs are assigned here).
  *
  * If the $signs array is omitted, or every entry in it is strictly true,
  * all terms are required and the signs are stored as null.  This differs
  * from addTerm() behavior, but should never be used.
  *
  * A terms-per-query limit of 0 is treated as "no limit".
  *
  * @param array $terms    Array of \ZendSearch\Lucene\Index\Term objects
  * @param array $signs    Array of signs.  Sign is boolean|null.
  * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
  *         when the number of terms exceeds a non-zero terms-per-query limit
  */
 public function __construct($terms = null, $signs = null)
 {
     if (!is_array($terms)) {
         return;
     }

     // 0 disables the limit; without this guard any non-empty term list
     // would be rejected when the limit is switched off.
     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
     if ($maxTerms != 0 && count($terms) > $maxTerms) {
         throw new InvalidArgumentException('Terms per query limit is reached.');
     }

     $this->_terms = $terms;
     $this->_signs = null;

     // Keep the signs only if at least one term is not strictly "required";
     // an all-true list is equivalent to null.
     if (is_array($signs)) {
         foreach ($signs as $sign) {
             if ($sign !== true) {
                 $this->_signs = $signs;
                 break;
             }
         }
     }
 }
Ejemplo n.º 2
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * The wildcard pattern is turned into an anchored regular expression
  * ("?" matches exactly one character, "*" matches any run of characters)
  * and every index term that shares the pattern's literal prefix is tested
  * against it.  When the pattern has no field, all field names returned by
  * $index->getFieldNames(true) are scanned.
  *
  * Matching terms are collected in $this->_matches.  The result is an
  * EmptyResult for no matches, a single Term query for one match, and a
  * MultiTerm query otherwise.
  *
  * @param \ZendSearch\Lucene\SearchIndexInterface $index
  * @throws \ZendSearch\Lucene\Exception\RuntimeException
  *         when the literal prefix is shorter than self::$_minPrefixLength
  * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException
  *         when more terms match than a non-zero terms-per-query limit allows
  * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndexInterface $index)
 {
     $this->_matches = array();

     if ($this->_pattern->field === null) {
         // Search through all fields
         $fields = $index->getFieldNames(true);
     } else {
         $fields = array($this->_pattern->field);
     }

     $prefix = self::_getPrefix($this->_pattern->text);
     $prefixLength = strlen($prefix);

     // Reject too-short prefixes before doing any further work.
     if ($prefixLength < self::$_minPrefixLength) {
         throw new RuntimeException('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
     }

     // Quote the whole pattern first, then turn the (now escaped) wildcards
     // back into their regex equivalents.
     $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*'), preg_quote($this->_pattern->text, '/')) . '$/';

     /**
      * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
      */
     // Probe for PCRE Unicode support; ErrorHandler presumably traps the
     // warning emitted when the "u" modifier is unavailable.
     ErrorHandler::start(E_WARNING);
     $result = preg_match('/\\pL/u', 'a');
     ErrorHandler::stop();
     if ($result == 1) {
         // PCRE unicode support is turned on
         // add Unicode modifier to the match expression
         $matchExpression .= 'u';
     }

     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
     $hasPrefix = ($prefix != '');

     foreach ($fields as $field) {
         $index->resetTermsStream();
         $index->skipTo(new Index\Term($hasPrefix ? $prefix : '', $field));

         while (($candidate = $index->currentTerm()) !== null && $candidate->field == $field) {
             // Byte-wise prefix test.  A loose "==" on the substrings would
             // treat numeric-looking strings such as "1e2" and "100" as equal.
             if ($hasPrefix && strncmp($candidate->text, $prefix, $prefixLength) !== 0) {
                 break;
             }

             if (preg_match($matchExpression, $candidate->text) === 1) {
                 $this->_matches[] = $candidate;
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
             }

             $index->nextTerm();
         }

         $index->closeTermsStream();
     }

     $matchCount = count($this->_matches);
     if ($matchCount == 0) {
         return new EmptyResult();
     }
     if ($matchCount == 1) {
         return new Term(reset($this->_matches));
     }

     $rewrittenQuery = new MultiTerm();
     foreach ($this->_matches as $matchedTerm) {
         $rewrittenQuery->addTerm($matchedTerm);
     }

     return $rewrittenQuery;
 }
Ejemplo n.º 3
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Every index term sharing the fuzzy term's prefix (or every term of the
  * field when the prefix is empty) is scored by an edit-distance based
  * similarity.  Terms whose similarity exceeds $this->_minimumSimilarity are
  * collected in $this->_matches, with their keys in $this->_termKeys and
  * their rescaled scores in $this->_scores.
  *
  * The result is an EmptyResult for no matches, a single Term query for one
  * match, and otherwise a Boolean query of at most self::MAX_CLAUSE_COUNT
  * boosted Term subqueries, best score first.
  *
  * @param \ZendSearch\Lucene\SearchIndexInterface $index
  * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException
  *         when more terms match than a non-zero terms-per-query limit allows
  * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndexInterface $index)
 {
     $this->_matches = array();
     $this->_scores = array();
     $this->_termKeys = array();

     if ($this->_term->field === null) {
         // Search through all fields
         $fields = $index->getFieldNames(true);
     } else {
         $fields = array($this->_term->field);
     }

     $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
     $prefixByteLength = strlen($prefix);
     $prefixUtf8Length = Index\Term::getLength($prefix);
     $termRest = substr($this->_term->text, $prefixByteLength);
     // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
     $termRestLength = strlen($termRest);

     // Maps similarities from (minimumSimilarity, 1] onto (0, 1].
     $scaleFactor = 1 / (1 - $this->_minimumSimilarity);
     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();

     foreach ($fields as $field) {
         $index->resetTermsStream();

         if ($prefix != '') {
             $index->skipTo(new Index\Term($prefix, $field));

             // Byte-wise prefix test.  A loose "==" on the substrings would
             // treat numeric-looking strings such as "1e2" and "100" as equal
             // and let terms with a different prefix be scored on their rest.
             while (($candidate = $index->currentTerm()) !== null
                 && $candidate->field == $field
                 && strncmp($candidate->text, $prefix, $prefixByteLength) === 0
             ) {
                 // Calculate similarity
                 $target = substr($candidate->text, $prefixByteLength);
                 $targetLength = strlen($target);
                 $maxDistance = isset($this->_maxDistances[$targetLength])
                     ? $this->_maxDistances[$targetLength]
                     : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

                 if ($termRestLength == 0) {
                     // we don't have anything to compare.  That means if we just add
                     // the letters for current term we get the new word
                     $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $targetLength / $prefixUtf8Length;
                 } elseif ($targetLength == 0) {
                     $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
                 } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                     // The length difference alone is a lower bound for the edit
                     // distance (e.g. "pre" vs "prefixes" needs at least 8 - 3 = 5
                     // edits), so if it already exceeds the allowed maximum the
                     // term can be discarded without running levenshtein().
                     $similarity = 0;
                 } else {
                     $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, $targetLength));
                 }

                 if ($similarity > $this->_minimumSimilarity) {
                     $this->_matches[] = $candidate;
                     $this->_termKeys[] = $candidate->key();
                     $this->_scores[] = ($similarity - $this->_minimumSimilarity) * $scaleFactor;
                     if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                         throw new OutOfBoundsException('Terms per query limit is reached.');
                     }
                 }

                 $index->nextTerm();
             }
         } else {
             $index->skipTo(new Index\Term('', $field));

             while (($candidate = $index->currentTerm()) !== null && $candidate->field == $field) {
                 // Calculate similarity
                 $target = $candidate->text;
                 $targetLength = strlen($target);
                 $maxDistance = isset($this->_maxDistances[$targetLength])
                     ? $this->_maxDistances[$targetLength]
                     : $this->_calculateMaxDistance(0, $termRestLength, $targetLength);
                 $shorterLength = min($termRestLength, $targetLength);

                 if ($maxDistance < abs($termRestLength - $targetLength)) {
                     // Length difference alone already exceeds the allowed edit
                     // distance; discard without running levenshtein().
                     $similarity = 0;
                 } elseif ($shorterLength == 0) {
                     // Nothing to compare against; avoid dividing by zero and
                     // treat the term as not similar, as the prefixed branch
                     // does when its denominator is zero.
                     $similarity = 0;
                 } else {
                     $similarity = 1 - levenshtein($termRest, $target) / $shorterLength;
                 }

                 if ($similarity > $this->_minimumSimilarity) {
                     $this->_matches[] = $candidate;
                     $this->_termKeys[] = $candidate->key();
                     $this->_scores[] = ($similarity - $this->_minimumSimilarity) * $scaleFactor;
                     if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                         throw new OutOfBoundsException('Terms per query limit is reached.');
                     }
                 }

                 $index->nextTerm();
             }
         }

         $index->closeTermsStream();
     }

     $matchCount = count($this->_matches);
     if ($matchCount == 0) {
         return new EmptyResult();
     }
     if ($matchCount == 1) {
         return new Term(reset($this->_matches));
     }

     $rewrittenQuery = new Boolean();

     // Order by score (descending), ties broken by term key (ascending);
     // $this->_matches is reordered in step with the two sort columns.
     array_multisort($this->_scores, SORT_DESC, SORT_NUMERIC, $this->_termKeys, SORT_ASC, SORT_STRING, $this->_matches);

     $termCount = 0;
     foreach ($this->_matches as $id => $matchedTerm) {
         $subquery = new Term($matchedTerm);
         $subquery->setBoost($this->_scores[$id]);
         $rewrittenQuery->addSubquery($subquery);

         // Only the best MAX_CLAUSE_COUNT matches make it into the query.
         $termCount++;
         if ($termCount >= self::MAX_CLAUSE_COUNT) {
             break;
         }
     }

     return $rewrittenQuery;
 }
Ejemplo n.º 4
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Walks the index terms of each searched field from the lower bound (or the
  * start of the field when there is no lower bound) up to the upper bound
  * (or the end of the field when there is no upper bound) and collects them
  * in $this->_matches.  The bounds themselves are included only when
  * $this->_inclusive is set.  When no field is set, all field names returned
  * by $index->getFieldNames(true) are scanned.
  *
  * Term texts are compared byte-wise, never numerically.
  *
  * The result is an EmptyResult for no matches, a single Term query for one
  * match, and a MultiTerm query otherwise.
  *
  * @param \ZendSearch\Lucene\SearchIndexInterface $index
  * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException
  *         when more terms match than a non-zero terms-per-query limit allows
  * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndexInterface $index)
 {
     $this->_matches = array();

     if ($this->_field === null) {
         // Search through all fields
         $fields = $index->getFieldNames(true);
     } else {
         $fields = array($this->_field);
     }

     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();

     foreach ($fields as $field) {
         $index->resetTermsStream();

         if ($this->_lowerTerm !== null) {
             $lowerTerm = new Index\Term($this->_lowerTerm->text, $field);
             $index->skipTo($lowerTerm);

             // Skip the lower term itself for an exclusive range.  The text is
             // compared with strcmp() because a loose "==" treats numeric
             // strings such as "1" and "1.0" as equal.
             $candidate = $index->currentTerm();
             if (!$this->_inclusive
                 && $candidate !== null
                 && $candidate->field == $lowerTerm->field
                 && strcmp((string) $candidate->text, (string) $lowerTerm->text) === 0
             ) {
                 $index->nextTerm();
             }
         } else {
             $index->skipTo(new Index\Term('', $field));
         }

         if ($this->_upperTerm !== null) {
             // Walk up to the upper term
             $upperTerm = new Index\Term($this->_upperTerm->text, $field);
             $upperText = (string) $upperTerm->text;

             // strcmp() instead of "<": the latter compares numeric strings
             // numerically ("10" < "9" is false), which would end the walk
             // early.  NOTE(review): assumes the term stream is ordered
             // byte-wise -- confirm against the index implementation.
             while (($candidate = $index->currentTerm()) !== null
                 && $candidate->field == $field
                 && strcmp((string) $candidate->text, $upperText) < 0
             ) {
                 $this->_matches[] = $candidate;
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
                 $index->nextTerm();
             }

             $candidate = $index->currentTerm();
             if ($this->_inclusive
                 && $candidate !== null
                 && $candidate->field == $upperTerm->field
                 && strcmp((string) $candidate->text, $upperText) === 0
             ) {
                 // Include upper term into result; it counts towards the
                 // terms-per-query limit like any other match.
                 $this->_matches[] = $upperTerm;
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
             }
         } else {
             // Walk up to the end of field data
             while (($candidate = $index->currentTerm()) !== null && $candidate->field == $field) {
                 $this->_matches[] = $candidate;
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
                 $index->nextTerm();
             }
         }

         $index->closeTermsStream();
     }

     $matchCount = count($this->_matches);
     if ($matchCount == 0) {
         return new EmptyResult();
     }
     if ($matchCount == 1) {
         return new Term(reset($this->_matches));
     }

     $rewrittenQuery = new MultiTerm();
     foreach ($this->_matches as $matchedTerm) {
         $rewrittenQuery->addTerm($matchedTerm);
     }

     return $rewrittenQuery;
 }