Beispiel #1
0
    /**
     * Translates a generic Query object into a Lucene search argument list.
     *
     * The first element of the returned list is a Boolean query; when the
     * generic query carries a query string, its parsed form is added to that
     * Boolean query as a required sub-query. One (field name, SORT_REGULAR,
     * SORT_ASC|SORT_DESC) triple follows for every sort criterion.
     * NOTE(review): the list looks like it is meant to be spread over a
     * Lucene index find() call — confirm against the caller.
     *
     * Side effect: the default search field and the query parser's default
     * operator are set statically from the query string settings.
     *
     * @param Query $query
     * @return array
     */
    public function mapQuery(Query $query)
    {
        $booleanQuery = new Boolean();

        if ($query->hasQueryString()) {
            $queryString = $query->getQueryString();

            Lucene::setDefaultSearchField($queryString->getDefaultField());

            if ($queryString->getDefaultOperator() == Query::OPERATOR_AND) {
                QueryParser::setDefaultOperator(QueryParser::B_AND);
            } else {
                QueryParser::setDefaultOperator(QueryParser::B_OR);
            }

            $rawQuery = $queryString->getQuery();

            if ("*" !== $rawQuery) {
                $parsed = QueryParser::parse($rawQuery);
            } else {
                // A lone "*" is not handed to the parser; it becomes a
                // wildcard query that needs no literal prefix.
                $parsed = new Wildcard(new Term($rawQuery));
                $parsed->setMinPrefixLength(0);
            }

            $booleanQuery->addSubquery($parsed, true);
        }

        $searchArgs = array($booleanQuery);

        foreach ($query->getSort() as $criterion) {
            // Each criterion is read as a one-entry map: field name => direction.
            $direction = current($criterion) == 'asc' ? SORT_ASC : SORT_DESC;
            array_push($searchArgs, key($criterion), SORT_REGULAR, $direction);
        }

        return $searchArgs;
    }
Beispiel #2
0
 /**
  * Returns the Lucene index located at $this->path.
  *
  * The index is opened when checkEsists() reports it as present and
  * created otherwise.
  *
  * @return \Zend\Search\Lucene\SearchIndexInterface
  */
 public function get()
 {
     return $this->checkEsists()
         ? Lucene::open($this->path)
         : Lucene::create($this->path);
 }
 /**
  * Searches through a MultiSearcher that wraps the same sample index twice
  * and checks both the hit count and the type of each hit's document.
  *
  * @covers Zend\Search\Lucene\MultiSearcher::find
  * @covers Zend\Search\Lucene\Search\QueryHit::getDocument
  */
 public function testFind()
 {
     $index = new Lucene\MultiSearcher(array(Lucene\Lucene::open(__DIR__ . '/_indexSample/_files'), Lucene\Lucene::open(__DIR__ . '/_indexSample/_files')));
     $hits = $index->find('submitting');
     // Two searchers over the same index; presumably three hits each.
     // The expected value goes first so a failure is reported as
     // "expected 6, got N" instead of the other way round.
     $this->assertEquals(2 * 3, count($hits));
     foreach ($hits as $hit) {
         $document = $hit->getDocument();
         $this->assertTrue($document instanceof Lucene\Document);
     }
 }
Beispiel #4
0
    /**
     * Exports the given objects and adds them to the Lucene index that
     * belongs to the class metadata.
     *
     * The index is opened from "/tmp/index_<name>", where <name> is the
     * index name taken from the metadata. Nothing is returned.
     *
     * @param ClassMetadata $metadata
     * @param array $objects
     * @return void
     */
    public function run(ClassMetadata $metadata, array $objects)
    {
        $indexName = $metadata->getIndex()->getName();
        $luceneIndex = Lucene::open("/tmp/index_" . $indexName);

        foreach ($objects as $object) {
            $luceneIndex->addDocument($this->exportObject($metadata, $object));
        }
    }
Beispiel #5
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Every searched field is scanned in the index's terms stream, starting at
  * the literal prefix of the wildcard pattern. Terms that match the pattern
  * are collected in $this->_matches and turned into the returned query:
  * EmptyResult for no match, Term for one match, MultiTerm otherwise.
  *
  * @param \Zend\Search\Lucene\SearchIndexInterface $index
  * @throws \Zend\Search\Lucene\Exception\RuntimeException      if the pattern's literal prefix is shorter than self::$_minPrefixLength
  * @throws \Zend\Search\Lucene\Exception\OutOfBoundsException  if more terms match than the (non-zero) terms per query limit
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndexInterface $index)
 {
     $this->_matches = array();

     // Without an explicit field the pattern is searched through all fields.
     $fields = $this->_pattern->field === null
         ? $index->getFieldNames(true)
         : array($this->_pattern->field);

     $literalPrefix = self::_getPrefix($this->_pattern->text);
     $literalPrefixLength = strlen($literalPrefix);

     // Quote the pattern for PCRE, then turn the quoted wildcards back into
     // regex syntax: "?" matches one character, "*" matches any run.
     $quotedPattern = preg_quote($this->_pattern->text, '/');
     $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern) . '$/';

     if ($literalPrefixLength < self::$_minPrefixLength) {
         throw new RuntimeException('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
     }

     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     if (@preg_match('/\\pL/u', 'a') == 1) {
         // PCRE unicode support is available: match in Unicode mode.
         $matchExpression .= 'u';
     }

     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();

     // With a literal prefix the scan starts at the prefix and stops at the
     // first term that no longer carries it; without one, the whole field
     // is scanned starting from the empty term.
     $usePrefix = $literalPrefix != '';

     foreach ($fields as $field) {
         $index->resetTermsStream();
         $index->skipTo(new Index\Term($usePrefix ? $literalPrefix : '', $field));

         while (($candidate = $index->currentTerm()) !== null && $candidate->field == $field) {
             if ($usePrefix && substr($candidate->text, 0, $literalPrefixLength) != $literalPrefix) {
                 break;
             }

             if (preg_match($matchExpression, $candidate->text) === 1) {
                 $this->_matches[] = $candidate;
                 // A limit of 0 means "no limit".
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
             }

             $index->nextTerm();
         }

         $index->closeTermsStream();
     }

     $matchCount = count($this->_matches);

     if ($matchCount == 0) {
         return new EmptyResult();
     }

     if ($matchCount == 1) {
         return new Term(reset($this->_matches));
     }

     $multiTermQuery = new MultiTerm();
     foreach ($this->_matches as $matchedTerm) {
         $multiTermQuery->addTerm($matchedTerm);
     }

     return $multiTermQuery;
 }
Beispiel #6
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * For every searched field the index's terms stream is walked from the
  * lower bound (or the start of the field) up to the upper bound (or the
  * end of the field). The visited terms are collected in $this->_matches
  * and turned into the returned query: EmptyResult for no match, Term for
  * one match, MultiTerm otherwise.
  *
  * Term texts are compared byte-wise with strcmp(). PHP's "<" and "=="
  * compare numeric-looking strings as numbers ("10" < "9" is false), which
  * would end the walk too early or mis-detect a boundary term.
  * NOTE(review): this assumes the terms stream is ordered byte-wise —
  * confirm against the index implementation.
  *
  * @param \Zend\Search\Lucene\SearchIndex $index
  * @throws \Zend\Search\Lucene\Exception\OutOfBoundsException  if more terms match than the (non-zero) terms per query limit
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndex $index)
 {
     $this->_matches = array();
     if ($this->_field === null) {
         // Search through all fields
         $fields = $index->getFieldNames(true);
     } else {
         $fields = array($this->_field);
     }
     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
     foreach ($fields as $field) {
         $index->resetTermsStream();
         if ($this->_lowerTerm !== null) {
             $lowerTerm = new Index\Term($this->_lowerTerm->text, $field);
             $index->skipTo($lowerTerm);
             $current = $index->currentTerm();
             if (!$this->_inclusive
                 && $current !== null
                 && $current->field == $field
                 && strcmp($current->text, $lowerTerm->text) === 0
             ) {
                 // Exclusive range: skip the lower term itself
                 $index->nextTerm();
             }
         } else {
             $index->skipTo(new Index\Term('', $field));
         }
         if ($this->_upperTerm !== null) {
             // Walk up to the upper term
             $upperTerm = new Index\Term($this->_upperTerm->text, $field);
             while ($index->currentTerm() !== null
                 && $index->currentTerm()->field == $field
                 && strcmp($index->currentTerm()->text, $upperTerm->text) < 0
             ) {
                 $this->_matches[] = $index->currentTerm();
                 // A limit of 0 means "no limit"
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
                 $index->nextTerm();
             }
             $current = $index->currentTerm();
             if ($this->_inclusive
                 && $current !== null
                 && $current->field == $field
                 && strcmp($current->text, $upperTerm->text) === 0
             ) {
                 // Include upper term into result
                 $this->_matches[] = $upperTerm;
             }
         } else {
             // Walk up to the end of field data
             while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                 $this->_matches[] = $index->currentTerm();
                 if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                     throw new OutOfBoundsException('Terms per query limit is reached.');
                 }
                 $index->nextTerm();
             }
         }
         $index->closeTermsStream();
     }
     if (count($this->_matches) == 0) {
         return new EmptyResult();
     } else {
         if (count($this->_matches) == 1) {
             return new Term(reset($this->_matches));
         } else {
             $rewrittenQuery = new MultiTerm();
             foreach ($this->_matches as $matchedTerm) {
                 $rewrittenQuery->addTerm($matchedTerm);
             }
             return $rewrittenQuery;
         }
     }
 }
Beispiel #7
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Resolution order:
  *  - no field given: one sub-phrase per search field, combined in a Boolean query;
  *  - the whole phrase exists as a term in the index: exact Term query;
  *  - otherwise the phrase is tokenized with the default analyzer and becomes
  *    an Insignificant query (no tokens), a Term query (one token) or a
  *    Phrase query (several tokens).
  *
  * Every returned query except Insignificant carries this query's boost,
  * and $this->_matches is filled with the terms of the returned query.
  *
  * @param \Zend\Search\Lucene\SearchIndexInterface $index
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndexInterface $index)
 {
     // Wildcards are deliberately allowed within phrases: they are either
     // removed by the text analyzer or used as a part of the keyword for
     // keyword fields, so no wildcard check is performed here.

     // Split query into subqueries if field name is not specified
     if ($this->_field === null) {
         $query = new Query\Boolean();
         $query->setBoost($this->getBoost());
         if (Lucene\Lucene::getDefaultSearchField() === null) {
             $searchFields = $index->getFieldNames(true);
         } else {
             $searchFields = array(Lucene\Lucene::getDefaultSearchField());
         }
         foreach ($searchFields as $fieldName) {
             $subquery = new Phrase($this->_phrase, $this->_phraseEncoding, $fieldName);
             $subquery->setSlop($this->getSlop());
             $query->addSubquery($subquery->rewrite($index));
         }
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
     // encoding is not used since we expect binary matching
     $term = new Index\Term($this->_phrase, $this->_field);
     if ($index->hasTerm($term)) {
         $query = new Query\Term($term);
         $query->setBoost($this->getBoost());
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     // tokenize phrase using current analyzer and process it as a phrase query
     $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);
     if (count($tokens) == 0) {
         $this->_matches = array();
         return new Query\Insignificant();
     }
     if (count($tokens) == 1) {
         $term = new Index\Term($tokens[0]->getTermText(), $this->_field);
         $query = new Query\Term($term);
         $query->setBoost($this->getBoost());
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     // It's a non-trivial phrase query
     $position = -1;
     $query = new Query\Phrase();
     foreach ($tokens as $token) {
         // Positions follow the analyzer's increments, so gaps left by
         // dropped tokens are preserved.
         $position += $token->getPositionIncrement();
         $term = new Index\Term($token->getTermText(), $this->_field);
         $query->addTerm($term, $position);
     }
     // Slop does not depend on the token, so it is set once after the loop.
     $query->setSlop($this->getSlop());
     // Carry the boost over as in every other branch; it was previously
     // dropped for multi-token phrases.
     $query->setBoost($this->getBoost());
     $this->_matches = $query->getQueryTerms();
     return $query;
 }
Beispiel #8
0
 /**
  * Text separated only by inline tags (<b>Zend</b>Framework) must be indexed
  * as one word, while text separated by block tags (<div>Foo</div>Bar) must not.
  *
  * @group ZF-4252
  */
 public function testHtmlInlineTagsIndexing()
 {
     $indexDir = __DIR__ . '/_index/_files';
     $index = Lucene\Lucene::create($indexDir);
     $htmlString = '<html><head><title>Hello World</title></head>' . '<body><b>Zend</b>Framework' . "\n" . ' <div>Foo</div>Bar ' . "\n" . ' <strong>Test</strong></body></html>';
     $doc = Document\Html::loadHTML($htmlString);
     $index->addDocument($doc);

     // Keep only the counts: no hit object should outlive the index.
     $fooBarHitCount = count($index->find('FooBar'));
     $zendFrameworkHitCount = count($index->find('ZendFramework'));

     // Remove the index before asserting, so a failing assertion cannot
     // leave the index files behind for the next test.
     unset($index);
     $this->_clearDirectory($indexDir);

     // Expected value first, so failure messages read the right way round.
     $this->assertEquals(0, $fooBarHitCount);
     $this->assertEquals(1, $zendFrameworkHitCount);
 }
Beispiel #9
0
 /**
  * A document that was just added must not be reported as deleted, even
  * though the test never commits the index explicitly.
  *
  * @group ZF-9680
  */
 public function testIsDeletedWithoutExplicitCommit()
 {
     $luceneIndex = Lucene\Lucene::create(__DIR__ . '/_index/_files');

     $doc = new Document();
     foreach (array('_id' => 'myId', 'bla' => 'blubb') as $fieldName => $fieldValue) {
         $doc->addField(Document\Field::Keyword($fieldName, $fieldValue));
     }
     $luceneIndex->addDocument($doc);

     // The freshly added document is addressed as document 0.
     $firstDocumentDeleted = $luceneIndex->isDeleted(0);
     $this->assertFalse($firstDocumentDeleted);
 }
Beispiel #10
0
 /**
  * Class constructor.  Create a new multi-term query object.
  *
  * If the $signs array is omitted, or every sign in it is strictly true,
  * all terms are required and $this->_signs stays null. The original note
  * says this differs from addTerm() behavior but should never be used.
  *
  * Nothing is initialised when $terms is not an array.
  *
  * @param array $terms    Array of \Zend\Search\Lucene\Index\Term objects
  * @param array $signs    Array of signs.  Sign is boolean|null.
  * @throws \Zend\Search\Lucene\Exception\InvalidArgumentException  if $terms holds more entries than the (non-zero) terms per query limit
  */
 public function __construct($terms = null, $signs = null)
 {
     if (is_array($terms)) {
         // A limit of 0 means "no limit", as in the query rewrite methods;
         // without this guard a limit of 0 rejected every non-empty list.
         $termsLimit = Lucene\Lucene::getTermsPerQueryLimit();
         if ($termsLimit != 0 && count($terms) > $termsLimit) {
             throw new InvalidArgumentException('Terms per query limit is reached.');
         }
         $this->_terms = $terms;
         $this->_signs = null;
         // Keep the signs only if at least one term is not required
         if (is_array($signs)) {
             foreach ($signs as $sign) {
                 if ($sign !== true) {
                     $this->_signs = $signs;
                     break;
                 }
             }
         }
     }
 }
Beispiel #11
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Resolution order:
  *  - no field given: the word is rewritten once per search field and all
  *    resulting terms are merged into one MultiTerm query;
  *  - the word exists as a term in the index: exact Term query;
  *  - the word contains "*" or "?": Wildcard query, returned in rewritten form;
  *  - otherwise the word is tokenized with the default analyzer and becomes
  *    an Insignificant, Term or MultiTerm query depending on the token count.
  *
  * $this->_matches is filled with the terms of the resulting query.
  *
  * @param \Zend\Search\Lucene\SearchIndex $index
  * @throws \Zend\Search\Lucene\Search\Exception\QueryParserException  if a wildcard sub-pattern tokenizes into more than one word
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndex $index)
 {
     if ($this->_field === null) {
         $query = new Query\MultiTerm();
         $query->setBoost($this->getBoost());
         $hasInsignificantSubqueries = false;
         // Search the default field if one is configured, otherwise every field the index reports
         if (Lucene\Lucene::getDefaultSearchField() === null) {
             $searchFields = $index->getFieldNames(true);
         } else {
             $searchFields = array(Lucene\Lucene::getDefaultSearchField());
         }
         foreach ($searchFields as $fieldName) {
             $subquery = new Term($this->_word, $this->_encoding, $fieldName);
             $rewrittenSubquery = $subquery->rewrite($index);
             // Flatten the terms of every per-field result into the single multi-term query
             foreach ($rewrittenSubquery->getQueryTerms() as $term) {
                 $query->addTerm($term);
             }
             if ($rewrittenSubquery instanceof Query\Insignificant) {
                 $hasInsignificantSubqueries = true;
             }
         }
         if (count($query->getTerms()) == 0) {
             $this->_matches = array();
             // No terms at all: report "insignificant" if any field produced that result,
             // otherwise an empty result
             if ($hasInsignificantSubqueries) {
                 return new Query\Insignificant();
             } else {
                 return new Query\EmptyResult();
             }
         }
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     // -------------------------------------
     // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
     // encoding is not used since we expect binary matching
     $term = new Index\Term($this->_word, $this->_field);
     if ($index->hasTerm($term)) {
         $query = new Query\Term($term);
         $query->setBoost($this->getBoost());
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     // -------------------------------------
     // Recognize wildcard queries
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     if (@preg_match('/\\pL/u', 'a') == 1) {
         // PCRE unicode support is available: work on a UTF-8 copy of the word
         $word = iconv($this->_encoding, 'UTF-8', $this->_word);
         $wildcardsPattern = '/[*?]/u';
         $subPatternsEncoding = 'UTF-8';
     } else {
         $word = $this->_word;
         $wildcardsPattern = '/[*?]/';
         $subPatternsEncoding = $this->_encoding;
     }
     // Split at every wildcard; PREG_SPLIT_OFFSET_CAPTURE yields array(text, offset) pairs
     $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
     if (count($subPatterns) > 1) {
         // Wildcard query is recognized
         $pattern = '';
         foreach ($subPatterns as $id => $subPattern) {
             // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
             if ($id != 0) {
                 // The wildcard is the character right before this sub-pattern's offset
                 $pattern .= $word[$subPattern[1] - 1];
             }
             // Check if each sub-pattern is a single word in terms of current analyzer
             $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
             if (count($tokens) > 1) {
                 throw new QueryParserException('Wildcard search is supported only for non-multiple word terms');
             }
             // Zero or one token here: append its analyzed text to the pattern
             foreach ($tokens as $token) {
                 $pattern .= $token->getTermText();
             }
         }
         $term = new Index\Term($pattern, $this->_field);
         $query = new Query\Wildcard($term);
         $query->setBoost($this->getBoost());
         // Get rewritten query. Important! It also fills terms matching container.
         $rewrittenQuery = $query->rewrite($index);
         $this->_matches = $query->getQueryTerms();
         return $rewrittenQuery;
     }
     // -------------------------------------
     // Recognize one-term multi-term and "insignificant" queries
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($tokens) == 0) {
         $this->_matches = array();
         return new Query\Insignificant();
     }
     if (count($tokens) == 1) {
         $term = new Index\Term($tokens[0]->getTermText(), $this->_field);
         $query = new Query\Term($term);
         $query->setBoost($this->getBoost());
         $this->_matches = $query->getQueryTerms();
         return $query;
     }
     //It's not insignificant or one term query
     $query = new Query\MultiTerm();
     /**
      * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
      * analyzer design features
      */
     foreach ($tokens as $token) {
         $term = new Index\Term($token->getTermText(), $this->_field);
         $query->addTerm($term, true);
         // all subterms are required
     }
     $query->setBoost($this->getBoost());
     $this->_matches = $query->getQueryTerms();
     return $query;
 }
Beispiel #12
0
    /**
     * Performs a query against the index and returns an array
     * of \Zend\Search\Lucene\Search\QueryHit objects.
     * Input is a query string (parsed with Search\QueryParser::parse()) or a
     * Search\Query\AbstractQuery object.
     *
     * Arguments passed after $query are read through func_get_args() as sort
     * instructions: a field name (or 'score'), optionally followed by up to
     * two integer array_multisort() flags. Without extra arguments the hits
     * are sorted by score (descending), then by document id (ascending).
     *
     * Pending changes are committed before searching. Hits whose score is 0
     * are dropped, and collecting stops once the (non-zero) result set limit
     * is reached. Hit scores are divided by the top score when it exceeds 1.
     *
     * @param \Zend\Search\Lucene\Search\Query\AbstractQuery|string $query
     * @return array \Zend\Search\Lucene\Search\QueryHit
     * @throws \Zend\Search\Lucene\Exception\InvalidArgumentException  if $query is neither a string nor an AbstractQuery
     * @throws \Zend\Search\Lucene\Exception\RuntimeException          if a sort field name is not a string or not a field of the index, or if a field value cannot be loaded
     */
    public function find($query)
    {
        if (is_string($query)) {
            $query = Search\QueryParser::parse($query);
        }

        if (!$query instanceof Search\Query\AbstractQuery) {
            throw new InvalidArgumentException('Query must be a string or Zend\Search\Lucene\Search\Query object');
        }

        $this->commit();

        // Three parallel arrays, kept in step so that array_multisort() can reorder them together
        $hits   = array();
        $scores = array();
        $ids    = array();

        $query = $query->rewrite($this)->optimize($this);

        $query->execute($this);

        $topScore = 0;

        $resultSetLimit = Lucene::getResultSetLimit();
        foreach ($query->matchedDocs() as $id => $num) {
            $docScore = $query->score($id, $this);
            // Documents scoring 0 are not reported as hits
            if( $docScore != 0 ) {
                $hit = new Search\QueryHit($this);
                $hit->id = $id;
                $hit->score = $docScore;

                $hits[]   = $hit;
                $ids[]    = $id;
                $scores[] = $docScore;

                if ($docScore > $topScore) {
                    $topScore = $docScore;
                }
            }

            // A limit of 0 means "no limit"; the cut happens before any sorting
            if ($resultSetLimit != 0  &&  count($hits) >= $resultSetLimit) {
                break;
            }
        }

        if (count($hits) == 0) {
            // skip sorting, which may cause an error on an empty index
            return array();
        }

        // Normalize the hit scores by the top score when it is greater than 1.
        // Only $hit->score changes; $scores keeps the raw values used for sorting.
        if ($topScore > 1) {
            foreach ($hits as $hit) {
                $hit->score /= $topScore;
            }
        }

        if (func_num_args() == 1) {
            // sort by scores
            array_multisort($scores, SORT_DESC, SORT_NUMERIC,
                            $ids,    SORT_ASC,  SORT_NUMERIC,
                            $hits);
        } else {
            // sort by given field names

            $argList    = func_get_args();
            $fieldNames = $this->getFieldNames();
            $sortArgs   = array();

            // PHP 5.3 now expects all arguments to array_multisort be passed by
            // reference (if it's invoked through call_user_func_array());
            // since constants can't be passed by reference, create some placeholder variables.
            $sortReg    = SORT_REGULAR;
            $sortAsc    = SORT_ASC;
            $sortNum    = SORT_NUMERIC;

            $sortFieldValues = array();

            // Argument 0 is the query itself; the sort instructions start at 1
            for ($count = 1; $count < count($argList); $count++) {
                $fieldName = $argList[$count];

                if (!is_string($fieldName)) {
                    throw new RuntimeException('Field name must be a string.');
                }

                if (strtolower($fieldName) == 'score') {
                    // 'score' (case-insensitive) sorts by the computed scores instead of a stored field
                    $sortArgs[] = &$scores;
                } else {
                    if (!in_array($fieldName, $fieldNames)) {
                        throw new RuntimeException('Wrong field name.');
                    }

                    // Load the field values once per field name, one value per hit
                    if (!isset($sortFieldValues[$fieldName])) {
                        $valuesArray = array();
                        foreach ($hits as $hit) {
                            try {
                                $value = $hit->getDocument()->getFieldValue($fieldName);
                            } catch (\Exception $e) {
                                // An exception whose message contains 'not found' is treated as
                                // a missing value (null); any other exception is re-thrown wrapped
                                if (strpos($e->getMessage(), 'not found') === false) {
                                    throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
                                } else {
                                    $value = null;
                                }
                            }

                            $valuesArray[] = $value;
                        }

                        // Collect loaded values in $sortFieldValues
                        // Required for PHP 5.3 which translates references into values when source
                        // variable is destroyed
                        $sortFieldValues[$fieldName] = $valuesArray;
                    }

                    $sortArgs[] = &$sortFieldValues[$fieldName];
                }

                // Up to two integers may follow a field name; they are passed on as
                // array_multisort() flags and the missing one is filled in with a default
                if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
                    $count++;
                    $sortArgs[] = &$argList[$count];

                    if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
                        $count++;
                        $sortArgs[] = &$argList[$count];
                    } else {
                        // Only one flag given: if it is an order flag, add the regular sort type,
                        // otherwise add ascending order
                        if ($argList[$count] == SORT_ASC  || $argList[$count] == SORT_DESC) {
                            $sortArgs[] = &$sortReg;
                        } else {
                            $sortArgs[] = &$sortAsc;
                        }
                    }
                } else {
                    // No flags given: ascending order, regular comparison
                    $sortArgs[] = &$sortAsc;
                    $sortArgs[] = &$sortReg;
                }
            }

            // Sort by id's if values are equal
            $sortArgs[] = &$ids;
            $sortArgs[] = &$sortAsc;
            $sortArgs[] = &$sortNum;

            // Array to be sorted
            $sortArgs[] = &$hits;

            // Do sort
            call_user_func_array('array_multisort', $sortArgs);
        }

        return $hits;
    }
Beispiel #13
0
 /**
  * Re-write query into primitive queries in the context of specified index
  *
  * Every searched field is scanned in the index's terms stream. For each
  * candidate term a similarity to the fuzzy term is derived from the
  * levenshtein() distance of the parts following the common prefix; terms
  * whose similarity exceeds $this->_minimumSimilarity are collected in
  * $this->_matches together with their keys and rescaled scores.
  *
  * The result is EmptyResult for no match, Term for one match, otherwise a
  * Boolean query of at most self::MAX_CLAUSE_COUNT Term sub-queries, each
  * boosted by its score, best score first.
  *
  * @param \Zend\Search\Lucene\IndexInterface $index
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  * @throws \Zend\Search\Lucene\Exception  if more terms match than the (non-zero) terms per query limit
  */
 public function rewrite(Lucene\IndexInterface $index)
 {
     $this->_matches = array();
     $this->_scores = array();
     $this->_termKeys = array();
     if ($this->_term->field === null) {
         // Search through all fields
         $fields = $index->getFieldNames(true);
     } else {
         $fields = array($this->_term->field);
     }
     $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
     $prefixByteLength = strlen($prefix);
     $prefixUtf8Length = Index\Term::getLength($prefix);
     // NOTE(review): $termLength is not used anywhere else in this method
     $termLength = Index\Term::getLength($this->_term->text);
     $termRest = substr($this->_term->text, $prefixByteLength);
     // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
     $termRestLength = strlen($termRest);
     // Rescales (similarity - minimumSimilarity) so that a similarity of 1 yields a score of 1
     $scaleFactor = 1 / (1 - $this->_minimumSimilarity);
     $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
     foreach ($fields as $field) {
         $index->resetTermsStream();
         if ($prefix != '') {
             // Only terms of this field that start with the prefix are candidates
             $index->skipTo(new Index\Term($prefix, $field));
             while ($index->currentTerm() !== null && $index->currentTerm()->field == $field && substr($index->currentTerm()->text, 0, $prefixByteLength) == $prefix) {
                 // Calculate similarity
                 $target = substr($index->currentTerm()->text, $prefixByteLength);
                 // Use the stored maximum distance for this target length if there is one, otherwise compute it
                 $maxDistance = isset($this->_maxDistances[strlen($target)]) ? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target));
                 if ($termRestLength == 0) {
                     // we don't have anything to compare.  That means if we just add
                     // the letters for current term we get the new word
                     $similarity = $prefixUtf8Length == 0 ? 0 : 1 - strlen($target) / $prefixUtf8Length;
                 } else {
                     if (strlen($target) == 0) {
                         // The candidate is exactly the prefix: the whole rest of the fuzzy term counts as edits
                         $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
                     } else {
                         if ($maxDistance < abs($termRestLength - strlen($target))) {
                             //just adding the characters of term to target or vice-versa results in too many edits
                             //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                             //given this optimal circumstance, the edit distance cannot be less than 5.
                             //which is 8-3 or more precisely abs(3-8).
                             //if our maximum edit distance is 4, then we can discard this word
                             //without looking at it.
                             $similarity = 0;
                         } else {
                             $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, strlen($target)));
                         }
                     }
                 }
                 if ($similarity > $this->_minimumSimilarity) {
                     $this->_matches[] = $index->currentTerm();
                     $this->_termKeys[] = $index->currentTerm()->key();
                     $this->_scores[] = ($similarity - $this->_minimumSimilarity) * $scaleFactor;
                     // A limit of 0 means "no limit"
                     if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                         throw new Lucene\Exception('Terms per query limit is reached.');
                     }
                 }
                 $index->nextTerm();
             }
         } else {
             // No prefix: every term of the field is a candidate and is compared as a whole
             $index->skipTo(new Index\Term('', $field));
             while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                 // Calculate similarity
                 $target = $index->currentTerm()->text;
                 $maxDistance = isset($this->_maxDistances[strlen($target)]) ? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance(0, $termRestLength, strlen($target));
                 if ($maxDistance < abs($termRestLength - strlen($target))) {
                     //just adding the characters of term to target or vice-versa results in too many edits
                     //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                     //given this optimal circumstance, the edit distance cannot be less than 5.
                     //which is 8-3 or more precisely abs(3-8).
                     //if our maximum edit distance is 4, then we can discard this word
                     //without looking at it.
                     $similarity = 0;
                 } else {
                     $similarity = 1 - levenshtein($termRest, $target) / min($termRestLength, strlen($target));
                 }
                 if ($similarity > $this->_minimumSimilarity) {
                     $this->_matches[] = $index->currentTerm();
                     $this->_termKeys[] = $index->currentTerm()->key();
                     $this->_scores[] = ($similarity - $this->_minimumSimilarity) * $scaleFactor;
                     if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                         throw new Lucene\Exception('Terms per query limit is reached.');
                     }
                 }
                 $index->nextTerm();
             }
         }
         $index->closeTermsStream();
     }
     if (count($this->_matches) == 0) {
         return new EmptyResult();
     } else {
         if (count($this->_matches) == 1) {
             return new Term(reset($this->_matches));
         } else {
             $rewrittenQuery = new Boolean();
             // Order by score (descending), then by term key (ascending); $this->_matches is
             // reordered along with the scores so that indexes keep lining up
             array_multisort($this->_scores, SORT_DESC, SORT_NUMERIC, $this->_termKeys, SORT_ASC, SORT_STRING, $this->_matches);
             $termCount = 0;
             foreach ($this->_matches as $id => $matchedTerm) {
                 $subquery = new Term($matchedTerm);
                 $subquery->setBoost($this->_scores[$id]);
                 $rewrittenQuery->addSubquery($subquery);
                 $termCount++;
                 // Only the best-scoring terms are kept, up to the clause limit
                 if ($termCount >= self::MAX_CLAUSE_COUNT) {
                     break;
                 }
             }
             return $rewrittenQuery;
         }
     }
 }
Beispiel #14
0
 /**
  * Rewrites this fuzzy query-parser entry into primitive queries for the given index.
  *
  * When no field is set, the entry is rewritten once per search field (the
  * default search field if one is configured, otherwise the field names
  * reported by the index) and the significant results are combined.
  * When a field is set, the word is first tried as an exact index term,
  * then rejected if it contains wildcards, and finally tokenized; only a
  * single resulting token can be turned into a fuzzy query.
  *
  * As a side effect $this->_matches is updated with the terms of the
  * resulting query (or an empty array when nothing significant remains).
  *
  * @param \Zend\Search\Lucene\SearchIndex $index
  * @throws \Zend\Search\Lucene\Search\Exception\QueryParserException if the word
  *         contains wildcards or is tokenized into more than one token
  * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  */
 public function rewrite(Lucene\SearchIndex $index)
 {
     if ($this->_field === null) {
         // No explicit field: fan out over every candidate search field.
         $fieldsToSearch = (Lucene\Lucene::getDefaultSearchField() !== null)
             ? array(Lucene\Lucene::getDefaultSearchField())
             : $index->getFieldNames(true);

         $combined        = new Search\Query\Boolean();
         $sawInsignificant = false;

         foreach ($fieldsToSearch as $candidateField) {
             $perFieldEntry = new self($this->_word, $this->_encoding, $candidateField, $this->_minimumSimilarity);
             $rewritten     = $perFieldEntry->rewrite($index);

             if ($rewritten instanceof Query\Insignificant) {
                 // Remembered so an all-insignificant result can be reported as such.
                 $sawInsignificant = true;
                 continue;
             }
             if ($rewritten instanceof Query\EmptyResult) {
                 continue;
             }

             $combined->addSubquery($rewritten);
         }

         $collected      = $combined->getSubqueries();
         $collectedCount = count($collected);

         if ($collectedCount == 0) {
             $this->_matches = array();

             if ($sawInsignificant) {
                 return new Query\Insignificant();
             }

             return new Query\EmptyResult();
         }

         // A single subquery does not need the boolean wrapper.
         $result = ($collectedCount == 1) ? reset($collected) : $combined;

         $result->setBoost($this->getBoost());
         $this->_matches = $result->getQueryTerms();

         return $result;
     }

     // -------------------------------------
     // Exact term matching (corresponds to Keyword fields stored in the index).
     // The encoding is not used here since binary matching is expected.
     $exactTerm = new Index\Term($this->_word, $this->_field);
     if ($index->hasTerm($exactTerm)) {
         $fuzzy = new Query\Fuzzy($exactTerm, $this->_minimumSimilarity);
         $fuzzy->setBoost($this->getBoost());

         // The rewrite call must come first: it also fills the matched terms container.
         $rewrittenFuzzy = $fuzzy->rewrite($index);
         $this->_matches = $fuzzy->getQueryTerms();

         return $rewrittenFuzzy;
     }

     // -------------------------------------
     // Wildcard detection
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     $pcreHasUnicode = (@preg_match('/\\pL/u', 'a') == 1);
     $fragments = $pcreHasUnicode
         ? preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word))
         : preg_split('/[*?]/', $this->_word);

     // More than one fragment means the word contained at least one '*' or '?'.
     if (count($fragments) > 1) {
         throw new QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
     }

     // -------------------------------------
     // One-term, multi-term and "insignificant" queries
     $tokens     = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     $tokenCount = count($tokens);

     if ($tokenCount == 0) {
         $this->_matches = array();

         return new Query\Insignificant();
     }

     if ($tokenCount != 1) {
         // Word is tokenized into several tokens
         throw new QueryParserException('Fuzzy search is supported only for non-multiple word terms');
     }

     $tokenTerm = new Index\Term($tokens[0]->getTermText(), $this->_field);
     $fuzzy     = new Query\Fuzzy($tokenTerm, $this->_minimumSimilarity);
     $fuzzy->setBoost($this->getBoost());

     // The rewrite call must come first: it also fills the matched terms container.
     $rewrittenFuzzy = $fuzzy->rewrite($index);
     $this->_matches = $fuzzy->getQueryTerms();

     return $rewrittenFuzzy;
 }
Beispiel #15
0
    /**
     * Calls Lucene::create() for the index described by the given metadata.
     *
     * The target path is "/tmp/index_" followed by the index name read
     * from the metadata. Nothing is returned.
     *
     * @param ClassMetadata $metadata metadata providing the index name
     */
    public function run(ClassMetadata $metadata)
    {
        $indexName = $metadata->getIndex()->getName();
        $indexPath = '/tmp/index_' . $indexName;

        Lucene::create($indexPath);
    }
Beispiel #16
0
 /**
  * Checks that find() returns exactly three hits, with the expected ids,
  * scores and paths, once the result set limit has been set to 3.
  *
  * The limit is changed through a static call, so it would outlive this
  * test if left in place; the previous value is therefore restored in a
  * finally block, even when an assertion fails or find() throws.
  */
 public function testLimitingResult()
 {
     $index = Lucene\Lucene::open(dirname(__FILE__) . '/_index23Sample/_files');

     // Each row: expected hit id, expected score, expected 'path' value.
     $expectedResultset = array(
         array(7, 0.212395, 'IndexSource/contributing.bugs.html'),
         array(0, 0.247795, 'IndexSource/contributing.documentation.html'),
         array(2, 0.176996, 'IndexSource/contributing.patches.html'),
     );

     $storedResultSetLimit = Lucene\Lucene::getResultSetLimit();
     Lucene\Lucene::setResultSetLimit(3);

     try {
         $hits = $index->find('"reporting bugs"', 'path');

         // Expected value goes first so PHPUnit failure messages read correctly.
         $this->assertEquals(3, count($hits));

         foreach ($hits as $resId => $hit) {
             $this->assertEquals($expectedResultset[$resId][0], $hit->id);
             // Scores are floats: compare with a tolerance instead of exact equality.
             $this->assertTrue(abs($hit->score - $expectedResultset[$resId][1]) < 1.0E-6);
             $this->assertEquals($expectedResultset[$resId][2], $hit->path);
         }
     } finally {
         // Always put the previous limit back so other tests are not affected.
         Lucene\Lucene::setResultSetLimit($storedResultSetLimit);
     }
 }
Beispiel #17
0
 /**
  * Object constructor.
  *
  * Stores the given text and field on the new instance. When no field is
  * passed, the value of Lucene\Lucene::getDefaultSearchField() is used.
  *
  * @param mixed $text  value stored as-is in $this->text
  * @param mixed $field field to store, or null to fall back to the default search field
  */
 public function __construct($text, $field = null)
 {
     if ($field === null) {
         $field = Lucene\Lucene::getDefaultSearchField();
     }

     $this->field = $field;
     $this->text  = $text;
 }
Beispiel #18
0
 /**
  * Checks skipTo() on the terms stream of a freshly built index.
  *
  * The index holds one document whose 'contents' text field is
  * 'someterm word'. After skipping to the term 'term', the current term
  * of the stream is expected to be 'word'.
  *
  * The index directory is removed in a finally block so that a failing
  * assertion does not leave index files behind for later tests.
  */
 public function testTermsStreamInterfaceSkipToTermsRetrievingTwoTermsCase()
 {
     $indexDir = dirname(__FILE__) . '/_index/_files';
     $index    = Lucene\Lucene::create($indexDir);

     try {
         // One document whose text field contains two words
         $doc = new Document\Document();
         $doc->addField(Document\Field::Text('contents', 'someterm word'));
         $index->addDocument($doc);

         // Drop the writing instance before reopening the index
         // (presumably so pending changes are flushed to disk -- TODO confirm).
         unset($index);

         $index = Lucene\Lucene::open($indexDir);
         $index->resetTermsStream();
         $index->skipTo(new Index\Term('term', 'contents'));

         $this->assertTrue($index->currentTerm() == new Index\Term('word', 'contents'));

         $index->closeTermsStream();
     } finally {
         // Runs on success and on failure alike.
         $this->_clearDirectory($indexDir);
     }
 }