Example #1
0
 public function testAnalyzer()
 {
     $currentAnalyzer = Analyzer\Analyzer::getDefault();
     $this->assertTrue($currentAnalyzer instanceof Analyzer);
     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
     $newAnalyzer = new Common\Utf8Num();
     Analyzer\Analyzer::setDefault($newAnalyzer);
     $this->assertTrue(Analyzer\Analyzer::getDefault() === $newAnalyzer);
     // Set analyzer to the default value (used in other tests)
     Analyzer\Analyzer::setDefault($currentAnalyzer);
 }
Example #2
0
 public function testFilteredTokensQueryParserProcessing()
 {
     $index = Lucene\Lucene::open(dirname(__FILE__) . '/_index23Sample/_files');
     $this->assertEquals(count(\Zend\Search\Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize('123456787654321')), 0);
     $hits = $index->find('"PEAR developers" AND Home AND 123456787654321');
     $this->assertEquals(count($hits), 1);
     $expectedResultset = array(array(1, 0.16827, 'IndexSource/contributing.wishlist.html'));
     foreach ($hits as $resId => $hit) {
         $this->assertEquals($hit->id, $expectedResultset[$resId][0]);
         $this->assertTrue(abs($hit->score - $expectedResultset[$resId][1]) < 1.0E-6);
         $this->assertEquals($hit->path, $expectedResultset[$resId][2]);
     }
 }
Example #3
0
 /**
  * Query specific matches highlighting
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     $words = array();
     $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*'), preg_quote($this->_pattern->text, '/')) . '$/';
     if (@preg_match('/\\pL/u', 'a') == 1) {
         // PCRE unicode support is turned on
         // add Unicode modifier to the match expression
         $matchExpression .= 'u';
     }
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
     foreach ($tokens as $token) {
         if (preg_match($matchExpression, $token->getTermText()) === 1) {
             $words[] = $token->getTermText();
         }
     }
     $highlighter->highlight($words);
 }
Example #4
0
 /**
  * Query specific matches highlighting
  *
  * @param \Zend\Search\Lucene\Search\Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     $words = array();
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
     $lowerTermText = $this->_lowerTerm !== null ? $this->_lowerTerm->text : null;
     $upperTermText = $this->_upperTerm !== null ? $this->_upperTerm->text : null;
     if ($this->_inclusive) {
         foreach ($tokens as $token) {
             $termText = $token->getTermText();
             if (($lowerTermText == null || $lowerTermText <= $termText) && ($upperTermText == null || $termText <= $upperTermText)) {
                 $words[] = $termText;
             }
         }
     } else {
         foreach ($tokens as $token) {
             $termText = $token->getTermText();
             if (($lowerTermText == null || $lowerTermText < $termText) && ($upperTermText == null || $termText < $upperTermText)) {
                 $words[] = $termText;
             }
         }
     }
     $highlighter->highlight($words);
 }
Example #5
0
File: HTML.php Project: rikaix/zf2
 /**
  * Highlight text using specified View helper or callback function.
  *
  * @param string|array $words  Words to highlight. Words could be organized using the array or string.
  * @param callback $callback   Callback method, used to transform (highlighting) text.
  * @param array    $params     Array of additionall callback parameters passed through into it
  *                             (first non-optional parameter is an HTML fragment for highlighting)
  * @throws \Zend\Search\Lucene\Exception\InvalidArgumentException
  * @return string
  */
 public function highlightExtended($words, $callback, $params = array())
 {
     if (!is_array($words)) {
         $words = array($words);
     }
     $wordsToHighlightList = array();
     $analyzer = Analyzer\Analyzer::getDefault();
     foreach ($words as $wordString) {
         $wordsToHighlightList[] = $analyzer->tokenize($wordString);
     }
     $wordsToHighlight = call_user_func_array('array_merge', $wordsToHighlightList);
     if (count($wordsToHighlight) == 0) {
         return $this->_doc->saveHTML();
     }
     $wordsToHighlightFlipped = array();
     foreach ($wordsToHighlight as $id => $token) {
         $wordsToHighlightFlipped[$token->getTermText()] = $id;
     }
     if (!is_callable($callback)) {
         throw new InvalidArgumentException('$viewHelper parameter mast be a View Helper name, View Helper object or callback.');
     }
     $xpath = new \DOMXPath($this->_doc);
     $matchedNodes = $xpath->query("/html/body");
     foreach ($matchedNodes as $matchedNode) {
         $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
     }
 }
Example #6
0
 /**
  * Query specific matches highlighting
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
     /** Skip exact term matching recognition, keyword fields highlighting is not supported */
     /** Skip wildcard queries recognition. Supported wildcards are removed by text analyzer */
     // tokenize phrase using current analyzer and process it as a phrase query
     $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);
     if (count($tokens) == 0) {
         // Do nothing
         return;
     }
     if (count($tokens) == 1) {
         $highlighter->highlight($tokens[0]->getTermText());
         return;
     }
     //It's non-trivial phrase query
     $words = array();
     foreach ($tokens as $token) {
         $words[] = $token->getTermText();
     }
     $highlighter->highlight($words);
 }
Example #7
0
 /**
  * Process last range query term (closed interval)
  *
  * @throws \Zend\Search\Lucene\Search\Exception\QueryParserException
  */
 public function closedRQLastTerm()
 {
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding);
     if (count($tokens) > 1) {
         throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
     } else {
         if (count($tokens) == 1) {
             $from = new Index\Term(reset($tokens)->getTermText(), $this->_context->getField());
         } else {
             $from = null;
         }
     }
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding);
     if (count($tokens) > 1) {
         throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
     } else {
         if (count($tokens) == 1) {
             $to = new Index\Term(reset($tokens)->getTermText(), $this->_context->getField());
         } else {
             $to = null;
         }
     }
     if ($from === null && $to === null) {
         throw new QueryParserException('At least one range query boundary term must be non-empty term');
     }
     $rangeQuery = new Query\Range($from, $to, true);
     $entry = new QueryEntry\Subquery($rangeQuery);
     $this->_context->addEntry($entry);
 }
Example #8
0
 /**
  * Adds a document to this segment.
  *
  * @param \Zend\Search\Lucene\Document $document
  * @throws LuceneException\UnsupportedMethodCallException
  */
 public function addDocument(Document $document)
 {
     $storedFields = array();
     $docNorms = array();
     $similarity = AbstractSimilarity::getDefault();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.');
         }
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $analyzer = Analyzer\Analyzer::getDefault();
                 $analyzer->setInput($field->value, $field->encoding);
                 $position = 0;
                 $tokenCounter = 0;
                 while (($token = $analyzer->nextToken()) !== null) {
                     $tokenCounter++;
                     $term = new Index\Term($token->getTermText(), $field->name);
                     $termKey = $term->key();
                     if (!isset($this->_termDictionary[$termKey])) {
                         // New term
                         $this->_termDictionary[$termKey] = $term;
                         $this->_termDocs[$termKey] = array();
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     } else {
                         if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                             // Existing term, but new term entry
                             $this->_termDocs[$termKey][$this->_docCount] = array();
                         }
                     }
                     $position += $token->getPositionIncrement();
                     $this->_termDocs[$termKey][$this->_docCount][] = $position;
                 }
                 if ($tokenCounter == 0) {
                     // Field contains empty value. Treat it as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isIndexed = $field->isTokenized = false;
                 } else {
                     $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, $tokenCounter) * $document->boost * $field->boost));
                 }
             } else {
                 if (($fieldUtf8Value = $field->getUtf8Value()) == '') {
                     // Field contains empty value. Treat it as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isIndexed = $field->isTokenized = false;
                 } else {
                     $term = new Index\Term($fieldUtf8Value, $field->name);
                     $termKey = $term->key();
                     if (!isset($this->_termDictionary[$termKey])) {
                         // New term
                         $this->_termDictionary[$termKey] = $term;
                         $this->_termDocs[$termKey] = array();
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     } else {
                         if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                             // Existing term, but new term entry
                             $this->_termDocs[$termKey][$this->_docCount] = array();
                         }
                     }
                     $this->_termDocs[$termKey][$this->_docCount][] = 0;
                     // position
                     $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, 1) * $document->boost * $field->boost));
                 }
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
         $this->addField($field);
     }
     foreach ($this->_fields as $fieldName => $field) {
         if (!$field->isIndexed) {
             continue;
         }
         if (!isset($this->_norms[$fieldName])) {
             $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount);
         }
         if (isset($docNorms[$fieldName])) {
             $this->_norms[$fieldName] .= $docNorms[$fieldName];
         } else {
             $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
         }
     }
     $this->addStoredFields($storedFields);
 }
Example #9
0
 /**
  * Query specific matches highlighting
  *
  * @param \Zend\Search\Lucene\Search\Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
     /** Skip exact term matching recognition, keyword fields highlighting is not supported */
     // -------------------------------------
     // Recognize wildcard queries
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     if (@preg_match('/\\pL/u', 'a') == 1) {
         $word = iconv($this->_encoding, 'UTF-8', $this->_word);
         $wildcardsPattern = '/[*?]/u';
         $subPatternsEncoding = 'UTF-8';
     } else {
         $word = $this->_word;
         $wildcardsPattern = '/[*?]/';
         $subPatternsEncoding = $this->_encoding;
     }
     $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
     if (count($subPatterns) > 1) {
         // Wildcard query is recognized
         $pattern = '';
         foreach ($subPatterns as $id => $subPattern) {
             // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
             if ($id != 0) {
                 $pattern .= $word[$subPattern[1] - 1];
             }
             // Check if each subputtern is a single word in terms of current analyzer
             $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
             if (count($tokens) > 1) {
                 // Do nothing (nothing is highlighted)
                 return;
             }
             foreach ($tokens as $token) {
                 $pattern .= $token->getTermText();
             }
         }
         $term = new Index\Term($pattern, $this->_field);
         $query = new Query\Wildcard($term);
         $query->_highlightMatches($highlighter);
         return;
     }
     // -------------------------------------
     // Recognize one-term multi-term and "insignificant" queries
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($tokens) == 0) {
         // Do nothing
         return;
     }
     if (count($tokens) == 1) {
         $highlighter->highlight($tokens[0]->getTermText());
         return;
     }
     //It's not insignificant or one term query
     $words = array();
     foreach ($tokens as $token) {
         $words[] = $token->getTermText();
     }
     $highlighter->highlight($words);
 }
Example #10
0
File: Fuzzy.php Project: stunti/zf2
 /**
  * Query specific matches highlighting
  *
  * @param \Zend\Search\Lucene\Search\Highlighter\HighlighterInterface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter\HighlighterInterface $highlighter)
 {
     $words = array();
     $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
     $prefixByteLength = strlen($prefix);
     $prefixUtf8Length = Index\Term::getLength($prefix);
     $termLength = Index\Term::getLength($this->_term->text);
     $termRest = substr($this->_term->text, $prefixByteLength);
     // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
     $termRestLength = strlen($termRest);
     $scaleFactor = 1 / (1 - $this->_minimumSimilarity);
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
     foreach ($tokens as $token) {
         $termText = $token->getTermText();
         if (substr($termText, 0, $prefixByteLength) == $prefix) {
             // Calculate similarity
             $target = substr($termText, $prefixByteLength);
             $maxDistance = isset($this->_maxDistances[strlen($target)]) ? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target));
             if ($termRestLength == 0) {
                 // we don't have anything to compare.  That means if we just add
                 // the letters for current term we get the new word
                 $similarity = $prefixUtf8Length == 0 ? 0 : 1 - strlen($target) / $prefixUtf8Length;
             } else {
                 if (strlen($target) == 0) {
                     $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
                 } else {
                     if ($maxDistance < abs($termRestLength - strlen($target))) {
                         //just adding the characters of term to target or vice-versa results in too many edits
                         //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                         //given this optimal circumstance, the edit distance cannot be less than 5.
                         //which is 8-3 or more precisesly abs(3-8).
                         //if our maximum edit distance is 4, then we can discard this word
                         //without looking at it.
                         $similarity = 0;
                     } else {
                         $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, strlen($target)));
                     }
                 }
             }
             if ($similarity > $this->_minimumSimilarity) {
                 $words[] = $termText;
             }
         }
     }
     $highlighter->highlight($words);
 }
Example #11
0
 /**
  * Query specific matches highlighting
  *
  * @param \Zend\Search\Lucene\Search\Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
     /** Skip exact term matching recognition, keyword fields highlighting is not supported */
     // -------------------------------------
     // Recognize wildcard queries
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     if (@preg_match('/\\pL/u', 'a') == 1) {
         $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
     } else {
         $subPatterns = preg_split('/[*?]/', $this->_word);
     }
     if (count($subPatterns) > 1) {
         // Do nothing
         return;
     }
     // -------------------------------------
     // Recognize one-term multi-term and "insignificant" queries
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($tokens) == 0) {
         // Do nothing
         return;
     }
     if (count($tokens) == 1) {
         $term = new Index\Term($tokens[0]->getTermText(), $this->_field);
         $query = new Query\Fuzzy($term, $this->_minimumSimilarity);
         $query->_highlightMatches($highlighter);
         return;
     }
     // Word is tokenized into several tokens
     // But fuzzy search is supported only for non-multiple word terms
     // Do nothing
 }