/**
 * Transform this phrase entry into a query object.
 *
 * The phrase is run through the default analyzer. No tokens yields an
 * "insignificant" query, exactly one token yields a plain term query, and
 * several tokens yield a phrase query whose term positions are accumulated
 * from each token's position increment.
 *
 * @param string $encoding Encoding handed to the analyzer together with the phrase
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException If the phrase contains '?' or '*'
 */
public function getQuery($encoding)
{
    // Wildcard characters are rejected before any analysis takes place.
    $containsWildcard = strpos($this->_phrase, '?') !== false
                     || strpos($this->_phrase, '*') !== false;
    if ($containsWildcard) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
    }

    $phraseTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $encoding);
    $tokenCount   = count($phraseTokens);

    if ($tokenCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount == 1) {
        $singleTerm = new Zend_Search_Lucene_Index_Term($phraseTokens[0]->getTermText(), $this->_field);
        $termQuery  = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
        $termQuery->setBoost($this->_boost);

        return $termQuery;
    }

    // Two or more tokens: build a real phrase query.
    $phraseQuery  = new Zend_Search_Lucene_Search_Query_Phrase();
    $termPosition = -1;
    foreach ($phraseTokens as $phraseToken) {
        $termPosition += $phraseToken->getPositionIncrement();
        $phraseQuery->addTerm(
            new Zend_Search_Lucene_Index_Term($phraseToken->getTermText(), $this->_field),
            $termPosition
        );
    }

    if ($this->_proximityQuery) {
        $phraseQuery->setSlop($this->_wordsDistance);
    }
    $phraseQuery->setBoost($this->_boost);

    return $phraseQuery;
}
/**
 * Adds a document to this segment.
 *
 * Every field is registered through addField(). Indexed fields are tokenized
 * (or wrapped into a single token when not tokenized), their term positions
 * are recorded under the current document number, and a one-byte norm is
 * appended for every indexed field known to the segment. Stored fields are
 * handed to addStoredFields() at the end.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception If a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $fieldsToStore = array();
    $normsForDoc   = array();
    $similarity    = Zend_Search_Lucene_Search_Similarity::getDefault();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);
        $this->addField($field);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // Untokenized fields are indexed as one token covering the whole value.
                $tokens = array(
                    new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                );
            }

            $normsForDoc[$field->name] = chr(
                $similarity->encodeNorm($similarity->lengthNorm($field->name, count($tokens)))
            );

            $position = 0;
            foreach ($tokens as $token) {
                $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $key  = $term->key();

                if (!isset($this->_termDictionary[$key])) {
                    // First time this term is seen in the segment.
                    $this->_termDictionary[$key] = $term;
                    $this->_termDocs[$key]       = array();
                }
                if (!isset($this->_termDocs[$key][$this->_docCount])) {
                    // First occurrence of the term within the current document.
                    $this->_termDocs[$key][$this->_docCount] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$key][$this->_docCount][] = $position;
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }
    }

    foreach ($this->_fields as $knownName => $fieldInfo) {
        if ($fieldInfo->isIndexed) {
            if (!isset($this->_norms[$knownName])) {
                // Back-fill a zero-length norm for every document added before this field appeared.
                $this->_norms[$knownName] = str_repeat(
                    chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0))),
                    $this->_docCount
                );
            }

            $this->_norms[$knownName] .= isset($normsForDoc[$knownName])
                ? $normsForDoc[$knownName]
                : chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0)));
        }
    }

    $this->addStoredFields($fieldsToStore);
}
/**
 * Checks that the default analyzer can be read and replaced.
 *
 * The default analyzer is process-wide state, so the original analyzer is
 * put back even when an assertion fails; otherwise a failure here would
 * leak the replacement analyzer into every later test.
 */
public function testAnalyzer()
{
    $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);

    try {
        $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);
        $this->assertTrue(Zend_Search_Lucene_Analysis_Analyzer::getDefault() === $newAnalyzer);
    } catch (Exception $e) {
        // Restore before propagating the failure (catch/rethrow instead of
        // "finally" to stay compatible with older PHP versions).
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
        throw $e;
    }

    // Set analyzer to the default value (used in other tests)
    Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
}
/**
 * Extends the parent's analysis result with the terms produced by the
 * Zend UTF-8 analyzer.
 *
 * Note that this installs a new Utf8 analyzer as the Zend_Search_Lucene
 * default analyzer on every call.
 *
 * @param string $text Text to analyze (fed to the analyzer as UTF-8)
 * @return array Parent result followed by one term text per token
 */
public function analyze($text)
{
    $terms = parent::analyze($text);

    sfOpenPNEApplicationConfiguration::registerZend();
    Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8());

    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $analyzer->setInput($text, 'UTF-8');

    // Drain the token stream; nextToken() signals the end with null.
    for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
        $terms[] = $token->getTermText();
    }

    return $terms;
}
/**
 * Configures the process-wide Zend_Search_Lucene defaults.
 *
 * Installs a case-insensitive UTF-8/number analyzer as the default analyzer,
 * attaches a stop-word filter and a short-word filter built from the
 * application configuration, and sets the default permissions for index files.
 *
 * The stop-word setting may be a comma-separated string or an array. Entries
 * are trimmed and empty entries dropped, so "and, the" yields "and" and "the"
 * rather than "and" and " the" (which could never match a token).
 */
private static function prepareZendSearchLucene()
{
    Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive());
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

    $stopWords = sfConfig::get('app_sf_propel_luceneable_behavior_stopWords', false);
    if (false === $stopWords) {
        $stopWordList = array();
    } else {
        $rawStopWords = is_array($stopWords) ? $stopWords : explode(',', $stopWords);
        $stopWordList = array_values(array_filter(array_map('trim', $rawStopWords), 'strlen'));
    }
    $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWordList));

    // The configured value (default 3) is passed straight to the short-words filter.
    $shortWords = sfConfig::get('app_sf_propel_luceneable_behavior_shortWords', 3);
    $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords($shortWords));

    // NOTE(review): 0777 makes index files world-writable; kept for backward
    // compatibility, but confirm that such a permissive mode is really required.
    Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0777);
}
/**
 * A query term that the analyzer filters out completely must not prevent
 * the rest of the boolean query from matching.
 *
 * assertEquals() is called as (expected, actual) so that failure messages
 * report the values the right way round.
 */
public function testFilteredTokensQueryParserProcessing()
{
    $index = Zend_Search_Lucene::open(dirname(__FILE__) . '/_index23Sample/_files');

    // Precondition: the numeric literal produces no tokens with the default analyzer.
    $this->assertEquals(0, count(Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize('123456787654321')));

    $hits = $index->find('"PEAR developers" AND Home AND 123456787654321');

    $this->assertEquals(1, count($hits));

    // Each row: document id, score, stored path.
    $expectedResultset = array(array(1, 0.16827, 'IndexSource/contributing.wishlist.html'));

    foreach ($hits as $resId => $hit) {
        $this->assertEquals($expectedResultset[$resId][0], $hit->id);
        // Scores are floats; compare with a tolerance.
        $this->assertTrue(abs($hit->score - $expectedResultset[$resId][1]) < 1.0E-6);
        $this->assertEquals($expectedResultset[$resId][2], $hit->path);
    }
}
/**
 * Highlight text using specified View helper or callback function.
 *
 * The words are tokenized with the default analyzer, and every node under
 * /html/body of the wrapped DOM document is processed in place through
 * _highlightNodeRecursive().
 *
 * @param string|array $words  Words to highlight. Words could be organized using the array or string.
 * @param callback $callback   Callback method, used to transform (highlighting) text.
 * @param array    $params     Array of additional callback parameters passed through into it
 *                             (first non-optional parameter is an HTML fragment for highlighting)
 * @return string|null The document HTML when the words yield no tokens (nothing to
 *                     highlight); otherwise the document is modified in place and
 *                     nothing is returned.
 * @throws Zend_Search_Lucene_Exception If there are words to highlight and $callback is not callable
 */
public function highlightExtended($words, $callback, $params = array())
{
    /** Zend_Search_Lucene_Analysis_Analyzer */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

    if (!is_array($words)) {
        $words = array($words);
    }

    $wordsToHighlightList = array();
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    foreach ($words as $wordString) {
        $wordsToHighlightList[] = $analyzer->tokenize($wordString);
    }

    // array_merge() must not be invoked without arguments: an empty $words
    // array would otherwise trigger a warning and yield null on older PHP.
    $wordsToHighlight = empty($wordsToHighlightList)
        ? array()
        : call_user_func_array('array_merge', $wordsToHighlightList);

    if (count($wordsToHighlight) == 0) {
        return $this->_doc->saveHTML();
    }

    // Index by term text so the recursive walker can test membership with isset().
    $wordsToHighlightFlipped = array();
    foreach ($wordsToHighlight as $id => $token) {
        $wordsToHighlightFlipped[$token->getTermText()] = $id;
    }

    if (!is_callable($callback)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('$viewHelper parameter must be a View Helper name, View Helper object or callback.');
    }

    $xpath = new DOMXPath($this->_doc);

    $matchedNodes = $xpath->query("/html/body");
    foreach ($matchedNodes as $matchedNode) {
        $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
    }
}
/**
 * Highlight the body terms that match this wildcard pattern.
 *
 * The pattern text is converted into an anchored regular expression
 * ('?' becomes '.', '*' becomes '.*'), every token of the document's
 * 'body' field is tested against it, and the matching term texts are
 * passed to the document's highlight() method.
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    /** @todo implementation */

    // Quote the whole pattern first, then re-enable the two wildcard characters.
    $quotedPattern   = preg_quote($this->_pattern->text, '/');
    $regexBody       = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
    $matchExpression = '/^' . $regexBody . '$/';

    if (@preg_match('/\\pL/u', 'a') == 1) {
        // PCRE unicode support is turned on
        // add Unicode modifier to the match expression
        $matchExpression .= 'u';
    }

    $bodyTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');

    $words = array();
    foreach ($bodyTokens as $bodyToken) {
        $termText = $bodyToken->getTermText();
        if (preg_match($matchExpression, $termText) === 1) {
            $words[] = $termText;
        }
    }

    $doc->highlight($words, $this->_getHighlightColor($colorIndex));
}
/**
 * Query specific matches highlighting
 *
 * The stored phrase is tokenized with the default analyzer. Nothing is
 * highlighted for an empty token list, a single token is passed to the
 * highlighter as a string, and several tokens are passed as an array of
 * term texts.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

    /** Skip exact term matching recognition, keyword fields highlighting is not supported */

    /** Skip wildcard queries recognition. Supported wildcards are removed by text analyzer */

    // tokenize phrase using current analyzer and process it as a phrase query
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $phraseTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

    switch (count($phraseTokens)) {
        case 0:
            // Do nothing
            return;

        case 1:
            $highlighter->highlight($phraseTokens[0]->getTermText());
            return;
    }

    // It's non-trivial phrase query
    $termTexts = array();
    foreach ($phraseTokens as $phraseToken) {
        $termTexts[] = $phraseToken->getTermText();
    }

    $highlighter->highlight($termTexts);
}
/**
 * Query specific matches highlighting
 *
 * Highlighting is delegated to a fuzzy query, and only when the word holds
 * no '*' or '?' wildcard and the default analyzer turns it into exactly one
 * token. In every other case nothing is highlighted.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

    /** Skip exact term matching recognition, keyword fields highlighting is not supported */

    // -------------------------------------
    // Recognize wildcard queries

    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    if (@preg_match('/\\pL/u', 'a') == 1) {
        $wordParts = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
    } else {
        $wordParts = preg_split('/[*?]/', $this->_word);
    }

    if (count($wordParts) > 1) {
        // A wildcard splits the word into several parts: do nothing
        return;
    }

    // -------------------------------------
    // Recognize one-term multi-term and "insignificant" queries
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $wordTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

    // No tokens means an insignificant word; several tokens means a multi-word
    // term, for which fuzzy search is not supported. Nothing to do in either case.
    if (count($wordTokens) != 1) {
        return;
    }

    require_once 'Zend/Search/Lucene/Index/Term.php';
    $fuzzyTerm = new Zend_Search_Lucene_Index_Term($wordTokens[0]->getTermText(), $this->_field);

    require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
    $fuzzyQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($fuzzyTerm, $this->_minimumSimilarity);

    $fuzzyQuery->_highlightMatches($highlighter);
}
$t->isa_ok($indexer, 'sfLuceneIndexerFactory', '->getIndexer() returns an instance of sfLuceneIndexerFactory'); $t->diag('testing ->getContext()'); $t->isa_ok($lucene->getContext(), 'sfContext', '->getContext() returns an instance of sfContext'); $t->is($lucene->getContext(), sfContext::getInstance(), '->getContext() returns the same context'); $t->diag('testing ->configure()'); $lucene->configure(); $t->is(Zend_Search_Lucene_Search_QueryParser::getDefaultEncoding(), 'UTF-8', '->configure() configures the query parsers encoding'); foreach (array('Text', 'TextNum', 'Utf8', 'Utf8Num') as $type) { $lucene->setParameter('analyzer', $type); $lucene->configure(); $class = 'Zend_Search_Lucene_Analysis_Analyzer_Common_' . $type; $expected = new $class(); $expected->addFilter(new sfLuceneLowerCaseFilter(true)); $expected->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('and', 'the'))); $expected->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(2)); $actual = Zend_Search_Lucene_Analysis_Analyzer::getDefault(); $t->ok($actual == $expected, '->configure() configures the analyzer for ' . $type); } $lucene->setParameter('analyzer', 'foobar'); try { $lucene->configure(); $t->fail('->configure() analyzer must be of text, textnum, utf8, or utf8num'); } catch (Exception $e) { $t->pass('->configure() analyzer must be of text, textnum, utf8, or utf8num'); } $lucene->setParameter('analyzer', 'utf8num'); $t->diag('testing ->find()'); class MockLucene { public $args; public $scoring;
/**
 * Query specific matches highlighting
 *
 * If the word contains '*' or '?', the pieces between the wildcards are
 * analyzed one by one and reassembled, wildcards included, into a pattern
 * that is highlighted through a wildcard query. Nothing is highlighted if
 * any piece is more than one token. Without wildcards the word is tokenized
 * as a whole, and the resulting term text or texts are handed to the
 * highlighter.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

    /** Skip exact term matching recognition, keyword fields highlighting is not supported */

    // -------------------------------------
    // Recognize wildcard queries
    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    $pcreHasUnicode = (@preg_match('/\\pL/u', 'a') == 1);
    if ($pcreHasUnicode) {
        $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
        $wildcardsPattern    = '/[*?]/u';
        $subPatternsEncoding = 'UTF-8';
    } else {
        $word                = $this->_word;
        $wildcardsPattern    = '/[*?]/';
        $subPatternsEncoding = $this->_encoding;
    }

    $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

    if (count($subPatterns) > 1) {
        // Wildcard query is recognized
        $pattern = '';

        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        foreach ($subPatterns as $id => $subPattern) {
            list($fragment, $fragmentOffset) = $subPattern;

            // The character right before each fragment (except the first) is the
            // wildcard that separated it from the previous one; copy it over.
            if ($id != 0) {
                $pattern .= $word[$fragmentOffset - 1];
            }

            // Each fragment must be a single word in terms of the current analyzer
            $fragmentTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($fragment, $subPatternsEncoding);
            if (count($fragmentTokens) > 1) {
                // Do nothing (nothing is highlighted)
                return;
            }

            foreach ($fragmentTokens as $fragmentToken) {
                $pattern .= $fragmentToken->getTermText();
            }
        }

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $wildcardTerm = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);

        require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
        $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);

        $wildcardQuery->_highlightMatches($highlighter);
        return;
    }

    // -------------------------------------
    // Recognize one-term multi-term and "insignificant" queries
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $wordTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

    switch (count($wordTokens)) {
        case 0:
            // Do nothing
            return;

        case 1:
            $highlighter->highlight($wordTokens[0]->getTermText());
            return;
    }

    // It's not insignificant or one term query
    $termTexts = array();
    foreach ($wordTokens as $wordToken) {
        $termTexts[] = $wordToken->getTermText();
    }

    $highlighter->highlight($termTexts);
}
/**
 * Query specific matches highlighting
 *
 * Tokenizes the 'body' field of the highlighter's document and highlights
 * every term text that lies inside this range query's bounds.
 *
 * Bounds and terms are strings, so they are compared with strcmp(). PHP's
 * relational operators compare numeric-looking strings as numbers (for
 * example '10' is treated as greater than '9'), which would select a
 * different set of terms than a string comparison.
 *
 * A bound that is null (or, as before, an empty string) is treated as open.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    $words = array();

    $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

    $lowerTermText = ($this->_lowerTerm !== null) ? $this->_lowerTerm->text : null;
    $upperTermText = ($this->_upperTerm !== null) ? $this->_upperTerm->text : null;

    // Loose comparison on purpose: keeps the historic "empty bound is open" behaviour.
    $hasLowerBound = ($lowerTermText != null);
    $hasUpperBound = ($upperTermText != null);

    foreach ($tokens as $token) {
        $termText = $token->getTermText();

        if ($hasLowerBound) {
            $order = strcmp($lowerTermText, $termText);
            // Reject terms below the bound, and terms equal to it for exclusive ranges.
            if ($order > 0 || ($order == 0 && !$this->_inclusive)) {
                continue;
            }
        }

        if ($hasUpperBound) {
            $order = strcmp($termText, $upperTermText);
            // Reject terms above the bound, and terms equal to it for exclusive ranges.
            if ($order > 0 || ($order == 0 && !$this->_inclusive)) {
                continue;
            }
        }

        $words[] = $termText;
    }

    $highlighter->highlight($words);
}
/**
 * Return a list of posts that are similar to the current post.
 * This is not a very good implementation, so do not expect
 * amazing results - the term vector is not available for a doc
 * in ZSL, which limits how far you can go!
 *
 * The lower-cased tags and title of the post are tokenized with the default
 * analyzer and searched as a multi-term query. The post itself is left out
 * of the result.
 *
 * The static Zend_Search_Lucene result set limit is raised only for the
 * duration of the search and is restored afterwards, including when the
 * search throws, so other searches in the same process are not affected.
 *
 * @param object  $post            Post to compare against (its title, tags and id are read)
 * @param integer $max_recommended Maximum number of ids to return; values below 1 yield an empty list
 * @return array ids
 */
public function get_similar_posts($post, $max_recommended = 5)
{
    if ($max_recommended < 1) {
        return array();
    }

    $tagstring = '';
    foreach ($post->tags as $tag) {
        $tagstring .= $tag . ' ';
    }

    $analyser = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $tokens   = $analyser->tokenize(strtolower($tagstring) . ' ' . strtolower($post->title));
    if (count($tokens) == 0) {
        // Nothing to search for.
        return array();
    }

    $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($tokens as $token) {
        $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()));
    }

    // One extra hit is requested because the post itself may be among the results.
    $previousLimit = Zend_Search_Lucene::getResultSetLimit();
    Zend_Search_Lucene::setResultSetLimit($max_recommended + 1);
    try {
        $hits = $this->_index->find($query);
    } catch (Exception $e) {
        Zend_Search_Lucene::setResultSetLimit($previousLimit);
        throw $e;
    }
    Zend_Search_Lucene::setResultSetLimit($previousLimit);

    $ids = array();
    foreach ($hits as $hit) {
        if ($hit->postid != $post->id) {
            $ids[] = $hit->postid;
            if (count($ids) >= $max_recommended) {
                break;
            }
        }
    }

    return $ids;
}
/**
 * Builds a boolean query from the parsed user query and, when the analyzer
 * finds fewer than two tokens in the query string, an additional fuzzy
 * subquery for the raw query string on the 'name' field (similarity 0.4).
 *
 * Also sets the query parser's default encoding to this object's encoding.
 *
 * @param string $queryString
 * @return Zend_Search_Lucene_Search_Query $query
 */
public function createFuzzyQuery($queryString)
{
    Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding($this->_encoding);
    $parsedQuery = Zend_Search_Lucene_Search_QueryParser::parse($queryString, $this->_encoding);

    $combinedQuery = new Zend_Search_Lucene_Search_Query_Boolean();
    $combinedQuery->addSubquery($parsedQuery);

    $tokenCount = count(
        Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($queryString, $this->_encoding)
    );

    if ($tokenCount < 2) {
        $nameTerm = new Zend_Search_Lucene_Index_Term($queryString, 'name');
        $combinedQuery->addSubquery(new Zend_Search_Lucene_Search_Query_Fuzzy($nameTerm, 0.4));
    }

    return $combinedQuery;
}
/**
 * Adds a document to this segment.
 *
 * Each field is registered through _addFieldInfo(). Indexed fields are
 * tokenized (or wrapped into a single token), their length and term
 * positions are recorded under the current document number, and stored
 * fields are written to the .fdx/.fdt files, which are created on first
 * use. The document counter is incremented at the end.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception If a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $fieldsToStore = array();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);
        $this->_addFieldInfo($field);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
            } else {
                // Untokenized fields are indexed as one token covering the whole value.
                $tokens = array(
                    new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                );
            }

            $this->_fieldLengths[$field->name][$this->_docCount] = count($tokens);

            $position = 0;
            foreach ($tokens as $token) {
                $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                $key  = $term->key();

                if (!isset($this->_termDictionary[$key])) {
                    // First time this term is seen in the segment.
                    $this->_termDictionary[$key] = $term;
                    $this->_termDocs[$key]       = array();
                }
                if (!isset($this->_termDocs[$key][$this->_docCount])) {
                    // First occurrence of the term within the current document.
                    $this->_termDocs[$key][$this->_docCount] = array();
                }

                $position += $token->getPositionIncrement();
                $this->_termDocs[$key][$this->_docCount][] = $position;
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }
    }

    if (count($fieldsToStore) != 0) {
        if (!isset($this->_fdxFile)) {
            // Stored-field files are created lazily with the first stored field.
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        // The index file records where this document's data starts in the data file.
        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($fieldsToStore));

        foreach ($fieldsToStore as $storedField) {
            $this->_fdtFile->writeVInt($this->_fields[$storedField->name]->number);

            // Bit 0x1: tokenized, bit 0x2: binary (0x04 - third bit, compressed (ZLIB), is never set)
            $fieldBits = 0x0;
            if ($storedField->isTokenized) {
                $fieldBits |= 0x1;
            }
            if ($storedField->isBinary) {
                $fieldBits |= 0x2;
            }
            $this->_fdtFile->writeByte($fieldBits);

            if ($storedField->isBinary) {
                $this->_fdtFile->writeVInt(strlen($storedField->stringValue));
                $this->_fdtFile->writeBytes($storedField->stringValue);
            } else {
                $this->_fdtFile->writeString($storedField->stringValue);
            }
        }
    }

    $this->_docCount++;
}
/**
 * Adds a document to this index.
 *
 * The document is kept under the current document id. Every field is
 * tokenized (or wrapped into a single token), and for each token the
 * per-field term list, term positions and term frequencies are updated.
 * A norm is stored per field and the document id is incremented at the end.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document) {
    $this->_documents[$this->_docID] = $document;
    // parse document
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $fieldNames = $document->getFieldnames();
    foreach ($fieldNames as $fieldName) {
        $field = $document->getField($fieldName);
        // tokenize if requested
        if ($field->isTokenized) {
            $tokens = $analyzer->tokenize($field->getUtf8Value(), 'UTF-8');
        } else {
            // Untokenized: one token spanning the whole value. The end offset is
            // strlen() of the utf8_decode()d value - presumably a character count
            // rather than a byte count; TODO confirm.
            $tokens = array(new Zend_Search_Lucene_Analysis_Token($field->getUtf8Value(), 0, strlen(utf8_decode($field->getUtf8Value()))));
        }
        // store tokens in "index"
        // Positions start at -1, so a first token with increment 1 lands on position 0.
        $position = -1;
        foreach ($tokens as $token) {
            $text = $token->getTermText();
            $term = new Zend_Search_Lucene_Index_Term($text, $fieldName);
            $position += $token->getPositionIncrement();
            // build an ordered array (list) of terms for each field
            if (isset($this->_terms[$fieldName])) {
                // if the term is not set already, sort it in
                if (!isset($this->_terms[$fieldName][$text])) {
                    // Move entries off the front of the existing list into $new for as
                    // long as the new text compares greater than the shifted entry.
                    $new = array();
                    while (($current = array_shift($this->_terms[$fieldName])) && $text > $current->text) {
                        $new[$current->text] = $current;
                    }
                    $new[$text] = $term;
                    // The last shifted entry (if any) failed the comparison and was not
                    // stored by the loop; put it back right after the new term.
                    if ($current) {
                        $new[$current->text] = $current;
                    }
                    // NOTE(review): array_shift() and array_merge() renumber integer keys,
                    // and PHP turns numeric-looking string keys into integers, so purely
                    // numeric term texts may lose their key here - verify. The "$text >"
                    // comparison is also numeric for numeric-looking strings.
                    $this->_terms[$fieldName] = array_merge($new, $this->_terms[$fieldName]);
                }
            } else {
                // first terms in each field are just stored
                $this->_terms[$fieldName][$text] = $term;
            }
            // store termPosition for this term
            $this->_termPositions[$fieldName][$text][$this->_docID][] = $position;
            // store or increase term freq for this document
            if (!isset($this->_termDocs[$fieldName][$text][$this->_docID])) {
                $this->_termDocs[$fieldName][$text][$this->_docID] = 1;
            } else {
                $this->_termDocs[$fieldName][$text][$this->_docID]++;
            }
        }
        // remember fieldname and document
        $this->_fields[$fieldName][$this->_docID] = 1;
        // calculate and store normalisation vector
        // (length norm for the token count, scaled by document boost and field boost)
        $this->_norms[$fieldName][$this->_docID] = $this->getSimilarity()->lengthNorm($fieldName, sizeof($tokens)) * $document->boost * $field->boost;
    }
    // increase docID
    $this->_docID++;
}
/**
 * Process last range query term (closed interval)
 *
 * Both boundary texts (the remembered first term and the current token) are
 * run through the default analyzer. A boundary that yields no token becomes
 * an open (null) bound, and one that yields a single token becomes a term on
 * the context's field. The resulting inclusive range query is added to the
 * current context as a subquery entry.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException If a boundary yields several tokens,
 *                                                        or if both boundaries are empty
 */
public function closedRQLastTerm()
{
    // Lower boundary
    $lowerTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding);
    if (count($lowerTokens) > 1) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
    }

    $from = null;
    if (count($lowerTokens) == 1) {
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $from = new Zend_Search_Lucene_Index_Term(reset($lowerTokens)->getTermText(), $this->_context->getField());
    }

    // Upper boundary
    $upperTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding);
    if (count($upperTokens) > 1) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
    }

    $to = null;
    if (count($upperTokens) == 1) {
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $to = new Zend_Search_Lucene_Index_Term(reset($upperTokens)->getTermText(), $this->_context->getField());
    }

    if ($from === null && $to === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
    }

    require_once 'Zend/Search/Lucene/Search/Query/Range.php';
    $closedRange = new Zend_Search_Lucene_Search_Query_Range($from, $to, true);

    require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';
    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($closedRange));
}
/**
 * Transform entry to a subquery
 *
 * The term is run through the default analyzer. No tokens yields an
 * "insignificant" query, one token yields a term query, and several tokens
 * yield a multi-term query in which every sub-term is required. The entry's
 * boost is applied to term and multi-term queries.
 *
 * @param string $encoding Encoding handed to the analyzer together with the term
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException For fuzzy entries and for terms containing '?' or '*'
 */
public function getQuery($encoding)
{
    if ($this->_fuzzyQuery) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported yet.');
    }

    $containsWildcard = strpos($this->_term, '?') !== false
                     || strpos($this->_term, '*') !== false;
    if ($containsWildcard) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard queries are not supported yet.');
    }

    $termTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
    $tokenCount = count($termTokens);

    if ($tokenCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount == 1) {
        $singleTerm = new Zend_Search_Lucene_Index_Term($termTokens[0]->getTermText(), $this->_field);
        $termQuery  = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
        $termQuery->setBoost($this->_boost);

        return $termQuery;
    }

    // The analyzer split the term into several tokens
    $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

    /**
     * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
     * analizer design features
     */
    foreach ($termTokens as $termToken) {
        $multiTermQuery->addTerm(
            new Zend_Search_Lucene_Index_Term($termToken->getTermText(), $this->_field),
            true // all subterms are required
        );
    }

    $multiTermQuery->setBoost($this->_boost);

    return $multiTermQuery;
}
/** * @see Page::show() */ public function show() { parent::show(); if ($this->action == 'create') { $fields = array('messageID' => 'UnIndexed', 'senderID' => 'Keyword', 'sender' => 'Text', 'ownerID' => 'Keyword', 'time' => 'Keyword', 'messageType' => 'Keyword', 'subject' => 'Text', 'message' => 'Text'); $luceneObj = LuceneEditor::create('messages', $fields); var_dump($luceneObj); } else { if ($this->action == 'count') { $luceneObj = $this->getLucene(); var_dump($luceneObj->count()); } else { if ($this->action == 'search') { $time = microtime(true); $luceneObj = $this->getLucene(); $hits = $luceneObj->search($_REQUEST['search']); echo count($hits) . ' in ' . (microtime(true) - $time) . 's:<br>"'; foreach ($hits as $hit) { echo $hit->id . '"<br>"'; echo $hit->ownerID . '"<br>"'; echo $hit->messageID . '"<br>"'; echo $hit->time . '"<br>"'; echo $hit->subject . '"<br>"'; echo $hit->message . '"<br><br>'; } } else { if ($this->action == 'termDocs') { $time = microtime(true); $luceneObj = $this->getLucene(); $ids = $luceneObj->termDocs($_REQUEST['search'], $_REQUEST['f']); echo count($ids) . ' in ' . (microtime(true) - $time) . 's:<br>"'; foreach ($ids as $id) { $hit = $luceneObj->getDocument($id); echo $id . '"<br>"'; echo $hit->ownerID . '"<br>"'; echo $hit->messageID . '"<br>"'; echo $hit->time . '"<br>"'; echo $hit->subject . '"<br>"'; echo $hit->message . 
'"<br><br>'; } } else { if ($this->action == 'test') { var_dump($_REQUEST['search']); $time = microtime(true); $luceneObj = $this->getLucene(); $luceneObj->getIndex(); $query = Zend_Search_Lucene_Search_QueryParser::parse($_REQUEST['search']); var_dump($query); echo '<br>'; echo '<br>'; $query = $query->rewrite($luceneObj->getIndex())->optimize($luceneObj->getIndex()); var_dump($query); echo '<br>'; echo '<br>'; //var_dump(Zend_Search_Lucene_Search_QueryParser::$_instance); echo '<br>'; echo '<br>'; var_dump(Zend_Search_Lucene_Search_QueryParser::$_instance->_context->getQuery()); echo '<br>'; echo '<br>'; /*$expressionRecognizer = new Zend_Search_Lucene_Search_BooleanExpressionRecognizer(); $expressionRecognizer->processLiteral(Zend_Search_Lucene_Search_QueryParser::$_instance->_context->_entries[0]); $conjuctions = $expressionRecognizer->finishExpression(); var_dump(Zend_Search_Lucene_Search_QueryParser::$_instance->_context->_entries[0], $conjuctions);*/ //$query2 = Zend_Search_Lucene_Search_QueryParser::$_instance->_context->_entries[0]->getQuery(null); var_dump($query2); echo '<br>'; echo '<br>'; $token = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize('1'); var_dump($token); echo 'tested in ' . (microtime(true) - $time) . 's<br>'; } else { if ($this->action == 'test2') { $time = microtime(true); $luceneObj = $this->getLucene(); $lexer = new Zend_Search_Lucene_Search_QueryLexer(); var_dump($lexer); echo '<br>'; echo '<br>'; $tokens = $lexer->tokenize($_REQUEST['search']); var_dump($tokens); echo 'tested in ' . (microtime(true) - $time) . 's<br>'; } else { if ($this->action == 'list') { $time = microtime(true); $luceneObj = $this->getLucene(); $hits = $luceneObj->terms(); echo 'found in ' . (microtime(true) - $time) . 's:<br>'; var_dump($hits); } else { if ($this->action == 'optimize') { $time = microtime(true); $luceneObj = $this->getLucene(); $luceneObj->optimize(); echo 'optimized in ' . (microtime(true) - $time) . 
's<br>'; } else { if ($this->action == 'process') { require_once '../wcf/lib/system/io/File.class.php'; include '/tmp/cache.php'; ob_start(); // // set_time_limit(120); $luceneObj = $this->getLucene(); $sql = "SELECT *\r\n\t\t\t\t\tFROM ugml_messages\r\n\t\t\t\t\tWHERE message_id BETWEEN " . $this->start . " AND " . ($this->start + self::INTERVAL); $result = WCF::getDB()->sendQuery($sql); $array = array(); $i = 0; $time = microtime(true); while ($row = WCF::getDB()->fetchArray($result)) { $fields = array('messageID' => $row['message_id'], 'senderID' => $row['message_sender'], 'sender' => $row['message_from'], 'ownerID' => $row['message_owner'], 'time' => $row['message_time'], 'messageType' => $row['message_type'], 'subject' => $row['message_subject'], 'message' => $row['message_text']); $array[] = $fields; ++$i; } if (count($array)) { $luceneObj->add($array); } else { echo 'i think im done :)'; } echo 'done ' . $i . ' in ' . (microtime(true) - $time) . ';<br>'; $output .= ob_get_contents(); ob_end_clean(); $file = new File('/tmp/cache.php'); $file->write("<?php\r\n/*\r\n This file is part of WOT Game.\r\n\r\n WOT Game is free software: you can redistribute it and/or modify\r\n it under the terms of the GNU Affero General Public License as published by\r\n the Free Software Foundation, either version 3 of the License, or\r\n (at your option) any later version.\r\n\r\n WOT Game is distributed in the hope that it will be useful,\r\n but WITHOUT ANY WARRANTY; without even the implied warranty of\r\n MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r\n GNU Affero General Public License for more details.\r\n\r\n You should have received a copy of the GNU Affero General Public License\r\n along with WOT Game. If not, see <http://www.gnu.org/licenses/>.\r\n*/\r\n\n\$output = '" . $output . 
"';\n?>"); $file->close(); echo $output; ?> <br> <br> <br> <br> <br> next link: <br> <a href="index.php?page=Lucene&action=process&luceneID=<?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->luceneID; ?> &start=<?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->start + self::INTERVAL; ?> "> index.php?page=Lucene&action=process&luceneID=<?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->luceneID; ?> &start=<?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->start + self::INTERVAL; ?> </a> <?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ usleep(100000); ob_flush(); flush(); if (count($array)) { ?> <script> window.location.href = 'index.php?page=Lucene&action=process&luceneID=<?php /* This file is part of WOT Game. 
WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->luceneID; ?> &start=<?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. If not, see <http://www.gnu.org/licenses/>. */ echo $this->start + self::INTERVAL; ?> '; </script> <?php /* This file is part of WOT Game. WOT Game is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. WOT Game is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with WOT Game. 
If not, see <http://www.gnu.org/licenses/>. */ } exit; } } } } } } } } } }
/**
 * Highlight text with specified color
 *
 * Each entry of $words is run through the default analyzer, so what gets
 * highlighted are the analyzer's tokens rather than the raw input strings.
 *
 * @param string|array $words Word or list of words to highlight
 * @param string       $color Color handed through to _highlightNode()
 * @return string The document serialized with saveHTML(); it is returned on
 *                every path, including when no token survives analysis
 */
public function highlight($words, $color = '#66ffff')
{
    if (!is_array($words)) {
        $words = array($words);
    }

    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

    // Term text => running token index. A term that occurs more than once
    // keeps the index of its last occurrence.
    $wordsToHighlightFlipped = array();
    $id = 0;
    foreach ($words as $wordString) {
        foreach ($analyzer->tokenize($wordString) as $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id++;
        }
    }

    if (count($wordsToHighlightFlipped) != 0) {
        $xpath = new DOMXPath($this->_doc);

        foreach ($xpath->query('/html/body') as $matchedNode) {
            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
        }
    }

    // The docblock has always promised a string; previously the highlighted
    // path fell off the end of the method and returned null.
    return $this->_doc->saveHTML();
}
/**
 * Adds a document to this segment.
 *
 * For every field of the document the indexed terms and their positions are
 * recorded under the current document number, a norm byte is computed for
 * each indexed, non-empty field, and the stored fields are handed to
 * addStoredFields(). An indexed field that yields no terms is replaced by a
 * clone flagged as non-indexed and non-tokenized.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if a field requests term vector storage
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $similarity    = Zend_Search_Lucene_Search_Similarity::getDefault();
    $fieldsToStore = array();
    $normByField   = array();

    foreach ($document->getFieldNames() as $name) {
        $field = $document->getField($name);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            // Number of terms this field contributed for the current document
            $termCount = 0;

            if ($field->isTokenized) {
                $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                $termPosition = 0;
                for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
                    ++$termCount;

                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term never seen in this segment
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence in this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $termPosition += $token->getPositionIncrement();
                    $this->_termDocs[$key][$this->_docCount][] = $termPosition;
                }
            } else {
                $utf8Value = $field->getUtf8Value();

                if ($utf8Value != '') {
                    // The whole value is indexed as one term
                    $term = new Zend_Search_Lucene_Index_Term($utf8Value, $field->name);
                    $key  = $term->key();

                    if (!isset($this->_termDictionary[$key])) {
                        // Term never seen in this segment
                        $this->_termDictionary[$key] = $term;
                        $this->_termDocs[$key]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                        // Known term, first occurrence in this document
                        $this->_termDocs[$key][$this->_docCount] = array();
                    }

                    $this->_termDocs[$key][$this->_docCount][] = 0; // position
                    $termCount = 1;
                }
            }

            if ($termCount == 0) {
                // Field contains empty value. Treat it as non-indexed and non-tokenized
                $field = clone $field;
                $field->isTokenized = false;
                $field->isIndexed   = false;
            } else {
                $lengthNorm = $similarity->lengthNorm($field->name, $termCount);
                $normByField[$field->name] = chr($similarity->encodeNorm($lengthNorm * $document->boost * $field->boost));
            }
        }

        if ($field->isStored) {
            $fieldsToStore[] = $field;
        }

        $this->addField($field);
    }

    // Append one norm byte per indexed segment field
    foreach ($this->_fields as $segmentFieldName => $segmentField) {
        if (!$segmentField->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$segmentFieldName])) {
            // First time this field is seen: fill in the zero-length norm for every earlier document
            $this->_norms[$segmentFieldName] = str_repeat(
                chr($similarity->encodeNorm($similarity->lengthNorm($segmentFieldName, 0))),
                $this->_docCount
            );
        }

        $this->_norms[$segmentFieldName] .= isset($normByField[$segmentFieldName])
            ? $normByField[$segmentFieldName]
            : chr($similarity->encodeNorm($similarity->lengthNorm($segmentFieldName, 0)));
    }

    $this->addStoredFields($fieldsToStore);
}
/**
 * The default analyzer must be an instance of
 * Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive.
 *
 * A plain instanceof check is used instead of assertType(): assertType() was
 * deprecated and later removed from PHPUnit, while assertTrue() is available
 * in every release.
 */
public function test_Indexer_Has_Correct_Config()
{
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

    $this->assertTrue(
        $analyzer instanceof Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive,
        'Unexpected default analyzer: ' . (is_object($analyzer) ? get_class($analyzer) : gettype($analyzer))
    );
}
/**
 * Query specific matches highlighting
 *
 * Tokenizes the document's 'body' field and passes to the highlighter every
 * token that starts with the query term's prefix and whose similarity to the
 * query term is greater than $this->_minimumSimilarity.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    $words = array();

    $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
    $prefixByteLength = strlen($prefix);
    $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);

    $termRest = substr($this->_term->text, $prefixByteLength);
    // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
    $termRestLength = strlen($termRest);

    $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
    $tokens  = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

    foreach ($tokens as $token) {
        $termText = $token->getTermText();

        // Byte-wise prefix test. A loose == between the two strings would
        // compare numeric-looking values (e.g. "0e12" and "0e34") as numbers
        // and report a prefix match that is not there.
        if (strncmp($termText, $prefix, $prefixByteLength) !== 0) {
            continue;
        }

        // Calculate similarity
        $target       = substr($termText, $prefixByteLength);
        $targetLength = strlen($target);

        $maxDistance = isset($this->_maxDistances[$targetLength])
            ? $this->_maxDistances[$targetLength]
            : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

        if ($termRestLength == 0) {
            // we don't have anything to compare. That means if we just add
            // the letters for current term we get the new word
            $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $targetLength / $prefixUtf8Length;
        } elseif ($targetLength == 0) {
            $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $termRestLength / $prefixUtf8Length;
        } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
            // just adding the characters of term to target or vice-versa results in too many edits
            // for example "pre" length is 3 and "prefixes" length is 8. We can see that
            // given this optimal circumstance, the edit distance cannot be less than 5,
            // which is 8-3 or more precisely abs(3-8).
            // if our maximum edit distance is 4, then we can discard this word
            // without looking at it.
            $similarity = 0;
        } else {
            $similarity = 1 - levenshtein($termRest, $target)
                            / ($prefixUtf8Length + min($termRestLength, $targetLength));
        }

        if ($similarity > $this->_minimumSimilarity) {
            $words[] = $termText;
        }
    }

    $highlighter->highlight($words);
}
/**
 * Transform entry to a subquery
 *
 * A term containing '?' or '*' becomes a wildcard query whose literal parts
 * have been passed through the default analyzer. Otherwise the term is
 * tokenized and turned into an insignificant, term, fuzzy or multi-term
 * query depending on the number of tokens and the fuzzy flag.
 *
 * @param string $encoding
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function getQuery($encoding)
{
    if (strpos($this->_term, '?') !== false || strpos($this->_term, '*') !== false) {
        if ($this->_fuzzyQuery) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
        }

        // Rebuild the pattern piece by piece: wildcards are copied verbatim,
        // everything between them is replaced by its analyzed form.
        $pattern = '';
        foreach (explode('*', $this->_term) as $asteriskIndex => $asteriskPart) {
            if ($asteriskIndex > 0) {
                $pattern .= '*';
            }

            foreach (explode('?', $asteriskPart) as $questionIndex => $literalPart) {
                if ($questionIndex > 0) {
                    $pattern .= '?';
                }

                $partTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($literalPart, $encoding);
                if (count($partTokens) > 1) {
                    throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
                }

                foreach ($partTokens as $partToken) {
                    $pattern .= $partToken->getTermText();
                }
            }
        }

        $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard(
            new Zend_Search_Lucene_Index_Term($pattern, $this->_field)
        );
        $wildcardQuery->setBoost($this->_boost);

        return $wildcardQuery;
    }

    $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
    $tokenCount = count($tokens);

    if ($tokenCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount == 1) {
        $singleTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

        if ($this->_fuzzyQuery) {
            $singleQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($singleTerm, $this->_similarity);
        } else {
            $singleQuery = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
        }
        $singleQuery->setBoost($this->_boost);

        return $singleQuery;
    }

    // More than one token from here on
    if ($this->_fuzzyQuery) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

    /**
     * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
     * analyzer design features
     */
    foreach ($tokens as $token) {
        // second argument true: every subterm is required
        $multiTermQuery->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field), true);
    }
    $multiTermQuery->setBoost($this->_boost);

    return $multiTermQuery;
}
/**
 * Query specific matches highlighting
 *
 * Converts the wildcard pattern into an anchored regular expression, runs it
 * against every token of the document's 'body' field and hands the matching
 * token texts to the highlighter.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    // Quote the pattern first, then turn the (now escaped) wildcards into
    // their regular expression equivalents.
    $quotedPattern = preg_quote($this->_pattern->text, '/');
    $regexBody     = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);

    // Add the Unicode modifier only when PCRE unicode support is turned on
    $modifiers = (@preg_match('/\\pL/u', 'a') == 1) ? 'u' : '';

    $matchExpression = '/^' . $regexBody . '$/' . $modifiers;

    $body       = $highlighter->getDocument()->getFieldUtf8Value('body');
    $bodyTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($body, 'UTF-8');

    $matchedWords = array();
    foreach ($bodyTokens as $bodyToken) {
        $text = $bodyToken->getTermText();
        if (preg_match($matchExpression, $text) === 1) {
            $matchedWords[] = $text;
        }
    }

    $highlighter->highlight($matchedWords);
}
/**
 * Transform entry to a subquery
 *
 * The phrase is tokenized with the default analyzer. No tokens yield an
 * empty query, a single token yields a term query, and several tokens yield
 * a phrase query in which every term is placed at the position reported by
 * the analyzer.
 *
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException if the phrase contains wildcards
 */
public function getQuery()
{
    if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
    }

    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase);

    if (count($tokens) == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if (count($tokens) == 1) {
        $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
        $query = new Zend_Search_Lucene_Search_Query_Term($term);
        $query->setBoost($this->_boost);

        return $query;
    }

    //It's not empty or one term query
    $query = new Zend_Search_Lucene_Search_Query_Phrase();

    // The second argument of the phrase query's addTerm() is the term
    // position, not a "required" flag as on a multi-term query. Passing true
    // gave every term the same position, so positions are accumulated from
    // the tokens' position increments instead.
    $position = -1;
    foreach ($tokens as $token) {
        $position += $token->getPositionIncrement();
        $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
        $query->addTerm($term, $position);
    }

    if ($this->_proximityQuery) {
        $query->setSlop($this->_wordsDistance);
    }

    $query->setBoost($this->_boost);

    return $query;
}