/**
 * Highlight the matches that are specific to this query.
 *
 * Nothing is highlighted when the word contains a wildcard ('*' or '?') or
 * when the default analyzer yields zero or several tokens for it. A word that
 * analyzes to exactly one token is wrapped into a fuzzy query, which then
 * performs the highlighting.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    // Field detection is skipped: all fields are expected to be present in
    // the HTML body and are not differentiated.
    // Exact term matching is skipped as well: keyword field highlighting is
    // not supported.

    // --- Wildcard recognition -------------------------------------------
    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    $pcreSupportsUnicode = (@preg_match('/\\pL/u', 'a') == 1);
    $wordParts = $pcreSupportsUnicode
        ? preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word))
        : preg_split('/[*?]/', $this->_word);

    if (count($wordParts) > 1) {
        // The word contains wildcards: nothing to highlight here.
        return;
    }

    // --- One-term, multi-term and "insignificant" recognition -----------
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $wordTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

    if (count($wordTokens) != 1) {
        // No token at all, or several tokens: fuzzy search is supported only
        // for non-multiple word terms, so there is nothing to do.
        return;
    }

    require_once 'Zend/Search/Lucene/Index/Term.php';
    $fuzzyTerm = new Zend_Search_Lucene_Index_Term($wordTokens[0]->getTermText(), $this->_field);

    require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
    $fuzzyQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($fuzzyTerm, $this->_minimumSimilarity);

    $fuzzyQuery->_highlightMatches($highlighter);
}
/**
 * Apply the prefix-length values held in this test case's properties to the
 * fuzzy and wildcard query classes (presumably the values saved before the
 * test ran -- confirm against setUp()).
 */
public function tearDown()
{
    Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength($this->_defaultPrefixLength);
    Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength($this->_wildcardMinPrefix);
}
/**
 * Store the default length of the non-fuzzy prefix.
 *
 * The value is kept in a static property, so it is shared by every user of
 * this class until it is set again.
 *
 * @param integer $defaultPrefixLength  New default prefix length
 */
public static function setDefaultPrefixLength($defaultPrefixLength)
{
    self::$_defaultPrefixLength = $defaultPrefixLength;
}
/**
 * Build the subquery that corresponds to this entry.
 *
 * - A term containing '?' or '*' becomes a wildcard query; each literal
 *   piece between wildcards is run through the default analyzer.
 * - A term the analyzer reduces to nothing becomes an "insignificant" query.
 * - A single-token term becomes a term query, or a fuzzy query when the
 *   entry is flagged as fuzzy.
 * - A multi-token term becomes a multi-term query with every token required.
 *
 * @param string $encoding  Encoding passed to the analyzer
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException  When fuzzy search is combined with
 *         wildcards or with a multi-word term, or when a wildcard piece analyzes to several tokens
 */
public function getQuery($encoding)
{
    $containsWildcard = strpos($this->_term, '?') !== false
                     || strpos($this->_term, '*') !== false;

    if ($containsWildcard) {
        if ($this->_fuzzyQuery) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
        }

        // Split on '*' first, then on '?', analyze every literal piece and
        // glue the pieces back together with the wildcard that separated them.
        $asteriskSegments = array();
        foreach (explode('*', $this->_term) as $asteriskChunk) {
            $questionSegments = array();
            foreach (explode('?', $asteriskChunk) as $literalChunk) {
                $chunkTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($literalChunk, $encoding);
                if (count($chunkTokens) > 1) {
                    throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
                }

                $analyzedText = '';
                foreach ($chunkTokens as $chunkToken) {
                    $analyzedText .= $chunkToken->getTermText();
                }
                $questionSegments[] = $analyzedText;
            }
            $asteriskSegments[] = implode('?', $questionSegments);
        }
        $pattern = implode('*', $asteriskSegments);

        $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard(
            new Zend_Search_Lucene_Index_Term($pattern, $this->_field)
        );
        $wildcardQuery->setBoost($this->_boost);

        return $wildcardQuery;
    }

    $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
    $tokenCount = count($tokens);

    if ($tokenCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount == 1) {
        $singleTerm  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
        $singleQuery = $this->_fuzzyQuery
            ? new Zend_Search_Lucene_Search_Query_Fuzzy($singleTerm, $this->_similarity)
            : new Zend_Search_Lucene_Search_Query_Term($singleTerm);
        $singleQuery->setBoost($this->_boost);

        return $singleQuery;
    }

    // From here on the term consists of several tokens.
    if ($this->_fuzzyQuery) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

    /**
     * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
     * analyzer design features
     */
    foreach ($tokens as $token) {
        // Every sub-term is required.
        $multiTermQuery->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field), true);
    }
    $multiTermQuery->setBoost($this->_boost);

    return $multiTermQuery;
}
/**
 * Run the fuzzy query 'tesd~0.4' against the sample index with the default
 * fuzzy prefix length set to 0 and compare id, score and path of every hit
 * with the expected result set.
 *
 * The previous default prefix length is put back afterwards, including when
 * an assertion fails, so the changed static setting cannot leak into other
 * tests.
 */
public function testFuzzyQuery()
{
    $index = Zend_Search_Lucene::open(dirname(__FILE__) . '/_index23Sample/_files');

    $defaultPrefixLength = Zend_Search_Lucene_Search_Query_Fuzzy::getDefaultPrefixLength();
    Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength(0);

    try {
        $hits = $index->find('tesd~0.4');

        // PHPUnit expects (expected, actual); the reverse order produces
        // misleading failure messages.
        $this->assertEquals(9, count($hits));

        $expectedResultset = array(array(2, 0.037139, 'IndexSource/contributing.patches.html'),
                                   array(0, 0.008735, 'IndexSource/contributing.documentation.html'),
                                   array(7, 0.002449, 'IndexSource/contributing.bugs.html'),
                                   array(1, 0.000483, 'IndexSource/contributing.wishlist.html'),
                                   array(3, 0.000483, 'IndexSource/about-pear.html'),
                                   array(9, 0.000483, 'IndexSource/core.html'),
                                   array(5, 0.000414, 'IndexSource/authors.html'),
                                   array(8, 0.000414, 'IndexSource/contributing.html'),
                                   array(4, 0.000345, 'IndexSource/copyright.html'));

        foreach ($hits as $resId => $hit) {
            $this->assertEquals($expectedResultset[$resId][0], $hit->id);
            // Scores are floats: compare with a tolerance.
            $this->assertTrue(abs($hit->score - $expectedResultset[$resId][1]) < 1.0E-6);
            $this->assertEquals($expectedResultset[$resId][2], $hit->path);
        }
    } catch (Exception $e) {
        // Emulates "finally" for PHP versions that do not have it.
        Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength($defaultPrefixLength);
        throw $e;
    }

    Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength($defaultPrefixLength);
}
/**
 * Transform entry to a subquery.
 *
 * Unlike the analyzer-based variant, this implementation does not tokenize
 * with the Lucene analyzer: the term is lower-cased with strtolower() and
 * split on spaces.
 *
 * - A term containing '?' or '*' becomes a wildcard query on the lower-cased term.
 * - A term with no non-empty word becomes an "insignificant" query.
 * - A single word becomes a term query, or a fuzzy query when the entry is fuzzy.
 * - Several words become a multi-term query with every word required.
 *
 * @param string $encoding  Not used by this implementation; kept for interface compatibility
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException  When fuzzy search is combined with
 *         wildcards or with a multi-word term
 */
public function getQuery($encoding)
{
    if (strpos($this->_term, '?') !== false || strpos($this->_term, '*') !== false) {
        if ($this->_fuzzyQuery) {
            require_once __CA_LIB_DIR__ . '/core/Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
        }

        // The pieces between wildcards are not analyzed here, so splitting on
        // '*' and '?' and re-joining would reproduce the term unchanged; the
        // term itself is therefore used as the pattern.
        $pattern = $this->_term;

        $term  = new Zend_Search_Lucene_Index_Term(strtolower($pattern), $this->_field);
        $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
        $query->setBoost($this->_boost);

        return $query;
    }

    // explode() alone never returns an empty array and yields '' entries for
    // empty input, leading/trailing spaces and doubled spaces. Dropping the
    // empty strings makes the "insignificant" branch reachable and prevents
    // empty-text terms from being added to the query. 'strlen' (rather than
    // the default truthiness filter) keeps the literal word "0".
    $tokens = array_values(array_filter(explode(" ", $this->_term), 'strlen'));

    if (count($tokens) == 0) {
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if (count($tokens) == 1 && !$this->_fuzzyQuery) {
        $term  = new Zend_Search_Lucene_Index_Term(strtolower($tokens[0]), $this->_field);
        $query = new Zend_Search_Lucene_Search_Query_Term($term);
        $query->setBoost($this->_boost);

        return $query;
    }

    if (count($tokens) == 1 && $this->_fuzzyQuery) {
        $term  = new Zend_Search_Lucene_Index_Term(strtolower($tokens[0]), $this->_field);
        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_similarity);
        $query->setBoost($this->_boost);

        return $query;
    }

    if ($this->_fuzzyQuery) {
        require_once __CA_LIB_DIR__ . '/core/Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    // It's not an empty or one-term query: require every word.
    $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($tokens as $token) {
        $term = new Zend_Search_Lucene_Index_Term(strtolower($token), $this->_field);
        $query->addTerm($term, true);
    }
    $query->setBoost($this->_boost);

    return $query;
}
/**
 * Return the Lucene index handle, opening or creating it on first use.
 *
 * The handle is cached in $this->index. When it is first obtained the index
 * is configured (max buffered docs 64, merge factor 5) and the default fuzzy
 * prefix length is set to 1; that last setting is static and therefore
 * affects fuzzy queries globally.
 *
 * @return Zend_Search_Lucene_Interface
 */
private function getLuceneIndex()
{
    if (empty($this->index)) {
        // Use the existing index at the location if there is one, otherwise create it.
        $this->index = file_exists($this->location)
            ? new Zend_Search_Lucene($this->location)
            : Zend_Search_Lucene::create($this->location);

        $this->index->setMaxBufferedDocs(64);
        Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength(1);
        $this->index->setMergeFactor(5);
    }

    return $this->index;
}
/**
 * Open the Lucene index at the configured path (creating it when it does not
 * exist yet) and apply the search settings.
 *
 * Besides the per-index options, this changes several static defaults:
 * result set limit, terms-per-query limit, default analyzer, fuzzy default
 * prefix length and wildcard minimum prefix length.
 *
 * @throws nc_search_exception  When Zend_Search_Lucene fails to open or create the index
 */
protected function open_index()
{
    $index_path = $this->get_index_path();

    try {
        $this->index = $this->index_exists($index_path)
            ? Zend_Search_Lucene::open($index_path)
            : Zend_Search_Lucene::create($index_path);
    } catch (Zend_Search_Lucene_Exception $e) {
        throw new nc_search_exception("Cannot open Lucene index: {$e->getMessage()}");
    }

    $this->is_opened = true;

    // Static limits
    Zend_Search_Lucene::setResultSetLimit($this->get_setting('ZendSearchLucene_ResultSetLimit'));
    Zend_Search_Lucene::setTermsPerQueryLimit($this->get_setting('MaxTermsPerQuery'));

    // Per-index options: each one maps to a set<Option>() method on the index
    // and to a "ZendSearchLucene_<Option>" setting.
    foreach (array('MaxBufferedDocs', 'MaxMergeDocs', 'MergeFactor') as $option) {
        $option_value = $this->get_setting("ZendSearchLucene_{$option}");
        $this->index->{"set{$option}"}($option_value);
    }

    // Analyzer
    Zend_Search_Lucene_Analysis_Analyzer::setDefault(new nc_search_provider_zend_analyzer());

    // A zero prefix length lets fuzzy searches match a wider array of possibilities
    Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength(0);

    // Likewise for wildcard searches
    Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength(0);

    /**
     * @todo set default search fields
     */
}
/**
 * Frontend search action.
 *
 * Reads the query and category from the request, runs the query (optionally
 * as a fuzzy query) against the frontend index restricted by language and
 * category, pages the hits and assigns the results to the view. When fuzzy
 * search is enabled and the query produced no results, similar terms that do
 * produce hits are assigned to the view as suggestions.
 *
 * Any exception raised while searching is logged and results in an empty
 * result list. Finally the view script named by the "viewscript" parameter is
 * rendered, if one is given.
 */
public function findAction()
{
    $queryFromRequest = $this->cleanRequestString($_REQUEST["query"]);
    $categoryFromRequest = $this->cleanRequestString($_REQUEST["cat"]);

    $searcher = new SearchPhp_Frontend_Searcher();

    $this->view->groupByCategory = $this->_getParam("groupByCategory");
    $this->view->omitSearchForm = $this->_getParam("omitSearchForm");
    $this->view->categoryOrder = $this->_getParam("categoryOrder");
    $this->view->omitJsIncludes = $this->_getParam("omitJsIncludes");

    $perPage = $this->_getParam("perPage");
    if (empty($perPage)) {
        $perPage = 10;
    }

    $page = $this->_getParam("page");
    if (empty($page)) {
        $page = 1;
    }

    $queryStr = strtolower($queryFromRequest);

    $this->view->category = $categoryFromRequest;
    if (!empty($this->view->category)) {
        $category = $this->view->category;
    } else {
        $category = null;
    }

    $pluginConf = SearchPhp_Plugin::getSearchConfigArray();
    if (!empty($pluginConf["search"]["frontend"]["categories"])) {
        $this->view->availableCategories = explode(",", $pluginConf["search"]["frontend"]["categories"]);
    }

    $doFuzzy = $this->_getParam("fuzzy");

    try {
        $query = new Zend_Search_Lucene_Search_Query_Boolean();

        $field = $this->_getParam("field");
        if (!empty($field)) {
            Zend_Search_Lucene::setDefaultSearchField($field);
        }

        $searchResults = array();
        // Initialised up front: the paging code below counts the hits even
        // when the query string is empty and no search was run.
        $validHits = array();

        if (!empty($queryStr)) {
            if ($doFuzzy) {
                // Turn every word into a fuzzy term by appending "~".
                $queryStr = str_replace(" ", "~ ", $queryStr);
                $queryStr .= "~";
                Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength(3);
            }

            $userQuery = Zend_Search_Lucene_Search_QueryParser::parse($queryStr, 'utf-8');
            $query->addSubquery($userQuery, true);

            if (!empty($this->searchLanguage)) {
                if (is_object($this->searchLanguage)) {
                    $lang = $this->searchLanguage->toString();
                } else {
                    $lang = $this->searchLanguage;
                }
                $lang = str_replace(array("_", "-"), "", $lang);
                $languageTerm = new Zend_Search_Lucene_Index_Term($lang, 'lang');
                $languageQuery = new Zend_Search_Lucene_Search_Query_Term($languageTerm);
                $query->addSubquery($languageQuery, true);
            }

            if (!empty($category)) {
                $categoryTerm = new Zend_Search_Lucene_Index_Term($category, 'cat');
                $categoryQuery = new Zend_Search_Lucene_Search_Query_Term($categoryTerm);
                $query->addSubquery($categoryQuery, true);
            }

            $hits = $this->frontendIndex->find($query);

            if ($this->ownHostOnly and $hits != null) {
                // Get rid of hits from other hosts.
                $currenthost = $_SERVER['HTTP_HOST'];
                for ($i = 0; $i < count($hits); $i++) {
                    $url = $hits[$i]->getDocument()->getField("url");
                    if (strpos($url->value, "http://" . $currenthost) !== FALSE
                        || strpos($url->value, "https://" . $currenthost) !== FALSE) {
                        $validHits[] = $hits[$i];
                    }
                }
            } else {
                $validHits = empty($hits) ? array() : $hits;
            }

            // Index range (inclusive) of the hits shown on the requested page.
            $start = $perPage * ($page - 1);
            $end = $start + ($perPage - 1);
            if ($end > count($validHits) - 1) {
                $end = count($validHits) - 1;
            }

            for ($i = $start; $i <= $end; $i++) {
                $hit = $validHits[$i];
                $url = $hit->getDocument()->getField("url");
                $title = $hit->getDocument()->getField("title");

                $searchResult = array();
                $searchResult['boost'] = $hit->getDocument()->boost;
                $searchResult['title'] = $title->value;
                $searchResult['url'] = $url->value;
                $searchResult['sumary'] = $searcher->getSumaryForUrl($url->value, $queryStr);

                // Optional fields: a missing field raises a Lucene exception,
                // which is deliberately ignored.
                try {
                    if ($hit->getDocument()->getField("h1")) {
                        $searchResult['h1'] = $hit->getDocument()->getField("h1")->value;
                    }
                } catch (Zend_Search_Lucene_Exception $e) {
                }

                // A dedicated loop variable is used so that the $category
                // filter determined above is not overwritten.
                foreach ($this->categories as $configuredCategory) {
                    try {
                        $searchResult['categories'][] = $hit->getDocument()->getField("cat")->value;
                    } catch (Zend_Search_Lucene_Exception $e) {
                    }
                }

                $searchResults[] = $searchResult;
                unset($searchResult);
            }
        }

        if (count($validHits) < 1) {
            $this->view->pages = 0;
        } else {
            $this->view->pages = ceil(count($validHits) / $perPage);
        }

        $this->view->perPage = $perPage;
        $this->view->page = $page;
        $this->view->total = count($validHits);
        $this->view->query = $queryStr;
        $this->view->searchResults = $searchResults;

        if ($this->fuzzySearch) {
            // Look for similar search terms when the query found nothing.
            if (!empty($queryStr) and (empty($searchResults) or count($searchResults) < 1)) {
                $terms = SearchPhp_Plugin::fuzzyFindTerms($queryStr, $this->frontendIndex, 3);
                if (empty($terms) or count($terms) < 1) {
                    $terms = SearchPhp_Plugin::fuzzyFindTerms($queryStr, $this->frontendIndex, 0);
                }

                $suggestions = array();
                if (is_array($terms)) {
                    // The language restriction is the same for every term, so
                    // it is resolved once; it stays null when no search
                    // language is configured.
                    $language = null;
                    if ($this->searchLanguage != null) {
                        if (is_object($this->searchLanguage)) {
                            $language = $this->searchLanguage->toString();
                        } else {
                            $language = $this->searchLanguage;
                        }
                        $language = str_replace(array("_", "-"), "", $language);
                    }

                    $counter = 0;
                    foreach ($terms as $term) {
                        $t = $term->text;

                        // Check whether the term can be found for the current
                        // language and category.
                        $query = new Zend_Search_Lucene_Search_Query_Boolean();

                        if ($language != null) {
                            $languageTerm = new Zend_Search_Lucene_Index_Term($language, 'lang');
                            $languageQuery = new Zend_Search_Lucene_Search_Query_Term($languageTerm);
                            $query->addSubquery($languageQuery, true);
                        }

                        if (!empty($category)) {
                            $categoryTerm = new Zend_Search_Lucene_Index_Term($category, 'cat');
                            $categoryQuery = new Zend_Search_Lucene_Search_Query_Term($categoryTerm);
                            $query->addSubquery($categoryQuery, true);
                        }

                        $userQuery = Zend_Search_Lucene_Search_QueryParser::parse($t, 'utf-8');
                        $query->addSubquery($userQuery, true);

                        $hits = $this->frontendIndex->find($query);

                        $suggestionHits = array();
                        if ($this->ownHostOnly and $hits != null) {
                            // Get rid of hits from other hosts. Both schemes
                            // are accepted, matching the filter applied to
                            // the main result list.
                            $currenthost = $_SERVER['HTTP_HOST'];
                            for ($i = 0; $i < count($hits); $i++) {
                                $url = $hits[$i]->getDocument()->getField("url");
                                if (strpos($url->value, "http://" . $currenthost) !== FALSE
                                    || strpos($url->value, "https://" . $currenthost) !== FALSE) {
                                    $suggestionHits[] = $hits[$i];
                                }
                            }
                        } else {
                            $suggestionHits = empty($hits) ? array() : $hits;
                        }

                        if (count($suggestionHits) > 0 and !in_array($t, $suggestions)) {
                            $suggestions[] = $t;
                            if ($counter >= 20) {
                                break;
                            }
                            $counter++;
                        }
                    }
                }

                $this->view->suggestions = $suggestions;
            }
        }
    } catch (Exception $e) {
        Logger::log("An Exception occured during search:", Zend_Log::ERR);
        Logger::log($e, Zend_Log::ERR);
        $this->view->searchResults = array();
    }

    if ($this->_getParam("viewscript")) {
        $this->renderScript($this->_getParam("viewscript"));
    }
}
/**
 * Finds terms in the index that are similar to the given query string.
 *
 * Note: this sets the static default prefix length of
 * Zend_Search_Lucene_Search_Query_Fuzzy and does not restore it afterwards.
 *
 * @param string $queryStr
 * @param \Zend_Search_Lucene_Interface $index
 * @param integer $prefixLength optionally specify prefix length, default 0
 * @param float $similarity optionally specify similarity, default 0.5
 * @return array similar terms as returned by the fuzzy query's getQueryTerms()
 *               (presumably \Zend_Search_Lucene_Index_Term objects -- confirm against
 *               the Zend API); an empty array when no index is given
 */
public static function fuzzyFindTerms($queryStr, $index, $prefixLength = 0, $similarity = 0.5)
{
    if ($index == NULL) {
        // Always hand back an array so callers can count() and iterate the
        // result without a separate null check.
        return array();
    }

    \Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength($prefixLength);

    $term = new \Zend_Search_Lucene_Index_Term($queryStr);
    $fuzzyQuery = new \Zend_Search_Lucene_Search_Query_Fuzzy($term, $similarity);

    // The hits themselves are not needed; the search is executed before the
    // terms are read from the query object (NOTE(review): presumably find()
    // is what populates the matched terms -- confirm).
    $index->find($fuzzyQuery);

    return $fuzzyQuery->getQueryTerms();
}
/**
 * Search action: run the current query against the frontend index and hand
 * the paged results to the view.
 *
 * The user query (turned into a fuzzy query when fuzzy search is enabled) is
 * combined with the language, country, category and restriction sub-queries.
 * The hits of the current page are converted into result arrays; suggestions
 * are fetched when fuzzy search is enabled. An exception during the search is
 * logged at debug level and results in an empty result list. Finally either
 * the view script named by the "viewScript" parameter or the default
 * '/search/find.php' is rendered.
 */
public function findAction()
{
    $this->disableViewAutoRender();

    $searcher = new Searcher();

    try {
        $booleanQuery = new \Zend_Search_Lucene_Search_Query_Boolean();

        $defaultField = $this->getParam('field');
        if (!empty($defaultField)) {
            \Zend_Search_Lucene::setDefaultSearchField($defaultField);
        }

        $searchResults = array();
        $validHits     = array();

        if (!empty($this->query)) {
            if ($this->fuzzySearch) {
                // Append "~" to every word so each one is parsed as a fuzzy term.
                $this->query = str_replace(' ', '~ ', $this->query) . '~';
                \Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength(3);
            }

            $parsedUserQuery = \Zend_Search_Lucene_Search_QueryParser::parse($this->query, 'utf-8');
            $booleanQuery->addSubquery($parsedUserQuery, TRUE);

            $this->addLanguageQuery($booleanQuery);
            $this->addCountryQuery($booleanQuery);
            $this->addCategoryQuery($booleanQuery);
            $this->addRestrictionQuery($booleanQuery);

            $validHits = $this->getValidHits($this->frontendIndex->find($booleanQuery));

            // Inclusive index range of the hits on the current page.
            $firstIndex = $this->perPage * ($this->currentPage - 1);
            $lastIndex  = min($firstIndex + ($this->perPage - 1), count($validHits) - 1);

            for ($position = $firstIndex; $position <= $lastIndex; $position++) {
                $hit = $validHits[$position];

                /** @var \Zend_Search_Lucene_Document $doc */
                $doc = $hit->getDocument();

                $urlField     = $doc->getField('url');
                $titleField   = $doc->getField('title');
                $contentField = $doc->getField('content');

                $searchResult            = array();
                $searchResult['boost']   = $doc->boost;
                $searchResult['title']   = $titleField->value;
                $searchResult['url']     = $urlField->value;
                $searchResult['summary'] = $searcher->getSummaryForUrl($contentField->value, $this->untouchedQuery);

                // H1, description and imageTags are not available in pdf files;
                // the Lucene exception raised for a missing field is ignored.
                try {
                    if ($doc->getField('h1')) {
                        $searchResult['h1'] = $doc->getField('h1')->value;
                    }

                    if ($doc->getField('description')) {
                        $searchResult['description'] = $searcher->getSummaryForUrl(
                            $doc->getField('description')->value,
                            $this->untouchedQuery
                        );
                    }

                    if ($doc->getField('imageTags')) {
                        $searchResult['imageTags'] = $doc->getField('imageTags')->value;
                    }
                } catch (\Zend_Search_Lucene_Exception $e) {
                }

                // One attempt to read the hit's 'cat' field per configured category.
                foreach ($this->categories as $configuredCategory) {
                    try {
                        $searchResult['categories'][] = $hit->getDocument()->getField('cat')->value;
                    } catch (\Zend_Search_Lucene_Exception $e) {
                    }
                }

                $searchResults[] = $searchResult;
            }
        }

        $suggestions = $this->fuzzySearch
            ? $this->getFuzzySuggestions($searchResults)
            : FALSE;

        $totalHits = count($validHits);

        $currentPageResultStart = $this->perPage * ($this->currentPage - 1);
        $currentPageResultEnd   = min($currentPageResultStart + $this->perPage, $totalHits);

        $pages = ($totalHits > 0) ? ceil($totalHits / $this->perPage) : 0;

        $this->view->assign([
            'searchCurrentPage'            => $this->currentPage,
            'searchAllPages'               => $pages,
            'searchCategory'               => $this->category,
            'searchAvailableCategories'    => $this->categories,
            'searchSuggestions'            => $suggestions,
            'searchLanguage'               => $this->searchLanguage,
            'searchCountry'                => $this->searchCountry,
            'searchPerPage'                => $this->perPage,
            'searchTotalHits'              => $totalHits,
            'searchQuery'                  => $this->untouchedQuery,
            'searchHasResults'             => count($searchResults) > 0,
            'searchResults'                => $searchResults,
            'searchCurrentPageResultStart' => $currentPageResultStart + 1,
            'searchCurrentPageResultEnd'   => $currentPageResultEnd,
        ]);
    } catch (\Exception $e) {
        \Pimcore\Logger::debug('An Exception occurred during search: ' . $e->getMessage());

        $this->view->assign([
            'searchResults'    => [],
            'searchHasResults' => FALSE,
        ]);
    }

    $viewScript = $this->getParam('viewScript')
        ? $this->_getParam('viewScript')
        : '/search/find.php';

    $this->renderScript($viewScript);
}
/**
 * Highlight the matches that are specific to this query.
 *
 * Highlighting only happens when the word contains no wildcard ('*' or '?')
 * and the default analyzer yields exactly one token for it; in that case the
 * work is delegated to a fuzzy query built from that token.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter to apply the matches to
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    // Split the word on wildcard characters, using the unicode-aware pattern
    // when PCRE supports it.
    $unicodeAvailable = (@preg_match('/\\pL/u', 'a') == 1);
    if ($unicodeAvailable) {
        $utf8Word = iconv($this->_encoding, 'UTF-8', $this->_word);
        $pieces   = preg_split('/[*?]/u', $utf8Word);
    } else {
        $pieces = preg_split('/[*?]/', $this->_word);
    }

    // More than one piece means the word contains a wildcard.
    $hasWildcard = count($pieces) > 1;
    if ($hasWildcard) {
        return;
    }

    $analyzer   = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $tokenList  = $analyzer->tokenize($this->_word, $this->_encoding);
    $tokenTotal = count($tokenList);

    // Only a word that analyzes to exactly one token can be highlighted.
    if ($tokenTotal == 1) {
        $matchTerm  = new Zend_Search_Lucene_Index_Term($tokenList[0]->getTermText(), $this->_field);
        $delegate   = new Zend_Search_Lucene_Search_Query_Fuzzy($matchTerm, $this->_minimumSimilarity);
        $delegate->_highlightMatches($highlighter);
    }
}