/**
 * Runs a fuzzy search for the "query" request parameter and renders the search page.
 *
 * The query string is tokenized with the default analyzer; every token becomes an
 * optional fuzzy subquery (minimum similarity 0.5) of one boolean query.
 *
 * @param Request $request Request carrying the "query" parameter.
 *
 * @return mixed Whatever the 'twig' service returns from render().
 */
public function searchAction(Request $request)
{
    $queryString = $request->get('query');

    // Count total documents indexed.
    $numDocs = $this->get('zendsearch')->numDocs();

    // Build the query from analyzer tokens instead of using the query parser.
    $queryTokens = Analyzer::getDefault()->tokenize($queryString, 'UTF-8');
    $query = new Search\Query\Boolean();
    foreach ($queryTokens as $token) {
        $query->addSubquery(new Search\Query\Fuzzy(new Index\Term($token->getTermText()), 0.5), null);
    }

    // Process query.
    $results = $this->get('zendsearch')->find($query);

    // Sort results by descending score (MultiSearch does not sort the results
    // between the different indices). create_function() no longer exists in
    // PHP 8, and usort() expects an integer (<0, 0, >0), not a boolean.
    usort($results, function ($a, $b) {
        if ($a->score == $b->score) {
            return 0;
        }

        return $a->score < $b->score ? 1 : -1;
    });

    // Pagination and entity hydration of the hits are currently not performed here;
    // the raw hits are handed to the template.
    return $this->get('twig')->render('admin/search.html.twig', array(
        'query' => $queryString,
        'numDocs' => $numDocs,
        'results' => $results,
    ));
}
/**
 * setHighlighterAnalyzer() must add the configured filter to the analyzer
 * and install that analyzer as the default one.
 */
public function testSetHighlighterAnalyzer()
{
    $tokenFilterMock = m::mock('ZendSearch\\Lucene\\Analysis\\TokenFilter\\TokenFilterInterface');
    $this->app->instance('filterClass1', $tokenFilterMock);

    // The filter resolved from the container has to be attached exactly once.
    $this->analyzer
        ->shouldReceive('addFilter')
        ->with($tokenFilterMock)
        ->once();

    $this->config->setHighlighterAnalyzer();

    $installedAnalyzer = Analyzer::getDefault();
    $this->assertEquals($this->analyzer, $installedAnalyzer);
}
/**
 * Get index
 *
 * Opens (or creates) the Lucene index on first use and caches it in self::$index.
 * The first call also installs the default analyzer (with optional stop-word and
 * Morphy filters), the result set limit and the query parser encoding.
 *
 * @return \ZendSearch\Lucene\Index
 * @throws \InvalidArgumentException When a configured stop-words or morphy dictionaries path cannot be resolved.
 */
private function index()
{
    if (!isset(self::$index)) {
        $analyzer = new CaseInsensitive();

        if ($this->config()->exists('zend_search', 'stop_words')) {
            $stop_word_filter = new StopWords();
            $words = $this->getRealPath($this->config()->get('zend_search', 'stop_words'));
            if ($words !== false) {
                $stop_word_filter->loadFromFile($words);
            } else {
                throw new \InvalidArgumentException('Path not found');
            }
            $analyzer->addFilter($stop_word_filter);
        }

        if ($this->config()->exists('zend_search', 'morphy_dicts')) {
            $morphy_dicts = $this->getRealPath($this->config()->get('zend_search', 'morphy_dicts'));
            if ($morphy_dicts !== false) {
                $analyzer->addFilter(new Morphy($morphy_dicts, $this->config()->getCharset()));
            } else {
                throw new \InvalidArgumentException('Path not found');
            }
        }

        Analyzer::setDefault($analyzer);
        Lucene::setResultSetLimit($this->limit);
        QueryParser::setDefaultEncoding($this->config()->getCharset());

        // Fixed: this used to read "$this->config() - get(...)", i.e. a subtraction
        // involving a call to an undefined global function get().
        $index = $this->config()->get('zend_search', 'index');
        $path = $this->getRealPath($index);

        // Open the index when its path resolves, otherwise create it at the configured location.
        self::$index = $path ? Lucene::open($path) : Lucene::create($index);
    }

    return self::$index;
}
/**
 * {@inheritdoc}
 *
 * Sets the default analyzer and query encoding, then registers the
 * 'zendsearch.*' services: the configured index paths, a run-once initializer
 * that opens or creates each index, the resulting index collection, a
 * multi-searcher over all indices, and the 'zendsearch' alias for it.
 */
public function register(Application $app)
{
    Analyzer::setDefault(new CaseInsensitive());
    QueryParser::setDefaultEncoding('UTF-8');

    $app['zendsearch.indices_path'] = array();

    $app['zendsearch.indices.initializer'] = $app->protect(function () use ($app) {
        // Remember across calls whether the indices were already opened.
        static $alreadyRun = false;

        if (!$alreadyRun) {
            $alreadyRun = true;

            $opened = array();
            foreach ($app['zendsearch.indices_path'] as $name => $location) {
                if (file_exists($location)) {
                    $opened[$name] = Lucene::open($location);
                } else {
                    $opened[$name] = Lucene::create($location);
                }
            }

            $app['zendsearch.indices_collection'] = $opened;
        }
    });

    $app['zendsearch.indices'] = $app->share(function ($app) {
        $app['zendsearch.indices.initializer']();

        return $app['zendsearch.indices_collection'];
    });

    $app['zendsearch.multisearcher'] = $app->share(function ($app) {
        $app['zendsearch.indices.initializer']();

        $searcher = new MultiSearcher();
        foreach ($app['zendsearch.indices'] as $luceneIndex) {
            $searcher->addIndex($luceneIndex);
        }

        return $searcher;
    });

    $app['zendsearch'] = $app->share(function ($app) {
        return $app['zendsearch.multisearcher'];
    });
}
/**
 * Stores the collaborators, starts with an empty index list and installs a
 * CaseInsensitive analyzer as the Lucene default.
 *
 * @param DB     $db
 * @param Core   $core
 * @param string $indexesBasePath
 */
public function __construct(DB $db, Core $core, $indexesBasePath)
{
    // Global Lucene setting: affects every index used afterwards.
    Analyzer::setDefault(new CaseInsensitive());

    $this->indexes = array();
    $this->indexesBasePath = $indexesBasePath;
    $this->core = $core;
    $this->db = $db;
}
/**
 * Stores the configuration, creates a Filesystem helper and adjusts two global
 * Lucene settings: the default analyzer and the wildcard minimum prefix length.
 *
 * @param string                        $path                  Path to search index
 * @param NodeTypeManagerInterface|null $nodeTypeManager
 * @param bool                          $hideDestructException
 */
public function __construct($path, NodeTypeManagerInterface $nodeTypeManager = null, $hideDestructException = false)
{
    $this->hideDestructException = $hideDestructException;
    $this->nodeTypeManager = $nodeTypeManager;
    $this->path = $path;
    $this->filesystem = new Filesystem();

    // Allow wildcard patterns without any fixed prefix characters.
    Wildcard::setMinPrefixLength(0);
    Analyzer::setDefault(new ExactMatchAnalyzer());
}
/**
 * Installs the analyzer used when highlighting words (not when indexing).
 *
 * The analyzer is resolved from the container, receives one filter instance
 * per configured filter class and becomes the Lucene default analyzer.
 */
public function setHighlighterAnalyzer()
{
    /** @var AbstractCommon $highlightAnalyzer */
    $highlightAnalyzer = App::make(AbstractCommon::class);

    foreach ($this->filterClasses as $class) {
        $filter = App::make($class);
        $highlightAnalyzer->addFilter($filter);
    }

    Analyzer::setDefault($highlightAnalyzer);
}
/**
 * Installs the analyzer used when highlighting words (not when indexing).
 *
 * The analyzer is resolved from the container by class name, receives one
 * filter instance per configured filter class and becomes the Lucene default.
 */
public function setHighlighterAnalyzer()
{
    /** @var AbstractCommon $highlightAnalyzer */
    $highlightAnalyzer = App::make('ZendSearch\\Lucene\\Analysis\\Analyzer\\Common\\AbstractCommon');

    foreach ($this->filterClasses as $class) {
        $filter = App::make($class);
        $highlightAnalyzer->addFilter($filter);
    }

    Analyzer::setDefault($highlightAnalyzer);
}
/**
 * Chooses the default analyzer from the caseSensitivity flag, normalizes the
 * index directory (resolving Yii aliases) and loads the Lucene index from it.
 */
public function init()
{
    $analyzer = $this->caseSensitivity ? new Utf8() : new CaseInsensitive();
    Analyzer::setDefault($analyzer);

    $resolvedDirectory = Yii::getAlias($this->indexDirectory);
    $this->indexDirectory = FileHelper::normalizePath($resolvedDirectory);

    $this->luceneIndex = $this->getLuceneIndex($this->indexDirectory);
}
/**
 * Sets the query encoding to UTF-8, chooses the default analyzer from the
 * caseSensitivity and parseNumeric flags, normalizes the index directory
 * (resolving Yii aliases) and loads the Lucene index from it.
 */
public function init()
{
    QueryParser::setDefaultEncoding('UTF-8');

    if ($this->caseSensitivity) {
        if ($this->parseNumeric) {
            $analyzer = new Utf8Num();
        } else {
            $analyzer = new Utf8();
        }
    } elseif ($this->parseNumeric) {
        $analyzer = new CaseInsensitiveNum();
    } else {
        $analyzer = new CaseInsensitive();
    }
    Analyzer::setDefault($analyzer);

    $resolvedDirectory = Yii::getAlias($this->indexDirectory);
    $this->indexDirectory = FileHelper::normalizePath($resolvedDirectory);

    $this->luceneIndex = $this->getLuceneIndex($this->indexDirectory);
}
/**
 * The default analyzer implements AnalyzerInterface and can be swapped for
 * another instance via Analyzer::setDefault().
 */
public function testAnalyzer()
{
    $originalAnalyzer = Analyzer::getDefault();
    $this->assertTrue($originalAnalyzer instanceof AnalyzerInterface);

    $replacement = new Common\Utf8Num();
    Analyzer::setDefault($replacement);

    $installed = Analyzer::getDefault();
    $this->assertTrue($installed === $replacement);

    // Put the previous analyzer back because other tests rely on it.
    Analyzer::setDefault($originalAnalyzer);
}
/**
 * Declares the module routes, configures Lucene defaults, registers the
 * 'index' service and stamps the current CMS page (if any) as modified now.
 *
 * @param FrontendController $frontendController
 * @param string             $moduleName
 */
public function __construct(FrontendController $frontendController, $moduleName)
{
    parent::__construct($frontendController, $moduleName);

    $routes = array();
    $routes['/'] = array('GET' => 'showSearchResults');
    $routes['/rebuild-index'] = array('GET' => 'rebuildIndex');
    $this->controllerRoutes = $routes;

    Analyzer::setDefault(new CaseInsensitive());
    QueryParser::setDefaultEncoding('UTF-8');

    $this->registerService('index', 'generateIndex');

    $currentPage = $frontendController->getCmsPage();
    if ($currentPage === null) {
        return;
    }

    $now = date('Y-m-d H:i:s');
    $currentPage->setLastModified($now);
}
/**
 * Returns the ZendSearch Lucene index for this instance, opening it on first use.
 *
 * If opening fails with a ZendSearch exception the index is created instead.
 * After a successful open/create the default analyzer is set to the
 * case-insensitive Utf8Num analyzer.
 *
 * @return \ZendSearch\Lucene\Index
 *
 * @throws \Exception      When an ErrorException occurs and the index path does not exist.
 * @throws \ErrorException When an ErrorException occurs although the index path exists.
 */
protected function getIndex()
{
    if ($this->index) {
        return $this->index;
    }

    $basePath = rtrim(Config::get('search.connections.zend.path'), '/');
    $indexPath = $basePath . '/' . $this->name;

    try {
        $this->index = \ZendSearch\Lucene\Lucene::open($indexPath);
    } catch (\ZendSearch\Exception\ExceptionInterface $openFailure) {
        // Nothing usable at that location yet: start a fresh index.
        $this->index = \ZendSearch\Lucene\Lucene::create($indexPath);
    } catch (\ErrorException $error) {
        if (file_exists($indexPath)) {
            throw $error;
        }

        throw new \Exception("'path' directory does not exist for the 'zend' search driver: '" . $basePath . "'");
    }

    $analyzer = new \ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8Num\CaseInsensitive();
    \ZendSearch\Lucene\Analysis\Analyzer\Analyzer::setDefault($analyzer);

    return $this->index;
}
/**
 * Opens the Lucene index, or recreates it when the version marker is missing.
 *
 * The index folder is considered current when it contains a 'v0.6.0' node;
 * otherwise the folder is deleted, a new index is created and the marker
 * file is written.
 *
 * @throws SetUpException
 */
public function openOrCreate()
{
    $folder = $this->files->setUpIndexFolder();
    $localPath = $folder->getStorage()->getLocalFolder($folder->getInternalPath());

    // Original intent: let lucene search for numbers as well as words.
    // NOTE(review): depends on which CaseInsensitive class is imported - confirm.
    Analyzer::setDefault(new CaseInsensitive());

    if (!$folder->nodeExists('v0.6.0')) {
        // No marker for the expected index version: rebuild from scratch.
        $this->logger->info('recreating outdated lucene index');
        $folder->delete();
        $this->index = Lucene::create($localPath);
        $folder->newFile('v0.6.0');

        return;
    }

    // Correct index present.
    $this->index = Lucene::open($localPath);
}
/**
 * Searches an index for the expression and returns a paginated result.
 *
 * @param string      $expression Search text; must be non-empty.
 * @param int         $page       Page number handed to the paginator.
 * @param mixed       $conditions Extra conditions forwarded to prepareQuery().
 * @param string|null $indexName  Index to search; falls back to the configured default index parameter.
 *
 * @return array|false Array with 'expression', 'pagination' and 'pgdata', or
 *                     false when the expression is at most 2 characters long.
 *
 * @throws NotFoundHttpException When the expression is empty.
 */
public function search($expression, $page = 1, $conditions = null, $indexName = null)
{
    if (!$expression) {
        throw new NotFoundHttpException('Empty expression');
    }

    if (!$indexName) {
        $defaultIndexParameter = 'symbio_fulltext_search.' . Crawler::DEFAULT_INDEX_PARAM;
        $indexName = $this->kernel->getContainer()->getParameter($defaultIndexParameter);
    }

    // Expressions of one or two characters are not searched at all.
    if (mb_strlen($expression, 'utf-8') <= 2) {
        return false;
    }

    $luceneIndex = $this->indexManager->getIndex($indexName);
    Analyzer::setDefault(new CaseInsensitive());

    $hits = $luceneIndex->find($this->prepareQuery($expression, $conditions));

    // Pagination.
    $paginator = $this->kernel->getContainer()->get('knp_paginator');
    $pagination = $paginator->paginate(
        $hits,
        $page,
        $this->kernel->getContainer()->getParameter('symbio_fulltext_search.items_on_page')
    );

    $result = array();
    $result['expression'] = $expression;
    $result['pagination'] = $pagination;
    $result['pgdata'] = $pagination->getPaginationData();

    return $result;
}
/**
 * Gets the index mapped by the given lucene identifier.
 *
 * The index is created when checkPath() rejects the configured path and
 * opened otherwise. The call also applies the identifier's configuration:
 * default analyzer, buffering/merge settings, default file permissions,
 * optional optimization and the query parser encoding.
 *
 * @param string $identifier The lucene identifier.
 *
 * @return \ZendSearch\Lucene\Index The lucene index.
 */
public function getIndex($identifier)
{
    $settings = $this->getConfig($identifier);
    $location = $settings['path'];

    // Store the index right away so it stays registered even if a later step fails.
    if ($this->checkPath($location)) {
        $index = $this->indexes[$identifier] = Lucene::open($location);
    } else {
        $index = $this->indexes[$identifier] = Lucene::create($location);
    }

    $analyzerClass = $settings['analyzer'];
    Analyzer::setDefault(new $analyzerClass());

    $index->setMaxBufferedDocs($settings['max_buffered_docs']);
    $index->setMaxMergeDocs($settings['max_merge_docs']);
    $index->setMergeFactor($settings['merge_factor']);

    ZfFilesystem::setDefaultFilePermissions($settings['permissions']);

    if ($settings['auto_optimized']) {
        $index->optimize();
    }

    QueryParser::setDefaultEncoding($settings['query_parser_encoding']);

    return $this->indexes[$identifier];
}
/**
 * Lists all Post models.
 *
 * On every call the Lucene index under data/posts_index is rebuilt from all
 * posts, then queried with one term per space-separated word of the "q"
 * query parameter.
 *
 * @return mixed
 */
public function actionIndex()
{
    $searchModel = new PostSearch();
    $dataProvider = $searchModel->search(Yii::$app->request->post());

    setlocale(LC_CTYPE, 'ru_RU.UTF-8');

    Lucene\Search\QueryParser::setDefaultEncoding('UTF-8');
    Lucene\Analysis\Analyzer\Analyzer::setDefault(
        new Lucene\Analysis\Analyzer\Common\Utf8\CaseInsensitive()
    );
    Lucene\Lucene::setResultSetLimit(10);

    // Create the blog posts index located in data/posts_index; the folder must be writable.
    $index = Lucene\Lucene::create('data/posts_index');

    // Add one document per post to the index.
    foreach (Post::find()->all() as $post) {
        $document = new Lucene\Document();
        $document->addField(Lucene\Document\Field::UnIndexed('entry_id', $post->id));
        $document->addField(Lucene\Document\Field::Keyword('title', $post->title));
        $document->addField(Lucene\Document\Field::text('contents', $post->content));
        $index->addDocument($document);
    }

    // Commit the index.
    $index->commit();

    // Split the search string into individual words; each one becomes a query term.
    $rawSearch = Yii::$app->getRequest()->getQueryParam('q');
    $words = explode(' ', urldecode($rawSearch));

    $query = new Lucene\Search\Query\MultiTerm();
    foreach ($words as $word) {
        $query->addTerm(new Lucene\Index\Term($word));
    }

    // Open and query the index.
    $index = Lucene\Lucene::open('data/posts_index');
    $results = $index->find($query);

    $viewData = [
        'searchModel' => $searchModel,
        'dataProvider' => $dataProvider,
        'search' => $results,
        'query' => $query,
    ];

    return $this->render('index', $viewData);
}
/**
 * Generate the index file.
 *
 * Creates a new Lucene index under CACHE_DIR and adds one document (title,
 * url, tag-stripped rendered contents) for every listed page that is
 * readable and not hidden, then optimizes the index.
 */
public static function updateIndex()
{
    // Lazily create the tagger; it is kept in self::$igo.
    if (empty(self::$igo)) {
        $taggerOptions = array('dict_dir' => LIB_DIR . 'ipadic', 'reduce_mode' => true);
        self::$igo = new Tagger($taggerOptions);
    }

    Analyzer::setDefault(new Utf8());

    // Create the index.
    $luceneIndex = Lucene::create(CACHE_DIR . self::INDEX_NAME);

    foreach (Listing::pages() as $pageName) {
        if (empty($pageName)) {
            continue;
        }

        $wiki = Factory::Wiki($pageName);

        // Skip pages that may not be read or that are hidden.
        $visible = $wiki->isReadable() && !$wiki->isHidden();
        if (!$visible) {
            continue;
        }

        $document = new LuceneDoc();
        $document->addField(Field::Text('title', $wiki->title()));

        // Store document URL to identify it in the search results.
        $document->addField(Field::Text('url', $wiki->uri()));

        // Index document contents (rendered page with tags stripped; not stored).
        $plainText = strip_tags($wiki->render());
        $document->addField(Field::UnStored('contents', $plainText));

        // Register the document in the index.
        $luceneIndex->addDocument($document);
    }

    $luceneIndex->optimize();
}
/**
 * Adds a document to this segment.
 *
 * For every field of the document this records term postings in
 * $this->_termDictionary / $this->_termDocs (keyed by term key, then by the
 * current $this->_docCount), computes a one-byte norm for indexed fields,
 * registers the field via addField() and finally hands the stored fields to
 * addStoredFields().
 *
 * Tokenized fields are run through the default analyzer; non-tokenized
 * indexed fields are indexed as a single term holding the whole UTF-8 value.
 * Fields that produce no tokens / have an empty value are cloned and treated
 * as non-indexed, non-tokenized.
 *
 * @param \ZendSearch\Lucene\Document $document
 * @throws LuceneException\UnsupportedMethodCallException When a field requests term vector storage.
 */
public function addDocument(Document $document)
{
    $storedFields = array();
    // Norm byte per field name for this document only.
    $docNorms = array();
    $similarity = AbstractSimilarity::getDefault();

    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            if ($field->isTokenized) {
                $analyzer = Analyzer\Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                // Running term position within this field and number of tokens seen.
                $position = 0;
                $tokenCounter = 0;
                while (($token = $analyzer->nextToken()) !== null) {
                    $tokenCounter++;

                    $term = new Index\Term($token->getTermText(), $field->name);
                    $termKey = $term->key();

                    if (!isset($this->_termDictionary[$termKey])) {
                        // New term
                        $this->_termDictionary[$termKey] = $term;
                        $this->_termDocs[$termKey] = array();
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                        // Existing term, but new term entry
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    }
                    // Advance by the token's own increment before recording the position.
                    $position += $token->getPositionIncrement();
                    $this->_termDocs[$termKey][$this->_docCount][] = $position;
                }

                if ($tokenCounter == 0) {
                    // Field contains empty value. Treat it as non-indexed and non-tokenized
                    // (a clone is modified so the caller's field object stays untouched).
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                } else {
                    // Norm = length norm for the token count, scaled by document and field boost.
                    $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, $tokenCounter) * $document->boost * $field->boost));
                }
            } elseif (($fieldUtf8Value = $field->getUtf8Value()) == '') {
                // Field contains empty value. Treat it as non-indexed and non-tokenized
                $field = clone $field;
                $field->isIndexed = $field->isTokenized = false;
            } else {
                // Non-tokenized field: the whole value is indexed as one term.
                $term = new Index\Term($fieldUtf8Value, $field->name);
                $termKey = $term->key();

                if (!isset($this->_termDictionary[$termKey])) {
                    // New term
                    $this->_termDictionary[$termKey] = $term;
                    $this->_termDocs[$termKey] = array();
                    $this->_termDocs[$termKey][$this->_docCount] = array();
                } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                    // Existing term, but new term entry
                    $this->_termDocs[$termKey][$this->_docCount] = array();
                }
                $this->_termDocs[$termKey][$this->_docCount][] = 0; // position

                // Norm for a field of exactly one term.
                $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, 1) * $document->boost * $field->boost));
            }
        }

        if ($field->isStored) {
            $storedFields[] = $field;
        }

        $this->addField($field);
    }

    // Append one norm byte for this document to every indexed field of the segment.
    foreach ($this->_fields as $fieldName => $field) {
        if (!$field->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$fieldName])) {
            // First norm data for this field: pad with the zero-length norm, one byte per $this->_docCount.
            $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount);
        }

        if (isset($docNorms[$fieldName])) {
            $this->_norms[$fieldName] .= $docNorms[$fieldName];
        } else {
            // This document has no indexed value for the field: use the zero-length norm.
            $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
        }
    }

    // NOTE(review): $this->_docCount is not changed here; presumably addStoredFields() advances it - confirm.
    $this->addStoredFields($storedFields);
}
/**
 * A query term that the default analyzer filters out completely (a long
 * number) must not prevent the rest of the query from matching.
 */
public function testFilteredTokensQueryParserProcessing()
{
    $index = Lucene\Lucene::open(__DIR__ . '/_index23Sample/_files');

    // The numeric term yields no tokens with the default analyzer.
    $numberTokens = \ZendSearch\Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize('123456787654321');
    $this->assertEquals(count($numberTokens), 0);

    $hits = $index->find('"PEAR developers" AND Home AND 123456787654321');
    $this->assertEquals(count($hits), 1);

    // Expected rows: id, score, path.
    $expected = array(
        array(1, 0.16827, 'IndexSource/contributing.wishlist.html'),
    );

    foreach ($hits as $position => $hit) {
        list($expectedId, $expectedScore, $expectedPath) = $expected[$position];

        $this->assertEquals($hit->id, $expectedId);
        $scoreDelta = abs($hit->score - $expectedScore);
        $this->assertTrue($scoreDelta < 1.0E-6);
        $this->assertEquals($hit->path, $expectedPath);
    }
}
/**
 * Query specific matches highlighting
 *
 * Tokenizes the document's 'body' field with the default analyzer and
 * highlights every token whose text lies between the lower and upper term
 * (bounds included when the range is inclusive). A missing bound is open.
 *
 * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $body = $highlighter->getDocument()->getFieldUtf8Value('body');
    $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($body, 'UTF-8');

    $lowerTermText = null;
    if ($this->_lowerTerm !== null) {
        $lowerTermText = $this->_lowerTerm->text;
    }

    $upperTermText = null;
    if ($this->_upperTerm !== null) {
        $upperTermText = $this->_upperTerm->text;
    }

    $inclusive = $this->_inclusive;
    $words = array();

    foreach ($tokens as $token) {
        $text = $token->getTermText();

        if ($inclusive) {
            $aboveLower = $lowerTermText == null || $lowerTermText <= $text;
            $belowUpper = $upperTermText == null || $text <= $upperTermText;
        } else {
            $aboveLower = $lowerTermText == null || $lowerTermText < $text;
            $belowUpper = $upperTermText == null || $text < $upperTermText;
        }

        if ($aboveLower && $belowUpper) {
            $words[] = $text;
        }
    }

    $highlighter->highlight($words);
}
/**
 * Query specific matches highlighting
 *
 * Tokenizes the document's 'body' field with the default analyzer and
 * highlights every token that starts with the query term's fixed prefix and
 * whose similarity to the query term exceeds $this->_minimumSimilarity.
 * Similarity is derived from the levenshtein() distance between the parts
 * after the prefix.
 *
 * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $words = array();
    // Fixed (non-fuzzy) prefix of the query term, with its byte and character lengths.
    $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
    $prefixByteLength = strlen($prefix);
    $prefixUtf8Length = Index\Term::getLength($prefix);
    // NOTE(review): $termLength is not used anywhere in this method.
    $termLength = Index\Term::getLength($this->_term->text);
    // Part of the query term after the prefix; this is what gets compared fuzzily.
    $termRest = substr($this->_term->text, $prefixByteLength);
    // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
    $termRestLength = strlen($termRest);
    // NOTE(review): $scaleFactor is not used anywhere in this method.
    $scaleFactor = 1 / (1 - $this->_minimumSimilarity);
    $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
    $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
    foreach ($tokens as $token) {
        $termText = $token->getTermText();
        // Only tokens sharing the fixed prefix are candidates.
        if (substr($termText, 0, $prefixByteLength) == $prefix) {
            // Calculate similarity
            $target = substr($termText, $prefixByteLength);
            // Use the cached max distance for this target length when available.
            $maxDistance = isset($this->_maxDistances[strlen($target)]) ? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target));
            if ($termRestLength == 0) {
                // we don't have anything to compare. That means if we just add
                // the letters for current term we get the new word
                $similarity = $prefixUtf8Length == 0 ? 0 : 1 - strlen($target) / $prefixUtf8Length;
            } elseif (strlen($target) == 0) {
                // The token equals the prefix: all of the query term's rest would have to be removed.
                $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
            } elseif ($maxDistance < abs($termRestLength - strlen($target))) {
                //just adding the characters of term to target or vice-versa results in too many edits
                //for example "pre" length is 3 and "prefixes" length is 8. We can see that
                //given this optimal circumstance, the edit distance cannot be less than 5.
                //which is 8-3 or more precisely abs(3-8).
                //if our maximum edit distance is 4, then we can discard this word
                //without looking at it.
                $similarity = 0;
            } else {
                // Edit distance normalized by prefix length plus the shorter of the two rests.
                $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, strlen($target)));
            }
            if ($similarity > $this->_minimumSimilarity) {
                $words[] = $termText;
            }
        }
    }
    $highlighter->highlight($words);
}
/**
 * Highlight text using specified View helper or callback function.
 *
 * The words are tokenized with the default analyzer and every matching term
 * inside /html/body of the wrapped document is passed through the callback.
 *
 * @param string|array $words    Words to highlight. Words could be organized using the array or string.
 * @param callback     $callback Callback method, used to transform (highlighting) text.
 * @param array        $params   Array of additional callback parameters passed through into it
 *                               (first non-optional parameter is an HTML fragment for highlighting)
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException When $callback is not callable.
 * @return string HTML of the document (highlighted when there was something to highlight).
 */
public function highlightExtended($words, $callback, $params = array())
{
    if (!is_array($words)) {
        $words = array($words);
    }

    // Collect the tokens of all word strings into one list. Appending directly
    // (instead of array_merge via call_user_func_array) also copes with an
    // empty $words array.
    $wordsToHighlight = array();
    $analyzer = Analyzer\Analyzer::getDefault();
    foreach ($words as $wordString) {
        foreach ($analyzer->tokenize($wordString, $this->_doc->encoding) as $token) {
            $wordsToHighlight[] = $token;
        }
    }

    if (count($wordsToHighlight) == 0) {
        return $this->_doc->saveHTML();
    }

    // Term text => token index, for quick lookup while walking the DOM.
    $wordsToHighlightFlipped = array();
    foreach ($wordsToHighlight as $id => $token) {
        $wordsToHighlightFlipped[$token->getTermText()] = $id;
    }

    if (!is_callable($callback)) {
        throw new InvalidArgumentException('$callback parameter must be a View Helper name, View Helper object or callback.');
    }

    $xpath = new \DOMXPath($this->_doc);
    $matchedNodes = $xpath->query("/html/body");
    foreach ($matchedNodes as $matchedNode) {
        $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
    }

    // Return the highlighted HTML, matching the documented return type and the early-return path above.
    return $this->_doc->saveHTML();
}
/**
 * Returns the Lucene index, opening it (or creating it when opening fails
 * with a Lucene RuntimeException) on first use.
 *
 * The first call also sets the query encoding, the default analyzer and the
 * default boolean operator (AND) of the query parser.
 *
 * @return \ZendSearch\Lucene\Index
 */
protected function getIndex()
{
    if ($this->index == null) {
        \ZendSearch\Lucene\Search\QueryParser::setDefaultEncoding('utf-8');
        \ZendSearch\Lucene\Analysis\Analyzer\Analyzer::setDefault(
            new \ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8Num\CaseInsensitive()
        );
        \ZendSearch\Lucene\Search\QueryParser::setDefaultOperator(
            \ZendSearch\Lucene\Search\QueryParser::B_AND
        );

        try {
            $opened = \ZendSearch\Lucene\Lucene::open($this->getIndexPath());
        } catch (\ZendSearch\Lucene\Exception\RuntimeException $openFailure) {
            // Could not open an existing index: create a new one at the same path.
            $opened = \ZendSearch\Lucene\Lucene::create($this->getIndexPath());
        }

        $this->index = $opened;

        return $opened;
    }

    return $this->index;
}
/**
 * Opens the index at the configured path (creating it when checkIndexPath()
 * rejects the path), applies the Lucene defaults and tuning options, and
 * optimizes the index before returning it.
 */
private function getIndex() : SearchIndexInterface
{
    $indexPath = $this->getIndexPath();

    $luceneIndex = $this->checkIndexPath($indexPath)
        ? Lucene::open($indexPath)
        : Lucene::create($indexPath);

    // Process-wide Lucene defaults.
    Analyzer::setDefault(new CaseInsensitive());
    LuceneFilesystem::setDefaultFilePermissions(0775);
    QueryParser::setDefaultEncoding('UTF-8');

    // Per-index tuning taken from the options.
    $options = $this->options;
    $luceneIndex->setMaxBufferedDocs($options['max_buffered_docs']);
    $luceneIndex->setMaxMergeDocs($options['max_merge_docs']);
    $luceneIndex->setMergeFactor($options['merge_factor']);

    $luceneIndex->optimize();

    return $luceneIndex;
}
/**
 * Looks up highlight coordinates for the 'hl' route parameter in the page's Lucene index.
 *
 * The image and index locations are derived from the resolved parameters
 * (ODW file layout). Each token of the highlight text, except the Lucene
 * operators and/or/not, is stemmed and searched (as a prefix wildcard when
 * the stem is long enough); every hit contributes one [x1, y1, x2, y2] entry.
 *
 * @return array Array with keys "imgloc", "indloc" and "coords". "coords" is
 *               empty when reel, page or hl is missing or the index is absent.
 */
public function sortOutHlCoords()
{
    //Lucene operators
    $operators = array("and", "or", "not");

    $config = $this->getServiceLocator()->get('config');
    $paramInfo = $this->sortOutParams($config);

    //collect building blocks
    $resLoc = $paramInfo['resLoc'];
    $site = $paramInfo['site'];
    $collection = $paramInfo['collection'];
    $container = $paramInfo['container'];
    $reel = $paramInfo['reel'];
    $page = $paramInfo['page'];

    //the all important query
    $hl = $this->params()->fromRoute('hl', '');

    //coordinates to pass back
    $coords = [];

    //pass back empty coordinate set if any of these parameters are missing
    if ($this->isNullOrEmpty($reel) || $this->isNullOrEmpty($page) || $this->isNullOrEmpty($hl)) {
        return array("imgloc" => '', "indloc" => '', "coords" => $coords);
    }

    //location of files - ODW file layout
    $resLoc .= '/' . $site . '/' . $collection . '/' . $container . '/' . $reel . '/odw/' . $page . '/';
    $imgLoc = $resLoc . '../../' . $page . '.jpg';
    $iaLoc = $resLoc . 'ia/' . $page . '.jpg';

    //not all images will have IA derivative
    if (file_exists($iaLoc)) {
        $imgLoc = $iaLoc;
    }

    $indLoc = $resLoc . 'index/imgworks';

    //need index directory and segments file to be valid lucene layout
    if (!file_exists($indLoc . '/segments.gen')) {
        return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
    }

    //use Lucene tokens for searching
    $searchTerms = array();
    $queryTokens = Analyzer\Analyzer::getDefault()->tokenize($hl);
    foreach ($queryTokens as $token) {
        $searchTerm = $token->getTermText();
        if (in_array($searchTerm, $operators, true)) {
            continue;
        }

        //no snowball analyzer or other stemming option
        //in Lucene 2.x, so create stem separately
        $stem = stem_english($searchTerm);

        //Lucene dropped this limitation after 2.x but this version won't
        //wildcard without at least 3 characters in term. The stem is what
        //gets the wildcard, so its length is the one that matters.
        if (strlen($stem) >= 3) {
            $stem .= '*';
        }

        $searchTerms[] = $stem;
    }

    //terms are always separated by a space, so a short term without a
    //wildcard can no longer run into the term that follows it
    $searchText = implode(' ', $searchTerms);

    //now do search
    $index = Lucene\Lucene::open($indLoc);
    $searchResults = $index->find($searchText);

    //assemble results
    foreach ($searchResults as $searchResult) {
        $coords[] = [$searchResult->x1, $searchResult->y1, $searchResult->x2, $searchResult->y2];
    }

    //pass back image and index location in addition to results
    return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
}
/**
 * Process last range query term (closed interval)
 *
 * Tokenizes both boundary strings with the default analyzer. Each boundary
 * must produce at most one token; a boundary that produces none is open.
 * The resulting inclusive range query is added to the current context.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
 */
public function closedRQLastTerm()
{
    // Lower boundary.
    $lowerTokens = Analyzer\Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding);
    $lowerCount = count($lowerTokens);
    if ($lowerCount > 1) {
        throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
    }
    $from = null;
    if ($lowerCount == 1) {
        $from = new Index\Term(reset($lowerTokens)->getTermText(), $this->_context->getField());
    }

    // Upper boundary.
    $upperTokens = Analyzer\Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding);
    $upperCount = count($upperTokens);
    if ($upperCount > 1) {
        throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
    }
    $to = null;
    if ($upperCount == 1) {
        $to = new Index\Term(reset($upperTokens)->getTermText(), $this->_context->getField());
    }

    if ($from === null && $to === null) {
        throw new QueryParserException('At least one range query boundary term must be non-empty term');
    }

    // Third argument true: boundaries are included in the range.
    $this->_context->addEntry(
        new QueryEntry\Subquery(new Query\Range($from, $to, true))
    );
}
/**
 * Searches the given Lucene index for the keyword and loads the matching
 * Communication entities.
 *
 * The keyword is lower-cased and converted to ASCII, then parsed with OR as
 * the default operator. The dbId of every hit is used to fetch the entities.
 *
 * @param mixed  $index         Lucene index to search (must provide find()).
 * @param string $searchKeyWord Raw search text.
 *
 * @return mixed Result of the repository's findById() for the matched ids.
 */
public function luceneSearchAetCommunications($index, $searchKeyWord)
{
    \ZendSearch\Lucene\Analysis\Analyzer\Analyzer::setDefault(
        new \ZendSearch\Lucene\Analysis\Analyzer\Common\Text\CaseInsensitive()
    );

    $lowered = mb_strtolower($searchKeyWord, "UTF-8");
    $searchValue = SearchHelper::utf8_to_ascii($lowered);

    $entityManager = $this->getDoctrine()->getManager();

    \ZendSearch\Lucene\Search\QueryParser::setDefaultOperator(\ZendSearch\Lucene\Search\QueryParser::B_OR);
    $parsedQuery = \ZendSearch\Lucene\Search\QueryParser::parse($searchValue, 'UTF-8');

    // Collect the database ids stored with each hit.
    $matchedIds = array();
    foreach ($index->find($parsedQuery) as $hit) {
        $matchedIds[] = $hit->dbId;
    }

    return $entityManager
        ->getRepository('AetCommunicationBundle:Communication')
        ->findById($matchedIds);
}
/**
 * Query specific matches highlighting
 *
 * Turns the wildcard pattern into an anchored regular expression ('?' matches
 * one character, '*' any run of characters), tokenizes the document's 'body'
 * field with the default analyzer and highlights every matching token.
 *
 * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    // Escape the pattern, then turn the escaped wildcards back into regex wildcards.
    $quotedPattern = preg_quote($this->_pattern->text, '/');
    $regexBody = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
    $matchExpression = '/^' . $regexBody . '$/';

    // Probe for PCRE unicode support with warnings captured by the error handler.
    ErrorHandler::start(E_WARNING);
    $unicodeProbe = preg_match('/\\pL/u', 'a');
    ErrorHandler::stop();

    if ($unicodeProbe == 1) {
        // PCRE unicode support is turned on: add the Unicode modifier.
        $matchExpression .= 'u';
    }

    $body = $highlighter->getDocument()->getFieldUtf8Value('body');

    $words = array();
    foreach (Analyzer::getDefault()->tokenize($body, 'UTF-8') as $token) {
        if (preg_match($matchExpression, $token->getTermText()) !== 1) {
            continue;
        }

        $words[] = $token->getTermText();
    }

    $highlighter->highlight($words);
}
/**
 * Highlights the matches of the last search query in an HTML fragment.
 *
 * Returns an empty string when there is no last query.
 *
 * @param string $inputHTMLFragment Source HTML fragment
 * @param string $inputEncoding     Encoding of the source HTML fragment
 * @param string $outputEncoding    Encoding of the resulting HTML fragment
 * @return string HTML fragment with the search matches highlighted
 */
public function highlightMatches($inputHTMLFragment, $inputEncoding = 'utf-8', $outputEncoding = 'utf-8')
{
    $highlightedHTMLFragment = '';

    if (!empty($this->lastQuery)) {
        $queryParser = QueryParser::parse($this->lastQuery);

        /**
         * Switch to the highlighter analyzer (stop-word filters removed) so that
         * words with pseudo-roots such as 'под' are highlighted as well.
         */
        Analyzer::setDefault($this->analyzerForHighlighter);

        try {
            $highlightedHTMLFragment = $queryParser->htmlFragmentHighlightMatches(
                $inputHTMLFragment,
                $inputEncoding,
                new Highlighter()
            );
        } finally {
            // Always put the regular analyzer back, even when highlighting throws;
            // otherwise later indexing/searching would run with the highlighter analyzer.
            Analyzer::setDefault($this->defaultAnalyzer);
        }

        $highlightedHTMLFragment = mb_convert_encoding($highlightedHTMLFragment, $outputEncoding, 'utf-8');
    }

    return $highlightedHTMLFragment;
}