/**
 * Return the document object for this hit.
 *
 * The document is loaded from the index on first access and then kept on the
 * hit, so later calls return the same instance without another lookup.
 *
 * @return Zend_Search_Lucene_Document
 */
public function getDocument()
{
    if ($this->_document instanceof Zend_Search_Lucene_Document) {
        return $this->_document;
    }

    $this->_document = $this->_index->getDocument($this->id);

    return $this->_document;
}
/**
 * Score the specified document for this query.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float 0 when the document is not part of the query's document vector
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_docVector[$docId])) {
        return 0;
    }

    $termFrequency = $reader->getSimilarity()->tf($this->_termFreqs[$docId]);
    $queryWeight   = $this->_weight->getValue();
    $fieldNorm     = $reader->norm($docId, $this->_term->field);

    return $termFrequency * $queryWeight * $fieldNorm * $this->getBoost();
}
/**
 * The sum of squared weights of contained query clauses.
 *
 * As a side effect this stores the computed idf and query weight on the
 * object ($_idf and $_queryWeight).
 *
 * @return float
 */
public function sumOfSquaredWeights()
{
    $similarity = $this->_reader->getSimilarity();

    // idf over all terms of the query
    $this->_idf = $similarity->idf($this->_query->getTerms(), $this->_reader);

    // query weight = idf scaled by the query boost
    $this->_queryWeight = $this->_idf * $this->_query->getBoost();

    $weight = $this->_queryWeight;

    return $weight * $weight;
}
/**
 * Compute the idf factor for a single term or for a list of terms.
 *
 * For an array the per-term idf values are summed; an empty array yields 0.0.
 *
 * @param mixed $input a single term, or an array of terms
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function idf($input, Zend_Search_Lucene_Interface $reader)
{
    if (!is_array($input)) {
        return $this->idfFreq($reader->docFreq($input), $reader->count());
    }

    $total = 0.0;
    foreach ($input as $singleTerm) {
        $total += $this->idfFreq($reader->docFreq($singleTerm), $reader->count());
    }

    return $total;
}
/**
 * Return a list of posts that are similar to the current post.
 *
 * This is not a very good implementation, so do not expect amazing results:
 * the term vector is not available for a document in ZSL, which limits how
 * far you can go. Similarity is approximated by searching for the tokens of
 * the post's tags and title.
 *
 * The Zend_Search_Lucene result-set limit is a process-wide static setting.
 * It is raised to $max_recommended + 1 only for the duration of the search
 * (one extra slot because the post itself may be among the hits) and is put
 * back to its previous value afterwards, also when the search throws.
 *
 * @param object  $post            post with ->id, ->title and iterable ->tags
 * @param integer $max_recommended maximum number of ids to return
 * @return array ids of similar posts, never containing the post's own id
 */
public function get_similar_posts($post, $max_recommended = 5)
{
    $tagstring = '';
    foreach ($post->tags as $tag) {
        $tagstring .= $tag . ' ';
    }

    $analyser = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $tokens   = $analyser->tokenize(strtolower($tagstring) . ' ' . strtolower($post->title));

    $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($tokens as $token) {
        $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()));
    }

    // Temporarily narrow the global limit; restore it on every exit path.
    $previousLimit = Zend_Search_Lucene::getResultSetLimit();
    Zend_Search_Lucene::setResultSetLimit($max_recommended + 1);
    try {
        $hits = $this->_index->find($query);
    } catch (Exception $e) {
        Zend_Search_Lucene::setResultSetLimit($previousLimit);
        throw $e;
    }
    Zend_Search_Lucene::setResultSetLimit($previousLimit);

    $ids = array();
    foreach ($hits as $hit) {
        // Checked before appending so that a limit of 0 really yields nothing.
        if (count($ids) == $max_recommended) {
            break;
        }

        if ($hit->postid != $post->id) {
            $ids[] = $hit->postid;
        }
    }

    return $ids;
}
/**
 * Add a page to the search index.
 *
 * A page model is converted to an array first and its 'content' entry is
 * built by concatenating the content of all containers found for the page.
 * Anything else is used as-is and is expected to already provide the keys
 * read below.
 *
 * @param Application_Model_Models_Page|array $page
 * @param mixed $toasterSearchIndex not used by this method
 * @return false|null false when the index could not be initialised, otherwise nothing
 */
public static function addPageToIndex($page, $toasterSearchIndex = false)
{
    if (!self::initIndex()) {
        return false;
    }

    if ($page instanceof Application_Model_Models_Page) {
        $page       = $page->toArray();
        $containers = Application_Model_Mappers_ContainerMapper::getInstance()->findByPageId($page['id']);

        $page['content'] = '';
        if (!empty($containers)) {
            foreach ($containers as $container) {
                $page['content'] .= $container->getContent();
            }
        }
    }

    // Index field name => key in the page array (searchable but not stored)
    $unStoredFields = array(
        'metaKeyWords'    => 'metaKeywords',
        'metaDescription' => 'metaDescription',
        'headerTitle'     => 'headerTitle',
        'content'         => 'content',
    );

    // Stored text fields; field name and page key are identical.
    // 'previewImage' is intentionally not indexed.
    $textFields = array('draft', 'teaserText', 'url', 'navName', 'h1');

    $document = new Zend_Search_Lucene_Document();
    $document->addField(Zend_Search_Lucene_Field::keyword('pageId', $page['id']));

    foreach ($unStoredFields as $fieldName => $pageKey) {
        $document->addField(Zend_Search_Lucene_Field::unStored($fieldName, $page[$pageKey], 'UTF-8'));
    }

    foreach ($textFields as $fieldName) {
        $document->addField(Zend_Search_Lucene_Field::text($fieldName, $page[$fieldName], 'UTF-8'));
    }

    self::$_index->addDocument($document);
}
/**
 * Execute the query.
 *
 * A falsy $resultSetLimit (null, 0, ...) selects the class default limit.
 * The limit is applied through the static Zend_Search_Lucene setting.
 *
 * @param string|Zym_Search_Lucene_IQuery $query
 * @param int $resultSetLimit
 * @return array
 */
public function search($query, $resultSetLimit = null)
{
    $limit = $resultSetLimit ? $resultSetLimit : self::$_defaultResultSetLimit;

    Zend_Search_Lucene::setResultSetLimit((int) $limit);

    return $this->_searchIndex->find((string) $query);
}
/**
 * Find, format, return results.
 *
 * The prepared query is tried first (sorted by 'type', descending). When it
 * yields no hits the raw query is parsed and searched instead (sorted by
 * 'type', ascending).
 *
 * @param \Zend_Search_Lucene_Interface $index
 * @param string $query
 * @return array list of arrays with the keys 'title', 'url', 'score' and 'type'
 */
public function query(\Zend_Search_Lucene_Interface $index, $query)
{
    $exactQuery = Parser::parse($this->prepareQuery($query));

    /* @var $hits \Zend_Search_Lucene_Search_QueryHit[] */
    $hits = $index->find($exactQuery, 'type', SORT_REGULAR, SORT_DESC);

    // if no hits are found with an exact phrase, fall back to token search
    if (count($hits) === 0) {
        $hits = $index->find(Parser::parse($query), 'type', SORT_REGULAR, SORT_ASC);
    }

    $results = array();
    foreach ($hits as $hit) {
        $document = $hit->getDocument();

        $results[] = array(
            'title' => $document->getFieldValue('title'),
            'url'   => $document->getFieldValue('url'),
            'score' => $hit->score,
            'type'  => $document->getFieldValue('type'),
        );
    }

    return $results;
}
/**
 * Instantiate the Lucene index
 *
 * The index is opened on first use and created if opening fails. The
 * resulting instance is kept on the object and returned by later calls.
 *
 * @return \Zend_Search_Lucene_Interface Lucene index instance
 * @throws Exception If the index cannot be created
 */
protected function _index()
{
    // One-time instantiation or creation of the lucene index
    if ($this->_index !== null) {
        return $this->_index;
    }

    // Static setup comes first: the default file permissions must be in place
    // before create() writes the initial index files, otherwise those files
    // are written with the library default instead of 0664.
    \Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0664);
    \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive());
    \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('UTF-8');

    try {
        // Try to open an existing lucene index
        $this->_index = \Zend_Search_Lucene::open($this->_indexDirectory);
    } catch (\Zend_Search_Lucene_Exception $e) {
        try {
            // No usable index: try to create a new one
            $this->_index = \Zend_Search_Lucene::create($this->_indexDirectory);
        } catch (\Zend_Search_Lucene_Exception $e) {
            throw new Exception(sprintf('Error creating lucene index in "%1$s", reason: "%2$s"', $this->_indexDirectory, $e->getMessage()));
        }
    }

    // Minimize memory consumption
    $this->_index->setMaxBufferedDocs(1);

    // Set optimization frequency; a missing or non-numeric setting falls back to 1
    $mergeFactor = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extParams']['tw_lucenesearch']['mergeFactor'])
        ? intval($GLOBALS['TYPO3_CONF_VARS']['EXT']['extParams']['tw_lucenesearch']['mergeFactor'])
        : 0;
    $this->_index->setMergeFactor(max(1, $mergeFactor));

    // If applicable: Optimize index
    if ($this->_indexOptimize) {
        $this->_index->optimize();
    }

    $this->_index->commit();

    // Frontend requests additionally get the configured terms-per-query limit
    if (TYPO3_MODE == 'FE') {
        \Zend_Search_Lucene::setTermsPerQueryLimit(\Tollwerk\TwLucenesearch\Utility\Indexer::indexConfig($GLOBALS['TSFE'], 'search.limits.query'));
    }

    return $this->_index;
}
/**
 * Run the query against the search index and hand back the hits.
 *
 * The query is cast to a string before it is passed to the index.
 *
 * @param string|Zym_Search_Lucene_IQuery $query
 * @return array
 */
protected function _executeSearch($query)
{
    $queryString = (string) $query;

    return $this->_searchIndex->find($queryString);
}
/**
 * Score calculator for non conjunction queries (not all subqueries are required)
 *
 * Signs: true = required, false = prohibited, anything else = optional.
 * The coord factors are computed once and cached in $_coord.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        // Prohibited subqueries can never contribute to a match
        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false) {
                $maxCoord++;
            }
        }

        for ($overlap = 0; $overlap <= $maxCoord; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $maxCoord);
        }
    }

    $total   = 0;
    $matched = 0;
    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $subscore = $subquery->score($docId, $reader);
        $sign     = $this->_signs[$subqueryId];
        $matches  = ($subscore != 0);

        // Prohibited subquery matched
        if ($sign === false && $matches) {
            return 0;
        }

        // Required subquery did not match
        if ($sign === true && !$matches) {
            return 0;
        }

        if ($matches) {
            $matched++;
            $total += $subscore;
        }
    }

    return $total * $this->_coord[$matched] * $this->getBoost();
}
/**
 * Remove reference from the index object
 *
 * Delegates to the wrapped index. When the reference count becomes zero,
 * the index is closed and resources are cleaned up.
 *
 * @internal
 */
public function removeReference()
{
    $result = $this->_index->removeReference();

    return $result;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Collects every index term inside the range (per searched field) into
 * $_matches and turns the collection into an Empty, Term or MultiTerm query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Exception when the terms per query limit is exceeded
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $this->_matches = array();

    // No field given: search through all indexed fields
    $fields = ($this->_field === null)
        ? $index->getFieldNames(true)
        : array($this->_field);

    $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

    foreach ($fields as $field) {
        $index->resetTermsStream();

        // Position the terms stream at the lower end of the range
        if ($this->_lowerTerm === null) {
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
        } else {
            $lowerTerm = new Zend_Search_Lucene_Index_Term($this->_lowerTerm->text, $field);
            $index->skipTo($lowerTerm);

            if (!$this->_inclusive && $index->currentTerm() == $lowerTerm) {
                // Exclusive range: step over the lower bound itself
                $index->nextTerm();
            }
        }

        if ($this->_upperTerm === null) {
            // Open upper end: walk up to the end of field data
            while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                $this->_matches[] = $index->currentTerm();

                if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                    throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                }

                $index->nextTerm();
            }
        } else {
            // Walk up to (but not including) the upper term
            $upperTerm = new Zend_Search_Lucene_Index_Term($this->_upperTerm->text, $field);

            while ($index->currentTerm() !== null
                && $index->currentTerm()->field == $field
                && strcmp($index->currentTerm()->text, $upperTerm->text) < 0
            ) {
                $this->_matches[] = $index->currentTerm();

                if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                    throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                }

                $index->nextTerm();
            }

            if ($this->_inclusive && $index->currentTerm() == $upperTerm) {
                // Inclusive range: the upper bound itself belongs to the result
                $this->_matches[] = $upperTerm;
            }
        }

        $index->closeTermsStream();
    }

    $matchCount = count($this->_matches);

    if ($matchCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if ($matchCount == 1) {
        return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
    }

    $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($this->_matches as $matchedTerm) {
        $rewrittenQuery->addTerm($matchedTerm);
    }

    return $rewrittenQuery;
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Builds $_resVector as the set of documents that contain every term of the
 * query, and records the term positions for each term.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter accepted but not used by this implementation
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // null means "no term processed yet"; a query without terms matches nothing
    $this->_resVector = (count($this->_terms) == 0) ? array() : null;

    $docVectors  = array();
    $vectorSizes = array();
    $vectorIds   = array(); // keeps array_multisort() from having to compare the vectors themselves
    foreach ($this->_terms as $termId => $term) {
        $termDocs = array_flip($reader->termDocs($term));

        $docVectors[]  = $termDocs;
        $vectorSizes[] = count($termDocs);
        $vectorIds[]   = $termId;

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // Smallest vectors first, so the running intersection shrinks as early as possible
    array_multisort($vectorSizes, SORT_ASC, SORT_NUMERIC,
                    $vectorIds,   SORT_ASC, SORT_NUMERIC,
                    $docVectors);

    foreach ($docVectors as $candidateDocs) {
        if ($this->_resVector === null) {
            $this->_resVector = $candidateDocs;
        } else {
            // Hand-written key intersection, used as a workaround for
            // array_intersect_key() slowness. It keeps the element order.
            $intersection = array();
            foreach ($this->_resVector as $id => $value) {
                if (isset($candidateDocs[$id])) {
                    $intersection[$id] = $value;
                }
            }
            $this->_resVector = $intersection;
        }

        if (count($this->_resVector) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // No ksort() needed: docs are returned ordered and the intersection keeps that order.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get the document count as reported by the underlying data object.
 *
 * @return int Document count
 */
public function getDocumentCount()
{
    $documentCount = $this->_data->numDocs();

    return $documentCount;
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Intersects the document sets of all terms into $_resVector (sorted by
 * document id) and records term positions for every term processed before
 * the intersection became empty.
 *
 * @param Zend_Search_Lucene_Interface $reader
 */
public function execute(Zend_Search_Lucene_Interface $reader)
{
    // null means "no term processed yet"; a query without terms matches nothing
    $this->_resVector = (count($this->_terms) == 0) ? array() : null;

    foreach ($this->_terms as $termId => $term) {
        $termDocs = array_flip($reader->termDocs($term));

        $this->_resVector = ($this->_resVector === null)
            ? $termDocs
            : array_intersect_key($this->_resVector, $termDocs);

        if (count($this->_resVector) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    ksort($this->_resVector, SORT_NUMERIC);

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Resolution order:
 *  1. no field given: rewrite once per search field and combine the results;
 *  2. the raw word exists as a term in the field: fuzzy query on that term;
 *  3. the word contains '*' or '?': rejected;
 *  4. otherwise the word is tokenized: no token gives an insignificant query,
 *     one token gives a fuzzy query, several tokens are rejected.
 *
 * $_matches is updated with the terms of the resulting query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if ($this->_field === null) {
        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $combined = new Zend_Search_Lucene_Search_Query_Boolean();

        $sawInsignificant = false;

        require_once 'Zend/Search/Lucene.php';
        $defaultField = Zend_Search_Lucene::getDefaultSearchField();
        $searchFields = ($defaultField === null)
            ? $index->getFieldNames(true)
            : array($defaultField);

        require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php';
        foreach ($searchFields as $fieldName) {
            $perField  = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy($this->_word, $this->_encoding, $fieldName, $this->_minimumSimilarity);
            $rewritten = $perField->rewrite($index);

            if ($rewritten instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                $sawInsignificant = true;
            } elseif (!($rewritten instanceof Zend_Search_Lucene_Search_Query_Empty)) {
                $combined->addSubquery($rewritten);
            }
        }

        $subqueries = $combined->getSubqueries();

        if (count($subqueries) == 0) {
            $this->_matches = array();

            if ($sawInsignificant) {
                require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
                return new Zend_Search_Lucene_Search_Query_Insignificant();
            }

            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // A single subquery is returned directly instead of being wrapped
        $result = (count($subqueries) == 1) ? reset($subqueries) : $combined;
        $result->setBoost($this->getBoost());

        $this->_matches = $result->getQueryTerms();
        return $result;
    }

    // -------------------------------------
    // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
    // encoding is not used since we expect binary matching
    require_once 'Zend/Search/Lucene/Index/Term.php';
    $exactTerm = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
    if ($index->hasTerm($exactTerm)) {
        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
        $fuzzyQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($exactTerm, $this->_minimumSimilarity);
        $fuzzyQuery->setBoost($this->getBoost());

        // Rewriting must happen before reading the terms: it fills the matches container
        $rewritten      = $fuzzyQuery->rewrite($index);
        $this->_matches = $fuzzyQuery->getQueryTerms();

        return $rewritten;
    }

    // -------------------------------------
    // Recognize wildcard queries
    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    $pcreUnicode = (@preg_match('/\\pL/u', 'a') == 1);
    $subPatterns = $pcreUnicode
        ? preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word))
        : preg_split('/[*?]/', $this->_word);

    if (count($subPatterns) > 1) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
    }

    // -------------------------------------
    // Recognize one-term, multi-term and "insignificant" queries
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
    $tokenCount = count($tokens);

    if ($tokenCount == 0) {
        $this->_matches = array();
        require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount > 1) {
        // Word is tokenized into several tokens
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    // Exactly one token
    require_once 'Zend/Search/Lucene/Index/Term.php';
    $tokenTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

    require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
    $fuzzyQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($tokenTerm, $this->_minimumSimilarity);
    $fuzzyQuery->setBoost($this->getBoost());

    // Rewriting must happen before reading the terms: it fills the matches container
    $rewritten      = $fuzzyQuery->rewrite($index);
    $this->_matches = $fuzzyQuery->getQueryTerms();

    return $rewritten;
}
/**
 * Score calculator for non conjunction queries (not all terms are required)
 *
 * Only terms that are not prohibited (sign !== false) and that have a
 * recorded frequency for the document contribute to the score. The coord
 * factors are computed once and cached in $_coord.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false) {
                $maxCoord++;
            }
        }

        for ($overlap = 0; $overlap <= $maxCoord; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $maxCoord);
        }
    }

    $total   = 0.0;
    $matched = 0;
    foreach ($this->_terms as $termId => $term) {
        // Skip prohibited terms and terms that do not occur in the document
        if ($this->_signs[$termId] === false || !isset($this->_termsFreqs[$termId][$docId])) {
            continue;
        }

        $matched++;

        // No need to check for a zero term frequency here:
        // score calculation is performed only for matched docs
        $total += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId])
                * $this->_weights[$termId]->getValue()
                * $reader->norm($docId, $term->field);
    }

    return $total * $this->_coord[$matched] * $this->getBoost();
}
/**
 * Find all existing lucene documents whose 'node_path' starts with the
 * path of the given node.
 *
 * The node path is converted to UTF-8 and every "/" is replaced by the
 * "AJXPFAKESEP" placeholder before it is used as a wildcard prefix.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @param AJXP_Node $ajxpNode
 * @return mixed whatever $index->find() returns for the wildcard query
 */
public function getIndexedChildrenDocuments($index, $ajxpNode)
{
    $utf8Path    = SystemTextEncoding::toUTF8($ajxpNode->getPath());
    $escapedPath = str_replace("/", "AJXPFAKESEP", $utf8Path);

    $prefixTerm = new Zend_Search_Lucene_Index_Term($escapedPath . '*', 'node_path');

    return $index->find(new Zend_Search_Lucene_Search_Query_Wildcard($prefixTerm));
}
/**
 * Finds terms similar to the given query string.
 *
 * Runs a fuzzy query against the index and returns the terms the query
 * ended up with. Note that this changes the static default prefix length of
 * Zend_Search_Lucene_Search_Query_Fuzzy.
 *
 * @param string $queryStr
 * @param \Zend_Search_Lucene_Interface $index
 * @param integer $prefixLength optionally specify prefix length, default 0
 * @param float $similarity optionally specify similarity, default 0.5
 * @return string[]|null $similarSearchTerms as returned by getQueryTerms()
 *                       (NOTE(review): presumably term objects rather than
 *                       plain strings - confirm); null when no index is given
 */
public static function fuzzyFindTerms($queryStr, $index, $prefixLength = 0, $similarity = 0.5)
{
    if ($index == null) {
        return;
    }

    \Zend_Search_Lucene_Search_Query_Fuzzy::setDefaultPrefixLength($prefixLength);

    $fuzzyQuery = new \Zend_Search_Lucene_Search_Query_Fuzzy(
        new \Zend_Search_Lucene_Index_Term($queryStr),
        $similarity
    );

    // The hits are not needed; the search is executed before the query terms are read
    $index->find($fuzzyQuery);

    return $fuzzyQuery->getQueryTerms();
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Resolution order:
 *  1. no field given: rewrite once per search field and merge all terms;
 *  2. the raw word exists as a term in the field: plain term query;
 *  3. the word contains '*' or '?': wildcard query built from the analyzed
 *     sub-patterns;
 *  4. otherwise the word is tokenized: no token gives an insignificant query,
 *     one token gives a term query, several tokens give a multi-term query
 *     in which every token is required.
 *
 * $_matches is updated with the terms of the resulting query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if ($this->_field === null) {
        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $combined = new Zend_Search_Lucene_Search_Query_MultiTerm();
        $combined->setBoost($this->getBoost());

        $sawInsignificant = false;

        require_once 'Zend/Search/Lucene.php';
        $defaultField = Zend_Search_Lucene::getDefaultSearchField();
        $searchFields = ($defaultField === null)
            ? $index->getFieldNames(true)
            : array($defaultField);

        require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Term.php';
        foreach ($searchFields as $fieldName) {
            $perField  = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word, $this->_encoding, $fieldName);
            $rewritten = $perField->rewrite($index);

            foreach ($rewritten->getQueryTerms() as $queryTerm) {
                $combined->addTerm($queryTerm);
            }

            if ($rewritten instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                $sawInsignificant = true;
            }
        }

        if (count($combined->getTerms()) != 0) {
            $this->_matches = $combined->getQueryTerms();
            return $combined;
        }

        $this->_matches = array();

        if ($sawInsignificant) {
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // -------------------------------------
    // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
    // encoding is not used since we expect binary matching
    require_once 'Zend/Search/Lucene/Index/Term.php';
    $exactTerm = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
    if ($index->hasTerm($exactTerm)) {
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $termQuery = new Zend_Search_Lucene_Search_Query_Term($exactTerm);
        $termQuery->setBoost($this->getBoost());

        $this->_matches = $termQuery->getQueryTerms();
        return $termQuery;
    }

    // -------------------------------------
    // Recognize wildcard queries
    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    if (@preg_match('/\\pL/u', 'a') == 1) {
        $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
        $wildcardsPattern    = '/[*?]/u';
        $subPatternsEncoding = 'UTF-8';
    } else {
        $word                = $this->_word;
        $wildcardsPattern    = '/[*?]/';
        $subPatternsEncoding = $this->_encoding;
    }

    $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

    if (count($subPatterns) > 1) {
        // Wildcard query is recognized
        $pattern = '';

        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        foreach ($subPatterns as $id => $subPattern) {
            list($subPatternText, $subPatternOffset) = $subPattern;

            // Every sub-pattern except the first is preceded by the wildcard
            // character that sits right before it in the word
            if ($id != 0) {
                $pattern .= $word[$subPatternOffset - 1];
            }

            // Each sub-pattern must be a single word in terms of the current analyzer
            $subTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPatternText, $subPatternsEncoding);
            if (count($subTokens) > 1) {
                require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
            }

            foreach ($subTokens as $subToken) {
                $pattern .= $subToken->getTermText();
            }
        }

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $patternTerm = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);

        require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
        $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($patternTerm);
        $wildcardQuery->setBoost($this->getBoost());

        // Rewriting must happen before reading the terms: it fills the matches container
        $rewritten      = $wildcardQuery->rewrite($index);
        $this->_matches = $wildcardQuery->getQueryTerms();

        return $rewritten;
    }

    // -------------------------------------
    // Recognize one-term, multi-term and "insignificant" queries
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
    $tokenCount = count($tokens);

    if ($tokenCount == 0) {
        $this->_matches = array();
        require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if ($tokenCount == 1) {
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $tokenTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $termQuery = new Zend_Search_Lucene_Search_Query_Term($tokenTerm);
        $termQuery->setBoost($this->getBoost());

        $this->_matches = $termQuery->getQueryTerms();
        return $termQuery;
    }

    // It's not an insignificant or one-term query
    require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
    $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

    /**
     * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
     * analyzer design features
     */
    require_once 'Zend/Search/Lucene/Index/Term.php';
    foreach ($tokens as $token) {
        $tokenTerm = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
        $multiTermQuery->addTerm($tokenTerm, true); // all subterms are required
    }
    $multiTermQuery->setBoost($this->getBoost());

    $this->_matches = $multiTermQuery->getQueryTerms();
    return $multiTermQuery;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Collects every index term inside the range (per searched field) into
 * $_matches and turns the collection into an Empty, Term or MultiTerm query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $this->_matches = array();

    if ($this->_field === null) {
        // Search through all fields
        $fields = $index->getFieldNames(true);
    } else {
        $fields = array($this->_field);
    }

    foreach ($fields as $field) {
        $index->resetTermsStream();

        if ($this->_lowerTerm !== null) {
            $lowerTerm = new Zend_Search_Lucene_Index_Term($this->_lowerTerm->text, $field);

            $index->skipTo($lowerTerm);

            if (!$this->_inclusive && $index->currentTerm() == $lowerTerm) {
                // Skip lower term
                $index->nextTerm();
            }
        } else {
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
        }

        if ($this->_upperTerm !== null) {
            // Walk up to the upper term
            $upperTerm = new Zend_Search_Lucene_Index_Term($this->_upperTerm->text, $field);

            // strcmp() is required here: the "<" operator compares two numeric
            // strings as numbers ('10' < '9' is false), which would cut the walk
            // short or overrun it for numeric-looking terms. The bound must be
            // tested as a plain string comparison.
            while ($index->currentTerm() !== null
                && $index->currentTerm()->field == $field
                && strcmp($index->currentTerm()->text, $upperTerm->text) < 0
            ) {
                $this->_matches[] = $index->currentTerm();
                $index->nextTerm();
            }

            if ($this->_inclusive && $index->currentTerm() == $upperTerm) {
                // Include upper term into result
                $this->_matches[] = $upperTerm;
            }
        } else {
            // Walk up to the end of field data
            while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                $this->_matches[] = $index->currentTerm();
                $index->nextTerm();
            }
        }

        $index->closeTermsStream();
    }

    $matchCount = count($this->_matches);

    if ($matchCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if ($matchCount == 1) {
        return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
    }

    $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($this->_matches as $matchedTerm) {
        $rewrittenQuery->addTerm($matchedTerm);
    }

    return $rewrittenQuery;
}
/**
 * Open the lucene search index and return it.
 *
 * The index instance is cached statically. With $forceCreation a cached
 * index is committed and dropped first, and a new index is created at the
 * index path instead of opening the existing one.
 *
 * @param bool $forceCreation
 * @return Zend_Search_Lucene_Interface
 */
public static function getSearchIndex($forceCreation = false)
{
    if (self::$index && !$forceCreation) {
        return self::$index;
    }

    // Only reachable with $forceCreation: flush and release the old instance
    if (self::$index) {
        self::$index->commit();
        self::$index = null;
    }

    Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive());
    Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding(self::$config->curry->internalEncoding);

    $path = Curry_Util::path(self::$config->curry->projectPath, 'data', 'searchindex');

    if ($forceCreation) {
        self::$index = Zend_Search_Lucene::create($path);
    } else {
        self::$index = Zend_Search_Lucene::open($path);
    }

    return self::$index;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Resolution order:
 *  1. no field given: rewrite once per search field and combine the results
 *     in a boolean query;
 *  2. the raw phrase exists as a term in the field: plain term query;
 *  3. otherwise the phrase is tokenized: no token gives an insignificant
 *     query, one token gives a term query, several tokens give a phrase query.
 *
 * Wildcard characters are allowed within phrases: they are either removed by
 * the text analyzer or used as part of the keyword for keyword fields.
 *
 * $_matches is updated with the terms of the resulting query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    // Split query into subqueries if field name is not specified
    if ($this->_field === null) {
        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $query = new Zend_Search_Lucene_Search_Query_Boolean();
        $query->setBoost($this->getBoost());

        require_once 'Zend/Search/Lucene.php';
        if (Zend_Search_Lucene::getDefaultSearchField() === null) {
            $searchFields = $index->getFieldNames(true);
        } else {
            $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
        }

        foreach ($searchFields as $fieldName) {
            $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Phrase($this->_phrase, $this->_phraseEncoding, $fieldName);
            $subquery->setSlop($this->getSlop());

            $query->addSubquery($subquery->rewrite($index));
        }

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
    // encoding is not used since we expect binary matching
    require_once 'Zend/Search/Lucene/Index/Term.php';
    $term = new Zend_Search_Lucene_Index_Term($this->_phrase, $this->_field);
    if ($index->hasTerm($term)) {
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $query = new Zend_Search_Lucene_Search_Query_Term($term);
        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    // Tokenize phrase using current analyzer and process it as a phrase query
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

    if (count($tokens) == 0) {
        $this->_matches = array();
        require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if (count($tokens) == 1) {
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $query = new Zend_Search_Lucene_Search_Query_Term($term);
        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    // It's a non-trivial phrase query (at least two tokens)
    require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
    $query = new Zend_Search_Lucene_Search_Query_Phrase();

    // The slop does not depend on the token, so it is applied once here
    // rather than once per token inside the loop.
    $query->setSlop($this->getSlop());

    // Positions start at -1 so that the first increment yields the first token's position
    $position = -1;

    require_once 'Zend/Search/Lucene/Index/Term.php';
    foreach ($tokens as $token) {
        $position += $token->getPositionIncrement();

        $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
        $query->addTerm($term, $position);
    }

    $this->_matches = $query->getQueryTerms();
    return $query;
}
/**
 * Undeletes all documents currently marked as deleted in this index.
 *
 * Delegates to the wrapped index and passes its result through.
 */
public function undeleteAll()
{
    $result = $this->_index->undeleteAll();

    return $result;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Walks the index terms of every searched field, keeps those whose
 * similarity to the query term exceeds the minimum similarity, and builds
 * an Empty, Term or Boolean query from them. For the Boolean query the
 * matches are ordered by score (descending) and each subquery is boosted by
 * its score; at most MAX_CLAUSE_COUNT subqueries are added.
 *
 * Fills $_matches, $_termKeys and $_scores.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Exception when the terms per query limit is exceeded
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $this->_matches  = array();
    $this->_scores   = array();
    $this->_termKeys = array();

    if ($this->_term->field === null) {
        // Search through all fields
        $fields = $index->getFieldNames(true);
    } else {
        $fields = array($this->_term->field);
    }

    $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
    $prefixByteLength = strlen($prefix);
    $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);

    $termRest = substr($this->_term->text, $prefixByteLength);
    // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
    $termRestLength = strlen($termRest);

    // Maps similarities from (minimumSimilarity, 1] onto (0, 1]
    $scaleFactor = 1 / (1 - $this->_minimumSimilarity);

    $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

    foreach ($fields as $field) {
        $index->resetTermsStream();

        if ($prefix != '') {
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            while ($index->currentTerm() !== null
                && $index->currentTerm()->field == $field
                && substr($index->currentTerm()->text, 0, $prefixByteLength) == $prefix
            ) {
                // Calculate similarity
                $target       = substr($index->currentTerm()->text, $prefixByteLength);
                $targetLength = strlen($target);

                $maxDistance = isset($this->_maxDistances[$targetLength])
                    ? $this->_maxDistances[$targetLength]
                    : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

                if ($termRestLength == 0) {
                    // we don't have anything to compare. That means if we just add
                    // the letters for current term we get the new word
                    $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $targetLength / $prefixUtf8Length;
                } elseif ($targetLength == 0) {
                    $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $termRestLength / $prefixUtf8Length;
                } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                    // Just adding the characters of term to target or vice-versa results in too
                    // many edits. For example "pre" has length 3 and "prefixes" has length 8, so
                    // the edit distance cannot be less than 5 (abs(3 - 8)). If the maximum edit
                    // distance is 4, the word can be discarded without looking at it.
                    $similarity = 0;
                } else {
                    $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, $targetLength));
                }

                if ($similarity > $this->_minimumSimilarity) {
                    $this->_matches[]  = $index->currentTerm();
                    $this->_termKeys[] = $index->currentTerm()->key();
                    $this->_scores[]   = ($similarity - $this->_minimumSimilarity) * $scaleFactor;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }
        } else {
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

            while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                // Calculate similarity
                $target       = $index->currentTerm()->text;
                $targetLength = strlen($target);

                $maxDistance = isset($this->_maxDistances[$targetLength])
                    ? $this->_maxDistances[$targetLength]
                    : $this->_calculateMaxDistance(0, $termRestLength, $targetLength);

                if ($termRestLength == 0 || $targetLength == 0) {
                    // Nothing to compare and no shared prefix. This is the value the
                    // prefix branch yields for a zero-length prefix, and it keeps the
                    // division below from dividing by min(...) == 0.
                    $similarity = 0;
                } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                    // The length difference alone already exceeds the maximum edit
                    // distance (see the prefix branch), so the term can be discarded
                    // without computing the distance.
                    $similarity = 0;
                } else {
                    $similarity = 1 - levenshtein($termRest, $target) / min($termRestLength, $targetLength);
                }

                if ($similarity > $this->_minimumSimilarity) {
                    $this->_matches[]  = $index->currentTerm();
                    $this->_termKeys[] = $index->currentTerm()->key();
                    $this->_scores[]   = ($similarity - $this->_minimumSimilarity) * $scaleFactor;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }
        }

        $index->closeTermsStream();
    }

    if (count($this->_matches) == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if (count($this->_matches) == 1) {
        return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
    }

    $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();

    // Best scores first; ties are ordered by term key. $_matches is reordered along.
    array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                    $this->_termKeys, SORT_ASC,  SORT_STRING,
                    $this->_matches);

    $termCount = 0;
    foreach ($this->_matches as $id => $matchedTerm) {
        $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
        $subquery->setBoost($this->_scores[$id]);

        $rewrittenQuery->addSubquery($subquery);

        $termCount++;
        if ($termCount >= self::MAX_CLAUSE_COUNT) {
            break;
        }
    }

    return $rewrittenQuery;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * The wildcard pattern is translated into a regular expression ('?' becomes
 * '.', '*' becomes '.*', everything else is quoted) and tested against the
 * terms of every searched field. When the pattern starts with a literal
 * prefix, only the terms sharing that prefix are visited.
 *
 * Returns an empty query when nothing matches, a term query for a single
 * match and a multi-term query otherwise.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $this->_matches = array();

    // A pattern without a field is searched through all fields
    $fields = ($this->_pattern->field === null)
            ? $index->getFieldNames(true)
            : array($this->_pattern->field);

    $prefix       = self::_getPrefix($this->_pattern->text);
    $prefixLength = strlen($prefix);

    // Quote the whole pattern first, then turn the quoted wildcards into regex wildcards
    $quotedPattern   = preg_quote($this->_pattern->text, '/');
    $regexBody       = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
    $matchExpression = '/^' . $regexBody . '$/';

    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    if (@preg_match('/\\pL/u', 'a') == 1) {
        // PCRE unicode support is turned on, so the Unicode modifier can be added
        $matchExpression .= 'u';
    }

    foreach ($fields as $field) {
        $index->resetTermsStream();

        if ($prefix == '') {
            // No literal prefix: every term of the field has to be tested
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

            for (;
                 $index->currentTerm() !== null
                 && $index->currentTerm()->field == $field;
                 $index->nextTerm()) {
                if (preg_match($matchExpression, $index->currentTerm()->text) === 1) {
                    $this->_matches[] = $index->currentTerm();
                }
            }
        } else {
            // Start at the prefix and stop as soon as a term no longer begins with it
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            for (;
                 $index->currentTerm() !== null
                 && $index->currentTerm()->field == $field
                 && substr($index->currentTerm()->text, 0, $prefixLength) == $prefix;
                 $index->nextTerm()) {
                if (preg_match($matchExpression, $index->currentTerm()->text) === 1) {
                    $this->_matches[] = $index->currentTerm();
                }
            }
        }

        $index->closeTermsStream();
    }

    switch (count($this->_matches)) {
        case 0:
            return new Zend_Search_Lucene_Search_Query_Empty();

        case 1:
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));

        default:
            $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
            foreach ($this->_matches as $matchedTerm) {
                $multiTermQuery->addTerm($matchedTerm);
            }
            return $multiTermQuery;
    }
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * The wildcard pattern is translated into a regular expression ('?' becomes
 * '.', '*' becomes '.*', everything else is quoted) and tested against the
 * terms of every searched field. When the pattern starts with a literal
 * prefix, only the terms sharing that prefix are visited.
 *
 * Returns an empty query when nothing matches, a term query for a single
 * match and a multi-term query otherwise.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Exception  if the literal prefix is shorter than
 *         self::$_minPrefixLength, or if a non-zero terms per query limit is exceeded
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $this->_matches = array();

    // A pattern without a field is searched through all fields
    $fields = ($this->_pattern->field === null)
            ? $index->getFieldNames(true)
            : array($this->_pattern->field);

    $prefix       = self::_getPrefix($this->_pattern->text);
    $prefixLength = strlen($prefix);

    // Quote the whole pattern first, then turn the quoted wildcards into regex wildcards
    $quotedPattern   = preg_quote($this->_pattern->text, '/');
    $regexBody       = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
    $matchExpression = '/^' . $regexBody . '$/';

    if ($prefixLength < self::$_minPrefixLength) {
        // require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
    }

    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    if (@preg_match('/\\pL/u', 'a') == 1) {
        // PCRE unicode support is turned on, so the Unicode modifier can be added
        $matchExpression .= 'u';
    }

    $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

    foreach ($fields as $field) {
        $index->resetTermsStream();

        // require_once 'Zend/Search/Lucene/Index/Term.php';
        if ($prefix == '') {
            // No literal prefix: every term of the field has to be tested
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

            for (;
                 $index->currentTerm() !== null
                 && $index->currentTerm()->field == $field;
                 $index->nextTerm()) {
                if (preg_match($matchExpression, $index->currentTerm()->text) !== 1) {
                    continue;
                }

                $this->_matches[] = $index->currentTerm();

                // A limit of 0 means "unlimited"
                if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                    // require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                }
            }
        } else {
            // Start at the prefix and stop as soon as a term no longer begins with it
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            for (;
                 $index->currentTerm() !== null
                 && $index->currentTerm()->field == $field
                 && substr($index->currentTerm()->text, 0, $prefixLength) == $prefix;
                 $index->nextTerm()) {
                if (preg_match($matchExpression, $index->currentTerm()->text) !== 1) {
                    continue;
                }

                $this->_matches[] = $index->currentTerm();

                // A limit of 0 means "unlimited"
                if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                    // require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                }
            }
        }

        $index->closeTermsStream();
    }

    switch (count($this->_matches)) {
        case 0:
            // require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();

        case 1:
            // require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));

        default:
            // require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
            $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
            foreach ($this->_matches as $matchedTerm) {
                $multiTermQuery->addTerm($matchedTerm);
            }
            return $multiTermQuery;
    }
}
/**
 * Score the specified document as a conjunction of all query terms
 *
 * For every term the product of its term frequency factor, its weight value
 * and the field norm of the document is added up. The sum is multiplied by
 * the coordination factor and the query boost.
 *
 * The coordination factor is computed on first use and kept in $this->_coord;
 * it is requested with the number of query terms as both arguments.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        $termsTotal   = count($this->_terms);
        $this->_coord = $reader->getSimilarity()->coord($termsTotal, $termsTotal);
    }

    $termsScore = 0.0;

    foreach ($this->_terms as $termId => $term) {
        $frequencyFactor = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
        $weightValue     = $this->_weights[$termId]->getValue();
        $fieldNorm       = $reader->norm($docId, $term->field);

        $termsScore += $frequencyFactor * $weightValue * $fieldNorm;
    }

    return $termsScore * $this->_coord * $this->getBoost();
}