/**
 * Add a match query for a single field and register it on this builder.
 *
 * The match type is applied to the field as given; the supported values are
 * "phrase" and "phrase_prefix":
 *
 *  - "phrase" analyzes the text and builds a phrase query out of the
 *    analyzed text.
 *  - "phrase_prefix" behaves like "phrase" but additionally allows prefix
 *    matches on the last term of the text.
 *
 * When $fuzzy is truthy the field fuzziness is set to "AUTO".
 *
 * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
 *
 * @param string $field The field to search in the index
 * @param string $query The values to search for
 * @param string $type  The match type
 * @param bool   $fuzzy Set whether the match should be fuzzy
 * @return Query The value returned by newQuery() for the match; it is also
 *               appended to $this->query
 */
public function match($field, $query, $type = 'phrase', $fuzzy = false)
{
    $matchQuery = new Match();
    $matchQuery->setFieldQuery($field, $query);
    $matchQuery->setFieldType($field, $type);

    if ($fuzzy) {
        $matchQuery->setFieldFuzziness($field, 'AUTO');
    }

    // Keep the wrapped query in its own variable instead of reusing the
    // $query parameter, then record it before handing it back to the caller.
    $wrapped = $this->newQuery($matchQuery);
    $this->query[] = $wrapped;

    return $wrapped;
}
/**
 * A phrase_prefix match on "New" against the name field must return exactly
 * the two documents whose name starts with that word.
 */
public function testMatchPhrasePrefix()
{
    $index = $this->_getClient()->getIndex('test');
    $index->create(array(), true);
    $docType = $index->getType('test');

    // Fixture documents, added in id order: two names start with "New",
    // two do not.
    $names = array(
        1 => 'Basel-Stadt',
        2 => 'New York',
        3 => 'New Hampshire',
        4 => 'Basel Land',
    );
    foreach ($names as $id => $name) {
        $docType->addDocument(new Document($id, array('name' => $name)));
    }

    $index->refresh();

    $field = 'name';
    $matchQuery = new Match();
    $matchQuery->setFieldQuery($field, 'New');
    $matchQuery->setFieldType($field, 'phrase_prefix');

    $resultSet = $index->search($matchQuery);

    $this->assertEquals(2, $resultSet->count());
}
/**
 * Search articles with provided term.
 *
 * The term is processed in stages:
 *  1. special syntax (prefix:, prefer-recent:, local:, insource:/regex/ and
 *     the generic key:value filters) is extracted from the term and turned
 *     into filters / settings on this searcher;
 *  2. quoted phrases and wildcard terms are rewritten so they target the
 *     appropriate fields;
 *  3. the main query, the highlight query, the optional phrase rescore and
 *     the optional suggest section are built and the search is executed.
 *     If Elasticsearch reports a parse error the search is retried once as
 *     a simple_query_string query.
 *
 * @param string $term term to search
 * @param boolean $showSuggestion should this search suggest alternative searches that might be better?
 * @return Status status containing results defined by resultsType on success
 */
public function searchText($term, $showSuggestion)
{
    $checkLengthStatus = self::checkTextSearchRequestLength($term);
    if (!$checkLengthStatus->isOk()) {
        return $checkLengthStatus;
    }

    // Transform Mediawiki specific syntax to filters and extra (pre-escaped) query string.
    // $searcher aliases $this so every closure below reaches the searcher the same way.
    $searcher = $this;
    $originalTerm = $term;
    $searchContainedSyntax = false;
    $this->term = $term;
    $this->boostLinks = $this->config->get('CirrusSearchBoostLinks');
    $searchType = 'full_text';

    // Handle title prefix notation
    $prefixPos = strpos($this->term, 'prefix:');
    if ($prefixPos !== false) {
        // 7 === strlen('prefix:')
        $value = substr($this->term, 7 + $prefixPos);
        // Trim quotes in case the user wanted to quote the prefix
        $value = trim($value, '"');
        if (strlen($value) > 0) {
            $searchContainedSyntax = true;
            // Everything from the prefix: keyword onwards (and the character before it) is
            // dropped from the term.
            $this->term = substr($this->term, 0, max(0, $prefixPos - 1));
            $this->suggestSuffixes[] = ' prefix:' . $value;

            // Suck namespaces out of $value
            $cirrusSearchEngine = new CirrusSearch();
            $cirrusSearchEngine->setConnection($this->connection);
            $value = trim($cirrusSearchEngine->replacePrefixes($value));
            $this->namespaces = $cirrusSearchEngine->namespaces;

            // If the namespace prefix wasn't the entire prefix filter then add a filter for the title
            if (strpos($value, ':') !== strlen($value) - 1) {
                $value = str_replace('_', ' ', $value);
                $prefixQuery = new \Elastica\Query\Match();
                $prefixQuery->setFieldQuery('title.prefix', $value);
                $this->filters[] = new \Elastica\Filter\Query($prefixQuery);
            }
        }
    }

    $preferRecentDecayPortion = $this->config->get('CirrusSearchPreferRecentDefaultDecayPortion');
    $preferRecentHalfLife = $this->config->get('CirrusSearchPreferRecentDefaultHalfLife');
    $unspecifiedDecayPortion = $this->config->get('CirrusSearchPreferRecentUnspecifiedDecayPortion');

    // Matches "prefer-recent:" and then an optional floating point number <= 1 but >= 0 (decay
    // portion) and then an optional comma followed by another floating point number >= 0 (half life)
    $this->extractSpecialSyntaxFromTerm(
        '/prefer-recent:(1|0?(?:\\.\\d+)?)?(?:,(\\d*\\.?\\d+))? ?/',
        function ($matches) use (
            $unspecifiedDecayPortion,
            &$preferRecentDecayPortion,
            &$preferRecentHalfLife,
            &$searchContainedSyntax
        ) {
            if (isset($matches[1]) && strlen($matches[1])) {
                $preferRecentDecayPortion = floatval($matches[1]);
            } else {
                $preferRecentDecayPortion = $unspecifiedDecayPortion;
            }
            if (isset($matches[2])) {
                $preferRecentHalfLife = floatval($matches[2]);
            }
            $searchContainedSyntax = true;
            return '';
        }
    );
    $this->preferRecentDecayPortion = $preferRecentDecayPortion;
    $this->preferRecentHalfLife = $preferRecentHalfLife;

    $this->extractSpecialSyntaxFromTerm(
        '/^\\s*local:/',
        function ($matches) use ($searcher) {
            $searcher->limitSearchToLocalWiki(true);
            return '';
        }
    );

    // Handle other filters
    $filters = $this->filters;
    $notFilters = $this->notFilters;
    $boostTemplates = self::getDefaultBoostTemplates();
    $highlightSource = array();

    // insource:/regex/ and -insource:/regex/, with an optional trailing "i" for case insensitivity
    $this->extractSpecialSyntaxFromTerm(
        '/(?<not>-)?insource:\\/(?<pattern>(?:[^\\\\\\/]|\\\\.)+)\\/(?<insensitive>i)? ?/',
        function ($matches) use (
            $searcher,
            &$filters,
            &$notFilters,
            &$searchContainedSyntax,
            &$searchType,
            &$highlightSource
        ) {
            if (!$searcher->config->get('CirrusSearchEnableRegex')) {
                return;
            }

            $searchContainedSyntax = true;
            $searchType = 'regex';
            $insensitive = !empty($matches['insensitive']);

            $filterDestination =& $filters;
            if (!empty($matches['not'])) {
                $filterDestination =& $notFilters;
            } else {
                // Only non-negated patterns are recorded for source highlighting.
                $highlightSource[] = array(
                    'pattern' => $matches['pattern'],
                    'locale' => $searcher->config->get('LanguageCode'),
                    'insensitive' => $insensitive,
                );
            }

            $regex = $searcher->config->getElement('CirrusSearchWikimediaExtraPlugin', 'regex');
            // Strict comparison: with a loose in_array() an integer 0 or boolean true stored
            // under another key of the config array would compare equal to 'use' on PHP < 8.
            if ($regex && in_array('use', $regex, true)) {
                $filter = new SourceRegex($matches['pattern'], 'source_text', 'source_text.trigram');
                if (isset($regex['max_inspect'])) {
                    $filter->setMaxInspect($regex['max_inspect']);
                } else {
                    $filter->setMaxInspect(10000);
                }
                $filter->setMaxDeterminizedStates(
                    $searcher->config->get('CirrusSearchRegexMaxDeterminizedStates')
                );
                if (isset($regex['max_ngrams_extracted'])) {
                    $filter->setMaxNgramExtracted($regex['max_ngrams_extracted']);
                }
                $filter->setCaseSensitive(!$insensitive);
                $filter->setLocale($searcher->config->get('LanguageCode'));
                $filterDestination[] = $filter;
            } else {
                // Without the extra plugin we need to use groovy to attempt the regex.
                // It's less good but it's something.
                $script = <<<GROOVY
import org.apache.lucene.util.automaton.*;
sourceText = _source.get("source_text");
if (sourceText == null) {
\tfalse;
} else {
\tif (automaton == null) {
\t\tif (insensitive) {
\t\t\tlocale = new Locale(language);
\t\t\tpattern = pattern.toLowerCase(locale);
\t\t}
\t\tregexp = new RegExp(pattern, RegExp.ALL ^ RegExp.AUTOMATON);
\t\tautomaton = new CharacterRunAutomaton(regexp.toAutomaton());
\t}
\tif (insensitive) {
\t\tsourceText = sourceText.toLowerCase(locale);
\t}
\tautomaton.run(sourceText);
}
GROOVY;
                $filterDestination[] = new \Elastica\Filter\Script(new \Elastica\Script(
                    $script,
                    array(
                        'pattern' => '.*(' . $matches['pattern'] . ').*',
                        'insensitive' => $insensitive,
                        'language' => $searcher->config->get('LanguageCode'),
                        'automaton' => null,
                        'locale' => null,
                    ),
                    'groovy'
                ));
            }
        }
    );

    // Match filters that look like foobar:thing or foobar:"thing thing"
    // The {7,15} keeps this from having horrible performance on big strings
    $escaper = $this->escaper;
    $fuzzyQuery = $this->fuzzyQuery;
    $isEmptyQuery = false;
    $this->extractSpecialSyntaxFromTerm(
        '/(?<key>[a-z\\-]{7,15}):\\s*(?<value>"(?<quoted>(?:[^"]|(?<=\\\\)")+)"|(?<unquoted>\\S+)) ?/',
        function ($matches) use (
            $searcher,
            $escaper,
            &$filters,
            &$notFilters,
            &$boostTemplates,
            &$searchContainedSyntax,
            &$fuzzyQuery,
            &$highlightSource,
            &$isEmptyQuery
        ) {
            $key = $matches['key'];
            $quotedValue = $matches['value'];
            $value = $matches['quoted'] !== ''
                ? str_replace('\\"', '"', $matches['quoted'])
                : $matches['unquoted'];

            // A leading "-" negates the filter: it goes to the not-filters and its text is
            // never kept in the term.
            $filterDestination =& $filters;
            $keepText = true;
            if ($key[0] === '-') {
                $key = substr($key, 1);
                $filterDestination =& $notFilters;
                $keepText = false;
            }

            switch ($key) {
                case 'boost-templates':
                    $boostTemplates = Searcher::parseBoostTemplates($value);
                    if ($boostTemplates === null) {
                        $boostTemplates = Searcher::getDefaultBoostTemplates();
                    }
                    $searchContainedSyntax = true;
                    return '';

                case 'hastemplate':
                    // We emulate template syntax here as best as possible,
                    // so things in NS_MAIN are prefixed with ":" and things
                    // in NS_TEMPLATE don't have a prefix at all. Since we
                    // don't actually index templates like that, munge the
                    // query here
                    if (strpos($value, ':') === 0) {
                        $value = substr($value, 1);
                    } else {
                        $title = Title::newFromText($value);
                        if ($title && $title->getNamespace() == NS_MAIN) {
                            $value = Title::makeTitle(NS_TEMPLATE, $title->getDBkey())->getPrefixedText();
                        }
                    }
                    $filterDestination[] = $searcher->matchPage('template', $value);
                    $searchContainedSyntax = true;
                    return '';

                case 'linksto':
                    $filterDestination[] = $searcher->matchPage('outgoing_link', $value, true);
                    $searchContainedSyntax = true;
                    return '';

                case 'incategory':
                    $categories = array_slice(
                        explode('|', $value),
                        0,
                        $searcher->config->get('CirrusSearchMaxIncategoryOptions')
                    );
                    $categoryFilters = $searcher->matchPageCategories($categories);
                    if ($categoryFilters === null) {
                        // Handled after extraction: the search short-circuits to an empty result set.
                        $isEmptyQuery = true;
                    } else {
                        $filterDestination[] = $categoryFilters;
                    }
                    $searchContainedSyntax = true;
                    return '';

                case 'insource':
                    $updateReferences = Filters::insource($escaper, $searcher->getSearchContext(), $quotedValue);
                    $updateReferences($fuzzyQuery, $filterDestination, $highlightSource, $searchContainedSyntax);
                    return '';

                case 'intitle':
                    $updateReferences = Filters::intitle($escaper, $searcher->getSearchContext(), $quotedValue);
                    $updateReferences($fuzzyQuery, $filterDestination, $highlightSource, $searchContainedSyntax);
                    return $keepText ? "{$quotedValue} " : '';

                default:
                    // Not a keyword we know: leave the text in the term untouched.
                    return $matches[0];
            }
        }
    );

    if ($isEmptyQuery) {
        return Status::newGood(new SearchResultSet(true));
    }

    $this->filters = $filters;
    $this->notFilters = $notFilters;
    $this->boostTemplates = $boostTemplates;
    $this->searchContext->setSearchContainedSyntax($searchContainedSyntax);
    $this->fuzzyQuery = $fuzzyQuery;
    $this->highlightSource = $highlightSource;

    $this->term = $this->escaper->escapeQuotes($this->term);
    $this->term = trim($this->term);

    // Match quoted phrases including those containing escaped quotes
    // Those phrases can optionally be followed by ~ then a number (this is the phrase slop)
    // That can optionally be followed by a ~ (this matches stemmed words in phrases)
    // The following all match: "a", "a boat", "a\"boat", "a boat"~, "a boat"~9, "a boat"~9~, -"a boat", -"a boat"~9~
    $slop = $this->config->get('CirrusSearchPhraseSlop');
    $query = self::replacePartsOfQuery(
        $this->term,
        '/(?<![\\]])(?<negate>-|!)?(?<main>"((?:[^"]|(?<=\\\\)")+)"(?<slop>~\\d+)?)(?<fuzzy>~)?/',
        function ($matches) use ($searcher, $escaper, $slop) {
            $negate = $matches['negate'][0] ? 'NOT ' : '';
            $main = $escaper->fixupQueryStringPart($matches['main'][0]);

            // A plain "some words*" phrase becomes a phrase_prefix match instead of a query string
            // part. The inner match writes to its own variable so $matches is left untouched for
            // the checks further down.
            if (
                !$negate
                && !isset($matches['fuzzy'])
                && !isset($matches['slop'])
                && preg_match('/^"([^"*]+)[*]"/', $main, $prefixParts)
            ) {
                $phraseMatch = new \Elastica\Query\Match();
                $phraseMatch->setFieldQuery("all.plain", $prefixParts[1]);
                $phraseMatch->setFieldType("all.plain", "phrase_prefix");
                $searcher->nonTextQueries[] = $phraseMatch;

                $phraseHighlightMatch = new \Elastica\Query\QueryString();
                $phraseHighlightMatch->setQuery($prefixParts[1] . '*');
                $phraseHighlightMatch->setFields(array('all.plain'));
                $searcher->nonTextHighlightQueries[] = $phraseHighlightMatch;

                return array();
            }

            if (!isset($matches['fuzzy'])) {
                if (!isset($matches['slop'])) {
                    $main = $main . '~' . $slop['precise'];
                }
                // Got to collect phrases that don't use the all field so we can highlight them.
                // The highlighter locks phrases to the fields that specify them. It doesn't do
                // that with terms.
                return array(
                    'escaped' => $negate . $searcher->switchSearchToExact($main, true),
                    'nonAll' => $negate . $searcher->switchSearchToExact($main, false),
                );
            }

            return array('escaped' => $negate . $main);
        }
    );

    // Find prefix matches and force them to only match against the plain analyzed fields. This
    // prevents prefix matches from getting confused by stemming. Users really don't expect stemming
    // in prefix queries.
    $query = self::replaceAllPartsOfQuery(
        $query,
        '/\\w+\\*(?:\\w*\\*?)*/u',
        function ($matches) use ($searcher, $escaper) {
            $term = $escaper->fixupQueryStringPart($matches[0][0]);
            return array(
                'escaped' => $searcher->switchSearchToExactForWildcards($term),
                'nonAll' => $searcher->switchSearchToExactForWildcards($term),
            );
        }
    );

    // Flatten the query parts into the three strings the query builders need.
    $escapedQuery = array();
    $nonAllQuery = array();
    $nearMatchQuery = array();
    foreach ($query as $queryPart) {
        if (isset($queryPart['escaped'])) {
            $escapedQuery[] = $queryPart['escaped'];
            if (isset($queryPart['nonAll'])) {
                $nonAllQuery[] = $queryPart['nonAll'];
            } else {
                $nonAllQuery[] = $queryPart['escaped'];
            }
            continue;
        }
        if (isset($queryPart['raw'])) {
            $fixed = $this->escaper->fixupQueryStringPart($queryPart['raw']);
            $escapedQuery[] = $fixed;
            $nonAllQuery[] = $fixed;
            $nearMatchQuery[] = $queryPart['raw'];
            continue;
        }
        LoggerFactory::getInstance('CirrusSearch')->warning(
            'Unknown query part: {queryPart}',
            array('queryPart' => serialize($queryPart))
        );
    }

    // Actual text query
    list($queryStringQueryString, $this->fuzzyQuery) =
        $escaper->fixupWholeQueryString(implode(' ', $escapedQuery));
    // Note that no escaping is required for near_match's match query.
    $nearMatchQuery = implode(' ', $nearMatchQuery);

    if ($queryStringQueryString !== '') {
        if (preg_match('/(?<!\\\\)[?*+~"!|-]|AND|OR|NOT/', $queryStringQueryString)) {
            $this->searchContext->setSearchContainedSyntax(true);
            // We're unlikely to make good suggestions for query strings with special syntax in them....
            $showSuggestion = false;
        }

        $fields = array_merge(
            $this->buildFullTextSearchFields(1, '.plain', true),
            $this->buildFullTextSearchFields($this->config->get('CirrusSearchStemmedWeight'), '', true)
        );
        $nearMatchFields = $this->buildFullTextSearchFields(
            $this->config->get('CirrusSearchNearMatchWeight'),
            '.near_match',
            true
        );
        $this->query = $this->buildSearchTextQuery(
            $fields,
            $nearMatchFields,
            $queryStringQueryString,
            $nearMatchQuery
        );

        // The highlighter doesn't know about the weighting from the all fields so we have to send
        // it a query without the all fields. This swaps one in.
        if ($this->config->getElement('CirrusSearchAllFields', 'use')) {
            $nonAllFields = array_merge(
                $this->buildFullTextSearchFields(1, '.plain', false),
                $this->buildFullTextSearchFields($this->config->get('CirrusSearchStemmedWeight'), '', false)
            );
            list($nonAllQueryString, ) = $escaper->fixupWholeQueryString(implode(' ', $nonAllQuery));
            $this->highlightQuery = $this->buildSearchTextQueryForFields(
                $nonAllFields,
                $nonAllQueryString,
                1,
                false,
                true
            );
        } else {
            $nonAllFields = $fields;
        }

        // Only do a phrase match rescore if the query doesn't include any quotes and has a space.
        // Queries without spaces are either single term or have a phrase query generated.
        // Queries with the quote already contain a phrase query and we can't build phrase queries
        // out of phrase queries at this point.
        if (
            $this->config->get('CirrusSearchPhraseRescoreBoost') > 1.0
            && $this->config->get('CirrusSearchPhraseRescoreWindowSize')
            && !$this->searchContext->isSearchContainedSyntax()
            && strpos($queryStringQueryString, '"') === false
            && strpos($queryStringQueryString, ' ') !== false
        ) {
            $rescoreFields = $fields;
            if (!$this->config->get('CirrusSearchAllFieldsForRescore')) {
                $rescoreFields = $nonAllFields;
            }

            $this->rescore[] = array(
                'window_size' => $this->config->get('CirrusSearchPhraseRescoreWindowSize'),
                'query' => array(
                    'rescore_query' => $this->buildSearchTextQueryForFields(
                        $rescoreFields,
                        '"' . $queryStringQueryString . '"',
                        $this->config->getElement('CirrusSearchPhraseSlop', 'boost'),
                        true
                    ),
                    'query_weight' => 1.0,
                    'rescore_query_weight' => $this->config->get('CirrusSearchPhraseRescoreBoost'),
                ),
            );
        }

        // Suggestions are only requested when the offset is 0.
        $showSuggestion = $showSuggestion && $this->offset == 0;
        if ($showSuggestion) {
            $this->suggest = array(
                'text' => $this->term,
                'suggest' => $this->buildSuggestConfig('suggest'),
            );
        }

        $result = $this->search($searchType, $originalTerm);

        if (!$result->isOK() && $this->isParseError($result)) {
            // Elasticsearch has reported a parse error and we've already logged it when we built the status
            // so at this point all we can do is retry the query as a simple query string query.
            $this->query = new \Elastica\Query\Simple(array('simple_query_string' => array(
                'fields' => $fields,
                'query' => $queryStringQueryString,
                'default_operator' => 'AND',
            )));
            $this->rescore = array(); // Not worth trying in this state.
            $result = $this->search('degraded_full_text', $originalTerm);
            // If that doesn't work we're out of luck but it should. There's no guarantee it'll work properly
            // with the syntax we've built above but it'll do _something_ and we'll still work on fixing all
            // the parse errors that come in.
        }
    } else {
        $result = $this->search($searchType, $originalTerm);
        // No need to check for a parse error here because we don't actually create a query for
        // Elasticsearch to parse
    }

    return $result;
}