/**
 * Checks that QueryString::toArray() reflects the query text, the list of
 * fields set through setFields(), and the use_dis_max flag set through
 * setUseDisMax(), for both false and true.
 */
public function testSearchMultipleFields()
{
    $searchText = md5(rand());
    $queryString = new QueryString($searchText);

    // With nothing else configured only the query text is serialized.
    $expectedParams = array('query' => $searchText);
    $this->assertEquals(array('query_string' => $expectedParams), $queryString->toArray());

    // Build between 1 and 10 random field names.
    $fieldNames = array();
    $fieldCount = rand() % 10 + 1;
    while (count($fieldNames) < $fieldCount) {
        $fieldNames[] = md5(rand());
    }

    $queryString->setFields($fieldNames);
    $expectedParams['fields'] = $fieldNames;
    $this->assertEquals(array('query_string' => $expectedParams), $queryString->toArray());

    // The flag must be serialized as given, whichever value it takes.
    $flagValues = array(false, true);
    foreach ($flagValues as $useDisMax) {
        $queryString->setUseDisMax($useDisMax);
        $expectedParams['use_dis_max'] = $useDisMax;
        $this->assertEquals(array('query_string' => $expectedParams), $queryString->toArray());
    }
}
/**
 * Search revisions with provided term.
 *
 * Configures $this->query with a query_string query on 'revisions.text',
 * a terms aggregation built from the search term and a highlight
 * definition, then runs the search through a PoolCounter work item.
 *
 * @param string $term Term to search
 * @return Status Good status wrapping the result of the Elastica search on
 *  success; fatal status if the search throws or the pool reports an error.
 */
public function searchText($term)
{
    // full-text search
    $queryString = new QueryString($term);
    $queryString->setFields(array('revisions.text'));
    $this->query->setQuery($queryString);

    // add aggregation to determine exact amount of matching search terms
    $terms = $this->getTerms($term);
    $this->query->addAggregation($this->termsAggregation($terms));

    // @todo: abstract-away this config? (core/cirrus also has this - share it somehow?)
    $this->query->setHighlight(array(
        'fields' => array(
            static::HIGHLIGHT_FIELD => array(
                'type' => 'plain',
                'order' => 'score',
                'number_of_fragments' => 1,
                'fragment_size' => 10000
            )
        ),
        'pre_tags' => array(static::HIGHLIGHT_PRE),
        'post_tags' => array(static::HIGHLIGHT_POST)
    ));

    // @todo: support insource: queries (and perhaps others)

    // Search the whole index, or only one type of it when a type was set.
    $searchable = Connection::getFlowIndex($this->indexBaseName);
    if ($this->type !== false) {
        $searchable = $searchable->getType($this->type);
    }
    $search = $searchable->createSearch($this->query);

    // @todo: PoolCounter config at PoolCounterSettings-eqiad.php
    // @todo: do we want this class to extend from ElasticsearchIntermediary and use
    // its success & failure methods (like CirrusSearch/Searcher does)?

    // Perform the search
    $work = new PoolCounterWorkViaCallback('Flow-Search', "_elasticsearch", array(
        'doWork' => function () use ($search) {
            try {
                $result = $search->search();
                return Status::newGood($result);
            } catch (ExceptionInterface $e) {
                // strpos() returns 0 when the needle starts the message, so the
                // result must be compared against false rather than tested for
                // truthiness, otherwise that case is missed.
                if (strpos($e->getMessage(), 'dynamic scripting for [groovy] disabled') !== false) {
                    // known issue with default ES config, let's display a more helpful message
                    return Status::newFatal(new \RawMessage(
                        "Couldn't complete search: dynamic scripting needs to be enabled. " .
                        "Please add 'script.disable_dynamic: false' to your elasticsearch.yml"
                    ));
                }

                return Status::newFatal('flow-error-search');
            }
        },
        'error' => function (Status $status) {
            // Log the first reported error, but only show a generic message to the caller.
            $errors = $status->getErrorsArray();
            wfLogWarning('Pool error searching Elasticsearch: ' . $errors[0][0]);
            return Status::newFatal('flow-error-search');
        }
    ));

    return $work->execute();
}
/**
 * Build a query_string filter for a single field and return a callback
 * that registers it with the caller's state.
 *
 * The value is passed through the escaper, the target field is chosen by
 * calling $fieldF with the fixed-up query string, and the resulting query
 * is handed to $context->wrapInSaferIfPossible(). The returned closure
 * takes four by-reference arguments: it overwrites the fuzzy-query flag,
 * appends the filter to the filter destination, optionally appends a
 * highlight source entry, and sets the "contained syntax" flag to true.
 *
 * @param Escaper $escaper
 * @param SearchContext $context
 * @param string $value
 * @param bool $updateHighlightSourceRef whether the closure should also add a highlight source
 * @param callable $fieldF receives the fixed-up query string, returns the field name
 * @return callable
 */
private static function insourceOrIntitle($escaper, $context, $value, $updateHighlightSourceRef, $fieldF)
{
    $fixedPart = $escaper->fixupQueryStringPart($value);
    list($fixedQueryString, $isFuzzy) = $escaper->fixupWholeQueryString($fixedPart);

    $targetField = $fieldF($fixedQueryString);

    $queryStringQuery = new \Elastica\Query\QueryString($fixedQueryString);
    $queryStringQuery->setFields(array($targetField));
    $queryStringQuery->setDefaultOperator('AND');
    $queryStringQuery->setAllowLeadingWildcard($escaper->getAllowLeadingWildcard());
    $queryStringQuery->setFuzzyPrefixLength(2);
    $queryStringQuery->setRewrite('top_terms_boost_1024');

    $saferQuery = $context->wrapInSaferIfPossible($queryStringQuery, false);
    $withHighlight = $updateHighlightSourceRef;

    return function (
        &$fuzzyQueryRef,
        &$filterDestinationRef,
        &$highlightSourceRef,
        &$searchContainedSyntaxRef
    ) use ($isFuzzy, $saferQuery, $withHighlight) {
        $fuzzyQueryRef = $isFuzzy;
        $filterDestinationRef[] = new \Elastica\Filter\Query($saferQuery);
        if ($withHighlight) {
            $highlightSourceRef[] = array('query' => $saferQuery);
        }
        $searchContainedSyntaxRef = true;
    };
}
/**
 * Search articles with provided term.
 *
 * The term is processed in stages, each of which strips the syntax it
 * understands out of $this->term before the next one runs:
 *  1. "prefix:" title/namespace prefix filter,
 *  2. "prefer-recent:" decay settings,
 *  3. "local:" restriction,
 *  4. "insource:/regex/" filters,
 *  5. generic "key:value" filters (boost-templates, hastemplate, linksto,
 *     incategory, insource, intitle),
 *  6. quoted phrases and wildcard terms,
 * and whatever remains is turned into the main query_string query. If
 * Elasticsearch reports a parse error the search is retried once as a
 * simple_query_string query.
 *
 * @param string $term term to search
 * @param boolean $showSuggestion should this search suggest alternative searches that might be better?
 * @return Status(mixed) status containing results defined by resultsType on success
 */
public function searchText($term, $showSuggestion)
{
    // Bail out early with the failing status if the length check rejects the term.
    $checkLengthStatus = self::checkTextSearchRequestLength($term);
    if (!$checkLengthStatus->isOk()) {
        return $checkLengthStatus;
    }

    // Transform Mediawiki specific syntax to filters and extra (pre-escaped) query string
    // $searcher is an alias of $this that the closures below capture explicitly.
    $searcher = $this;
    $originalTerm = $term;
    $searchContainedSyntax = false;
    $this->term = $term;
    $this->boostLinks = $this->config->get('CirrusSearchBoostLinks');
    $searchType = 'full_text';

    // Handle title prefix notation
    $prefixPos = strpos($this->term, 'prefix:');
    if ($prefixPos !== false) {
        // Everything after "prefix:" (7 characters) is the prefix value.
        $value = substr($this->term, 7 + $prefixPos);
        // Trim quotes in case the user wanted to quote the prefix
        $value = trim($value, '"');
        if (strlen($value) > 0) {
            $searchContainedSyntax = true;
            // Keep only the text before "prefix:", also dropping the character just before it.
            $this->term = substr($this->term, 0, max(0, $prefixPos - 1));
            $this->suggestSuffixes[] = ' prefix:' . $value;

            // Suck namespaces out of $value
            $cirrusSearchEngine = new CirrusSearch();
            $cirrusSearchEngine->setConnection($this->connection);
            $value = trim($cirrusSearchEngine->replacePrefixes($value));
            $this->namespaces = $cirrusSearchEngine->namespaces;

            // If the namespace prefix wasn't the entire prefix filter then add a filter for the title
            if (strpos($value, ':') !== strlen($value) - 1) {
                $value = str_replace('_', ' ', $value);
                $prefixQuery = new \Elastica\Query\Match();
                $prefixQuery->setFieldQuery('title.prefix', $value);
                $this->filters[] = new \Elastica\Filter\Query($prefixQuery);
            }
        }
    }

    $preferRecentDecayPortion = $this->config->get('CirrusSearchPreferRecentDefaultDecayPortion');
    $preferRecentHalfLife = $this->config->get('CirrusSearchPreferRecentDefaultHalfLife');
    $unspecifiedDecayPortion = $this->config->get('CirrusSearchPreferRecentUnspecifiedDecayPortion');

    // Matches "prefer-recent:" and then an optional floating point number <= 1 but >= 0 (decay
    // portion) and then an optional comma followed by another floating point number >= 0 (half life)
    // NOTE(review): extractSpecialSyntaxFromTerm() is not visible here; presumably it replaces
    // each match in $this->term with the callback's return value - confirm.
    $this->extractSpecialSyntaxFromTerm(
        '/prefer-recent:(1|0?(?:\\.\\d+)?)?(?:,(\\d*\\.?\\d+))? ?/',
        function ($matches) use ($unspecifiedDecayPortion, &$preferRecentDecayPortion, &$preferRecentHalfLife, &$searchContainedSyntax) {
            // A bare "prefer-recent:" (no number) falls back to the "unspecified" decay portion.
            if (isset($matches[1]) && strlen($matches[1])) {
                $preferRecentDecayPortion = floatval($matches[1]);
            } else {
                $preferRecentDecayPortion = $unspecifiedDecayPortion;
            }
            if (isset($matches[2])) {
                $preferRecentHalfLife = floatval($matches[2]);
            }
            $searchContainedSyntax = true;
            return '';
        }
    );
    $this->preferRecentDecayPortion = $preferRecentDecayPortion;
    $this->preferRecentHalfLife = $preferRecentHalfLife;

    // A leading "local:" limits the search to the local wiki. Note that, unlike the
    // other handlers, this one does not set $searchContainedSyntax.
    $this->extractSpecialSyntaxFromTerm(
        '/^\\s*local:/',
        function ($matches) use ($searcher) {
            $searcher->limitSearchToLocalWiki(true);
            return '';
        }
    );

    // Handle other filters
    // These are local working copies; the closures below mutate them by reference and
    // they are written back to $this only after the key:value pass has finished.
    $filters = $this->filters;
    $notFilters = $this->notFilters;
    $boostTemplates = self::getDefaultBoostTemplates();
    $highlightSource = array();

    // insource:/regex/ with an optional leading "-" (negation) and trailing "i" (case insensitive).
    $this->extractSpecialSyntaxFromTerm(
        '/(?<not>-)?insource:\\/(?<pattern>(?:[^\\\\\\/]|\\\\.)+)\\/(?<insensitive>i)? ?/',
        function ($matches) use ($searcher, &$filters, &$notFilters, &$searchContainedSyntax, &$searchType, &$highlightSource) {
            // NOTE(review): when regex search is disabled this returns null rather than the
            // matched text; what happens to the match then depends on
            // extractSpecialSyntaxFromTerm(), which is not visible here - confirm intent.
            if (!$searcher->config->get('CirrusSearchEnableRegex')) {
                return;
            }
            $searchContainedSyntax = true;
            $searchType = 'regex';
            $insensitive = !empty($matches['insensitive']);

            // Negated patterns go to the "not" filters and are not highlighted.
            $filterDestination =& $filters;
            if (!empty($matches['not'])) {
                $filterDestination =& $notFilters;
            } else {
                $highlightSource[] = array(
                    'pattern' => $matches['pattern'],
                    'locale' => $searcher->config->get('LanguageCode'),
                    'insensitive' => $insensitive
                );
            }

            $regex = $searcher->config->getElement('CirrusSearchWikimediaExtraPlugin', 'regex');
            // NOTE(review): in_array() is used without strict comparison here.
            if ($regex && in_array('use', $regex)) {
                // Use the source_regex filter, with the trigram field passed alongside the source field.
                $filter = new SourceRegex($matches['pattern'], 'source_text', 'source_text.trigram');
                if (isset($regex['max_inspect'])) {
                    $filter->setMaxInspect($regex['max_inspect']);
                } else {
                    $filter->setMaxInspect(10000);
                }
                $filter->setMaxDeterminizedStates($searcher->config->get('CirrusSearchRegexMaxDeterminizedStates'));
                if (isset($regex['max_ngrams_extracted'])) {
                    $filter->setMaxNgramExtracted($regex['max_ngrams_extracted']);
                }
                $filter->setCaseSensitive(!$insensitive);
                // This line uses $this where the rest of the closure uses $searcher; both
                // refer to the same object, but $this inside a closure needs PHP 5.4+.
                $filter->setLocale($this->config->get('LanguageCode'));
                $filterDestination[] = $filter;
            } else {
                // Without the extra plugin we need to use groovy to attempt the regex.
                // It's less good but it's something.
                // The script lazily builds the automaton from the pattern on first use and
                // lower-cases both pattern and source text when the match is case insensitive.
                $script = <<<GROOVY
import org.apache.lucene.util.automaton.*;
sourceText = _source.get("source_text");
if (sourceText == null) {
\tfalse;
} else {
\tif (automaton == null) {
\t\tif (insensitive) {
\t\t\tlocale = new Locale(language);
\t\t\tpattern = pattern.toLowerCase(locale);
\t\t}
\t\tregexp = new RegExp(pattern, RegExp.ALL ^ RegExp.AUTOMATON);
\t\tautomaton = new CharacterRunAutomaton(regexp.toAutomaton());
\t}
\tif (insensitive) {
\t\tsourceText = sourceText.toLowerCase(locale);
\t}
\tautomaton.run(sourceText);
}
GROOVY;
                // The pattern is wrapped in ".*( ... ).*" before being handed to the script.
                $filterDestination[] = new \Elastica\Filter\Script(new \Elastica\Script($script, array(
                    'pattern' => '.*(' . $matches['pattern'] . ').*',
                    'insensitive' => $insensitive,
                    'language' => $searcher->config->get('LanguageCode'),
                    'automaton' => null,
                    'locale' => null
                ), 'groovy'));
            }
        }
    );

    // Match filters that look like foobar:thing or foobar:"thing thing"
    // The {7,15} keeps this from having horrible performance on big strings
    $escaper = $this->escaper;
    $fuzzyQuery = $this->fuzzyQuery;
    $isEmptyQuery = false;
    $this->extractSpecialSyntaxFromTerm(
        '/(?<key>[a-z\\-]{7,15}):\\s*(?<value>"(?<quoted>(?:[^"]|(?<=\\\\)")+)"|(?<unquoted>\\S+)) ?/',
        function ($matches) use ($searcher, $escaper, &$filters, &$notFilters, &$boostTemplates, &$searchContainedSyntax, &$fuzzyQuery, &$highlightSource, &$isEmptyQuery) {
            $key = $matches['key'];
            // $quotedValue keeps the surrounding quotes (if any); $value is the unquoted,
            // unescaped form.
            $quotedValue = $matches['value'];
            $value = $matches['quoted'] !== '' ? str_replace('\\"', '"', $matches['quoted']) : $matches['unquoted'];

            // A leading "-" negates the filter: it goes to the "not" filters and its text
            // is not kept in the query.
            $filterDestination =& $filters;
            $keepText = true;
            if ($key[0] === '-') {
                $key = substr($key, 1);
                $filterDestination =& $notFilters;
                $keepText = false;
            }

            switch ($key) {
                case 'boost-templates':
                    // Fall back to the defaults when the value cannot be parsed.
                    $boostTemplates = Searcher::parseBoostTemplates($value);
                    if ($boostTemplates === null) {
                        $boostTemplates = Searcher::getDefaultBoostTemplates();
                    }
                    $searchContainedSyntax = true;
                    return '';
                case 'hastemplate':
                    // We emulate template syntax here as best as possible,
                    // so things in NS_MAIN are prefixed with ":" and things
                    // in NS_TEMPLATE don't have a prefix at all. Since we
                    // don't actually index templates like that, munge the
                    // query here
                    if (strpos($value, ':') === 0) {
                        $value = substr($value, 1);
                    } else {
                        $title = Title::newFromText($value);
                        if ($title && $title->getNamespace() == NS_MAIN) {
                            $value = Title::makeTitle(NS_TEMPLATE, $title->getDBkey())->getPrefixedText();
                        }
                    }
                    $filterDestination[] = $searcher->matchPage('template', $value);
                    $searchContainedSyntax = true;
                    return '';
                case 'linksto':
                    $filterDestination[] = $searcher->matchPage('outgoing_link', $value, true);
                    $searchContainedSyntax = true;
                    return '';
                case 'incategory':
                    // Only the first CirrusSearchMaxIncategoryOptions "|"-separated categories are used.
                    $categories = array_slice(explode('|', $value), 0, $searcher->config->get('CirrusSearchMaxIncategoryOptions'));
                    $categoryFilters = $searcher->matchPageCategories($categories);
                    if ($categoryFilters === null) {
                        // A null result makes the whole search return an empty result set (see below).
                        $isEmptyQuery = true;
                    } else {
                        $filterDestination[] = $categoryFilters;
                    }
                    $searchContainedSyntax = true;
                    return '';
                case 'insource':
                    $updateReferences = Filters::insource($escaper, $searcher->getSearchContext(), $quotedValue);
                    $updateReferences($fuzzyQuery, $filterDestination, $highlightSource, $searchContainedSyntax);
                    return '';
                case 'intitle':
                    $updateReferences = Filters::intitle($escaper, $searcher->getSearchContext(), $quotedValue);
                    $updateReferences($fuzzyQuery, $filterDestination, $highlightSource, $searchContainedSyntax);
                    // Unlike insource, a non-negated intitle value stays in the query text.
                    return $keepText ? "{$quotedValue} " : '';
                default:
                    // Unknown key: leave the text untouched.
                    return $matches[0];
            }
        }
    );

    if ($isEmptyQuery) {
        return Status::newGood(new SearchResultSet(true));
    }

    // Publish the state collected by the closures above.
    $this->filters = $filters;
    $this->notFilters = $notFilters;
    $this->boostTemplates = $boostTemplates;
    $this->searchContext->setSearchContainedSyntax($searchContainedSyntax);
    $this->fuzzyQuery = $fuzzyQuery;
    $this->highlightSource = $highlightSource;

    $this->term = $this->escaper->escapeQuotes($this->term);
    $this->term = trim($this->term);

    // Match quoted phrases including those containing escaped quotes
    // Those phrases can optionally be followed by ~ then a number (this is the phrase slop)
    // That can optionally be followed by a ~ (this matches stemmed words in phrases)
    // The following all match: "a", "a boat", "a\"boat", "a boat"~, "a boat"~9, "a boat"~9~, -"a boat", -"a boat"~9~
    $slop = $this->config->get('CirrusSearchPhraseSlop');
    $query = self::replacePartsOfQuery(
        $this->term,
        '/(?<![\\]])(?<negate>-|!)?(?<main>"((?:[^"]|(?<=\\\\)")+)"(?<slop>~\\d+)?)(?<fuzzy>~)?/',
        function ($matches) use ($searcher, $escaper, $slop) {
            $negate = $matches['negate'][0] ? 'NOT ' : '';
            $main = $escaper->fixupQueryStringPart($matches['main'][0]);

            // A non-negated phrase without slop/fuzzy markers that ends in "*" becomes a
            // phrase_prefix match. Note that preg_match() overwrites $matches here; it is
            // only reached when 'fuzzy' and 'slop' were already unset, so the checks
            // further down see the same answers if it does not match.
            // NOTE(review): Elastica is referenced without a leading backslash in this
            // closure, unlike the rest of the method - verify it resolves as intended.
            if (!$negate && !isset($matches['fuzzy']) && !isset($matches['slop']) && preg_match('/^"([^"*]+)[*]"/', $main, $matches)) {
                $phraseMatch = new Elastica\Query\Match();
                $phraseMatch->setFieldQuery("all.plain", $matches[1]);
                $phraseMatch->setFieldType("all.plain", "phrase_prefix");
                $this->nonTextQueries[] = $phraseMatch;

                $phraseHighlightMatch = new Elastica\Query\QueryString();
                $phraseHighlightMatch->setQuery($matches[1] . '*');
                $phraseHighlightMatch->setFields(array('all.plain'));
                $this->nonTextHighlightQueries[] = $phraseHighlightMatch;

                return array();
            }

            if (!isset($matches['fuzzy'])) {
                if (!isset($matches['slop'])) {
                    // No explicit slop given: append the configured "precise" slop.
                    $main = $main . '~' . $slop['precise'];
                }
                // Got to collect phrases that don't use the all field so we can highlight them.
                // The highlighter locks phrases to the fields that specify them. It doesn't do
                // that with terms.
                return array(
                    'escaped' => $negate . $searcher->switchSearchToExact($main, true),
                    'nonAll' => $negate . $searcher->switchSearchToExact($main, false)
                );
            }
            return array('escaped' => $negate . $main);
        }
    );

    // Find prefix matches and force them to only match against the plain analyzed fields. This
    // prevents prefix matches from getting confused by stemming. Users really don't expect stemming
    // in prefix queries.
    $query = self::replaceAllPartsOfQuery(
        $query,
        '/\\w+\\*(?:\\w*\\*?)*/u',
        function ($matches) use ($searcher, $escaper) {
            $term = $escaper->fixupQueryStringPart($matches[0][0]);
            return array(
                'escaped' => $searcher->switchSearchToExactForWildcards($term),
                'nonAll' => $searcher->switchSearchToExactForWildcards($term)
            );
        }
    );

    // Split the query parts into three parallel lists: the escaped query string, the
    // variant used for highlighting without the all field, and the raw text for near match.
    $escapedQuery = array();
    $nonAllQuery = array();
    $nearMatchQuery = array();
    foreach ($query as $queryPart) {
        if (isset($queryPart['escaped'])) {
            $escapedQuery[] = $queryPart['escaped'];
            if (isset($queryPart['nonAll'])) {
                $nonAllQuery[] = $queryPart['nonAll'];
            } else {
                $nonAllQuery[] = $queryPart['escaped'];
            }
            continue;
        }
        if (isset($queryPart['raw'])) {
            // Raw parts are escaped for the query string but go to near match unescaped.
            $fixed = $this->escaper->fixupQueryStringPart($queryPart['raw']);
            $escapedQuery[] = $fixed;
            $nonAllQuery[] = $fixed;
            $nearMatchQuery[] = $queryPart['raw'];
            continue;
        }
        // Parts with neither key (e.g. the empty array returned above) are only logged.
        LoggerFactory::getInstance('CirrusSearch')->warning(
            'Unknown query part: {queryPart}',
            array('queryPart' => serialize($queryPart))
        );
    }

    // Actual text query
    list($queryStringQueryString, $this->fuzzyQuery) = $escaper->fixupWholeQueryString(implode(' ', $escapedQuery));
    // Note that no escaping is required for near_match's match query.
    $nearMatchQuery = implode(' ', $nearMatchQuery);

    if ($queryStringQueryString !== '') {
        // Unescaped query_string operators or the AND/OR/NOT keywords count as special syntax.
        if (preg_match('/(?<!\\\\)[?*+~"!|-]|AND|OR|NOT/', $queryStringQueryString)) {
            $this->searchContext->setSearchContainedSyntax(true);
            // We're unlikely to make good suggestions for query string with special syntax in them....
            $showSuggestion = false;
        }

        $fields = array_merge(
            $this->buildFullTextSearchFields(1, '.plain', true),
            $this->buildFullTextSearchFields($this->config->get('CirrusSearchStemmedWeight'), '', true)
        );
        $nearMatchFields = $this->buildFullTextSearchFields($this->config->get('CirrusSearchNearMatchWeight'), '.near_match', true);
        $this->query = $this->buildSearchTextQuery($fields, $nearMatchFields, $queryStringQueryString, $nearMatchQuery);

        // The highlighter doesn't know about the weighting from the all fields so we have to send
        // it a query without the all fields. This swaps one in.
        if ($this->config->getElement('CirrusSearchAllFields', 'use')) {
            $nonAllFields = array_merge(
                $this->buildFullTextSearchFields(1, '.plain', false),
                $this->buildFullTextSearchFields($this->config->get('CirrusSearchStemmedWeight'), '', false)
            );
            list($nonAllQueryString, ) = $escaper->fixupWholeQueryString(implode(' ', $nonAllQuery));
            $this->highlightQuery = $this->buildSearchTextQueryForFields($nonAllFields, $nonAllQueryString, 1, false, true);
        } else {
            $nonAllFields = $fields;
        }

        // Only do a phrase match rescore if the query doesn't include any quotes and has a space.
        // Queries without spaces are either single term or have a phrase query generated.
        // Queries with the quote already contain a phrase query and we can't build phrase queries
        // out of phrase queries at this point.
        if ($this->config->get('CirrusSearchPhraseRescoreBoost') > 1.0
            && $this->config->get('CirrusSearchPhraseRescoreWindowSize')
            && !$this->searchContext->isSearchContainedSyntax()
            && strpos($queryStringQueryString, '"') === false
            && strpos($queryStringQueryString, ' ') !== false
        ) {
            $rescoreFields = $fields;
            if (!$this->config->get('CirrusSearchAllFieldsForRescore')) {
                $rescoreFields = $nonAllFields;
            }

            // Rescore with the whole query string wrapped in quotes, i.e. as one phrase.
            $this->rescore[] = array(
                'window_size' => $this->config->get('CirrusSearchPhraseRescoreWindowSize'),
                'query' => array(
                    'rescore_query' => $this->buildSearchTextQueryForFields(
                        $rescoreFields,
                        '"' . $queryStringQueryString . '"',
                        $this->config->getElement('CirrusSearchPhraseSlop', 'boost'),
                        true
                    ),
                    'query_weight' => 1.0,
                    'rescore_query_weight' => $this->config->get('CirrusSearchPhraseRescoreBoost')
                )
            );
        }

        // Suggestions are only requested for the first page of results.
        $showSuggestion = $showSuggestion && $this->offset == 0;
        if ($showSuggestion) {
            $this->suggest = array(
                'text' => $this->term,
                'suggest' => $this->buildSuggestConfig('suggest')
            );
        }

        $result = $this->search($searchType, $originalTerm);

        if (!$result->isOK() && $this->isParseError($result)) {
            // Elasticsearch has reported a parse error and we've already logged it when we built the status
            // so at this point all we can do is retry the query as a simple query string query.
            $this->query = new \Elastica\Query\Simple(array('simple_query_string' => array(
                'fields' => $fields,
                'query' => $queryStringQueryString,
                'default_operator' => 'AND'
            )));
            // Not worth trying in this state.
            $this->rescore = array();
            $result = $this->search('degraded_full_text', $originalTerm);
            // If that doesn't work we're out of luck but it should. There's no guarantee it'll work properly
            // with the syntax we've built above but it'll do _something_ and we'll still work on fixing all
            // the parse errors that come in.
        }
    } else {
        // Nothing left for a text query; the search runs with whatever was set up above.
        $result = $this->search($searchType, $originalTerm);
        // No need to check for a parse error here because we don't actually create a query for
        // Elasticsearch to parse
    }

    return $result;
}
/**
 * Build a single-result query that matches a locality document.
 *
 * When none of 'state', 'zip' or 'locality' is given, a state is first
 * derived from the 'phone' entry (via its area code) or, failing that,
 * from the 'ipaddress' entry (via the geo lookup). Each known entry of
 * $location is then translated into a clause of a bool query:
 *  - 'locality' ("lat,lon" string or 2-element array) becomes a nested
 *    match on location.latitude / location.longitude, unless 'zip' is set;
 *  - 'city' becomes a "should" match, unless 'locality' is set;
 *  - 'state' becomes one nested query_string clause per state on
 *    state.abbrev / state.full ("should" when several states are given,
 *    "must" otherwise), unless 'locality' is set;
 *  - 'zip' becomes a "must" match.
 *
 * @param array $location
 *
 * @return Elastica\Query $localityQuery
 */
public static function getLocalityQuery($location)
{
    $query = new Agent\Elastica\Query\BoolQuery();

    $hasExplicitLocation = isset($location['state'])
        || isset($location['zip'])
        || isset($location['locality']);

    if (!$hasExplicitLocation) {
        // Try the phone number first, then the IP address; stop looking once a state is known.
        foreach (['phone', 'ipaddress'] as $source) {
            if (!isset($location[$source]) || !empty($location['state'])) {
                continue;
            }

            if ($source === 'ipaddress') {
                $geo = self::$geo;
                $record = $geo->getRecord($location['ipaddress']);
                if ($record instanceof Record) {
                    $region = $record->getRegion();
                    if ($region) {
                        $location['state'] = $region;
                    }
                }
            } elseif ($source === 'phone') {
                $phoneParts = Helper::parse_phonenumber($location['phone'], 'array');
                if ($phoneParts) {
                    $areaState = Helper::area_code_to_state($phoneParts[0]);
                    if ($areaState) {
                        $location['state'] = $areaState;
                    }
                }
            }
        }
    }

    foreach ($location as $field => $value) {
        switch ($field) {
            case 'locality':
                if (isset($location['zip'])) {
                    break;
                }
                $coordinates = is_array($value) ? $value : explode(",", $value);
                $latlon = false;
                if (count($coordinates) == 2) {
                    $latlon = array_combine(['latitude', 'longitude'], $coordinates);
                }
                if (!$latlon) {
                    break;
                }

                $path = "location";
                $nested = new Elastica\Query\Nested();
                $nested->setPath($path);
                $coordinateQuery = new Elastica\Query\BoolQuery();
                foreach ($latlon as $dimension => $coordinate) {
                    $coordinateQuery->addMust(new Elastica\Query\Match("{$path}.{$dimension}", $coordinate));
                }
                $nested->setQuery($coordinateQuery);
                $query->addMust($nested);
                break;

            case 'city':
                if (!isset($location['locality'])) {
                    $query->addShould(new Elastica\Query\Match($field, $value));
                }
                break;

            case 'state':
                if (isset($location['locality'])) {
                    break;
                }
                $stateFields = ['state.abbrev', 'state.full'];
                $states = is_array($value) ? $value : [$value];
                $isOptional = count($states) > 1;

                foreach ($states as $state) {
                    $stateQuery = new Elastica\Query\QueryString($state);
                    $stateQuery->setFields($stateFields);
                    $nested = new Elastica\Query\Nested();
                    $nested->setQuery($stateQuery);
                    $nested->setPath($field);

                    if ($isOptional) {
                        $query->addShould($nested);
                    } else {
                        $query->addMust($nested);
                    }
                }
                break;

            case 'zip':
                $query->addMust(new Elastica\Query\Match($field, $value));
                break;
        }
    }

    // Only the single best hit is wanted.
    $localityQuery = new Elastica\Query($query);
    $localityQuery->setSize(1);

    return $localityQuery;
}
/**
 * Build the query_string query for the given fields and pass it through
 * wrapInSaferIfPossible().
 *
 * The query uses AND as default operator, auto-generates phrase queries,
 * and takes its leading-wildcard setting from CirrusSearchAllowLeadingWildcard.
 * max_determinized_states is only sent when
 * CirrusSearchQueryStringMaxDeterminizedStates is configured (non-null).
 *
 * @param string[] $fields
 * @param string $queryString
 * @param int $phraseSlop
 * @param boolean $isRescore
 * @return \Elastica\Query\Simple
 */
private function buildSearchTextQueryForFields(array $fields, $queryString, $phraseSlop, $isRescore)
{
    $textQuery = new \Elastica\Query\QueryString($queryString);
    $textQuery->setFields($fields);
    $textQuery->setAutoGeneratePhraseQueries(true);
    $textQuery->setPhraseSlop($phraseSlop);
    $textQuery->setDefaultOperator('AND');

    $allowLeadingWildcard = $this->config->get('CirrusSearchAllowLeadingWildcard');
    $textQuery->setAllowLeadingWildcard($allowLeadingWildcard);
    $textQuery->setFuzzyPrefixLength(2);
    $textQuery->setRewrite('top_terms_boost_1024');

    $maxDeterminizedStates = $this->config->get('CirrusSearchQueryStringMaxDeterminizedStates');
    if ($maxDeterminizedStates !== null) {
        // Requires ES 1.4+
        $textQuery->setParam('max_determinized_states', $maxDeterminizedStates);
    }

    return $this->wrapInSaferIfPossible($textQuery, $isRescore);
}
/**
 * Build the automatic search query for a thesaurus entry within a stay.
 *
 * The query requires both the words built from the stay id and the
 * (encoding-normalized) text of the thesaurus entry, the latter searched
 * in the "body" and "title" fields. It returns the first 30 hits and
 * highlights matches of the entry text in "body".
 *
 * @param CSearchThesaurusEntry $favori The thesaurus entry (favourite)
 * @param CSejour               $sejour The stay
 *
 * @return Query
 */
function querySearchAuto($favori, $sejour)
{
    $boolQuery = new Elastica\Query\Bool();

    // Clause restricting the search to the stay
    $stayClause = new Elastica\Query\QueryString();
    $stayWords = $this->constructWordsWithSejour($sejour->_id);
    $stayClause->setQuery($stayWords);
    $stayClause->setDefaultOperator("and");
    $boolQuery->addMust($stayClause);

    // Clause matching the text of the favourite
    $entryClause = new Elastica\Query\QueryString();
    $entryWords = $this->normalizeEncoding($favori->entry);
    $entryClause->setQuery($entryWords);
    $entryClause->setFields(array("body", "title"));
    $entryClause->setDefaultOperator("and");
    $boolQuery->addMust($entryClause);

    $searchQuery = new Query($boolQuery);

    // Pagination: start at the first hit, return at most 30
    $searchQuery->setFrom(0);
    $searchQuery->setLimit(30);

    // Highlight: fragments of "body" matching the text of the favourite
    $highlightQuery = array(
        "bool" => array(
            "must" => array(
                "match" => array(
                    "body" => array(
                        "query" => $this->normalizeEncoding($favori->entry)
                    )
                )
            ),
            "minimum_should_match" => 1
        )
    );
    $bodyHighlight = array(
        "fragment_size" => 50,
        "number_of_fragments" => 3,
        "highlight_query" => $highlightQuery
    );
    $searchQuery->setHighlight(array(
        "pre_tags" => array(" <em> <strong> "),
        "post_tags" => array(" </strong> </em>"),
        "fields" => array("body" => $bodyHighlight)
    ));

    return $searchQuery;
}