/**
 * Build the result.
 *
 * Populates the title, optional file, size/word-count/timestamp metadata and
 * every highlighted snippet (title, redirect, text, section, category) from
 * the Elasticsearch hit.
 *
 * @param \Elastica\ResultSet $results containing all search results
 * @param \Elastica\Result $result containing the given search result, i.e. the
 *  information about the result this class should represent
 * @param string $interwiki Interwiki prefix, if any
 */
public function __construct($results, $result, $interwiki = '') {
    if ($interwiki) {
        $this->setInterwiki($result, $interwiki);
    }

    $this->docId = $result->getId();
    $this->mTitle = Title::makeTitle($result->namespace, $result->title, '', $this->interwiki);
    if ($this->getTitle()->getNamespace() == NS_FILE) {
        $this->mImage = wfFindFile($this->mTitle);
    }

    // Not all results requested a word count. Just pretend we have none if so
    $fields = $result->getFields();
    if (isset($fields['text.word_count'])) {
        $this->wordCount = $fields['text.word_count'][0];
    } else {
        $this->wordCount = 0;
    }
    $this->byteSize = $result->text_bytes;
    $this->timestamp = new MWTimestamp($result->timestamp);

    $highlights = $result->getHighlights();
    $hasTitleHighlight = isset($highlights['title']);

    if ($hasTitleHighlight) {
        // Main namespace titles carry no namespace prefix in the snippet.
        if ($this->getTitle()->getNamespace() === 0) {
            $nstext = '';
        } else {
            $nstext = Util::getNamespaceText($this->getTitle()) . ':';
        }
        $this->titleSnippet = $nstext . $this->escapeHighlightedText($highlights['title'][0]);
    } elseif ($this->mTitle->isExternal()) {
        // Interwiki searches are weird. They won't have title highlights by design, but
        // if we don't return a title snippet we'll get weird display results.
        $nsText = $this->getInterwikiNamespaceText();
        $titleText = $this->mTitle->getText();
        if ($nsText) {
            $this->titleSnippet = "{$nsText}:{$titleText}";
        } else {
            $this->titleSnippet = $titleText;
        }
    }

    if (!$hasTitleHighlight && isset($highlights['redirect.title'])) {
        // Make sure to find the redirect title before escaping because escaping breaks it....
        $redirects = $result->redirect;
        $redirectHighlight = $highlights['redirect.title'][0];
        $this->redirectTitle = $this->findRedirectTitle($redirectHighlight, $redirects);
        $this->redirectSnipppet = $this->escapeHighlightedText($highlights['redirect.title'][0]);
    }

    $this->textSnippet = $this->escapeHighlightedText($this->pickTextSnippet($highlights));

    if (isset($highlights['heading'])) {
        $this->sectionSnippet = $this->escapeHighlightedText($highlights['heading'][0]);
        $this->sectionTitle = $this->findSectionTitle();
    }

    if (isset($highlights['category'])) {
        $this->categorySnippet = $this->escapeHighlightedText($highlights['category'][0]);
    }
}
/**
 * Report a caught exception through outputIndented() and then sleep for the
 * delay computed by Util::backoffDelay() before the caller retries.
 *
 * @param \Exception $e exception caught
 * @param int $errors number of errors, used to compute the backoff delay
 * @param string $messagePrefix text placed in front of the reported message
 * @param string $description text inserted after "Caught an error"
 */
public function sleepOnRetry(\Exception $e, $errors, $messagePrefix, $description) {
    $type = get_class($e);
    $seconds = Util::backoffDelay($errors);
    $message = ElasticsearchIntermediary::extractMessage($e);

    $report = $messagePrefix;
    $report .= "Caught an error {$description}. ";
    $report .= "Backing off for {$seconds} and retrying. Error type is '{$type}' and message is: {$message}\n";

    $this->outputIndented($report);
    sleep($seconds);
}
/**
 * Extract more like this settings from the i18n message
 * cirrussearch-morelikethis-settings and apply them on top of the
 * $wgCirrusSearchMoreLikeThis* globals.
 *
 * The parsed message lines are fetched through the accelerator cache with a
 * 600 second expiry. Each line is expected to look like "key:value"; lines
 * without a ':' separator are ignored. The literal value "null" removes a
 * numeric option from $wgCirrusSearchMoreLikeThisConfig.
 */
private static function overrideMoreLikeThisOptionsFromMessage() {
    global $wgCirrusSearchMoreLikeThisConfig,
        $wgCirrusSearchMoreLikeThisUseFields,
        $wgCirrusSearchMoreLikeThisAllowedFields,
        $wgCirrusSearchMoreLikeThisMaxQueryTermsLimit,
        $wgCirrusSearchMoreLikeThisFields;

    $cache = \ObjectCache::newAccelerator(CACHE_NONE);
    $lines = $cache->getWithSetCallback(
        $cache->makeKey('cirrussearch-morelikethis-settings'),
        600,
        function () {
            $source = wfMessage('cirrussearch-morelikethis-settings')->inContentLanguage();
            if ($source && $source->isDisabled()) {
                return array();
            }
            return Util::parseSettingsInMessage($source->plain());
        }
    );

    foreach ($lines as $line) {
        $parts = explode(':', $line, 2);
        // A line without ':' has no value; skip the option handling instead of
        // reading a missing array offset.
        if (count($parts) === 2) {
            list($k, $v) = $parts;
            switch ($k) {
                case 'min_doc_freq':
                case 'max_doc_freq':
                case 'max_query_terms':
                case 'min_term_freq':
                case 'min_word_len':
                case 'max_word_len':
                    if (is_numeric($v) && $v >= 0) {
                        $wgCirrusSearchMoreLikeThisConfig[$k] = intval($v);
                    } elseif ($v === 'null') {
                        unset($wgCirrusSearchMoreLikeThisConfig[$k]);
                    }
                    break;
                case 'percent_terms_to_match':
                    if (is_numeric($v) && $v > 0 && $v <= 1) {
                        $wgCirrusSearchMoreLikeThisConfig[$k] = $v;
                    } elseif ($v === 'null') {
                        unset($wgCirrusSearchMoreLikeThisConfig[$k]);
                    }
                    break;
                case 'fields':
                    // Only fields that are explicitly allowed are kept.
                    $wgCirrusSearchMoreLikeThisFields = array_intersect(
                        array_map('trim', explode(',', $v)),
                        $wgCirrusSearchMoreLikeThisAllowedFields
                    );
                    break;
                case 'use_fields':
                    if ($v === 'true') {
                        $wgCirrusSearchMoreLikeThisUseFields = true;
                    } elseif ($v === 'false') {
                        $wgCirrusSearchMoreLikeThisUseFields = false;
                    }
                    break;
            }
        }

        // Clamp max_query_terms to the configured limit. The option may have
        // just been unset by a "null" value, so check that it exists first.
        if (isset($wgCirrusSearchMoreLikeThisConfig['max_query_terms'])
            && $wgCirrusSearchMoreLikeThisConfig['max_query_terms'] > $wgCirrusSearchMoreLikeThisMaxQueryTermsLimit
        ) {
            $wgCirrusSearchMoreLikeThisConfig['max_query_terms'] = $wgCirrusSearchMoreLikeThisMaxQueryTermsLimit;
        }
    }
}
/**
 * Return the boost templates configured in the cirrussearch-boost-templates
 * message. The result is memoized in self::$defaultBoostTemplates and the
 * parsed value is fetched through the accelerator cache with a 600 second
 * expiry.
 *
 * @return float[]
 */
public static function getDefaultBoostTemplates() {
    if (self::$defaultBoostTemplates !== null) {
        return self::$defaultBoostTemplates;
    }

    $loadFromMessage = function () {
        $source = wfMessage('cirrussearch-boost-templates')->inContentLanguage();
        if ($source->isDisabled()) {
            return array();
        }
        $lines = Util::parseSettingsInMessage($source->plain());
        // Now parse the templates
        return Util::parseBoostTemplates(implode(' ', $lines));
    };

    $cache = \ObjectCache::newAccelerator(CACHE_NONE);
    self::$defaultBoostTemplates = $cache->getWithSetCallback(
        $cache->makeKey('cirrussearch-boost-templates'),
        600,
        $loadFromMessage
    );

    return self::$defaultBoostTemplates;
}
/**
 * Return the headings listed in the cirrussearch-ignored-headings message.
 * The message is read once; the parsed list is kept in a static local and
 * reused by later calls.
 *
 * @return array lines returned by Util::parseSettingsInMessage(), or an empty
 *  array when the message is disabled
 */
private function getIgnoredHeadings() {
    static $ignoredHeadings = null;

    if ($ignoredHeadings !== null) {
        return $ignoredHeadings;
    }

    $source = wfMessage('cirrussearch-ignored-headings')->inContentLanguage();
    $ignoredHeadings = array();
    if ($source->isDisabled()) {
        return $ignoredHeadings;
    }

    // Now we just have headings!
    $ignoredHeadings = Util::parseSettingsInMessage($source->plain());

    return $ignoredHeadings;
}
/**
 * Wraps the complex pool counter interface to force the single call pattern
 * that Cirrus always uses.
 *
 * When a user is given, a per-user pool counter is taken first and the
 * per-type pool counter second; the work callback runs inside both.
 *
 * @param string $type same as type parameter on PoolCounter::factory
 * @param object|null $user the user; when falsy the pool counter is bypassed
 *  and $workCallback is invoked directly
 * @param callable $workCallback callback when pool counter is acquired. Called
 *  with no parameters.
 * @param callable|null $errorCallback optional callback called on errors.
 *  Called with the error string, the key and the user name (empty string
 *  unless the user is anonymous) as parameters. If left undefined defaults to
 *  a function that returns a fatal status and logs a warning.
 * @return mixed whatever the executed work or error callback returns
 */
public static function doPoolCounterWork($type, $user, $workCallback, $errorCallback = null) {
    global $wgCirrusSearchPoolCounterKey;

    // We don't want to even use the pool counter if there isn't a user.
    if (!$user) {
        return $workCallback();
    }

    // By default the pool counter allows you to lock the same key with
    // multiple types. That might be useful but it isn't how Cirrus thinks.
    // Instead, all keys are scoped to their type.
    $userHash = md5($user->getName());
    $perUserKey = "nowait:CirrusSearch:_per_user:{$userHash}";
    $globalKey = "{$type}:{$wgCirrusSearchPoolCounterKey}";

    if ($errorCallback === null) {
        $errorCallback = function ($error, $key, $userName) {
            if ($userName) {
                $forUserName = "for {userName} ";
            } else {
                $forUserName = '';
            }
            $context = array('userName' => $userName, 'key' => $key, 'error' => $error);
            LoggerFactory::getInstance('CirrusSearch')->warning(
                "Pool error {$forUserName}on {key}: {error}",
                $context
            );
            return Status::newFatal('cirrussearch-backend-error');
        };
    }

    // Builds, for a given key, the 'error' handler handed to the pool counter.
    $errorHandler = function ($key) use ($errorCallback, $user) {
        return function ($status) use ($errorCallback, $key, $user) {
            $errors = $status->getErrorsArray();
            // anon usernames are needed within the logs to determine if
            // specific ips (such as large #'s of users behind a proxy)
            // need to be whitelisted. We do not need this information
            // for logged in users and do not store it.
            if ($user->isAnon()) {
                $userName = $user->getName();
            } else {
                $userName = '';
            }
            return $errorCallback($errors[0][0], $key, $userName);
        };
    };

    $doPerUserWork = function () use ($type, $globalKey, $workCallback, $errorHandler) {
        // Now that we have the per user lock lets get the operation lock.
        // Note that this could block, causing the user to wait in line with their lock held.
        $callbacks = array(
            'doWork' => $workCallback,
            'error' => $errorHandler($globalKey),
        );
        $operationWork = new PoolCounterWorkViaCallback($type, $globalKey, $callbacks);
        return $operationWork->execute();
    };

    $onPerUserError = function ($status) use ($errorHandler, $perUserKey, $doPerUserWork) {
        // The error callback always runs; its result is only returned when the
        // per-user pool counter is active, otherwise the work proceeds anyway.
        $errorCallback = $errorHandler($perUserKey);
        $errorResult = $errorCallback($status);
        if (!Util::isUserPoolCounterActive()) {
            return $doPerUserWork();
        }
        return $errorResult;
    };

    $perUserWork = new PoolCounterWorkViaCallback(
        'CirrusSearch-PerUser',
        $perUserKey,
        array('doWork' => $doPerUserWork, 'error' => $onPerUserError)
    );

    return $perUserWork->execute();
}
/**
 * Dump the documents of the selected index by scrolling over it and passing
 * each hit's _id, _type and _source to write().
 *
 * Options read: indexType, baseName, indexIdentifier, filter, limit and
 * sourceFields.
 */
public function execute() {
    global $wgPoolCounterConf;

    // Make sure we don't flood the pool counter
    unset($wgPoolCounterConf['CirrusSearch-Search']);

    // Set the timeout for maintenance actions
    $this->setConnectionTimeout();

    $this->indexType = $this->getOption('indexType');
    $this->indexBaseName = $this->getOption('baseName', wfWikiId());

    $indexTypes = $this->getConnection()->getAllIndexTypes();
    if (!in_array($this->indexType, $indexTypes)) {
        $this->error('indexType option must be one of ' . implode(', ', $indexTypes), 1);
    }

    // NOTE(review): $utils is not used below; kept in case the constructor matters — confirm.
    $utils = new ConfigUtils($this->getConnection()->getClient(), $this);

    $this->indexIdentifier = $this->getOption('indexIdentifier');

    $filter = null;
    if ($this->hasOption('filter')) {
        $filter = new Elastica\Filter\Query(
            new Elastica\Query\QueryString($this->getOption('filter'))
        );
    }

    $limit = (int) $this->getOption('limit', 0);

    $query = new Query();
    $query->setFields(array('_id', '_type', '_source'));
    if ($this->hasOption('sourceFields')) {
        $sourceFields = explode(',', $this->getOption('sourceFields'));
        $query->setSource(array('include' => $sourceFields));
    }
    if ($filter) {
        $query->setQuery(new \Elastica\Query\Filtered(new \Elastica\Query\MatchAll(), $filter));
    }

    $scrollOptions = array(
        'search_type' => 'scan',
        'scroll' => "15m",
        'size' => $this->inputChunkSize,
    );
    $index = $this->getIndex();
    $result = $index->search($query, $scrollOptions);

    $totalDocsInIndex = $result->getResponse()->getData();
    $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
    // A limit larger than the number of matching documents can never be
    // reached, so cap the announced total at what the index actually holds.
    $totalDocsToDump = $limit > 0 ? min($limit, $totalDocsInIndex) : $totalDocsInIndex;

    $docsDumped = 0;
    $this->logToStderr = true;
    $this->output("Dumping {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");

    $self = $this;
    Util::iterateOverScroll(
        $index,
        $result->getResponse()->getScrollId(),
        '15m',
        function ($results) use ($self, &$docsDumped, $totalDocsToDump) {
            foreach ($results as $result) {
                $document = array(
                    '_id' => $result->getId(),
                    '_type' => $result->getType(),
                    '_source' => $result->getSource(),
                );
                $self->write($document);
                $docsDumped++;
                $self->outputProgress($docsDumped, $totalDocsToDump);
            }
        },
        $limit,
        5
    );

    $this->output("Dump done.\n");
}
/**
 * @param integer $maxDocs the number of docs in the index
 * @param float[]|null $boostTemplates Array of key values, key is the template name, value the boost factor.
 *  Any falsy value (null or an empty array) is replaced by Util::getDefaultBoostTemplates()
 */
public function __construct($maxDocs, $boostTemplates = null) {
    $this->maxDocs = $maxDocs;

    if (!$boostTemplates) {
        $boostTemplates = Util::getDefaultBoostTemplates();
    }
    $this->boostTemplates = $boostTemplates;

    // We normalize incoming links according to the size of the index.
    // On a very small wiki the computed value drops below 1, so force the norm to 1.
    $scaledNorm = (int) ($maxDocs * self::INCOMING_LINKS_MAX_DOCS_FACTOR);
    $this->incomingLinksNorm = max(1, $scaledNorm);
}
/**
 * Read the script options and Cirrus globals into properties, then run the
 * index validation steps. With justCacheWarmers or justAllocation only that
 * single validation runs. Elastica exceptions are reported through error().
 */
public function execute() {
    global $wgPoolCounterConf,
        $wgLanguageCode,
        $wgCirrusSearchPhraseSuggestUseText,
        $wgCirrusSearchPrefixSearchStartsWithAnyWord,
        $wgCirrusSearchBannedPlugins,
        $wgCirrusSearchOptimizeIndexForExperimentalHighlighter,
        $wgCirrusSearchMaxShardsPerNode,
        $wgCirrusSearchRefreshInterval;

    // Make sure we don't flood the pool counter
    unset($wgPoolCounterConf['CirrusSearch-Search']);

    // Set the timeout for maintenance actions
    $this->setConnectionTimeout();

    $utils = new ConfigUtils($this->getConnection()->getClient(), $this);

    // Command line options
    $this->indexType = $this->getOption('indexType');
    $this->startOver = $this->getOption('startOver', false);
    $this->indexBaseName = $this->getOption('baseName', wfWikiId());
    $this->reindexAndRemoveOk = $this->getOption('reindexAndRemoveOk', false);
    $this->reindexProcesses = $this->getOption('reindexProcesses', wfIsWindows() ? 1 : 5);
    $this->reindexAcceptableCountDeviation = Util::parsePotentialPercent(
        $this->getOption('reindexAcceptableCountDeviation', '5%')
    );
    $this->reindexChunkSize = $this->getOption('reindexChunkSize', 100);
    $this->reindexRetryAttempts = $this->getOption('reindexRetryAttempts', 5);
    $this->printDebugCheckConfig = $this->getOption('debugCheckConfig', false);

    // Site configuration
    $this->langCode = $wgLanguageCode;
    $this->prefixSearchStartsWithAny = $wgCirrusSearchPrefixSearchStartsWithAnyWord;
    $this->phraseSuggestUseText = $wgCirrusSearchPhraseSuggestUseText;
    $this->bannedPlugins = $wgCirrusSearchBannedPlugins;
    $this->optimizeIndexForExperimentalHighlighter = $wgCirrusSearchOptimizeIndexForExperimentalHighlighter;
    if (isset($wgCirrusSearchMaxShardsPerNode[$this->indexType])) {
        $this->maxShardsPerNode = $wgCirrusSearchMaxShardsPerNode[$this->indexType];
    } else {
        $this->maxShardsPerNode = 'unlimited';
    }
    $this->refreshInterval = $wgCirrusSearchRefreshInterval;

    try {
        $indexTypes = $this->getConnection()->getAllIndexTypes();
        if (!in_array($this->indexType, $indexTypes)) {
            $this->error('indexType option must be one of ' . implode(', ', $indexTypes), 1);
        }

        $utils->checkElasticsearchVersion();
        $this->availablePlugins = $utils->scanAvailablePlugins($this->bannedPlugins);

        if ($this->getOption('justCacheWarmers', false)) {
            $this->validateCacheWarmers();
            return;
        }

        if ($this->getOption('justAllocation', false)) {
            $this->validateShardAllocation();
            return;
        }

        $identifierOption = $this->getOption('indexIdentifier', 'current');
        $this->indexIdentifier = $utils->pickIndexIdentifierFromOption(
            $identifierOption,
            $this->getIndexTypeName()
        );
        $this->analysisConfigBuilder = $this->pickAnalyzer($this->langCode, $this->availablePlugins);

        $this->validateIndex();
        $this->validateAnalyzers();
        $this->validateMapping();
        $this->validateCacheWarmers();
        $this->validateAlias();
        $this->updateVersions();
        $this->indexNamespaces();
    } catch (\Elastica\Exception\Connection\HttpException $e) {
        $message = $e->getMessage();
        $this->output("\nUnexpected Elasticsearch failure.\n");
        $this->error("Http error communicating with Elasticsearch: {$message}.\n", 1);
    } catch (\Elastica\Exception\ExceptionInterface $e) {
        $type = get_class($e);
        $message = ElasticsearchIntermediary::extractMessage($e);
        $trace = $e->getTraceAsString();
        $this->output("\nUnexpected Elasticsearch failure.\n");

        $report = "Elasticsearch failed in an unexpected way. This is always a bug in CirrusSearch.\n";
        $report .= "Error type: {$type}\n";
        $report .= "Message: {$message}\n";
        $report .= "Trace:\n";
        $report .= $trace;
        $this->error($report, 1);
    }
}
/**
 * Collect the expected warmers that are missing from the actual warmers or
 * that differ from them according to Util::recursiveSame().
 *
 * @param array $expectedWarmers warmers keyed by name
 * @param array $actualWarmers warmers keyed by name
 * @return array the entries of $expectedWarmers that are absent or different, keyed by name
 */
private function diff($expectedWarmers, $actualWarmers) {
    $result = array();
    foreach ($expectedWarmers as $key => $value) {
        $matches = isset($actualWarmers[$key])
            && Util::recursiveSame($value, $actualWarmers[$key]);
        if ($matches) {
            continue;
        }
        $result[$key] = $value;
    }
    return $result;
}
/**
 * merge top level multi-queries and resolve returned pageIds into Title objects.
 *
 * WARNING: experimental API
 *
 * Suggestions are deduplicated per page id (highest discounted score wins),
 * sorted by descending score and optionally truncated to $limit. Suggestions
 * without text get their text from a second request that fetches the
 * redirects of the matching documents.
 *
 * @param \Elastica\Response $response Response from elasticsearch _suggest api
 * @param array $profiles the suggestion profiles
 * @param int $limit Maximum suggestions to return, -1 for unlimited
 * @return SearchSuggestionSet a set of Suggestions
 */
protected function postProcessSuggest(\Elastica\Response $response, $profiles, $limit = -1) {
    $this->logContext['elasticTookMs'] = intval($response->getQueryTime() * 1000);
    $data = $response->getData();
    unset($data['_shards']);

    $suggestions = array();
    foreach ($data as $name => $results) {
        $discount = $profiles[$name]['discount'];
        foreach ($results as $suggested) {
            foreach ($suggested['options'] as $suggest) {
                $output = SuggestBuilder::decodeOutput($suggest['text']);
                if ($output === null) {
                    // Ignore broken output
                    continue;
                }
                $pageId = $output['id'];
                $type = $output['type'];

                $score = $discount * $suggest['score'];
                if (!isset($suggestions[$pageId]) || $score > $suggestions[$pageId]->getScore()) {
                    $suggestion = new SearchSuggestion($score, null, null, $pageId);
                    // If it's a title suggestion we have the text
                    if ($type === SuggestBuilder::TITLE_SUGGESTION) {
                        $suggestion->setText($output['text']);
                    }
                    $suggestions[$pageId] = $suggestion;
                }
            }
        }
    }

    // simply sort by existing scores, highest first. The comparator must
    // return an integer: a float difference would be truncated, making
    // scores that differ by less than 1 compare as equal.
    uasort($suggestions, function ($a, $b) {
        $scoreA = $a->getScore();
        $scoreB = $b->getScore();
        if ($scoreA == $scoreB) {
            return 0;
        }
        return $scoreA < $scoreB ? 1 : -1;
    });

    $this->logContext['hitsTotal'] = count($suggestions);

    if ($limit > 0) {
        $suggestions = array_slice($suggestions, 0, $limit, true);
    }

    $this->logContext['hitsReturned'] = count($suggestions);
    $this->logContext['hitsOffset'] = 0;

    // we must fetch redirect data for redirect suggestions
    $missingText = array();
    foreach ($suggestions as $id => $suggestion) {
        if ($suggestion->getText() === null) {
            $missingText[] = $id;
        }
    }

    if (!empty($missingText)) {
        // Experimental.
        //
        // Second pass query to fetch redirects.
        // It's not clear if it's the best option, this will slowdown the whole query
        // when we hit a redirect suggestion.
        // Other option would be to encode redirects as a payload resulting in a
        // very big index...

        // XXX: we support only the content index
        $type = $this->connection->getPageType($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
        // NOTE: we are already in a poolCounterWork
        // Multi get is not supported by elastica
        $redirResponse = null;
        try {
            $redirResponse = $type->request(
                '_mget',
                'GET',
                array('ids' => $missingText),
                array('_source_include' => 'redirect')
            );
            if ($redirResponse->isOk()) {
                $this->logContext['elasticTook2PassMs'] = intval($redirResponse->getQueryTime() * 1000);
                $docs = $redirResponse->getData();
                foreach ($docs['docs'] as $doc) {
                    if (empty($doc['_source']['redirect'])) {
                        continue;
                    }
                    // We use the original query, we should maybe use the variant that generated this result?
                    $text = Util::chooseBestRedirect($this->term, $doc['_source']['redirect']);
                    if (!empty($suggestions[$doc['_id']])) {
                        $suggestions[$doc['_id']]->setText($text);
                    }
                }
            } else {
                LoggerFactory::getInstance('CirrusSearch')->warning(
                    'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
                    array(
                        'query' => $this->term,
                        'ids' => serialize($missingText),
                        'error' => $redirResponse->getError(),
                    )
                );
            }
        } catch (\Elastica\Exception\ExceptionInterface $e) {
            LoggerFactory::getInstance('CirrusSearch')->warning(
                'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
                array(
                    'query' => $this->term,
                    'ids' => serialize($missingText),
                    'error' => $this->extractMessage($e),
                )
            );
        }
    }

    return new SearchSuggestionSet(array_filter($suggestions, function ($suggestion) {
        // text should be not empty for suggestions
        return $suggestion->getText() != null;
    }));
}
/**
 * Util::chooseBestRedirect() must return the expected redirect title for a
 * series of user queries against a fixed list of redirects.
 */
public function testChooseBestRedirect() {
    // Turn a list of titles into redirect entries in namespace 0.
    $toRedirects = function ($titles) {
        return array_map(function ($title) {
            return array('title' => $title, 'namespace' => 0);
        }, $titles);
    };

    $input = $toRedirects(array(
        'Al. Einstein',
        'Albert Einstein',
        'A. Einstein',
        'Einstein, Albert',
    ));

    // Each entry is: expected title, user query.
    $cases = array(
        array('Al. Einstein', 'a'),
        array('Al. Einstein', 'al'),
        array('Albert Einstein', 'albet'),
        array('Einstein, Albert', 'Einstein'),
        array('Einstein, Albert', 'Ens'),
    );

    foreach ($cases as $case) {
        $this->assertEquals($case[0], Util::chooseBestRedirect($case[1], $input));
    }
}
/**
 * Scroll over the main-namespace pages of the content index, build suggestion
 * documents for every hit and add them to the title suggest type.
 */
private function indexData() {
    // Exclude content fields to save bandwidth
    $excludedFields = array('text', 'source_text', 'opening_text', 'auxiliary_text');
    $pageFilter = new Elastica\Filter\BoolAnd(array(
        new Elastica\Filter\Type(Connection::PAGE_TYPE_NAME),
        new Elastica\Filter\Term(array("namespace" => NS_MAIN)),
    ));

    $query = new Query();
    $query->setFields(array('_id', '_type', '_source'));
    $query->setSource(array('exclude' => $excludedFields));
    $query->setQuery(new Elastica\Query\Filtered(new Elastica\Query\MatchAll(), $pageFilter));

    $scrollOptions = array(
        'search_type' => 'scan',
        'scroll' => "15m",
        'size' => $this->indexChunkSize,
    );

    // TODO: only content index for now ( we'll have to check how it works with commons )
    $sourceIndex = $this->getConnection()->getIndex($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
    $result = $sourceIndex->search($query, $scrollOptions);

    $responseData = $result->getResponse()->getData();
    $totalDocsInIndex = $responseData['hits']['total'];
    $totalDocsToDump = $totalDocsInIndex;

    $scoreMethodName = $this->getOption('scoringMethod', 'quality');
    $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod($scoreMethodName, $totalDocsInIndex);
    $builder = new SuggestBuilder($this->scoreMethod, $this->withGeo);

    $docsDumped = 0;
    $this->output("Indexing {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");

    $self = $this;
    $destinationType = $this->getIndex()->getType(Connection::TITLE_SUGGEST_TYPE_NAME);
    $retryAttempts = $this->indexRetryAttempts;

    // Handles one scroll batch: build the suggestions, report progress, then index them.
    $processBatch = function ($results) use ($self, &$docsDumped, $totalDocsToDump, $builder, $destinationType, $retryAttempts) {
        $suggestDocs = array();
        foreach ($results as $result) {
            $docsDumped++;
            $suggests = $builder->build($result->getId(), $result->getSource());
            foreach ($suggests as $suggest) {
                $suggestDocs[] = new \Elastica\Document(null, $suggest);
            }
        }
        $self->outputProgress($docsDumped, $totalDocsToDump);
        Util::withRetry($retryAttempts, function () use ($destinationType, $suggestDocs) {
            $destinationType->addDocuments($suggestDocs);
        });
    };

    Util::iterateOverScroll(
        $sourceIndex,
        $result->getResponse()->getScrollId(),
        '15m',
        $processBatch,
        0,
        $retryAttempts
    );

    $this->output("Indexing done.\n");
}
/**
 * Initialise this script's properties from the command line options and the
 * $wgFlowSearch* globals.
 */
protected function setProperties() {
    global $wgLanguageCode,
        $wgFlowSearchBannedPlugins,
        $wgFlowSearchOptimizeIndexForExperimentalHighlighter,
        $wgFlowSearchIndexAllocation,
        $wgFlowSearchMaintenanceTimeout,
        $wgFlowSearchRefreshInterval,
        $wgFlowSearchMaxShardsPerNode,
        $wgFlowSearchCacheWarmers;

    $this->connection = Connection::getSingleton();
    $this->utils = new ConfigUtils($this->getClient(), $this);

    // only 1 index for Flow
    $this->indexType = 'flow';

    // Command line options
    $this->startOver = $this->getOption('startOver', false);
    $this->indexBaseName = $this->getOption('baseName', wfWikiId());
    $this->reindexAndRemoveOk = $this->getOption('reindexAndRemoveOk', false);
    $defaultProcesses = wfIsWindows() ? 1 : 5;
    $this->reindexProcesses = $this->getOption('reindexProcesses', $defaultProcesses);
    $this->reindexChunkSize = $this->getOption('reindexChunkSize', 100);
    $this->reindexRetryAttempts = $this->getOption('reindexRetryAttempts', 5);
    $this->printDebugCheckConfig = $this->getOption('debugCheckConfig', false);

    // Site configuration
    $this->langCode = $wgLanguageCode;
    $this->bannedPlugins = $wgFlowSearchBannedPlugins;
    $this->optimizeIndexForExperimentalHighlighter = $wgFlowSearchOptimizeIndexForExperimentalHighlighter;
    $this->indexAllocation = $wgFlowSearchIndexAllocation;
    $this->maintenanceTimeout = $wgFlowSearchMaintenanceTimeout;
    $this->refreshInterval = $wgFlowSearchRefreshInterval;

    if (isset($wgFlowSearchMaxShardsPerNode[$this->indexType])) {
        $this->maxShardsPerNode = $wgFlowSearchMaxShardsPerNode[$this->indexType];
    } else {
        $this->maxShardsPerNode = 'unlimited';
    }

    if (isset($wgFlowSearchCacheWarmers[$this->indexType])) {
        $this->cacheWarmers = $wgFlowSearchCacheWarmers[$this->indexType];
    } else {
        $this->cacheWarmers = array();
    }

    // Values derived from the options and helpers set above
    $identifierOption = $this->getOption('indexIdentifier', 'current');
    $this->indexIdentifier = $this->utils->pickIndexIdentifierFromOption(
        $identifierOption,
        $this->getIndexTypeName()
    );
    $this->reindexAcceptableCountDeviation = Util::parsePotentialPercent(
        $this->getOption('reindexAcceptableCountDeviation', '5%')
    );
    $this->availablePlugins = $this->utils->scanAvailablePlugins($this->bannedPlugins);
    $this->analysisConfigBuilder = $this->pickAnalyzer($this->langCode, $this->availablePlugins);
    $this->tooFewReplicas = $this->reindexAndRemoveOk
        && ($this->startOver || !$this->getIndex()->exists());
}
/**
 * Return the boost templates configured in the cirrussearch-boost-templates
 * message. The message is read once; the parsed value is kept in a static
 * local and reused by later calls.
 *
 * @return float[]
 */
public static function getDefaultBoostTemplates() {
    static $defaultBoostTemplates = null;

    if ($defaultBoostTemplates !== null) {
        return $defaultBoostTemplates;
    }

    $source = wfMessage('cirrussearch-boost-templates')->inContentLanguage();
    $defaultBoostTemplates = array();
    if ($source->isDisabled()) {
        return $defaultBoostTemplates;
    }

    $lines = Util::parseSettingsInMessage($source->plain());
    // Now parse the templates
    $defaultBoostTemplates = self::parseBoostTemplates(implode(' ', $lines));

    return $defaultBoostTemplates;
}
/**
 * {@inheritDoc}
 *
 * Fields are given as "name^boost" strings. Fields whose name ends with
 * ".plain" feed the common terms clause; all other fields feed the stem
 * clause.
 */
public function buildMainQuery(array $fields, $queryString, $phraseSlop) {
    // Separate plain and stem fields first
    $plainFields = array();
    $stemFields = array();
    foreach ($fields as $f) {
        list($field, $boost) = explode('^', $f, 2);
        $fieldInfo = array('field' => $field, 'boost' => $boost);
        $isPlain = Util::endsWith($field, '.plain');
        if ($isPlain) {
            $plainFields[] = $fieldInfo;
            continue;
        }
        $stemFields[] = $fieldInfo;
    }

    $query = new \Elastica\Query\Bool();
    $query->setMinimumNumberShouldMatch(1);

    // We always build a common terms query for the plain field
    $this->attachCommonTermsClause($query, $plainFields, $queryString, $this->profile);

    // We can use different types of query for the stem field.
    if (count($stemFields) !== 1) {
        $this->attachMultiFieldsStemClause($query, $stemFields, $queryString);
    } else {
        $this->attachSingleFieldStemClause($query, $stemFields[0], $queryString);
    }

    return $query;
}
/**
 * Build one Elastica document per existing page. Pages that do not exist are
 * logged and skipped.
 *
 * @param \WikiPage[] $pages
 * @param int $flags bitfield of self::INDEX_ON_SKIP, self::SKIP_PARSE,
 *  self::SKIP_LINKS and self::FORCE_PARSE
 * @return \Elastica\Document[]
 */
private function buildDocumentsForPages($pages, $flags) {
    global $wgCirrusSearchUpdateConflictRetryCount;

    $indexOnSkip = $flags & self::INDEX_ON_SKIP;
    $skipParse = $flags & self::SKIP_PARSE;
    $skipLinks = $flags & self::SKIP_LINKS;
    $forceParse = $flags & self::FORCE_PARSE;
    // A document is "full" only when neither parsing nor links are skipped.
    $fullDocument = !$skipParse && !$skipLinks;

    $documents = array();
    foreach ($pages as $page) {
        $title = $page->getTitle();
        if (!$page->exists()) {
            LoggerFactory::getInstance('CirrusSearch')->warning(
                'Attempted to build a document for a page that doesn\'t exist. This should be caught ' .
                "earlier but wasn't. Page: {title}",
                array('title' => $title)
            );
            continue;
        }

        $baseFields = array(
            'version' => $page->getLatest(),
            'version_type' => 'external',
            'namespace' => $title->getNamespace(),
            'namespace_text' => Util::getNamespaceText($title),
            'title' => $title->getText(),
            'timestamp' => wfTimestamp(TS_ISO_8601, $page->getTimestamp()),
        );
        $doc = new \Elastica\Document($page->getId(), $baseFields);

        // Everything as sent as an update to prevent overwriting fields maintained in other processes like
        // OtherIndex::updateOtherIndex.
        // But we need a way to index documents that don't already exist. We're willing to upsert any full
        // documents or any documents that we've been explicitly told it is ok to index when they aren't full.
        // This is typically just done during the first phase of the initial index build.
        // A quick note about docAsUpsert's merging behavior: It overwrites all fields provided by doc unless they
        // are objects in both doc and the indexed source. We're ok with this because all of our fields are either
        // regular types or lists of objects and lists are overwritten.
        $doc->setDocAsUpsert($fullDocument || $indexOnSkip);
        $doc->setRetryOnConflict($wgCirrusSearchUpdateConflictRetryCount);

        if (!$skipParse) {
            // Get text to index, based on content and parser output
            list($content, $parserOutput) = $this->getContentAndParserOutput($page, $forceParse);

            // Build our page data
            $pageBuilder = new PageDataBuilder($doc, $title, $content, $parserOutput);
            $doc = $pageBuilder->build();

            // And build the page text itself
            $textBuilder = new PageTextBuilder($doc, $content, $parserOutput);
            $doc = $textBuilder->build();

            // If we're a file, build its metadata too
            if ($title->getNamespace() === NS_FILE) {
                $fileBuilder = new FileDataBuilder($doc, $title);
                $doc = $fileBuilder->build();
            }

            // Then let hooks have a go
            MWHooks::run(
                'CirrusSearchBuildDocumentParse',
                array($doc, $title, $content, $parserOutput, $this->connection)
            );
        }

        if (!$skipLinks) {
            MWHooks::run('CirrusSearchBuildDocumentLinks', array($doc, $title, $this->connection));
        }

        $documents[] = $doc;
    }

    MWHooks::run('CirrusSearchBuildDocumentFinishBatch', array($pages));

    return $documents;
}
/**
 * @param SearchContext $context
 * @param float $weight
 */
public function __construct(SearchContext $context, $weight) {
    parent::__construct($context, $weight);

    // Use the boosted template from query string if available.
    // empty array may be returned here in the case of a syntax error
    // @todo: verify that this is what we want: in case of a syntax error
    // we disable default boost templates.
    $fromQuery = $context->getBoostTemplatesFromQuery();
    $this->boostTemplates = $fromQuery;
    if ($fromQuery !== null) {
        return;
    }

    // Fall back to the defaults otherwise
    $this->boostTemplates = Util::getDefaultBoostTemplates();
}