/**
 * Feed revisions to every registered search index updater, in batches of
 * $this->mBatchSize.
 *
 * Options read (all optional):
 *  - fromId:    UUID the first batch starts at; advanced after every batch
 *  - toId:      UUID forwarded to the updater's buildQueryConditions()
 *  - namespace: forwarded to the updater's buildQueryConditions()
 *  - limit:     maximum number of revisions to fetch, applied per updater
 */
public function execute() {
    global $wgFlowSearchMaintenanceTimeout;

    // Set the timeout for maintenance actions
    Connection::getSingleton()->setTimeout2($wgFlowSearchMaintenanceTimeout);

    /** @var Updater[] $updaters */
    $updaters = Container::get('searchindex.updaters');
    foreach ($updaters as $updaterType => $updater) {
        // options are re-read for every updater because $fromId and
        // $numRevisionsToIndex are modified while batching
        $fromId = $this->getOption('fromId', null);
        $fromId = $fromId ? UUID::create($fromId) : null;
        $toId = $this->getOption('toId', null);
        $toId = $toId ? UUID::create($toId) : null;
        $namespace = $this->getOption('namespace', null);
        $numRevisionsToIndex = $this->getOption('limit', null);

        // Decide once whether a limit applies. Testing the running counter
        // for truthiness on every batch would mistake "exactly 0 left"
        // (limit is a multiple of the batch size) for "no limit" and keep
        // on indexing past the requested amount.
        $hasLimit = (bool)$numRevisionsToIndex;

        $total = 0;
        while (true) {
            // if a limit was provided, we should make sure to not fetch
            // more revisions than asked for
            $options = array('LIMIT' => $this->mBatchSize);
            if ($hasLimit) {
                // the requested amount has been fetched
                if ($numRevisionsToIndex <= 0) {
                    break;
                }

                $options['LIMIT'] = min($numRevisionsToIndex, $this->mBatchSize);

                // since we do this in batches, we'll subtract the size of
                // each batch until $numRevisionsToIndex is reached
                $numRevisionsToIndex -= $this->mBatchSize;
            }

            $conditions = $updater->buildQueryConditions($fromId, $toId, $namespace);
            $revisions = $updater->getRevisions($conditions, $options);

            // stop if we're all out of revisions
            if (!$revisions) {
                break;
            }

            $total += $updater->updateRevisions($revisions, null, null);
            $this->output("Indexed {$total} {$updaterType} document(s)\n");

            // prepare for next batch, starting at the next id
            // prevFromId will default to around unix epoch - there can be
            // no data before that
            $prevFromId = $fromId ?: UUID::getComparisonUUID('1');
            $fromId = $this->getNextFromId($revisions);

            // make sure we don't get stuck in an infinite loop
            $diff = $prevFromId->getTimestampObj()->diff($fromId->getTimestampObj());
            // invert will be 1 if the diff from $prevFromId to $fromId is a
            // negative time period, which means the next batch would start
            // *before* the batch we just processed
            if ($diff->invert) {
                $this->error(
                    'Got stuck in an infinite loop.' . "\n" .
                    'workflow_last_update_timestamp is likely incorrect ' . 'for some workflows.' . "\n" .
                    'Run maintenance/FlowFixWorkflowLastUpdateTimestamp.php ' . 'to automatically fix those.',
                    1
                );
            }

            // prevent memory from being filled up
            Container::get('storage')->clear();
        }
    }
}
/**
 * We want to retrieve the total amount of search word hits
 * (static::termsAggregation) but our search terms may not be how
 * ElasticSearch stores the words in its index.
 * Elastic will "analyze" text (perform stemming, etc) and store
 * the terms in a normalized way.
 * AFAICT, there is not really a way to get to that information
 * from within a search query.
 *
 * Luckily, since 1.0, Elastic supports _termvector, which gives
 * you statistics about the terms in your document.
 * Since 1.4, Elastic supports feeding _termvector documents to
 * analyze.
 * We're going to (ab)use this by letting it respond with term
 * information on a bogus document that contains only our current
 * search terms.
 * So we'll give it a document with just our keywords for the
 * column that we're searching in (revisions.text) and Elastic will
 * use that column's configuration to analyze the text we feed it.
 * It will then respond with the normalized terms & their stats.
 *
 * @param string $terms Whitespace-separated search terms
 * @return array Normalized terms; empty when the response holds no
 *  term vector for revisions.text
 */
protected function getTerms($terms) {
    $terms = preg_split('/\\s+/', $terms);

    // _termvectors only works on a type, but our types are
    // configured exactly the same so it doesn't matter which
    $types = Connection::getAllTypes();
    $searchable = Connection::getFlowIndex($this->indexBaseName);
    $searchable = $searchable->getType(array_pop($types));

    $query = array(
        'doc' => array('revisions' => array('text' => $terms)),
        "fields" => array("revisions.text"),
    );

    // Elastica has no abstraction over _termvector like it has
    // for _query, so just do the request ourselves
    $response = $searchable->request('_termvector', Request::POST, $query, array());
    $data = $response->getData();

    // Don't assume the term vector is there: without this check a
    // response lacking it would hand null to array_keys() instead of
    // yielding the documented array.
    if (!isset($data['term_vectors']['revisions.text']['terms'])) {
        return array();
    }

    return array_keys($data['term_vectors']['revisions.text']['terms']);
}
/**
 * Send a batch of documents through a bulk request to the type named by
 * getTypeName(), inside the Flow index obtained for wfWikiId().
 *
 * Exceptions raised while building or sending the request do not
 * propagate: they are reported through wfWarn(), together with the ids
 * of the documents in the batch.
 *
 * @param \Elastica\Document[] $documents
 * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...)
 */
protected function sendDocuments(array $documents, $shardTimeout = null) {
    // nothing to send
    if (!$documents) {
        return;
    }

    try {
        // Bulk + addDocuments (notice plural) is the bulk api
        $client = Connection::getSingleton()->getClient2();
        $request = new \Elastica\Bulk($client);
        if ($shardTimeout !== null) {
            $request->setShardTimeout($shardTimeout);
        }

        $flowIndex = Connection::getFlowIndex(wfWikiId());
        $request->setType($flowIndex->getType($this->getTypeName()));
        $request->addDocuments($documents);
        $request->send();
    } catch (\Exception $exception) {
        // collect the ids of everything in this batch for the warning
        $failedIds = array();
        foreach ($documents as $document) {
            $failedIds[] = $document->getId();
        }

        wfWarn(__METHOD__ . ': Failed updating documents (' . implode(',', $failedIds) . '): ' . $exception->getMessage());
    }
}
/**
 * Set the type to search in.
 * false is allowed (means we'll search *all* types)
 *
 * @param string|false $type One of Connection::getAllTypes(), or false
 * @throws InvalidInputException When $type matches neither a known type
 *  nor false
 */
public function setType($type) {
    $allowedTypes = array_merge(Connection::getAllTypes(), array(false));

    // NOTE(review): in_array() is not called in strict mode, so values
    // loosely equal to false (null, '', 0) pass as well. Left untouched
    // since callers outside this view may rely on it - confirm before
    // tightening.
    if (!in_array($type, $allowedTypes)) {
        // it is the *type* that was rejected, not the sort
        throw new InvalidInputException('Invalid search type requested', 'invalid-input');
    }

    $this->type = $type;
}
/**
 * Fetch the list of indices from Connection::getAllIndices().
 *
 * @return mixed Whatever Connection::getAllIndices() returns
 */
protected function getAllIndices() {
    $indices = Connection::getAllIndices();

    return $indices;
}
/**
 * Provide a human-readable description for every parameter name.
 *
 * @return string[] Description text, keyed by parameter name
 */
public function getParamDescription() {
    // prefix used when referring to sibling parameters in the texts below
    $p = $this->getModulePrefix();
    $knownTypes = implode('|', Connection::getAllTypes());

    $descriptions = array();
    $descriptions['term'] = 'Search term';
    $descriptions['title'] = "Title of the boards to search in. Cannot be used together with {$p}pageid";
    $descriptions['pageid'] = "ID of the boards to search in. Cannot be used together with {$p}title";
    $descriptions['namespaces'] = 'Namespaces to search in';
    $descriptions['moderationState'] = 'Search for revisions in (a) particular moderation state(s)';
    $descriptions['sort'] = 'What to order the search results by';
    $descriptions['type'] = 'Desired type of results (' . $knownTypes . ')';
    $descriptions['offset'] = 'Offset value to start fetching results at';
    $descriptions['limit'] = 'Amount of results to fetch';

    return $descriptions;
}