/**
 * Build suggestion documents from the content index and bulk-index them
 * into the title suggest type.
 *
 * Opens a scan/scroll search over the page documents of the main namespace
 * (with the large text fields excluded from _source), passes every hit to a
 * SuggestBuilder and sends the documents it returns to the destination type,
 * one bulk request per scroll batch. Progress is reported after each batch.
 */
private function indexData() {
    $query = new Query();
    $query->setFields(array('_id', '_type', '_source'));
    // Exclude content fields to save bandwidth
    $query->setSource(array('exclude' => array('text', 'source_text', 'opening_text', 'auxiliary_text')));
    $query->setQuery(new Elastica\Query\Filtered(
        new Elastica\Query\MatchAll(),
        new Elastica\Filter\BoolAnd(array(
            new Elastica\Filter\Type(Connection::PAGE_TYPE_NAME),
            new Elastica\Filter\Term(array("namespace" => NS_MAIN))
        ))
    ));

    $scrollOptions = array('search_type' => 'scan', 'scroll' => "15m", 'size' => $this->indexChunkSize);

    // TODO: only content index for now ( we'll have to check how it works with commons )
    $sourceIndex = $this->getConnection()->getIndex($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
    $result = $sourceIndex->search($query, $scrollOptions);

    // The initial response carries the total hit count used for scoring and progress.
    $totalDocsInIndex = $result->getResponse()->getData();
    $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
    $totalDocsToDump = $totalDocsInIndex;

    $scoreMethodName = $this->getOption('scoringMethod', 'quality');
    $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod($scoreMethodName, $totalDocsInIndex);
    $builder = new SuggestBuilder($this->scoreMethod, $this->withGeo);

    $docsDumped = 0;
    $this->output("Indexing {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");

    // The closure below reaches the maintenance object through $self rather than $this.
    $self = $this;
    $destinationType = $this->getIndex()->getType(Connection::TITLE_SUGGEST_TYPE_NAME);
    $retryAttempts = $this->indexRetryAttempts;

    Util::iterateOverScroll(
        $sourceIndex,
        $result->getResponse()->getScrollId(),
        '15m',
        function ($results) use ($self, &$docsDumped, $totalDocsToDump, $builder, $destinationType, $retryAttempts) {
            $suggestDocs = array();
            foreach ($results as $result) {
                $docsDumped++;
                $suggests = $builder->build($result->getId(), $result->getSource());
                foreach ($suggests as $suggest) {
                    // null id: the destination index assigns the document id.
                    $suggestDocs[] = new \Elastica\Document(null, $suggest);
                }
            }
            $self->outputProgress($docsDumped, $totalDocsToDump);

            // Nothing was built for this batch: skip the bulk request instead
            // of submitting an empty document list (Elastica rejects an empty
            // array in addDocuments(), which would fail every retry).
            if (empty($suggestDocs)) {
                return;
            }

            Util::withRetry($retryAttempts, function () use ($destinationType, $suggestDocs) {
                $destinationType->addDocuments($suggestDocs);
            });
        },
        0,
        $retryAttempts
    );

    $this->output("Indexing done.\n");
}
/**
 * Dump the documents of an index through $this->write(), one per document.
 *
 * Options read: indexType (must be one of the connection's index types),
 * baseName (defaults to wfWikiId()), indexIdentifier, filter (a query
 * string used as a filter), limit (maximum number of documents, 0 for no
 * limit) and sourceFields (comma separated list of _source fields to
 * include). Documents are fetched with a scan/scroll search.
 */
public function execute() {
    global $wgPoolCounterConf;

    // Make sure we don't flood the pool counter
    unset($wgPoolCounterConf['CirrusSearch-Search']);

    // Set the timeout for maintenance actions
    $this->setConnectionTimeout();

    $this->indexType = $this->getOption('indexType');
    $this->indexBaseName = $this->getOption('baseName', wfWikiId());

    $indexTypes = $this->getConnection()->getAllIndexTypes();
    if (!in_array($this->indexType, $indexTypes)) {
        $this->error('indexType option must be one of ' . implode(', ', $indexTypes), 1);
    }

    // NOTE(review): $utils is never used below; confirm the ConfigUtils
    // constructor has no required side effect before removing it.
    $utils = new ConfigUtils($this->getConnection()->getClient(), $this);
    $this->indexIdentifier = $this->getOption('indexIdentifier');

    $filter = null;
    if ($this->hasOption('filter')) {
        $filter = new Elastica\Filter\Query(new Elastica\Query\QueryString($this->getOption('filter')));
    }

    $limit = (int) $this->getOption('limit', 0);

    $query = new Query();
    $query->setFields(array('_id', '_type', '_source'));
    if ($this->hasOption('sourceFields')) {
        $sourceFields = explode(',', $this->getOption('sourceFields'));
        $query->setSource(array('include' => $sourceFields));
    }
    if ($filter) {
        $query->setQuery(new \Elastica\Query\Filtered(new \Elastica\Query\MatchAll(), $filter));
    }

    $scrollOptions = array('search_type' => 'scan', 'scroll' => "15m", 'size' => $this->inputChunkSize);
    $index = $this->getIndex();
    $result = $index->search($query, $scrollOptions);

    // The initial response carries the total number of matching documents.
    $totalDocsInIndex = $result->getResponse()->getData();
    $totalDocsInIndex = $totalDocsInIndex['hits']['total'];

    // A limit larger than the number of matching documents can never be
    // reached, so clamp it: otherwise the announced total and the progress
    // denominator would overstate what is actually dumped.
    $totalDocsToDump = $limit > 0 ? min($limit, $totalDocsInIndex) : $totalDocsInIndex;

    $docsDumped = 0;
    // NOTE(review): presumably sends status messages to stderr so they do
    // not mix with the dumped documents; confirm against output().
    $this->logToStderr = true;
    $this->output("Dumping {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");

    // The closure below reaches the maintenance object through $self rather than $this.
    $self = $this;
    Util::iterateOverScroll(
        $index,
        $result->getResponse()->getScrollId(),
        '15m',
        function ($results) use ($self, &$docsDumped, $totalDocsToDump) {
            foreach ($results as $result) {
                $document = array(
                    '_id' => $result->getId(),
                    '_type' => $result->getType(),
                    '_source' => $result->getSource()
                );
                $self->write($document);
                $docsDumped++;
                $self->outputProgress($docsDumped, $totalDocsToDump);
            }
        },
        $limit,
        5
    );

    $this->output("Dump done.\n");
}
/**
 * Run a paginated search against the versioned index and type.
 *
 * The query is given a from/size window derived from the page number and
 * page size, and, when the selected view has a non-empty field list, its
 * _source is restricted to those fields.
 *
 * Example usage with the Elastica query builder:
 *
 *     $qb = new QueryBuilder();
 *     $query = $qb->query()->match_all();
 *     $mainQuery = new \Elastica\Query($query);
 *     $this->doSearch($mainQuery, 1, 10);
 *
 * @param ElasticaQuery $query        Query to execute; it is modified in place.
 * @param int           $page         1-based page number.
 * @param int           $itemsPerPage Number of hits per page.
 * @param string        $type         One of self::VIEW_LIST, self::VIEW_SHORT
 *                                    or self::VIEW_DETAIL; any other value
 *                                    leaves the _source selection untouched.
 * @param int           $version      Search version for index and type.
 * @return Resultset
 */
protected function doSearch(ElasticaQuery $query, $page, $itemsPerPage, $type = self::VIEW_LIST, $version = 1)
{
    // Pagination window: skip the hits of all preceding pages.
    $offset = ($page - 1) * $itemsPerPage;
    $query->setFrom($offset)->setSize($itemsPerPage);

    // Resolve the field list configured for the requested view, if any.
    $viewFields = null;
    if ($type === self::VIEW_LIST) {
        $viewFields = $this->getListViewFields();
    } elseif ($type === self::VIEW_SHORT) {
        $viewFields = $this->getShortViewFields();
    } elseif ($type === self::VIEW_DETAIL) {
        $viewFields = $this->getDetailViewFields();
    }

    // Only narrow _source when the view actually defines fields.
    if ($viewFields) {
        $query->setSource($viewFields);
    }

    $index = $this->getClient()->getIndex($this->getIndexName($version));
    $searchType = $index->getType($this->getTypeName($version));

    return $searchType->search($query);
}