/**
  * Indexes two documents and asserts that a more-like-this query over the
  * 'email' and 'content' fields returns both of them.
  *
  * @group functional
  */
 public function testSearch()
 {
     $index = new Index($this->_getClient(), 'test');
     $index->create(array(), true);
     $index->getSettings()->setNumberOfReplicas(0);

     // Both fields share the same definition; _source is disabled on the mapping.
     $fieldDefinition = array('store' => 'yes', 'type' => 'string', 'index' => 'analyzed');
     $type = new Type($index, 'helloworldmlt');
     $mapping = new Mapping($type, array('email' => $fieldDefinition, 'content' => $fieldDefinition));
     $mapping->setSource(array('enabled' => false));
     $type->setMapping($mapping);

     // Each fixture is sent with its own addDocument() call, keyed by document id.
     $fixtures = array(
         1000 => array('email' => '*****@*****.**', 'content' => 'This is a sample post. Hello World Fuzzy Like This!'),
         1001 => array('email' => '*****@*****.**', 'content' => 'This is a fake nospam email address for gmail'),
     );
     foreach ($fixtures as $id => $fields) {
         $type->addDocument(new Document($id, $fields));
     }

     // Refresh the index before searching.
     $index->refresh();

     $searchFields = array('email', 'content');
     $moreLikeThis = new MoreLikeThis();
     $moreLikeThis->setLikeText('fake gmail sample');
     $moreLikeThis->setFields($searchFields);
     $moreLikeThis->setMaxQueryTerms(1);
     $moreLikeThis->setMinDocFrequency(1);
     $moreLikeThis->setMinTermFrequency(1);

     $search = new Query();
     $search->setFields($searchFields);
     $search->setQuery($moreLikeThis);

     $hits = $type->search($search);
     $hits->getResponse()->getData();
     $this->assertEquals(2, $hits->count());
 }
 /**
  * {@inheritdoc}
  */
 public function getCount(array $data, array $keys, $namespace)
 {
     if ($this->hasNamespace($namespace)) {
         // Match on the hash computed from the data and keys, applied as a post filter.
         $hashTerm = new Term();
         $hashTerm->setTerm('hash', $this->getHash($data, $keys));

         // Only the 'hash' and 'keys' fields are requested back.
         $search = new Query();
         $search->setFields(array('hash', 'keys'));
         $search->setPostFilter($hashTerm);

         $resultSet = $this->index->getType($namespace)->search($search);

         return $resultSet->getTotalHits();
     }

     // Namespaces this store does not have count as zero.
     return 0;
 }
 /**
  * Stores the index, query and logger on the instance.
  *
  * The query object that is passed in is modified in place: its returned
  * fields are limited to '_id' and '_type', and when the current reading
  * mode equals \Versioned::DEFAULT_MODE a post filter requiring the
  * published field to be 'true' is set on it.
  *
  * @param Index $index
  * @param Query $query
  * @param LoggerInterface|null $logger
  */
 public function __construct(Index $index, Query $query, LoggerInterface $logger = null)
 {
     $this->index = $index;
     $this->logger = $logger;

     // Only the ids and types are needed back from the search.
     $query->setFields(['_id', '_type']);

     // In the default reading mode, restrict the results to published documents.
     $inDefaultMode = \Versioned::get_reading_mode() == \Versioned::DEFAULT_MODE;
     if ($inDefaultMode) {
         $publishedOnly = new Query\BoolQuery();
         $publishedOnly->addMust(new Query\Term([Searchable::$published_field => 'true']));
         $query->setPostFilter($publishedOnly);
     }

     $this->query = $query;
 }
Example #4
0
 /**
  * Checks that the fields given to setFields() are exposed, and only
  * those, under the 'fields' key of the array returned by toArray().
  *
  * @group unit
  */
 public function testSetFields()
 {
     $query = new Query();
     $query->setFields(array('firstname', 'lastname'));
     $data = $query->toArray();
     $this->assertContains('firstname', $data['fields']);
     $this->assertContains('lastname', $data['fields']);
     $this->assertCount(2, $data['fields']);
 }
 /**
  * Copies documents from $oldType into $type using a scan/scroll search.
  *
  * When both $children and $childNumber are 1 no filter is applied;
  * otherwise the search is restricted with an IdHashMod filter built from
  * $children and $childNumber. If an ExceptionInterface is thrown, a
  * warning is logged and the process exits with status 1.
  *
  * @param Type $type type that receives the rebuilt documents
  * @param Type $oldType type that is searched for the documents to copy
  * @param int $children handed to IdHashMod; presumably the total number of child processes
  * @param int $childNumber handed to IdHashMod; presumably the number of this process
  * @param int $chunkSize 'size' option of the scan search
  * @param int $retryAttempts retry count given to withRetry() and Util::iterateOverScroll()
  */
 private function reindexInternal(Type $type, Type $oldType, $children, $childNumber, $chunkSize, $retryAttempts)
 {
     $filter = null;
     $messagePrefix = "";
     if ($childNumber === 1 && $children === 1) {
         $this->outputIndented("\t\tStarting single process reindex\n");
     } else {
         if ($childNumber >= $children) {
             $this->error("Invalid parameters - childNumber >= children ({$childNumber} >= {$children}) ", 1);
         }
         $messagePrefix = "\t\t[{$childNumber}] ";
         $this->outputIndented($messagePrefix . "Starting child process reindex\n");
         // Note that it is not ok to abs(_uid.hashCode) because hashCode(Integer.MIN_VALUE) == Integer.MIN_VALUE
         $filter = new \CirrusSearch\Extra\Filter\IdHashMod($children, $childNumber);
     }
     $properties = $this->mappingConfig[$oldType->getName()]['properties'];
     try {
         $query = new Query();
         $query->setFields(array('_id', '_source'));
         if ($filter) {
             $query->setQuery(new \Elastica\Query\Filtered(new \Elastica\Query\MatchAll(), $filter));
         }
         // Note here we dump from the current index (using the alias) so we can use Connection::getPageType
         $result = $oldType->search($query, array('search_type' => 'scan', 'scroll' => '1h', 'size' => $chunkSize));
         $totalDocsToReindex = $result->getResponse()->getData();
         $totalDocsToReindex = $totalDocsToReindex['hits']['total'];
         $this->outputIndented($messagePrefix . "About to reindex {$totalDocsToReindex} documents\n");
         $operationStartTime = microtime(true);
         $completed = 0;
         // The closures below reach the instance through $self rather than $this.
         $self = $this;
         Util::iterateOverScroll($this->index, $result->getResponse()->getScrollId(), '1h', function ($results) use($properties, $retryAttempts, $messagePrefix, $self, $type, &$completed, $totalDocsToReindex, $operationStartTime) {
             $documents = array();
             foreach ($results as $result) {
                 $documents[] = $self->buildNewDocument($result, $properties);
             }
             $self->withRetry($retryAttempts, $messagePrefix, 'retrying as singles', function () use($self, $type, $messagePrefix, $documents) {
                 $self->sendDocuments($type, $messagePrefix, $documents);
             });
             $completed += count($results);
             // Guard against a zero elapsed time so the rate never divides by zero.
             $elapsed = microtime(true) - $operationStartTime;
             $rate = $elapsed > 0 ? round($completed / $elapsed) : 0;
             // Use $self like every other call in this closure; $this is not usable in closures on PHP 5.3.
             $self->outputIndented($messagePrefix . "Reindexed {$completed}/{$totalDocsToReindex} documents at {$rate}/second\n");
         }, 0, $retryAttempts, function ($e, $errors) use($self, $messagePrefix) {
             $self->sleepOnRetry($e, $errors, $messagePrefix, 'fetching documents to reindex');
         });
         $this->outputIndented($messagePrefix . "All done\n");
     } catch (ExceptionInterface $e) {
         // Note that we can't fail the master here, we have to check how many documents are in the new index in the master.
         // A separate variable keeps the $type parameter from being overwritten.
         $errorType = get_class($e);
         $message = ElasticsearchIntermediary::extractMessage($e);
         LoggerFactory::getInstance('CirrusSearch')->warning("Search backend error during reindex.  Error type is '{type}' and message is:  {message}", array('type' => $errorType, 'message' => $message));
         die(1);
     }
 }
Example #6
0
 /**
  * @inheritDoc
  */
 public function findAllIds()
 {
     // Match every document and request an empty field list.
     $matchEverything = new Query(new Query\MatchAll());
     $matchEverything->setFields([]);
     $scroll = new ScanAndScroll($this->type->createSearch($matchEverything));

     // The generator lives in a closure so the search above is built when
     // this method is called, not when the ids are first iterated.
     $collectIds = function (ScanAndScroll $pages) {
         foreach ($pages as $scrollId => $page) {
             foreach ($page as $hit) {
                 yield $hit->getId();
             }
         }
     };

     return $collectIds($scroll);
 }
 /**
  * Dumps the documents of the selected index by scrolling over a scan
  * search and passing each hit's id, type and source to write().
  *
  * Options read: 'indexType', 'baseName', 'indexIdentifier', 'filter'
  * (turned into a query-string filter), 'limit' and 'sourceFields'
  * (comma separated list of source fields to include).
  */
 public function execute()
 {
     global $wgPoolCounterConf;
     // Make sure we don't flood the pool counter
     unset($wgPoolCounterConf['CirrusSearch-Search']);
     // Set the timeout for maintenance actions
     $this->setConnectionTimeout();
     $this->indexType = $this->getOption('indexType');
     $this->indexBaseName = $this->getOption('baseName', wfWikiId());
     $indexTypes = $this->getConnection()->getAllIndexTypes();
     // Strict comparison so a missing option cannot loosely match an entry.
     if (!in_array($this->indexType, $indexTypes, true)) {
         $this->error('indexType option must be one of ' . implode(', ', $indexTypes), 1);
     }
     // NOTE(review): $utils is never used below; kept in case constructing it matters - confirm and remove.
     $utils = new ConfigUtils($this->getConnection()->getClient(), $this);
     $this->indexIdentifier = $this->getOption('indexIdentifier');
     $filter = null;
     if ($this->hasOption('filter')) {
         $filter = new Elastica\Filter\Query(new Elastica\Query\QueryString($this->getOption('filter')));
     }
     $limit = (int) $this->getOption('limit', 0);
     $query = new Query();
     $query->setFields(array('_id', '_type', '_source'));
     if ($this->hasOption('sourceFields')) {
         $sourceFields = explode(',', $this->getOption('sourceFields'));
         $query->setSource(array('include' => $sourceFields));
     }
     if ($filter) {
         $query->setQuery(new \Elastica\Query\Filtered(new \Elastica\Query\MatchAll(), $filter));
     }
     $scrollOptions = array('search_type' => 'scan', 'scroll' => "15m", 'size' => $this->inputChunkSize);
     $index = $this->getIndex();
     $result = $index->search($query, $scrollOptions);
     $totalDocsInIndex = $result->getResponse()->getData();
     $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
     // A limit larger than the number of hits cannot be reached, so clamp
     // the announced total to what the search actually matched.
     $totalDocsToDump = $limit > 0 ? min($limit, $totalDocsInIndex) : $totalDocsInIndex;
     $docsDumped = 0;
     // NOTE(review): presumably keeps messages off the stream the documents are written to - confirm.
     $this->logToStderr = true;
     $this->output("Dumping {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");
     // The closure below reaches the instance through $self rather than $this.
     $self = $this;
     Util::iterateOverScroll($index, $result->getResponse()->getScrollId(), '15m', function ($results) use($self, &$docsDumped, $totalDocsToDump) {
         foreach ($results as $result) {
             $document = array('_id' => $result->getId(), '_type' => $result->getType(), '_source' => $result->getSource());
             $self->write($document);
             $docsDumped++;
             $self->outputProgress($docsDumped, $totalDocsToDump);
         }
     }, $limit, 5);
     $this->output("Dump done.\n");
 }
 /**
  * Builds suggestion documents from the content index and adds them to
  * the title suggest type.
  *
  * Scrolls over a scan search of the content index, restricted to the
  * page type and the main namespace, with the large text fields excluded
  * from the returned source. Each page of results is passed to the
  * SuggestBuilder and the resulting documents are added with retries.
  */
 private function indexData()
 {
     $query = new Query();
     $query->setFields(array('_id', '_type', '_source'));
     // Exclude content fields to save bandwidth
     $query->setSource(array('exclude' => array('text', 'source_text', 'opening_text', 'auxiliary_text')));
     $query->setQuery(new Elastica\Query\Filtered(new Elastica\Query\MatchAll(), new Elastica\Filter\BoolAnd(array(new Elastica\Filter\Type(Connection::PAGE_TYPE_NAME), new Elastica\Filter\Term(array("namespace" => NS_MAIN))))));
     $scrollOptions = array('search_type' => 'scan', 'scroll' => "15m", 'size' => $this->indexChunkSize);
     // TODO: only content index for now ( we'll have to check how it works with commons )
     $sourceIndex = $this->getConnection()->getIndex($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
     $result = $sourceIndex->search($query, $scrollOptions);
     $totalDocsInIndex = $result->getResponse()->getData();
     $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
     $totalDocsToDump = $totalDocsInIndex;
     $scoreMethodName = $this->getOption('scoringMethod', 'quality');
     $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod($scoreMethodName, $totalDocsInIndex);
     $builder = new SuggestBuilder($this->scoreMethod, $this->withGeo);
     $docsDumped = 0;
     $this->output("Indexing {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");
     // The closure below reaches the instance through $self rather than $this.
     $self = $this;
     $destinationType = $this->getIndex()->getType(Connection::TITLE_SUGGEST_TYPE_NAME);
     $retryAttempts = $this->indexRetryAttempts;
     Util::iterateOverScroll($sourceIndex, $result->getResponse()->getScrollId(), '15m', function ($results) use($self, &$docsDumped, $totalDocsToDump, $builder, $destinationType, $retryAttempts) {
         $suggestDocs = array();
         foreach ($results as $result) {
             $docsDumped++;
             $suggests = $builder->build($result->getId(), $result->getSource());
             foreach ($suggests as $suggest) {
                 $suggestDocs[] = new \Elastica\Document(null, $suggest);
             }
         }
         $self->outputProgress($docsDumped, $totalDocsToDump);
         // Nothing to send for this page: skip the bulk call rather than
         // handing Elastica an empty document list (which it is expected
         // to reject - see Elastica's Client::addDocuments).
         if ($suggestDocs) {
             Util::withRetry($retryAttempts, function () use($destinationType, $suggestDocs) {
                 $destinationType->addDocuments($suggestDocs);
             });
         }
     }, 0, $retryAttempts);
     $this->output("Indexing done.\n");
 }