/**
 * Checks which coordinate SuggestBuilder::findPrimaryCoordinates() picks
 * when a document carries two coordinates, while toggling their 'primary'
 * flag and their 'globe'.
 */
public function testMultipleCoordinates() {
    $firstCoord = array('lat' => 0.7077777777777799, 'lon' => -50.089444444444);
    $secondCoord = array('lat' => -15.518055555556, 'lon' => -71.765277777778);

    // Both entries share every field except 'coord' and 'primary'.
    $entryTemplate = array(
        'coord' => null,
        'region' => null,
        'dim' => 10000,
        'name' => "",
        'primary' => false,
        'type' => "river",
        'globe' => "earth",
        'country' => "BR",
    );

    $firstEntry = $entryTemplate;
    $firstEntry['coord'] = $firstCoord;
    $firstEntry['primary'] = true;

    $secondEntry = $entryTemplate;
    $secondEntry['coord'] = $secondCoord;

    $doc = array('coordinates' => array($firstEntry, $secondEntry));

    $scoringMethod = SuggestScoringMethodFactory::getScoringMethod('incomingLinks', 1);
    $builder = new SuggestBuilder($scoringMethod);

    // Only the first coordinate is primary.
    $this->assertSame($firstCoord, $builder->findPrimaryCoordinates($doc));

    // Both coordinates are primary.
    $doc['coordinates'][1]['primary'] = true;
    $this->assertSame(
        $firstCoord,
        $builder->findPrimaryCoordinates($doc),
        "With two primaries coord we choose the first one"
    );

    // Only the second coordinate is primary.
    $doc['coordinates'][0]['primary'] = false;
    $this->assertSame(
        $secondCoord,
        $builder->findPrimaryCoordinates($doc),
        "Choose primary coord even if it's not the first one."
    );

    // No coordinate is primary.
    $doc['coordinates'][1]['primary'] = false;
    $this->assertSame(
        $firstCoord,
        $builder->findPrimaryCoordinates($doc),
        "Choose first coord if there's no primary."
    );

    // The first coordinate is primary but not on earth.
    $doc['coordinates'][0]['primary'] = true;
    $doc['coordinates'][0]['globe'] = 'Magrathea';
    $this->assertSame(
        $secondCoord,
        $builder->findPrimaryCoordinates($doc),
        "Choose first coord on earth."
    );

    // Neither coordinate is on earth.
    $doc['coordinates'][1]['globe'] = 'Magrathea';
    $this->assertNull(
        $builder->findPrimaryCoordinates($doc),
        "No coord if none is on earth."
    );
}
/**
 * Merge the results of every suggest profile into a single suggestion set.
 *
 * For each page id only the best scoring suggestion is kept, the score
 * being the raw suggest score multiplied by the 'discount' of the profile
 * that produced it. Suggestions are then sorted by descending score and
 * truncated to $limit. Suggestions that carry no text (non-title
 * suggestions) trigger a second request that fetches the 'redirect'
 * field of the matching pages; the text is picked with
 * Util::chooseBestRedirect() against $this->term. Suggestions that still
 * have no text at the end are dropped.
 *
 * Timing and hit counters are written to $this->logContext.
 *
 * WARNING: experimental API
 *
 * @param \Elastica\Response $response Response from elasticsearch _suggest api
 * @param array $profiles the suggestion profiles, keyed by profile name;
 *  each entry must provide a 'discount' factor
 * @param int $limit Maximum suggestions to return, any value <= 0 (default -1)
 *  means unlimited
 * @return SearchSuggestionSet a set of Suggestions
 */
protected function postProcessSuggest(\Elastica\Response $response, $profiles, $limit = -1) {
    $this->logContext['elasticTookMs'] = intval($response->getQueryTime() * 1000);
    $data = $response->getData();
    // '_shards' is not a per-profile result, drop it before iterating.
    unset($data['_shards']);

    $suggestions = array();
    foreach ($data as $name => $results) {
        $discount = $profiles[$name]['discount'];
        foreach ($results as $suggested) {
            foreach ($suggested['options'] as $suggest) {
                $output = SuggestBuilder::decodeOutput($suggest['text']);
                if ($output === null) {
                    // Ignore broken output
                    continue;
                }
                $pageId = $output['id'];
                $score = $discount * $suggest['score'];
                // Keep only the best scoring suggestion for a given page.
                if (isset($suggestions[$pageId]) && $score <= $suggestions[$pageId]->getScore()) {
                    continue;
                }
                $suggestion = new SearchSuggestion($score, null, null, $pageId);
                // If it's a title suggestion we have the text
                if ($output['type'] === SuggestBuilder::TITLE_SUGGESTION) {
                    $suggestion->setText($output['text']);
                }
                $suggestions[$pageId] = $suggestion;
            }
        }
    }

    // Sort by descending score. The comparator must return an integer:
    // returning the float difference would be cast to int by PHP, turning
    // every score gap smaller than 1 into "equal" and breaking the order.
    uasort($suggestions, function ($a, $b) {
        $scoreA = $a->getScore();
        $scoreB = $b->getScore();
        if ($scoreA == $scoreB) {
            return 0;
        }
        return $scoreA < $scoreB ? 1 : -1;
    });

    $this->logContext['hitsTotal'] = count($suggestions);
    if ($limit > 0) {
        // preserve keys: suggestions stay indexed by page id
        $suggestions = array_slice($suggestions, 0, $limit, true);
    }
    $this->logContext['hitsReturned'] = count($suggestions);
    $this->logContext['hitsOffset'] = 0;

    // we must fetch redirect data for redirect suggestions
    $missingText = array();
    foreach ($suggestions as $id => $suggestion) {
        if ($suggestion->getText() === null) {
            $missingText[] = $id;
        }
    }

    if (!empty($missingText)) {
        // Experimental.
        //
        // Second pass query to fetch redirects.
        // It's not clear if it's the best option, this will slowdown the whole query
        // when we hit a redirect suggestion.
        // Other option would be to encode redirects as a payload resulting in a
        // very big index...
        // XXX: we support only the content index
        $pageType = $this->connection->getPageType($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
        // NOTE: we are already in a poolCounterWork
        // Multi get is not supported by elastica
        try {
            $redirResponse = $pageType->request(
                '_mget',
                'GET',
                array('ids' => $missingText),
                array('_source_include' => 'redirect')
            );
            if ($redirResponse->isOk()) {
                $this->logContext['elasticTook2PassMs'] = intval($redirResponse->getQueryTime() * 1000);
                $docs = $redirResponse->getData();
                foreach ($docs['docs'] as $doc) {
                    if (empty($doc['_source']['redirect'])) {
                        continue;
                    }
                    // We use the original query, we should maybe use the variant that generated this result?
                    $text = Util::chooseBestRedirect($this->term, $doc['_source']['redirect']);
                    if (!empty($suggestions[$doc['_id']])) {
                        $suggestions[$doc['_id']]->setText($text);
                    }
                }
            } else {
                LoggerFactory::getInstance('CirrusSearch')->warning(
                    'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
                    array(
                        'query' => $this->term,
                        'ids' => serialize($missingText),
                        'error' => $redirResponse->getError(),
                    )
                );
            }
        } catch (\Elastica\Exception\ExceptionInterface $e) {
            // Best effort: a failed second pass only loses the redirect suggestions.
            LoggerFactory::getInstance('CirrusSearch')->warning(
                'Unable to fetch redirects for suggestion {query} with results {ids} : {error}',
                array(
                    'query' => $this->term,
                    'ids' => serialize($missingText),
                    'error' => $this->extractMessage($e),
                )
            );
        }
    }

    return new SearchSuggestionSet(array_filter($suggestions, function ($suggestion) {
        // text should be not empty for suggestions
        return $suggestion->getText() != null;
    }));
}
/**
 * Scan the main-namespace pages of the content index and write the
 * suggestions built from them into the title suggest type of the index
 * returned by $this->getIndex().
 *
 * The source index is read with a scan/scroll search in chunks of
 * $this->indexChunkSize documents; content fields are excluded from the
 * returned source. Each chunk is turned into suggestion documents by
 * SuggestBuilder::build() and sent in bulk, retried up to
 * $this->indexRetryAttempts times. Progress is reported through
 * outputProgress().
 */
private function indexData() {
    $query = new Query();
    $query->setFields(array('_id', '_type', '_source'));
    // Exclude content fields to save bandwidth
    $query->setSource(array(
        'exclude' => array('text', 'source_text', 'opening_text', 'auxiliary_text'),
    ));
    $query->setQuery(new Elastica\Query\Filtered(
        new Elastica\Query\MatchAll(),
        new Elastica\Filter\BoolAnd(array(
            new Elastica\Filter\Type(Connection::PAGE_TYPE_NAME),
            new Elastica\Filter\Term(array("namespace" => NS_MAIN)),
        ))
    ));

    $scrollOptions = array(
        'search_type' => 'scan',
        'scroll' => "15m",
        'size' => $this->indexChunkSize,
    );

    // TODO: only content index for now ( we'll have to check how it works with commons )
    $sourceIndex = $this->getConnection()->getIndex($this->indexBaseName, Connection::CONTENT_INDEX_TYPE);
    $result = $sourceIndex->search($query, $scrollOptions);

    $totalDocsInIndex = $result->getResponse()->getData();
    $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
    $totalDocsToDump = $totalDocsInIndex;

    // The scoring method is given the total number of matching documents.
    $scoreMethodName = $this->getOption('scoringMethod', 'quality');
    $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod($scoreMethodName, $totalDocsInIndex);
    $builder = new SuggestBuilder($this->scoreMethod, $this->withGeo);

    $docsDumped = 0;
    $this->output("Indexing {$totalDocsToDump} documents ({$totalDocsInIndex} in the index)\n");

    // Captured explicitly so the closure does not rely on $this.
    $self = $this;
    $destinationType = $this->getIndex()->getType(Connection::TITLE_SUGGEST_TYPE_NAME);
    $retryAttempts = $this->indexRetryAttempts;

    Util::iterateOverScroll(
        $sourceIndex,
        $result->getResponse()->getScrollId(),
        '15m',
        function ($results) use ($self, &$docsDumped, $totalDocsToDump, $builder, $destinationType, $retryAttempts) {
            $suggestDocs = array();
            foreach ($results as $result) {
                $docsDumped++;
                $suggests = $builder->build($result->getId(), $result->getSource());
                foreach ($suggests as $suggest) {
                    // null id: let elasticsearch generate the document id
                    $suggestDocs[] = new \Elastica\Document(null, $suggest);
                }
            }
            $self->outputProgress($docsDumped, $totalDocsToDump);

            // Elastica refuses a bulk request without documents (it throws),
            // so a chunk that produced no suggestion must not be sent:
            // the failure would be retried and then abort the whole indexing.
            if (empty($suggestDocs)) {
                return;
            }

            Util::withRetry($retryAttempts, function () use ($destinationType, $suggestDocs) {
                $destinationType->addDocuments($suggestDocs);
            });
        },
        0,
        $retryAttempts
    );

    $this->output("Indexing done.\n");
}
/**
 * Data provider of named datasets, each made of two values: an array
 * with 'id', 'type' (and 'text' for titles) or null, followed by an
 * encoded output string (or null).
 *
 * NOTE(review): the first value is presumably the expected result of
 * decoding the second one; confirm against the consuming test.
 *
 * @return array
 */
public function provideOutputEncoder() {
    $cases = array();

    // Well-formed outputs produced by the encoder itself.
    $cases['title'] = array(
        array(
            'id' => 123,
            'type' => SuggestBuilder::TITLE_SUGGESTION,
            'text' => 'This is a title',
        ),
        SuggestBuilder::encodeTitleOutput(123, "This is a title"),
    );
    $cases['redirect'] = array(
        array(
            'id' => 123,
            'type' => SuggestBuilder::REDIRECT_SUGGESTION,
        ),
        SuggestBuilder::encodeRedirectOutput(123),
    );

    // Outputs paired with null.
    $nullCases = array(
        'Garbage' => 'Garbage',
        'Broken title' => '123:t',
        'Partial encoding' => '123:',
        'null output' => null,
    );
    foreach ($nullCases as $label => $encoded) {
        $cases[$label] = array(null, $encoded);
    }

    return $cases;
}