/**
 * Builds the aggregation-only query for the configured grouping.
 *
 * The returned query asks for zero hits and carries one terms aggregation
 * on the field named by $this->group['by']. A bool filter is always
 * attached; it is handed to addCondition() only when a condition is set.
 *
 * @return false|\Elastica\Query false when validate() returns false, otherwise the query
 * @throws ErrorException
 */
public function getQuery()
{
    if (false === $this->validate()) {
        return false;
    }

    // Bucket the documents by the configured group field.
    $groupAggregation = new Terms(self::GROUP_AGGREGATION_NAME);
    $groupAggregation->setField($this->group['by']);
    $groupAggregation->setSize(self::MAX_SIZE_AGGREGATION_ITEMS);

    // The filter stays empty unless a condition has been configured.
    $conditionFilter = new Bool();
    if ($this->condition) {
        $this->addCondition($conditionFilter);
    }

    $this->addAggregation($groupAggregation);

    // Match every document, then narrow the set down with the filter.
    $filteredQuery = new \Elastica\Query\Filtered();
    $filteredQuery->setFilter($conditionFilter);
    $filteredQuery->setQuery(new \Elastica\Query\MatchAll());

    $aggregationQuery = new Query();
    // Hits themselves are not needed, only the aggregation.
    $aggregationQuery->setSize(0);
    $aggregationQuery->setQuery($filteredQuery);
    $aggregationQuery->addAggregation($groupAggregation);

    return $aggregationQuery;
}
/**
 * Picks the query type for a search request and sets it on $elasticaQuery.
 *
 * - Search term given and taxon is 'all': plain query-string query;
 *   $boolFilter is left untouched.
 * - Search term given and any other taxon: query-string query wrapped in a
 *   filtered query restricted to that taxon.
 * - No search term: filtered query restricted to the taxon, without a text
 *   query (the taxon filter is added even when the taxon is 'all').
 *
 * In the filtered cases a terms filter on 'taxons' is added to $boolFilter
 * as a "must" clause, so $boolFilter is modified.
 *
 * @param $searchTerm
 * @param $taxon
 * @param $boolFilter
 * @param $elasticaQuery
 */
public function applyElasticaQueryType($searchTerm, $taxon, $boolFilter, $elasticaQuery)
{
    // Free-text search over every taxon needs no filter at all.
    if ($searchTerm && 'all' === $taxon) {
        $elasticaQuery->setQuery(new \Elastica\Query\QueryString($searchTerm));

        return;
    }

    $filtered = new \Elastica\Query\Filtered();
    if ($searchTerm) {
        $filtered->setQuery(new \Elastica\Query\QueryString($searchTerm));
    }

    // Restrict the results to the requested taxon.
    $taxonFilter = new \Elastica\Filter\Terms();
    $taxonFilter->setTerms('taxons', array($taxon));
    $boolFilter->addMust($taxonFilter);
    $filtered->setFilter($boolFilter);

    $elasticaQuery->setQuery($filtered);
}
/**
 * Looks up translation suggestions for a text with a two-query approach.
 *
 * The first query fuzzy-matches $text against the 'content' field of
 * documents whose 'language' term equals $sourceLanguage, scoring them by
 * Levenshtein distance. The second query fetches, by document id, the
 * $targetLanguage counterparts of the matched sources.
 *
 * The connection timeout is raised to 10 for the duration of the call and
 * is restored afterwards, including when a query throws.
 *
 * @param string $sourceLanguage Value matched against the 'language' field.
 * @param string $targetLanguage Appended to each source id to build the ids
 *   of the translation documents.
 * @param string $text Text to find similar source strings for.
 * @return array Suggestions ordered by descending quality (array keys are
 *   preserved by the sort). Each item has the keys 'source', 'target',
 *   'context', 'quality', 'wiki', 'location' and 'uri'.
 * @throws \Exception Anything thrown while building or running the queries
 *   is rethrown after the timeout has been restored.
 */
protected function doQuery($sourceLanguage, $targetLanguage, $text)
{
    /* Two query system:
     * 1) Find all strings in source language that match text
     * 2) Do another query for translations for those strings
     */
    $connection = $this->getClient()->getConnection();
    $oldTimeout = $connection->getTimeout();
    $connection->setTimeout(10);

    $suggestions = array();
    $failure = null;

    // Everything that may throw runs inside the try block so that the
    // timeout is always restored below. Catch-and-rethrow is used instead
    // of "finally" to stay compatible with PHP versions before 5.5.
    try {
        $fuzzyQuery = new \Elastica\Query\FuzzyLikeThis();
        $fuzzyQuery->setLikeText($text);
        $fuzzyQuery->addFields(array('content'));

        $boostQuery = new \Elastica\Query\FunctionScore();
        if ($this->useWikimediaExtraPlugin()) {
            $boostQuery->addFunction(
                'levenshtein_distance_score',
                array('text' => $text, 'field' => 'content')
            );
        } else {
            $groovyScript =
<<<GROOVY
import org.apache.lucene.search.spell.*
new LevensteinDistance().getDistance(srctxt, _source['content'])
GROOVY;
            $script = new \Elastica\Script(
                $groovyScript,
                array('srctxt' => $text),
                \Elastica\Script::LANG_GROOVY
            );
            $boostQuery->addScriptScoreFunction($script);
        }
        $boostQuery->setBoostMode(\Elastica\Query\FunctionScore::BOOST_MODE_REPLACE);

        // Wrap the fuzzy query so it can be used as a filter.
        // This is slightly faster, as ES can throw away the scores by this query.
        $fuzzyFilter = new \Elastica\Filter\Query();
        $fuzzyFilter->setQuery($fuzzyQuery);
        $boostQuery->setFilter($fuzzyFilter);

        // Use filtered query to wrap function score and language filter
        $filteredQuery = new \Elastica\Query\Filtered();
        $languageFilter = new \Elastica\Filter\Term();
        $languageFilter->setTerm('language', $sourceLanguage);
        $filteredQuery->setFilter($languageFilter);
        $filteredQuery->setQuery($boostQuery);

        // The whole query
        $query = new \Elastica\Query();
        $query->setQuery($filteredQuery);

        // The interface usually displays three best candidates. These might
        // come from more than three source things, if the translations are
        // the same. In other words suggestions are grouped by the suggested
        // translation. This algorithm might not find all suggestions, if the
        // top N best matching source texts don't have equivalent translations
        // in the target language, but worse matches which we did not fetch do.
        // This code tries to balance between doing too many or too big queries
        // and not fetching enough results to show all possible suggestions.
        $sizeFirst = 100;
        $sizeSecond = $sizeFirst * 5;

        $query->setFrom(0);
        $query->setSize($sizeFirst);
        $query->setParam('_source', array('content'));
        $cutoff = isset($this->config['cutoff']) ? $this->config['cutoff'] : 0.65;
        $query->setParam('min_score', $cutoff);
        $query->setSort(array('_score', '_uid'));

        // This query is doing two unrelated things:
        // 1) Collect the message contents and scores so that they can
        //    be accessed later for the translations we found.
        // 2) Build the query string for the query that fetches the translations.
        $contents = $scores = $terms = array();
        do {
            $resultset = $this->getType()->search($query);

            if (count($resultset) === 0) {
                break;
            }

            foreach ($resultset->getResults() as $result) {
                $data = $result->getData();
                $score = $result->getScore();

                // Drop the last path segment of the id to get the source id.
                $sourceId = preg_replace('~/[^/]+$~', '', $result->getId());
                $contents[$sourceId] = $data['content'];
                $scores[$sourceId] = $score;
                $terms[] = "{$sourceId}/{$targetLanguage}";
            }

            // Check if it looks like that we are hitting the long tail already.
            // Otherwise, we'll do a query to fetch some more to reach a "sane"
            // breaking point, i.e. include all suggestions with same content
            // for reliable used X times statistics.
            if (count(array_unique($scores)) > 5) {
                break;
            }

            // Okay, We are now in second iteration of the loop. We already got
            // lots of suggestions. We will give up for now even if it means we
            // return in some sense incomplete results.
            if (count($resultset) === $sizeSecond) {
                break;
            }

            // After the first query, the smallest score is the new threshold.
            // $score still holds the score of the last result of this page.
            $query->setParam('min_score', $score);
            $query->setFrom($query->getParam('size') + $query->getParam('from'));
            $query->setSize($sizeSecond);

            // Break if we already got all hits
        } while ($resultset->getTotalHits() > count($contents));

        // Skip second query if first query found nothing.
        if ($terms !== array()) {
            $idQuery = new \Elastica\Query\Terms();
            $idQuery->setTerms('_id', $terms);

            $query = new \Elastica\Query($idQuery);
            $query->setSize(25);
            $query->setParam('_source', array('wiki', 'uri', 'content', 'localid'));
            $resultset = $this->getType()->search($query);

            foreach ($resultset->getResults() as $result) {
                $data = $result->getData();

                // Construct the matching source id
                $sourceId = preg_replace('~/[^/]+$~', '', $result->getId());

                $suggestions[] = array(
                    'source' => $contents[$sourceId],
                    'target' => $data['content'],
                    'context' => $data['localid'],
                    'quality' => $scores[$sourceId],
                    'wiki' => $data['wiki'],
                    'location' => $data['localid'] . '/' . $targetLanguage,
                    'uri' => $data['uri'],
                );
            }

            // Ensure results are in quality order
            uasort($suggestions, function ($a, $b) {
                if ($a['quality'] === $b['quality']) {
                    return 0;
                }

                return $a['quality'] < $b['quality'] ? 1 : -1;
            });
        }
    } catch (\Exception $e) {
        // Remember the failure so the timeout can be restored before rethrowing.
        $failure = $e;
    }

    // Single place where the temporary timeout is undone, on success and on failure.
    $connection->setTimeout($oldTimeout);

    if ($failure !== null) {
        throw $failure;
    }

    return $suggestions;
}