/**
  * Assembles the aggregation-only search request for the configured grouping.
  *
  * A terms aggregation is built on the field named by $this->group['by'],
  * a match-all query is wrapped in a filtered query carrying the bool filter,
  * and the request asks for zero hits so only aggregation data comes back.
  *
  * @return false|\Elastica\Query false when validate() returns false, the query otherwise
  * @throws ErrorException not raised directly in this method; presumably by
  *                        validate() or addCondition() — TODO confirm
  */
 public function getQuery()
 {
     if (false === $this->validate()) {
         return false;
     }

     // Bucket by the configured field, capped at the class-wide maximum.
     $groupAggregation = new Terms(self::GROUP_AGGREGATION_NAME);
     $groupAggregation->setField($this->group['by']);
     $groupAggregation->setSize(self::MAX_SIZE_AGGREGATION_ITEMS);

     // The bool filter is always attached; it is only populated when a
     // condition has been configured.
     $conditionFilter = new Bool();
     if ($this->condition) {
         $this->addCondition($conditionFilter);
     }

     // Hand the aggregation to the class-level hook (its effect is not
     // visible from this method).
     $this->addAggregation($groupAggregation);

     $matchAllFiltered = new \Elastica\Query\Filtered();
     $matchAllFiltered->setFilter($conditionFilter);
     $matchAllFiltered->setQuery(new \Elastica\Query\MatchAll());

     $search = new Query();
     // Hits themselves are not needed, only the aggregation.
     $search->setSize(0);
     $search->setQuery($matchAllFiltered);
     $search->addAggregation($groupAggregation);

     return $search;
 }
Example #2
0
 /**
  * Sets the query on $elasticaQuery according to the search term and taxon.
  *
  * - Search term present and taxon is 'all': a plain query-string query.
  * - Otherwise: a filtered query whose filter is $boolFilter extended with a
  *   must-clause restricting 'taxons' to $taxon; the query-string query is
  *   attached only when a search term was given.
  *
  * @param $searchTerm    text handed to the query-string query; may be empty
  * @param $taxon         taxon value to filter on, or 'all'
  * @param $boolFilter    filter object receiving the taxon must-clause (modified in place)
  * @param $elasticaQuery query object whose setQuery() is called with the result
  */
 public function applyElasticaQueryType($searchTerm, $taxon, $boolFilter, $elasticaQuery)
 {
     // Only a search across all taxons can skip the taxon filter entirely.
     if ($searchTerm && 'all' === $taxon) {
         $elasticaQuery->setQuery(new \Elastica\Query\QueryString($searchTerm));

         return;
     }

     $filtered = new \Elastica\Query\Filtered();
     if ($searchTerm) {
         $filtered->setQuery(new \Elastica\Query\QueryString($searchTerm));
     }

     $taxonFilter = new \Elastica\Filter\Terms();
     $taxonFilter->setTerms('taxons', array($taxon));
     $boolFilter->addMust($taxonFilter);
     $filtered->setFilter($boolFilter);

     $elasticaQuery->setQuery($filtered);
 }
    /**
     * Fetches translation suggestions for a text using two rounds of queries.
     *
     * First, strings in the source language whose 'content' fuzzily matches
     * $text are collected together with their scores. Then the documents whose
     * ids are the matched source ids suffixed with "/$targetLanguage" are
     * fetched and combined with the collected source data.
     *
     * The connection timeout is raised to 10 seconds for the duration of the
     * call and is always restored afterwards, including when a query throws.
     *
     * @param string $sourceLanguage Value the 'language' field must match in the first query.
     * @param string $targetLanguage Suffix appended to each source id to address its translation.
     * @param string $text Text to find similar source strings for.
     * @return array[] Suggestions in descending 'quality' order. Keys are preserved
     *   by the sort, so they are not necessarily sequential. Each suggestion has the
     *   keys source, target, context, quality, wiki, location and uri.
     */
    protected function doQuery($sourceLanguage, $targetLanguage, $text)
    {
        /* Two query system:
         * 1) Find all strings in source language that match text
         * 2) Do another query for translations for those strings
         */
        $connection = $this->getClient()->getConnection();
        $oldTimeout = $connection->getTimeout();
        $connection->setTimeout(10);
        $suggestions = array();
        // Everything that can throw lives inside the try block so that the
        // finally clause restores the timeout on the shared connection on
        // every exit path.
        try {
            $fuzzyQuery = new \Elastica\Query\FuzzyLikeThis();
            $fuzzyQuery->setLikeText($text);
            $fuzzyQuery->addFields(array('content'));
            $boostQuery = new \Elastica\Query\FunctionScore();
            if ($this->useWikimediaExtraPlugin()) {
                $boostQuery->addFunction('levenshtein_distance_score', array('text' => $text, 'field' => 'content'));
            } else {
                $groovyScript = <<<GROOVY
import org.apache.lucene.search.spell.*
new LevensteinDistance().getDistance(srctxt, _source['content'])
GROOVY;
                $script = new \Elastica\Script($groovyScript, array('srctxt' => $text), \Elastica\Script::LANG_GROOVY);
                $boostQuery->addScriptScoreFunction($script);
            }
            $boostQuery->setBoostMode(\Elastica\Query\FunctionScore::BOOST_MODE_REPLACE);
            // Wrap the fuzzy query so it can be used as a filter.
            // This is slightly faster, as ES can throw away the scores by this query.
            $fuzzyFilter = new \Elastica\Filter\Query();
            $fuzzyFilter->setQuery($fuzzyQuery);
            $boostQuery->setFilter($fuzzyFilter);
            // Use filtered query to wrap function score and language filter
            $filteredQuery = new \Elastica\Query\Filtered();
            $languageFilter = new \Elastica\Filter\Term();
            $languageFilter->setTerm('language', $sourceLanguage);
            $filteredQuery->setFilter($languageFilter);
            $filteredQuery->setQuery($boostQuery);
            // The whole query
            $query = new \Elastica\Query();
            $query->setQuery($filteredQuery);
            // The interface usually displays three best candidates. These might
            // come from more than three source things, if the translations are
            // the same. In other words suggestions are grouped by the suggested
            // translation. This algorithm might not find all suggestions, if the
            // top N best matching source texts don't have equivalent translations
            // in the target language, but worse matches which we did not fetch do.
            // This code tries to balance between doing too many or too big queries
            // and not fetching enough results to show all possible suggestions.
            $sizeFirst = 100;
            $sizeSecond = $sizeFirst * 5;
            $query->setFrom(0);
            $query->setSize($sizeFirst);
            $query->setParam('_source', array('content'));
            $cutoff = isset($this->config['cutoff']) ? $this->config['cutoff'] : 0.65;
            $query->setParam('min_score', $cutoff);
            $query->setSort(array('_score', '_uid'));
            // This loop is doing two unrelated things:
            // 1) Collect the message contents and scores so that they can
            //    be accessed later for the translations we found.
            // 2) Build the list of ids for the query that fetches the translations.
            $contents = $scores = $terms = array();
            do {
                $resultset = $this->getType()->search($query);
                if (count($resultset) === 0) {
                    break;
                }
                foreach ($resultset->getResults() as $result) {
                    $data = $result->getData();
                    $score = $result->getScore();
                    // Strip the last "/segment" of the document id to get the source id.
                    $sourceId = preg_replace('~/[^/]+$~', '', $result->getId());
                    $contents[$sourceId] = $data['content'];
                    $scores[$sourceId] = $score;
                    $terms[] = "{$sourceId}/{$targetLanguage}";
                }
                // Check if it looks like that we are hitting the long tail already.
                // Otherwise, we'll do a query to fetch some more to reach a "sane"
                // breaking point, i.e. include all suggestions with same content
                // for reliable used X times statistics.
                if (count(array_unique($scores)) > 5) {
                    break;
                }
                // Okay, We are now in second iteration of the loop. We already got
                // lots of suggestions. We will give up for now even if it means we
                // return in some sense incomplete results.
                if (count($resultset) === $sizeSecond) {
                    break;
                }
                // After the first query, the smallest score is the new threshold.
                // $score still holds the score of the last result processed above.
                $query->setParam('min_score', $score);
                $query->setFrom($query->getParam('size') + $query->getParam('from'));
                $query->setSize($sizeSecond);
                // Break if we already got all hits
            } while ($resultset->getTotalHits() > count($contents));
            // Skip second query if first query found nothing.
            if ($terms !== array()) {
                $idQuery = new \Elastica\Query\Terms();
                $idQuery->setTerms('_id', $terms);
                $query = new \Elastica\Query($idQuery);
                $query->setSize(25);
                $query->setParam('_source', array('wiki', 'uri', 'content', 'localid'));
                $resultset = $this->getType()->search($query);
                foreach ($resultset->getResults() as $result) {
                    $data = $result->getData();
                    // Construct the matching source id
                    $sourceId = preg_replace('~/[^/]+$~', '', $result->getId());
                    $suggestions[] = array(
                        'source' => $contents[$sourceId],
                        'target' => $data['content'],
                        'context' => $data['localid'],
                        'quality' => $scores[$sourceId],
                        'wiki' => $data['wiki'],
                        'location' => $data['localid'] . '/' . $targetLanguage,
                        'uri' => $data['uri'],
                    );
                }
                // Ensure results are in quality order (best first). uasort is
                // kept on purpose so the array keys stay as they were.
                uasort($suggestions, function ($a, $b) {
                    if ($a['quality'] === $b['quality']) {
                        return 0;
                    }
                    return $a['quality'] < $b['quality'] ? 1 : -1;
                });
            }
        } finally {
            $connection->setTimeout($oldTimeout);
        }
        return $suggestions;
    }