/**
 * Attach thesaurus concept paths to a record, in place.
 *
 * For every configured field, each value found in the record is looked up in
 * the thesaurus (one bulk call for all values). Paths of matching concepts
 * are appended to `$record['concept_path'][<field name>]`; values without
 * any matching concept are handed to the candidate terms collection.
 *
 * @param array $record       Record to enrich (taken by reference); must have a "databox_id" key
 * @param array $fields       Map of field name => root concepts; when the root concepts are
 *                            empty the lookup is only restricted to the record's databox
 * @param array $index_fields Map of field name => dot-separated path to the field's values in $record
 * @throws Exception When the record has no "databox_id" key
 */
private function hydrate(array &$record, array $fields, array $index_fields)
{
    if (!isset($record['databox_id'])) {
        throw new Exception('Expected a record with the "databox_id" key set.');
    }

    // Parallel lists, all indexed by the position of the value in the bulk query
    $values = array();
    $terms = array();
    $filters = array();
    $field_names = array();

    foreach ($fields as $name => $root_concepts) {
        // Loop through all values to prepare bulk query
        $field_values = \igorw\get_in($record, explode('.', $index_fields[$name]));
        if ($field_values === null) {
            continue;
        }

        // A single-valued field may hold a bare scalar instead of a list;
        // iterating it directly would raise a warning and drop the value.
        if (!is_array($field_values) && !$field_values instanceof \Traversable) {
            $field_values = array($field_values);
        }

        // Concepts are databox's specific, but when no root concepts are
        // given we need to make sure we only match in the right databox.
        $filter = $root_concepts
            ? Filter::childOfConcepts($record['databox_id'], $root_concepts)
            : Filter::byDatabox($record['databox_id']);

        foreach ($field_values as $value) {
            $values[] = $value;
            $terms[] = Term::parse($value);
            $filters[] = $filter;
            $field_names[] = $name;
        }
    }

    // One filter per term; the last argument is presumably the strict flag,
    // as in findConcepts() -- confirm against findConceptsBulk().
    $bulk = $this->thesaurus->findConceptsBulk($terms, null, $filters, true);

    foreach ($bulk as $offset => $item_concepts) {
        $name = $field_names[$offset];

        if (is_array($item_concepts) && count($item_concepts) > 0) {
            foreach ($item_concepts as $concept) {
                $record['concept_path'][$name][] = $concept->getPath();
            }
        } else {
            // Nothing matched: keep the raw value as a candidate term
            $this->candidate_terms->insert($name, $values[$offset]);
        }
    }
}
/**
 * Look up the concepts whose indexed terms match the given term.
 *
 * When strict mode is on, term context matching is enforced:
 * `orange (color)` will *not* match a bare `orange` in the index.
 *
 * @param  Term|string $term   Term object, or a string holding the term's value
 * @param  string|null $lang   Input language ("fr", "en", ...) for more effective results
 * @param  Filter|null $filter Filter restricting the search to a specified subset
 * @param  boolean     $strict Whether to enable strict search or not
 * @return Concept[]           Matching concepts, one per distinct concept path
 */
public function findConcepts($term, $lang = null, Filter $filter = null, $strict = false)
{
    $term = $term instanceof TermInterface ? $term : new Term($term);

    $this->logger->info(
        sprintf('Searching for term %s', $term),
        array('strict' => $strict, 'lang' => $lang)
    );

    // Strict search takes precedence over the language-specific sub-field
    $field_suffix = '';
    if ($strict) {
        $field_suffix = '.strict';
    } elseif ($lang) {
        $field_suffix = sprintf('.%s', $lang);
    }

    // Every token of the term's value has to match ("and" operator).
    // A '-25%' minimum_should_match (tolerating a quarter of non-matching
    // tokens) was considered here but is left disabled.
    $value_field = sprintf('value%s', $field_suffix);
    $query = array(
        'match' => array(
            $value_field => array(
                'query'    => $term->getValue(),
                'operator' => 'and',
            ),
        ),
    );

    if ($term->hasContext()) {
        // Both the value and the context must match
        $context_field = sprintf('context%s', $field_suffix);
        $value_clause = $query;
        $context_clause = array(
            'match' => array(
                $context_field => array(
                    'query'    => $term->getContext(),
                    'operator' => 'and',
                ),
            ),
        );
        $query = array(
            'bool' => array(
                'must' => array($value_clause, $context_clause),
            ),
        );
    } elseif ($strict) {
        // Strict search on a context-less term: only accept indexed terms
        // that have no context either
        $query = self::applyQueryFilter($query, array(
            'missing' => array('field' => 'context'),
        ));
    }

    if ($lang) {
        $query = self::applyQueryFilter($query, array(
            'term' => array('lang' => $lang),
        ));
    }

    if ($filter) {
        $this->logger->debug('Using filter', array('filter' => Filter::dump($filter)));
        $query = self::applyQueryFilter($query, $filter->getQueryFilter());
    }

    // Search request. Concept paths are deduplicated by a terms aggregation,
    // so no hit is fetched at all (size = 0).
    // The minimum score is an arbitrary low limit used to drop inexact
    // concepts; a more granular approach is still needed (disable TF/IDF on
    // terms, boost only when nearly all tokens of the value field match).
    $params = array(
        'index' => $this->options->getIndexName(),
        'type'  => TermIndexer::TYPE_NAME,
        'body'  => array(
            'query'     => $query,
            'aggs'      => array(
                'dedup' => array(
                    'terms' => array('field' => 'path.raw'),
                ),
            ),
            'min_score' => $this->options->getMinScore(),
            'size'      => 0,
        ),
    );

    $this->logger->debug('Sending search', $params['body']);
    $response = $this->client->search($params);

    // Each aggregation bucket key is a concept path
    $keys = array();
    $concepts = array();
    foreach (\igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []) as $bucket) {
        if (!isset($bucket['key'])) {
            continue;
        }
        $keys[] = $bucket['key'];
        $concepts[] = new Concept($bucket['key']);
    }

    $this->logger->info(
        sprintf('Found %d matching concepts', count($concepts)),
        array('concepts' => $keys)
    );

    return $concepts;
}