/**
 * Scores every invoice against the supplied search criteria.
 *
 * A criterion takes part in the scoring only when its entry in $query is
 * non-empty in the sense of empty(). For each invoice the tokens are fetched
 * with getTokens(), passed to updateMedianEstimator(), and then scored per
 * criterion. Once all invoices have been visited, computeMedians() is called
 * and each partial score set is replaced by the result of computeFinalScore().
 *
 * @param AbstractTokenizer $locale   Tokenizer used to parse the free-text query
 *                                    and forwarded to getTokens().
 * @param iterable          $invoices Invoices to score.
 * @param array             $query    Optional criteria under the keys 'query',
 *                                    'amount', 'date' and 'id'.
 *
 * @return array One entry per invoice, in iteration order, of the form
 *               ['invoice' => <invoice>, 'score' => <computeFinalScore() result>].
 */
public function query(AbstractTokenizer $locale, $invoices, array $query = [])
{
    // The criteria never change while iterating, so decide once which ones apply.
    $wantsText = !empty($query['query']);
    $wantsAmount = !empty($query['amount']);
    $wantsDate = !empty($query['date']);
    $wantsId = !empty($query['id']);

    // Parsed words of the free-text query; stays empty when no text was given.
    $needles = [];
    if ($wantsText) {
        $needles = $locale->parseWord($query['query']);
    }

    $ranked = [];
    foreach ($invoices as $invoice) {
        $tokens = $this->getTokens($locale, $invoice);

        // Feed this invoice's tokens to the median estimator before scoring.
        $this->updateMedianEstimator($tokens);

        $partial = [];

        // Similarity between the invoice tokens and the free-text query.
        if ($wantsText) {
            $partial['query'] = $this->getSimilarityScore($tokens, $needles);
        }

        // All-or-nothing score for the requested amount.
        if ($wantsAmount) {
            if ($this->containsAmount($tokens, $query['amount'])) {
                $partial['amount'] = 100;
            } else {
                $partial['amount'] = 0;
            }
        }

        // Submission date score.
        if ($wantsDate) {
            $partial['date'] = $this->getSubmissionDateScore($tokens, $query['date']);
        }

        // All-or-nothing score for the requested invoice ID.
        if ($wantsId) {
            if ($this->containsInvoiceId($tokens, $query['id'])) {
                $partial['id'] = 100;
            } else {
                $partial['id'] = 0;
            }
        }

        $ranked[] = ['invoice' => $invoice, 'score' => $partial];
    }

    // The medians are only computed after every invoice has been fed to the
    // estimator; the partial scores are then collapsed into the final score.
    $medians = $this->computeMedians();
    foreach (array_keys($ranked) as $position) {
        $ranked[$position]['score'] = $this->computeFinalScore(
            $position,
            $medians,
            $ranked[$position]['score']
        );
    }

    if ($this->cache) {
        $this->cache->commit();
    }

    return $ranked;
}
/**
 * Extracts the significant words of a text as Word tokens.
 *
 * A word is a run of three or more word characters. Words whose lowercase
 * form is listed in $this->frequentWords are discarded. The remaining words
 * are handed to AbstractTokenizer::mergeAdjacentWords() together with a
 * predicate that considers two words adjacent when exactly one space
 * character separates them in $text.
 *
 * @param string $text Text to tokenize.
 *
 * @return mixed The value returned by AbstractTokenizer::mergeAdjacentWords()
 *               (presumably a list of Word tokens; confirm against that method).
 */
public function parseWord($text)
{
    $tokens = [];

    // Get all words of 3+ characters. With PREG_OFFSET_CAPTURE every match is
    // a [text, offset] pair; the offset is a byte offset into $text.
    preg_match_all('/(\\w{3,})/u', $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);

    foreach ($matches as $match) {
        list($word, $offset) = $match[0];

        // Strict comparison: \w also matches digits, and a loose comparison
        // would treat numeric-looking words such as "1e3" and "1000" as equal.
        if (!in_array(mb_strtolower($word), $this->frequentWords, true)) {
            $tokens[] = new Word($offset, $word);
        }
    }

    // Merge adjacent words
    $tokens = AbstractTokenizer::mergeAdjacentWords($tokens, static function (Word $a, Word $b) use ($text) {
        // NOTE(review): getStart() is a byte offset here and $text[...] indexes
        // bytes; this assumes Word::getLength() is a byte length too. Confirm
        // for multibyte text.
        $delimiterOffset = $a->getStart() + $a->getLength();

        // Words are adjacent if separated by a single space
        return $delimiterOffset + 1 === $b->getStart() && $text[$delimiterOffset] === ' ';
    });

    return $tokens;
}