예제 #1
0
 /**
  * Scores every invoice against the supplied search criteria.
  *
  * Recognised keys of $query are 'query', 'amount', 'date' and 'id'. A key
  * that is missing or empty() is skipped and produces no partial score.
  *
  * @param AbstractTokenizer $locale   Tokenizer used to parse the free-text query
  *                                    and passed through to getTokens().
  * @param iterable          $invoices Invoices to score.
  * @param array             $query    Search criteria.
  *
  * @return array List of ['invoice' => ..., 'score' => ...] entries, in the
  *               iteration order of $invoices; 'score' holds whatever
  *               computeFinalScore() returns for that entry.
  */
 public function query(AbstractTokenizer $locale, $invoices, array $query = [])
 {
     // Resolve each criterion once; empty() semantics decide whether it applies.
     $text = isset($query['query']) ? $query['query'] : null;
     $amount = isset($query['amount']) ? $query['amount'] : null;
     $date = isset($query['date']) ? $query['date'] : null;
     $invoiceId = isset($query['id']) ? $query['id'] : null;

     $useText = !empty($text);
     $useAmount = !empty($amount);
     $useDate = !empty($date);
     $useId = !empty($invoiceId);

     // The free-text query is tokenized a single time and reused for every invoice.
     $needles = [];
     if ($useText) {
         $needles = $locale->parseWord($text);
     }

     $results = [];
     foreach ($invoices as $invoice) {
         $tokens = $this->getTokens($locale, $invoice);

         // Feed this invoice's tokens to the median estimator before scoring.
         $this->updateMedianEstimator($tokens);

         // Partial scores are added in a fixed order: query, amount, date, id.
         $partial = [];
         if ($useText) {
             $partial['query'] = $this->getSimilarityScore($tokens, $needles);
         }
         if ($useAmount) {
             // Binary score: 100 when the amount is found, 0 otherwise.
             $found = $this->containsAmount($tokens, $amount);
             $partial['amount'] = $found ? 100 : 0;
         }
         if ($useDate) {
             $partial['date'] = $this->getSubmissionDateScore($tokens, $date);
         }
         if ($useId) {
             // Binary score: 100 when the invoice ID is found, 0 otherwise.
             $found = $this->containsInvoiceId($tokens, $invoiceId);
             $partial['id'] = $found ? 100 : 0;
         }

         $results[] = [
             'invoice' => $invoice,
             'score' => $partial,
         ];
     }

     // Medians are only available once every invoice has been seen, so the
     // partial scores are replaced by final scores in a second pass.
     $medians = $this->computeMedians();
     foreach (array_keys($results) as $index) {
         $results[$index]['score'] = $this->computeFinalScore(
             $index,
             $medians,
             $results[$index]['score']
         );
     }

     if ($this->cache) {
         $this->cache->commit();
     }

     return $results;
 }
예제 #2
0
 /**
  * Splits a text into Word tokens.
  *
  * Every run of 3 or more word characters becomes a candidate; candidates
  * whose lower-cased form is listed in $this->frequentWords are dropped.
  * The remaining words are handed to AbstractTokenizer::mergeAdjacentWords()
  * with a predicate that treats two words as adjacent when exactly one
  * space character separates them.
  *
  * @param string $text Text to tokenize.
  *
  * @return array Tokens as returned by AbstractTokenizer::mergeAdjacentWords().
  */
 public function parseWord($text)
 {
     $tokens = [];
     // Get all words of 3+ characters. PREG_OFFSET_CAPTURE reports byte
     // offsets into $text, even with the /u modifier.
     preg_match_all('/(\\w{3,})/u', $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);
     foreach ($matches as $match) {
         $word = $match[0][0];
         $offset = $match[0][1];
         // Strict comparison: \w also matches digits, and a loose in_array()
         // would compare numeric-looking strings by value ("1e3" == "1000",
         // "007" == "7") and drop tokens that are not actually in the list.
         if (!in_array(mb_strtolower($word), $this->frequentWords, true)) {
             $tokens[] = new Word($offset, $word);
         }
     }
     // Merge adjacent words
     $tokens = AbstractTokenizer::mergeAdjacentWords($tokens, function (Word $a, Word $b) use($text) {
         // NOTE(review): $text[...] indexes bytes, so this assumes
         // Word::getLength() is a byte length as well — confirm.
         $delimiterOffset = $a->getStart() + $a->getLength();
         // Words are adjacent if separated by a single space. The offset
         // check runs first, so $text is only indexed when $b starts right
         // after the delimiter position.
         return $delimiterOffset + 1 === $b->getStart() && $text[$delimiterOffset] === ' ';
     });
     return $tokens;
 }