/**
 * Text::endsWith must accept suffixes of varying length and reject a
 * string that is not a suffix of the subject.
 */
public function testEndsWith()
{
    // Each of these is a genuine suffix of 'lunches'.
    foreach (['s', 'es', 'hes'] as $suffix) {
        $this->assertTrue(Text::endsWith('lunches', $suffix));
    }

    // 'is' is not a suffix of 'joe'.
    $this->assertFalse(Text::endsWith('joe', 'is'));
}
/**
 * Strip a trailing possessive marker ('s) from the word.
 * Words that do not end in 's are returned untouched.
 *
 * @param string $word
 * @return string
 */
public function transform($word)
{
    if (!Text::endsWith($word, "'s")) {
        return $word;
    }

    // Drop the final two characters: the apostrophe and the "s".
    return substr($word, 0, -2);
}
/**
 * Try to turn a token into a date.
 *
 * The token is handed to Text::findDate(); the result is accepted only if
 * it is non-empty and passes $this->verify().
 *
 * @param string $token
 * @return false|\DateTime the date built from the year/month/day parts, or
 *                         false when no acceptable date was found
 */
public function filter($token)
{
    $parts = Text::findDate($token);

    // Short-circuit keeps verify() from being called on an empty result.
    if (empty($parts) || !$this->verify($parts)) {
        return false;
    }

    return new DateTime("{$parts['year']}-{$parts['month']}-{$parts['day']}");
}
/**
 * Text::getAllSubStrings('abc') must return the six substrings of 'abc'
 * in the expected order.
 */
public function testAllSubstrings()
{
    $actual = Text::getAllSubStrings('abc');

    $this->assertCount(6, $actual);
    $this->assertEquals(['a', 'ab', 'abc', 'b', 'bc', 'c'], $actual);
}
/**
 * Collect every index term that contains at least one of the query terms.
 *
 * @param InvertedIndex $invertedIndex
 * @return array map of matching index term => result of
 *               getDocumentIdsByTerm() for that term
 */
public function queryIndex(InvertedIndex $invertedIndex)
{
    // The query does not change while scanning, so fetch it once.
    $queryTerms = $this->getQuery();

    $found = [];
    foreach (array_keys($invertedIndex->getIndex()) as $term) {
        foreach ($queryTerms as $queryTerm) {
            if (Text::contains($term, $queryTerm)) {
                $found[$term] = $invertedIndex->getDocumentIdsByTerm($term);
                // One matching query term is enough; further matches would
                // only repeat the same lookup for the same key.
                break;
            }
        }
    }

    return $found;
}
/**
 * Returns the longest common substring of the two texts.
 *
 * Candidates are the substrings present in both Text::getAllSubStrings()
 * results. Length is measured in characters (mb_strlen), not bytes, so a
 * single multibyte character is not ranked above a longer ASCII substring.
 * On a tie the candidate encountered first wins.
 *
 * @param string $text1
 * @param string $text2
 * @return string the longest shared substring, or '' when nothing is shared
 */
public function similarity($text1, $text2)
{
    $intersection = array_intersect(
        Text::getAllSubStrings($text1),
        Text::getAllSubStrings($text2)
    );

    $max = 0;
    $lcs = '';
    foreach ($intersection as $substr) {
        $length = mb_strlen($substr);
        if ($length > $max) {
            $max = $length;
            $lcs = $substr;
        }
    }

    return $lcs;
}
/**
 * Extract one date per sentence, skipping sentences without a date.
 *
 * The result is stored in $this->dates and reused while it is non-empty.
 *
 * @return DateTime[] zero-indexed list of the dates that were found
 */
public function getDates()
{
    // Only compute when there is no cached copy.
    if (empty($this->dates)) {
        $getDateFunc = function ($sentence) {
            $date = Text::findDate($sentence);

            // No date in this sentence: return null so array_filter drops
            // it, instead of handing DateTime the unparsable string "--".
            if (empty($date)) {
                return null;
            }

            return new DateTime("{$date['year']}-{$date['month']}-{$date['day']}");
        };

        $this->dates = array_map($getDateFunc, $this->sentences);

        // Remove the nulls and re-index so offsets are contiguous.
        $this->dates = array_values(array_filter($this->dates));
    }

    return $this->dates;
}
/**
 * Look up the first query term in the index; when that lookup yields
 * nothing, fall back to every index term that contains the query string.
 *
 * @param InvertedIndex $invertedIndex
 * @return array map of index term => result of getDocumentIdsByTerm()
 */
public function queryIndex(InvertedIndex $invertedIndex)
{
    $exactHits = $invertedIndex->getDocumentIdsByTerm($this->getQuery()[0]);
    if (!empty($exactHits)) {
        return [$this->getQuery()[0] => $exactHits];
    }

    // Nothing for the first term itself: do partial matches instead.
    $partialHits = [];
    foreach (array_keys($invertedIndex->getIndex()) as $indexedTerm) {
        if (!Text::contains($indexedTerm, $this->getQueryString())) {
            continue;
        }
        $partialHits[$indexedTerm] = $invertedIndex->getDocumentIdsByTerm($indexedTerm);
    }

    return $partialHits;
}
/**
 * Find the longest substring shared by both inputs.
 *
 * When $this->useCache is truthy, the substring list of $text2 is kept in
 * $this->cache (keyed by $text2) and reused on later calls.
 *
 * @param string $text1
 * @param string $text2
 * @return string the longest shared substring (the first one found on a
 *                tie), or '' when nothing is shared
 */
public function similarity($text1, $text2)
{
    // Fill the cache entry for $text2 on first use.
    if ($this->useCache && !isset($this->cache[$text2])) {
        $this->cache[$text2] = Text::getAllSubStrings($text2);
    }

    $left = Text::getAllSubStrings($text1);
    if ($this->useCache) {
        $right = $this->cache[$text2];
    } else {
        $right = Text::getAllSubStrings($text2);
    }

    $best = '';
    $bestLength = 0;
    foreach (array_intersect($left, $right) as $candidate) {
        $length = mb_strlen($candidate);
        if ($length <= $bestLength) {
            continue;
        }
        $bestLength = $length;
        $best = $candidate;
    }

    return $best;
}
/**
 * Concept taken from nltk.
 *
 * Find a possible base form for the given word. The WordNet exception
 * lists are checked first; if the word is not listed, each morphological
 * substitution whose suffix matches the word is tried, and the first
 * rewritten form for which getLemma() returns a result is used.
 *
 * @todo improve the algorithm, it is really slow
 * @param string $word
 * @param string|null $pos part of speech; when empty, the exception
 *                         lists of every part of speech are searched
 * @return string the base word, or "" when the word is shorter than three
 *                characters or no base form was found
 */
public function getMorph($word, $pos = '')
{
    if (mb_strlen($word) < 3) {
        return "";
    }

    // Exceptions first. Only the first match is ever used, so stop at it
    // rather than filtering the whole exceptions map.
    $restrictToPos = !empty($pos);
    foreach ($this->getWordnetCorpus()->getExceptionsMap() as $exceptionMap) {
        if ($restrictToPos && $exceptionMap->getPos() !== $pos) {
            continue;
        }
        if (in_array($word, $exceptionMap->getExceptionList())) {
            return $exceptionMap->getTarget();
        }
    }

    // No exception applies: swap matching suffixes until a lemma is found.
    foreach ($this->getMorphilogicalSubstitutions() as $keyPos => $keyValues) {
        foreach ($keyValues as $suffix => $replacement) {
            if (!Text::endsWith($word, $suffix)) {
                continue;
            }

            $morphedWord = substr($word, 0, -strlen($suffix)) . $replacement;
            $lemmas = $this->getLemma($morphedWord, $keyPos);
            if (!empty($lemmas)) {
                // array_values() so the first lemma is picked even when the
                // result is not indexed from 0.
                return array_values($lemmas)[0]->getWord();
            }
        }
    }

    return "";
}