Author: yooper (yooper)
コード例 #1
0
ファイル: TextTest.php プロジェクト: yooper/php-text-analysis
 /**
  * Text::endsWith must report true for suffixes the word really ends with
  * and false for one it does not end with.
  */
 public function testEndsWith()
 {
     foreach (['s', 'es', 'hes'] as $suffix) {
         $this->assertTrue(Text::endsWith('lunches', $suffix));
     }

     $this->assertFalse(Text::endsWith('joe', 'is'));
 }
コード例 #2
0
 /**
  * Strip a trailing possessive marker ('s) from the word.
  *
  * Words that do not end with 's are returned unchanged. No case
  * conversion is performed.
  *
  * @param string $word
  * @return string the word without its trailing 's
  */
 public function transform($word)
 {
     if (!Text::endsWith($word, "'s")) {
         return $word;
     }

     // Before PHP 8.0 substr() returned false instead of '' when the cut
     // removes the entire string (i.e. the word is exactly "'s"). The cast
     // keeps the documented string return type on every PHP version.
     return (string) substr($word, 0, -2);
 }
コード例 #3
0
 /**
  * Try to convert a token into a date.
  *
  * @param string $token
  * @return false|\DateTime a DateTime built from the year, month and day
  *                         returned by Text::findDate(), or false when no
  *                         date was found or verify() rejected it
  */
 public function filter($token)
 {
     $parts = Text::findDate($token);

     // verify() is only consulted when findDate() produced something
     if (empty($parts) || !$this->verify($parts)) {
         return false;
     }

     return new DateTime("{$parts['year']}-{$parts['month']}-{$parts['day']}");
 }
コード例 #4
0
 /**
  * Text::getAllSubStrings('abc') must return exactly the six expected
  * substrings of 'abc'.
  */
 public function testAllSubstrings()
 {
     $actual = Text::getAllSubStrings('abc');

     $this->assertCount(6, $actual);
     $this->assertEquals(['a', 'ab', 'abc', 'b', 'bc', 'c'], $actual);
 }
コード例 #5
0
 /**
  * Collect every indexed term that contains at least one of the query terms.
  *
  * @param InvertedIndex $invertedIndex
  * @return array map of matching index term => value returned by
  *               getDocumentIdsByTerm() for that term
  */
 public function queryIndex(InvertedIndex $invertedIndex)
 {
     // Fetched once up front; assumes getQuery() is a plain accessor whose
     // result does not change while the index is scanned.
     $queryTerms = $this->getQuery();
     $found = [];
     foreach (array_keys($invertedIndex->getIndex()) as $term) {
         foreach ($queryTerms as $queryTerm) {
             if (Text::contains($term, $queryTerm)) {
                 $found[$term] = $invertedIndex->getDocumentIdsByTerm($term);
                 // One matching query term is enough: any further match
                 // would only repeat the same lookup for the same key.
                 break;
             }
         }
     }
     return $found;
 }
 /**
  * Returns the longest common substring of the two texts.
  *
  * When several common substrings share the maximum length, the first one
  * encountered in the intersection is returned. An empty string is returned
  * when the texts share no substring.
  * @param string $text1
  * @param string $text2
  * @return string
  */
 public function similarity($text1, $text2)
 {
     $intersection = array_intersect(Text::getAllSubStrings($text1), Text::getAllSubStrings($text2));
     $max = 0;
     $lcs = '';
     foreach ($intersection as $substr) {
         // Measure in characters rather than bytes so that multibyte
         // substrings are not ranked as longer than they really are.
         $length = mb_strlen($substr);
         if ($length > $max) {
             $max = $length;
             $lcs = $substr;
         }
     }
     return $lcs;
 }
コード例 #7
0
 /**
  * Extract a date from each sentence, skipping sentences without one.
  *
  * The result is stored in $this->dates and reused on later calls for as
  * long as it is non-empty.
  * @return DateTime[]
  */
 public function getDates()
 {
     // return the cached copy
     if (empty($this->dates)) {
         $getDateFunc = function ($sentence) {
             $date = Text::findDate($sentence);
             // Nothing found in this sentence: yield null so the
             // array_filter() below drops it, instead of trying to build a
             // DateTime from the meaningless string "--".
             if (empty($date)) {
                 return null;
             }
             return new DateTime("{$date['year']}-{$date['month']}-{$date['day']}");
         };
         $this->dates = array_map($getDateFunc, $this->sentences);
         // re-index so nulls and offsets are correct.
         $this->dates = array_values(array_filter($this->dates));
     }
     return $this->dates;
 }
コード例 #8
0
 /**
  * Look up the first query term exactly; if that yields nothing, fall back
  * to every indexed term that contains the query string.
  *
  * @param InvertedIndex $invertedIndex
  * @return array map of term => value returned by getDocumentIdsByTerm()
  */
 public function queryIndex(InvertedIndex $invertedIndex)
 {
     $exactMatches = $invertedIndex->getDocumentIdsByTerm($this->getQuery()[0]);
     if (!empty($exactMatches)) {
         return [$this->getQuery()[0] => $exactMatches];
     }

     // no exact hit: scan the index for partial matches
     $partialMatches = [];
     foreach (array_keys($invertedIndex->getIndex()) as $indexedTerm) {
         if (!Text::contains($indexedTerm, $this->getQueryString())) {
             continue;
         }
         $partialMatches[$indexedTerm] = $invertedIndex->getDocumentIdsByTerm($indexedTerm);
     }

     return $partialMatches;
 }
コード例 #9
0
 /**
  * Returns the longest common substring of the two texts.
  *
  * When $this->useCache is set, the substrings of $text2 are computed once
  * and kept in $this->cache keyed by $text2.
  * @param string $text1
  * @param string $text2
  * @return string the first longest shared substring, or '' if none
  */
 public function similarity($text1, $text2)
 {
     $mustFillCache = $this->useCache && !isset($this->cache[$text2]);
     if ($mustFillCache) {
         $this->cache[$text2] = Text::getAllSubStrings($text2);
     }

     $left = Text::getAllSubStrings($text1);
     if ($this->useCache) {
         $right = $this->cache[$text2];
     } else {
         $right = Text::getAllSubStrings($text2);
     }

     $longest = '';
     $longestLength = 0;
     foreach (array_intersect($left, $right) as $candidate) {
         $candidateLength = mb_strlen($candidate);
         // only a strictly longer candidate replaces the current best
         if ($candidateLength <= $longestLength) {
             continue;
         }
         $longestLength = $candidateLength;
         $longest = $candidate;
     }

     return $longest;
 }
コード例 #10
0
 /**
  * Concept taken from nltk
  * Find a possible base form for the given form, with the given
  * part of speech, by checking WordNet's list of exceptional
  * forms, and by stripping affixes for each part of speech until a
  * form with a lemma is found.
  *
  * Words shorter than three characters are never morphed. When $pos is
  * empty the exception lists of every part of speech are consulted.
  * @todo improve the algorithm, it is really slow
  * @param string $word
  * @param string|null $pos
  * @return string the base word, or an empty string when none is found
  */
 public function getMorph($word, $pos = '')
 {
     if (mb_strlen($word) < 3) {
         return "";
     }

     $restrictToPos = !empty($pos);
     $matchesException = function (ExceptionMap $exceptionMap) use ($word, $pos, $restrictToPos) {
         if ($restrictToPos && $exceptionMap->getPos() !== $pos) {
             return false;
         }
         // Strict comparison: a loose in_array() treats numeric-looking
         // strings such as "1e3" and "1000" as equal.
         return in_array($word, $exceptionMap->getExceptionList(), true);
     };

     $exceptions = array_filter($this->getWordnetCorpus()->getExceptionsMap(), $matchesException);
     // found a match in the exceptions data
     if (!empty($exceptions)) {
         return array_values($exceptions)[0]->getTarget();
     }

     foreach ($this->getMorphilogicalSubstitutions() as $keyPos => $keyValues) {
         foreach ($keyValues as $key => $value) {
             if (!Text::endsWith($word, $key)) {
                 continue;
             }
             // swap the matched suffix for its replacement
             $morphedWord = substr($word, 0, -strlen($key)) . $value;
             $lemmas = $this->getLemma($morphedWord, $keyPos);
             if (!empty($lemmas)) {
                 // array_values() makes "first lemma" independent of how
                 // getLemma() keyed its result.
                 return array_values($lemmas)[0]->getWord();
             }
         }
     }

     // neither an exception entry nor any suffix substitution matched
     return "";
 }