/**
 * NGramFactory::create() with the TRIGRAM size must yield every run of
 * three consecutive tokens, each joined by a single space.
 */
public function testTriGram()
 {
     $expectedTrigrams = ["one two three", "two three four"];
     $actualTrigrams = NGramFactory::create(
         ["one", "two", "three", "four"],
         NGramFactory::TRIGRAM
     );
     $this->assertEquals($expectedTrigrams, $actualTrigrams);
 }
 /**
  * Compute the Pointwise Mutual Information (PMI) score of every n-gram
  * built from the tokens.
  *
  * Each score is log(w(ngram) / (w(word_1) * ... * w(word_n))), where w()
  * is the value reported by FreqDist::getKeyValuesByWeight() for the n-gram
  * and for each of its space-separated words.
  *
  * @return array map of n-gram string => PMI score, highest score first
  */
 public function getCollocationsByPmi()
 {
     $nGramFreqDist = new FreqDist(NGramFactory::create($this->tokens, $this->nGramSize));
     $unigramsFreqDist = new FreqDist($this->tokens);

     // Fetch both weight tables once up front: neither distribution is
     // modified inside the loop, so there is no need to ask for them again
     // on every iteration.
     $nGramWeights = $nGramFreqDist->getKeyValuesByWeight();
     $unigramWeights = $unigramsFreqDist->getKeyValuesByWeight();

     $dataSet = [];
     foreach ($nGramFreqDist->getKeys() as $nGramToken) {
         // multiply together the weights of the individual words of the n-gram
         $tally = 1;
         foreach (explode(" ", $nGramToken) as $unigramToken) {
             $tally *= $unigramWeights[$unigramToken];
         }
         // ratio of the n-gram's own weight to the product of its words' weights
         $dataSet[$nGramToken] = log($nGramWeights[$nGramToken] / $tally);
     }
     // sort by score, descending, keeping the n-gram keys attached
     arsort($dataSet);
     return $dataSet;
 }
示例#3
0
 /**
  * Collect candidate phrases of every size from nGramSize down to 2 words,
  * dropping shorter phrases that are covered by a longer kept phrase.
  *
  * Candidates are visited longest-first. Every phrase that is kept marks its
  * two one-word-shorter sub-phrases (without the last word, and without the
  * first word) as excluded, so those are skipped when they come up later.
  *
  * @return array the kept phrases, in the order they were first encountered
  */
 public function getPhrases()
 {
     // build the candidate list, largest n-gram size first
     $candidates = [];
     $size = $this->nGramSize;
     while ($size >= 2) {
         $candidates = array_merge($candidates, NGramFactory::create($this->getTokens(), $size));
         $size--;
     }

     $kept = [];
     $excluded = [];
     foreach ($candidates as $candidate) {
         // skip anything covered by a longer phrase, or already recorded
         if (isset($excluded[$candidate]) || isset($kept[$candidate])) {
             continue;
         }
         $kept[$candidate] = true;

         // everything before the last space: the phrase minus its final word
         $withoutLastWord = substr($candidate, 0, strrpos($candidate, " "));
         // everything after the first space: the phrase minus its leading word
         $withoutFirstWord = substr($candidate, strpos($candidate, " ") + 1);

         $excluded[$withoutLastWord] = true;
         $excluded[$withoutFirstWord] = true;
     }
     return array_keys($kept);
 }
示例#4
0
/**
 * Convenience wrapper: build n-grams of size 3 from the given tokens by
 * delegating to \TextAnalysis\NGrams\NGramFactory::create().
 *
 * @param array $tokens the tokens to group
 * @param string $separator passed through to the factory unchanged
 *                          (presumably the glue placed between the tokens of each n-gram)
 * @return array whatever the factory returns for size 3
 */
function trigrams(array $tokens, $separator = ' ')
{
    $trigramSize = 3;

    return \TextAnalysis\NGrams\NGramFactory::create($tokens, $trigramSize, $separator);
}