/**
 * NGramFactory::create() with the TRIGRAM size should yield every run of
 * three consecutive tokens, joined by single spaces, in input order.
 */
public function testTriGram()
{
    $words = ["one", "two", "three", "four"];

    $trigrams = NGramFactory::create($words, NGramFactory::TRIGRAM);

    $this->assertEquals(
        [
            "one two three",
            "two three four",
        ],
        $trigrams
    );
}
/**
 * Compute the Pointwise Mutual Information on the collocations.
 *
 * For every n-gram built from $this->tokens (size $this->nGramSize) the score
 * is the natural log of the n-gram's weight divided by the product of the
 * weights of the unigrams it is made of. Weights are whatever
 * FreqDist::getKeyValuesByWeight() reports for each key.
 *
 * @return array n-gram string => PMI score, ordered from highest to lowest score
 */
public function getCollocationsByPmi()
{
    $nGramFreqDist = new FreqDist(NGramFactory::create($this->tokens, $this->nGramSize));
    $unigramsFreqDist = new FreqDist($this->tokens);

    // Both weight tables are loop-invariant, so fetch them once instead of
    // once per n-gram and once per unigram inside the loops below.
    // NOTE(review): assumes getKeyValuesByWeight() is a plain accessor that
    // returns the same table on every call - confirm against FreqDist.
    $nGramWeights = $nGramFreqDist->getKeyValuesByWeight();
    $unigramWeights = $unigramsFreqDist->getKeyValuesByWeight();

    $dataSet = [];
    foreach ($nGramFreqDist->getKeys() as $nGramToken) {
        // Split the n-gram back into its unigrams; this relies on the
        // n-grams having been joined with a single space.
        $tally = 1;
        foreach (explode(" ", $nGramToken) as $unigramToken) {
            $tally *= $unigramWeights[$unigramToken];
        }

        $dataSet[$nGramToken] = log($nGramWeights[$nGramToken] / $tally);
    }

    // Highest score first, keeping the n-gram keys attached to their scores.
    arsort($dataSet);

    return $dataSet;
}
/**
 * Get all the possible phrases.
 *
 * Builds every n-gram from $this->nGramSize words down to 2 words, then keeps
 * only the phrases that are not contained in a longer phrase.
 *
 * @return array list of phrase strings, longest phrases first
 */
public function getPhrases()
{
    $tokens = $this->getTokens();

    // Longest n-grams come first; the exclusion pass below depends on it.
    $phrases = [];
    for ($index = $this->nGramSize; $index >= 2; $index--) {
        $phrases = array_merge($phrases, NGramFactory::create($tokens, $index));
    }

    // You cannot use a phrase if it is a substring of a longer phrase, so
    // every phrase (kept or not) marks the two phrases obtained by dropping
    // its last or its first word. Marking from excluded phrases as well is
    // what makes the exclusion reach all shorter sub-phrases: skipping them
    // would let e.g. the 2-grams inside a 4-gram slip back into the result.
    $add = [];
    $remove = [];
    foreach ($phrases as $phrase) {
        if (isset($add[$phrase])) {
            // duplicate of a phrase that was already accepted
            continue;
        }

        $firstSpace = strpos($phrase, " ");
        if ($firstSpace !== false) {
            // remove the suffix word
            $remove[substr($phrase, 0, strrpos($phrase, " "))] = true;
            // remove the prefix word
            $remove[substr($phrase, $firstSpace + 1)] = true;
        }

        // Sub-phrases are strictly shorter, so the marks set just above can
        // never exclude the current phrase itself.
        if (!isset($remove[$phrase])) {
            $add[$phrase] = true;
        }
    }

    return array_keys($add);
}
/**
 * Build n-grams of size 3 from the given tokens.
 *
 * Thin convenience wrapper around \TextAnalysis\NGrams\NGramFactory::create().
 *
 * @param array $tokens tokens to combine
 * @param string $separator forwarded unchanged to NGramFactory::create()
 * @return array whatever NGramFactory::create() returns for size 3
 */
function trigrams(array $tokens, $separator = ' ')
{
    $size = 3;

    return \TextAnalysis\NGrams\NGramFactory::create($tokens, $size, $separator);
}