/**
  * A distribution built from a single token must report that token as its
  * only hapax, and must count exactly one token overall and one unique token.
  */
public function testSingleHapaxFreqDist()
 {
     $freqDist = new FreqDist(array("time"));
     // assertCount reports the actual element count on failure, unlike
     // assertTrue(count(...) === 1) which only says "false is not true"
     $this->assertCount(1, $freqDist->getHapaxes());
     $this->assertEquals(1, $freqDist->getTotalTokens());
     $this->assertEquals(1, $freqDist->getTotalUniqueTokens());
 }
Esempio n. 2
0
 /**
  * Get the term frequency of a token within a single document.
  *
  * A FreqDist is built from the document's data and the frequency it reports
  * for the token is weighted according to $mode:
  *  - self::BOOLEAN_MODE: 1 when the token occurs in the document
  *  - self::LOGARITHMIC_MODE: log(frequency + 1)
  *  - self::AUGMENTED_MODE: 0.5 + 0.5 * frequency / highest frequency in the document
  *  - self::FREQUENCY_MODE, or any unrecognised mode: the frequency itself
  *
  * @param DocumentAbstract $document - the document to evaluate
  * @param string $token The token to look for
  * @param int $mode The type of term frequency to use
  * @return int|float 0 when the token does not occur in the document
  */
 public function getTermFrequency(DocumentAbstract $document, $token, $mode = 1)
 {
     $freqDist = new FreqDist($document->getDocumentData());
     $keyValuesByWeight = $freqDist->getKeyValuesByFrequency();
     //The token does not exist in the document
     if (!isset($keyValuesByWeight[$token])) {
         return 0;
     }
     switch ($mode) {
         case self::BOOLEAN_MODE:
             //existence of the token was already verified above
             return 1;
         case self::LOGARITHMIC_MODE:
             return log($keyValuesByWeight[$token] + 1);
         case self::AUGMENTED_MODE:
             //take the maximum explicitly instead of relying on the sort
             //order of getKeyValuesByFrequency(); the array is non-empty
             //here because the token was found in it
             $maxFrequency = max($keyValuesByWeight);
             return 0.5 + 0.5 * $keyValuesByWeight[$token] / $maxFrequency;
         case self::FREQUENCY_MODE:
         default:
             return $keyValuesByWeight[$token];
     }
 }
 /**
  * Populates $this->index from every document in the given collection.
  *
  * Each term maps to an entry holding the term's frequency summed over all
  * documents (self::FREQ) and the list of ids of the documents in which the
  * term was seen (self::POSTINGS).
  *
  * @param ICollection $collection
  */
 protected function buildIndex(ICollection $collection)
 {
     foreach ($collection as $docId => $doc) {
         $distribution = new FreqDist($doc->getDocumentData());
         $termFrequencies = $distribution->getKeyValuesByFrequency();
         foreach ($termFrequencies as $term => $frequency) {
             // start from the existing entry, or an empty one for a new term
             $entry = isset($this->index[$term])
                 ? $this->index[$term]
                 : array(self::FREQ => 0, self::POSTINGS => array());
             $entry[self::FREQ] += $frequency;
             $entry[self::POSTINGS][] = $docId;
             $this->index[$term] = $entry;
         }
     }
 }
 /**
  * Compute the Pointwise Mutual Information on the collocations.
  *
  * For every n-gram built from $this->tokens, the score is
  * log(weight(n-gram) / product of weight(unigram) over its parts), where the
  * weights are the values returned by FreqDist::getKeyValuesByWeight().
  *
  * @return array score per n-gram, keyed by the n-gram string and sorted
  *               from highest to lowest score
  */
 public function getCollocationsByPmi()
 {
     $nGramFreqDist = new FreqDist(NGramFactory::create($this->tokens, $this->nGramSize));
     $unigramsFreqDist = new FreqDist($this->tokens);
     $nGramKeys = $nGramFreqDist->getKeys();
     // the weight tables do not change while iterating, so fetch them once
     // rather than once per n-gram and once per unigram
     $nGramWeights = $nGramFreqDist->getKeyValuesByWeight();
     $unigramWeights = $unigramsFreqDist->getKeyValuesByWeight();
     $dataSet = [];
     foreach ($nGramKeys as $nGramToken) {
         // NOTE(review): assumes NGramFactory joins the parts of an n-gram
         // with a single space - confirm against the factory
         $tokens = explode(" ", $nGramToken);
         // product of the weights of the individual tokens
         $tally = 1;
         foreach ($tokens as $unigramToken) {
             $tally *= $unigramWeights[$unigramToken];
         }
         $dataSet[$nGramToken] = log($nGramWeights[$nGramToken] / $tally);
     }
     // highest scoring collocations first, keys preserved
     arsort($dataSet);
     return $dataSet;
 }