/**
 * Get the term frequency of a token within a single document.
 *
 * The raw count of the token is taken from a FreqDist built over the
 * document data and is then weighted according to $mode:
 *  - self::BOOLEAN_MODE     1 when the token occurs in the document
 *  - self::LOGARITHMIC_MODE log(count + 1)
 *  - self::AUGMENTED_MODE   0.5 + 0.5 * count / (highest count in the document)
 *  - self::FREQUENCY_MODE   the raw count (also used for any unrecognized mode)
 *
 * Regardless of mode, 0 is returned when the token does not occur in the
 * document.
 *
 * @param DocumentAbstract $document The document to evaluate
 * @param string $token The token to look for
 * @param int $mode The type of term frequency to use (one of the *_MODE constants)
 * @return int|float
 */
public function getTermFrequency(DocumentAbstract $document, $token, $mode = 1)
{
    $freqDist = new FreqDist($document->getDocumentData());
    $keyValuesByWeight = $freqDist->getKeyValuesByFrequency();

    // The token does not exist in the document
    if (!isset($keyValuesByWeight[$token])) {
        return 0;
    }

    $frequency = $keyValuesByWeight[$token];

    switch ($mode) {
        case self::BOOLEAN_MODE:
            // Existence of the token was already established above
            return 1;

        case self::LOGARITHMIC_MODE:
            return log($frequency + 1);

        case self::AUGMENTED_MODE:
            // Use max() rather than the first element so the result does not
            // depend on how FreqDist happens to order its key/value pairs.
            // The array is non-empty here because $token is one of its keys.
            $maxFrequency = max($keyValuesByWeight);
            return 0.5 + 0.5 * $frequency / $maxFrequency;

        case self::FREQUENCY_MODE:
        default:
            return $frequency;
    }
}
/**
 * Populates the internal index from every document in the given collection.
 *
 * For each term found in a document, the index entry for that term has its
 * self::FREQ total increased by the term's frequency in the document, and
 * the document's collection key is appended to its self::POSTINGS list.
 *
 * @param ICollection $collection The documents to index
 */
protected function buildIndex(ICollection $collection)
{
    // Single pass over the collection: accumulate frequencies and postings per term
    foreach ($collection as $docId => $doc) {
        $distribution = new FreqDist($doc->getDocumentData());
        $termCounts = $distribution->getKeyValuesByFrequency();

        foreach ($termCounts as $word => $count) {
            // First time this term is seen: start with an empty entry
            if (!isset($this->index[$word])) {
                $this->index[$word] = array(
                    self::FREQ => 0,
                    self::POSTINGS => array(),
                );
            }

            $this->index[$word][self::POSTINGS][] = $docId;
            $this->index[$word][self::FREQ] += $count;
        }
    }
}