/**
 * Convert every document of a training set into its feature array.
 *
 * The feature factory is invoked once per document with an empty string
 * as the class argument, and the results are collected in iteration order.
 *
 * @param  TrainingSet             $documents The documents to transform
 * @param  FeatureFactoryInterface $ff        The feature factory producing the feature arrays
 * @return array One feature array per document, indexed from 0
 */
protected function getDocumentArray(TrainingSet $documents, FeatureFactoryInterface $ff)
{
    $featureVectors = array();

    foreach ($documents as $document) {
        // No class is known at this point, hence the empty class name.
        $vector = $ff->getFeatureArray('', $document);
        $featureVectors[] = $vector;
    }

    return $featureVectors;
}
/**
 * Calculate the probability that document $d belongs to the class
 * $class given a set of possible classes, a feature factory and
 * the model's weights l[i].
 *
 * The score of a class is the sum of the weights of the features the
 * factory returns for that (class, document) pair; the probability is
 * the exponentiated score of $class divided by the sum of the
 * exponentiated scores of all $classes.
 *
 * Features that have no entry in the weights are ignored (they
 * contribute 0 to the score). If $class is not one of $classes, or
 * $classes is empty, 0.0 is returned.
 *
 * @param  array                   $classes The set of possible classes
 * @param  FeatureFactoryInterface $ff      The feature factory
 * @param  DocumentInterface       $d       The document
 * @param  string                  $class   A class for which we calculate the probability
 * @return float The probability that document $d belongs to class $class
 */
public function P(array $classes, FeatureFactoryInterface $ff, DocumentInterface $d, $class)
{
    $scores = array();
    foreach ($classes as $cl) {
        $score = 0.0;
        foreach ($ff->getFeatureArray($cl, $d) as $i) {
            // A feature without a learned weight must not raise an
            // undefined index error; it simply adds nothing.
            if (isset($this->l[$i])) {
                $score += $this->l[$i];
            }
        }
        $scores[$cl] = $score;
    }

    // Covers both an unknown $class and an empty $classes array, which
    // would otherwise lead to an undefined index or a division by zero.
    if (!isset($scores[$class])) {
        return 0.0;
    }

    // Subtracting the largest score before exponentiating leaves the
    // ratio unchanged but keeps exp() from overflowing to INF (which
    // would turn the result into NAN) when the scores are large.
    $shift = max($scores);
    $normalizer = 0.0;
    foreach ($scores as $score) {
        $normalizer += exp($score - $shift);
    }

    return exp($scores[$class] - $shift) / $normalizer;
}
/**
 * Count all the features for each document. All counters are passed
 * by reference and they are filled in this function. Useful for not
 * making copies of big arrays.
 *
 * Counters are accumulated on top of whatever the caller passes in;
 * per-class entries that do not exist yet are started at 0, so the
 * caller may pass empty arrays as well as pre-seeded ones.
 *
 * @param  FeatureFactoryInterface $ff                  A feature factory to create the features for each document in the set
 * @param  TrainingSet             $tset                The training set (collection of labeled documents)
 * @param  array                   $termcount_per_class The total count of feature occurrences in each class, indexed by class
 * @param  array                   $termcount           The count of occurrences of each feature in each class, indexed by class and then by feature
 * @param  array                   $ndocs_per_class     The total number of documents per class
 * @param  array                   $voc                 A set of the found features (feature => 0 for newly found features)
 * @param  integer                 $ndocs               The number of documents
 * @return void
 */
protected function countTrainingSet(FeatureFactoryInterface $ff, TrainingSet $tset, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, array &$voc, &$ndocs)
{
    foreach ($tset as $tdoc) {
        $ndocs++;
        $c = $tdoc->getClass();

        // Start the per-class counters at 0 the first time a class is
        // seen instead of relying on the caller having seeded them.
        if (!isset($ndocs_per_class[$c])) {
            $ndocs_per_class[$c] = 0;
        }
        if (!isset($termcount_per_class[$c])) {
            $termcount_per_class[$c] = 0;
        }
        $ndocs_per_class[$c]++;

        $features = $ff->getFeatureArray($c, $tdoc);
        // An integer key at the array's current position is taken to mean
        // a plain list of features, which is collapsed into a
        // feature => count map; any other array is used as such a map as is.
        if (is_int(key($features))) {
            $features = array_count_values($features);
        }

        foreach ($features as $f => $fcnt) {
            // Register the feature in the vocabulary without touching
            // an entry that is already there.
            if (!isset($voc[$f])) {
                $voc[$f] = 0;
            }

            $termcount_per_class[$c] += $fcnt;
            if (isset($termcount[$c][$f])) {
                $termcount[$c][$f] += $fcnt;
            } else {
                $termcount[$c][$f] = $fcnt;
            }
        }
    }
}