/**
 * Compute the inverse document frequency (idf) of every token in the
 * training set.
 *
 * Each document contributes at most once per token (document frequency,
 * not term frequency); the frequency is then inverted and log-scaled:
 * idf(t) = log(D / df(t)) where D is the number of documents.
 *
 * @param TrainingSet             $tset The set of documents for which we will compute the idf
 * @param FeatureFactoryInterface $ff   A feature factory to translate the document data to single tokens
 */
public function __construct(TrainingSet $tset, ?FeatureFactoryInterface $ff = null)
{
    if ($ff === null) {
        $ff = new DataAsFeatures();
    }

    $tset->setAsKey(TrainingSet::CLASS_AS_KEY);
    foreach ($tset as $class => $doc) {
        // Extract the tokens and deduplicate so that each token is
        // counted once per document (document frequency).
        $tokens = $ff->getFeatureArray($class, $doc);
        foreach (array_unique($tokens) as $token) {
            if (isset($this->idf[$token])) {
                $this->idf[$token]++;
            } else {
                $this->idf[$token] = 1;
            }
        }
    }

    // $this->idf so far holds raw document frequencies;
    // invert and take the log to obtain the actual idf values.
    // NOTE(review): an empty training set would make log($D) = -INF
    // below — presumably callers never pass one; confirm upstream.
    $D = count($tset);
    foreach ($this->idf as &$v) {
        $v = log($D / $v);
    }
    unset($v); // break the dangling reference left by foreach-by-reference

    $this->logD = log($D);
}
/**
 * Calculate all the features for each possible class of each
 * document. This is done so that we can optimize without the need
 * of the FeatureFactory.
 *
 * We do not want to use the FeatureFactoryInterface both because it would
 * be slow to calculate the features over and over again, but also
 * because we want to be able to optimize externally to
 * gain speed (PHP is slow!).
 *
 * @param array                   $classes A set of the classes in the training set
 * @param TrainingSet             $tset    A collection of training documents
 * @param FeatureFactoryInterface $ff      The feature factory
 * @return array An array that contains every feature for every possible class of every document
 */
protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff)
{
    $tset->setAsKey(TrainingSet::OFFSET_AS_KEY);

    $featureCache = array();
    foreach ($tset as $offset => $doc) {
        // Precompute the feature array for every candidate class of
        // this document, plus the gold label under '__label__'.
        $perClass = array();
        foreach ($classes as $class) {
            $perClass[$class] = $ff->getFeatureArray($class, $doc);
        }
        $perClass['__label__'] = $doc->getClass();

        $featureCache[$offset] = $perClass;
    }

    return $featureCache;
}