Example #1
0
 /**
  * Compute the inverse document frequency (idf) of every token found in a
  * training set.
  *
  * Each document is translated to tokens by the feature factory and a token
  * is counted at most once per document. Every document frequency df is then
  * replaced by log(D / df), where D is count($tset), and log(D) is stored in
  * $this->logD.
  *
  * Side effect: the key mode of $tset is switched to
  * TrainingSet::CLASS_AS_KEY.
  *
  * @param TrainingSet $tset
  *            The set of documents for which we will compute the idf
  * @param FeatureFactoryInterface $ff
  *            A feature factory to translate the document data to single
  *            tokens; when null a DataAsFeatures instance is used
  */
 public function __construct(TrainingSet $tset, FeatureFactoryInterface $ff = null)
 {
     if ($ff === null) {
         $ff = new DataAsFeatures();
     }

     // Make sure the map exists even when no document yields a token, so the
     // inversion loop below never iterates over a non-array value.
     if (!isset($this->idf)) {
         $this->idf = array();
     }

     $tset->setAsKey(TrainingSet::CLASS_AS_KEY);
     foreach ($tset as $class => $doc) {
         // Extract the tokens of the document and use them as array keys so
         // that each distinct token contributes once per document.
         $tokens = array_fill_keys($ff->getFeatureArray($class, $doc), 1);
         foreach ($tokens as $token => $unused) {
             if (isset($this->idf[$token])) {
                 $this->idf[$token]++;
             } else {
                 $this->idf[$token] = 1;
             }
         }
     }

     // $this->idf holds the document frequencies so far; invert them and
     // take the log. A by-value foreach is used on purpose: it avoids the
     // lingering reference a by-reference loop leaves behind, and writing
     // through $this->idf[$token] does not disturb the iteration.
     $D = count($tset);
     foreach ($this->idf as $token => $df) {
         $this->idf[$token] = log($D / $df);
     }
     $this->logD = log($D);
 }
Example #2
0
 /**
  * Precompute the features of every document for every candidate class.
  *
  * The result lets later optimization run without calling the
  * FeatureFactoryInterface again: recomputing the features repeatedly
  * would be slow, and a plain array can also be handed to an external
  * optimizer to gain speed.
  *
  * Side effect: the key mode of $tset is switched to
  * TrainingSet::OFFSET_AS_KEY.
  *
  * @param array $classes A set of the classes in the training set
  * @param TrainingSet $tset A collection of training documents
  * @param FeatureFactoryInterface $ff The feature factory
  * @return array One entry per document offset; each entry maps every class
  *               to the features returned by the factory for that class and
  *               holds the document's own class under the '__label__' key
  */
 protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff)
 {
     $tset->setAsKey(TrainingSet::OFFSET_AS_KEY);

     $featuresByOffset = array();
     foreach ($tset as $position => $document) {
         // Build the complete row for this document before storing it.
         $row = array();
         foreach ($classes as $candidate) {
             $row[$candidate] = $ff->getFeatureArray($candidate, $document);
         }
         // The true class is stored next to the per-class features.
         $row['__label__'] = $document->getClass();

         $featuresByOffset[$position] = $row;
     }

     return $featuresByOffset;
 }