Example #1
0
 /**
  * Helper function to transform a TrainingSet to an array of feature vectors
  */
 protected function getDocumentArray(TrainingSet $documents, FeatureFactory $ff)
 {
     // Collect one feature array per document, in the order the
     // training set yields them.
     $featureVectors = array();

     foreach ($documents as $document) {
         // The class argument is passed as an empty string here.
         $vector = $ff->getFeatureArray('', $document);
         $featureVectors[] = $vector;
     }

     return $featureVectors;
 }
Example #2
0
 /**
  * Calculate the probability that document $d belongs to the class
  * $class given a set of possible classes, a feature factory and
  * the model's weights l[i]
  * 
  * @param array $classes The set of possible classes
  * @param FeatureFactory $ff The feature factory
  * @param Document $d The document
  * @param string $class A class for which we calculate the probability
  * @return float The probability that document $d belongs to class $class
  */
 public function P(array $classes, FeatureFactory $ff, Document $d, $class)
 {
     // Score every candidate class as the sum of the weights l[i] of the
     // features that the factory produces for (class, document).
     $scores = array();
     foreach ($classes as $cl) {
         $score = 0.0;
         foreach ($ff->getFeatureArray($cl, $d) as $i) {
             // A feature without a weight contributes nothing to the sum
             // (previously this read an undefined key and raised a
             // notice/warning while still adding 0).
             if (isset($this->l[$i])) {
                 $score += $this->l[$i];
             }
         }
         $scores[$cl] = $score;
     }

     // A class that is not among the candidates (which includes the case
     // of an empty candidate set) has probability 0. This also avoids an
     // undefined key read and a division by zero.
     if (!isset($scores[$class])) {
         return 0.0;
     }

     // Subtract the largest score before exponentiating. The ratio
     // exp(s_class) / sum(exp(s_k)) is unchanged by the shift, but exp()
     // can no longer overflow to INF and turn the result into NAN.
     $maxScore = max($scores);
     $normalizer = 0.0;
     foreach ($scores as $score) {
         $normalizer += exp($score - $maxScore);
     }

     // $normalizer >= 1 because the maximal score contributes exp(0).
     return exp($scores[$class] - $maxScore) / $normalizer;
 }
Example #3
0
 /**
  * Count all the features for each document. All parameters are passed
  * by reference and they are filled in this function. Useful for not
  * making copies of big arrays.
  * 
  * @param FeatureFactory $ff A feature factory to create the features for each document in the set
  * @param TrainingSet $tset The training set (collection of labeled documents)
  * @param array $termcount_per_class The total number of feature occurrences in each class (indexed by class)
  * @param array $termcount The number of occurrences of each feature in each class (indexed by class, then by feature)
  * @param array $ndocs_per_class The total number of documents per class
  * @param array $voc A set of the found features
  * @param integer $ndocs The number of documents
  * @return void
  */
 protected function countTrainingSet(FeatureFactory $ff, TrainingSet $tset, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, array &$voc, &$ndocs)
 {
     // All counters are accumulated into the by-reference arguments, so
     // values already present in them are kept and added to.
     foreach ($tset as $tdoc) {
         $ndocs++;
         $c = $tdoc->getClass();

         // Create the per-class document counter on first use instead of
         // incrementing an undefined key (a warning on PHP 8).
         if (isset($ndocs_per_class[$c])) {
             $ndocs_per_class[$c]++;
         } else {
             $ndocs_per_class[$c] = 1;
         }

         $features = $ff->getFeatureArray($c, $tdoc);
         // An integer first key is taken to mean a plain list of features;
         // collapse it into a feature => count map. Otherwise the array is
         // used as a feature => count map as is.
         if (is_int(key($features))) {
             $features = array_count_values($features);
         }

         foreach ($features as $f => $fcnt) {
             // $voc is used as a set: only the key matters.
             if (!isset($voc[$f])) {
                 $voc[$f] = 0;
             }

             // Total number of feature occurrences seen in class $c.
             if (isset($termcount_per_class[$c])) {
                 $termcount_per_class[$c] += $fcnt;
             } else {
                 $termcount_per_class[$c] = $fcnt;
             }

             // Occurrences of feature $f in class $c.
             if (isset($termcount[$c][$f])) {
                 $termcount[$c][$f] += $fcnt;
             } else {
                 $termcount[$c][$f] = $fcnt;
             }
         }
     }
 }