Пример #1
0
 /**
  * Build a capped occurrence-count map from the document's data.
  *
  * Every distinct value returned by the document becomes a key whose
  * value is the number of times it occurs, limited to at most 4.
  *
  * @param string $class Not used by this implementation.
  * @param DocumentInterface $doc The document whose data is counted.
  * @return array Map of value => occurrence count (each count between 1 and 4).
  */
 public function getFeatureArray($class, DocumentInterface $doc)
 {
     $capped = [];
     foreach (array_count_values($doc->getDocumentData()) as $token => $occurrences) {
         // Cap each count at 4.
         $capped[$token] = min($occurrences, 4);
     }
     return $capped;
 }
Пример #2
0
 /**
  * Tag a token as 'EOW' or 'O' based on the full stops it contains.
  *
  * Only the first element of the document data (the token) is inspected.
  * The token is tagged 'EOW' when it ends with a full stop and contains
  * no other full stop; every other token is tagged 'O'.
  *
  * @param array $classes Not used; the returned labels are hard-coded.
  * @param DocumentInterface $d Document whose data is an array with the token as its first element.
  * @return string Either 'EOW' or 'O'.
  */
 public function classify(array $classes, DocumentInterface $d)
 {
     // Only the token itself is needed; any further elements are ignored.
     list($token) = $d->getDocumentData();

     if (substr($token, -1) !== '.') {
         // assume that all sentences end in full stops
         return 'O';
     }
     if (substr_count($token, '.') > 1) {
         // to catch some naive abbreviations (e.g.: U.S.A.)
         return 'O';
     }
     return 'EOW';
 }
Пример #3
0
 /**
  * Use the document's data directly as its feature list.
  *
  * Intended mostly for use with TokensDocument. The data is returned
  * unchanged, so it may contain duplicates (the same feature firing
  * more than once for a single document).
  *
  * @param string $class
  *        	The class for which we are calculating features
  * @param DocumentInterface $d
  *        	The document to calculate features for.
  * @return array
  */
 public function getFeatureArray($class, DocumentInterface $d)
 {
     $features = $d->getDocumentData();

     return $features;
 }
 /**
  * Pick a class by using the document's data as an index into $classes.
  *
  * The document data is reduced modulo the number of classes, and the
  * element of $classes at that position is returned. Negative data is
  * wrapped into the valid index range.
  *
  * @param array $classes Candidate classes; assumed to be a zero-indexed list (TODO confirm with callers).
  * @param DocumentInterface $d Document whose data is used as an integer.
  * @return mixed The selected element of $classes.
  * @throws \InvalidArgumentException If $classes is empty.
  */
 public function classify(array $classes, DocumentInterface $d)
 {
     $total = count($classes);
     if ($total === 0) {
         // Without this guard the modulo below would divide by zero.
         throw new \InvalidArgumentException('Cannot classify without any classes.');
     }

     $index = $d->getDocumentData() % $total;
     if ($index < 0) {
         // PHP's % keeps the sign of the dividend; shift into 0..$total-1.
         $index += $total;
     }
     return $classes[$index];
 }