/**
 * Build a capped token-frequency map for a document.
 *
 * Counts how often each value occurs in the document data and limits
 * every count to at most 4.
 *
 * @param string            $class The class for which features are requested (not used here)
 * @param DocumentInterface $doc   The document whose data is counted
 * @return array Map of token => number of occurrences, each capped at 4
 */
public function getFeatureArray($class, DocumentInterface $doc)
{
    $frequencies = array_count_values($doc->getDocumentData());

    // array_map keeps the keys when it is given exactly one array,
    // so the token => count mapping is preserved.
    return array_map(
        function ($count) {
            return min($count, 4);
        },
        $frequencies
    );
}
/**
 * Rule-based decision on whether a token ends a sentence.
 *
 * The first element of the document data is taken as the token to
 * inspect; any further elements are ignored by these rules.
 *
 * @param array             $classes Candidate classes (not consulted by these rules)
 * @param DocumentInterface $d       Document whose data starts with the token to classify
 * @return string 'EOW' when the token's only full stop is its last character, 'O' otherwise
 */
public function classify(array $classes, DocumentInterface $d)
{
    // Only the token itself is needed, so nothing else is unpacked.
    list($token) = $d->getDocumentData();

    // assume that all sentences end in full stops
    if (substr($token, -1) !== '.') {
        return 'O';
    }

    // to catch some naive abbreviations (e.g.: U.S.A.)
    if (substr_count($token, '.') > 1) {
        return 'O';
    }

    return 'EOW';
}
/**
 * For use with TokensDocument mostly.
 * Simply returns the document's data as the features. The result may
 * contain duplicates (a feature firing twice for a single document).
 *
 * @param string            $class The class for which we are calculating features
 * @param DocumentInterface $d     The document to calculate features for
 * @return array
 */
public function getFeatureArray($class, DocumentInterface $d)
{
    $features = $d->getDocumentData();

    return $features;
}
/**
 * Pick a class deterministically from the document's data.
 *
 * The document data is used as a number and reduced modulo the number
 * of classes; the class at that position is returned.
 *
 * @param array             $classes The classes to choose from
 * @param DocumentInterface $d       Document whose data is used as the numeric selector
 * @return mixed The selected element of $classes
 * @throws \InvalidArgumentException If $classes is empty
 */
public function classify(array $classes, DocumentInterface $d)
{
    $total = count($classes);
    if ($total === 0) {
        // Without this guard the modulo below would divide by zero.
        throw new \InvalidArgumentException('Cannot classify: $classes must not be empty.');
    }

    // Select by position so non-sequential or string keys still work.
    $candidates = array_values($classes);

    // PHP's % keeps the sign of the dividend, so negative data would
    // yield a negative index; shift it back into the 0..$total-1 range.
    $index = $d->getDocumentData() % $total;
    if ($index < 0) {
        $index += $total;
    }

    return $candidates[$index];
}