public function classify(array $classes, Document $d)
{
    // Rule-based labelling: the result depends only on the dots found in
    // the token, so $classes is never consulted.
    //
    // The document data is an array whose first element is the token; the
    // remaining elements (surrounding context, judging by how they were
    // previously named) are not needed for this decision.
    list($token) = $d->getDocumentData();

    // assume that all sentences end in full stops
    if (substr($token, -1) !== '.') {
        return 'O';
    }

    // to catch some naive abbreviations such as U.S.A.: a token holding
    // more than one dot is not treated as a sentence end
    if (substr_count($token, '.') > 1) {
        return 'O';
    }

    // exactly one dot, and it is the last character of the token
    return 'EOW';
}
/**
 * Expose a document's raw data as its feature list.
 *
 * Intended mainly for use with TokensDocument. The data is handed back
 * untouched, so the same feature may appear more than once for a
 * single document.
 *
 * @param  string   $class The class for which features are being calculated (not used here)
 * @param  Document $d     The document whose data is returned as features
 * @return array
 */
public function getFeatureArray($class, Document $d)
{
    $features = $d->getDocumentData();

    return $features;
}