// Feature extractor: for every element of the document data, emit one feature
// string built by concatenating the candidate class with that element.
$ff = new FunctionFeatures();
$ff->add(function ($class, DocumentInterface $d) {
    $r = array();
    foreach ($d->getDocumentData() as $tok) {
        $r[] = $class . $tok;
    }

    return $r;
});

// create
// 1. an empty training set
// 2. an optimizer
// 3. an empty model
$tset = new TrainingSet();

// The optimizer name/path can be overridden through the GD_OPTIMIZER
// environment variable. $_ENV is only populated when the variables_order ini
// setting contains "E", so fall back to getenv() before using the default.
$OPTIMIZER_PATH = 'gradient-descent';
if (isset($_ENV["GD_OPTIMIZER"])) {
    $OPTIMIZER_PATH = $_ENV["GD_OPTIMIZER"];
} elseif (getenv("GD_OPTIMIZER") !== false) {
    $OPTIMIZER_PATH = getenv("GD_OPTIMIZER");
}
$optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH);
$model = new Maxent(array());

// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents.
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);

// fill in the training set
foreach ($train as $f) {
    // Strip only the line terminator. Blindly dropping the last character
    // would corrupt the final path when the list file does not end with a
    // newline, and would leave a stray "\r" behind for CRLF files.
    $f = rtrim($f, "\r\n");
    if (strlen($f) == 0) {
        continue;
    }

    // The label is derived from the path: any path containing "pos" is a
    // positive example, everything else is negative.
    $class = "neg";
    if (strpos($f, "pos") !== false) {
        $class = "pos";
    }

    $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f))));
    // NOTE(review): the closing brace of this loop lies beyond the visible
    // excerpt and is intentionally not added here.
// NOTE(review): the next two lines are the tail of a feature closure whose
// opening lies before the visible excerpt; it returns a 'prev=' feature built
// from the current element of $prev.
return 'prev=' . current($prev);
});

// Capitalisation feature: only produced for the START_SENTENCE class. Returns
// "isCapitalized" when the first character of the first document-data element
// is an uppercase letter; otherwise the closure returns nothing (null).
$feats->add(function ($class, Document $d) {
    if ($class != 'START_SENTENCE') {
        return;
    }
    // First element of the document data — presumably the word under
    // consideration; confirm against WordDocument::getDocumentData().
    $w = current($d->getDocumentData());
    if (ctype_upper($w[0])) {
        return "isCapitalized";
    }
});

// Build the training set: one document per token position, labelled with the
// class at the same index in $classes.
// NOTE(review): the third WordDocument argument (5) looks like a context
// window size — confirm against the WordDocument constructor.
$s = new TrainingSet();
foreach ($tokens as $index => $token) {
    $s->addDocument($classes[$index], new WordDocument($tokens, $index, 5));
}

// Train the maxent model and print the learned weights.
// NOTE(review): the MaxentGradientDescent arguments (0.01, 1, 100000) are
// presumably precision, step and iteration limit — verify against the class.
$maxent = new Maxent(array());
$maxent->train($feats, $s, new MaxentGradientDescent(0.01, 1, 100000));
$maxent->dumpWeights();

// Evaluation counters. The loop below re-classifies the same documents that
// were used for training ($s), so these figures measure fit on training data.
$true_positives = 0;
$false_neg = 0;
$false_pos = 0;
$classifier = new FeatureBasedLinearClassifier($feats, $maxent);
// Iterate with the class as the key so each document comes with its label.
$s->setAsKey(TrainingSet::CLASS_AS_KEY);
foreach ($s as $class => $doc) {
    $predicted_class = $classifier->classify(array('O', 'START_SENTENCE'), $doc);
    if ($class != $predicted_class) {
        // A wrong 'O' prediction is counted as a false negative; any other
        // wrong prediction is counted as a false positive.
        if ($predicted_class == 'O') {
            $false_neg++;
        } else {
            $false_pos++;
        }