<?php

// Demo: per-token feature extraction with NlpTools.
// Reads whitespace-separated text from the file "token-test" and prints,
// for every token, the feature array produced by the feature factory.

include '../autoloader.php';

use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\Document;
use NlpTools\Documents\WordDocument;

// Split the raw text on whitespace.
$raw = file_get_contents('token-test');
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($raw);

// Feature factory: each registered callback may return a feature
// name (or nothing) for a given (class, document) pair.
$feats = new FunctionFeatures();

// Feature 1: the token itself (first element of the document data).
$feats->add(function ($class, Document $d) {
    return current($d->getDocumentData());
});

// Feature 2: fires only when the token begins with an uppercase letter.
$feats->add(function ($class, Document $d) {
    $w = current($d->getDocumentData());
    if (ctype_upper($w[0])) {
        return "isCapitalized";
    }
});

// Wrap each token in a WordDocument carrying 5 tokens of surrounding
// context, then print the features computed for every document.
$documents = array();
foreach ($tokens as $index => $token) {
    $documents[$index] = new WordDocument($tokens, $index, 5);
}

foreach ($documents as $d) {
    echo '[' . implode(',', $feats->getFeatureArray('0', $d)) . ']', PHP_EOL;
}
include '../autoloader.php';

use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Documents\Document;
use NlpTools\Documents\TokensDocument;
use NlpTools\Documents\TrainingSet;
use NlpTools\Optimizers\ExternalMaxentOptimizer;
use NlpTools\Models\Maxent;
use NlpTools\Classifiers\FeatureBasedLinearClassifier;

// create needed reusable objects, a tokenizer and a feature factory
$tok = new WhitespaceTokenizer();
$ff = new FunctionFeatures();

// Emit one "<class><token>" feature for every token in the document.
// FIX: the closure previously type-hinted the unimported `DocumentInterface`,
// which would resolve to the non-existent global \DocumentInterface and fail
// with a TypeError when invoked; hint the imported `Document` type instead.
$ff->add(function ($class, Document $d) {
    $r = array();
    foreach ($d->getDocumentData() as $token) {
        $r[] = $class . $token;
    }
    return $r;
});

// create
// 1. an empty training set
// 2. an optimizer
// 3. an empty model
$tset = new TrainingSet();
// NOTE(review): $_ENV is only populated when php.ini's variables_order
// includes "E"; getenv('GD_OPTIMIZER') may be more reliable — confirm the
// target environment before changing.
$OPTIMIZER_PATH = isset($_ENV["GD_OPTIMIZER"]) ? $_ENV["GD_OPTIMIZER"] : 'gradient-descent';
$optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH);
$model = new Maxent(array());

// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents.
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);
// Build parallel token/class lists from 'dev-doc': each line is expected to
// be "token class" (trailing newline stripped via substr($line, 0, -1)).
$tokens = array();
$classes = array();
foreach (file('dev-doc') as $line) {
    $tmp = explode(' ', substr($line, 0, -1));
    $tokens[] = $tmp[0];
    $classes[] = $tmp[1];
}

// Feature factory for sentence-boundary detection.
$feats = new FunctionFeatures();
//$feats->add(function ($class,Document $d) {
//  return $class.current($d->getDocumentData());
//});

// Feature: the previous token, emitted only for the START_SENTENCE class.
// NOTE(review): assumes getDocumentData()[1] is the preceding-context array;
// end() + current() fetch its last element — confirm against the Document
// implementation used by the callers of this factory.
$feats->add(function ($class, Document $d) {
    if ($class != 'START_SENTENCE') {
        return;
    }
    $dat = $d->getDocumentData();
    $prev = $dat[1];
    end($prev);
    return 'prev=' . current($prev);
});

// Feature: fires when the current token starts with an uppercase letter
// (again evaluated only for the START_SENTENCE class).
$feats->add(function ($class, Document $d) {
    if ($class != 'START_SENTENCE') {
        return;
    }
    $w = current($d->getDocumentData());
    if (ctype_upper($w[0])) {
        return "isCapitalized";
    }
});

$s = new TrainingSet();
// NOTE: the body of this loop continues beyond this excerpt.
foreach ($tokens as $index => $token) {