/**
 *
 * @param TrainingSet $tset
 *        The set of documents for which we will compute the idf
 * @param FeatureFactoryInterface $ff
 *        A feature factory to translate the document data to single tokens
 */
public function __construct(TrainingSet $tset, FeatureFactoryInterface $ff = null)
{
    if ($ff === null) {
        $ff = new DataAsFeatures();
    }

    $tset->setAsKey(TrainingSet::CLASS_AS_KEY);
    foreach ($tset as $class => $doc) {
        // extract tokens from the document
        $tokens = $ff->getFeatureArray($class, $doc);
        // count each token only once per document
        $tokens = array_fill_keys($tokens, 1);
        foreach ($tokens as $token => $v) {
            if (isset($this->idf[$token])) {
                $this->idf[$token]++;
            } else {
                $this->idf[$token] = 1;
            }
        }
    }

    // so far $this->idf holds the document frequency of each token;
    // we now invert it and take the logarithm
    $D = count($tset);
    foreach ($this->idf as &$v) {
        $v = log($D / $v);
    }
    $this->logD = log($D);
}
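For reference, the loop above computes the standard inverse document frequency, idf(t) = log(D / df(t)), where D is the number of documents and df(t) the number of documents containing t; the stored logD = log(D) is the value returned for tokens never seen during training. A minimal usage sketch, assuming the class exposes the weights through array access (as the Idf test further down does):

$idf = new Idf($tset);
$weight = $idf["some_token"];   // "some_token" is a hypothetical token name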
public function testEuclideanClustering()
{
    $clust = new KMeans(2, new Euclidean(), new EuclidCF(), 0.001);

    $tset = new TrainingSet();
    for ($i = 0; $i < 500; $i++) {
        $tset->addDocument('A', EuclideanPoint::getRandomPointAround(100, 100, 45));
    }
    for ($i = 0; $i < 500; $i++) {
        $tset->addDocument('B', EuclideanPoint::getRandomPointAround(200, 100, 45));
    }

    list($clusters, $centroids, $distances) = $clust->cluster($tset, new DataAsFeatures());

    $im = $this->drawClusters($tset, $clusters, $centroids, false);
    if ($im) {
        imagepng($im, TEST_DATA_DIR . "/Clustering/KmeansTest/clusters.png");
    }

    // since the dataset is artificial and clearly separated, the kmeans
    // algorithm should always cluster it correctly
    foreach ($clusters as $clust) {
        $classes = array();
        foreach ($clust as $point_idx) {
            $class = $tset[$point_idx]->getClass();
            if (!isset($classes[$class])) {
                $classes[$class] = true;
            }
        }
        // assert that all the documents (points) in this cluster belong
        // to the same class
        $this->assertCount(1, $classes);
    }
}
public function testIdf()
{
    $ts = new TrainingSet();
    $ts->addDocument("", new TokensDocument(array("a", "b", "c", "d")));
    $ts->addDocument("", new TokensDocument(array("a", "c", "d")));
    $ts->addDocument("", new TokensDocument(array("a")));

    $idf = new Idf($ts);

    $this->assertEquals(0.405, $idf["c"], null, 0.001);
    $this->assertEquals(1.098, $idf["b"], null, 0.001);
    $this->assertEquals(1.098, $idf["non-existing"], null, 0.001);
    $this->assertEquals(0, $idf["a"]);
}
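The asserted values follow directly from idf(t) = log(D / df(t)) with D = 3 documents, shown here as a quick sanity check:

// "a" occurs in all 3 documents:   log(3/3) = 0
// "c" occurs in 2 of 3 documents:  log(3/2) ≈ 0.405
// "b" occurs in 1 of 3 documents:  log(3/1) ≈ 1.099
// "non-existing" was never seen, so it falls back to log(D) = log(3) ≈ 1.099
// (both 1.099 values are within the 0.001 delta passed to assertEquals)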
/**
 * Calculate all the features for each possible class of each
 * document. This is done so that we can optimize without needing
 * the FeatureFactory.
 *
 * We do not want to use the FeatureFactoryInterface both because it would
 * be slow to calculate the features over and over again, and because
 * we want to be able to optimize externally to gain speed (PHP is slow!).
 *
 * @param array $classes A set of the classes in the training set
 * @param TrainingSet $tset A collection of training documents
 * @param FeatureFactoryInterface $ff The feature factory
 * @return array An array that contains every feature for every possible class of every document
 */
protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff)
{
    $features = array();
    $tset->setAsKey(TrainingSet::OFFSET_AS_KEY);
    foreach ($tset as $offset => $doc) {
        $features[$offset] = array();
        foreach ($classes as $class) {
            $features[$offset][$class] = $ff->getFeatureArray($class, $doc);
        }
        $features[$offset]['__label__'] = $doc->getClass();
    }

    return $features;
}
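For illustration, the returned array has roughly the following shape (class names and feature contents here are hypothetical, not taken from the library):

// array(
//     0 => array(
//         'classA'    => array(/* features of document 0 computed for classA */),
//         'classB'    => array(/* features of document 0 computed for classB */),
//         '__label__' => 'classA',   // the document's actual class
//     ),
//     1 => array(/* ... */),
// )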
use NlpTools\Classifiers\FeatureBasedLinearClassifier;

// create needed reusable objects, a tokenizer and a feature factory
$tok = new WhitespaceTokenizer();
$ff = new FunctionFeatures();
$ff->add(function ($class, DocumentInterface $d) {
    $r = array();
    foreach ($d->getDocumentData() as $tok) {
        $r[] = $class . $tok;
    }

    return $r;
});

// create
// 1. an empty training set
// 2. an optimizer
// 3. an empty model
$tset = new TrainingSet();
$OPTIMIZER_PATH = isset($_ENV["GD_OPTIMIZER"]) ? $_ENV["GD_OPTIMIZER"] : 'gradient-descent';
$optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH);
$model = new Maxent(array());

// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents.
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);

// fill in the training set
foreach ($train as $f) {
    $f = substr($f, 0, -1);
    if (strlen($f) == 0) {
        continue;
    }
    $class = "neg";
    if (strpos($f, "pos") !== false) {
$training = array();
$query = "SELECT * from `training` where q_id={$question_id}";
$query_run = mysql_query($query);
while ($row = mysql_fetch_array($query_run)) {
    $single_resp = $row['resp'];
    $class = $row['class'];
    array_push($training, array($class, $single_resp));
}

$testing = array();
$query = "SELECT * from `response` where q_id={$question_id}";
$query_run = mysql_query($query);
while ($row = mysql_fetch_array($query_run)) {
    $single_resp = $row['resp'];
    array_push($testing, array('ans', $single_resp));
}

$tset = new TrainingSet();          // will hold the training documents
$tok = new WhitespaceTokenizer();   // will split into tokens
$ff = new DataAsFeatures();         // see features in documentation

// ---------- Training ----------------
foreach ($training as $d) {
    $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
}

$model = new FeatureBasedNB();      // train a Naive Bayes model
$model->train($ff, $tset);

// ---------- Classification ----------------
$cls = new MultinomialNBClassifier($ff, $model);
$correct = 0;
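The snippet stops before the evaluation loop. A possible continuation is sketched below; it is not the original author's code, the candidate classes are read from the training set since the labels stored in the database are not shown, and it assumes MultinomialNBClassifier follows the same classify(array $classes, $document) interface used by the linear classifier later in this section:

foreach ($testing as $d) {
    $prediction = $cls->classify(
        $tset->getClassSet(),                        // classes seen during training
        new TokensDocument($tok->tokenize($d[1]))    // tokenize the response text
    );
}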
/**
 * Train on the given set and fill the model's variables
 *
 * priors[c]      = NDocs[c] / NDocs
 * condprob[t][c] = ( count(t in c) + 1 ) / sum( count(t' in c) + 1, for every t' )
 * unknown[c]     = condprob["word that doesn't exist in c"][c]   (so that count(t in c) == 0)
 *
 * More information on the algorithm can be found at
 * http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
 *
 * @param FeatureFactoryInterface $ff A feature factory to compute features from a training document
 * @param TrainingSet $tset The training set
 * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing.
 * @return array Return a training context to be used for incremental training
 */
public function train(FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing = 1)
{
    $class_set = $tset->getClassSet();

    $ctx = array(
        'termcount_per_class' => array_fill_keys($class_set, 0),
        'termcount' => array_fill_keys($class_set, array()),
        'ndocs_per_class' => array_fill_keys($class_set, 0),
        'voc' => array(),
        'ndocs' => 0
    );

    return $this->train_with_context($ctx, $ff, $tset, $a_smoothing);
}
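To make the smoothing formula concrete, here is an illustrative back-of-the-envelope computation with hypothetical counts, assuming the sum in the denominator runs over the whole vocabulary:

// vocabulary = {a, b, c}; counts in class "pos": a → 2, b → 1, c → 0; add-one smoothing
// denominator        = (2+1) + (1+1) + (0+1) = 6
// condprob[a]["pos"] = (2+1) / 6 = 0.5
// condprob[b]["pos"] = (1+1) / 6 ≈ 0.33
// unknown["pos"]     = (0+1) / 6 ≈ 0.17   (probability assigned to terms unseen in "pos")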
/**
 * Build a training set from a directory using the following convention:
 * The directory should contain one subdirectory for each class. The class
 * name is the subdirectory's base name. Each subdirectory should contain
 * one file for each document.
 *
 * @param string $dir
 * @return TrainingSet
 */
public static function buildTrainingSet($dir)
{
    $tok = new WhitespaceAndPunctuationTokenizer();
    $tset = new TrainingSet();
    foreach (new DirectoryIterator($dir) as $d) {
        if ($d->isFile() || $d->isDot()) {
            continue;
        }
        $class = $d->getBasename();
        foreach (new DirectoryIterator($d->getPathname()) as $f) {
            if (!$f->isFile()) {
                continue;
            }
            $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f->getPathname()))));
        }
    }

    return $tset;
}
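A minimal usage sketch under the convention described in the docblock; the directory layout and class names are hypothetical, and CorpusReader stands in for whatever class actually defines the method:

// corpus/
//   pos/   one plain-text file per positive document
//   neg/   one plain-text file per negative document
$tset = CorpusReader::buildTrainingSet(__DIR__ . '/corpus');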
public function testClustering2()
{
    $N = 50;
    $tset = new TrainingSet();
    for ($i = 0; $i < $N; $i++) {
        $tset->addDocument('', EuclideanPoint::getRandomPointAround(100, 100, 45));
    }
    for ($i = 0; $i < $N; $i++) {
        $tset->addDocument('', EuclideanPoint::getRandomPointAround(200, 100, 45));
    }

    $hc = new Hierarchical(new SingleLink(), new Euclidean());
    list($dendrogram) = $hc->cluster($tset, new DataAsFeatures());

    $dg = $this->drawDendrogram($tset, $dendrogram, 600);

    $clusters = Hierarchical::dendrogramToClusters($dendrogram, 2);
    $im = $this->drawClusters($tset, $clusters, null, false, 10);

    if ($dg) {
        imagepng($dg, TEST_DATA_DIR . "/Clustering/HierarchicalTest/dendrogram.png");
    }
    if ($im) {
        imagepng($im, TEST_DATA_DIR . "/Clustering/HierarchicalTest/clusters.png");
    }
}
function from_img($file)
{
    $im = imagecreatefrompng($file);
    $d = array();
    for ($w = 0; $w < 25; $w++) {
        // treat each pixel of the 5x5 image as a "word" ...
        $x = (int) ($w % 5);
        $y = (int) ($w / 5);
        $c = imagecolorsforindex($im, imagecolorat($im, $x, $y));
        $c = $c['red'];
        if ($c > 0) {
            // ... repeated as many times as its red intensity
            $d = array_merge($d, array_fill_keys(range(0, $c - 1), $w));
        }
    }

    return $d;
}

$tset = new TrainingSet();
for ($i = 0; $i < 500; $i++) {
    $f = "data/{$i}";
    $tset->addDocument('', new TokensDocument(from_img($f)));
}

$lda = new Lda(new DataAsFeatures(), 10, 1, 1);
$docs = $lda->generateDocs($tset);
$lda->initialize($docs);

$i = 100;
while ($i-- > 0) {
    $lda->gibbsSample($docs);
    $topics = $lda->getPhi();
    echo $lda->getLogLikelihood(), PHP_EOL;
    foreach ($topics as $t => $topic) {
        $it = 100 - $i;
        $name = sprintf("results/topic-%04d-%04d", $it, $t);
<?php

include '../../../autoloader.php';
include '../../testing.php';
include '../cluster_testing.php';

use NlpTools\Clustering\KMeans;
use NlpTools\Similarity\Euclidean;
use NlpTools\Similarity\CosineSimilarity;
use NlpTools\Clustering\CentroidFactories\MeanAngle;
use NlpTools\Clustering\CentroidFactories\Euclidean as EuclidCF;
use NlpTools\Documents\TrainingSet;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Documents\Document;

$NC = 2;    // number of clusters

$clust = new KMeans($NC, new Euclidean(), new EuclidCF(), 0.001);

$tset = new TrainingSet();
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(100, 100, 45));
}
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(200, 100, 45));
}

list($clusters, $centroids, $distances) = $clust->cluster($tset, new DataAsFeatures());

$im = draw_clusters($tset, $clusters, $centroids, false);
if ($im) {
    imagepng($im, 'clusters.png');
} else {
    var_dump($clusters);
}
/**
 * Example of tokenizing using NlpTools Tokenizer
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';

use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;

// *************** Training ***************
$training = array(
    array('usa', 'new york is a hell of a town'),
    array('usa', 'the statue of liberty'),
    array('usa', 'new york is in the united states'),
    array('usa', 'the white house is in washington'),
    array('uk', 'london is in the uk'),
    array('uk', 'the big ben is in london'),
);

// hold our training documents
$trainingSet = new TrainingSet();
// our tokenizer
$tokenizer = new WhitespaceTokenizer();
// will hold the features
$features = new DataAsFeatures();

// iterate over training array
foreach ($training as $trainingDocument) {
    // add to our training set
    $trainingSet->addDocument($trainingDocument[0], new TokensDocument($tokenizer->tokenize($trainingDocument[1])));
}

// train our Naive Bayes Model
$bayesModel = new FeatureBasedNB();
$bayesModel->train($features, $trainingSet);

// *************** Classify ***************
$testSet = array(
    array('usa', 'i want to see the statue of liberty'),
    array('usa', 'this is a picture of the white house'),
    array('usa', 'where in washington'),
    array('uk', 'i saw the big ben yesterday'),
    array('uk', 'i went to london to visit a friend'),
);

// init our Naive Bayes Class using the features and our model
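The example is cut off right after this comment. A plausible continuation, mirroring the classification step of the other Naive Bayes snippets in this section (a sketch, not the original author's code, assuming MultinomialNBClassifier takes the candidate classes and a document):

$classifier = new MultinomialNBClassifier($features, $bayesModel);

foreach ($testSet as $testDocument) {
    // the candidate classes 'usa' and 'uk' come from the training array above
    $prediction = $classifier->classify(
        array('usa', 'uk'),
        new TokensDocument($tokenizer->tokenize($testDocument[1]))
    );
    echo $testDocument[1], ' => ', $prediction, PHP_EOL;
}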
    }
    $dat = $d->getDocumentData();
    $prev = $dat[1];
    end($prev);

    return 'prev=' . current($prev);
});
$feats->add(function ($class, Document $d) {
    if ($class != 'START_SENTENCE') {
        return;
    }
    $w = current($d->getDocumentData());
    if (ctype_upper($w[0])) {
        return "isCapitalized";
    }
});

$s = new TrainingSet();
foreach ($tokens as $index => $token) {
    $s->addDocument($classes[$index], new WordDocument($tokens, $index, 5));
}

$maxent = new Maxent(array());
$maxent->train($feats, $s, new MaxentGradientDescent(0.01, 1, 100000));
$maxent->dumpWeights();

$true_positives = 0;
$false_neg = 0;
$false_pos = 0;

$classifier = new FeatureBasedLinearClassifier($feats, $maxent);
$s->setAsKey(TrainingSet::CLASS_AS_KEY);
foreach ($s as $class => $doc) {
    $predicted_class = $classifier->classify(array('O', 'START_SENTENCE'), $doc);
    if ($class != $predicted_class) {
        if ($predicted_class == 'O') {