Example #1
 public function testEuclideanClustering()
 {
     $clust = new KMeans(2, new Euclidean(), new EuclidCF(), 0.001);
     $tset = new TrainingSet();
     for ($i = 0; $i < 500; $i++) {
         $tset->addDocument('A', EuclideanPoint::getRandomPointAround(100, 100, 45));
     }
     for ($i = 0; $i < 500; $i++) {
         $tset->addDocument('B', EuclideanPoint::getRandomPointAround(200, 100, 45));
     }
     list($clusters, $centroids, $distances) = $clust->cluster($tset, new DataAsFeatures());
     $im = $this->drawClusters($tset, $clusters, $centroids, false);
     if ($im) {
         imagepng($im, TEST_DATA_DIR . "/Clustering/KmeansTest/clusters.png");
     }
     // since the dataset is artificial and clearly separated, the kmeans
     // algorithm should always cluster it correctly
     foreach ($clusters as $cluster) {
         $classes = array();
         foreach ($cluster as $point_idx) {
             $class = $tset[$point_idx]->getClass();
             if (!isset($classes[$class])) {
                 $classes[$class] = true;
             }
         }
         // assert that all the documents (points) in this cluster belong
         // in the same class
         $this->assertCount(1, $classes);
     }
 }
Example #2
 public function testIdf()
 {
     $ts = new TrainingSet();
     $ts->addDocument("", new TokensDocument(array("a", "b", "c", "d")));
     $ts->addDocument("", new TokensDocument(array("a", "c", "d")));
     $ts->addDocument("", new TokensDocument(array("a")));
     $idf = new Idf($ts);
     $this->assertEquals(0.405, $idf["c"], null, 0.001);
     $this->assertEquals(1.098, $idf["b"], null, 0.001);
     $this->assertEquals(1.098, $idf["non-existing"], null, 0.001);
     $this->assertEquals(0, $idf["a"]);
 }
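The expected values follow if Idf is the natural log of the document count over a token's document frequency, with unseen tokens falling back to the maximum idf; a quick sanity check of the asserted numbers, assuming that formula:

// "c" occurs in 2 of 3 documents, "b" in 1, "a" in all 3
echo log(3 / 2), PHP_EOL; // ≈ 0.405
echo log(3 / 1), PHP_EOL; // ≈ 1.098 (also the fallback for unseen tokens)
echo log(3 / 3), PHP_EOL; // 0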
Example #3
// the tokenizer, training set, feature factory and optimizer used below are
// assumed to be created earlier in the script, for example:
$tok = new WhitespaceTokenizer();
$tset = new TrainingSet();
$ff = new DataAsFeatures();
$optimizer = new MaxentGradientDescent(0.01, 1, 1000);

$model = new Maxent(array());
// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents.
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);
// fill in the training set
foreach ($train as $f) {
    $f = substr($f, 0, -1);
    if (strlen($f) == 0) {
        continue;
    }
    $class = "neg";
    if (strpos($f, "pos") !== false) {
        $class = "pos";
    }
    $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f))));
}
// train the model
$model->train($ff, $tset, $optimizer);
// to use the model we need a classifier
$cls = new FeatureBasedLinearClassifier($ff, $model);
// evaluate the model
$correct = 0;
$total = 0;
foreach ($test as $f) {
    $f = substr($f, 0, -1);
    if (strlen($f) == 0) {
        continue;
    }
    $class = "neg";
    if (strpos($f, "pos") !== false) {
Example #4
 $testing = array();
 // the legacy mysql_* API used in the original is removed in PHP 7+; a PDO
 // prepared statement is shown instead ($pdo is assumed to be an existing
 // connection) and also avoids interpolating $question_id into the SQL
 $stmt = $pdo->prepare("SELECT * FROM `response` WHERE q_id = ?");
 $stmt->execute(array($question_id));
 while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
     $single_resp = $row['resp'];
     array_push($testing, array('ans', $single_resp));
 }
 $tset = new TrainingSet(); // will hold the training documents
 $tok = new WhitespaceTokenizer(); // will split into tokens
 $ff = new DataAsFeatures(); // see features in documentation
 // ---------- Training ----------------
 // $training is assumed to be an array of array(label, text) pairs built earlier
 foreach ($training as $d) {
     $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
 }
 $model = new FeatureBasedNB();
 // train a Naive Bayes model
 $model->train($ff, $tset);
 // ---------- Classification ----------------
 $cls = new MultinomialNBClassifier($ff, $model);
 $correct = 0;
 foreach ($testing as $d) {
     // predict if it is spam or ham
     // predict if it is spam or ham
     $prediction = $cls->classify(array('ans', 'non'), new TokensDocument($tok->tokenize($d[1])));
     echo $prediction, PHP_EOL;
     $puts = 0;
     if ($prediction == $d[0]) {
         $correct++;
         $puts = 1;
     }
 }
Example #5
 /**
  * Build a training set from a directory using the following convention:
  * The directory should contain one subdirectory for each class. The class
  * name is the subdirectory's base name. Each subdirectory should contain
  * one file for each document.
  *
  * @param  string      $dir
  * @return TrainingSet
  */
 public static function buildTrainingSet($dir)
 {
     $tok = new WhitespaceAndPunctuationTokenizer();
     $tset = new TrainingSet();
     foreach (new DirectoryIterator($dir) as $d) {
         if ($d->isFile() || $d->isDot()) {
             continue;
         }
         $class = $d->getBasename();
         foreach (new DirectoryIterator($d->getPathname()) as $f) {
             if (!$f->isFile()) {
                 continue;
             }
             $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f->getPathname()))));
         }
     }
     return $tset;
 }
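A minimal usage sketch for buildTrainingSet, assuming a corpus/ directory laid out as the docblock describes; CorpusReader is only a placeholder name for whatever class hosts the method:

// corpus/pos and corpus/neg each contain one plain-text file per document
$tset = CorpusReader::buildTrainingSet('corpus');
$model = new FeatureBasedNB();
$model->train(new DataAsFeatures(), $tset);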
Example #6
 public function testClustering2()
 {
     $N = 50;
     $tset = new TrainingSet();
     for ($i = 0; $i < $N; $i++) {
         $tset->addDocument('', EuclideanPoint::getRandomPointAround(100, 100, 45));
     }
     for ($i = 0; $i < $N; $i++) {
         $tset->addDocument('', EuclideanPoint::getRandomPointAround(200, 100, 45));
     }
     $hc = new Hierarchical(new SingleLink(), new Euclidean());
     list($dendrogram) = $hc->cluster($tset, new DataAsFeatures());
     $dg = $this->drawDendrogram($tset, $dendrogram, 600);
     $clusters = Hierarchical::dendrogramToClusters($dendrogram, 2);
     $im = $this->drawClusters($tset, $clusters, null, false, 10);
     if ($dg) {
         imagepng($dg, TEST_DATA_DIR . "/Clustering/HierarchicalTest/dendrogram.png");
     }
     if ($im) {
         imagepng($im, TEST_DATA_DIR . "/Clustering/HierarchicalTest/clusters.png");
     }
 }
Example #7
// helper that turns a small 5x5 grayscale image into a bag of "pixel words";
// the opening of this function is cut off in the original listing, so the
// signature and image loading below are a plausible reconstruction
function from_img($file)
{
    $im = imagecreatefrompng($file); // or whichever format the data files use
    $d = array();
    for ($w = 0; $w < 25; $w++) {
        $x = (int) ($w % 5);
        $y = (int) ($w / 5);
        $c = imagecolorsforindex($im, imagecolorat($im, $x, $y));
        $c = $c['red'];
        if ($c > 0) {
            $d = array_merge($d, array_fill_keys(range(0, $c - 1), $w));
        }
    }
    return $d;
}
$tset = new TrainingSet();
for ($i = 0; $i < 500; $i++) {
    $f = "data/{$i}";
    $tset->addDocument('', new TokensDocument(from_img($f)));
}
$lda = new Lda(new DataAsFeatures(), 10, 1, 1);
$docs = $lda->generateDocs($tset);
$lda->initialize($docs);
$i = 100;
while ($i-- > 0) {
    $lda->gibbsSample($docs);
    $topics = $lda->getPhi();
    echo $lda->getLogLikelihood(), PHP_EOL;
    foreach ($topics as $t => $topic) {
        $it = 100 - $i;
        $name = sprintf("results/topic-%04d-%04d", $it, $t);
        $max = max($topic);
        // the original listing is truncated below; a plausible completion renders
        // each topic's word distribution as a 5x5 grayscale image named $name
        create_image(array_map(function ($x) use($topic, $max) {
            return array_map(function ($y) use($x, $topic, $max) {
                return (int) (255 * $topic[$y * 5 + $x] / $max);
            }, range(0, 4));
        }, range(0, 4)), $name);
    }
}
Example #8
<?php

include '../../../autoloader.php';
include '../../testing.php';
include '../cluster_testing.php';
use NlpTools\Clustering\KMeans;
use NlpTools\Similarity\Euclidean;
use NlpTools\Similarity\CosineSimilarity;
use NlpTools\Clustering\CentroidFactories\MeanAngle;
use NlpTools\Clustering\CentroidFactories\Euclidean as EuclidCF;
use NlpTools\Documents\TrainingSet;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Documents\Document;
$NC = 2; // number of clusters
$clust = new KMeans($NC, new Euclidean(), new EuclidCF(), 0.001);
$tset = new TrainingSet();
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(100, 100, 45));
}
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(200, 100, 45));
}
list($clusters, $centroids, $distances) = $clust->cluster($tset, new DataAsFeatures());
$im = draw_clusters($tset, $clusters, $centroids, false);
if ($im) {
    imagepng($im, 'clusters.png');
} else {
    var_dump($clusters);
}
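The script imports CosineSimilarity and MeanAngle without using them; a minimal sketch of the cosine-based variant they suggest, reusing the same $tset (an assumption, not part of the original script):

$cosClust = new KMeans($NC, new CosineSimilarity(), new MeanAngle(), 0.001);
list($cosClusters, $cosCentroids, $cosDistances) = $cosClust->cluster($tset, new DataAsFeatures());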
Example #9
use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;
use NlpTools\Tokenizers\WhitespaceTokenizer;
// *************** Training ***************
$training = array(
    array('usa', 'new york is a hell of a town'),
    array('usa', 'the statue of liberty'),
    array('usa', 'new york is in the united states'),
    array('usa', 'the white house is in washington'),
    array('uk', 'london is in the uk'),
    array('uk', 'the big ben is in london'),
);
// hold our training documents
$trainingSet = new TrainingSet();
// our tokenizer
$tokenizer = new WhitespaceTokenizer();
// will hold the features
$features = new DataAsFeatures();
// iterate over training array
foreach ($training as $trainingDocument) {
    // add to our training set
    $trainingSet->addDocument($trainingDocument[0], new TokensDocument($tokenizer->tokenize($trainingDocument[1])));
}
// train our Naive Bayes Model
$bayesModel = new FeatureBasedNB();
$bayesModel->train($features, $trainingSet);
// *************** Classify ***************
$testSet = array(
    array('usa', 'i want to see the statue of liberty'),
    array('usa', 'this is a picture of the white house'),
    array('usa', 'where in washington'),
    array('uk', 'i saw the big ben yesterday'),
    array('uk', 'i went to london to visit a friend'),
);
// init our Naive Bayes Class using the features and our model
$classifier = new MultinomialNBClassifier($features, $bayesModel);
// iterate over our test set
foreach ($testSet as $testDocument) {
    // predict our sentence
    $prediction = $classifier->classify(array('usa', 'uk'), new TokensDocument($tokenizer->tokenize($testDocument[1])));
    printf("sentence: %s | class: %s | predicted: %s\n", $testDocument[1], $testDocument[0], $prediction);
}
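Since the test sentences are labelled, a small hedged extension of the loop above can also tally accuracy (reusing $classifier, $tokenizer and $testSet):

$correct = 0;
foreach ($testSet as $testDocument) {
    $prediction = $classifier->classify(array('usa', 'uk'), new TokensDocument($tokenizer->tokenize($testDocument[1])));
    if ($prediction === $testDocument[0]) {
        $correct++;
    }
}
printf("accuracy: %.2f\n", $correct / count($testSet));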
Example #10
// the top of this listing is cut off; it presumably creates a FunctionFeatures
// collection and adds a feature reporting the previous word, e.g.:
$feats = new FunctionFeatures();
$feats->add(function ($class, Document $d) {
    $dat = $d->getDocumentData();
    $prev = $dat[1];
    end($prev);
    return 'prev=' . current($prev);
});
$feats->add(function ($class, Document $d) {
    if ($class != 'START_SENTENCE') {
        return;
    }
    $w = current($d->getDocumentData());
    if (ctype_upper($w[0])) {
        return "isCapitalized";
    }
});
$s = new TrainingSet();
foreach ($tokens as $index => $token) {
    $s->addDocument($classes[$index], new WordDocument($tokens, $index, 5));
}
$maxent = new Maxent(array());
$maxent->train($feats, $s, new MaxentGradientDescent(0.01, 1, 100000));
$maxent->dumpWeights();
$true_positives = 0;
$false_neg = 0;
$false_pos = 0;
$classifier = new FeatureBasedLinearClassifier($feats, $maxent);
$s->setAsKey(TrainingSet::CLASS_AS_KEY);
foreach ($s as $class => $doc) {
    $predicted_class = $classifier->classify(array('O', 'START_SENTENCE'), $doc);
    if ($class != $predicted_class) {
        if ($predicted_class == 'O') {
            $false_neg++;
        } else {
            $false_pos++;
        }
    } elseif ($class != 'O') {
        // the listing is truncated at this point; counting correct non-'O'
        // predictions as true positives is a plausible completion
        $true_positives++;
    }
}
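With those counters, precision and recall follow the usual definitions; a hedged sketch of computing them once the loop finishes:

// precision = TP / (TP + FP), recall = TP / (TP + FN)
$precision = $true_positives / ($true_positives + $false_pos);
$recall = $true_positives / ($true_positives + $false_neg);
printf("precision: %.3f recall: %.3f\n", $precision, $recall);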