$query_run = mysql_query($query);
 while ($row = mysql_fetch_array($query_run)) {
     $single_resp = $row['resp'];
     $class = $row['class'];
     array_push($training, array($class, $single_resp));
 }
 // Build the test set: one entry per row of `response` that belongs to this question.
 $testing = array();
 // NOTE(review): $question_id is interpolated directly into the SQL text. Confirm it is
 // validated as an integer before this point, or move to a parameterized query.
 $query = "SELECT * from `response` where q_id={$question_id}";
 $query_run = mysql_query($query);
 while ($row = mysql_fetch_array($query_run)) {
     $single_resp = $row['resp'];
     // Every row gets the fixed label 'ans'; no class column is read from this table.
     array_push($testing, array('ans', $single_resp));
 }
 $tset = new TrainingSet();
 // will hold the training documents
 $tok = new WhitespaceTokenizer();
 // will split into tokens
 $ff = new DataAsFeatures();
 // see features in documentation
 // ---------- Training ----------------
 foreach ($training as $d) {
     $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
 }
 $model = new FeatureBasedNB();
 // train a Naive Bayes model
 $model->train($ff, $tset);
 // ---------- Classification ----------------
 $cls = new MultinomialNBClassifier($ff, $model);
 $correct = 0;
 foreach ($testing as $d) {
     // predict if it is spam or ham
 * Then call the script like this:
 *    php -d memory_limit=300M sentiment_maxent.php train test
 *
 */
// include the autoloader
include '../autoloader.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Documents\Document;
use NlpTools\Documents\TokensDocument;
use NlpTools\Documents\TrainingSet;
use NlpTools\Optimizers\ExternalMaxentOptimizer;
use NlpTools\Models\Maxent;
use NlpTools\Classifiers\FeatureBasedLinearClassifier;
// create needed reusable objects, a tokenizer and a feature factory
$tok = new WhitespaceTokenizer();
$ff = new FunctionFeatures();
// Register a single feature function that emits one feature per token:
// the class label concatenated with the token text.
//
// The document parameter is type-hinted with a fully qualified name on
// purpose. This script imports NlpTools\Documents\Document but not
// DocumentInterface, so the bare name "DocumentInterface" would resolve to a
// global class that does not exist and every call would fail the type check.
$ff->add(function ($class, \NlpTools\Documents\DocumentInterface $d) {
    $features = array();
    foreach ($d->getDocumentData() as $token) {
        $features[] = $class . $token;
    }
    return $features;
});
// create
// 1. an empty training set
// 2. an optimizer
// 3. an empty model
$tset = new TrainingSet();
// Resolve the external optimizer executable from the GD_OPTIMIZER environment
// variable, falling back to 'gradient-descent'.
// $_ENV is only populated when the variables_order ini setting contains "E",
// so getenv() is consulted as well; otherwise the variable would be silently
// ignored on such installations. $_ENV keeps precedence when it is populated.
if (isset($_ENV["GD_OPTIMIZER"])) {
    $OPTIMIZER_PATH = $_ENV["GD_OPTIMIZER"];
} elseif (getenv("GD_OPTIMIZER") !== false) {
    $OPTIMIZER_PATH = getenv("GD_OPTIMIZER");
} else {
    $OPTIMIZER_PATH = 'gradient-descent';
}
$optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH);
Beispiel #3
0
<?php

include '../../../autoloader.php';
include '../../testing.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
/**
 * Tells whether two token lists hold the same values in the same order.
 *
 * Array keys are ignored; only the sequence of values is compared. The
 * previous array_diff() based check was one-directional and ignored order
 * and repetition, so an empty or partial result "matched" any expected list.
 *
 * @param array $a1 the list produced by the code under test
 * @param array $a2 the expected list
 * @return bool true when both lists have equal length and identical values
 *              at every position
 */
function arrays_match($a1, $a2)
{
    return array_values($a1) === array_values($a2);
}
$tok = new WhitespaceTokenizer();
$s = "This is a simple space delimited string\nwith new lines and many     spaces between the words.\nAlso\ttabs\ttabs\ttabs\ttabs";
$tokens = array('This', 'is', 'a', 'simple', 'space', 'delimited', 'string', 'with', 'new', 'lines', 'and', 'many', 'spaces', 'between', 'the', 'words.', 'Also', 'tabs', 'tabs', 'tabs', 'tabs');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with ascii content");
$s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
$tokens = array('Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf-8', 'χαρακτήρων');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with utf-8 content");
$s = "Here exists non-breaking space   ";
$tokens = array('Here', 'exists', 'non-breaking', 'space');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing utf-8 whitespace");
Beispiel #4
0
<?php

include '../autoloader.php';
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\Document;
use NlpTools\Documents\WordDocument;
// Load the sample text and split it into whitespace-separated tokens.
$text = file_get_contents('token-test');
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);

$feats = new FunctionFeatures();
// Feature 1: the first element of the document data, emitted as-is.
$feats->add(function ($class, Document $d) {
    $data = $d->getDocumentData();
    return current($data);
});
// Feature 2: "isCapitalized" when that first element starts with an
// uppercase character; nothing otherwise.
$feats->add(function ($class, Document $d) {
    $data = $d->getDocumentData();
    $w = current($data);
    return ctype_upper($w[0]) ? "isCapitalized" : null;
});

// One WordDocument per token position (third argument 5: presumably the
// context size - confirm against WordDocument).
$documents = array();
foreach (array_keys($tokens) as $index) {
    $documents[$index] = new WordDocument($tokens, $index, 5);
}

// Print the feature list of every document, one bracketed line each.
foreach ($documents as $d) {
    $line = implode(',', $feats->getFeatureArray('0', $d));
    echo '[' . $line . ']', PHP_EOL;
}
 /**
  * Passes the input through $this->execute() and hands the result to the
  * parent tokenizer.
  *
  * @param string $str text to tokenize
  * @return mixed whatever the parent tokenize() returns
  */
 public function tokenize($str)
 {
     $processed = $this->execute($str);

     return parent::tokenize($processed);
 }
Beispiel #6
0
<?php

/**
 * Example of tokenizing using NlpTools Tokenizer
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
// sentence that will be split into tokens
$text = "PHP is a server side scripting language.";
// tokenizer that splits on whitespace
$tokenizer = new WhitespaceTokenizer();
// tokenize, then dump the resulting array
$tokens = $tokenizer->tokenize($text);
print_r($tokens);
// won't include it again in the following examples
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;
// ---------- Data ----------------
// data is taken from http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
// we use a part for training
$training = array(array('c', 'Chinese Beijing China'), array('c', 'Chinese Chinese Shanghai'), array('c', 'Chinese Macao'), array('j', 'Tokyo Japan'));
// and another for evaluating
$testing = array(array('c', 'Chinese	Chinese	Chinese	Tokyo Japan'), array('c', 'India China Chinese'), array('c', 'Japan '), array('c', 'Tokyo'));
$tset = new TrainingSet();
// will hold the training documents
$tok = new WhitespaceTokenizer();
// will split into tokens
$ff = new DataAsFeatures();
// see features in documentation
// ---------- Training ----------------
foreach ($training as $d) {
    $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
}
$model = new FeatureBasedNB();
// train a Naive Bayes model
$model->train($ff, $tset);
$china = array();
$ccount1 = 0;
$jcount = 0;
$japan = array();
// ---------- Classification ----------------
Beispiel #8
0
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;
// *************** Training ***************
$training = array(array('usa', 'new york is a hell of a town'), array('usa', 'the statue of liberty'), array('usa', 'new york is in the united states'), array('usa', 'the white house is in washington'), array('uk', 'london is in the uk'), array('uk', 'the big ben is in london'));
// hold our training documents
$trainingSet = new TrainingSet();
// our tokenizer
$tokenizer = new WhitespaceTokenizer();
// will hold the features
$features = new DataAsFeatures();
// iterate over training array
foreach ($training as $trainingDocument) {
    // add to our training set
    $trainingSet->addDocument($trainingDocument[0], new TokensDocument($tokenizer->tokenize($trainingDocument[1])));
}
// train our Naive Bayes Model
$bayesModel = new FeatureBasedNB();
$bayesModel->train($features, $trainingSet);
// *************** Classify ***************
$testSet = array(array('usa', 'i want to see the statue of liberty'), array('usa', 'this is a picture of the white house'), array('usa', 'where in washington'), array('uk', 'i saw the big ben yesterday'), array('uk', 'i went to london to visit a friend'));
// init our Naive Bayes Class using the features and our model
$classifier = new MultinomialNBClassifier($features, $bayesModel);
// iterate over our test set
Beispiel #9
0
<?php

/**
 * Example of using stopwords when using NlpTools
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\TokensDocument;
use NlpTools\Utils\StopWords;
// sentence that will be split into tokens
$text = "PHP is a server side scripting language";
// words to be removed from the token list
$stop = new StopWords(array("is", "a", "as"));
// split the sentence on whitespace
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);
// wrap the tokens in a document and run the stop-word transformation over it
$doc = new TokensDocument($tokens);
$doc->applyTransformation($stop);
// dump what is left after filtering
$remaining = $doc->getDocumentData();
print_r($remaining);