WhitespaceTokenizer::tokenize, NlpTools\Tokenizers PHP代码示例

示例#1

0

显示文件

文件： getforum.php 项目： nandishkotadia/Sorting-of-forum-responses-based-on-relevance

 // Collect every stored response for the current question as an ('ans', text) pair;
 // these pairs are the documents classified further down.
 // NOTE(review): $question_id is interpolated directly into the SQL string. If it can
 // originate from user input this is an injection risk — confirm it is validated
 // upstream or move to a prepared statement.
 // NOTE(review): the mysql_* functions used here were removed in PHP 7.0, so this
 // code only runs on PHP 5.x.
 $testing = array();
 $query = "SELECT * from `response` where q_id={$question_id}";
 $query_run = mysql_query($query);
 while ($row = mysql_fetch_array($query_run)) {
     // Only the 'resp' column of each row is used.
     $single_resp = $row['resp'];
     array_push($testing, array('ans', $single_resp));
 }
 // Training set that will hold the labelled training documents.
 $tset = new TrainingSet();
 // Tokenizer used to split each text into tokens.
 $tok = new WhitespaceTokenizer();
 // Feature factory; see the NlpTools documentation on features.
 $ff = new DataAsFeatures();
 // ---------- Training ----------------
 // NOTE(review): $training is not defined in this excerpt — presumably built earlier
 // in the file as a list of (label, text) pairs; verify.
 foreach ($training as $d) {
     // $d[0] is the class label, $d[1] the raw text.
     $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
 }
 // Train a Naive Bayes model on the training set.
 $model = new FeatureBasedNB();
 $model->train($ff, $tset);
 // ---------- Classification ----------------
 $cls = new MultinomialNBClassifier($ff, $model);
 // Counter incremented below whenever a prediction equals the document's label.
 $correct = 0;
 foreach ($testing as $d) {
     // predict whether the response belongs to class 'ans' or 'non'
     echo $prediction = $cls->classify(array('ans', 'non'), new TokensDocument($tok->tokenize($d[1])));
     $puts = 0;
     if ($prediction == $d[0]) {
         echo $correct++;
         $puts = 1;
     }

示例#2

0

显示文件

文件： sentiment_maxent.php 项目： imonroe/coldreader

// Maximum-entropy model to be trained below.
// NOTE(review): $tset, $tok, $ff and $optimizer are not defined in this excerpt —
// presumably they are created earlier in the file; verify.
$model = new Maxent(array());
// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents (one document path per line).
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);
// fill in the training set
foreach ($train as $f) {
    // Strip only the line terminator. A blind substr($f, 0, -1) would cut the last
    // character off a final line that has no trailing newline, and would leave a
    // stray "\r" behind for files with CRLF line endings.
    $f = rtrim($f, "\r\n");
    if ($f === '') {
        // blank line (including the empty string yielded at end of file)
        continue;
    }
    // The class is derived from the path: any path containing "pos" is labelled
    // positive, everything else negative.
    $class = (strpos($f, "pos") !== false) ? "pos" : "neg";
    $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f))));
}
// train the model
$model->train($ff, $tset, $optimizer);
// to use the model we need a classifier
$cls = new FeatureBasedLinearClassifier($ff, $model);
// evaluate the model: counters used by the evaluation loop that follows
$correct = 0;
$total = 0;
foreach ($test as $f) {
    $f = substr($f, 0, -1);
    if (strlen($f) == 0) {
        continue;
    }
    $class = "neg";
    if (strpos($f, "pos") !== false) {

示例#3

0

显示文件

文件： whitespace.php 项目： Tjorriemorrie/app

<?php

include '../../../autoloader.php';
include '../../testing.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
/**
 * Tell whether two token lists contain exactly the same values in the same order.
 *
 * The previous implementation used a one-directional array_diff(), which reported
 * a match whenever every element of $a1 occurred somewhere in $a2. Missing tokens,
 * duplicated tokens and reordered tokens therefore all went undetected, so a
 * tokenizer returning too few tokens would still pass the test.
 *
 * Keys are ignored (both arrays are re-indexed first); values are compared
 * strictly and position by position.
 *
 * @param array $a1 first list (e.g. the tokenizer output)
 * @param array $a2 second list (e.g. the expected tokens)
 *
 * @return bool true when both lists hold identical values in identical order
 */
function arrays_match($a1, $a2)
{
    return array_values($a1) === array_values($a2);
}
// Tokenizer under test; shared by all the cases below.
$tok = new WhitespaceTokenizer();
// Case 1: ASCII text separated by single spaces, runs of spaces, newlines and tabs.
$s = "This is a simple space delimited string\nwith new lines and many     spaces between the words.\nAlso\ttabs\ttabs\ttabs\ttabs";
// Expected tokens: punctuation stays attached to its word ('words.').
$tokens = array('This', 'is', 'a', 'simple', 'space', 'delimited', 'string', 'with', 'new', 'lines', 'and', 'many', 'spaces', 'between', 'the', 'words.', 'Also', 'tabs', 'tabs', 'tabs', 'tabs');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with ascii content");
// Case 2: multi-byte (Greek) words separated by ordinary spaces.
$s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
$tokens = array('Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf-8', 'χαρακτήρων');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with utf-8 content");
// Case 3: per the assertion message this is meant to exercise UTF-8 whitespace.
// NOTE(review): confirm the literal below really contains non-breaking spaces
// (U+00A0) rather than plain ASCII spaces — they are easily lost when the file
// is copied or re-encoded, which would turn this into a duplicate of case 1.
$s = "Here exists non-breaking space   ";
$tokens = array('Here', 'exists', 'non-breaking', 'space');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing utf-8 whitespace");

示例#4

0

显示文件

文件： feature-test.php 项目： Tjorriemorrie/app

<?php

include '../autoloader.php';
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\Document;
use NlpTools\Documents\WordDocument;
// Read the fixture file and split it into tokens.
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize(file_get_contents('token-test'));

// Feature factory built from two closures.
$feats = new FunctionFeatures();
// Feature 1: the first element of the document's data.
$feats->add(function ($class, Document $doc) {
    $data = $doc->getDocumentData();

    return current($data);
});
// Feature 2: "isCapitalized" when the first character of that element is an
// upper-case letter; otherwise no feature (null).
$feats->add(function ($class, Document $doc) {
    $data = $doc->getDocumentData();
    $word = current($data);

    return ctype_upper($word[0]) ? "isCapitalized" : null;
});

// One WordDocument per token position.
// NOTE(review): the third argument (5) is presumably the context size — confirm
// against WordDocument's constructor.
$documents = array();
foreach (array_keys($tokens) as $position) {
    $documents[$position] = new WordDocument($tokens, $position, 5);
}

// Print each document's features as a bracketed, comma-separated list.
foreach ($documents as $document) {
    $featureList = $feats->getFeatureArray('0', $document);
    echo '[', implode(',', $featureList), ']', PHP_EOL;
}

示例#5

0

显示文件

文件： PennTreeBankTokenizer.php 项目： imonroe/coldreader

 /**
  * Tokenize a string after running it through this class's own pre-processing.
  *
  * The input is first passed to execute(); the result of that call is then
  * handed to the parent class's tokenize().
  *
  * @param string $str text to tokenize
  *
  * @return mixed whatever the parent tokenize() returns for the processed text
  *               (presumably an array of tokens — confirm against the parent class)
  */
 public function tokenize($str)
 {
     $processed = $this->execute($str);

     return parent::tokenize($processed);
 }

示例#6

0

显示文件

文件： tokenize.php 项目： simplicitylab/Talks

<?php

/**
 * Example of tokenizing using NlpTools Tokenizer
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
// Sentence that will be split into tokens.
$text = "PHP is a server side scripting language.";
// Set up the whitespace tokenizer and run it over the sentence.
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);
// Dump the resulting token array.
print_r($tokens);

示例#7

0

显示文件

文件： bayesian.php 项目： nandishkotadia/Sorting-of-forum-responses-based-on-relevance

use NlpTools\Classifiers\MultinomialNBClassifier;
// ---------- Data ----------------
// NOTE(review): the original comment cited the UCI SMS Spam Collection
// (http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection) as the data source,
// but the literals below are a small hand-written corpus with classes 'c' and 'j'.
// Each entry is a (label, text) pair.
// we use a part for training
$training = array(array('c', 'Chinese Beijing China'), array('c', 'Chinese Chinese Shanghai'), array('c', 'Chinese Macao'), array('j', 'Tokyo Japan'));
// and another for evaluating
// NOTE(review): every evaluation document is labelled 'c', including 'Japan ' and
// 'Tokyo' — confirm these labels are intentional.
$testing = array(array('c', 'Chinese	Chinese	Chinese	Tokyo Japan'), array('c', 'India China Chinese'), array('c', 'Japan '), array('c', 'Tokyo'));
// Training set that will hold the labelled training documents.
$tset = new TrainingSet();
// Tokenizer used to split each text into tokens.
$tok = new WhitespaceTokenizer();
// Feature factory; see the NlpTools documentation on features.
$ff = new DataAsFeatures();
// ---------- Training ----------------
foreach ($training as $d) {
    // $d[0] is the class label, $d[1] the raw text.
    $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
}
// Train a Naive Bayes model on the training set.
$model = new FeatureBasedNB();
$model->train($ff, $tset);
// Accumulators initialised for the classification loop that follows; how they
// are filled is not visible in this excerpt.
$china = array();
$ccount1 = 0;
$jcount = 0;
$japan = array();
// ---------- Classification ----------------
$cls = new MultinomialNBClassifier($ff, $model);
$correct = 0;
foreach ($testing as $d) {
    // predict whether the document belongs to class 'c' or 'j'
    $prediction = $cls->classify(array('c', 'j'), new TokensDocument($tok->tokenize($d[1])));
    if ($prediction == 'c') {

示例#8

0

显示文件

文件： classifier.php 项目： simplicitylab/Talks

use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;
// *************** Training ***************
// Labelled sentences, each given as a (class, text) pair.
$training = array(
    array('usa', 'new york is a hell of a town'),
    array('usa', 'the statue of liberty'),
    array('usa', 'new york is in the united states'),
    array('usa', 'the white house is in washington'),
    array('uk', 'london is in the uk'),
    array('uk', 'the big ben is in london'),
);
// Collaborators: tokenizer, feature factory and the training set being filled.
$tokenizer = new WhitespaceTokenizer();
$features = new DataAsFeatures();
$trainingSet = new TrainingSet();
// Tokenize every sentence and store it under its class label.
foreach ($training as $sample) {
    $sampleTokens = $tokenizer->tokenize($sample[1]);
    $trainingSet->addDocument($sample[0], new TokensDocument($sampleTokens));
}
// Fit the Naive Bayes model on the collected documents.
$bayesModel = new FeatureBasedNB();
$bayesModel->train($features, $trainingSet);

// *************** Classify ***************
// Sentences to classify, each paired with its expected class.
$testSet = array(
    array('usa', 'i want to see the statue of liberty'),
    array('usa', 'this is a picture of the white house'),
    array('usa', 'where in washington'),
    array('uk', 'i saw the big ben yesterday'),
    array('uk', 'i went to london to visit a friend'),
);
// Classifier built from the feature factory and the trained model.
$classifier = new MultinomialNBClassifier($features, $bayesModel);
// Report sentence, expected class and predicted class for each test entry.
foreach ($testSet as $sample) {
    $sampleDocument = new TokensDocument($tokenizer->tokenize($sample[1]));
    $predicted = $classifier->classify(array('usa', 'uk'), $sampleDocument);
    printf("sentence: %s | class: %s | predicted: %s\n", $sample[1], $sample[0], $predicted);
}

示例#9

0

显示文件

文件： stopwords.php 项目： simplicitylab/Talks

<?php

/**
 * Example of using stopwords when using NlpTools
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\TokensDocument;
use NlpTools\Utils\StopWords;
// Stop words that will be applied to the document.
$stop = new StopWords(array("is", "a", "as"));
// Sentence to process.
$text = "PHP is a server side scripting language";
// Split the sentence on whitespace and wrap the tokens in a document.
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);
$doc = new TokensDocument($tokens);
// Apply the stop-word transformation to the document.
$doc->applyTransformation($stop);
// Dump the tokens that remain.
$remaining = $doc->getDocumentData();
print_r($remaining);

PHP NlpTools\Tokenizers WhitespaceTokenizer::tokenize示例