PHP NlpTools\Tokenizers WhitespaceTokenizer::tokenize примеры использования

Язык программирования: PHP

Пространство имен/Пакет: NlpTools\Tokenizers

Класс/Тип: WhitespaceTokenizer

Метод/Функция: tokenize

Примеров на hotexamples.com: 9

PHP NlpTools\Tokenizers WhitespaceTokenizer::tokenize — найдено 9 примеров. Это лучшие примеры PHP-кода для NlpTools\Tokenizers\WhitespaceTokenizer::tokenize, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

tokenize(9)

Пример #1

Показать файл

Файл: getforum.php Проект: nandishkotadia/Sorting-of-forum-responses-based-on-relevance

 // Excerpt: loads the stored responses for one question, trains a Naive Bayes
 // model on $training and classifies every loaded response as 'ans' or 'non'.
 // NOTE(review): $training is not defined anywhere in this excerpt — it must be
 // populated before the training loop runs.
 $testing = array();
 // NOTE(review): $question_id is interpolated straight into the SQL text. If it
 // can come from request input this is injectable — confirm its origin and
 // switch to a prepared statement.
 $query = "SELECT * from `response` where q_id={$question_id}";
 // NOTE(review): the mysql_* extension was removed in PHP 7.0; this call only
 // works on older runtimes.
 $query_run = mysql_query($query);
 while ($row = mysql_fetch_array($query_run)) {
     $single_resp = $row['resp'];
     // every stored response is queued with 'ans' as its expected label
     array_push($testing, array('ans', $single_resp));
 }
 $tset = new TrainingSet();
 // will hold the training documents
 $tok = new WhitespaceTokenizer();
 // will split into tokens
 $ff = new DataAsFeatures();
 // see features in documentation
 // ---------- Training ----------------
 // each $d is array(label, text): the text is tokenized and added under its label
 foreach ($training as $d) {
     $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
 }
 $model = new FeatureBasedNB();
 // train a Naive Bayes model
 $model->train($ff, $tset);
 // ---------- Classification ----------------
 $cls = new MultinomialNBClassifier($ff, $model);
 $correct = 0;
 foreach ($testing as $d) {
     // choose between the two candidate classes 'ans' and 'non' and print the
     // chosen label (presumably "answer" / "non-answer" — confirm)
     echo $prediction = $cls->classify(array('ans', 'non'), new TokensDocument($tok->tokenize($d[1])));
     $puts = 0;
     if ($prediction == $d[0]) {
         // post-increment: prints the counter value from BEFORE this hit is counted
         echo $correct++;
         $puts = 1;
     }

Пример #2

Показать файл

Файл: sentiment_maxent.php Проект: imonroe/coldreader

// Excerpt: trains a Maxent model on the documents listed in one file and then
// starts evaluating it on the documents listed in another.
// NOTE(review): $tset, $tok, $ff and $optimizer are used below but are not
// defined in this excerpt — they must be set up earlier in the file.
$model = new Maxent(array());
// argv[1] and argv[2] are paths to files that contain the paths
// to the actual documents.
$train = new SplFileObject($argv[1]);
$test = new SplFileObject($argv[2]);
// fill in the training set
foreach ($train as $f) {
    // drop the last character of the line — presumably the trailing newline.
    // NOTE(review): a final line without a newline would lose the last
    // character of its path; rtrim($f, "\r\n") would be safer — confirm.
    $f = substr($f, 0, -1);
    // skip empty lines
    if (strlen($f) == 0) {
        continue;
    }
    // the label is derived from the path: any path containing "pos" is
    // positive, everything else is negative
    $class = "neg";
    if (strpos($f, "pos") !== false) {
        $class = "pos";
    }
    // read the document, tokenize it and add it under its label
    $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f))));
}
// train the model
$model->train($ff, $tset, $optimizer);
// to use the model we need a classifier
$cls = new FeatureBasedLinearClassifier($ff, $model);
// evaluate the model
$correct = 0;
$total = 0;
foreach ($test as $f) {
    // same line clean-up and path-based labelling as in the training loop
    $f = substr($f, 0, -1);
    if (strlen($f) == 0) {
        continue;
    }
    $class = "neg";
    if (strpos($f, "pos") !== false) {

Пример #3

Показать файл

Файл: whitespace.php Проект: Tjorriemorrie/app

<?php

include '../../../autoloader.php';
include '../../testing.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
/**
 * Tells whether two token lists hold the same values in the same order.
 *
 * The previous check, count(array_diff($a1, $a2)) == 0, only verified that
 * every value of $a1 also occurs somewhere in $a2. It therefore accepted
 * results with missing tokens, extra expected tokens, wrong order or wrong
 * multiplicity. This version compares the lists position by position.
 *
 * Array keys are ignored and values are compared after a string cast, which
 * is the same equality rule array_diff() applies to individual elements.
 *
 * @param array $a1 first list (typically the tokenizer output)
 * @param array $a2 second list (typically the expected tokens)
 *
 * @return bool true when both lists have equal length and equal values at
 *              every position
 */
function arrays_match($a1, $a2)
{
    // Re-index so only values and their order matter, then normalise to strings.
    $left = array_map('strval', array_values($a1));
    $right = array_map('strval', array_values($a2));

    return $left === $right;
}
// One tokenizer instance is shared by every case in this script.
$tok = new WhitespaceTokenizer();
// Case 1: ASCII text separated by single spaces, runs of spaces, newlines and tabs.
$s = "This is a simple space delimited string\nwith new lines and many     spaces between the words.\nAlso\ttabs\ttabs\ttabs\ttabs";
// Expected tokens; note that 'words.' keeps its trailing period.
$tokens = array('This', 'is', 'a', 'simple', 'space', 'delimited', 'string', 'with', 'new', 'lines', 'and', 'many', 'spaces', 'between', 'the', 'words.', 'Also', 'tabs', 'tabs', 'tabs', 'tabs');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with ascii content");
// Case 2: multi-byte (Greek) words separated by plain spaces.
$s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
$tokens = array('Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf-8', 'χαρακτήρων');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing simple ASCII whitespace with utf-8 content");
$s = "Here exists non-breaking space   ";
$tokens = array('Here', 'exists', 'non-breaking', 'space');
_assert(arrays_match($tok->tokenize($s), $tokens), "Problem tokenizing utf-8 whitespace");

Пример #4

Показать файл

Файл: feature-test.php Проект: Tjorriemorrie/app

<?php

include '../autoloader.php';
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\Document;
use NlpTools\Documents\WordDocument;
// Read the sample file and split its contents into whitespace-delimited tokens.
$text = file_get_contents('token-test');
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);

$feats = new FunctionFeatures();

// Feature 1: the first element of the document data, emitted as-is.
$feats->add(function ($class, Document $d) {
    $data = $d->getDocumentData();

    return current($data);
});

// Feature 2: "isCapitalized" when that first element starts with an
// upper-case character; otherwise the callback yields null.
$feats->add(function ($class, Document $d) {
    $data = $d->getDocumentData();
    $first = current($data);

    return ctype_upper($first[0]) ? "isCapitalized" : null;
});

// Build one WordDocument per token position; the third constructor argument
// is 5 (presumably the size of the surrounding context — confirm).
$documents = array();
foreach (array_keys($tokens) as $position) {
    $documents[$position] = new WordDocument($tokens, $position, 5);
}

// Print the features of every document as "[f1,f2,...]", one per line.
foreach ($documents as $document) {
    $featureList = implode(',', $feats->getFeatureArray('0', $document));
    echo '[' . $featureList . ']' . PHP_EOL;
}

Пример #5

Показать файл

Файл: PennTreeBankTokenizer.php Проект: imonroe/coldreader

 /**
  * Tokenizes a string after running it through this class's own processing.
  *
  * The input is first handed to execute(); that result is then passed to the
  * parent tokenizer, whose return value is returned unchanged.
  *
  * @param string $str text to tokenize
  *
  * @return mixed whatever parent::tokenize() returns (presumably the list of tokens)
  */
 public function tokenize($str)
 {
     $processed = $this->execute($str);

     return parent::tokenize($processed);
 }

Пример #6

Показать файл

Файл: tokenize.php Проект: simplicitylab/Talks

<?php

/**
 * Example of tokenizing using NlpTools Tokenizer
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
// Sample sentence that will be split into tokens.
$text = "PHP is a server side scripting language.";

// Create the whitespace tokenizer.
$tokenizer = new WhitespaceTokenizer();

// Tokenize the sentence and dump the resulting array.
$tokens = $tokenizer->tokenize($text);
print_r($tokens);

Пример #7

Показать файл

Файл: bayesian.php Проект: nandishkotadia/Sorting-of-forum-responses-based-on-relevance

use NlpTools\Classifiers\MultinomialNBClassifier;
// ---------- Data ----------------
// NOTE(review): the next comment cites the UCI SMS Spam Collection, but the
// array below is a four-document inline corpus labelled 'c' / 'j'. The
// citation looks like a leftover from another example — confirm and update.
// data is taken from http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
// we use a part for training
// Each entry is array(class label, text).
$training = array(array('c', 'Chinese Beijing China'), array('c', 'Chinese Chinese Shanghai'), array('c', 'Chinese Macao'), array('j', 'Tokyo Japan'));
// and another for evaluating
$testing = array(array('c', 'Chinese	Chinese	Chinese	Tokyo Japan'), array('c', 'India China Chinese'), array('c', 'Japan '), array('c', 'Tokyo'));
// Excerpt: trains a Naive Bayes model on $training and starts classifying
// the entries of $testing into the classes 'c' and 'j'.
$tset = new TrainingSet();
// will hold the training documents
$tok = new WhitespaceTokenizer();
// will split into tokens
$ff = new DataAsFeatures();
// see features in documentation
// ---------- Training ----------------
// each $d is array(label, text): the text is tokenized and added under its label
foreach ($training as $d) {
    $tset->addDocument($d[0], new TokensDocument($tok->tokenize($d[1])));
}
$model = new FeatureBasedNB();
// train a Naive Bayes model
$model->train($ff, $tset);
// accumulators initialised here; their use lies beyond this excerpt
$china = array();
$ccount1 = 0;
$jcount = 0;
$japan = array();
// ---------- Classification ----------------
$cls = new MultinomialNBClassifier($ff, $model);
$correct = 0;
foreach ($testing as $d) {
    // pick one of the two candidate classes, 'c' or 'j', for this document
    $prediction = $cls->classify(array('c', 'j'), new TokensDocument($tok->tokenize($d[1])));
    if ($prediction == 'c') {

Пример #8

Показать файл

Файл: classifier.php Проект: simplicitylab/Talks

use NlpTools\Models\FeatureBasedNB;
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Classifiers\MultinomialNBClassifier;
// *************** Training ***************
// Labelled sentences; each entry is array(class label, sentence).
$training = array(
    array('usa', 'new york is a hell of a town'),
    array('usa', 'the statue of liberty'),
    array('usa', 'new york is in the united states'),
    array('usa', 'the white house is in washington'),
    array('uk', 'london is in the uk'),
    array('uk', 'the big ben is in london'),
);

// Collects the tokenized training documents.
$trainingSet = new TrainingSet();
// Splits sentences into tokens.
$tokenizer = new WhitespaceTokenizer();
// Feature factory handed to both the model and the classifier.
$features = new DataAsFeatures();

// Tokenize every labelled sentence and add it to the training set.
foreach ($training as $sample) {
    list($label, $sentence) = $sample;
    $tokens = $tokenizer->tokenize($sentence);
    $trainingSet->addDocument($label, new TokensDocument($tokens));
}

// Train the Naive Bayes model on the collected documents.
$bayesModel = new FeatureBasedNB();
$bayesModel->train($features, $trainingSet);

// *************** Classify ***************
// Sentences to classify, each paired with its expected class.
$testSet = array(
    array('usa', 'i want to see the statue of liberty'),
    array('usa', 'this is a picture of the white house'),
    array('usa', 'where in washington'),
    array('uk', 'i saw the big ben yesterday'),
    array('uk', 'i went to london to visit a friend'),
);

// Classifier built from the same features and the trained model.
$classifier = new MultinomialNBClassifier($features, $bayesModel);
// Candidate classes offered to the classifier for every sentence.
$classes = array('usa', 'uk');

// Classify each sentence and report sentence, expected class and prediction.
foreach ($testSet as $sample) {
    list($expected, $sentence) = $sample;
    $document = new TokensDocument($tokenizer->tokenize($sentence));
    $prediction = $classifier->classify($classes, $document);
    printf("sentence: %s | class: %s | predicted: %s\n", $sentence, $expected, $prediction);
}

Пример #9

Показать файл

Файл: stopwords.php Проект: simplicitylab/Talks

<?php

/**
 * Example of using stopwords when using NlpTools
 *
 * @author Glenn De Backer <*****@*****.**>
 */
include 'vendor/autoload.php';
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Documents\TokensDocument;
use NlpTools\Utils\StopWords;
// Sample sentence that will be tokenized and then filtered.
$text = "PHP is a server side scripting language";

// Stop-word transformation built from the words to filter.
$stopWordList = array("is", "a", "as");
$stop = new StopWords($stopWordList);

// Split the sentence on whitespace and wrap the tokens in a document.
$tokenizer = new WhitespaceTokenizer();
$tokens = $tokenizer->tokenize($text);
$doc = new TokensDocument($tokens);

// Apply the stop-word transformation to the document.
$doc->applyTransformation($stop);

// Dump the document data after the transformation.
$filtered = $doc->getDocumentData();
print_r($filtered);