Exemple #1
0
 /**
  * Build a training set from a directory laid out by convention.
  *
  * The directory must contain one subdirectory per class; the class name is
  * the subdirectory's base name. Every regular file directly inside a class
  * subdirectory is read, tokenized with WhitespaceAndPunctuationTokenizer and
  * added to the set as one TokensDocument of that class. Anything inside a
  * class subdirectory that is not a regular file is ignored.
  *
  * @param  string      $dir Path of the directory holding the class subdirectories
  * @return TrainingSet
  * @throws \RuntimeException If a document file cannot be read
  */
 public static function buildTrainingSet($dir)
 {
     $tok = new WhitespaceAndPunctuationTokenizer();
     $tset = new TrainingSet();
     foreach (new DirectoryIterator($dir) as $d) {
         // Only real subdirectories name a class. Testing isDir() rather than
         // !isFile() also skips entries that are neither (e.g. broken
         // symlinks), which the nested DirectoryIterator could not open.
         if ($d->isDot() || !$d->isDir()) {
             continue;
         }
         $class = $d->getBasename();
         foreach (new DirectoryIterator($d->getPathname()) as $f) {
             if (!$f->isFile()) {
                 continue;
             }
             $path = $f->getPathname();
             $text = file_get_contents($path);
             if ($text === false) {
                 // Fail loudly instead of training on a bogus document.
                 throw new \RuntimeException("Unable to read training document: {$path}");
             }
             $tset->addDocument($class, new TokensDocument($tok->tokenize($text)));
         }
     }
     return $tset;
 }
<?php

include '../../../autoloader.php';
include '../../testing.php';
use NlpTools\Tokenizers\WhitespaceAndPunctuationTokenizer;
/**
 * Tell whether two token lists hold the same values in the same order.
 *
 * Keys are ignored; only the sequence of values is compared, using strict
 * (===) comparison. A one-way array_diff() is not enough here: it reports a
 * match when $a1 is empty or merely a subset of $a2, and it ignores order
 * and repeated values, so a tokenizer that drops tokens would still pass.
 *
 * @param  array $a1 Actual tokens
 * @param  array $a2 Expected tokens
 * @return bool  True when both arrays contain identical value sequences
 */
function arrays_match($a1, $a2)
{
    return array_values($a1) === array_values($a2);
}
$tok = new WhitespaceAndPunctuationTokenizer();

// Each case is: input text, expected tokens, message reported on mismatch.
$cases = array(
    // ASCII text separated by spaces, newlines, runs of spaces and tabs.
    array(
        "This is a simple space delimited string\nwith new lines and many     spaces between the words.\nAlso\ttabs\ttabs\ttabs\ttabs",
        array('This', 'is', 'a', 'simple', 'space', 'delimited', 'string', 'with', 'new', 'lines', 'and', 'many', 'spaces', 'between', 'the', 'words', '.', 'Also', 'tabs', 'tabs', 'tabs', 'tabs'),
        "Problem tokenizing simple ASCII whitespace with ascii content",
    ),
    // Multi-byte (Greek) words; punctuation is expected as separate tokens.
    array(
        "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων",
        array('Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf', '-', '8', 'χαρακτήρων'),
        "Problem tokenizing simple ASCII whitespace with utf-8 content",
    ),
    // Trailing whitespace must not produce tokens; per the message this is
    // meant to exercise UTF-8 (non-breaking) whitespace.
    array(
        "Here exists non-breaking space   ",
        array('Here', 'exists', 'non', '-', 'breaking', 'space'),
        "Problem tokenizing utf-8 whitespace",
    ),
);

// Run the cases in order, reporting each through the shared _assert helper.
foreach ($cases as $case) {
    list($s, $tokens, $message) = $case;
    _assert(arrays_match($tok->tokenize($s), $tokens), $message);
}