/**
 * Assemble a TrainingSet out of a directory tree laid out as follows:
 * every non-file, non-dot entry directly under $dir is treated as one class,
 * named after that entry's base name, and every regular file directly inside
 * such an entry is one document of that class.
 *
 * Each document is read fully into memory and tokenized with a
 * WhitespaceAndPunctuationTokenizer before being added to the set.
 *
 * @param  string      $dir Path of the directory holding the class subdirectories
 * @return TrainingSet The set populated with one TokensDocument per file found
 */
public static function buildTrainingSet($dir)
{
    $tokenizer = new WhitespaceAndPunctuationTokenizer();
    $trainingSet = new TrainingSet();

    foreach (new DirectoryIterator($dir) as $classEntry) {
        // Only entries that are neither plain files nor "." / ".." name a class.
        if (!$classEntry->isFile() && !$classEntry->isDot()) {
            $className = $classEntry->getBasename();
            $classPath = $classEntry->getPathname();

            foreach (new DirectoryIterator($classPath) as $documentEntry) {
                // Anything that is not a regular file is not a document.
                if ($documentEntry->isFile()) {
                    $contents = file_get_contents($documentEntry->getPathname());
                    $document = new TokensDocument($tokenizer->tokenize($contents));
                    $trainingSet->addDocument($className, $document);
                }
            }
        }
    }

    return $trainingSet;
}
<?php

/*
 * Sanity checks for WhitespaceAndPunctuationTokenizer: each case tokenizes a
 * fixed string and compares the result with a hand-written token list.
 * _assert() is not defined in this file; presumably it comes from testing.php.
 */

include '../../../autoloader.php';
include '../../testing.php';

use NlpTools\Tokenizers\WhitespaceAndPunctuationTokenizer;

/**
 * Tell whether two token lists hold the same values in the same order.
 *
 * A bare array_diff($a1, $a2) only lists the values of $a1 that are absent
 * from $a2, so an empty or truncated $a1, reordered tokens or wrong duplicate
 * counts would all be reported as a match. Comparing the re-indexed lists
 * with === checks length, order and every value at once, while ignoring the
 * original array keys.
 *
 * @param  array $a1 The tokens actually produced
 * @param  array $a2 The tokens that were expected
 * @return bool  True only when both lists are identical element by element
 */
function arrays_match($a1, $a2)
{
    return array_values($a1) === array_values($a2);
}

$tok = new WhitespaceAndPunctuationTokenizer();

// Case 1: ASCII text separated by spaces, newlines and tabs; the full stop
// is expected to come out as a token of its own.
$s = "This is a simple space delimited string\nwith new lines and many spaces between the words.\nAlso\ttabs\ttabs\ttabs\ttabs";
$tokens = array(
    'This', 'is', 'a', 'simple', 'space', 'delimited', 'string',
    'with', 'new', 'lines', 'and', 'many', 'spaces', 'between', 'the', 'words', '.',
    'Also', 'tabs', 'tabs', 'tabs', 'tabs',
);
_assert(
    arrays_match($tok->tokenize($s), $tokens),
    "Problem tokenizing simple ASCII whitespace with ascii content"
);

// Case 2: multi-byte (Greek) words separated by ASCII spaces; the hyphen in
// "utf-8" is expected to split the word and be emitted as a separate token.
$s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
$tokens = array('Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf', '-', '8', 'χαρακτήρων');
_assert(
    arrays_match($tok->tokenize($s), $tokens),
    "Problem tokenizing simple ASCII whitespace with utf-8 content"
);

// Case 3: judging by the assertion message, the separators in this literal
// are meant to be non-ASCII whitespace -- keep its bytes intact when editing.
$s = "Here exists non-breaking space ";
$tokens = array('Here', 'exists', 'non', '-', 'breaking', 'space');
_assert(
    arrays_match($tok->tokenize($s), $tokens),
    "Problem tokenizing utf-8 whitespace"
);