<?php
/**
 * Convert data from Perl's TextCat LM format to the PHP format
 * used by this tool.
 *
 * Usage: php <this script> INPUTDIR OUTPUTDIR
 *
 * Every regular file in INPUTDIR is read line by line. Each line is split on
 * the "\t " separator into an ngram and its score; the resulting ngram => score
 * map is handed to TextCat::writeLanguageFile() and written to OUTPUTDIR under
 * the same base name as the input file.
 */
require_once __DIR__ . '/TextCat.php';

if ($argc != 3) {
    die("Use {$argv['0']} INPUTDIR OUTPUTDIR\n");
}

// Create the output directory on demand; stop early if that is impossible,
// since every later write would fail anyway.
if (!file_exists($argv[2]) && !mkdir($argv[2], 0755, true)) {
    fwrite(STDERR, "Cannot create output directory {$argv[2]}\n");
    exit(1);
}

$cat = new TextCat($argv[2]);

foreach (new DirectoryIterator($argv[1]) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $lines = file($file->getPathname(), FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        // file() returns false on read failure; do not emit a model for it.
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        continue;
    }

    $ngrams = array();
    foreach ($lines as $line) {
        $parts = explode("\t ", $line, 2);
        if (count($parts) !== 2) {
            // Line has no "ngram<TAB><SPACE>score" separator: skip it rather
            // than recording a bogus ngram with a score of 0.
            continue;
        }
        list($word, $score) = $parts;
        $ngrams[$word] = intval($score);
    }

    $cat->writeLanguageFile($ngrams, $argv[2] . "/" . $file->getBasename());
}

exit(0);
<?php
/**
 * Build ngram language models from plain-text samples.
 *
 * Usage: php felis.php INPUTDIR OUTPUTDIR
 *
 *   INPUTDIR  - directory holding the text samples, e.g. english.txt
 *   OUTPUTDIR - directory that receives the ngram files, e.g. english.lm
 */

// If language model generation fails, raise the memory limit
// or set $minFreq > 0 in TextCat.php.
// ini_set('memory_limit', '2000000000');

require_once __DIR__ . '/TextCat.php';

// TODO: add option to control model ngram count
$maxNgrams = 4000;

if ($argc !== 3) {
    exit("Use {$argv['0']} INPUTDIR OUTPUTDIR\n");
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// Create the output directory (and any missing parents) on first use.
if (!file_exists($outputDir)) {
    mkdir($outputDir, 0755, true);
}

$textCat = new TextCat($outputDir);

foreach (new DirectoryIterator($inputDir) as $entry) {
    // Only regular files are processed; everything else is ignored.
    if ($entry->isFile()) {
        $sample = file_get_contents($entry->getPathname());
        $model = $textCat->createLM($sample, $maxNgrams);

        // The model keeps the sample's base name, with ".txt" replaced by ".lm".
        $target = $outputDir . "/" . $entry->getBasename(".txt") . ".lm";
        $textCat->writeLanguageFile($model, $target);
    }
}

exit(0);