<?php /** * Generate ngrams data from text files. * Run: php felis.php INPUTDIR OUTPUTDIR * INPUTDIR should contain text files e.g. english.txt * OUTPUTDIR would contain ngrams files e.g. english.lm */ // Language model generation failing? // up your memory limit or set $minFreq >0 in TextCat.php // ini_set('memory_limit', '2000000000'); require_once __DIR__ . '/TextCat.php'; // TODO: add option to control model ngram count $maxNgrams = 4000; if ($argc != 3) { die("Use {$argv['0']} INPUTDIR OUTPUTDIR\n"); } if (!file_exists($argv[2])) { mkdir($argv[2], 0755, true); } $cat = new TextCat($argv[2]); foreach (new DirectoryIterator($argv[1]) as $file) { if (!$file->isFile()) { continue; } $ngrams = $cat->createLM(file_get_contents($file->getPathname()), $maxNgrams); $cat->writeLanguageFile($ngrams, $argv[2] . "/" . $file->getBasename(".txt") . ".lm"); } exit(0);