<?php
/**
 * Convert data from Perl's TextCat LM format to PHP format
 * used by this tool.
 *
 * Usage: php <this script> INPUTDIR OUTPUTDIR
 *
 * Every regular file found directly in INPUTDIR is read line by line. Each
 * line is split on a tab followed by a space into an n-gram and its score
 * (converted with intval()). The resulting n-gram => score map is handed to
 * TextCat::writeLanguageFile() with a target path of the same file name
 * inside OUTPUTDIR.
 *
 * Exit status: 0 on success, 1 on a usage error or an unusable directory.
 */

require_once __DIR__ . '/TextCat.php';

if ($argc != 3) {
    // die() with a string exits with status 0; report the usage error on
    // STDERR and exit non-zero so calling scripts can detect it.
    fwrite(STDERR, "Use {$argv['0']} INPUTDIR OUTPUTDIR\n");
    exit(1);
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// DirectoryIterator throws on a path it cannot open; fail with a clear message instead.
if (!is_dir($inputDir)) {
    fwrite(STDERR, "Input directory not found: {$inputDir}\n");
    exit(1);
}

// The trailing is_dir() re-check tolerates another process creating the
// directory between the first check and mkdir().
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory: {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);

foreach (new DirectoryIterator($inputDir) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $path = $file->getPathname();
    $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        fwrite(STDERR, "Cannot read {$path}, skipping\n");
        continue;
    }

    $ngrams = array();
    foreach ($lines as $line) {
        $parts = explode("\t ", $line, 2);
        if (count($parts) !== 2) {
            // Without the separator there is no score to read; skip the line
            // rather than recording the whole line as an n-gram scored 0.
            fwrite(STDERR, "Skipping malformed line in {$path}: {$line}\n");
            continue;
        }
        $ngrams[$parts[0]] = intval($parts[1]);
    }

    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename());
}

exit(0);
-t NUM indicates the topmost number of ngrams that should be used. Default: 3000 -u NUM determines how much worse result must be in order not to be mentioned as an alternative. Typical value: 1.05 or 1.1. Default: 1.05. HELP; echo $help; exit(0); } if (!empty($options['d'])) { $dirs = explode(",", $options['d']); } else { $dirs = array(__DIR__ . "/LM"); } $cat = new TextCat($dirs); if (!empty($options['t'])) { $cat->setMaxNgrams(intval($options['t'])); } if (!empty($options['f'])) { $cat->setMinFreq(intval($options['f'])); } $input = isset($options['l']) ? $options['l'] : file_get_contents("php://stdin"); if (!empty($options['c'])) { $result = $cat->classify($input, explode(",", $options['c'])); } else { $result = $cat->classify($input); } if (empty($result)) { echo "No match found.\n"; exit(1);
<?php
/**
 * Generate ngrams data from text files.
 * Run: php felis.php INPUTDIR OUTPUTDIR
 * INPUTDIR should contain text files e.g. english.txt
 * OUTPUTDIR would contain ngrams files e.g. english.lm
 *
 * Each regular file found directly in INPUTDIR is read in full and passed to
 * TextCat::createLM() together with $maxNgrams; the returned n-grams are
 * written through TextCat::writeLanguageFile() to
 * OUTPUTDIR/<basename without ".txt">.lm.
 *
 * Exit status: 0 on success, 1 on a usage error or an unusable directory.
 */

// Language model generation failing?
// up your memory limit or set $minFreq >0 in TextCat.php
// ini_set('memory_limit', '2000000000');

require_once __DIR__ . '/TextCat.php';

// TODO: add option to control model ngram count
$maxNgrams = 4000;

if ($argc != 3) {
    // die() with a string exits with status 0; report the usage error on
    // STDERR and exit non-zero so calling scripts can detect it.
    fwrite(STDERR, "Use {$argv['0']} INPUTDIR OUTPUTDIR\n");
    exit(1);
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// DirectoryIterator throws on a path it cannot open; fail with a clear message instead.
if (!is_dir($inputDir)) {
    fwrite(STDERR, "Input directory not found: {$inputDir}\n");
    exit(1);
}

// The trailing is_dir() re-check tolerates another process creating the
// directory between the first check and mkdir().
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory: {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);

foreach (new DirectoryIterator($inputDir) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $path = $file->getPathname();
    $text = file_get_contents($path);
    if ($text === false) {
        // Do not feed a failed read into createLM(); skip this file instead.
        fwrite(STDERR, "Cannot read {$path}, skipping\n");
        continue;
    }

    $ngrams = $cat->createLM($text, $maxNgrams);
    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename(".txt") . ".lm");
}

exit(0);