コード例 #1
0
ファイル: felis.php プロジェクト: smalyshev/textcat
<?php

/**
 * Generate ngrams data from text files.
 * Run: php felis.php INPUTDIR OUTPUTDIR
 * INPUTDIR should contain text files, e.g. english.txt
 * OUTPUTDIR will contain the generated ngram files, e.g. english.lm
 */
// If language model generation fails, raise your memory limit
// or set $minFreq to a value greater than 0 in TextCat.php.
// ini_set('memory_limit', '2000000000');
require_once __DIR__ . '/TextCat.php';
// TODO: add option to control model ngram count
// Upper bound passed to TextCat::createLM() for every generated model.
$maxNgrams = 4000;

// Exactly two arguments are required: the input and the output directory.
// Report misuse on stderr with a non-zero status so calling scripts can
// detect the failure (die() with a string would exit with status 0).
if ($argc !== 3) {
    fwrite(STDERR, "Use {$argv[0]} INPUTDIR OUTPUTDIR\n");
    exit(1);
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// Fail early with a readable message instead of an uncaught exception
// from DirectoryIterator further down.
if (!is_dir($inputDir)) {
    fwrite(STDERR, "Input directory not found: {$inputDir}\n");
    exit(1);
}

// Create the output directory on demand. is_dir() (rather than file_exists())
// rejects a regular file sitting at that path, and the trailing is_dir()
// re-check tolerates another process creating the directory concurrently.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory: {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);
$hadErrors = false;

// Build one .lm file per regular file found directly inside the input directory.
foreach (new DirectoryIterator($inputDir) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $text = file_get_contents($file->getPathname());
    if ($text === false) {
        // Do not feed `false` into createLM(): that would produce a bogus
        // model file. Skip the input and remember that the run was incomplete.
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        $hadErrors = true;
        continue;
    }

    $ngrams = $cat->createLM($text, $maxNgrams);
    // "english.txt" becomes "<OUTPUTDIR>/english.lm"; only a ".txt" suffix is stripped.
    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename(".txt") . ".lm");
}

// Non-zero status when at least one input file could not be read.
exit($hadErrors ? 1 : 0);