コード例 #1
0
ファイル: lm2php.php プロジェクト: smalyshev/textcat
<?php

/**
 * Convert data from Perl's TextCat LM format to PHP format
 * used by this tool.
 *
 * Usage: php <this script> INPUTDIR OUTPUTDIR
 *
 * Every regular file in INPUTDIR is read line by line; each line is
 * expected to hold an ngram and its score separated by a tab. The
 * resulting ngram => score table is handed to TextCat::writeLanguageFile()
 * together with a path of the same base name inside OUTPUTDIR.
 *
 * Exit status: 0 on success, 1 on a usage error or when OUTPUTDIR
 * cannot be created.
 */
require_once __DIR__ . '/TextCat.php';

if ($argc != 3) {
    // Report usage errors on stderr with a non-zero status so that
    // calling scripts can detect the failure (die() with a string exits 0).
    fwrite(STDERR, "Use {$argv[0]} INPUTDIR OUTPUTDIR\n");
    exit(1);
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// The second is_dir() covers the case where the directory appeared
// between the first check and the mkdir() call.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);

foreach (new DirectoryIterator($inputDir) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $lines = file($file->getPathname(), FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        continue;
    }

    $ngrams = array();
    foreach ($lines as $line) {
        // Split on the tab only: intval() ignores the space that may follow
        // it, so both "ngram\t 123" and "ngram\t123" are accepted.
        $parts = explode("\t", $line, 2);
        if (count($parts) !== 2) {
            // No separator on this line - nothing usable to record.
            continue;
        }
        $ngrams[$parts[0]] = intval($parts[1]);
    }

    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename());
}
exit(0);
コード例 #2
0
ファイル: felis.php プロジェクト: smalyshev/textcat
<?php

/**
 * Generate ngrams data from text files.
 * Run: php felis.php INPUTDIR OUTPUTDIR
 * INPUTDIR should contain text files e.g. english.txt
 * OUTPUTDIR would contain ngrams files e.g. english.lm
 *
 * Exit status: 0 on success, 1 on a usage error or when OUTPUTDIR
 * cannot be created.
 */
// Language model generation failing?
// up your memory limit or set $minFreq >0 in TextCat.php
// ini_set('memory_limit', '2000000000');
require_once __DIR__ . '/TextCat.php';

// TODO: add option to control model ngram count
$maxNgrams = 4000;

if ($argc != 3) {
    // Report usage errors on stderr with a non-zero status so that
    // calling scripts can detect the failure (die() with a string exits 0).
    fwrite(STDERR, "Use {$argv[0]} INPUTDIR OUTPUTDIR\n");
    exit(1);
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// The second is_dir() covers the case where the directory appeared
// between the first check and the mkdir() call.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);

foreach (new DirectoryIterator($inputDir) as $file) {
    if (!$file->isFile()) {
        continue;
    }

    $text = file_get_contents($file->getPathname());
    if ($text === false) {
        // Do not build a model from a failed read; report it and move on.
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        continue;
    }

    $ngrams = $cat->createLM($text, $maxNgrams);
    // "english.txt" becomes "english.lm"; names without a .txt suffix
    // simply get ".lm" appended.
    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename(".txt") . ".lm");
}
exit(0);