Ejemplo n.º 1
0
<?php

/**
 * Convert data from Perl's TextCat LM format to PHP format
 * used by this tool.
 */
require_once __DIR__ . '/TextCat.php';
// Usage errors are reported on STDERR with a non-zero exit status.
// (die() with a string argument prints to STDOUT and exits with status 0,
// which makes a failed invocation look successful to the calling shell.)
if ($argc !== 3) {
    fwrite(STDERR, "Use {$argv[0]} INPUTDIR OUTPUTDIR\n");
    exit(1);
}
$inputDir = $argv[1];
$outputDir = $argv[2];

// Create the output directory (with any missing parents) when it is absent.
// The trailing is_dir() re-check tolerates another process creating the
// directory between the test and the mkdir() call.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);
// Becomes 1 as soon as any input file could not be read.
$status = 0;
foreach (new DirectoryIterator($inputDir) as $file) {
    // Only regular files are converted; "." / ".." and sub-directories are ignored.
    if (!$file->isFile()) {
        continue;
    }
    $lines = file($file->getPathname(), FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        $status = 1;
        continue;
    }
    $ngrams = array();
    $skipped = 0;
    foreach ($lines as $line) {
        // Split each line on the tab-plus-space separator into ngram and score.
        $parts = explode("\t ", $line, 2);
        if (count($parts) < 2) {
            // No separator on this line: there is no score to read, so do not
            // invent a zero-score entry keyed by the whole line.
            $skipped++;
            continue;
        }
        $ngrams[$parts[0]] = intval($parts[1]);
    }
    if ($skipped > 0) {
        fwrite(STDERR, "{$file->getPathname()}: skipped {$skipped} line(s) without a tab-space separator\n");
    }
    // The converted model keeps the input file's name inside the output directory.
    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename());
}
exit($status);
Ejemplo n.º 2
0
    -t NUM  indicates the topmost number of ngrams that should be used.
            Default: 3000
    -u NUM  determines how much worse result must be in order not to be
            mentioned as an alternative. Typical value: 1.05 or 1.1.
            Default: 1.05.

HELP;
    echo $help;
    exit(0);
}
// Model directories: a comma-separated list taken from -d when given,
// otherwise the LM directory that sits next to this script.
// ($options is populated before this section; its construction is not visible here.)
$dirs = empty($options['d'])
    ? array(__DIR__ . "/LM")
    : explode(",", $options['d']);
$cat = new TextCat($dirs);

// Optional tuning knobs. An empty or "0" value leaves the TextCat default untouched.
// NOTE(review): the help text above also lists -u, but no handling of it is
// visible in this section - confirm whether that is intentional.
if (!empty($options['t'])) {
    $cat->setMaxNgrams((int) $options['t']);
}
if (!empty($options['f'])) {
    $cat->setMinFreq((int) $options['f']);
}

// Text to classify: the -l value when present, otherwise everything on STDIN.
if (isset($options['l'])) {
    $input = $options['l'];
} else {
    $input = file_get_contents("php://stdin");
}

// -c, when given, is split on commas and passed as the second argument to classify().
$result = empty($options['c'])
    ? $cat->classify($input)
    : $cat->classify($input, explode(",", $options['c']));
if (empty($result)) {
    echo "No match found.\n";
    exit(1);
Ejemplo n.º 3
0
<?php

/**
 * Generate ngrams data from text files.
 * Run: php felis.php INPUTDIR OUTPUTDIR [MAXNGRAMS]
 * INPUTDIR should contain text files e.g. english.txt
 * OUTPUTDIR would contain ngrams files e.g. english.lm
 * MAXNGRAMS (optional) is the ngram limit passed to TextCat::createLM();
 * it defaults to 4000 when omitted.
 */
// Language model generation failing?
// up your memory limit or set $minFreq >0 in TextCat.php
// ini_set('memory_limit', '2000000000');
require_once __DIR__ . '/TextCat.php';

// Usage errors are reported on STDERR with a non-zero exit status.
// (die() with a string argument prints to STDOUT and exits with status 0.)
if ($argc < 3 || $argc > 4) {
    fwrite(STDERR, "Use {$argv[0]} INPUTDIR OUTPUTDIR [MAXNGRAMS]\n");
    exit(1);
}
$inputDir = $argv[1];
$outputDir = $argv[2];

// Default ngram limit, used when no third argument is supplied.
$maxNgrams = 4000;
if ($argc === 4) {
    // Accept only a plain positive decimal integer.
    if (!preg_match('/^[1-9][0-9]*$/', $argv[3])) {
        fwrite(STDERR, "MAXNGRAMS must be a positive integer\n");
        exit(1);
    }
    $maxNgrams = (int) $argv[3];
}

// Create the output directory (with any missing parents) when it is absent.
// The trailing is_dir() re-check tolerates another process creating the
// directory between the test and the mkdir() call.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    fwrite(STDERR, "Cannot create output directory {$outputDir}\n");
    exit(1);
}

$cat = new TextCat($outputDir);
// Becomes 1 as soon as any input file could not be read.
$status = 0;
foreach (new DirectoryIterator($inputDir) as $file) {
    // Only regular files are processed; "." / ".." and sub-directories are ignored.
    if (!$file->isFile()) {
        continue;
    }
    $text = file_get_contents($file->getPathname());
    if ($text === false) {
        fwrite(STDERR, "Cannot read {$file->getPathname()}, skipping\n");
        $status = 1;
        continue;
    }
    $ngrams = $cat->createLM($text, $maxNgrams);
    // english.txt becomes english.lm; a name without ".txt" just gains ".lm".
    $cat->writeLanguageFile($ngrams, $outputDir . "/" . $file->getBasename(".txt") . ".lm");
}
exit($status);