/
felis.php
32 lines (28 loc) · 909 Bytes
/
felis.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
<?php
/**
* Generate ngrams data from text files.
* Run: php felis.php INPUTDIR OUTPUTDIR
* INPUTDIR should contain text files, e.g. english.txt
* OUTPUTDIR will contain the generated ngram files, e.g. english.lm
*/
// If language model generation fails, raise the memory limit (see below)
// or set $minFreq to a value greater than 0 in TextCat.php.
// ini_set('memory_limit', '2000000000');
require_once __DIR__.'/TextCat.php';
// TODO: add option to control model ngram count
// Passed to TextCat::createLM() as its second argument for every input file
// (presumably the cap on ngrams kept per model -- confirm in TextCat.php).
$maxNgrams = 4000;

// Exactly two arguments are required. Report misuse on STDERR and exit with a
// non-zero status: die() with a string prints to stdout and exits with 0,
// which hides the failure from calling scripts.
if ( $argc !== 3 ) {
	fwrite( STDERR, "Use $argv[0] INPUTDIR OUTPUTDIR\n" );
	exit( 1 );
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// Validate the input directory up front so that a typo neither creates the
// output directory nor surfaces as an uncaught DirectoryIterator exception.
if ( !is_dir( $inputDir ) ) {
	fwrite( STDERR, "Input directory $inputDir does not exist or is not a directory\n" );
	exit( 1 );
}

// Create the output directory when needed. is_dir() is used instead of
// file_exists() so that an existing regular file at that path is rejected,
// and the second is_dir() tolerates another process creating it concurrently.
if ( !is_dir( $outputDir ) && !mkdir( $outputDir, 0755, true ) && !is_dir( $outputDir ) ) {
	fwrite( STDERR, "Cannot create output directory $outputDir\n" );
	exit( 1 );
}

$cat = new TextCat( $outputDir );

// Becomes 1 as soon as any input file cannot be read.
$status = 0;

foreach ( new DirectoryIterator( $inputDir ) as $file ) {
	// Skip ".", ".." and anything else that is not a regular file.
	if ( !$file->isFile() ) {
		continue;
	}

	$path = $file->getPathname();
	$text = file_get_contents( $path );
	if ( $text === false ) {
		// Keep processing the remaining files, but remember the failure.
		fwrite( STDERR, "Cannot read $path, skipping\n" );
		$status = 1;
		continue;
	}

	$ngrams = $cat->createLM( $text, $maxNgrams );

	// english.txt -> OUTPUTDIR/english.lm; names without a ".txt" suffix
	// keep their full base name and just get ".lm" appended.
	$cat->writeLanguageFile( $ngrams, $outputDir . "/" . $file->getBasename( ".txt" ) . ".lm" );
}

exit( $status );