Exemple #1
0
/**
 * Makes an n or all word gram Bloom filter based on the supplied arguments.
 * The input file is assumed to have been placed in PREP_DIR before this
 * is run; the resulting filter is written by NWordGrams (per the original
 * description, into the resources folder of the given locale).
 *
 * Argument positions, as consumed by this function:
 *   $args[0] name of the input file inside PREP_DIR (required)
 *   $args[1] defaults to "en" (presumably the language of the input --
 *            confirm against NWordGrams::makeNWordGramsTextFile)
 *   $args[2] defaults to "en-US" when $args[1] is also missing, otherwise
 *            to the value of $args[1] (presumably the locale tag)
 *   $args[3] number of words per gram, or "all"; defaults to 2 (bigrams)
 *   $args[4] defaults to NWordGrams::PAGE_COUNT_WIKIPEDIA (presumably the
 *            type of the input file -- confirm against NWordGrams)
 *   $args[5] defaults to 400000 when $args[3] is "all" and $args[4] is
 *            NWordGrams::PAGE_COUNT_WIKIPEDIA, otherwise to -1
 *            (NOTE(review): looks like a cap on the number of grams kept,
 *            with -1 meaning no cap -- confirm against NWordGrams)
 *
 * If the input file cannot be found a message is echoed and the script
 * exits.
 *
 * @param array $args command line arguments with first two elements of $argv
 *     removed. For details on which arguments do what see the $usage variable
 */
function makeNWordGramsFiles($args)
{
    if (!isset($args[1])) {
        $args[1] = "en";
        $args[2] = "en-US";
    }
    if (!isset($args[2])) {
        $args[2] = $args[1];
    }
    if (!isset($args[3])) {
        $args[3] = 2;
        // bigrams
    }
    // This must test $args, not $argv: $argv is not in scope inside a
    // function, so testing it would always discard the caller's value.
    if (!isset($args[4])) {
        $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
    }
    // Only fill in a default when the caller gave no value; a supplied
    // $args[5] is passed through untouched. Strict comparison with "all"
    // avoids the PHP < 8 quirk where the integer 0 loosely equals "all".
    if (!isset($args[5])) {
        $use_all_grams_default = ($args[3] === "all"
            && $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA);
        $args[5] = $use_all_grams_default ? 400000 : -1;
    }
    $wiki_file_path = PREP_DIR . "/";
    // An empty file name would make file_exists() test the directory
    // itself (which exists), so treat a missing name as "not found" too.
    $file_name = isset($args[0]) ? $args[0] : "";
    if ($file_name === "" || !file_exists($wiki_file_path . $file_name)) {
        echo $file_name . " does not exist in {$wiki_file_path}";
        exit;
    }
    /*
     * This call creates an n word grams text file from the input file and
     * returns the count of n word grams in the text file together with
     * the maximum gram length.
     */
    list($num_ngrams, $max_gram_len) = NWordGrams::makeNWordGramsTextFile(
        $file_name,
        $args[1],
        $args[2],
        $args[3],
        $args[4],
        $args[5]
    );
    /*
     * This call creates a bloom filter file from the n word grams text file
     * based on the language specified. The lang passed as parameter is
     * prefixed to the filter file name. The count of n word grams in the
     * text file is passed as a parameter to set the limit of n word grams
     * in the filter file.
     */
    NWordGrams::makeNWordGramsFilterFile(
        $args[2],
        $args[3],
        $num_ngrams,
        $max_gram_len
    );
}