/**
 * Creates a bloom filter file from an n word gram text file. The
 * path of the n word gram text file is built from LOCALE_DIR, the input
 * $lang, $num_gram and self::TEXT_SUFFIX. The output filter file lives in
 * the same resources folder and is named from $num_gram and
 * self::FILTER_SUFFIX; any existing filter file at that path is deleted
 * first. The filter is sized using $num_ngrams_found.
 *
 * Each line of the text file is trimmed, passed through
 * PhraseParser::stemTerms() for $lang, re-joined with single spaces,
 * lower-cased and added to the filter. Blank lines, lines that yield
 * no terms, and n grams whose first term is one byte long are skipped.
 *
 * Terminates the script via die() if the text file cannot be opened.
 *
 * @param string $lang locale to be used to stem n grams.
 * @param string $num_gram value of n in n-gram (how many words in sequence
 *      should constitute a gram)
 * @param int $num_ngrams_found count of n word grams in text file.
 * @param int $max_gram_len value n of longest n gram to be added.
 * @return void
 */
static function makeNWordGramsFilterFile($lang, $num_gram,
    $num_ngrams_found, $max_gram_len = 2)
{
    $resources_dir = LOCALE_DIR . "/{$lang}/resources/";
    $filter_path = $resources_dir . "{$num_gram}" . self::FILTER_SUFFIX;
    if (file_exists($filter_path)) {
        unlink($filter_path); //build again from scratch
    }
    $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found);

    $inputFilePath = $resources_dir . "{$num_gram}" . self::TEXT_SUFFIX;
    $fp = fopen($inputFilePath, 'r');
    if (!$fp) {
        die("Can't open ngrams text file");
    }
    while (($ngram = fgets($fp)) !== false) {
        $ngram = trim($ngram);
        if ($ngram === '') {
            // blank line (e.g. trailing newline at end of file): nothing
            // to stem, and we must not add "" to the filter
            continue;
        }
        $words = PhraseParser::stemTerms($ngram, $lang);
        if (empty($words)) {
            // stemming produced no terms, so there is no n gram to store
            continue;
        }
        // NOTE(review): strlen() counts bytes, so only single-byte first
        // terms are dropped here; confirm whether multi-byte single
        // characters should be filtered too before switching to mb_strlen
        if (isset($words[0]) && strlen($words[0]) == 1) {
            // get rid of n grams like "a dog"
            continue;
        }
        $ngram_stemmed = implode(" ", $words);
        $ngrams->add(mb_strtolower($ngram_stemmed));
    }
    fclose($fp);
    $ngrams->max_gram_len = $max_gram_len;
    $ngrams->save();
}