Example #1
0
 /**
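  * Creates a bloom filter file from an n word gram text file. The
  * path of the n word gram text file is derived from the input $lang
  * and the number n, as is the name of the output filter file. The
  * filter is sized using the supplied count of n word grams. Any
  * existing filter file at the output path is deleted and rebuilt.
  * The n word grams are read from the text file one per line, passed
  * through the stemmer for $lang, lower-cased, and then stored in the
  * filter file. N grams whose first word is one byte long after
  * stemming (for example "a dog") are not stored.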
  *
  * @param string $lang locale to be used to stem n grams.
  * @param string $num_gram value of n in n-gram (how many words in sequence
  *      should constitute a gram)
  * @param int $num_ngrams_found count of n word grams in text file.
  * @param int $max_gram_len value n of longest n gram to be added.
  * @return void
  */
 static function makeNWordGramsFilterFile($lang, $num_gram, $num_ngrams_found, $max_gram_len = 2)
 {
     // The input text file and the output filter file share the same
     // locale-specific prefix and differ only in their suffix.
     $resource_prefix = LOCALE_DIR . "/{$lang}/resources/" . "{$num_gram}";
     $filter_path = $resource_prefix . self::FILTER_SUFFIX;
     $input_path = $resource_prefix . self::TEXT_SUFFIX;
     // Open the input before touching any existing filter, so that a
     // missing or unreadable text file does not leave the old filter
     // deleted with nothing to replace it.
     $fp = fopen($input_path, 'r') or die("Can't open ngrams text file");
     if (file_exists($filter_path)) {
         //build again from scratch
         unlink($filter_path);
     }
     $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found);
     while (($ngram = fgets($fp)) !== false) {
         $ngram = trim($ngram);
         if ($ngram === "") {
             // a blank line holds no n gram; adding "" would only
             // pollute the filter
             continue;
         }
         $words = PhraseParser::stemTerms($ngram, $lang);
         if (empty($words)) {
             // nothing survived stemming, so there is nothing to store
             continue;
         }
         // strlen counts bytes, so only single-byte first words match
         if (isset($words[0]) && strlen($words[0]) == 1) {
             // get rid of n grams like "a dog"
             continue;
         }
         $ngram_stemmed = implode(" ", $words);
         $ngrams->add(mb_strtolower($ngram_stemmed));
     }
     fclose($fp);
     // record the longest gram length on the filter object before it
     // is saved
     $ngrams->max_gram_len = $max_gram_len;
     $ngrams->save();
 }