/**
 * Makes an n or all word gram Bloom filter based on the supplied arguments.
 * Wikipedia files are assumed to have been placed in the PREP_DIR before this
 * is run; the resulting filter is written into the resources folder of the
 * given locale.
 *
 * Arguments that are missing are filled in with defaults; arguments the
 * caller did supply are left untouched:
 *   $args[0] name of the input file, looked up relative to PREP_DIR
 *   $args[1] language, defaults to "en" (and then $args[2] to "en-US")
 *   $args[2] locale, defaults to $args[1]
 *   $args[3] number of words per gram or "all", defaults to 2 (bigrams)
 *   $args[4] input type, defaults to NWordGrams::PAGE_COUNT_WIKIPEDIA
 *   $args[5] defaults to 400000 when making "all" grams from a
 *      PAGE_COUNT_WIKIPEDIA input and to -1 otherwise (presumably a cap on
 *      the number of grams kept, with -1 meaning no cap -- confirm against
 *      NWordGrams::makeNWordGramsTextFile)
 *
 * @param array $args command line arguments with first two elements of $argv
 *      removed. For details on which arguments do what see the $usage variable
 */
function makeNWordGramsFiles($args)
{
    if (!isset($args[1])) {
        $args[1] = "en";
        $args[2] = "en-US";
    }
    if (!isset($args[2])) {
        $args[2] = $args[1];
    }
    if (!isset($args[3])) {
        $args[3] = 2; // bigrams
    }
    // must test $args, not $argv: $argv is not defined inside this function,
    // so testing it would always clobber a caller supplied value
    if (!isset($args[4])) {
        $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
    }
    // only pick a default when the caller did not give a value; the input
    // type lives in $args[4] ($args[2] is the locale)
    if (!isset($args[5])) {
        if ($args[3] === "all" &&
            $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
            $args[5] = 400000;
        } else {
            $args[5] = -1;
        }
    }
    $wiki_file_path = PREP_DIR . "/";
    if (!file_exists($wiki_file_path . $args[0])) {
        echo $args[0] . " does not exist in {$wiki_file_path}";
        exit;
    }
    /*
     * This call creates an ngrams text file from the input xml file and
     * returns the count of ngrams in the text file together with the
     * maximum gram length.
     */
    list($num_ngrams, $max_gram_len) =
        NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2],
        $args[3], $args[4], $args[5]);
    /*
     * This call creates a bloom filter file from the n word grams text file
     * based on the language specified. The lang passed as parameter is
     * prefixed to the filter file name. The count of n word grams in the
     * text file is passed as a parameter to set the limit of n word grams
     * in the filter file.
     */
    NWordGrams::makeNWordGramsFilterFile($args[2], $args[3], $num_ngrams,
        $max_gram_len);
}
/**
 * Used to split a string of text in the language given by $locale into
 * space separated words. Ex: "acontinuousstringofwords" becomes
 * "a continuous string of words". It operates by scanning from the end of
 * the string to the front and splitting on the longest segment that is a
 * word.
 *
 * Word membership is decided by NWordGrams::ngramsContains() using the
 * "segment" filter of $locale. Runs of ASCII characters are kept together
 * without consulting the filter. A string of at most one character is
 * returned unchanged. For longer strings every emitted word is followed by
 * a space, so the result ends with a trailing space.
 *
 * @param string $segment string to make into a string of space separated
 *      words
 * @param string $locale IANA tag used to look up dictionary filter to
 *      use to do this segmenting
 * @return string space separated words
 */
static function reverseMaximalMatch($segment, $locale)
{
    $len = mb_strlen($segment);
    // index (in characters) of the character currently being examined
    $cur_pos = $len - 1;
    if ($cur_pos < 1) {
        // empty or single character input: nothing to split
        return $segment;
    }
    /* Output is accumulated back to front: each word is appended
       byte-reversed (strrev) and the whole buffer is byte-reversed once at
       the end. Reversing a concatenation of reversed pieces restores the
       bytes of each piece and flips the piece order, so multi-byte
       characters come out intact and words end up in reading order.
     */
    $out_segment = "";
    /* $word_len is maintained throughout, but the only place it is read is
       as the length argument of mb_substr immediately after it has been
       reset to 1
     */
    $word_len = 2;
    $char_added = mb_substr($segment, $cur_pos, 1);
    // candidate word grown leftwards one character at a time
    $word_guess = $char_added;
    // true while every character in the current candidate is ASCII
    $is_ascii = mb_check_encoding($char_added, 'ASCII');
    $was_space = trim($char_added) == "";
    // whether the most recent step flushed a word into $out_segment
    $added = false;
    // position of the last candidate that matched a "*"-prefixed filter
    // entry, or -1 if there is none to fall back to
    $suffix_backup = -1;
    $suffix_check = true;
    while ($cur_pos >= 0) {
        // remember the candidate before the next character is prepended
        $old_word_guess = $word_guess;
        $cur_pos--;
        /* NOTE(review): when $cur_pos has just become -1, mb_substr with a
           negative start counts from the end of the string, so this looks
           like it re-reads the last character of $segment rather than
           yielding an empty string -- confirm this wrap-around is harmless
           for all filter contents.
         */
        $char_added = mb_substr($segment, $cur_pos, 1);
        $is_space = trim($char_added) == "";
        if ($is_space && $was_space) {
            // collapse consecutive whitespace
            continue;
        }
        if (!$is_space) {
            $word_guess = $char_added . $word_guess;
            $was_space = false;
        } else {
            $was_space = true;
        }
        if ($suffix_check) {
            // presumably the filter stores word suffixes as "*" . suffix;
            // verify against how the segment filter is built
            $is_suffix = NWordGrams::ngramsContains("*" . $word_guess,
                $locale, "segment");
        } else {
            // skipped for one iteration right after restoring a backup
            $is_suffix = false;
        }
        $suffix_check = true;
        $is_ascii = $is_ascii && mb_check_encoding($char_added, 'ASCII') &&
            !$is_space;
        if ($is_suffix || $is_ascii) {
            // candidate can still grow: keep extending it
            $word_len++;
            $added = false;
            if ($is_suffix) {
                $suffix_backup = $cur_pos;
                $suffix_guess = $word_guess;
            }
        } else {
            if ($suffix_backup >= 0) {
                // candidate grew too far: rewind to the last position where
                // it was still a known suffix and retry from there
                $cur_pos = $suffix_backup;
                $suffix_backup = -1;
                $suffix_check = false;
                $word_guess = $suffix_guess;
                $word_len = strlen($word_guess);
            } else {
                if (NWordGrams::ngramsContains($word_guess, $locale,
                    "segment") || $is_space) {
                    // candidate including the current character is flushed;
                    // the next candidate starts one character further left
                    $out_segment .= " " . strrev($word_guess);
                    $cur_pos--;
                    $suffix_backup = -1;
                    $word_len = 1;
                    $word_guess = mb_substr($segment, $cur_pos, $word_len);
                    $is_ascii = mb_check_encoding($word_guess, 'ASCII');
                    $was_space = trim($char_added) == "";
                    $word_len++;
                    $added = true;
                } else {
                    // flush the candidate as it was before the current
                    // character; the current character starts the next one
                    $word_len = 1;
                    $suffix_backup = -1;
                    $out_segment .= " " . strrev($old_word_guess);
                    $word_guess = mb_substr($segment, $cur_pos, $word_len);
                    $is_ascii = mb_check_encoding($word_guess, 'ASCII');
                    $was_space = trim($char_added) == "";
                    $word_len++;
                    $added = true;
                }
            }
        }
    }
    if (!$added) {
        // the scan ended with a candidate that was never flushed
        if (NWordGrams::ngramsContains($old_word_guess, $locale, "segment")
            || mb_check_encoding($old_word_guess, 'ASCII')) {
            $out_segment .= " " . strrev($old_word_guess);
        } else {
            // not a known word: split off its first character as a
            // separate word
            $front = mb_substr($old_word_guess, 0, 1);
            $back = mb_substr($old_word_guess, 1);
            $out_segment .= " " . strrev($front) . " " . strrev($back);
        }
    }
    // undo the per-word reversal and restore reading order in one pass
    $out_segment = strrev($out_segment);
    return $out_segment;
}