Example no. 1
0
/**
 * Makes an n or all word gram Bloom filter based on the supplied arguments
 * Wikipedia files are assumed to have been placed in the PREP_DIR before this
 * is run and writes it into the resources folder of the given locale
 *
 * @param array $args command line arguments with first two elements of $argv
 *     removed. For details on which arguments do what see the $usage variable
 */
function makeNWordGramsFiles($args)
{
    /* Expected layout of $args (defaults filled in below):
     *   [0] wiki xml file name (must exist in PREP_DIR)
     *   [1] language tag, [2] locale tag
     *   [3] n for the n-grams, or "all"
     *   [4] filter type constant
     *   [5] max number of n-grams to extract (-1 = no limit)
     */
    if (!isset($args[1])) {
        $args[1] = "en";
        $args[2] = "en-US";
    }
    if (!isset($args[2])) {
        $args[2] = $args[1];
    }
    if (!isset($args[3])) {
        $args[3] = 2;
        // bigrams
    }
    if (!isset($args[4])) {
        // Bug fix: the original tested isset($argv[4]); $argv is undefined
        // inside a function, so that check never reflected the caller's
        // argument and the intended default guard was broken.
        $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
    }
    // NOTE(review): comparing $args[2] (the locale) with a filter-type
    // constant looks suspicious -- $args[4] seems intended; confirm against
    // the tool's $usage text before changing.
    // NOTE(review): the else branch overwrites any caller-supplied $args[5]
    // with -1; confirm this clobbering is intentional.
    if (!isset($args[5]) && $args[3] == "all" && $args[2] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
        $args[5] = 400000;
    } else {
        $args[5] = -1;
    }
    $wiki_file_path = PREP_DIR . "/";
    if (!file_exists($wiki_file_path . $args[0])) {
        echo $args[0] . " does not exist in {$wiki_file_path}";
        exit;
    }
    /*
     * This call creates an n-grams text file from the input xml file and
     * returns the count of n-grams in the text file along with the longest
     * gram length encountered.
     */
    list($num_ngrams, $max_gram_len) = NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2], $args[3], $args[4], $args[5]);
    /*
     * This call creates a bloom filter file from the n word grams text file
     * based on the language specified. The lang passed as parameter is
     * prefixed to the filter file name. The count of n word grams in the
     * text file is passed as a parameter to set the limit of n word grams
     * in the filter file.
     */
    NWordGrams::makeNWordGramsFilterFile($args[2], $args[3], $num_ngrams, $max_gram_len);
}
Example no. 2
0
 /**
  * Used to split a string of text in the language given by $locale into
  * space separated words. Ex: "acontinuousstringofwords" becomes
  * "a continuous string of words". It operates by scanning from the end of
  * the string to the front and splitting on the longest segment that is a
  * word.
  *
  * Implementation note: each recognized word is appended to $out_segment
  * byte-reversed via strrev(); the single strrev() on the whole buffer at
  * the end un-reverses each word's bytes (restoring multi-byte sequences)
  * and simultaneously flips word order back to left-to-right.
  *
  * @param string $segment string to make into a string of space separated
  *     words
  * @param string $locale IANA tag used to look up dictionary filter to
  *     use to do this segmenting
  * @return string space separated words
  */
 static function reverseMaximalMatch($segment, $locale)
 {
     $len = mb_strlen($segment);
     $cur_pos = $len - 1;
     // Zero- or one-character input is already segmented
     if ($cur_pos < 1) {
         return $segment;
     }
     $out_segment = "";
     $word_len = 2;
     // Seed the candidate word with the last character of the segment
     $char_added = mb_substr($segment, $cur_pos, 1);
     $word_guess = $char_added;
     // Runs of ASCII characters are treated as words without a dictionary
     // lookup (see $is_ascii use below)
     $is_ascii = mb_check_encoding($char_added, 'ASCII');
     $was_space = trim($char_added) == "";
     $added = false;
     // $suffix_backup remembers the scan position of the longest candidate
     // that was still a known word-suffix, so we can backtrack to it;
     // -1 means no backtrack point is saved
     $suffix_backup = -1;
     $suffix_check = true;
     while ($cur_pos >= 0) {
         $old_word_guess = $word_guess;
         // Scan right-to-left, growing the candidate at its front
         $cur_pos--;
         $char_added = mb_substr($segment, $cur_pos, 1);
         $is_space = trim($char_added) == "";
         // Collapse runs of whitespace to a single break
         if ($is_space && $was_space) {
             continue;
         }
         if (!$is_space) {
             $word_guess = $char_added . $word_guess;
             $was_space = false;
         } else {
             $was_space = true;
         }
         // "*" . $word_guess asks the dictionary filter whether the current
         // candidate is the suffix of some known word
         if ($suffix_check) {
             $is_suffix = NWordGrams::ngramsContains("*" . $word_guess, $locale, "segment");
         } else {
             $is_suffix = false;
         }
         // Suffix checking is only suppressed for the one iteration right
         // after a backtrack (set below)
         $suffix_check = true;
         $is_ascii = $is_ascii && mb_check_encoding($char_added, 'ASCII') && !$is_space;
         if ($is_suffix || $is_ascii) {
             // Candidate can still grow into a longer word; keep extending
             $word_len++;
             $added = false;
             if ($is_suffix) {
                 // Save a backtrack point at the longest viable suffix
                 $suffix_backup = $cur_pos;
                 $suffix_guess = $word_guess;
             }
         } else {
             if ($suffix_backup >= 0) {
                 // Candidate stopped matching: rewind to the saved suffix
                 // position and retry from there (without re-running the
                 // suffix check we already know the answer to)
                 $cur_pos = $suffix_backup;
                 $suffix_backup = -1;
                 $suffix_check = false;
                 $word_guess = $suffix_guess;
                 // NOTE(review): strlen() counts bytes, but $word_len feeds
                 // mb_substr() character counts below -- looks like a
                 // multibyte bug; confirm before changing
                 $word_len = strlen($word_guess);
             } else {
                 if (NWordGrams::ngramsContains($word_guess, $locale, "segment") || $is_space) {
                     // Candidate (without the just-added char) is a known
                     // word, or we hit whitespace: emit it byte-reversed
                     // (undone by the final strrev) and restart a new
                     // candidate at the current position
                     $out_segment .= " " . strrev($word_guess);
                     $cur_pos--;
                     $suffix_backup = -1;
                     $word_len = 1;
                     $word_guess = mb_substr($segment, $cur_pos, $word_len);
                     $is_ascii = mb_check_encoding($word_guess, 'ASCII');
                     $was_space = trim($char_added) == "";
                     $word_len++;
                     $added = true;
                 } else {
                     // No match at all: emit the previous candidate (the
                     // longest match so far) and restart from this position
                     $word_len = 1;
                     $suffix_backup = -1;
                     $out_segment .= " " . strrev($old_word_guess);
                     $word_guess = mb_substr($segment, $cur_pos, $word_len);
                     $is_ascii = mb_check_encoding($word_guess, 'ASCII');
                     $was_space = trim($char_added) == "";
                     $word_len++;
                     $added = true;
                 }
             }
         }
     }
     // Flush the last candidate if the loop ended without emitting it
     if (!$added) {
         if (NWordGrams::ngramsContains($old_word_guess, $locale, "segment") || mb_check_encoding($old_word_guess, 'ASCII')) {
             $out_segment .= " " . strrev($old_word_guess);
         } else {
             // Unknown non-ASCII leftover: split off its first character
             // so something sensible is still output
             $front = mb_substr($old_word_guess, 0, 1);
             $back = mb_substr($old_word_guess, 1);
             $out_segment .= " " . strrev($front) . " " . strrev($back);
         }
     }
     // Un-reverse every word's bytes and restore left-to-right word order
     $out_segment = strrev($out_segment);
     return $out_segment;
 }