/**
 * Generates an n word grams text file from an input Wikipedia XML file.
 * The input file can be bz2 compressed or uncompressed.
 * The input XML file is parsed line by line and searched for the pattern
 * of an n word gram. Each n word gram found is added to an array. After
 * the complete file has been parsed, duplicate n word grams are removed
 * and the remaining grams are sorted. The resulting array is written to
 * a text file. The function returns the number of n word grams stored in
 * the text file together with the length of the longest gram seen.
 *
 * @param string $wiki_file compressed or uncompressed Wikipedia
 *     XML file path to be used to extract n word grams. This can also
 *     be a folder containing such files
 * @param string $lang language to be used to create n grams
 * @param string $locale locale to be used to store results
 * @param int $num_gram number of words in the grams we are looking for
 * @param int $ngram_type where in the Wiki dump to extract grams from
 * @param int $max_terms maximum number of n-grams to compute and put in
 *     the file
 * @return array pair ($num_ngrams_found, $max_gram_len), the count of
 *     n word grams written to the text file and the number of words in
 *     the longest gram seen
 */
static function makeNWordGramsTextFile($wiki_file, $lang, $locale,
    $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA, $max_terms = -1)
{
    $output_message_threshold = self::BLOCK_SIZE * self::BLOCK_SIZE;
    $is_count_type = false;
    // choose the regular expression used to spot n word grams according to
    // which part of the Wiki dump we are extracting from
    switch ($ngram_type) {
        case self::WIKI_DUMP_TITLE:
            $pattern = '/<title>[^\\p{P}]+';
            $pattern_end = '<\\/title>/u';
            $replace_array = array('<title>', '</title>');
            break;
        case self::WIKI_DUMP_REDIRECT:
            $pattern = '/#redirect\\s\\[\\[[^\\p{P}]+';
            $pattern_end = '\\]\\]/u';
            $replace_array = array('#redirect [[', ']]');
            break;
        case self::PAGE_COUNT_WIKIPEDIA:
            $pattern = '/^' . $lang . '\\s[^\\p{P}]+';
            $pattern_end = '/u';
            $is_count_type = true;
            break;
        case self::PAGE_COUNT_WIKTIONARY:
            $pattern = '/^' . $lang . '.d\\s[^\\p{P}]+';
            $pattern_end = '/u';
            $is_count_type = true;
            break;
    }
    $is_all = false;
    // each additional word in a gram is separated by whitespace or underscore
    $repeat_pattern = "[\\s|_][^\\p{P}]+";
    if ($num_gram == "all" || $is_count_type) {
        $pattern .= "({$repeat_pattern})+";
        if ($num_gram == "all") {
            $is_all = true;
        }
        $max_gram_len = -1;
    } else {
        for ($i = 1; $i < $num_gram; $i++) {
            $pattern .= $repeat_pattern;
        }
        $max_gram_len = $num_gram;
    }
    $pattern .= $pattern_end;
    $replace_types = array(self::WIKI_DUMP_TITLE, self::WIKI_DUMP_REDIRECT);
    // $wiki_file may be a single dump file or a folder of dump files
    if (is_dir(PREP_DIR . "/{$wiki_file}")) {
        $folder_files = glob(PREP_DIR . "/{$wiki_file}/*.{gz,bz}",
            GLOB_BRACE);
    } else {
        $folder_files = array(PREP_DIR . "/{$wiki_file}");
    }
    $ngrams_file_path = LOCALE_DIR . "/{$locale}/resources/" .
        "{$num_gram}" . self::TEXT_SUFFIX;
    $ngrams = array();
    foreach ($folder_files as $wiki_file_path) {
        // pick read/close functions based on how the dump is compressed
        if (strpos($wiki_file_path, "bz2") !== false) {
            $fr = bzopen($wiki_file_path, 'r') or
                die("Can't open compressed file");
            $read = "bzread";
            $close = "bzclose";
        } elseif (strpos($wiki_file_path, "gz") !== false) {
            $fr = gzopen($wiki_file_path, 'r') or
                die("Can't open compressed file");
            $read = "gzread";
            $close = "gzclose";
        } else {
            $fr = fopen($wiki_file_path, 'r') or die("Can't open file");
            $read = "fread";
            $close = "fclose";
        }
        $input_buffer = "";
        $time = time();
        echo "Reading wiki file ...{$wiki_file_path}...\n";
        $bytes = 0;
        $bytes_since_last_output = 0;
        while (!feof($fr)) {
            $input_text = $read($fr, self::BLOCK_SIZE);
            $len = strlen($input_text);
            if ($len == 0) {
                break;
            }
            $bytes += $len;
            $bytes_since_last_output += $len;
            if ($bytes_since_last_output > $output_message_threshold) {
                echo "Have now read " . $bytes . " bytes." .
                    " Peak memory so far: " . memory_get_peak_usage() .
                    ".\n Number of word grams so far: " . count($ngrams) .
                    ". Elapsed time so far: " . (time() - $time) . "s\n";
                $bytes_since_last_output = 0;
            }
            // accumulate text and process only the complete lines seen so
            // far; the trailing partial line stays buffered for the next block
            $input_buffer .= mb_strtolower($input_text);
            $lines = explode("\n", $input_buffer);
            $input_buffer = array_pop($lines);
            foreach ($lines as $line) {
                preg_match($pattern, $line, $matches);
                if (count($matches) > 0) {
                    if ($is_count_type) {
                        // page count lines look like: lang title count
                        $line_parts = explode(" ", $matches[0]);
                        if (isset($line_parts[1]) && isset($line_parts[2])) {
                            $ngram = mb_ereg_replace("_", " ",
                                $line_parts[1]);
                            $char_grams = PhraseParser::getCharGramsTerm(
                                array($ngram), $locale);
                            $ngram = implode(" ", $char_grams);
                            $ngram_num_words =
                                mb_substr_count($ngram, " ") + 1;
                            if (($is_all && $ngram_num_words > 1) ||
                                (!$is_all &&
                                $ngram_num_words == $num_gram)) {
                                $ngrams[$ngram] = $line_parts[2];
                            }
                        }
                    } else {
                        // strip the surrounding markup, then convert
                        // underscores to spaces
                        $ngram = str_replace($replace_array, "",
                            $matches[0]);
                        $ngram = mb_ereg_replace("_", " ", $ngram);
                        $ngrams[] = $ngram;
                    }
                    if ($is_all && isset($ngram)) {
                        $ngram_num_words = mb_substr_count($ngram, " ") + 1;
                        $max_gram_len = max($max_gram_len,
                            $ngram_num_words);
                    }
                }
            }
        }
        $close($fr);
    }
    if ($is_count_type) {
        // sort by page count so only the most frequent grams are kept
        arsort($ngrams);
        $ngrams = array_keys($ngrams);
    }
    $ngrams = array_unique($ngrams);
    $num_ngrams_found = count($ngrams);
    if ($max_terms > 0 && $num_ngrams_found > $max_terms) {
        $ngrams = array_slice($ngrams, 0, $max_terms);
    }
    $num_ngrams_found = count($ngrams);
    // in is_all case add prefix*'s for (n >= 3)-grams
    if ($is_all) {
        for ($i = 0; $i < $num_ngrams_found; $i++) {
            $ngram_in_word = mb_substr_count($ngrams[$i], " ") + 1;
            if ($ngram_in_word >= 3) {
                $ngram_parts = explode(" ", $ngrams[$i]);
                $ngram = $ngram_parts[0];
                for ($j = 1; $j < $ngram_in_word - 1; $j++) {
                    $ngram .= " " . $ngram_parts[$j];
                    $ngrams[] = $ngram . "*";
                }
            }
        }
        $ngrams = array_unique($ngrams);
        $num_ngrams_found = count($ngrams);
    }
    sort($ngrams);
    $ngrams_string = implode("\n", $ngrams);
    file_put_contents($ngrams_file_path, $ngrams_string);
    return array($num_ngrams_found, $max_gram_len);
}