/**
 * Generates an n word grams text file from an input Wikipedia XML file.
 * The input file can be bz2 compressed or uncompressed.
 * The input XML file is parsed line by line and searched for the pattern
 * of an n word gram. Each n word gram found is added to an array. After
 * the complete file has been parsed, duplicate n word grams are removed
 * and the remaining grams are sorted. The resulting array is written to
 * a text file. The function returns the number of n word grams stored in
 * the text file together with the length of the longest gram seen.
 *
 * @param string $wiki_file compressed or uncompressed Wikipedia
 *     XML file path to be used to extract n word grams. This can also
 *     be a folder containing such files
 * @param string $lang language to be used to create n grams
 * @param string $locale locale to be used to store results
 * @param int $num_gram number of words in the grams we are looking for
 * @param int $ngram_type where in the Wiki dump to extract grams from
 * @param int $max_terms maximum number of n-grams to compute and put in
 *     the file
 * @return array pair ($num_ngrams_found, $max_gram_len), the count of
 *     n word grams written to the text file and the number of words in
 *     the longest gram seen
 */
static function makeNWordGramsTextFile($wiki_file, $lang, $locale,
    $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA, $max_terms = -1)
{
    $output_message_threshold = self::BLOCK_SIZE * self::BLOCK_SIZE;
    $is_count_type = false;
    // choose the regular expression used to spot n word grams according to
    // which part of the Wiki dump we are extracting from
    switch ($ngram_type) {
        case self::WIKI_DUMP_TITLE:
            $pattern = '/<title>[^\\p{P}]+';
            $pattern_end = '<\\/title>/u';
            $replace_array = array('<title>', '</title>');
            break;
        case self::WIKI_DUMP_REDIRECT:
            $pattern = '/#redirect\\s\\[\\[[^\\p{P}]+';
            $pattern_end = '\\]\\]/u';
            $replace_array = array('#redirect [[', ']]');
            break;
        case self::PAGE_COUNT_WIKIPEDIA:
            $pattern = '/^' . $lang . '\\s[^\\p{P}]+';
            $pattern_end = '/u';
            $is_count_type = true;
            break;
        case self::PAGE_COUNT_WIKTIONARY:
            $pattern = '/^' . $lang . '.d\\s[^\\p{P}]+';
            $pattern_end = '/u';
            $is_count_type = true;
            break;
    }
    $is_all = false;
    // each additional word in a gram is separated by whitespace or underscore
    $repeat_pattern = "[\\s|_][^\\p{P}]+";
    if ($num_gram == "all" || $is_count_type) {
        $pattern .= "({$repeat_pattern})+";
        if ($num_gram == "all") {
            $is_all = true;
        }
        $max_gram_len = -1;
    } else {
        for ($i = 1; $i < $num_gram; $i++) {
            $pattern .= $repeat_pattern;
        }
        $max_gram_len = $num_gram;
    }
    $pattern .= $pattern_end;
    $replace_types = array(self::WIKI_DUMP_TITLE, self::WIKI_DUMP_REDIRECT);
    // $wiki_file may be a single dump file or a folder of dump files
    if (is_dir(PREP_DIR . "/{$wiki_file}")) {
        $folder_files = glob(PREP_DIR . "/{$wiki_file}/*.{gz,bz}",
            GLOB_BRACE);
    } else {
        $folder_files = array(PREP_DIR . "/{$wiki_file}");
    }
    $ngrams_file_path = LOCALE_DIR . "/{$locale}/resources/" .
        "{$num_gram}" . self::TEXT_SUFFIX;
    $ngrams = array();
    foreach ($folder_files as $wiki_file_path) {
        // pick read/close functions based on how the dump is compressed
        if (strpos($wiki_file_path, "bz2") !== false) {
            $fr = bzopen($wiki_file_path, 'r') or
                die("Can't open compressed file");
            $read = "bzread";
            $close = "bzclose";
        } elseif (strpos($wiki_file_path, "gz") !== false) {
            $fr = gzopen($wiki_file_path, 'r') or
                die("Can't open compressed file");
            $read = "gzread";
            $close = "gzclose";
        } else {
            $fr = fopen($wiki_file_path, 'r') or die("Can't open file");
            $read = "fread";
            $close = "fclose";
        }
        $input_buffer = "";
        $time = time();
        echo "Reading wiki file ...{$wiki_file_path}...\n";
        $bytes = 0;
        $bytes_since_last_output = 0;
        while (!feof($fr)) {
            $input_text = $read($fr, self::BLOCK_SIZE);
            $len = strlen($input_text);
            if ($len == 0) {
                break;
            }
            $bytes += $len;
            $bytes_since_last_output += $len;
            if ($bytes_since_last_output > $output_message_threshold) {
                echo "Have now read " . $bytes . " bytes." .
                    " Peak memory so far: " . memory_get_peak_usage() .
                    ".\n Number of word grams so far: " . count($ngrams) .
                    ". Elapsed time so far: " . (time() - $time) . "s\n";
                $bytes_since_last_output = 0;
            }
            // accumulate text and process only the complete lines seen so
            // far; the trailing partial line stays buffered for the next block
            $input_buffer .= mb_strtolower($input_text);
            $lines = explode("\n", $input_buffer);
            $input_buffer = array_pop($lines);
            foreach ($lines as $line) {
                preg_match($pattern, $line, $matches);
                if (count($matches) > 0) {
                    if ($is_count_type) {
                        // page count lines look like: lang title count
                        $line_parts = explode(" ", $matches[0]);
                        if (isset($line_parts[1]) && isset($line_parts[2])) {
                            $ngram = mb_ereg_replace("_", " ",
                                $line_parts[1]);
                            $char_grams = PhraseParser::getCharGramsTerm(
                                array($ngram), $locale);
                            $ngram = implode(" ", $char_grams);
                            $ngram_num_words =
                                mb_substr_count($ngram, " ") + 1;
                            if (($is_all && $ngram_num_words > 1) ||
                                (!$is_all &&
                                $ngram_num_words == $num_gram)) {
                                $ngrams[$ngram] = $line_parts[2];
                            }
                        }
                    } else {
                        // strip the surrounding markup, then convert
                        // underscores to spaces
                        $ngram = str_replace($replace_array, "",
                            $matches[0]);
                        $ngram = mb_ereg_replace("_", " ", $ngram);
                        $ngrams[] = $ngram;
                    }
                    if ($is_all && isset($ngram)) {
                        $ngram_num_words = mb_substr_count($ngram, " ") + 1;
                        $max_gram_len = max($max_gram_len,
                            $ngram_num_words);
                    }
                }
            }
        }
        $close($fr);
    }
    if ($is_count_type) {
        // sort by page count so only the most frequent grams are kept
        arsort($ngrams);
        $ngrams = array_keys($ngrams);
    }
    $ngrams = array_unique($ngrams);
    $num_ngrams_found = count($ngrams);
    if ($max_terms > 0 && $num_ngrams_found > $max_terms) {
        $ngrams = array_slice($ngrams, 0, $max_terms);
    }
    $num_ngrams_found = count($ngrams);
    // in is_all case add prefix*'s for (n >= 3)-grams
    if ($is_all) {
        for ($i = 0; $i < $num_ngrams_found; $i++) {
            $ngram_in_word = mb_substr_count($ngrams[$i], " ") + 1;
            if ($ngram_in_word >= 3) {
                $ngram_parts = explode(" ", $ngrams[$i]);
                $ngram = $ngram_parts[0];
                for ($j = 1; $j < $ngram_in_word - 1; $j++) {
                    $ngram .= " " . $ngram_parts[$j];
                    $ngrams[] = $ngram . "*";
                }
            }
        }
        $ngrams = array_unique($ngrams);
        $num_ngrams_found = count($ngrams);
    }
    sort($ngrams);
    $ngrams_string = implode("\n", $ngrams);
    file_put_contents($ngrams_file_path, $ngrams_string);
    return array($num_ngrams_found, $max_gram_len);
}