Exemplo n.º 1
0
 /**
  * Counts how often each lemma occurs in the texts of meanings and adds the
  * counts to the `frequency` field of the lemma table (`pw_lemma_LANG_CODE`).
  *
  * Every meaning whose language equals PWLemma::getLangCode() is read, its text
  * is split into words, each word is reduced to a lemma with phpMorphy, and the
  * lemma's `frequency` is increased by the number of occurrences of that word.
  * If the lemma does not exist in the table yet, it is inserted with origin=2
  * and the `meaning_id` of the meaning where it has been found.
  *
  * The script is terminated with die() when the phpMorphy instance cannot be
  * created or when phpMorphy throws while the texts are processed.
  */
 public static function count_frequency_lemma_in_meaning()
 {
     $link_db = Piwidict::getDatabaseConnection();

     // phpMorphy options
     $opts = array(
         'storage' => PHPMORPHY_STORAGE_FILE,
         'predict_by_suffix' => true,
         'predict_by_db' => true,
         'graminfo_as_text' => true,
     );
     // Path to the directory where the dictionaries are located
     $dir = SITE_ROOT . 'phpmorphy/dicts';
     $lang = 'ru_RU';

     // Create phpMorphy instance
     try {
         $morphy = new phpMorphy($dir, $lang, $opts);
     } catch (phpMorphy_Exception $e) {
         die('Error occured while creating phpMorphy instance: ' . PHP_EOL . $e);
     }

     // Splits on whitespace (together with adjacent punctuation) and strips
     // punctuation at the very beginning / end of the text.
     $split_pattern = '/((^\\p{P}+)|(\\p{P}*\\s+\\p{P}*)|(\\p{P}+$))/u';

     // Escapes for a single-quoted MySQL string literal (backslash-escape mode).
     // The backslash itself must be escaped too, otherwise a lemma ending in "\"
     // would swallow the closing quote of the literal.
     $literal_escapes = array("\\" => "\\\\", "'" => "\\'");
     // Escapes for a LIKE pattern, so that "%" and "_" inside a lemma are
     // matched literally instead of acting as wildcards.
     $like_escapes = array("\\" => "\\\\", "%" => "\\%", "_" => "\\_");

     try {
         $lang_id = (int) TLang::getIDByLangCode(PWLemma::getLangCode());
         $l_table = PWLemma::getTableName();
         $query = "SELECT meaning.id as meaning_id, wiki_text.text as text FROM wiki_text, meaning, lang_pos WHERE  " . "wiki_text.id=meaning.wiki_text_id and meaning.lang_pos_id=lang_pos.id and lang_pos.lang_id={$lang_id}";
         $res_meaning = $link_db->query_e($query, "Query failed in file <b>" . __FILE__ . "</b>, string <b>" . __LINE__ . "</b>");

         while ($row_meaning = $res_meaning->fetch_object()) {
             $words = preg_split($split_pattern, (string) $row_meaning->text, -1, PREG_SPLIT_NO_EMPTY);
             if ($words === false) {
                 // preg_split() fails e.g. on malformed UTF-8; skip this text
                 // instead of passing `false` to array_count_values().
                 continue;
             }

             foreach (array_count_values($words) as $word => $count) {
                 // Purely numeric tokens come back as integer array keys.
                 $word = (string) $word;

                 $lemma = PWLemma::getPhpMorphyLemma($word, $morphy);
                 if (!$lemma) {
                     continue;
                 }
                 $lemma = (string) PWString::restoreCase($lemma, $word);

                 // Value for the INSERT: literal escaping only.
                 $lemma_sql = strtr($lemma, $literal_escapes);
                 // Value for LIKE: pattern escaping first, then literal escaping.
                 $lemma_like = strtr(strtr($lemma, $like_escapes), $literal_escapes);
                 $cond = "WHERE lemma like '{$lemma_like}'";

                 $res_lemma = $link_db->query_e("SELECT id FROM {$l_table} {$cond}", "Query failed in file <b>" . __FILE__ . "</b>, string <b>" . __LINE__ . "</b>");

                 if ($link_db->query_count($res_lemma) == 0) {
                     $query = "INSERT INTO `{$l_table}` (`lemma`,`origin`,`frequency`,`meaning_id`) VALUES ('{$lemma_sql}',2," . (int) $count . "," . (int) $row_meaning->meaning_id . ")";
                 } else {
                     // Increment inside the database so that every matched row
                     // keeps its own total; a NULL frequency is treated as 0.
                     $query = "UPDATE `{$l_table}` SET `frequency`=COALESCE(`frequency`,0)+" . (int) $count . " {$cond}";
                 }
                 $link_db->query_e($query, "Query failed in file <b>" . __FILE__ . "</b>, string <b>" . __LINE__ . "</b>");
             }
         }
     } catch (phpMorphy_Exception $e) {
         die('Error occured while text processing: ' . $e->getMessage());
     }
 }
Exemplo n.º 2
0
 /**
  * Collects the words that occur in the meaning texts of the Wiktionary
  * page(s) found for $word.
  *
  * The lemma records returned by PWLemma::getByLemma($word) are scanned;
  * records with origin > 0 are skipped (the page does not exist in
  * LANG_CODE.wiktionary.org). For the remaining records the lemma ID is used
  * as the page ID, the meanings of that page in the language
  * PWLemma::getLangCode() are loaded and their texts are split into words.
  *
  * Note: despite the method name, the returned words are the raw tokens of the
  * texts; no lemmatization is performed here.
  *
  * @param mixed $word value passed to PWLemma::getByLemma()
  * @return array list of words in order of appearance, duplicates included;
  *               empty when nothing is found
  */
 public static function meaningsToLemmas($word)
 {
     $words = array();

     $word_obj_arr = PWLemma::getByLemma($word);
     if (!$word_obj_arr) {
         // Nothing found (empty array / null): no words to report.
         return $words;
     }

     // Splits on whitespace (together with adjacent punctuation) and strips
     // punctuation at the very beginning / end of the text.
     $split_pattern = '/((^\\p{P}+)|(\\p{P}*\\s+\\p{P}*)|(\\p{P}+$))/u';

     foreach ($word_obj_arr as $word_obj) {
         if ($word_obj->getOrigin() > 0) {
             // The page $word does not exist in LANG_CODE.wiktionary.org
             continue;
         }

         // if origin=0 then word is added from wiktionary, and lemma.id = page.id
         $page_id = $word_obj->getID();

         $meaning_arr = TMeaning::getByPageAndLang($page_id, PWLemma::getLangCode());
         if (!$meaning_arr) {
             continue;
         }

         foreach ($meaning_arr as $meaning_obj) {
             $meaning_wiki_text = $meaning_obj->getWikiText();
             if (!$meaning_wiki_text) {
                 // NOTE(review): presumably a meaning can lack a wiki text
                 // object; guard so getText() is never called on null.
                 continue;
             }

             $parts = preg_split($split_pattern, (string) $meaning_wiki_text->getText(), -1, PREG_SPLIT_NO_EMPTY);
             if ($parts === false) {
                 // preg_split() fails e.g. on malformed UTF-8; skip this text
                 // rather than merging `false` into the result.
                 continue;
             }

             // Append in place instead of re-copying the whole list each time.
             foreach ($parts as $part) {
                 $words[] = $part;
             }
         }
     }

     return $words;
 }