Пример #1
0
 /**
  * Reduces free text to a space-separated list of unique base forms (lemmas).
  *
  * Bracketed fragments ("[...]") are removed, the remainder is split on
  * whitespace and punctuation, and only tokens longer than 3 bytes are passed
  * to phpMorphy in a single bulk request. The dictionary bundle and the
  * phpMorphy instance are created on the first call and reused afterwards.
  *
  * Side effect: changes the process-wide LC_CTYPE locale.
  *
  * @param string $text Text to analyse. Presumably CP1251-encoded, judging by
  *                     the locales requested below - TODO confirm with callers.
  * @return string Unique base forms longer than 3 bytes, in first-seen order,
  *                joined by single spaces ('' when nothing was found).
  */
 function Words2BaseForm($text)
 {
     static $dict_bundle, $morphy;
     require_once $GLOBALS['_PATH']['PATH_INC'] . 'phpMorphy/src/common.php';
     if (!$dict_bundle) {
         $dir = $GLOBALS['_PATH']['PATH_INC'] . 'phpMorphy/dicts/';
         $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus');
     }
     if (!$morphy) {
         $opts = array('storage' => PHPMORPHY_STORAGE_MEM, 'with_gramtab' => false, 'predict_by_suffix' => true, 'predict_by_db' => true);
         $morphy = new phpMorphy($dict_bundle, $opts);
     }
     // setlocale() uses the first locale name the system accepts.
     // NOTE(review): since PHP 8.2 strtoupper() converts ASCII letters only and
     // ignores this locale, so Cyrillic tokens are not upper-cased on such
     // runtimes - confirm the target PHP version before relying on it.
     setlocale(LC_CTYPE, array('ru_RU.CP1251', 'rus_RUS.CP1251', 'Russian_Russia.1251'));
     // Drop "[...]" fragments (ungreedy), then tokenize on whitespace/punctuation.
     $words = preg_replace('#\\[.*\\]#isU', '', $text);
     $words = preg_split('#\\s|[,.:;«»!?"\'()]#', $words, -1, PREG_SPLIT_NO_EMPTY);
     $bulk_words = array();
     foreach ($words as $v) {
         if (strlen($v) > 3) {
             $bulk_words[] = strtoupper($v);
         }
     }
     $base_form = $morphy->getBaseForm($bulk_words);
     // Base forms are stored as array keys so that duplicates collapse.
     $fullList = array();
     if (is_array($base_form)) {
         foreach ($base_form as $forms) {
             // Only array entries carry base forms; anything else is skipped.
             if (!is_array($forms)) {
                 continue;
             }
             foreach ($forms as $form) {
                 if (strlen($form) > 3) {
                     $fullList[$form] = 1;
                 }
             }
         }
     }
     return join(' ', array_keys($fullList));
 }
Пример #2
0
 /**
  * Returns the base forms (lemmas) of the words contained in a text.
  *
  * The text is cleaned first: everything from the first "[" to the last "]"
  * is removed (the pattern is greedy), then every byte that is not a word
  * character, whitespace or in the 0x7F-0xFF range is dropped. Tokens longer
  * than 2 characters are upper-cased and sent to phpMorphy in one bulk call.
  * Words for which phpMorphy returns no array of forms are kept as they are.
  *
  * NOTE(review): sibling implementations strip "[...]" ungreedily (modifier
  * "U"); the lower-case "u" used here may be a typo - confirm the intent.
  *
  * @param string $content UTF-8 text to analyse.
  * @return string|null Unique base forms joined by single spaces ('' when
  *                     none were found), or null when $content is empty.
  */
 public function baseForm($content)
 {
     if (empty($content)) {
         return null;
     }
     $words = preg_replace(['/\\[.*\\]/isu', '/[^\\w\\x7F-\\xFF\\s]/i'], "", trim($content));
     $words = preg_replace('/ +/', ' ', $words);
     $words = preg_split('/\\s|[,.:;!?"\'()]/', $words, -1, PREG_SPLIT_NO_EMPTY);
     $bulkWords = [];
     foreach ($words as $word) {
         if (mb_strlen($word, 'utf-8') > 2) {
             $bulkWords[] = mb_strtoupper($word, 'utf-8');
         }
     }
     $baseForm = static::$morphy->getBaseForm($bulkWords);
     // Forms are collected as array keys so that duplicates collapse.
     $dataWords = [];
     if (is_array($baseForm)) {
         foreach ($baseForm as $key => $forms) {
             if (is_array($forms)) {
                 foreach ($forms as $form) {
                     if (mb_strlen($form, 'utf-8') > 2) {
                         $dataWords[$form] = 1;
                     }
                 }
             } elseif (mb_strlen((string) $key, 'utf-8') > 2) {
                 // Word missing from the dictionary: keep the word itself.
                 // The length check must look at the current key, not at a
                 // variable left over from the tokenizing loop above.
                 $dataWords[$key] = 1;
             }
         }
     }
     // Always a string, as documented; previously the raw token array leaked
     // out when phpMorphy returned nothing.
     return implode(' ', array_keys($dataWords));
 }
Пример #3
0
//  | Russian        | cp1251      |
//  |------------------------------|
//  | English        | cp1250      |
//  |------------------------------|
//  | German         | cp1252      |
//  *------------------------------*
// $codepage = $morphy->getCodepage();
// setlocale(LC_CTYPE, array('ru_RU.CP1251', 'Russian_Russia.1251'));
// Hint: in this example the words $word_one and $word_two are Russian (cp1251 encoding).
$word_one = 'КОТ';
$word_two = 'СОБАКА';
echo "Testing single mode...\n";
try {
    // Word-by-word processing.
    // Each function returns an array with the result, or FALSE when no form(s)
    // for the given word could be found (or predicted).
    $base_form = $morphy->getBaseForm($word_one);
    $all_forms = $morphy->getAllForms($word_one);
    $pseudo_root = $morphy->getPseudoRoot($word_one);
    // Abort the script as soon as any of the three lookups failed.
    if (false === $base_form || false === $all_forms || false === $pseudo_root) {
        die("Can`t find or predict {$word_one} word");
    }
    echo 'base form = ' . implode(', ', $base_form) . "\n";
    echo 'all forms = ' . implode(', ', $all_forms) . "\n";
    echo "Testing bulk mode...\n";
    // Bulk mode speeds up processing by up to 50-100% (mainly for the getBaseForm method).
    // In bulk mode every function always returns an array.
    $bulk_words = array($word_one, $word_two);
    $base_form = $morphy->getBaseForm($bulk_words);
    $all_forms = $morphy->getAllForms($bulk_words);
    $pseudo_root = $morphy->getPseudoRoot($bulk_words);
    // Bulk result format:
Пример #4
0
/**
 * Reduces free text to a space-separated list of unique base forms (lemmas).
 *
 * Bracketed fragments ("[...]") are removed, the remainder is split on
 * whitespace and punctuation, and only tokens longer than 2 bytes are passed
 * to phpMorphy in a single bulk request.
 *
 * @param string $text Text to analyse.
 * @return string Unique base forms longer than 2 bytes, in first-seen order,
 *                joined by single spaces ('' when nothing was found).
 */
function Words2BaseForm($text)
{
    // Build the phpMorphy instance once and reuse it on later calls instead of
    // re-creating the dictionary bundle and the analyser for every text.
    static $morphy = null;
    if ($morphy === null) {
        require_once dirname(__FILE__) . '/../sys/phpmorphy/src/common.php';
        // set some options
        $opts = array('storage' => PHPMORPHY_STORAGE_MEM, 'with_gramtab' => false, 'predict_by_suffix' => true, 'predict_by_db' => true);
        $dir = dirname(__FILE__) . '/../sys/phpmorphy/dicts';
        // Descriptor for the Russian dictionary located in $dir.
        $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus');
        $morphy = new phpMorphy($dict_bundle, $opts);
    }
    // Drop "[...]" fragments (ungreedy), then tokenize on whitespace/punctuation.
    $words = preg_replace('#\\[.*\\]#isU', '', $text);
    $words = preg_split('#\\s|[,.:;!?"\'()]#', $words, -1, PREG_SPLIT_NO_EMPTY);
    $bulk_words = array();
    foreach ($words as $v) {
        if (strlen($v) > 2) {
            $bulk_words[] = strtoupper($v);
        }
    }
    $base_form = $morphy->getBaseForm($bulk_words);
    // Base forms are stored as array keys so that duplicates collapse.
    $fullList = array();
    if (is_array($base_form)) {
        foreach ($base_form as $forms) {
            // Only array entries carry base forms; anything else is skipped.
            if (!is_array($forms)) {
                continue;
            }
            foreach ($forms as $form) {
                if (strlen($form) > 2) {
                    $fullList[$form] = 1;
                }
            }
        }
    }
    return join(' ', array_keys($fullList));
}
Пример #5
0
 /**
  * Collects the lemmas (base forms) of the given words using phpMorphy.
  *
  * Each non-empty word is looked up individually. A word for which phpMorphy
  * returns no base form is kept unchanged. The collected list is passed
  * through $this->sanitizeWordsArray() and then de-duplicated.
  *
  * @param array $words Words to lemmatise; empty values are skipped.
  * @return array|false Unique lemmas (array_unique() keeps the original keys,
  *                     so they may be non-sequential), or false when the
  *                     phpMorphy instance could not be created.
  */
 protected function getLemmas(array $words)
 {
     require_once MODX_CORE_PATH . 'components/modsearch/external/phpmorphy/src/common.php';
     // set some options
     $opts = array('storage' => PHPMORPHY_STORAGE_FILE, 'predict_by_suffix' => true, 'predict_by_db' => true, 'graminfo_as_text' => true);
     // Path to directory where dictionaries located
     $dir = MODX_CORE_PATH . 'components/modsearch/external/phpmorphy/dicts';
     $lang = 'ru_RU';
     // "new" never yields a falsy value, so a failed construction can only be
     // detected by catching the exception the constructor throws.
     try {
         $morphy = new phpMorphy($dir, $lang, $opts);
     } catch (phpMorphy_Exception $e) {
         $this->modx->log(xPDO::LOG_LEVEL_ERROR, "[" . __CLASS__ . "] Не был получен объект phpMorphy" . ': ' . $e->getMessage());
         return false;
     }
     $lemmas = array();
     foreach ($words as $word) {
         if (!$word) {
             continue;
         }
         // By default phpMorphy looks the word up in the dictionary and, when
         // nothing is found, tries to predict it.
         $base = $morphy->getBaseForm($word);
         if (false === $base) {
             // The word was neither found nor predicted: keep it as is.
             $lemmas[] = $word;
             continue;
         }
         foreach ($base as $lemma) {
             $lemmas[] = $lemma;
         }
     }
     $lemmas = $this->sanitizeWordsArray($lemmas);
     return array_unique($lemmas);
 }
Пример #6
0
// All words in the dictionary are in UPPER CASE, so don't forget to set the proper locale via a setlocale(...) call.
// $morphy->getEncoding() returns the dictionary encoding.
$words = array('КРАКОЗЯБЛИКИ', 'СТАЛИ', 'ВИНА', 'И', 'ДУХИ', 'abc');
/*
Optional: convert the sample words from windows-1251 to the dictionary encoding.
if(function_exists('iconv')) {
    foreach($words as &$word) {
        $word = iconv('windows-1251', $morphy->getEncoding(), $word);
    }
    unset($word);
}
*/
try {
    foreach ($words as $word) {
        // By default phpMorphy looks $word up in the dictionary and, when nothing is found, tries to predict it.
        // This behaviour can be changed via the second argument of the getXXX or findWord methods.
        $base = $morphy->getBaseForm($word);
        $all = $morphy->getAllForms($word);
        $part_of_speech = $morphy->getPartOfSpeech($word);
        // $base = $morphy->getBaseForm($word, phpMorphy::NORMAL); // normal behaviour
        // $base = $morphy->getBaseForm($word, phpMorphy::IGNORE_PREDICT); // don't use prediction
        // $base = $morphy->getBaseForm($word, phpMorphy::ONLY_PREDICT); // always predict the word
        $is_predicted = $morphy->isLastPredicted();
        // or $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_NONE
        $is_predicted_by_db = $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_DB;
        $is_predicted_by_suffix = $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_SUFFIX;
        // findWord() is used for deep analysis.
        $collection = $morphy->findWord($word);
        // or var_dump($morphy->getAllFormsWithGramInfo($word)); for debugging
        // Report the word as not found and move on to the next one.
        if (false === $collection) {
            echo $word, " NOT FOUND\n";
            continue;
Пример #7
0
/**
 * Benchmarks phpMorphy::getBaseForm() over a word list and prints the result.
 *
 * Prints a header naming the encoding, storage type and mode, runs the lookup
 * either as one bulk call or word by word, then prints the elapsed time, the
 * throughput and the number of predicted words. Predictions are only counted
 * in single (non-bulk) mode; in bulk mode the counter stays at 0.
 *
 * @param array  $words         Words to look up; passed to convert_words()
 *                              first (presumably converted in place to upper
 *                              case in $encoding - verify against that helper).
 * @param string $encoding      Encoding name, used for the header and conversion.
 * @param string $dictDir       Directory containing the dictionaries.
 * @param string $lang          Dictionary language passed to phpMorphy_FilesBundle.
 * @param mixed  $storage       Value for the phpMorphy 'storage' option.
 * @param bool   $useBulk       True for one bulk call, false for per-word calls.
 * @param bool   $usePrediction Value for the 'predict_by_suffix' option.
 * @return void
 */
function bench_morphy($words, $encoding, $dictDir, $lang, $storage, $useBulk, $usePrediction = true)
{
    $opts = array('storage' => $storage, 'predict_by_suffix' => $usePrediction, 'predict_by_db' => false);
    $bundle = new phpMorphy_FilesBundle($dictDir, $lang);
    $morphy = new phpMorphy($bundle, $opts);
    echo "Bench phpMorphy[{$encoding}][{$storage}][" . ($useBulk ? 'BULK' : 'SINGLE') . "] : ";
    convert_words($words, $encoding, MB_CASE_UPPER);
    $predicted = 0;
    $started = microtime(true);
    if ($useBulk) {
        $morphy->getBaseForm($words);
    } else {
        foreach ($words as $word) {
            // Only the lookup cost matters here; the result itself is discarded.
            $morphy->getBaseForm($word);
            if ($morphy->isLastPredicted()) {
                $predicted++;
            }
        }
    }
    $elapsed = microtime(true) - $started;
    // A run that finishes within timer resolution would otherwise divide by zero.
    $rate = $elapsed > 0 ? count($words) / $elapsed : 0.0;
    printf("time = %0.2f sec, words per second = %0.2f, predicted = %d\n", $elapsed, $rate, $predicted);
}
Пример #8
0
    $words = preg_split('#\\s|[,.:;!?"\'()]#', $text, -1, PREG_SPLIT_NO_EMPTY);
    $bulk_words = array();
    foreach ($words as $v) {
        if (strlen($v) > 3) {
            $bulk_words[] = strtoupper($v);
        }
    }
    return $morphy->getAllForms($bulk_words);
}
/*

 *

 * @param string $text

 * @return string

 */
function Words2BaseForm($text)
{
    require_once $GLOBALS['PATH_sys'] . 'phpmorphy/src/common.php';
    // set some options
    $opts = array('storage' => PHPMORPHY_STORAGE_MEM, 'with_gramtab' => false, 'predict_by_suffix' => true, 'predict_by_db' => true);
    $dir = $GLOBALS['PATH_sys'] . 'phpmorphy/dicts';
    // Create descriptor for dictionary located in $dir directory with russian language
    $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus');
    // Create phpMorphy instance
    $morphy = new phpMorphy($dict_bundle, $opts);
    // All words in dictionary in UPPER CASE, so don`t forget set proper locale
    // Supported dicts and locales:
    //  *------------------------------*
    //  | Dict. language | Locale name |
    //  |------------------------------|
    //  | Russian        | cp1251      |
    //  |------------------------------|
    //  | English        | cp1250      |
    //  |------------------------------|
    //  | German         | cp1252      |
    //  *------------------------------*
    // $codepage = $morphy->getCodepage();
    //setlocale(LC_CTYPE, array('ru_RU.CP1251', 'Russian_Russia.1251'));
    $words = preg_replace('#\\[.*\\]#isU', '', $text);
    $words = preg_split('#\\s|[,.:;!?"\'()]#', $words, -1, PREG_SPLIT_NO_EMPTY);
    $bulk_words = array();
    foreach ($words as $v) {
        if (strlen($v) > 3) {
            $bulk_words[] = mb_strtoupper($v, "UTF-8");