/**
 * Reduces free text to a space-separated list of unique word base forms (lemmas).
 *
 * Everything between square brackets is removed, the rest is split on
 * whitespace and punctuation, and every token longer than 3 bytes is
 * upper-cased and handed to phpMorphy in bulk mode. Lemmas longer than
 * 3 bytes are collected without duplicates, in first-seen order.
 *
 * The dictionary bundle and the phpMorphy instance are created on first use
 * and kept in static variables, so later calls reuse them.
 *
 * NOTE(review): strlen()/strtoupper() work on bytes and the locale candidates
 * are all CP1251, so this presumably expects single-byte CP1251 input —
 * confirm against callers. The split pattern has no /u modifier either, so
 * the guillemets in it only match as intended if this file is stored in the
 * same single-byte encoding as the input — verify.
 *
 * @param string $text Source text.
 * @return string Lemmas joined by single spaces; '' when nothing qualifies.
 */
function Words2BaseForm($text)
{
    static $dict_bundle, $morphy;

    require_once $GLOBALS['_PATH']['PATH_INC'] . 'phpMorphy/src/common.php';

    if (!$dict_bundle) {
        $dir = $GLOBALS['_PATH']['PATH_INC'] . 'phpMorphy/dicts/';
        $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus');
    }

    if (!$morphy) {
        $opts = array(
            'storage' => PHPMORPHY_STORAGE_MEM,
            'with_gramtab' => false,
            'predict_by_suffix' => true,
            'predict_by_db' => true,
        );
        $morphy = new phpMorphy($dict_bundle, $opts);
    }

    // setlocale() takes the first candidate the platform accepts.
    setlocale(LC_CTYPE, array('ru_RU.CP1251', 'rus_RUS.CP1251', 'Russian_Russia.1251'));

    // Strip [bracketed] fragments (ungreedy), then tokenize.
    $stripped = preg_replace('#\\[.*\\]#isU', '', $text);
    $tokens = preg_split('#\\s|[,.:;«»!?"\'()]#', $stripped, -1, PREG_SPLIT_NO_EMPTY);

    $bulk_words = array();
    foreach ($tokens as $token) {
        if (strlen($token) > 3) {
            $bulk_words[] = strtoupper($token);
        }
    }

    $base_form = $morphy->getBaseForm($bulk_words);

    // Lemmas are stored as array keys so duplicates collapse automatically.
    $fullList = array();
    if (is_array($base_form)) {
        foreach ($base_form as $forms) {
            if (!is_array($forms)) {
                // No base form was returned for this word; it is dropped.
                continue;
            }
            foreach ($forms as $lemma) {
                if (strlen($lemma) > 3) {
                    $fullList[$lemma] = 1;
                }
            }
        }
    }

    return join(' ', array_keys($fullList));
}
/**
 * Returns the base forms (lemmas) of the words contained in a text.
 *
 * The text is stripped of [bracketed] fragments and of ASCII punctuation,
 * split into words, and every word longer than 2 characters is upper-cased
 * and looked up through static::$morphy in bulk mode. Words for which the
 * lookup yields no array of forms are kept as they are.
 *
 * @param string $content Source text.
 * @return string|null Unique lemmas joined by single spaces ('' when nothing
 *                     qualifies), or null when $content is empty.
 */
public function baseForm($content)
{
    if (empty($content)) {
        return null;
    }

    // The bracket pattern is non-greedy so that text between two bracketed
    // fragments ("[a] keep [b]") survives. The second pattern works on bytes
    // and keeps everything >= 0x7F, which leaves multi-byte characters intact.
    $words = preg_replace(['/\\[.*?\\]/isu', '/[^\\w\\x7F-\\xFF\\s]/i'], "", trim($content));
    $words = preg_replace('/ +/', ' ', $words);
    $words = preg_split('/\\s|[,.:;!?"\'()]/', $words, -1, PREG_SPLIT_NO_EMPTY);

    $bulkWords = [];
    foreach ($words as $word) {
        if (mb_strlen($word, 'utf-8') > 2) {
            $bulkWords[] = mb_strtoupper($word, 'utf-8');
        }
    }

    $baseForm = static::$morphy->getBaseForm($bulkWords);

    // Lemmas are stored as keys so duplicates collapse automatically.
    $dataWords = [];
    if (is_array($baseForm)) {
        foreach ($baseForm as $key => $forms) {
            if (is_array($forms)) {
                foreach ($forms as $form) {
                    if (mb_strlen($form, 'utf-8') > 2) {
                        $dataWords[$form] = 1;
                    }
                }
                continue;
            }

            // Words that are absent from the dictionary: keep the looked-up
            // word itself. The length check must use the current key, not a
            // variable left over from the tokenizing loop above.
            if (mb_strlen((string) $key, 'utf-8') > 2) {
                $dataWords[$key] = 1;
            }
        }
    }

    // Always a string, also when no word qualified.
    return implode(' ', array_keys($dataWords));
}
// | Russian | cp1251 | // |------------------------------| // | English | cp1250 | // |------------------------------| // | German | cp1252 | // *------------------------------* // $codepage = $morphy->getCodepage(); // setlocale(LC_CTYPE, array('ru_RU.CP1251', 'Russian_Russia.1251')); // Hint: in this example words $word_one, $word_two are in russian language(cp1251 encoding) $word_one = 'КОТ'; $word_two = 'СОБАКА'; echo "Testing single mode...\n"; try { // word by word processing // each function return array with result or FALSE when no form(s) for given word found(or predicted) $base_form = $morphy->getBaseForm($word_one); $all_forms = $morphy->getAllForms($word_one); $pseudo_root = $morphy->getPseudoRoot($word_one); if (false === $base_form || false === $all_forms || false === $pseudo_root) { die("Can`t find or predict {$word_one} word"); } echo 'base form = ' . implode(', ', $base_form) . "\n"; echo 'all forms = ' . implode(', ', $all_forms) . "\n"; echo "Testing bulk mode...\n"; // bulk mode speed-ups processing up to 50-100%(mainly for getBaseForm method) // in bulk mode all function always return array $bulk_words = array($word_one, $word_two); $base_form = $morphy->getBaseForm($bulk_words); $all_forms = $morphy->getAllForms($bulk_words); $pseudo_root = $morphy->getPseudoRoot($bulk_words); // Bulk result format:
/**
 * Reduces free text to a space-separated list of unique word base forms (lemmas).
 *
 * Everything between square brackets is removed, the rest is split on
 * whitespace and punctuation, and every token longer than 2 bytes is
 * upper-cased and handed to phpMorphy in bulk mode. Lemmas longer than
 * 2 bytes are collected without duplicates, in first-seen order.
 *
 * The phpMorphy instance uses in-memory storage, so it is built once and
 * cached in a static variable instead of being rebuilt on every call.
 *
 * NOTE(review): strlen()/strtoupper() are byte-oriented; presumably the input
 * is expected in a single-byte encoding matching the dictionary — confirm.
 *
 * @param string $text Source text.
 * @return string Lemmas joined by single spaces; '' when nothing qualifies.
 */
function Words2BaseForm($text)
{
    static $morphy = null;

    if ($morphy === null) {
        require_once dirname(__FILE__) . '/../sys/phpmorphy/src/common.php';

        $opts = array(
            'storage' => PHPMORPHY_STORAGE_MEM,
            'with_gramtab' => false,
            'predict_by_suffix' => true,
            'predict_by_db' => true,
        );
        $dir = dirname(__FILE__) . '/../sys/phpmorphy/dicts';

        // Descriptor for the Russian dictionary located in $dir.
        $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus');
        $morphy = new phpMorphy($dict_bundle, $opts);
    }

    // Strip [bracketed] fragments (ungreedy), then tokenize.
    $stripped = preg_replace('#\\[.*\\]#isU', '', $text);
    $tokens = preg_split('#\\s|[,.:;!?"\'()]#', $stripped, -1, PREG_SPLIT_NO_EMPTY);

    $bulk_words = array();
    foreach ($tokens as $token) {
        if (strlen($token) > 2) {
            $bulk_words[] = strtoupper($token);
        }
    }

    $base_form = $morphy->getBaseForm($bulk_words);

    // Lemmas are stored as array keys so duplicates collapse automatically.
    $fullList = array();
    if (is_array($base_form)) {
        foreach ($base_form as $forms) {
            if (!is_array($forms)) {
                // No base form was returned for this word; it is dropped.
                continue;
            }
            foreach ($forms as $lemma) {
                if (strlen($lemma) > 2) {
                    $fullList[$lemma] = 1;
                }
            }
        }
    }

    return join(' ', array_keys($fullList));
}
/**
 * Returns the unique base forms (lemmas) for a list of words.
 *
 * Each non-empty word is looked up with phpMorphy::getBaseForm(). When no
 * base form is returned the word itself is kept unchanged. The collected
 * list is passed through $this->sanitizeWordsArray() and de-duplicated.
 *
 * @param array $words Words to lemmatise; falsy entries are skipped.
 * @return array|false Unique lemmas (keys preserved by array_unique), or
 *                     false when the phpMorphy instance could not be created.
 */
protected function getLemmas(array $words)
{
    require_once MODX_CORE_PATH . 'components/modsearch/external/phpmorphy/src/common.php';

    $opts = array(
        'storage' => PHPMORPHY_STORAGE_FILE,
        'predict_by_suffix' => true,
        'predict_by_db' => true,
        'graminfo_as_text' => true,
    );
    // Directory where the dictionaries are located.
    $dir = MODX_CORE_PATH . 'components/modsearch/external/phpmorphy/dicts';
    $lang = 'ru_RU';

    // `new` never evaluates to false; a failed construction surfaces as an
    // exception, so that is what has to be handled to reach the error path.
    try {
        $morphy = new phpMorphy($dir, $lang, $opts);
    } catch (phpMorphy_Exception $e) {
        $this->modx->log(
            xPDO::LOG_LEVEL_ERROR,
            "[" . __CLASS__ . "] Не был получен объект phpMorphy" . ': ' . $e->getMessage()
        );
        return false;
    }

    $lemmas = array();
    foreach ($words as $word) {
        if (!$word) {
            continue;
        }

        // By default phpMorphy looks the word up in the dictionary and, when
        // nothing is found, tries to predict it.
        $base = $morphy->getBaseForm($word);

        if (!is_array($base)) {
            // The word was neither found nor predicted: keep it as is.
            $lemmas[] = $word;
            continue;
        }

        foreach ($base as $lemma) {
            $lemmas[] = $lemma;
        }
    }

    $lemmas = $this->sanitizeWordsArray($lemmas);

    return array_unique($lemmas);
}
// All words in dictionary in UPPER CASE, so don`t forget set proper locale via setlocale(...) call // $morphy->getEncoding() returns dictionary encoding $words = array('КРАКОЗЯБЛИКИ', 'СТАЛИ', 'ВИНА', 'И', 'ДУХИ', 'abc'); /* if(function_exists('iconv')) { foreach($words as &$word) { $word = iconv('windows-1251', $morphy->getEncoding(), $word); } unset($word); } */ try { foreach ($words as $word) { // by default, phpMorphy finds $word in dictionary and when nothig found, try to predict them // you can change this behaviour, via second argument to getXXX or findWord methods $base = $morphy->getBaseForm($word); $all = $morphy->getAllForms($word); $part_of_speech = $morphy->getPartOfSpeech($word); // $base = $morphy->getBaseForm($word, phpMorphy::NORMAL); // normal behaviour // $base = $morphy->getBaseForm($word, phpMorphy::IGNORE_PREDICT); // don`t use prediction // $base = $morphy->getBaseForm($word, phpMorphy::ONLY_PREDICT); // always predict word $is_predicted = $morphy->isLastPredicted(); // or $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_NONE $is_predicted_by_db = $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_DB; $is_predicted_by_suffix = $morphy->getLastPredictionType() == phpMorphy::PREDICT_BY_SUFFIX; // this used for deep analysis $collection = $morphy->findWord($word); // or var_dump($morphy->getAllFormsWithGramInfo($word)); for debug if (false === $collection) { echo $word, " NOT FOUND\n"; continue;
/**
 * Benchmarks phpMorphy::getBaseForm() over a word list and prints the timing.
 *
 * Prints a header naming the encoding, storage type and mode, converts the
 * words with convert_words(), then times either one bulk getBaseForm() call
 * or one call per word. In single mode it also counts how many words were
 * predicted rather than found; in bulk mode that counter stays 0.
 *
 * @param array  $words         Words to look up.
 * @param string $encoding      Encoding passed to convert_words() and shown in the header.
 * @param string $dictDir       Dictionary directory for phpMorphy_FilesBundle.
 * @param string $lang          Dictionary language for phpMorphy_FilesBundle.
 * @param mixed  $storage       Value for the phpMorphy 'storage' option.
 * @param bool   $useBulk       True to time one bulk call, false to time per-word calls.
 * @param bool   $usePrediction Value for the 'predict_by_suffix' option.
 * @return void
 */
function bench_morphy($words, $encoding, $dictDir, $lang, $storage, $useBulk, $usePrediction = true)
{
    $opts = array(
        'storage' => $storage,
        'predict_by_suffix' => $usePrediction,
        'predict_by_db' => false,
    );
    $bundle = new phpMorphy_FilesBundle($dictDir, $lang);
    $morphy = new phpMorphy($bundle, $opts);

    echo "Bench phpMorphy[{$encoding}][{$storage}][" . ($useBulk ? 'BULK' : 'SINGLE') . "] : ";

    // NOTE(review): presumably convert_words() takes $words by reference and
    // re-encodes/upper-cases it in place — confirm against its definition.
    convert_words($words, $encoding, MB_CASE_UPPER);

    $predicted = 0;
    $started = microtime(true);

    if ($useBulk) {
        $morphy->getBaseForm($words);
    } else {
        foreach ($words as $word) {
            // NOTE(review): the result is discarded; presumably left inside
            // the timed loop to include case-conversion cost — confirm.
            mb_strtoupper($word, 'utf-8');

            $morphy->getBaseForm($word);
            if ($morphy->isLastPredicted()) {
                $predicted++;
            }
        }
    }

    $elapsed = microtime(true) - $started;

    // Two microtime() readings can be equal (tiny input, coarse timer);
    // dividing by zero would abort the benchmark, so report 0 instead.
    $wordsPerSecond = $elapsed > 0 ? count($words) / $elapsed : 0.0;

    printf("time = %0.2f sec, words per second = %0.2f, predicted = %d\n", $elapsed, $wordsPerSecond, $predicted);
}
$words = preg_split('#\\s|[,.:;!?"\'()]#', $text, -1, PREG_SPLIT_NO_EMPTY); $bulk_words = array(); foreach ($words as $v) { if (strlen($v) > 3) { $bulk_words[] = strtoupper($v); } } return $morphy->getAllForms($bulk_words); } /* * * @param string $text * @return string */ function Words2BaseForm($text) { require_once $GLOBALS['PATH_sys'] . 'phpmorphy/src/common.php'; // set some options $opts = array('storage' => PHPMORPHY_STORAGE_MEM, 'with_gramtab' => false, 'predict_by_suffix' => true, 'predict_by_db' => true); $dir = $GLOBALS['PATH_sys'] . 'phpmorphy/dicts'; // Create descriptor for dictionary located in $dir directory with russian language $dict_bundle = new phpMorphy_FilesBundle($dir, 'rus'); // Create phpMorphy instance $morphy = new phpMorphy($dict_bundle, $opts); // All words in dictionary in UPPER CASE, so don`t forget set proper locale // Supported dicts and locales: // *------------------------------* // | Dict. language | Locale name | // |------------------------------| // | Russian | cp1251 | // |------------------------------| // | English | cp1250 | // |------------------------------| // | German | cp1252 | // *------------------------------* // $codepage = $morphy->getCodepage(); //setlocale(LC_CTYPE, array('ru_RU.CP1251', 'Russian_Russia.1251')); $words = preg_replace('#\\[.*\\]#isU', '', $text); $words = preg_split('#\\s|[,.:;!?"\'()]#', $words, -1, PREG_SPLIT_NO_EMPTY); $bulk_words = array(); foreach ($words as $v) { if (strlen($v) > 3) { $bulk_words[] = mb_strtoupper($v, "UTF-8");