/**
 * Dumps the grammatical tables of a graminfo file (parts of speech,
 * grammems and ancodes) into a serialized "gramtab" file.
 *
 * The output is written to "<outDir>/gramtab[_txt].<locale>.bin", where
 * <locale> is the lower-cased locale reported by the graminfo file and the
 * "_txt" suffix is present only when $asText is true.
 *
 * @param string $graminfoFile Path of the graminfo file to read.
 * @param string $outDir       Directory that receives the output file.
 * @param bool   $asText       When true, the 'pos_id' and every entry of
 *                             'grammem_ids' of each ancode are replaced by
 *                             the corresponding 'name' values.
 *
 * @return void
 *
 * @throws Exception When an ancode refers to a pos_id or grammem_id that is
 *                   not present in the tables, or when the output file
 *                   cannot be written.
 */
function extract_gramtab($graminfoFile, $outDir, $asText) {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create(
        $factory->open(PHPMORPHY_STORAGE_FILE, $graminfoFile, false),
        false
    );

    $poses = $graminfo->readAllPartOfSpeech();
    $grammems = $graminfo->readAllGrammems();
    $ancodes = $graminfo->readAllAncodes();

    if ($asText) {
        foreach ($ancodes as &$ancode) {
            $pos_id = $ancode['pos_id'];

            if (!isset($poses[$pos_id])) {
                throw new Exception("Unknown pos_id '{$pos_id}' found");
            }

            $ancode['pos_id'] = $poses[$pos_id]['name'];

            foreach ($ancode['grammem_ids'] as &$grammem_id) {
                if (!isset($grammems[$grammem_id])) {
                    throw new Exception("Unknown grammem_id '{$grammem_id}' found");
                }

                $grammem_id = $grammems[$grammem_id]['name'];
            }
            // Break the reference so that later writes to $grammem_id cannot
            // silently modify the last grammem of this ancode.
            unset($grammem_id);
        }
        // Same for the outer loop variable: it would otherwise stay bound to
        // the last element of $ancodes.
        unset($ancode);
    }

    $result = array(
        'poses' => $poses,
        'grammems' => $grammems,
        'ancodes' => $ancodes,
    );

    $type = $asText ? '_txt' : '';
    $out_file = $outDir . '/' . 'gramtab' . $type . '.' . strtolower($graminfo->getLocale()) . '.bin';

    if (false === file_put_contents($out_file, serialize($result))) {
        throw new Exception("Can`t write '{$out_file}'");
    }
}
#!/usr/bin/php <?php if (2 == (ini_get('mbstring.func_overload') & 2)) { die("don`t overload string functions in mbstring extension, see mbstring.func_overload option"); } if ($argc < 3) { echo "Usage " . $argv[0] . " MORPH_DATA_FILE OUT_DIR"; exit; } require_once dirname(__FILE__) . '/../src/common.php'; $file = $argv[1]; $out_dir = $argv[2]; $use_references = true; try { $factory = new phpMorphy_Storage_Factory(); $graminfo = phpMorphy_GramInfo::create($factory->open(PHPMORPHY_STORAGE_FILE, $file, false), false); $ancodes_map = new Map('ancodes'); $flexias_map = new Map('affixes'); $i = 0; foreach ($graminfo->readAllFlexia() as $id => $flexia) { $offset = $flexia['header']['offset']; // + $graminfo->getGramInfoHeaderSize(); $ancodes_map->update($flexia, $offset); //$flexias_map->update($flexia, $offset); $i++; } echo "Total flexias = {$i}, unique ancodes = " . count($ancodes_map->getMap()) . ', unique flexias = ' . count($flexias_map->getMap()) . PHP_EOL; $out_file_format = $out_dir . '/%s.' . strtolower($graminfo->getLocale()) . '.bin'; file_put_contents(sprintf($out_file_format, 'morph_data_ancodes_cache'), serialize($ancodes_map->compose($use_references))); //file_put_contents(sprintf($out_file_format, 'morph_data_flexias_cache'), serialize($flexias_map->compose($use_references))); } catch (Exception $e) {
#!/usr/bin/env php <?php require_once __DIR__ . '/../init.php'; if ($argc < 3) { echo "Usage " . $argv[0] . " MORPH_DATA_FILE LANGUAGE OUT_DIR"; exit; } $graminfo_file = $argv[1]; $language = $argv[2]; $out_dir = $argv[3]; try { $factory = new phpMorphy_Storage_Factory(); $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create($factory->create(phpMorphy::STORAGE_FILE, $graminfo_file, false), false); $out_file = $out_dir . '/morph_data_ancodes_map.' . strtolower($graminfo->getLocale()) . '.bin'; $gramtab_map = get_gramtab_map($language); $valid_ancodes = array_flip(array_values($gramtab_map)); $ancodes_map = array(); foreach (get_all_ancodes($graminfo) as $id => $value) { if (isset($gramtab_map[$value])) { $orig_ancode = $gramtab_map[$value]; $ancodes_map[$id] = $orig_ancode; } else { // TODO: typically ancodes don`t contain digits, so we can generateHeaderPhpDoc mapping to char + digit ancodes do { $new_ancode = chr(mt_rand(ord('a'), ord('z'))) . chr(mt_rand(ord('a'), ord('z'))); } while (isset($valid_ancodes[$new_ancode])); echo "'{$value}' not found in gramtab, assume {$new_ancode}" . PHP_EOL; $ancodes_map[$id] = $new_ancode; } } foreach ($ancodes_map as &$ancode) {
#!/usr/bin/env php <?php set_include_path(__DIR__ . '/../../src/' . PATH_SEPARATOR . get_include_path()); require 'phpMorphy.php'; if ($argc < 3) { echo "Usage " . $argv[0] . " MORPH_DATA_FILE OUT_DIR"; exit; } $file = $argv[1]; $out_dir = $argv[2]; $use_references = true; try { $factory = new phpMorphy_Storage_Factory(); $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create($factory->create(PHPMORPHY_STORAGE_FILE, $file, false), false); $ancodes_map = new Map('ancodes'); $flexias_map = new Map('affixes'); $i = 0; foreach ($graminfo->readAllFlexia() as $id => $flexia) { $offset = $flexia['header']['offset']; // + $graminfo->getGramInfoHeaderSize(); $ancodes_map->update($flexia, $offset); //$flexias_map->update($flexia, $offset); $i++; } echo "Total flexias = {$i}, unique ancodes = " . count($ancodes_map->getMap()) . ', unique flexias = ' . count($flexias_map->getMap()) . PHP_EOL; $out_file_format = $out_dir . '/%s.' . strtolower($graminfo->getLocale()) . '.bin'; file_put_contents(sprintf($out_file_format, 'morph_data_ancodes_cache'), serialize($ancodes_map->compose($use_references))); //file_put_contents(sprintf($out_file_format, 'morph_data_flexias_cache'), serialize($flexias_map->compose($use_references))); } catch (Exception $e) { echo $e; exit(1);
/**
 * Dumps the grammatical tables of a graminfo file (parts of speech,
 * grammems and ancodes) into a serialized "gramtab" file.
 *
 * Part-of-speech names are passed through a case converter created with
 * 'upper'; grammem names are passed through a converter created with $case.
 * The grammem ids of every ancode are run through the locale-specific
 * grammems processor and the 'offset' key is removed from every ancode.
 *
 * The output is written to "<outDir>/gramtab[_txt].<locale>.bin", where
 * <locale> is the lower-cased locale reported by the graminfo file and the
 * "_txt" suffix is present only when $asText is true.
 *
 * @param string $graminfoFile Path of the graminfo file to read.
 * @param string $outDir       Directory that receives the output file.
 * @param bool   $asText       When true, the 'pos_id' and every entry of
 *                             'grammem_ids' of each ancode are replaced by
 *                             the corresponding (case-converted) names.
 * @param mixed  $case         Case selector handed to
 *                             CaseConverterAbstract::create() for grammem
 *                             names.
 *
 * @return void
 *
 * @throws Exception When an ancode refers to a pos_id or grammem_id that is
 *                   not present in the tables, or when the output file
 *                   cannot be written.
 */
function extract_gramtab($graminfoFile, $outDir, $asText, $case) {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create(
        $factory->create(phpMorphy::STORAGE_FILE, $graminfoFile, false),
        false
    );

    $grammems_processor = GrammemsProcessorAbstract::create($graminfo->getLocale());
    $pos_case_converter = CaseConverterAbstract::create($graminfo->getEncoding(), 'upper');
    $grammems_case_converter = CaseConverterAbstract::create($graminfo->getEncoding(), $case);

    $poses = $graminfo->readAllPartOfSpeech();
    $grammems = $graminfo->readAllGrammems();
    $ancodes = $graminfo->readAllAncodes();

    // Normalise the case of all names in the lookup tables.
    foreach ($poses as &$pos) {
        $pos['name'] = $pos_case_converter->convert($pos['name']);
    }
    unset($pos);

    foreach ($grammems as &$grammem) {
        $grammem['name'] = $grammems_case_converter->convert($grammem['name']);
    }
    unset($grammem);

    // Post-process the grammem ids of every ancode and drop the 'offset' key.
    foreach ($ancodes as &$ancode) {
        $ancode['grammem_ids'] = $grammems_processor->process($ancode['pos_id'], $ancode['grammem_ids']);
        unset($ancode['offset']);
    }
    unset($ancode);

    if ($asText) {
        foreach ($ancodes as &$ancode) {
            $pos_id = $ancode['pos_id'];

            if (!isset($poses[$pos_id])) {
                throw new Exception("Unknown pos_id '{$pos_id}' found");
            }

            // NOTE(review): names in $poses / $grammems were already converted
            // above, so the convert() calls below look redundant - confirm the
            // converters are idempotent before removing them.
            $ancode['pos_id'] = $pos_case_converter->convert($poses[$pos_id]['name']);

            foreach ($ancode['grammem_ids'] as &$grammem_id) {
                if (!isset($grammems[$grammem_id])) {
                    throw new Exception("Unknown grammem_id '{$grammem_id}' found");
                }

                $grammem_id = $grammems_case_converter->convert($grammems[$grammem_id]['name']);
            }
            // Break the reference so that later writes to $grammem_id cannot
            // silently modify the last grammem of this ancode.
            unset($grammem_id);
        }
        unset($ancode);
    }

    $result = array(
        'poses' => $poses,
        'grammems' => $grammems,
        'ancodes' => $ancodes,
    );

    $type = $asText ? '_txt' : '';
    $out_file = $outDir . '/' . 'gramtab' . $type . '.' . strtolower($graminfo->getLocale()) . '.bin';

    if (false === file_put_contents($out_file, serialize($result))) {
        throw new Exception("Can`t write '{$out_file}'");
    }
}