Пример #1
0
/**
 * Dumps the part-of-speech, grammem and ancode tables of a graminfo file
 * into a serialized "gramtab" file inside $outDir.
 *
 * The output file is named "gramtab[_txt].<lowercased locale>.bin", where the
 * "_txt" suffix is present only when $asText is truthy.
 *
 * @param string $graminfoFile Path of the graminfo file to open.
 * @param string $outDir       Directory that receives the gramtab file.
 * @param bool   $asText       When truthy, every ancode's 'pos_id' and each of
 *                             its 'grammem_ids' is replaced by the matching
 *                             'name' from the poses / grammems tables.
 *
 * @throws Exception When an ancode refers to an unknown pos_id or grammem_id,
 *                   or when the output file cannot be written.
 */
function extract_gramtab($graminfoFile, $outDir, $asText)
{
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create(
        $factory->open(PHPMORPHY_STORAGE_FILE, $graminfoFile, false),
        false
    );

    $poses = $graminfo->readAllPartOfSpeech();
    $grammems = $graminfo->readAllGrammems();
    $ancodes = $graminfo->readAllAncodes();

    if ($asText) {
        // Resolve numeric ids to their textual names in place.
        foreach ($ancodes as &$ancode) {
            $pos_id = $ancode['pos_id'];
            if (!isset($poses[$pos_id])) {
                throw new Exception("Unknown pos_id '{$pos_id}' found");
            }
            $ancode['pos_id'] = $poses[$pos_id]['name'];

            foreach ($ancode['grammem_ids'] as &$grammem_id) {
                if (!isset($grammems[$grammem_id])) {
                    throw new Exception("Unknown grammem_id '{$grammem_id}' found");
                }
                $grammem_id = $grammems[$grammem_id]['name'];
            }
            // Break the reference so a later write to $grammem_id cannot
            // silently modify the last element of this ancode's list.
            unset($grammem_id);
        }
        // Same for the outer loop variable: without this, $ancode keeps
        // aliasing the last element of $ancodes.
        unset($ancode);
    }

    $result = array('poses' => $poses, 'grammems' => $grammems, 'ancodes' => $ancodes);

    $type = $asText ? '_txt' : '';
    $out_file = 'gramtab' . $type . '.' . strtolower($graminfo->getLocale()) . '.bin';
    $out_file = $outDir . '/' . $out_file;

    if (false === file_put_contents($out_file, serialize($result))) {
        throw new Exception("Can`t write '{$out_file}'");
    }
}
Пример #2
0
#!/usr/bin/php
<?php
// Command-line entry point: validates the runtime and the arguments, loads the
// project bootstrap and exposes the settings used by the code that follows.

// Bit 2 of mbstring.func_overload replaces the plain string functions; refuse
// to run in that configuration.
if ((ini_get('mbstring.func_overload') & 2) === 2) {
    exit("don`t overload string functions in mbstring extension, see mbstring.func_overload option");
}

// Two positional arguments are required after the script name.
if ($argc <= 2) {
    printf('Usage %s MORPH_DATA_FILE OUT_DIR', $argv[0]);
    exit;
}

require_once __DIR__ . '/../src/common.php';

// $argv[1] is the morph data file, $argv[2] the output directory.
list(, $file, $out_dir) = $argv;
$use_references = true;
// Reads every flexia record from the morph data file, feeds it into the
// 'ancodes' Map and writes the composed, serialized map to
// "<out_dir>/morph_data_ancodes_cache.<lowercased locale>.bin".
try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create($factory->open(PHPMORPHY_STORAGE_FILE, $file, false), false);
    $ancodes_map = new Map('ancodes');
    // NOTE(review): $flexias_map is created but never updated below (the
    // update call is commented out), so the "unique flexias" figure printed
    // later only reflects a freshly constructed Map.
    $flexias_map = new Map('affixes');
    // Counts the flexia records seen.
    $i = 0;
    foreach ($graminfo->readAllFlexia() as $id => $flexia) {
        // Offset taken from the record's own header.
        $offset = $flexia['header']['offset'];
        // + $graminfo->getGramInfoHeaderSize();
        $ancodes_map->update($flexia, $offset);
        //$flexias_map->update($flexia, $offset);
        $i++;
    }
    echo "Total flexias = {$i}, unique ancodes = " . count($ancodes_map->getMap()) . ', unique flexias = ' . count($flexias_map->getMap()) . PHP_EOL;
    // '%s' is filled with the cache name by sprintf() below.
    $out_file_format = $out_dir . '/%s.' . strtolower($graminfo->getLocale()) . '.bin';
    // NOTE(review): the return value of file_put_contents() is not checked,
    // so a failed write is not reported here.
    file_put_contents(sprintf($out_file_format, 'morph_data_ancodes_cache'), serialize($ancodes_map->compose($use_references)));
    //file_put_contents(sprintf($out_file_format, 'morph_data_flexias_cache'), serialize($flexias_map->compose($use_references)));
} catch (Exception $e) {
#!/usr/bin/env php
<?php
// Command-line entry point: loads the project bootstrap and reads the three
// positional arguments used by the code that follows.
require_once __DIR__ . '/../init.php';

// MORPH_DATA_FILE, LANGUAGE and OUT_DIR are all required. $argv[0] holds the
// script name, so $argc must be at least 4; a smaller count would leave
// $argv[3] undefined.
if ($argc < 4) {
    echo "Usage " . $argv[0] . " MORPH_DATA_FILE LANGUAGE OUT_DIR";
    exit;
}

$graminfo_file = $argv[1];
$language = $argv[2];
$out_dir = $argv[3];
// Builds $ancodes_map: for every ancode read from the graminfo file, either
// the ancode that the language's gramtab map assigns to it or, when the
// gramtab has no entry, a randomly generated two-letter replacement.
try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create($factory->create(phpMorphy::STORAGE_FILE, $graminfo_file, false), false);
    $out_file = $out_dir . '/morph_data_ancodes_map.' . strtolower($graminfo->getLocale()) . '.bin';
    $gramtab_map = get_gramtab_map($language);
    // Flipped so that the ancodes used by the gramtab become keys, which
    // allows the isset() lookup in the generation loop below.
    $valid_ancodes = array_flip(array_values($gramtab_map));
    $ancodes_map = array();
    foreach (get_all_ancodes($graminfo) as $id => $value) {
        if (isset($gramtab_map[$value])) {
            $orig_ancode = $gramtab_map[$value];
            $ancodes_map[$id] = $orig_ancode;
        } else {
            // TODO: typically ancodes don`t contain digits, so we can generate a mapping to char + digit ancodes
            // Draw random two-letter codes (a-z) until one is found that the
            // gramtab does not already use.
            // NOTE(review): the generated code is never added to
            // $valid_ancodes, so two unknown values may receive the same
            // random ancode - confirm whether that is acceptable.
            do {
                $new_ancode = chr(mt_rand(ord('a'), ord('z'))) . chr(mt_rand(ord('a'), ord('z')));
            } while (isset($valid_ancodes[$new_ancode]));
            echo "'{$value}' not found in gramtab, assume {$new_ancode}" . PHP_EOL;
            $ancodes_map[$id] = $new_ancode;
        }
    }
    foreach ($ancodes_map as &$ancode) {
Пример #4
0
#!/usr/bin/env php
<?php
// Command-line entry point: puts the project's src directory first on the
// include path, loads phpMorphy and reads the two positional arguments.
set_include_path(implode(PATH_SEPARATOR, array(__DIR__ . '/../../src/', get_include_path())));
require 'phpMorphy.php';

// Two positional arguments are required after the script name.
if ($argc <= 2) {
    echo "Usage {$argv[0]} MORPH_DATA_FILE OUT_DIR";
    exit;
}

// Settings consumed by the code that follows.
$use_references = true;
$out_dir = $argv[2];
$file = $argv[1];
// Reads every flexia record from the morph data file, feeds it into the
// 'ancodes' Map and writes the composed, serialized map to
// "<out_dir>/morph_data_ancodes_cache.<lowercased locale>.bin".
// Any exception is printed and the script exits with status 1.
try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create($factory->create(PHPMORPHY_STORAGE_FILE, $file, false), false);
    $ancodes_map = new Map('ancodes');
    // NOTE(review): $flexias_map is created but never updated below (the
    // update call is commented out), so the "unique flexias" figure printed
    // later only reflects a freshly constructed Map.
    $flexias_map = new Map('affixes');
    // Counts the flexia records seen.
    $i = 0;
    foreach ($graminfo->readAllFlexia() as $id => $flexia) {
        // Offset taken from the record's own header.
        $offset = $flexia['header']['offset'];
        // + $graminfo->getGramInfoHeaderSize();
        $ancodes_map->update($flexia, $offset);
        //$flexias_map->update($flexia, $offset);
        $i++;
    }
    echo "Total flexias = {$i}, unique ancodes = " . count($ancodes_map->getMap()) . ', unique flexias = ' . count($flexias_map->getMap()) . PHP_EOL;
    // '%s' is filled with the cache name by sprintf() below.
    $out_file_format = $out_dir . '/%s.' . strtolower($graminfo->getLocale()) . '.bin';
    // NOTE(review): the return value of file_put_contents() is not checked,
    // so a failed write is not reported here.
    file_put_contents(sprintf($out_file_format, 'morph_data_ancodes_cache'), serialize($ancodes_map->compose($use_references)));
    //file_put_contents(sprintf($out_file_format, 'morph_data_flexias_cache'), serialize($flexias_map->compose($use_references)));
} catch (Exception $e) {
    // Echoing the exception prints its string representation.
    echo $e;
    exit(1);
Пример #5
0
/**
 * Dumps the part-of-speech, grammem and ancode tables of a graminfo file
 * into a serialized "gramtab" file inside $outDir.
 *
 * Part-of-speech names are run through an 'upper' case converter and grammem
 * names through a converter created for $case. Every ancode has its
 * 'grammem_ids' passed through the locale's grammems processor and its
 * 'offset' entry removed.
 *
 * The output file is named "gramtab[_txt].<lowercased locale>.bin", where the
 * "_txt" suffix is present only when $asText is truthy.
 *
 * @param string $graminfoFile Path of the graminfo file to open.
 * @param string $outDir       Directory that receives the gramtab file.
 * @param bool   $asText       When truthy, every ancode's 'pos_id' and each of
 *                             its 'grammem_ids' is replaced by the matching
 *                             (case-converted) name.
 * @param string $case         Case mode handed to CaseConverterAbstract::create()
 *                             for grammem names (the part-of-speech converter
 *                             uses 'upper').
 *
 * @throws Exception When an ancode refers to an unknown pos_id or grammem_id,
 *                   or when the output file cannot be written.
 */
function extract_gramtab($graminfoFile, $outDir, $asText, $case)
{
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo_GramInfoAbstract::create(
        $factory->create(phpMorphy::STORAGE_FILE, $graminfoFile, false),
        false
    );

    $grammems_processor = GrammemsProcessorAbstract::create($graminfo->getLocale());
    $pos_case_converter = CaseConverterAbstract::create($graminfo->getEncoding(), 'upper');
    $grammems_case_converter = CaseConverterAbstract::create($graminfo->getEncoding(), $case);

    $poses = $graminfo->readAllPartOfSpeech();
    $grammems = $graminfo->readAllGrammems();
    $ancodes = $graminfo->readAllAncodes();

    // Normalise the case of the names stored in the lookup tables.
    foreach ($poses as &$pos) {
        $pos['name'] = $pos_case_converter->convert($pos['name']);
    }
    unset($pos);

    foreach ($grammems as &$grammem) {
        $grammem['name'] = $grammems_case_converter->convert($grammem['name']);
    }
    unset($grammem);

    // Post-process each ancode's grammem list and drop its 'offset' entry.
    foreach ($ancodes as &$ancode) {
        $ancode['grammem_ids'] = $grammems_processor->process($ancode['pos_id'], $ancode['grammem_ids']);
        unset($ancode['offset']);
    }
    unset($ancode);

    if ($asText) {
        // Resolve numeric ids to their textual names in place.
        foreach ($ancodes as &$ancode) {
            $pos_id = $ancode['pos_id'];
            if (!isset($poses[$pos_id])) {
                throw new Exception("Unknown pos_id '{$pos_id}' found");
            }
            // The table names were already converted above; the second
            // conversion is kept so the output stays unchanged.
            $ancode['pos_id'] = $pos_case_converter->convert($poses[$pos_id]['name']);

            foreach ($ancode['grammem_ids'] as &$grammem_id) {
                if (!isset($grammems[$grammem_id])) {
                    throw new Exception("Unknown grammem_id '{$grammem_id}' found");
                }
                $grammem_id = $grammems_case_converter->convert($grammems[$grammem_id]['name']);
            }
            // Break the reference so a later write to $grammem_id cannot
            // silently modify the last element of this ancode's list.
            unset($grammem_id);
        }
        unset($ancode);
    }

    $result = array('poses' => $poses, 'grammems' => $grammems, 'ancodes' => $ancodes);

    $type = $asText ? '_txt' : '';
    $out_file = 'gramtab' . $type . '.' . strtolower($graminfo->getLocale()) . '.bin';
    $out_file = $outDir . '/' . $out_file;

    if (false === file_put_contents($out_file, serialize($result))) {
        throw new Exception("Can`t write '{$out_file}'");
    }
}