Пример #1
0
 /**
  * Constructor.
  * Opens and loads the pinyin dictionary.
  *
  * @param string|null $file Path of the XDB pinyin database; when null,
  *                          "py.xdb" next to this source file is used.
  */
 public function __construct($file = null)
 {
     $baseDir = dirname(__FILE__);
     require_once $baseDir . '/xdb.class.php';

     // Fall back to the database bundled beside this class.
     $dbPath = ($file === null) ? $baseDir . '/py.xdb' : $file;

     $db = new XTreeDB();
     $opened = $db->Open($dbPath, 'r');
     if (!$opened) {
         trigger_error('Failed to load pinyin database: ' . $dbPath, E_USER_ERROR);
     }

     $this->xdb = $db;
 }
Пример #2
0
// Dump the plain text dictionary from .xdb file used by SCWS
// Usage: php dump_xdb_file.php <xdb file> [output file]
// $Id: $
// Raise the memory limit and remove the execution time limit for this run.
ini_set('memory_limit', '1024M');
set_time_limit(0);
// The first argument is mandatory and must name an existing regular file.
if (!isset($_SERVER['argv'][1]) || !is_file($_SERVER['argv'][1])) {
    echo "Usage: {$_SERVER['argv'][0]} <xdb file> [output file]\n";
    exit(0);
}
// The optional second argument is the output path; without it the dump goes
// to standard output.
$output = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdout';
// fopen() warnings are suppressed with "@"; failure is reported by the
// message below instead.
if (!($fd = @fopen($output, 'w'))) {
    echo "ERROR: can not open the output file: {$output}\n";
    exit(0);
}
require 'xdb.class.php';
$xdb = new XTreeDB();
// Open() is called without an explicit mode here.
// NOTE(review): presumably the default mode is read-only -- confirm in xdb.class.php.
if (!$xdb->Open($_SERVER['argv'][1])) {
    // Release the already opened output handle before bailing out.
    fclose($fd);
    echo "ERROR: input file {$_SERVER['argv'][1]} maybe not a valid XDB file.\n";
    exit(0);
}
// Write the tab-separated column header as a "#" comment line.
$line = "# WORD\tTF\tIDF\tATTR\n";
fwrite($fd, $line);
// Reset the database cursor before the Next() loop that follows.
$xdb->Reset();
while ($tmp = $xdb->Next()) {
    if (strlen($tmp['value']) != 12) {
        continue;
    }
    $word = $tmp['key'];
    $data = unpack("ftf/fidf/Cflag/a3attr", $tmp['value']);
    if (!($data['flag'] & 0x1)) {
Пример #3
0
 /**
  * Builds the pinyin XDB dictionary from plain-text source files.
  *
  * Steps, in order:
  *  1. read the pinyin list: every short key (at most 3 bytes) is stored
  *     together with its first reading, and keys with several distinct
  *     readings are remembered as polyphonic characters;
  *  2. store only those longer keys (phrases) that contain at least one
  *     polyphonic character and do not exceed self::MAX_WORD_LEN bytes;
  *  3. if the syllable file exists, store each syllable with bit 0x1 and
  *     each of its proper prefixes with bit 0x2;
  *  4. optimize and close the database.
  *
  * @param string|null $output Target XDB file; defaults to $this->dictFile.
  * @param string|null $py     Pinyin list file; defaults to "py.txt" under $this->inputPath.
  * @param string|null $yj     Syllable list file; defaults to "yj.txt" under $this->inputPath.
  */
 public function actionMake($output = null, $py = null, $yj = null)
 {
     if ($output === null) {
         $output = $this->dictFile;
     }
     if ($py === null) {
         $py = $this->inputPath . '/py.txt';
     }
     if ($yj === null) {
         $yj = $this->inputPath . '/yj.txt';
     }
     // Validate the input BEFORE removing an existing output file, so that a
     // wrong --py path does not destroy a previously built dictionary.
     // NOTE(review): the code relies on usageError() terminating the script -- confirm.
     if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
         $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
     }
     if (file_exists($output) && !unlink($output)) {
         fclose($fd);
         $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
     }
     require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
     $xdb = new XTreeDB();
     if (!$xdb->Open($output, 'w')) {
         fclose($fd);
         $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
     }
     echo "拼音文件:" . $py . "\n";
     echo "输出文件:" . $output . "\n";
     echo "开始制作拼音库,正在加载拼音列表 ... ";
     $znum = 0;
     $mchars = $words = array();
     // Compare against false explicitly: a bare truthiness test would stop
     // the loop early on a falsy line such as a final "0" without newline.
     while (($line = fgets($fd, 256)) !== false) {
         $line = trim($line);
         if ($line === '' || substr($line, 0, 1) === '#') {
             continue;
         }
         $fields = explode(' ', $line, 2);
         if (count($fields) < 2) {
             // Malformed line (a key without any pinyin): skip it instead of
             // reading an undefined offset and storing an empty value.
             continue;
         }
         list($key, $value) = $fields;
         $value = trim($value);
         // NOTE(review): the 3-byte threshold presumably means "one UTF-8
         // encoded Chinese character" -- confirm against the input encoding.
         if (strlen($key) > 3) {
             // Phrase: keep only the first whitespace-free chunk of the value.
             if (($pos = strpos($value, ' ')) !== false) {
                 $value = substr($value, 0, $pos);
             }
             $words[$key] = $value;
         } else {
             if (strpos($value, ' ') !== false) {
                 $values = array_unique(preg_split('/\\s+/', $value));
                 if (count($values) > 1) {
                     // More than one distinct reading: polyphonic character.
                     $mchars[$key] = implode(' ', $values);
                 }
                 // array_unique() keeps the first occurrence, so index 0 is
                 // always present and holds the first listed reading.
                 $value = $values[0];
             }
             $xdb->Put($key, $value);
             $znum++;
         }
     }
     fclose($fd);
     echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";
     echo "开始分析包含多音字的词组 ... ";
     $add_num = $skip_num = $max_len = 0;
     foreach ($words as $word => $value) {
         $word_len = strlen($word);
         // Walk the phrase in 3-byte steps looking for a polyphonic character.
         $save = false;
         for ($off = 0; $off < $word_len; $off += 3) {
             if (isset($mchars[substr($word, $off, 3)])) {
                 $save = true;
                 break;
             }
         }
         if (!$save || $word_len > self::MAX_WORD_LEN) {
             $skip_num++;
         } else {
             $add_num++;
             $xdb->Put($word, $value);
             if ($word_len > $max_len) {
                 $max_len = $word_len;
             }
         }
     }
     echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";
     if (file_exists($yj)) {
         echo "开始加载音节数据 ... ";
         $lines = file($yj);
         if ($lines === false) {
             // file() failed to read: treat as empty rather than iterating
             // over and counting a boolean.
             $lines = array();
         }
         $yinjie = array();
         foreach ($lines as $line) {
             $line = trim($line);
             if ($line === '') {
                 continue;
             }
             // Bit 0x1 is set for every string that occurs as a whole line.
             $yinjie[$line] = isset($yinjie[$line]) ? ($yinjie[$line] | 0x1) : 0x1;
             // Bit 0x2 is set for every proper prefix of that line.
             $line_len = strlen($line);
             for ($i = 1; $i < $line_len; $i++) {
                 $part = substr($line, 0, $i);
                 $yinjie[$part] = isset($yinjie[$part]) ? ($yinjie[$part] | 0x2) : 0x2;
             }
         }
         foreach ($yinjie as $key => $value) {
             $xdb->Put($key, $value);
         }
         echo "完成,共计 " . count($lines) . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
     }
     echo "正在优化整理数据库 ... ";
     $xdb->Optimize();
     $xdb->Close();
     echo "完成!\n";
 }
Пример #4
0
<?php

// xxx
// convert dict.txt -> dict.xdb
//
// Constants for the conversion. Their exact meaning is not visible in this
// part of the script.
define("_WORD_ALONE_", 0x4000000);
define("_WORD_PART_", 0x8000000);
define("_WORD_MAXLEN_", 12);
// Output path: first command line argument, defaulting to "dict.xdb".
// isset() is applied to the array element itself so that a missing argument
// does not raise an undefined-index notice.
$file = isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : "dict.xdb";
require 'xdb.class.php';
$db = new XTreeDB();
// Test the result of Open(): $db is always an object, so checking $db itself
// can never detect a failed open.
if (!$db->Open($file, 'w')) {
    die("fail to open dictionary file.\n");
}
$word_num = 0;
$record_num = 0;
$skip_num = 0;
// load to memory first
$rec = array();
$fd = fopen("dict.txt", "r");
if (!$fd) {
    // Without the input there is nothing to convert; close the database
    // before aborting instead of reading from an invalid handle.
    $db->Close();
    die("fail to open input file: dict.txt\n");
}
echo "Loading data into memory ... \n";
while ($line = fgets($fd, 256)) {
    $line = trim($line);
    if (empty($line)) {
        continue;
    }
    $w = preg_split("/[\\s]+/", $line);
Пример #5
0
    echo "Usage: mbstring exteions is required.\n";
    exit(0);
}
// Input text file: second command line argument, or standard input when it
// is not given. fopen() warnings are suppressed with "@"; failure is reported
// by the message below instead.
$input = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdin';
if (!($fd = @fopen($input, 'r'))) {
    echo "ERROR: can not open the input file: {$input}\n";
    exit(0);
}
// Output XDB file: first command line argument. An existing file is never
// overwritten.
// NOTE(review): argv[1] is read without an isset() check here; presumably it
// is validated earlier in the script -- confirm.
$output = $_SERVER['argv'][1];
if (file_exists($output)) {
    echo "ERROR: output xdb file exists: {$output}\n";
    exit(0);
}
require 'xdb.class.php';
$xdb = new XTreeDB();
if (!$xdb->Open($output, 'w')) {
    echo "ERROR: can not open the XDB to write: {$output}\n";
    exit(0);
}
// load data
// Select the multibyte internal encoding from the IS_UTF8_TXT constant, which
// is defined outside this part of the script.
mb_internal_encoding(IS_UTF8_TXT ? 'UTF-8' : 'gbk');
$total = 0;
$rec = array();
echo "INFO: Loading text file data ... ";
while ($line = fgets($fd, 512)) {
    if (substr($line, 0, 1) == '#') {
        continue;
    }
    list($word, $tf, $idf, $attr) = explode("\t", $line, 4);
    $k = ord($word[0]) + ord($word[1]) & 0x3f;
Пример #6
0
    echo "       {$_SERVER['argv'][0]} dict.txt dict.xdb\n";
    echo "       {$_SERVER['argv'][0]} dict.txt dict.cdb\n\n";
    exit(0);
}
// get the parameters
$input = $_SERVER['argv'][1];
$output = $_SERVER['argv'][2];
// create the dbm file; the backend is chosen from the output file extension
// (strrchr() yields the part from the last "." on, or false without a dot)
$output_ext = strrchr($output, '.');
// The three branches must form a single if/elseif/else chain: with a
// separate "if" for .hdb, a .hdb output would also fall into the final
// "else" and the HashTreeDB handle would be replaced by a DbaHandler opened
// on the same file.
if ($output_ext === '.hdb') {
    require 'hdb.class.php';
    $db = new HashTreeDB(0, 0x3ffd);
    $ok = $db->Open($output, 'w');
} elseif ($output_ext === '.xdb') {
    require 'xdb.class.php';
    $db = new XTreeDB(0, 0x3ffd);
    $ok = $db->Open($output, 'w');
} else {
    // Any other extension is handled by the dba wrapper.
    require 'dba.class.php';
    $db = new DbaHandler();
    $ok = $db->Open($output, 'n');
}
if (!$ok) {
    echo "ERROR: cann't setup the database({$output}).\n";
    exit(0);
}
// check the input file
$fd = fopen($input, "r");
if (!$fd) {
    $db->Close();
    echo "ERROR: cann't read the input file({$input}).\n";