/**
 * Constructor.
 *
 * Opens the pinyin dictionary in read mode and keeps the handle on the instance.
 *
 * @param string|null $file Path of the XDB dictionary to load. When null,
 *                          the py.xdb file located next to this source file is used.
 */
public function __construct($file = null)
{
    $baseDir = dirname(__FILE__);
    require_once $baseDir . '/xdb.class.php';

    // Fall back to the bundled dictionary when the caller gives no path.
    $path = ($file === null) ? $baseDir . '/py.xdb' : $file;

    $db = new XTreeDB();
    $opened = $db->Open($path, 'r');
    if (!$opened) {
        trigger_error('Failed to load pinyin database: ' . $path, E_USER_ERROR);
    }

    $this->xdb = $db;
}
// Usage: php dump_xdb_file.php <xdb file> [output file] // $Id: $ ini_set('memory_limit', '1024M'); set_time_limit(0); if (!isset($_SERVER['argv'][1]) || !is_file($_SERVER['argv'][1])) { echo "Usage: {$_SERVER['argv'][0]} <xdb file> [output file]\n"; exit(0); } $output = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdout'; if (!($fd = @fopen($output, 'w'))) { echo "ERROR: can not open the output file: {$output}\n"; exit(0); } require 'xdb.class.php'; $xdb = new XTreeDB(); if (!$xdb->Open($_SERVER['argv'][1])) { fclose($fd); echo "ERROR: input file {$_SERVER['argv'][1]} maybe not a valid XDB file.\n"; exit(0); } $line = "# WORD\tTF\tIDF\tATTR\n"; fwrite($fd, $line); $xdb->Reset(); while ($tmp = $xdb->Next()) { if (strlen($tmp['value']) != 12) { continue; } $word = $tmp['key']; $data = unpack("ftf/fidf/Cflag/a3attr", $tmp['value']); if (!($data['flag'] & 0x1)) { continue;
/**
 * Build the pinyin XDB database from plain-text source lists.
 *
 * Steps, in order:
 *  1. Read the pinyin list line by line. Each record is "<key> <pinyin> [<pinyin> ...]";
 *     blank lines and lines starting with '#' are ignored.
 *     - Keys of at most 3 bytes are written immediately with their first pronunciation.
 *       When such a key lists several distinct pronunciations it is also remembered
 *       as a multi-pronunciation character.
 *     - Longer keys are collected as phrases, keeping only the text before the first
 *       space of their value.
 *  2. Write only those phrases that contain at least one multi-pronunciation character
 *     and whose byte length does not exceed self::MAX_WORD_LEN.
 *  3. If the syllable file exists, write every syllable (flag 0x1) and every proper
 *     prefix of every syllable (flag 0x2); flags are OR-ed when both apply.
 *  4. Optimize and close the database.
 *
 * @param string|null $output Target XDB file; defaults to $this->dictFile.
 * @param string|null $py     Pinyin list file; defaults to "<inputPath>/py.txt".
 * @param string|null $yj     Syllable list file; defaults to "<inputPath>/yj.txt".
 */
public function actionMake($output = null, $py = null, $yj = null)
{
    if ($output === null) {
        $output = $this->dictFile;
    }
    if ($py === null) {
        $py = $this->inputPath . '/py.txt';
    }
    if ($yj === null) {
        $yj = $this->inputPath . '/yj.txt';
    }

    if (file_exists($output) && !unlink($output)) {
        $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
    }
    if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
        $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
    }

    require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
    $xdb = new XTreeDB();
    if (!$xdb->Open($output, 'w')) {
        fclose($fd);
        $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
    }

    echo "拼音文件:" . $py . "\n";
    echo "输出文件:" . $output . "\n";
    echo "开始制作拼音库,正在加载拼音列表 ... ";

    $znum = 0;
    $mchars = $words = array();
    // Compare against false explicitly so a falsy-looking line cannot end the loop early.
    while (($line = fgets($fd, 256)) !== false) {
        $line = trim($line);
        if ($line === '' || substr($line, 0, 1) === '#') {
            continue;
        }

        // A record without a separator has no pronunciation at all: skip it rather than
        // reading a missing array element and storing an empty value.
        $parts = explode(' ', $line, 2);
        if (count($parts) < 2) {
            continue;
        }
        $key = $parts[0];
        $value = trim($parts[1]);

        if (strlen($key) > 3) {
            // Phrase: keep only the text before the first space of the value.
            $pos = strpos($value, ' ');
            if ($pos !== false) {
                $value = substr($value, 0, $pos);
            }
            $words[$key] = $value;
            continue;
        }

        // Key of at most 3 bytes: store the first pronunciation, remember the key
        // when it has more than one distinct pronunciation.
        if (strpos($value, ' ') !== false) {
            $values = array_unique(preg_split('/\\s+/', $value));
            if (count($values) > 1) {
                $mchars[$key] = implode(' ', $values);
            }
            $value = $values[0];
        }
        $xdb->Put($key, $value);
        $znum++;
    }
    fclose($fd);
    echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";

    echo "开始分析包含多音字的词组 ... ";
    $add_num = $skip_num = $max_len = 0;
    foreach ($words as $word => $value) {
        $len = strlen($word);

        // Walk the phrase in 3-byte steps looking for a multi-pronunciation character.
        $save = false;
        for ($off = 0; $off < $len; $off += 3) {
            if (isset($mchars[substr($word, $off, 3)])) {
                $save = true;
                break;
            }
        }

        if (!$save || $len > self::MAX_WORD_LEN) {
            $skip_num++;
            continue;
        }

        $add_num++;
        $xdb->Put($word, $value);
        if ($len > $max_len) {
            $max_len = $len;
        }
    }
    echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";

    if (file_exists($yj)) {
        echo "开始加载音节数据 ... \n";

        // file() returns false on failure; fall back to an empty list so the
        // foreach/count() below stay valid and the database is still finalized.
        $lines = file($yj);
        if ($lines === false) {
            $lines = array();
        }

        $yinjie = array();
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }

            // 0x1 marks a complete syllable.
            $yinjie[$line] = (isset($yinjie[$line]) ? $yinjie[$line] : 0) | 0x1;

            // 0x2 marks every proper prefix of a syllable.
            $len = strlen($line);
            for ($i = 1; $i < $len; $i++) {
                $part = substr($line, 0, $i);
                $yinjie[$part] = (isset($yinjie[$part]) ? $yinjie[$part] : 0) | 0x2;
            }
        }

        foreach ($yinjie as $key => $value) {
            $xdb->Put($key, $value);
        }
        echo "完成,共计 " . count($lines) . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
    }

    echo "正在优化整理数据库 ... ";
    $xdb->Optimize();
    $xdb->Close();
    echo "完成!\n";
}
<?php // xxx // convert dict.txt -> dict.xdb // define("_WORD_ALONE_", 0x4000000); define("_WORD_PART_", 0x8000000); define("_WORD_MAXLEN_", 12); $file = $_SERVER['argv'][1]; if (!isset($file)) { $file = "dict.xdb"; } require 'xdb.class.php'; $db = new XTreeDB(); $db->Open($file, 'w'); if (!$db) { die("fail to open dictionary file.\n"); } $word_num = 0; $record_num = 0; $skip_num = 0; // load to memory first $rec = array(); $fd = fopen("dict.txt", "r"); echo "Loading data into memory ... \n"; while ($line = fgets($fd, 256)) { $line = trim($line); if (empty($line)) { continue; } $w = preg_split("/[\\s]+/", $line);
exit(0); } $input = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdin'; if (!($fd = @fopen($input, 'r'))) { echo "ERROR: can not open the input file: {$input}\n"; exit(0); } // $output = $_SERVER['argv'][1]; if (file_exists($output)) { echo "ERROR: output xdb file exists: {$output}\n"; exit(0); } require 'xdb.class.php'; $xdb = new XTreeDB(); if (!$xdb->Open($output, 'w')) { echo "ERROR: can not open the XDB to write: {$output}\n"; exit(0); } // load data mb_internal_encoding(IS_UTF8_TXT ? 'UTF-8' : 'gbk'); $total = 0; $rec = array(); echo "INFO: Loading text file data ... "; while ($line = fgets($fd, 512)) { if (substr($line, 0, 1) == '#') { continue; } list($word, $tf, $idf, $attr) = explode("\t", $line, 4); $k = ord($word[0]) + ord($word[1]) & 0x3f; $attr = trim($attr);
echo " {$_SERVER['argv'][0]} dict.txt dict.cdb\n\n"; exit(0); } // get the parameters $input = $_SERVER['argv'][1]; $output = $_SERVER['argv'][2]; // create the dbm file if (strrchr($output, '.') == '.hdb') { require 'hdb.class.php'; $db = new HashTreeDB(0, 0x3ffd); $ok = $db->Open($output, 'w'); } if (strrchr($output, '.') == '.xdb') { require 'xdb.class.php'; $db = new XTreeDB(0, 0x3ffd); $ok = $db->Open($output, 'w'); } else { require 'dba.class.php'; $db = new DbaHandler(); $ok = $db->Open($output, 'n'); } if (!$ok) { echo "ERROR: cann't setup the database({$output}).\n"; exit(0); } // check the input file $fd = fopen($input, "r"); if (!$fd) { $db->Close(); echo "ERROR: cann't read the input file({$input}).\n"; exit(0);