/**
 * Builds the pinyin XDB dictionary.
 *
 * The work is done in four phases:
 *  1. Read the pinyin list ($py). Lines that are empty or start with '#' are
 *     ignored. Every other line is "<key> <reading> [<reading> ...]". Keys of
 *     at most 3 bytes are stored immediately as single characters (first
 *     reading only); a character with more than one distinct reading is also
 *     remembered as polyphonic. Longer keys are collected as phrases, keeping
 *     only their first reading.
 *  2. Store only those phrases that contain at least one polyphonic character
 *     and are not longer than self::MAX_WORD_LEN bytes.
 *  3. If the syllable list ($yj) exists, store every syllable with flag 0x1
 *     and every proper prefix of a syllable with flag 0x2 (flags are OR-ed
 *     when a string is both).
 *  4. Optimize and close the database.
 *
 * NOTE(review): the code after each usageError() call assumes that
 * usageError() does not return (Yii console convention) — confirm.
 *
 * @param string|null $output path of the XDB file to create; defaults to $this->dictFile
 * @param string|null $py     path of the pinyin list; defaults to "<inputPath>/py.txt"
 * @param string|null $yj     path of the syllable list; defaults to "<inputPath>/yj.txt"
 */
public function actionMake($output = null, $py = null, $yj = null)
{
    if ($output === null) {
        $output = $this->dictFile;
    }
    if ($py === null) {
        $py = $this->inputPath . '/py.txt';
    }
    if ($yj === null) {
        $yj = $this->inputPath . '/yj.txt';
    }

    // A stale output file must be removable, otherwise the build cannot start.
    if (file_exists($output) && !unlink($output)) {
        $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
    }
    if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
        $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
    }

    require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
    $xdb = new XTreeDB();
    if (!$xdb->Open($output, 'w')) {
        // Do not leak the already opened pinyin file handle.
        fclose($fd);
        $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
    }

    echo "拼音文件:" . $py . "\n";
    echo "输出文件:" . $output . "\n";

    // ---- Phase 1: load the pinyin list ------------------------------------
    echo "开始制作拼音库,正在加载拼音列表 ... ";
    $znum = 0;
    $mchars = $words = array();
    // fgets() is called without a length cap so that a long line is never
    // split into several bogus records, and the result is compared against
    // false so that a falsy last line (e.g. "0") does not end the loop early.
    while (($line = fgets($fd)) !== false) {
        $line = trim($line);
        if ($line === '' || substr($line, 0, 1) === '#') {
            continue;
        }

        $fields = explode(' ', $line, 2);
        if (count($fields) < 2) {
            // A key without any reading is malformed; storing it would
            // create an entry with an empty pronunciation.
            continue;
        }
        list($key, $value) = $fields;
        $value = trim($value);

        if (strlen($key) > 3) {
            // Phrase: keep only the first reading.
            $pos = strpos($value, ' ');
            if ($pos !== false) {
                $value = substr($value, 0, $pos);
            }
            $words[$key] = $value;
            continue;
        }

        // Single character (key of at most 3 bytes).
        if (strpos($value, ' ') !== false) {
            // array_unique() keeps the first occurrence, so index 0 always exists.
            $values = array_unique(preg_split('/\\s+/', $value));
            if (count($values) > 1) {
                $mchars[$key] = implode(' ', $values);
            }
            $value = $values[0];
        }
        $xdb->Put($key, $value);
        $znum++;
    }
    fclose($fd);
    echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";

    // ---- Phase 2: keep phrases that contain a polyphonic character --------
    echo "开始分析包含多音字的词组 ... ";
    $add_num = $skip_num = $max_len = 0;
    foreach ($words as $word => $value) {
        $wordLen = strlen($word);

        // The phrase is walked in fixed 3-byte steps, i.e. it is assumed to
        // consist of 3-byte characters only.
        $save = false;
        for ($off = 0; $off < $wordLen; $off += 3) {
            if (isset($mchars[substr($word, $off, 3)])) {
                $save = true;
                break;
            }
        }

        if (!$save || $wordLen > self::MAX_WORD_LEN) {
            $skip_num++;
            continue;
        }

        $add_num++;
        $xdb->Put($word, $value);
        if ($wordLen > $max_len) {
            $max_len = $wordLen;
        }
    }
    echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";

    // ---- Phase 3: optional syllable table ----------------------------------
    if (file_exists($yj)) {
        echo "开始加载音节数据 ... \n";
        $lines = file($yj);
        if ($lines === false) {
            // Unreadable file: behave as if it were empty instead of feeding
            // false to foreach()/count().
            $lines = array();
        }

        $yinjie = array();
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }

            // 0x1: the string is a complete syllable.
            $yinjie[$line] = (isset($yinjie[$line]) ? $yinjie[$line] : 0) | 0x1;

            // 0x2: the string is a proper prefix of some syllable.
            $lineLen = strlen($line);
            for ($i = 1; $i < $lineLen; $i++) {
                $part = substr($line, 0, $i);
                $yinjie[$part] = (isset($yinjie[$part]) ? $yinjie[$part] : 0) | 0x2;
            }
        }

        foreach ($yinjie as $key => $value) {
            $xdb->Put($key, $value);
        }
        echo "完成,共计 " . count($lines) . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
    }

    // ---- Phase 4: finalize -------------------------------------------------
    echo "正在优化整理数据库 ... ";
    $xdb->Optimize();
    $xdb->Close();
    echo "完成!\n";
}
} // closes the scope that encloses the preceding method (its header is outside this excerpt)

// ---------------------------------------------------------------------------
// XDB export script.
//
// Reads every record of an XDB dictionary and writes the valid ones as
// tab-separated text: WORD, TF, IDF, ATTR.
//
//   argv[1]  input XDB file (required)
//   argv[2]  output path (optional, defaults to php://stdout)
//
// A record is exported only when its value is exactly 12 bytes long
// (float tf, float idf, 1 flag byte, 3 attribute bytes) and bit 0x1 of the
// flag byte is set.
//
// NOTE(review): error paths terminate with exit status 0; this is kept
// unchanged because callers may depend on it.
// ---------------------------------------------------------------------------

// Read the input path defensively so that a missing argument does not raise
// an undefined-index notice or hand null to XTreeDB::Open().
$input = isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : '';
$output = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdout';

// The warning from fopen() is suppressed because the failure is reported
// explicitly right below.
if (!($fd = @fopen($output, 'w'))) {
    echo "ERROR: can not open the output file: {$output}\n";
    exit(0);
}

require 'xdb.class.php';
$xdb = new XTreeDB();
if ($input === '' || !$xdb->Open($input)) {
    fclose($fd);
    echo "ERROR: input file {$input} maybe not a valid XDB file.\n";
    exit(0);
}

// Header line of the dump.
$line = "# WORD\tTF\tIDF\tATTR\n";
fwrite($fd, $line);

$xdb->Reset();
while ($tmp = $xdb->Next()) {
    // Only 12-byte values carry the tf/idf/flag/attr layout decoded below.
    if (strlen($tmp['value']) != 12) {
        continue;
    }

    $word = $tmp['key'];
    $data = unpack("ftf/fidf/Cflag/a3attr", $tmp['value']);
    if (!($data['flag'] & 0x1)) {
        continue;
    }

    // unpack()'s "a" code keeps the NUL padding on PHP >= 5.5; strip it so
    // that no raw NUL bytes end up in the text dump.
    $attr = rtrim($data['attr'], "\0");

    $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $word, $data['tf'], $data['idf'], $attr);
    fwrite($fd, $line);
}

fclose($fd);
$xdb->Close();
// Tail of a dictionary-loading loop; the loop header and the code that sets
// up $word, $len, $rec, $fd and $db lie outside this excerpt.
//
// Per word: $word is shortened by two bytes at a time and every resulting
// prefix is flagged in $rec with _WORD_PART_:
//   - prefix not yet in $rec (or its value is falsy): $record_num is
//     incremented and the entry is created with the value _WORD_PART_;
//   - prefix already carries _WORD_PART_: `continue` skips the write-back and
//     moves on to the next shorter prefix;
//   - otherwise the flag is OR-ed into the existing value.
// Presumably two bytes correspond to one double-byte character — TODO confirm
// against the part of the script that is not visible here.
// NOTE(review): $rec[$word] is read without an isset() check, so an absent
// prefix is read as an undefined index before it is created.
//
// Progress is printed each time $word_num is a multiple of 10000. The lone
// `}` after that closes the enclosing loop. Afterwards the input handle is
// closed, the totals are printed, every $rec entry is written with
// $db->Put(), and the database is optimized and closed.
$len -= 2; while ($len > 2) { $len -= 2; $word = substr($word, 0, -2); $r = $rec[$word]; if (!$r) { $record_num++; $r = _WORD_PART_; } else { if ($r & _WORD_PART_) { continue; } else { $r |= _WORD_PART_; } } $rec[$word] = $r; } if ($word_num % 10000 == 0) { echo "{$word_num} ... \n"; flush(); } } fclose($fd); echo "Loading OK! words num: {$word_num} records num: {$record_num} skip num: {$skip_num} \n"; echo "Try to insert into cdb records ... \n"; foreach ($rec as $key => $value) { $db->Put($key, $value); } $db->Optimize(); $db->Close(); echo "\n";