$len -= 2; while ($len > 2) { $len -= 2; $word = substr($word, 0, -2); $r = $rec[$word]; if (!$r) { $record_num++; $r = _WORD_PART_; } else { if ($r & _WORD_PART_) { continue; } else { $r |= _WORD_PART_; } } $rec[$word] = $r; } if ($word_num % 10000 == 0) { echo "{$word_num} ... \n"; flush(); } } fclose($fd); echo "Loading OK! words num: {$word_num} records num: {$record_num} skip num: {$skip_num} \n"; echo "Try to insert into cdb records ... \n"; foreach ($rec as $key => $value) { $db->Put($key, $value); } $db->Optimize(); $db->Close(); echo "\n";
/**
 * Builds the pinyin dictionary file from a pinyin list and an optional syllable list.
 *
 * Steps, in order:
 *  1. Resolve default paths, delete any existing output file, open the pinyin
 *     list for reading and the output database for writing.
 *  2. Read the pinyin list. Each non-comment line is "<key> <readings>".
 *     Keys of at most 3 bytes are written straight to the database with their
 *     first reading; keys with more than one distinct reading are remembered
 *     as multi-reading characters. Longer keys are collected as words, keeping
 *     only the text before the first space of their value.
 *  3. Write only those words that contain at least one multi-reading character
 *     and are not longer than self::MAX_WORD_LEN bytes.
 *  4. If the syllable file exists, write one record per syllable and per proper
 *     prefix of a syllable, with bit flags: 0x1 = the string is a full line of
 *     the syllable file, 0x2 = the string is a proper prefix of some line.
 *  5. Optimize and close the database.
 *
 * NOTE(review): the code following each $this->usageError() call assumes that
 * call does not return (e.g. $fd would be undefined otherwise) — confirm.
 *
 * @param string|null $output Output database path; defaults to $this->dictFile.
 * @param string|null $py     Pinyin list path; defaults to $this->inputPath . '/py.txt'.
 * @param string|null $yj     Syllable list path; defaults to $this->inputPath . '/yj.txt'.
 */
public function actionMake($output = null, $py = null, $yj = null)
{
    if ($output === null) {
        $output = $this->dictFile;
    }
    if ($py === null) {
        $py = $this->inputPath . '/py.txt';
    }
    if ($yj === null) {
        $yj = $this->inputPath . '/yj.txt';
    }

    if (file_exists($output) && !unlink($output)) {
        $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
    }
    if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
        $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
    }

    require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
    $xdb = new XTreeDB();
    if (!$xdb->Open($output, 'w')) {
        fclose($fd);
        $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
    }

    echo "拼音文件:" . $py . "\n";
    echo "输出文件:" . $output . "\n";
    echo "开始制作拼音库,正在加载拼音列表 ... ";

    $znum = 0;
    $mchars = $words = array();
    // Compare against false explicitly so a line that is merely falsy (e.g. a
    // final "0" without a newline) does not end the loop, and read whole lines
    // so a long record is never split into a bogus second record.
    while (($line = fgets($fd)) !== false) {
        $line = trim($line);
        if ($line === '' || substr($line, 0, 1) === '#') {
            continue;
        }

        // A line without a space has no value part; treat it as an empty
        // value instead of reading an undefined index.
        $parts = explode(' ', $line, 2);
        $key = $parts[0];
        $value = isset($parts[1]) ? trim($parts[1]) : '';

        if (strlen($key) > 3) {
            // Word entry: keep only the text before the first space.
            if (($pos = strpos($value, ' ')) !== false) {
                $value = substr($value, 0, $pos);
            }
            $words[$key] = $value;
        } else {
            // Short key: stored immediately with its first reading.
            if (strpos($value, ' ') !== false) {
                $values = array_unique(preg_split('/\\s+/', $value));
                if (count($values) > 1) {
                    $mchars[$key] = implode(' ', $values);
                }
                // array_unique() keeps the first occurrence, so index 0 always exists.
                $value = $values[0];
            }
            $xdb->Put($key, $value);
            $znum++;
        }
    }
    fclose($fd);
    echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";

    echo "开始分析包含多音字的词组 ... ";
    $add_num = $skip_num = $max_len = 0;
    foreach ($words as $word => $value) {
        $len = strlen($word);

        // Walk the word in 3-byte steps looking for a multi-reading character.
        // NOTE(review): presumably 3-byte UTF-8 characters — confirm the input encoding.
        $save = false;
        for ($off = 0; $off < $len; $off += 3) {
            if (isset($mchars[substr($word, $off, 3)])) {
                $save = true;
                break;
            }
        }

        if (!$save || $len > self::MAX_WORD_LEN) {
            $skip_num++;
            continue;
        }

        $add_num++;
        $xdb->Put($word, $value);
        if ($len > $max_len) {
            $max_len = $len;
        }
    }
    echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";

    if (file_exists($yj)) {
        // The literal below intentionally keeps the raw line break found in the original message.
        echo "开始加载音节数据 ... 
";
        $lines = file($yj);
        if ($lines === false) {
            // file() failed: skip the syllable data but still finalize the database below.
            echo "失败,无法读取音节文件,已跳过。\n";
        } else {
            $yinjie = array();
            $total = 0; // non-blank lines only, so the summary matches what was processed
            foreach ($lines as $line) {
                $line = trim($line);
                if ($line === '') {
                    continue;
                }
                $total++;

                // 0x1: this exact string is a line of the syllable file.
                $yinjie[$line] = (isset($yinjie[$line]) ? $yinjie[$line] : 0) | 0x1;

                // 0x2: this string is a proper prefix of a line of the syllable file.
                $len = strlen($line);
                for ($i = 1; $i < $len; $i++) {
                    $part = substr($line, 0, $i);
                    $yinjie[$part] = (isset($yinjie[$part]) ? $yinjie[$part] : 0) | 0x2;
                }
            }
            foreach ($yinjie as $key => $value) {
                $xdb->Put($key, $value);
            }
            echo "完成,共计 " . $total . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
        }
    }

    echo "正在优化整理数据库 ... ";
    $xdb->Optimize();
    $xdb->Close();
    echo "完成!\n";
}
$rec[$k][$temp] = array(); } $rec[$k][$temp]['part'] = 1; } } fclose($fd); // load ok & try to save it to DBM echo "OK, Total words={$total}\n"; for ($k = 0; $k < 0x40; $k++) { if (!isset($rec[$k])) { continue; } $cnt = 0; printf("Inserting [%02d/64] ... ", $k); foreach ($rec[$k] as $w => $v) { $flag = isset($v['tf']) ? 0x1 : 0; if ($v['part']) { $flag |= 0x2; } $data = pack('ffCa3', $v['tf'], $v['idf'], $flag, $v['attr']); $xdb->Put($w, $data); $cnt++; } printf("%d Records saved.\n", $cnt); } // save echo "INFO: optimizing ... "; flush(); $xdb->Optimize(); $xdb->Close(); echo "DONE!\n";