Exemple #1
0
    $len -= 2;
    while ($len > 2) {
        $len -= 2;
        $word = substr($word, 0, -2);
        $r = $rec[$word];
        if (!$r) {
            $record_num++;
            $r = _WORD_PART_;
        } else {
            if ($r & _WORD_PART_) {
                continue;
            } else {
                $r |= _WORD_PART_;
            }
        }
        $rec[$word] = $r;
    }
    if ($word_num % 10000 == 0) {
        echo "{$word_num} ... \n";
        flush();
    }
}
// The input has been read completely; release its file handle.
fclose($fd);

// Report the loading statistics and announce the write phase.
echo "Loading OK! words num: {$word_num}  records num: {$record_num} skip num: {$skip_num} \n",
    "Try to insert into cdb records ... \n";

// Store every collected key/value pair in the database.
foreach ($rec as $key => $value) {
    $db->Put($key, $value);
}

// Finalize the database, then terminate the progress output with a newline.
$db->Optimize();
$db->Close();
echo "\n";
Exemple #2
0
 /**
  * Builds the pinyin dictionary database from plain-text source files.
  *
  * Steps, in order:
  *  1. Read the pinyin list. Keys of at most 3 bytes are stored immediately
  *     as single characters; longer keys are collected as phrases.
  *  2. Store only the phrases that contain at least one polyphonic character
  *     and are not longer than self::MAX_WORD_LEN bytes.
  *  3. If the syllable file exists, store every syllable (flag 0x1) and every
  *     proper prefix of a syllable (flag 0x2).
  *  4. Optimize and close the database.
  *
  * Progress and statistics are echoed to standard output.
  *
  * @param string|null $output database file to create; defaults to $this->dictFile
  * @param string|null $py     pinyin list file; defaults to "<inputPath>/py.txt"
  * @param string|null $yj     syllable list file; defaults to "<inputPath>/yj.txt"
  */
 public function actionMake($output = null, $py = null, $yj = null)
 {
     if ($output === null) {
         $output = $this->dictFile;
     }
     if ($py === null) {
         $py = $this->inputPath . '/py.txt';
     }
     if ($yj === null) {
         $yj = $this->inputPath . '/yj.txt';
     }

     // NOTE(review): the code after each usageError() call relies on it not
     // returning (presumably it terminates the command) - confirm.
     if (file_exists($output) && !unlink($output)) {
         $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
     }
     if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
         $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
     }
     require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
     $xdb = new XTreeDB();
     if (!$xdb->Open($output, 'w')) {
         // Do not leak the already opened pinyin file.
         fclose($fd);
         $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
     }
     echo "拼音文件:" . $py . "\n";
     echo "输出文件:" . $output . "\n";
     echo "开始制作拼音库,正在加载拼音列表 ... ";

     // $znum   - number of single characters stored
     // $mchars - characters that have more than one distinct reading
     // $words  - phrases (keys longer than 3 bytes) with their first reading
     $znum = 0;
     $mchars = $words = array();

     // Compare against false explicitly: a final line holding just "0" is a
     // falsy string and would otherwise end the loop before end of file.
     while (($line = fgets($fd, 256)) !== false) {
         $line = trim($line);
         // Skip blank lines and '#' comment lines.
         if ($line === '' || substr($line, 0, 1) === '#') {
             continue;
         }
         // Pad the result so a line without a separator yields an empty
         // value instead of an undefined-offset notice.
         list($key, $value) = array_pad(explode(' ', $line, 2), 2, '');
         $value = trim($value);
         if (strlen($key) > 3) {
             // Phrase: keep only the first reading.
             // NOTE(review): the 3-byte limit presumably corresponds to one
             // UTF-8 encoded CJK character - confirm.
             if (($pos = strpos($value, ' ')) !== false) {
                 $value = substr($value, 0, $pos);
             }
             $words[$key] = $value;
         } else {
             // Single character: remember all distinct readings when there
             // is more than one, but store only the first in the database.
             if (strpos($value, ' ') !== false) {
                 $values = array_unique(preg_split('/\\s+/', $value));
                 if (count($values) > 1) {
                     $mchars[$key] = implode(' ', $values);
                 }
                 // array_unique() keeps the first occurrence, so index 0 exists.
                 $value = $values[0];
             }
             $xdb->Put($key, $value);
             $znum++;
         }
     }
     fclose($fd);
     echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";

     echo "开始分析包含多音字的词组 ... ";
     $add_num = $skip_num = $max_len = 0;
     foreach ($words as $word => $value) {
         // PHP converts numeric-string array keys to integers; restore the
         // string before handing it to byte-oriented string functions.
         $word = (string) $word;
         $word_len = strlen($word);

         // Walk the phrase in 3-byte steps looking for a polyphonic character.
         $save = false;
         for ($off = 0; $off < $word_len; $off += 3) {
             if (isset($mchars[substr($word, $off, 3)])) {
                 $save = true;
                 break;
             }
         }

         if (!$save || $word_len > self::MAX_WORD_LEN) {
             $skip_num++;
         } else {
             $add_num++;
             $xdb->Put($word, $value);
             if ($word_len > $max_len) {
                 $max_len = $word_len;
             }
         }
     }
     echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";

     if (file_exists($yj)) {
         echo "开始加载音节数据 ... ";
         $lines = file($yj);
         if ($lines === false) {
             // The file exists but could not be read: report it and carry on
             // with the rest of the build instead of iterating over false.
             echo "失败,无法读取音节文件。\n";
         } else {
             // Maps each syllable or syllable prefix to a bit mask:
             // 0x1 = complete syllable, 0x2 = proper prefix of a syllable.
             $yinjie = array();
             foreach ($lines as $line) {
                 $line = trim($line);
                 if ($line === '') {
                     continue;
                 }
                 $yinjie[$line] = (isset($yinjie[$line]) ? $yinjie[$line] : 0) | 0x1;
                 $line_len = strlen($line);
                 for ($i = 1; $i < $line_len; $i++) {
                     $part = substr($line, 0, $i);
                     $yinjie[$part] = (isset($yinjie[$part]) ? $yinjie[$part] : 0) | 0x2;
                 }
             }
             foreach ($yinjie as $key => $value) {
                 $xdb->Put($key, $value);
             }
             echo "完成,共计 " . count($lines) . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
         }
     }

     echo "正在优化整理数据库 ... ";
     $xdb->Optimize();
     $xdb->Close();
     echo "完成!\n";
 }
Exemple #3
0
            $rec[$k][$temp] = array();
        }
        $rec[$k][$temp]['part'] = 1;
    }
}
fclose($fd);

// Loading is finished; write the collected records to the database.
echo "OK, Total words={$total}\n";
for ($k = 0; $k < 0x40; $k++) {
    // $rec is split into up to 64 buckets; skip the ones never filled.
    if (!isset($rec[$k])) {
        continue;
    }
    $cnt = 0;
    printf("Inserting [%02d/64] ... ", $k);
    foreach ($rec[$k] as $w => $v) {
        // An entry may lack 'tf', 'idf', 'attr' or 'part' (for instance one
        // that holds only 'part'). Read each field with an explicit default
        // so no undefined-index notice is raised; the defaults are the values
        // the implicit null conversion produced, so the packed bytes are
        // unchanged.
        $has_tf = isset($v['tf']);
        $tf = $has_tf ? $v['tf'] : 0.0;
        $idf = isset($v['idf']) ? $v['idf'] : 0.0;
        $attr = isset($v['attr']) ? $v['attr'] : '';

        // Flag bits: 0x1 = entry has a 'tf' value, 0x2 = entry is marked 'part'.
        $flag = $has_tf ? 0x1 : 0;
        if (!empty($v['part'])) {
            $flag |= 0x2;
        }

        // Record layout: float tf, float idf, unsigned char flag,
        // 3-byte NUL-padded attr.
        $data = pack('ffCa3', $tf, $idf, $flag, $attr);
        $xdb->Put($w, $data);
        $cnt++;
    }
    printf("%d Records saved.\n", $cnt);
}

// Finalize the database file.
echo "INFO: optimizing ... ";
flush();
$xdb->Optimize();
$xdb->Close();
echo "DONE!\n";