Exemple #1
0
 /**
  * Builds the pinyin dictionary (an XTreeDB file) from plain-text source lists.
  *
  * The build runs in three passes:
  *  1. Read the pinyin list: short keys (at most 3 bytes) are stored directly
  *     with their first reading; longer keys are collected as phrases.
  *  2. Store only those phrases that contain at least one multi-reading
  *     character and are not longer than self::MAX_WORD_LEN bytes.
  *  3. If the syllable file exists, store every syllable (flag 0x1) and every
  *     proper prefix of a syllable (flag 0x2).
  *
  * Progress messages are echoed as the passes run. Failures to prepare the
  * input/output files are reported through $this->usageError(), which is
  * presumably terminal (the code after it relies on the opened handles) —
  * TODO confirm against the command base class.
  *
  * @param string|null $output path of the dictionary to create; defaults to $this->dictFile
  * @param string|null $py     path of the pinyin list; defaults to "<inputPath>/py.txt"
  * @param string|null $yj     path of the syllable list; defaults to "<inputPath>/yj.txt"
  */
 public function actionMake($output = null, $py = null, $yj = null)
 {
     if ($output === null) {
         $output = $this->dictFile;
     }
     if ($py === null) {
         $py = $this->inputPath . '/py.txt';
     }
     if ($yj === null) {
         $yj = $this->inputPath . '/yj.txt';
     }
     // Start from a clean output file; a stale file that cannot be removed is fatal.
     if (file_exists($output) && !unlink($output)) {
         $this->usageError('输出文件已存在并且不可删除,请用 --output=... 指定路径!');
     }
     if (!file_exists($py) || !($fd = fopen($py, 'r'))) {
         $this->usageError('拼音文件不存在或打开失败,请用 --py=... 来指定!');
     }
     require_once Yii::getPathOfAlias('application.vendors.xdb') . '.class.php';
     $xdb = new XTreeDB();
     if (!$xdb->Open($output, 'w')) {
         fclose($fd);
         $this->usageError('无法以写入方式打开输出文件,请使用 --output=... 指定路径!');
     }
     echo "拼音文件:" . $py . "\n";
     echo "输出文件:" . $output . "\n";
     echo "开始制作拼音库,正在加载拼音列表 ... ";
     $znum = 0;
     $mchars = $words = array();
     // Compare against false explicitly: a bare truthiness test would end the
     // loop early on a falsy line such as a final "0" without a newline.
     while (($line = fgets($fd, 256)) !== false) {
         $line = trim($line);
         if ($line === '' || $line[0] === '#') {
             continue;
         }
         $parts = explode(' ', $line, 2);
         if (count($parts) < 2) {
             // A key without any reading is malformed; skip it instead of
             // storing an empty value and counting it as a character.
             continue;
         }
         $key = $parts[0];
         $value = trim($parts[1]);
         if (strlen($key) > 3) {
             // Phrase: keep only the first reading listed.
             if (($pos = strpos($value, ' ')) !== false) {
                 $value = substr($value, 0, $pos);
             }
             $words[$key] = $value;
         } else {
             if (strpos($value, ' ') !== false) {
                 $values = array_unique(preg_split('/\\s+/', $value));
                 if (count($values) > 1) {
                     // Remember characters that have several distinct readings.
                     $mchars[$key] = implode(' ', $values);
                 }
                 // The first reading is the one stored for the character.
                 $value = $values[0];
             }
             $xdb->Put($key, $value);
             $znum++;
         }
     }
     fclose($fd);
     echo "完成,共 " . $znum . " 个字," . count($mchars) . " 个多音字," . count($words) . " 个词组。\n";
     echo "开始分析包含多音字的词组 ... ";
     $add_num = $skip_num = $max_len = 0;
     foreach ($words as $word => $value) {
         $wordLen = strlen($word);
         $save = false;
         // Walk the phrase in 3-byte steps — assumes every character occupies
         // exactly 3 bytes (e.g. UTF-8 encoded CJK); TODO confirm input encoding.
         for ($off = 0; $off < $wordLen; $off += 3) {
             if (isset($mchars[substr($word, $off, 3)])) {
                 $save = true;
                 break;
             }
         }
         if (!$save || $wordLen > self::MAX_WORD_LEN) {
             $skip_num++;
         } else {
             $add_num++;
             $xdb->Put($word, $value);
             if ($wordLen > $max_len) {
                 $max_len = $wordLen;
             }
         }
     }
     echo "完成,共添加 {$add_num} 个,跳过 {$skip_num} 个,最大长词为 {$max_len} 字节。\n";
     if (file_exists($yj)) {
         echo "开始加载音节数据 ... ";
         $lines = file($yj);
         if ($lines === false) {
             // Unreadable syllable file: treat it as empty so the database is
             // still optimized and closed below instead of aborting mid-build.
             $lines = array();
         }
         $yinjie = array();
         foreach ($lines as $line) {
             $line = trim($line);
             if ($line === '') {
                 continue;
             }
             // Bit 0x1: the key is a complete syllable.
             $yinjie[$line] = isset($yinjie[$line]) ? ($yinjie[$line] | 0x1) : 0x1;
             // Bit 0x2: the key is a proper prefix of some syllable.
             $lineLen = strlen($line);
             for ($i = 1; $i < $lineLen; $i++) {
                 $part = substr($line, 0, $i);
                 $yinjie[$part] = isset($yinjie[$part]) ? ($yinjie[$part] | 0x2) : 0x2;
             }
         }
         foreach ($yinjie as $key => $value) {
             $xdb->Put($key, $value);
         }
         echo "完成,共计 " . count($lines) . " 个拼音,合计 " . count($yinjie) . " 条记录。\n";
     }
     echo "正在优化整理数据库 ... ";
     $xdb->Optimize();
     $xdb->Close();
     echo "完成!\n";
 }
Exemple #2
0
}
// Dump the word records of an XDB dictionary as tab-separated text.
// argv[1] is the input XDB file; argv[2] is the optional output path
// (defaults to standard output).
// NOTE(review): argv[1] is read without a check here — presumably it is
// validated by the code preceding this section; confirm.
$inputFile = $_SERVER['argv'][1];
$output = isset($_SERVER['argv'][2]) ? $_SERVER['argv'][2] : 'php://stdout';

// Validate the input before opening the output: opening with 'w' truncates,
// so a bad input path must not be able to destroy an existing output file.
require 'xdb.class.php';
$xdb = new XTreeDB();
if (!$xdb->Open($inputFile)) {
    echo "ERROR: input file {$inputFile} maybe not a valid XDB file.\n";
    // Non-zero status so calling shells/scripts can detect the failure.
    exit(1);
}
if (!($fd = @fopen($output, 'w'))) {
    $xdb->Close();
    echo "ERROR: can not open the output file: {$output}\n";
    exit(1);
}

$line = "# WORD\tTF\tIDF\tATTR\n";
fwrite($fd, $line);
$xdb->Reset();
while ($tmp = $xdb->Next()) {
    // Only 12-byte values are decoded; anything else is skipped.
    if (strlen($tmp['value']) != 12) {
        continue;
    }
    $word = $tmp['key'];
    // Layout: two floats (tf, idf), one unsigned byte (flag), 3-byte string (attr).
    $data = unpack("ftf/fidf/Cflag/a3attr", $tmp['value']);
    // Records without bit 0x1 in the flag are not exported — presumably that
    // bit marks a complete word; confirm against the dictionary builder.
    if (!($data['flag'] & 0x1)) {
        continue;
    }
    $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $word, $data['tf'], $data['idf'], $data['attr']);
    fwrite($fd, $line);
}
fclose($fd);
$xdb->Close();
Exemple #3
0
    $len -= 2;
    while ($len > 2) {
        $len -= 2;
        $word = substr($word, 0, -2);
        $r = $rec[$word];
        if (!$r) {
            $record_num++;
            $r = _WORD_PART_;
        } else {
            if ($r & _WORD_PART_) {
                continue;
            } else {
                $r |= _WORD_PART_;
            }
        }
        $rec[$word] = $r;
    }
    if ($word_num % 10000 == 0) {
        echo "{$word_num} ... \n";
        flush();
    }
}
// The input has been fully read by the loop above; release its handle.
fclose($fd);
// Report the counters accumulated while loading.
echo "Loading OK! words num: {$word_num}  records num: {$record_num} skip num: {$skip_num} \n";
echo "Try to insert into cdb records ... \n";
// Write every collected key/flag pair from $rec into the database.
foreach ($rec as $key => $value) {
    $db->Put($key, $value);
}
// Optimize the database after all inserts, then close it.
$db->Optimize();
$db->Close();
echo "\n";