Esempio n. 1
0
    /**
     * Run the coarse segmentation pass over self::$sourceString and fill
     * self::$simpleResult, then hand over to self::_sort_finally_result().
     *
     * The source string is consumed two bytes at a time; each pair is read as
     * one big-endian 16-bit code unit (presumably UCS-2BE, matching the UCS2
     * constant passed to iconv() below - confirm against its definition).
     * Consecutive units of the same class are accumulated into a "run" and
     * flushed into self::$simpleResult as array('w' => bytes, 't' => type)
     * whenever the class changes.
     *
     * Type codes written to 't' by this method:
     *   1  Chinese / Korean / Japanese text
     *   2  ASCII letters, digits and the symbols matched by $ansiWordMatch
     *   3  other ANSI symbol
     *   4  run of type 2 that contains no letter or '@', '#', '%', '+'
     *   5  non-ANSI symbol or unsupported character
     *   13 book title found between U+300A and U+300B
     *   21 the same book title, re-submitted to _deep_analysis() when
     *      self::$differMax is set
     *
     * Side effects: may load the dictionary, resets self::$simpleResult and
     * self::$finallyResult, appends two bytes to self::$sourceString, and may
     * update self::$newWords / self::$foundWordStr.
     *
     * @param bool $optimize Forwarded unchanged to self::_deep_analysis().
     * @return void
     */
    private static function StartAnalysis($optimize = TRUE) {
        if (!self::$isLoadDic) {
            self::LoadDict();
        }
        self::$simpleResult = self::$finallyResult = array();

        // Append a two-byte space as a sentinel: it lands in the ANSI-symbol
        // branch below, which flushes whatever run is still pending.
        self::$sourceString .= chr(0) . chr(32);
        $slen = strlen(self::$sourceString);

        // Full-width -> half-width table: 0xFF00..0xFF5E map onto 0x20..0x7E.
        $sbcArr = array();
        $j = 0;
        for ($i = 0xFF00; $i < 0xFF5F; $i++) {
            $sbcArr[$i] = 0x20 + $j;
            $j++;
        }

        // Coarse split of the string into runs.
        $onstr = '';   // bytes of the run currently being accumulated
        $lastc = 1;    // type code of the current run (see docblock)
        $s = 0;        // next free index in self::$simpleResult
        $ansiWordMatch = "[0-9a-z@#%\+\.-]";
        $notNumberMatch = "[a-z@#%\+]";
        // The patterns are loop-invariant, so build them once.
        $ansiWordPattern = '/' . $ansiWordMatch . '/i';
        $notNumberPattern = '/' . $notNumberMatch . '/i';

        for ($i = 0; $i < $slen; $i++) {
            // Read one two-byte unit; $i is left on its second byte so the
            // loop's own $i++ moves to the next unit.
            $c = self::$sourceString[$i] . self::$sourceString[++$i];
            $cn = hexdec(bin2hex($c));
            $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;

            if ($cn < 0x80) {
                // ANSI character.
                if (preg_match($ansiWordPattern, chr($cn))) {
                    // Word-like ANSI character: close any run of another type first.
                    if ($lastc != 2 && $onstr != '') {
                        self::$simpleResult[$s]['w'] = $onstr;
                        self::$simpleResult[$s]['t'] = $lastc;
                        self::_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 2;
                    $onstr .= chr(0) . chr($cn);
                } else {
                    // ANSI symbol: flush the pending run, then emit the symbol itself.
                    if ($onstr != '') {
                        self::$simpleResult[$s]['w'] = $onstr;
                        if ($lastc == 2) {
                            // No letter and none of @ # % + in the run => pure number.
                            if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                                $lastc = 4;
                            }
                        }
                        self::$simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4) {
                            self::_deep_analysis($onstr, $lastc, $s, $optimize);
                        }
                        $s++;
                    }
                    $onstr = '';
                    $lastc = 3;
                    if ($cn < 31) {
                        // Low control characters are dropped from the result.
                        continue;
                    } else {
                        self::$simpleResult[$s]['w'] = chr(0) . chr($cn);
                        self::$simpleResult[$s]['t'] = 3;
                        $s++;
                    }
                }
            } else {
                // Non-ANSI (two-byte) character.
                if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D)
                        || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) {
                    // Regular text: close any run of another type, then extend the text run.
                    if ($lastc != 1 && $onstr != '') {
                        self::$simpleResult[$s]['w'] = $onstr;
                        if ($lastc == 2) {
                            if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                                $lastc = 4;
                            }
                        }
                        self::$simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4) {
                            self::_deep_analysis($onstr, $lastc, $s, $optimize);
                        }
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 1;
                    $onstr .= $c;
                } else {
                    // Special (non-ANSI) symbol: flush the pending run first.
                    if ($onstr != '') {
                        self::$simpleResult[$s]['w'] = $onstr;
                        if ($lastc == 2) {
                            if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                                $lastc = 4;
                            }
                        }
                        self::$simpleResult[$s]['t'] = $lastc;
                        if ($lastc != 4) {
                            self::_deep_analysis($onstr, $lastc, $s, $optimize);
                        }
                        $s++;
                    }

                    // Book-title detection: U+300A opens, U+300B closes.
                    if ($cn == 0x300A) {
                        $tmpw = '';
                        $n = 1;
                        $isok = false;
                        $ew = chr(0x30) . chr(0x0B);
                        while (true) {
                            // Stop unless BOTH bytes of the next unit exist; the
                            // previous "&&" only stopped when the first byte was
                            // missing and could read past the end of the string.
                            if (!isset(self::$sourceString[$i + $n]) || !isset(self::$sourceString[$i + $n + 1])) {
                                break;
                            }
                            $w = self::$sourceString[$i + $n] . self::$sourceString[$i + $n + 1];
                            if ($w == $ew) {
                                // Opening mark.
                                self::$simpleResult[$s]['w'] = $c;
                                self::$simpleResult[$s]['t'] = 5;
                                $s++;

                                // The title itself.
                                self::$simpleResult[$s]['w'] = $tmpw;
                                // Register the title as a new word the first time it is
                                // seen. The membership test must run BEFORE the entry is
                                // written, otherwise this branch can never execute.
                                if ($tmpw !== '' && !isset(self::$newWords[$tmpw])) {
                                    self::$foundWordStr .= self::_out_string_encoding($tmpw) . '/nb, ';
                                    self::SetWordInfos($tmpw, array('c' => 1, 'm' => 'nb'));
                                }
                                self::$newWords[$tmpw] = 1;
                                self::$simpleResult[$s]['t'] = 13;

                                $s++;

                                // In max-split mode the title is segmented further.
                                if (self::$differMax) {
                                    self::$simpleResult[$s]['w'] = $tmpw;
                                    self::$simpleResult[$s]['t'] = 21;
                                    self::_deep_analysis($tmpw, $lastc, $s, $optimize);
                                    $s++;
                                }

                                // Closing mark.
                                self::$simpleResult[$s]['w'] = $ew;
                                self::$simpleResult[$s]['t'] = 5;
                                $s++;

                                // Resume scanning after the closing mark.
                                $i = $i + $n + 1;
                                $isok = true;
                                $onstr = '';
                                $lastc = 5;
                                break;
                            } else {
                                $n = $n + 2;
                                $tmpw .= $w;
                                // Give up once the candidate title exceeds 60 bytes.
                                if (strlen($tmpw) > 60) {
                                    break;
                                }
                            }
                        }
                        if (!$isok) {
                            // No closing mark found: emit the opening mark as a plain
                            // symbol; $i is unchanged, so the text after it is scanned
                            // normally.
                            self::$simpleResult[$s]['w'] = $c;
                            self::$simpleResult[$s]['t'] = 5;
                            $s++;
                            $onstr = '';
                            $lastc = 5;
                        }
                        continue;
                    }

                    $onstr = '';
                    $lastc = 5;
                    if ($cn == 0x3000) {
                        // U+3000 (ideographic space) is dropped from the result.
                        continue;
                    } else {
                        self::$simpleResult[$s]['w'] = $c;
                        self::$simpleResult[$s]['t'] = 5;
                        $s++;
                    }
                }
            }
        }

        // Post-process the segmented result.
        self::_sort_finally_result();
    }