/**
 * Run the coarse segmentation pass over self::$sourceString.
 *
 * The source string is read two bytes at a time and each pair is treated as
 * one big-endian 16-bit code unit. Consecutive units of the same category are
 * accumulated into a run; every finished run (and every stand-alone symbol)
 * is appended to self::$simpleResult as array('w' => bytes, 't' => type).
 * Most runs are additionally handed to self::_deep_analysis(). When the scan
 * is complete, self::_sort_finally_result() is called.
 *
 * Type codes written by this method:
 *   1  Chinese / Korean / Japanese text
 *   2  English letters / digits / the symbols matched by the ANSI word class
 *   3  ANSI symbol
 *   4  pure number (a type-2 run containing none of [a-z@#%+])
 *   5  non-ANSI symbol or unsupported character
 *   13 book title found between U+300A and U+300B
 *   21 book title queued for further segmentation (only when self::$differMax)
 *
 * @param bool $optimize Passed through unchanged to self::_deep_analysis().
 * @return void Nothing is returned; results are stored in static properties.
 */
static private function StartAnalysis($optimize = TRUE)
{
    if (!self::$isLoadDic) {
        self::LoadDict();
    }
    self::$simpleResult = self::$finallyResult = array();

    // Append a trailing space unit so the last pending run is flushed by the loop.
    self::$sourceString .= chr(0) . chr(32);
    $slen = strlen(self::$sourceString);

    // Fullwidth-to-halfwidth map: 0xFF00..0xFF5E => 0x20..0x7E.
    $sbcArr = array();
    $j = 0;
    for ($i = 0xFF00; $i < 0xFF5F; $i++) {
        $sbcArr[$i] = 0x20 + $j;
        $j++;
    }

    // Coarse split of the string.
    $onstr = '';   // bytes of the run currently being accumulated
    $lastc = 1;    // type code of the current run (see the list in the docblock)
    $s = 0;        // next free index in self::$simpleResult

    // Both patterns are loop-invariant, so they are assembled once.
    $ansiWordMatch = "[0-9a-z@#%\+\.-]";
    $notNumberMatch = "[a-z@#%\+]";
    $ansiWordPattern = '/' . $ansiWordMatch . '/i';
    $notNumberPattern = '/' . $notNumberMatch . '/i';

    for ($i = 0; $i < $slen; $i++) {
        // Read one two-byte unit; $i is left pointing at its second byte.
        $c = self::$sourceString[$i] . self::$sourceString[++$i];
        $cn = hexdec(bin2hex($c));
        $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;

        // ANSI character
        if ($cn < 0x80) {
            if (preg_match($ansiWordPattern, chr($cn))) {
                // A word character ends any pending run of a different type.
                if ($lastc != 2 && $onstr != '') {
                    self::$simpleResult[$s]['w'] = $onstr;
                    self::$simpleResult[$s]['t'] = $lastc;
                    self::_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                    $onstr = '';
                }
                $lastc = 2;
                $onstr .= chr(0) . chr($cn);
            } else {
                // Any other ANSI character ends the pending run.
                if ($onstr != '') {
                    self::$simpleResult[$s]['w'] = $onstr;
                    if ($lastc == 2) {
                        // No letter or @#%+ in the run: reclassify it as a pure number.
                        // UCS2 is a constant defined outside this method.
                        if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                            $lastc = 4;
                        }
                    }
                    self::$simpleResult[$s]['t'] = $lastc;
                    if ($lastc != 4) {
                        self::_deep_analysis($onstr, $lastc, $s, $optimize);
                    }
                    $s++;
                }
                $onstr = '';
                $lastc = 3;

                // Code units below 31 are dropped; everything else is kept as a symbol.
                if ($cn < 31) {
                    continue;
                }
                self::$simpleResult[$s]['w'] = chr(0) . chr($cn);
                self::$simpleResult[$s]['t'] = 3;
                $s++;
            }
            continue;
        }

        // Non-ANSI character: normal text
        if (($cn > 0x3FFF && $cn < 0x9FA6)
            || ($cn > 0xF8FF && $cn < 0xFA2D)
            || ($cn > 0xABFF && $cn < 0xD7A4)
            || ($cn > 0x3040 && $cn < 0x312B)
        ) {
            if ($lastc != 1 && $onstr != '') {
                self::$simpleResult[$s]['w'] = $onstr;
                if ($lastc == 2) {
                    if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                        $lastc = 4;
                    }
                }
                self::$simpleResult[$s]['t'] = $lastc;
                if ($lastc != 4) {
                    self::_deep_analysis($onstr, $lastc, $s, $optimize);
                }
                $s++;
                $onstr = '';
            }
            $lastc = 1;
            $onstr .= $c;
            continue;
        }

        // Non-ANSI character: special symbol. Flush the pending run first.
        if ($onstr != '') {
            self::$simpleResult[$s]['w'] = $onstr;
            if ($lastc == 2) {
                if (!preg_match($notNumberPattern, iconv(UCS2, 'utf-8', $onstr))) {
                    $lastc = 4;
                }
            }
            self::$simpleResult[$s]['t'] = $lastc;
            if ($lastc != 4) {
                self::_deep_analysis($onstr, $lastc, $s, $optimize);
            }
            $s++;
        }

        // Book title detection: U+300A ... U+300B
        if ($cn == 0x300A) {
            $tmpw = '';
            $n = 1;
            $isok = FALSE;
            $ew = chr(0x30) . chr(0x0B);
            while (TRUE) {
                // Stop as soon as a complete two-byte unit is no longer available.
                if (!isset(self::$sourceString[$i + $n]) || !isset(self::$sourceString[$i + $n + 1])) {
                    break;
                }
                $w = self::$sourceString[$i + $n] . self::$sourceString[$i + $n + 1];
                if ($w == $ew) {
                    // Opening mark.
                    self::$simpleResult[$s]['w'] = $c;
                    self::$simpleResult[$s]['t'] = 5;
                    $s++;

                    // The title itself; register it as a new word the first time it is seen.
                    self::$simpleResult[$s]['w'] = $tmpw;
                    $isNewWord = !isset(self::$newWords[$tmpw]);
                    self::$newWords[$tmpw] = 1;
                    if ($isNewWord) {
                        self::$foundWordStr .= self::_out_string_encoding($tmpw) . '/nb, ';
                        self::SetWordInfos($tmpw, array('c' => 1, 'm' => 'nb'));
                    }
                    self::$simpleResult[$s]['t'] = 13;
                    $s++;

                    // In max-split mode the title is segmented further.
                    if (self::$differMax) {
                        self::$simpleResult[$s]['w'] = $tmpw;
                        self::$simpleResult[$s]['t'] = 21;
                        self::_deep_analysis($tmpw, $lastc, $s, $optimize);
                        $s++;
                    }

                    // Closing mark; skip the scan position past it.
                    self::$simpleResult[$s]['w'] = $ew;
                    self::$simpleResult[$s]['t'] = 5;
                    $s++;
                    $i = $i + $n + 1;
                    $isok = TRUE;
                    $onstr = '';
                    $lastc = 5;
                    break;
                }

                $n = $n + 2;
                $tmpw .= $w;
                // Give up once the candidate title exceeds 60 bytes.
                if (strlen($tmpw) > 60) {
                    break;
                }
            }

            if (!$isok) {
                // No closing mark found: keep the opening mark as a plain symbol.
                self::$simpleResult[$s]['w'] = $c;
                self::$simpleResult[$s]['t'] = 5;
                $s++;
                $onstr = '';
                $lastc = 5;
            }
            continue;
        }

        $onstr = '';
        $lastc = 5;

        // U+3000 is skipped; any other symbol is recorded as type 5.
        if ($cn == 0x3000) {
            continue;
        }
        self::$simpleResult[$s]['w'] = $c;
        self::$simpleResult[$s]['t'] = 5;
        $s++;
    }

    // Post-process the segmentation result.
    self::_sort_finally_result();
}