/**
 * Split a text into a flat list of words with SCWS.
 *
 * Uses the php_scws extension when it is loaded and falls back to the
 * pure-PHP PSCWS4 class otherwise. Both back ends are configured the same
 * way: UTF-8 charset, the dictionary and rule files under
 * APP_ROOT_PATH . 'system/scws/', and the "ignore" option switched on.
 *
 * @param string $text Text to segment
 * @return array List of word strings in the order the segmenter produced them;
 *               an empty array when $text is empty
 */
public static function segmentAll($text)
{
    $list = array();

    // Explicit checks instead of empty(): empty() would also reject the
    // perfectly valid text "0".
    if ($text === null || $text === false || $text === '' || $text === array()) {
        return $list;
    }

    // Both back ends must read the same files. The extension branch used to
    // load its rules from 'system/rules.utf8.ini', outside the scws directory
    // that holds the dictionary and that the fallback branch uses.
    $dictFile = APP_ROOT_PATH . 'system/scws/dict.utf8.xdb';
    $ruleFile = APP_ROOT_PATH . 'system/scws/rules.utf8.ini';

    // Prefer the native php_scws extension when it is installed.
    if (function_exists('scws_open')) {
        $sh = scws_open();
        scws_set_charset($sh, 'utf8');
        scws_set_dict($sh, $dictFile);
        scws_set_rule($sh, $ruleFile);
        scws_set_ignore($sh, true);
        scws_send_text($sh, $text);
        // scws_get_result() hands back the words in batches until it is drained.
        while ($words = scws_get_result($sh)) {
            foreach ($words as $word) {
                $list[] = $word['word'];
            }
        }
        scws_close($sh);
    } else {
        require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
        $pscws = new PSCWS4();
        // PSCWS4 defaults to GBK; the dictionary and rules loaded here are
        // UTF-8, so the charset has to be set just like in the extension branch.
        $pscws->set_charset('utf8');
        $pscws->set_dict($dictFile);
        $pscws->set_rule($ruleFile);
        $pscws->set_ignore(true);
        $pscws->send_text($text);
        while ($words = $pscws->get_result()) {
            foreach ($words as $word) {
                $list[] = $word['word'];
            }
        }
        $pscws->close();
    }

    return $list;
}
// Load the dictionary and the rule set from the directory named by the
// scws.default.fpath ini setting into the SCWS handle $cws.
$fpath = ini_get('scws.default.fpath');
scws_set_dict($cws, $fpath . '/dict.utf8.xdb');
scws_set_rule($cws, $fpath . '/rules.utf8.ini');
// Optional switches, currently disabled:
//scws_set_ignore($cws, true);
//scws_set_multi($cws, true);
scws_send_text($cws, $text);

// Part 1: table of the top 10 words, using the attribute filter "~v".
echo "\n";
echo "No. WordString Attr Weight(times)\n",
     "-------------------------------------------------\n";
$rank = 0;
$topWords = scws_get_tops($cws, 10, "~v");
foreach ($topWords as $entry) {
    $rank++;
    printf(
        "%02d. %-24.24s %-4.2s %.2f(%d)\n",
        $rank,
        $entry['word'],
        $entry['attr'],
        $entry['weight'],
        $entry['times']
    );
}

// Part 2: the full segmentation, printed as "word/attr " tokens.
echo "\n\n-------------------------------------------------\n";
while ($batch = scws_get_result($cws)) {
    foreach ($batch as $token) {
        // A lone carriage return is dropped entirely.
        if ($token['len'] == 1 && $token['word'] == "\r") {
            continue;
        }
        // A lone line feed is echoed as-is so the input's line breaks survive.
        if ($token['len'] == 1 && $token['word'] == "\n") {
            echo $token['word'];
            continue;
        }
        echo $token['word'] . '/' . $token['attr'] . ' ';
    }
}
echo "\n\n";

scws_close($cws);
/**
 * Segment a string into words; requires the scws extension on the server.
 *
 * All whitespace is stripped from the input first. The remaining text is
 * segmented with charset 'utf8', ignore switched on, multi mode
 * SCWS_MULTI_SHORT | SCWS_MULTI_DUALITY and duality switched on.
 *
 * @param string $str Text to segment (expected to be UTF-8)
 * @return array|false Flat list of the entries returned by scws_get_result()
 *                     (empty when nothing is left after whitespace removal);
 *                     false when the scws extension is not loaded or no
 *                     handle could be opened
 */
function scws($str)
{
    if (!function_exists('scws_open')) {
        return false;
    }

    $rst = array();

    // Strip whitespace in UTF-8 mode. Without /u, \s works on single bytes and
    // follows the process locale, so on some systems it removes bytes that
    // belong to multi-byte characters and corrupts the text.
    $clean = preg_replace('/\s+/u', '', $str);
    if ($clean === null) {
        // The input is not valid UTF-8: fall back to a byte-wise pattern that
        // only lists ASCII whitespace and is therefore locale-independent.
        $clean = preg_replace('/[ \t\n\r\f\x0B]+/', '', $str);
    }

    // Compare against '' instead of using empty(), which would also discard
    // the valid text "0".
    if (!is_string($clean) || $clean === '') {
        return $rst;
    }

    $sh = scws_open();
    if ($sh === false) {
        // Same "segmentation unavailable" signal as a missing extension.
        return false;
    }

    scws_set_charset($sh, 'utf8');
    scws_set_ignore($sh, true);
    scws_set_multi($sh, SCWS_MULTI_SHORT | SCWS_MULTI_DUALITY);
    scws_set_duality($sh, true);
    scws_send_text($sh, $clean);

    // Append entry by entry; array_merge() inside the loop would re-copy the
    // whole accumulated list for every batch.
    while ($row = scws_get_result($sh)) {
        foreach ($row as $item) {
            $rst[] = $item;
        }
    }
    scws_close($sh);

    return $rst;
}
/**
 * Segment a string and join the words that pass the attribute filter.
 *
 * Sends $string to the SCWS handle in $this->tool, drains the results and
 * keeps every word whose 'attr' value is not listed in $this->filter.
 * NOTE(review): assumes $this->tool is an already opened and configured SCWS
 * handle and $this->filter is an array of attribute strings; confirm against
 * the class constructor.
 *
 * @param string $string Text to segment
 * @return string The kept words, each followed by one space (a non-empty
 *                result therefore ends with a trailing space); '' when no
 *                word is kept
 */
public function cutString($string)
{
    $text = '';
    scws_send_text($this->tool, $string);

    while ($words = scws_get_result($this->tool)) {
        foreach ($words as $word) {
            // Strict comparison so an attribute only matches an identical
            // string in the filter, never a loosely-equal value.
            if (!in_array($word['attr'], $this->filter, true)) {
                $text .= $word['word'] . ' ';
            }
        }
    }

    return $text;
}