Exemplo n.º 1
0
 /**
  * Segments text into a flat list of words using SCWS.
  *
  * Uses the php_scws extension when scws_open() is available; otherwise loads
  * the bundled PSCWS4 class from system/scws/pscws4.class.php. Both back-ends
  * are configured with the same UTF-8 charset, dictionary file and rule file so
  * that they are set up identically.
  *
  * @param string $text Text to segment. Any value PHP's empty() treats as empty
  *                     (including the string "0") yields an empty list.
  * @return array The 'word' field of every result entry, in the order returned.
  */
 public static function segmentAll($text)
 {
     $list = array();
     if (empty($text)) {
         return $list;
     }

     // One definition of the data files for both back-ends: the rule file lives
     // next to the dictionary in system/scws/, not directly in system/.
     $dictFile = APP_ROOT_PATH . 'system/scws/dict.utf8.xdb';
     $ruleFile = APP_ROOT_PATH . 'system/scws/rules.utf8.ini';

     // Prefer the php_scws extension when it is installed.
     if (function_exists("scws_open")) {
         $sh = scws_open();
         scws_set_charset($sh, 'utf8');
         scws_set_dict($sh, $dictFile);
         scws_set_rule($sh, $ruleFile);
         scws_set_ignore($sh, true);
         scws_send_text($sh, $text);
         // Results arrive in batches; a falsy return ends the loop.
         while ($words = scws_get_result($sh)) {
             foreach ($words as $word) {
                 $list[] = $word['word'];
             }
         }
         scws_close($sh);
     } else {
         require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
         $pscws = new PSCWS4();
         // The dictionary and rules are UTF-8 files, so select the matching
         // charset explicitly, exactly as the extension branch does.
         $pscws->set_charset('utf8');
         $pscws->set_dict($dictFile);
         $pscws->set_rule($ruleFile);
         $pscws->set_ignore(true);
         $pscws->send_text($text);
         while ($words = $pscws->get_result()) {
             foreach ($words as $word) {
                 $list[] = $word['word'];
             }
         }
         $pscws->close();
     }
     return $list;
 }
Exemplo n.º 2
0
Arquivo: scws.php Projeto: lamphp/scws
// Configure the already-opened handle ($cws) with the dictionary and rule set
// found under the scws.default.fpath ini setting, then feed it the text.
$scwsBase = ini_get('scws.default.fpath');
scws_set_dict($cws, $scwsBase . '/dict.utf8.xdb');
scws_set_rule($cws, $scwsBase . '/rules.utf8.ini');
// Optional switches, left disabled:
//scws_set_ignore($cws, true);
//scws_set_multi($cws, true);
scws_send_text($cws, $text);
echo "\n";

// Report 1: table of the top 10 words as returned by scws_get_tops() with the "~v" filter.
echo "No. WordString               Attr  Weight(times)\n";
echo "-------------------------------------------------\n";
$rank = 0;
foreach (scws_get_tops($cws, 10, "~v") as $top) {
    $rank++;
    printf("%02d. %-24.24s %-4.2s  %.2f(%d)\n", $rank, $top['word'], $top['attr'], $top['weight'], $top['times']);
}
echo "\n\n-------------------------------------------------\n";

// Report 2: the full segmentation, printed as "word/attr " pairs.
while ($batch = scws_get_result($cws)) {
    foreach ($batch as $token) {
        $hasLenOne = $token['len'] == 1;
        // Carriage returns are dropped entirely.
        if ($hasLenOne && $token['word'] == "\r") {
            continue;
        }
        // Newlines are echoed bare so the output keeps the input's line breaks.
        if ($hasLenOne && $token['word'] == "\n") {
            echo $token['word'];
            continue;
        }
        printf("%s/%s ", $token['word'], $token['attr']);
    }
}
echo "\n\n";
scws_close($cws);
Exemplo n.º 3
0
/**
 * Word segmentation; requires the SCWS extension to be installed on the server.
 *
 * All whitespace is stripped from the input before it is segmented.
 *
 * @param string $str Text to segment.
 * @return array|false False when the scws_* functions are unavailable; otherwise
 *                     a flat list of every result entry returned by
 *                     scws_get_result() (empty when nothing is left to segment).
 */
function scws($str)
{
    if (!function_exists('scws_open')) {
        return false;
    }
    $rst = array();
    $str = preg_replace("/[\\s\t\r\n]+/", '', $str);
    if (!empty($str)) {
        $sh = scws_open();
        scws_set_charset($sh, 'utf8');
        scws_set_ignore($sh, true);
        scws_set_multi($sh, SCWS_MULTI_SHORT | SCWS_MULTI_DUALITY);
        scws_set_duality($sh, true);
        scws_send_text($sh, $str);
        while ($row = scws_get_result($sh)) {
            // Append in place rather than array_merge(), which would copy the
            // whole accumulated list again on every batch.
            foreach ($row as $entry) {
                $rst[] = $entry;
            }
        }
        scws_close($sh);
    }
    return $rst;
}
Exemplo n.º 4
0
 /**
  * Segments a string and joins the kept words into one space-separated string.
  *
  * Entries whose 'attr' value appears in $this->filter are left out. Every kept
  * word is followed by a single space, so a non-empty result ends with a space.
  *
  * @param string $string Text to segment with the handle in $this->tool.
  * @return string The kept words, or '' when nothing was kept.
  */
 public function cutString($string)
 {
     $joined = '';
     scws_send_text($this->tool, $string);
     while ($batch = scws_get_result($this->tool)) {
         foreach ($batch as $entry) {
             // Skip entries whose attribute is on the filter list.
             if (in_array($entry['attr'], $this->filter)) {
                 continue;
             }
             $joined .= $entry['word'] . " ";
         }
     }
     return $joined;
 }