Esempio n. 1
0
 /**
  * Configures the SCWS handle stored in $this->tool.
  *
  * Applies, in order: the UTF-8 charset, the dictionary and rule files
  * located under self::ROOT, punctuation ignoring, the multi-segmentation
  * mode self::SCWS_MULTI_DUALITY, and finally switches duality off.
  *
  * @return void
  */
 public function init()
 {
     $handle = $this->tool;
     $dictFile = self::ROOT . "etc/dict.utf8.xdb";
     $ruleFile = self::ROOT . "etc/rules.utf8.ini";

     scws_set_charset($handle, "utf8");
     scws_set_dict($handle, $dictFile);
     scws_set_rule($handle, $ruleFile);

     // Drop punctuation from the segmentation output.
     scws_set_ignore($handle, true);

     // Multi-segmentation mode; self::SCWS_MULTI_SHORT is the alternative
     // that was previously tried here and left disabled.
     scws_set_multi($handle, self::SCWS_MULTI_DUALITY);
     scws_set_duality($handle, false);
 }
 /**
  * Collects the leading keywords found across a set of texts.
  *
  * Every entry of $arr is segmented on its own and its top $num words are
  * fetched, using the native scws extension when available and the bundled
  * PSCWS4 class otherwise. The per-text rankings are then merged rank by
  * rank (all first-ranked words, then all second-ranked words, ...),
  * duplicates are dropped, and the first $num distinct words are returned.
  *
  * @param array $arr Texts to analyse.
  * @param int   $num Number of top words fetched per text, and the maximum
  *                   number of words returned.
  * @return array List of distinct words as strings (at most $num entries);
  *               empty when $arr is empty.
  */
 public static function segments($arr, $num = 10)
 {
     // Guard on the actual input: nothing to segment means nothing to return.
     if (empty($arr)) {
         return array();
     }
     $words = array();
     // Prefer the native php_scws extension when it is installed.
     if (function_exists("scws_open")) {
         $sh = scws_open();
         scws_set_charset($sh, 'utf8');
         scws_set_dict($sh, APP_ROOT_PATH . 'system/scws/dict.utf8.xdb');
         scws_set_rule($sh, APP_ROOT_PATH . 'system/scws/rules.utf8.ini');
         scws_set_ignore($sh, true);
         foreach ($arr as $text) {
             scws_send_text($sh, $text);
             $words[] = scws_get_tops($sh, $num);
         }
         scws_close($sh);
     } else {
         require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
         $pscws = new PSCWS4();
         // Keep the fallback in line with the extension branch: the
         // dictionary and rule files loaded below are the UTF-8 ones.
         $pscws->set_charset('utf8');
         $pscws->set_dict(APP_ROOT_PATH . 'system/scws/dict.utf8.xdb');
         $pscws->set_rule(APP_ROOT_PATH . 'system/scws/rules.utf8.ini');
         $pscws->set_ignore(true);
         foreach ($arr as $text) {
             $pscws->send_text($text);
             $words[] = $pscws->get_tops($num);
         }
         $pscws->close();
     }
     // Merge the rankings rank by rank; array keys give de-duplication while
     // keeping first-seen order. A failed lookup (non-array entry) is skipped
     // by the isset() check.
     $seen = array();
     for ($i = 0; $i < $num; $i++) {
         foreach ($words as $item) {
             if (isset($item[$i]['word'])) {
                 $seen[$item[$i]['word']] = true;
             }
         }
     }
     // preserve_keys must be true: purely numeric words are stored as integer
     // keys, and array_slice() would otherwise renumber them to 0, 1, 2...
     $top = array_slice($seen, 0, $num, true);
     // Integer-cast keys are converted back so callers always get strings.
     return array_map('strval', array_keys($top));
 }
Esempio n. 3
0
 /**
  * Splits a text into its individual words.
  *
  * Uses the native scws extension when it is installed and falls back to
  * the bundled PSCWS4 class otherwise. Punctuation is ignored in both cases.
  *
  * @param string $text Text to segment.
  * @return array List of word strings in the order the segmenter produced
  *               them; empty when $text is empty.
  */
 public static function segmentAll($text)
 {
     $list = array();
     if (empty($text)) {
         return $list;
     }
     // Prefer the native php_scws extension when it is installed.
     if (function_exists("scws_open")) {
         $sh = scws_open();
         scws_set_charset($sh, 'utf8');
         scws_set_dict($sh, APP_ROOT_PATH . 'system/scws/dict.utf8.xdb');
         // The rule file sits next to the dictionary, in system/scws/.
         scws_set_rule($sh, APP_ROOT_PATH . 'system/scws/rules.utf8.ini');
         scws_set_ignore($sh, true);
         scws_send_text($sh, $text);
         // Results arrive in batches; keep pulling until none are left.
         while ($words = scws_get_result($sh)) {
             foreach ($words as $word) {
                 $list[] = $word['word'];
             }
         }
         scws_close($sh);
     } else {
         require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
         $pscws = new PSCWS4();
         // Keep the fallback in line with the extension branch: the
         // dictionary and rule files loaded below are the UTF-8 ones.
         $pscws->set_charset('utf8');
         $pscws->set_dict(APP_ROOT_PATH . 'system/scws/dict.utf8.xdb');
         $pscws->set_rule(APP_ROOT_PATH . 'system/scws/rules.utf8.ini');
         $pscws->set_ignore(true);
         $pscws->send_text($text);
         // Results arrive in batches; keep pulling until none are left.
         while ($words = $pscws->get_result()) {
             foreach ($words as $word) {
                 $list[] = $word['word'];
             }
         }
         $pscws->close();
     }
     return $list;
 }
Esempio n. 4
0
/**
 * Segments a string and returns its most frequent words, optionally
 * filtered by part of speech.
 *
 * All whitespace is stripped from the input before it is segmented.
 *
 * @param string $str
 *            Text to analyse.
 * @param int $limit
 *            Optional. Maximum number of words to return; defaults to 10.
 * @param string $attr
 *            Optional. Comma-separated list of part-of-speech tags the
 *            returned words must carry. When the list starts with "~" the
 *            filter is inverted and words must NOT carry any listed tag.
 *            Defaults to null, which returns every part of speech.
 * @return array|false False when the scws extension is not available, an
 *            empty array when nothing is left after stripping whitespace,
 *            otherwise the result of scws_get_tops().
 */
function scwsTop($str, $limit = 10, $attr = null)
{
    // Without the scws extension there is nothing this function can do.
    if (!function_exists('scws_open')) {
        return false;
    }

    $compact = preg_replace("/[\\s\t\r\n]+/", '', $str);
    if (empty($compact)) {
        return array();
    }

    $handle = scws_open();
    scws_set_charset($handle, 'utf8');
    // Leave punctuation out of the results.
    scws_set_ignore($handle, true);
    scws_set_multi($handle, SCWS_MULTI_SHORT | SCWS_MULTI_DUALITY);
    scws_set_duality($handle, true);
    scws_send_text($handle, $compact);
    $tops = scws_get_tops($handle, $limit, $attr);
    scws_close($handle);

    return $tops;
}