Esempio n. 1
0
 /**
  * Segment a phrase with SCWS and format the tokens as quoted alternatives.
  *
  * The native SCWS extension is used when it is loaded; otherwise the bundled
  * pure-PHP PSCWS4 class is required and used instead. Spaces, tabs and line
  * breaks are stripped from the input before it is segmented.
  *
  * @param string $key Raw text to segment.
  * @return string Tokens formatted as `"w1"|"w2"|...`; an empty string when
  *                segmentation yields no tokens.
  */
 public function scws($key)
 {
     // The extension exposes scws_new(); probing for a function literally
     // named "scws" never succeeds and would always select the fallback.
     if (function_exists('scws_new')) {
         $so = scws_new();
         $so->set_charset('utf-8');
         $so->add_dict(ini_get('scws.default.fpath') . '/dict.utf8.xdb');
         // Custom dictionary (disabled):
         //$so->add_dict(APP_PATH . '/library/dict/scws.txt', SCWS_XDICT_TXT);
         $so->set_rule(ini_get('scws.default.fpath') . '/rules.utf8.ini');
         $so->set_ignore(true);
         $so->set_multi(true);
         $so->set_duality(true);
     } else {
         require_once APP_PATH . '/library/scws/pscws4.class.php';
         $so = new PSCWS4('utf-8');
         $so->set_dict(APP_PATH . '/library/scws/etc/dict.utf8.xdb');
         $so->set_rule(APP_PATH . '/library/scws/etc/rules.utf8.ini');
         $so->set_multi(true);
         $so->set_ignore(true);
         $so->set_duality(true);
     }

     $keys = str_replace(array(" ", " ", "\t", "\n", "\r"), '', $key);
     $so->send_text($keys);

     // get_result() hands back tokens in batches and returns false once the
     // text is exhausted, so it is drained in a loop. This also avoids
     // iterating over `false` when the input produces no tokens at all.
     $quoted = array();
     while ($batch = $so->get_result()) {
         foreach ($batch as $v) {
             $quoted[] = '"' . $v['word'] . '"';
         }
     }
     $so->close();

     return implode('|', $quoted);
 }
Esempio n. 2
0
 /**
  * Create the SCWS handle and apply every configuration entry to it.
  *
  * Each key of $config is turned into a `set_<key>` method call on the
  * handle, with the entry's value passed as the only argument.
  *
  * @param array $config Map of setter suffix => value.
  */
 public function __construct($config)
 {
     $this->scws = \scws_new();
     foreach ($config as $option => $setting) {
         $setter = 'set_' . $option;
         $this->scws->{$setter}($setting);
     }
 }
Esempio n. 3
0
 /**
  * Create the SCWS handle and configure it for UTF-8 text.
  *
  * The rule file and dictionary are loaded from the directory named by the
  * `scws.default.fpath` ini setting; duality and ignore are both switched on.
  */
 public function __construct()
 {
     $resourceDir = ini_get('scws.default.fpath');

     $this->scws = scws_new();
     $this->scws->set_charset('utf8');
     $this->scws->set_rule($resourceDir . '/rules.utf8.ini');
     $this->scws->set_dict($resourceDir . '/dict.utf8.xdb');
     $this->scws->set_duality(true);
     $this->scws->set_ignore(true);
 }
Esempio n. 4
0
 /**
  * Create the SCWS handle, applying the optional SCWS_DICT / SCWS_RULE constants.
  *
  * A dictionary or rule file is only set when the corresponding constant has
  * been defined; otherwise the handle is left with whatever it starts with.
  */
 function __construct()
 {
     $this->_cws = scws_new();

     // Dictionary path, when the application defines one.
     if (defined('SCWS_DICT')) {
         $this->_cws->set_dict(SCWS_DICT);
     }

     // Rule file path, when the application defines one.
     if (defined('SCWS_RULE')) {
         $this->_cws->set_rule(SCWS_RULE);
     }
 }
Esempio n. 5
0
    /**
     * Store a new topic in Redis and register it in the related indexes.
     *
     * Steps, in order: allocate a topic id from the `counter` hash; when the
     * SCWS extension is available, segment the title and content and index the
     * topic under every keyword longer than one character; push the id onto
     * the global and per-board newest-posts lists and onto the parent board's
     * child-posts list; add it to the board's time-scored sorted set; finally
     * write the topic's fields into its info hash.
     *
     * @param mixed  $uid     Author id, stored in the topic info hash.
     * @param mixed  $bkid    Board id the topic is posted to.
     * @param string $title   Topic title.
     * @param string $content Topic body.
     * @return mixed The board id that was passed in ($bkid), not the new topic id.
     */
    private function postTopicR($uid, $bkid, $title, $content)
    {
        $redis = $this->redis;
        $tablePre = $this->tablePre;
        // Allocate the next topic id
        $topicid = $redis->hIncrBy($tablePre . 'counter', 'topicid', 1);
        // Extract the topic's keywords (skipped entirely when SCWS is not loaded)
        if (function_exists('scws_new')) {
            $so = scws_new();
            $so->set_charset('utf-8');
            $words = array();
            $textArr = array($title, $content);
            $so->set_dict($this->dictPath);
            foreach ($textArr as $text) {
                $so->send_text($text);
                // Drain every batch of tokens for this text
                while ($tmp = $so->get_result()) {
                    foreach ($tmp as $strTmp) {
                        // Keep only words longer than one character
                        if (mb_strlen($strTmp['word'], 'UTF-8') > 1) {
                            $words[] = $strTmp['word'];
                        }
                    }
                }
            }
            $so->close();
            $words = array_unique($words);
            // Lua script that adds the keyword index entries in one batch.
            // It reads the topic id from the last argument and the key prefix
            // from the second-to-last; every earlier argument is a keyword.
            $lua = 'local arrLength=#(ARGV)
			local topicid=ARGV[arrLength]
			local tablePre=ARGV[arrLength-1]
			for i=1, (arrLength-2) do      
			    redis.call(\'zadd\',tablePre..\'keywords:\'..ARGV[i]..\':topiclist\',topicid,topicid)
			end';
            // Append the two trailing arguments the script expects
            $words[] = $tablePre;
            $words[] = $topicid;
            $redis->eval($lua, $words);
        }
        // Push this topic id onto the global newest-posts list
        $redis->lPush($tablePre . 'new_posts', $topicid);
        // Push this topic id onto the board's newest-posts list
        $redis->lPush($tablePre . 'bkid:' . $bkid . ':new_posts', $topicid);
        // Record the topic under the board's parent node
        $pid = $redis->hGet($tablePre . 'bkid:' . $bkid . ':bkinfo', 'pid');
        $redis->lPush($tablePre . 'bkid:' . $pid . ':child_posts', $topicid);
        // Add the topic to the board's sorted set of posts, scored by post time
        $postTime = time();
        $redis->zAdd($tablePre . 'bkid:' . $bkid . ':posts', $postTime, $topicid);
        // Store the topic's fields
        $topicInfo = array('topicid' => $topicid, 'bkid' => $bkid, 'uid' => $uid, 'title' => $title, 'content' => $content, 'post_time' => $postTime, 'last_update' => 0, 'view_num' => 0, 'reply_num' => 0, 'score' => 0, 'score_uid' => 0, 'score_comment' => 0);
        $redis->hMset($tablePre . 'topicid:' . $topicid . ':info', $topicInfo);
        return $bkid;
    }
Esempio n. 6
0
 /**
  * Segment text against a configured plain-text dictionary.
  *
  * The dictionary path is read from configuration: the `defaultdic` entry
  * when $type is empty, otherwise the entry named after $type. The config
  * key prefix is spelled `dictionay` and must stay that way to match the
  * stored configuration.
  *
  * @param string $content Text to segment.
  * @param string $type    Name of the dictionary config entry; empty selects the default.
  * @return mixed Result of a single get_result() call on the segmenter.
  */
 public static function wordType($content, $type = '')
 {
     $segmenter = scws_new();

     $configKey = empty($type) ? 'dictionay.defaultdic' : 'dictionay.' . $type;
     $dictFile = C($configKey);
     if (empty($dictFile)) {
         throw_exception("Load dictionary node: dic file Error");
     }

     $segmenter->set_charset('utf8');
     // The dictionary is loaded in plain-text (TXT) format.
     $segmenter->set_dict($dictFile, SCWS_XDICT_TXT);
     $segmenter->send_text($content);

     $tokens = $segmenter->get_result();
     $segmenter->close();

     return $tokens;
 }
Esempio n. 7
0
 /**
  * Create the SCWS handle and apply the optional dictionary and rule file
  * configured under `search.scws.dict` and `search.scws.rule`.
  *
  * Each setting is applied only when its own configuration value is set.
  */
 public function __construct()
 {
     $this->_cws = scws_new();
     $scws_dict = config::get('search.scws.dict', null);
     $scws_rule = config::get('search.scws.rule', null);
     if ($scws_dict) {
         $this->_cws->set_dict($scws_dict);
     }
     // Guard on the rule value itself: testing $scws_dict here would pass a
     // null rule when only the dictionary is configured, and would skip a
     // configured rule when the dictionary is not.
     if ($scws_rule) {
         $this->_cws->set_rule($scws_rule);
     }
     // Multi-segmentation mode (1-15), currently left at the default:
     //$this->_cws->set_multi(8);
 }
Esempio n. 8
0
 /**
  * Create the segmenter, configure it and hand it the initial text.
  *
  * The native SCWS extension is preferred; when it is not loaded, the
  * pure-PHP PSCWS4 class is required and used instead. Both are then
  * configured with the UTF-8 charset and this object's dictionary and rule.
  *
  * @param string $text Text passed on to setText().
  */
 public function init($text = '')
 {
     if (!function_exists('scws_new')) {
         // Extension not enabled on this server: use the pure-PHP library.
         require_cache('./addons/libs/scws/pscws4.class.php');
         $segmenter = new PSCWS4('utf8');
     } else {
         $segmenter = scws_new('utf8');
     }
     $this->scws = $segmenter;

     $this->scws->set_charset('utf8');
     $this->scws->set_dict($this->dict);
     $this->scws->set_rule($this->rule);

     $this->setText($text);
 }
Esempio n. 9
0
 /**
  * Create and configure the shared SCWS handle on first use.
  *
  * Subsequent calls return immediately once the static handle is set.
  *
  * @return bool false when scws_new() yields a falsy value, true otherwise.
  */
 private static function _init()
 {
     // Already initialised: nothing to do.
     if (self::$_SCWS) {
         return true;
     }

     self::$_SCWS = scws_new();
     if (!self::$_SCWS) {
         return false;
     }

     $settings = DBConfig::$SCWS;

     self::$_SCWS->set_charset('utf8');
     // Rule file from configuration.
     self::$_SCWS->set_rule($settings['rule']);
     // Ignore punctuation in the results.
     self::$_SCWS->set_ignore(true);
     // Default dictionary first, then the custom plain-text dictionary on top.
     self::$_SCWS->set_dict($settings['default_dict']);
     self::$_SCWS->add_dict($settings['my_dict'], SCWS_XDICT_TXT);

     return true;
 }
Esempio n. 10
0
 /**
  * Segment text and return it as space-separated words, optionally tagged.
  *
  * Two strings are built from the tokens: plain words separated by spaces,
  * and words each followed by Segmentor::$speech_separator and the filtered
  * attribute of the token. Both are passed through _ignore_stopword().
  *
  * NOTE(review): $this->ignore is not applied to the segmenter here; the
  * set_ignore() call was already disabled in the original code.
  *
  * @param string    $text        Text to segment.
  * @param bool|null $with_speech null returns both forms, exactly FALSE returns
  *                               the plain form, anything else the tagged form.
  * @return array|string array('text' => ..., 'speech' => ...) when $with_speech
  *                      is null, otherwise one of the two strings.
  */
 public function text_to_segment($text, $with_speech = NULL)
 {
     $cws = scws_new();
     $cws->set_charset($this->charset);
     $cws->set_dict($this->dict);
     $cws->set_rule($this->rule);
     //$cws->set_ignore(true);
     $cws->send_text($text);

     $outputText = '';
     $outputSpeech = '';
     // get_result() returns tokens in batches until it yields a falsy value.
     while ($batch = $cws->get_result()) {
         foreach ($batch as $w) {
             $outputText .= $w['word'] . ' ';
             $outputSpeech .= $w['word'] . Segmentor::$speech_separator . $this->_speech_filter($w['attr']) . ' ';
         }
     }
     // Release the handle explicitly once all batches are consumed.
     $cws->close();

     $outputText = $this->_ignore_stopword($outputText);
     $outputSpeech = $this->_ignore_stopword($outputSpeech);

     if (is_null($with_speech)) {
         return array('text' => $outputText, 'speech' => $outputSpeech);
     }

     return (FALSE === $with_speech) ? $outputText : $outputSpeech;
 }
Esempio n. 11
0
}
// Whether to show only the statistics table: enabled when the request
// carries stats=yes (strcmp() returns 0 on an exact match).
if (isset($_REQUEST['stats']) && !strcmp($_REQUEST['stats'], 'yes')) {
    $stats = true;
    $checked_stats = ' checked';
}
// Bound by reference, so the default assigned below is also written back
// into $_REQUEST['xattr'].
$xattr =& $_REQUEST['xattr'];
if (!isset($xattr)) {
    $xattr = '~v';
}
// Same pattern for the limit; defaults to 10 when the request omits it.
$limit =& $_REQUEST['limit'];
if (!isset($limit)) {
    $limit = 10;
}
// do the segment
$cws = scws_new();
$cws->set_charset('utf8');
//
// Load the rules_cht / dict_cht files from the directory named by the
// scws.default.fpath ini setting.
//
$cws->set_rule(ini_get('scws.default.fpath') . '/rules_cht.utf8.ini');
$cws->set_dict(ini_get('scws.default.fpath') . '/dict_cht.utf8.xdb');
// NOTE(review): $ignore and $mydata are not assigned in this part of the
// script; presumably they are set earlier in the file. Confirm.
$cws->set_ignore($ignore);
$cws->send_text($mydata);
?>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<title>PHP简易中文分词(SCWS) 第4版在线演示 (by hightman)</title>
<style type="text/css">
<!--
Esempio n. 12
0
 /**
  * Extract the top keywords of a text with SCWS.
  *
  * The dictionary and rule file are loaded explicitly from the directory
  * named by the `scws.default.fpath` ini setting.
  *
  * @param string $words Text to analyse.
  * @return array Result of get_tops(): at most 10 entries, restricted to the
  *               attribute list passed below.
  */
 public static function scws($words)
 {
     $resourceDir = ini_get('scws.default.fpath');
     $wantedAttrs = "XB,XZ,XS,nt,nz,an,ng,vn,i,j,n,v";

     $so = scws_new();
     $so->set_charset('utf-8');

     // Default dictionary. An optional extra plain-text dictionary
     // (dict_extra.txt, loaded via add_dict with SCWS_XDICT_TXT) is not used.
     $so->set_dict($resourceDir . '/dict.utf8.xdb');
     // Default rule file.
     $so->set_rule($resourceDir . '/rules.utf8.ini');

     // Drop punctuation and similar symbols from the results.
     $so->set_ignore(true);
     // Compound splitting mode, a bitwise OR of 1 | 2 | 4 | 8 meaning
     // short words | duality | main single characters | all single characters
     // (SCWS_MULTI_SHORT, SCWS_MULTI_DUALITY, SCWS_MULTI_ZMAIN, SCWS_MULTI_ZALL).
     // 7 enables the first three.
     $so->set_multi(7);
     // Do not merge loose characters into two-character words.
     $so->set_duality(false);

     $so->send_text($words);
     // Take the first 10 top words whose attribute is in the list.
     $topWords = $so->get_tops(10, $wantedAttrs);
     $so->close();

     return $topWords;
 }
Esempio n. 13
0
 /**
  * Split Chinese text into words with SCWS.
  *
  * When $content is empty or the SCWS extension is not loaded, $content is
  * returned unchanged (so the return value is not an array in that case).
  *
  * @param string   $content Text to split.
  * @param int|null $mode    Multi-segmentation mode: SCWS_MULTI_SHORT,
  *                          SCWS_MULTI_DUALITY, SCWS_MULTI_ZMAIN or SCWS_MULTI_ZALL
  *                          (short words, duality, main single characters, all
  *                          single characters). null selects SCWS_MULTI_ZALL.
  *
  * @return array|mixed List of the split words, or $content itself when the
  *                     guard above applies.
  */
 public static function cnSplit($content, $mode = null)
 {
     if (empty($content) || !function_exists('scws_new')) {
         return $content;
     }
     // The default is resolved here rather than in the signature: the
     // SCWS_MULTI_ZALL constant only exists when the extension is loaded, and
     // a constant default would be evaluated before the guard above can run.
     if ($mode === null) {
         $mode = SCWS_MULTI_ZALL;
     }
     $ws_obj = scws_new();
     $ws_obj->set_charset('utf8');
     // Configure the mode before handing over the text.
     $ws_obj->set_multi($mode);
     $ws_obj->send_text($content);
     $splits = array();
     // get_result() returns tokens in batches until it yields a falsy value.
     while ($tmp = $ws_obj->get_result()) {
         foreach ($tmp as $each) {
             $splits[] = $each['word'];
         }
     }
     $ws_obj->close();
     return $splits;
 }
/**
 * Word-segmentation helper: split a string with SCWS and return every token.
 *
 * Unlike a plain word list, each returned element is the full token record
 * produced by get_result(), not just its word.
 *
 * NOTE(review): set_multi(16) is outside the 1-15 range that the other
 * snippets describe for this setting; confirm that 16 has the intended
 * effect. No charset is set here, so the extension default applies.
 *
 * @param string $str Text to segment.
 * @return array Flat list of token records.
 */
function cws($str)
{
    $segmenter = scws_new();
    $segmenter->set_multi(16);
    $segmenter->set_ignore(true);
    $segmenter->set_duality(true);
    $segmenter->send_text($str);

    $tokens = array();
    // Pull batches until get_result() yields a falsy value.
    for ($batch = $segmenter->get_result(); $batch; $batch = $segmenter->get_result()) {
        foreach ($batch as $token) {
            $tokens[] = $token;
        }
    }

    return $tokens;
}
 /**
  * Respond with up to 10 posts whose content shares keywords with a given post.
  *
  * The post identified by the `id` GET parameter is loaded, stripped of tags
  * and segmented. Distinct words longer than one character whose attribute is
  * "en", "n" or "r" are collected. When more than two such words exist,
  * top-level posts (pid = 0) other than the source post are selected if their
  * content contains any of the words, ordered by add_time and then by the
  * number of matching words. Otherwise an empty success response is sent.
  *
  * @return void The result is emitted through $this->success().
  */
 public function talk_similarity_get()
 {
     $this->check_token();
     $mp = M("member_post");
     $scws = scws_new();
     $scws->set_charset('utf8');
     $scws->set_dict(ini_get('scws.default.fpath') . '/dict.utf8.xdb');
     $scws->set_rule(ini_get('scws.default.fpath') . '/rules.utf8.ini');
     $scws->set_multi(1);
     $scws->set_ignore(true);
     $id = I('get.id', 0, 'intval');
     $content = strip_tags($mp->getFieldById($id, "content"));
     $scws->send_text($content);
     $words = array();
     // Collect the segmentation results.
     while ($tmp = $scws->get_result()) {
         foreach ($tmp as $t) {
             $wanted = $t['attr'] === "en" || $t['attr'] === "n" || $t['attr'] === "r";
             // Strict in_array(): a loose comparison treats numeric-looking
             // strings such as "100" and "1e2" as equal and would drop one.
             if ($wanted && !in_array($t['word'], $words, true) && mb_strlen($t['word'], "utf-8") > 1) {
                 $words[] = $t['word'];
             }
         }
     }
     $scws->close();
     if (count($words) > 2) {
         $order = array();
         $where = array("id" => array('neq', $id), "pid" => 0);
         $_where = array();
         foreach ($words as $w) {
             // mysql_escape_string() no longer exists as of PHP 7.0, so the
             // quoting is done with addslashes(), which is equally
             // connection-unaware. NOTE(review): the words come from stored
             // post content; bound parameters would be preferable if the
             // model layer offers them.
             $w = addslashes($w);
             $order[] = "(case when LOCATE('{$w}',content) > 0 then 1 else 0 end)";
             $_where[] = "content like '%{$w}%'";
         }
         $order = "add_time desc,(" . implode(" + ", $order) . ") desc";
         $_where = "(" . implode(" or ", $_where) . ")";
         $where["_string"] = $_where;
         $data = $mp->where($where)->field("id,member_nickname,content")->order($order)->limit(10)->select();
         $this->success($data);
     } else {
         $this->success();
     }
 }
 /**
  * Split text into a flat list of words using SCWS.
  *
  * Reference: http://www.xunsearch.com/scws/docs.php#pscws23
  *
  * Segmentation here is only a simple example; any segmentation service
  * could be substituted.
  *
  * @param string $text Text to segment.
  *
  * @return array Words in the order the segmenter returned them.
  */
 private function segment($text)
 {
     // Create the segmenter and set its charset.
     $segmenter = scws_new();
     $segmenter->set_charset('utf8');

     // Hand over the text to process.
     $segmenter->send_text($text);

     // Walk every batch and keep only the word of each token.
     $wordList = array();
     for ($batch = $segmenter->get_result(); $batch; $batch = $segmenter->get_result()) {
         foreach ($batch as $token) {
             $wordList[] = $token['word'];
         }
     }

     // Release the handle.
     $segmenter->close();

     return $wordList;
 }
Esempio n. 17
0
 /**
  * Segment content and return its keywords.
  *
  * @param string $content Text to segment.
  * @return array List of words; empty when get_words() returns no array.
  * @author yangguofeng
  */
 public static function getKeywords($content)
 {
     $scws = scws_new();
     $scws->set_charset('utf8');
     // set_dict and set_rule are not called here, so the dictionary and rule
     // file from the ini-configured path are expected to be picked up.
     $scws->send_text($content);
     // Extra plain-text dictionary layered on top.
     $scws->add_dict(ini_get("scws.default.fpath") . "/lianchuang.txt", SCWS_XDICT_TXT);
     $scws->set_ignore(TRUE);
     // Compound splitting (e.g. one word also returned as its parts) is off.
     $scws->set_multi(false);
     // Merging loose characters into two-character words is left disabled:
     // $scws->set_duality(true);
     $result = $scws->get_words('@');

     $words = [];
     if (is_array($result)) {
         foreach ($result as $entry) {
             $words[] = $entry['word'];
         }
     }

     $scws->close();

     return $words;
 }