/**
 * Segment a search phrase and return the words as a quoted, pipe-separated list.
 *
 * The native scws extension is used when it is loaded; otherwise the bundled
 * pure-PHP PSCWS4 class is loaded from APP_PATH and used instead.
 *
 * @param string $key Raw search phrase; spaces, tabs and line breaks are removed before segmentation.
 * @return string Words formatted as "w1"|"w2"|..., or an empty string when no word was produced.
 */
public function scws($key)
{
    // The extension exposes scws_new(); probing for a function named "scws"
    // never matched, so the native branch was unreachable.
    if (function_exists('scws_new')) {
        $so = scws_new();
        $so->set_charset('utf-8');
        $so->add_dict(ini_get('scws.default.fpath') . '/dict.utf8.xdb');
        // Custom dictionary (currently disabled):
        //$so->add_dict(APP_PATH . '/library/dict/scws.txt', SCWS_XDICT_TXT);
        $so->set_rule(ini_get('scws.default.fpath') . '/rules.utf8.ini');
        $so->set_ignore(true);
        $so->set_multi(true);
        $so->set_duality(true);
    } else {
        require_once APP_PATH . '/library/scws/pscws4.class.php';
        $so = new PSCWS4('utf-8');
        $so->set_dict(APP_PATH . '/library/scws/etc/dict.utf8.xdb');
        $so->set_rule(APP_PATH . '/library/scws/etc/rules.utf8.ini');
        $so->set_multi(true);
        $so->set_ignore(true);
        $so->set_duality(true);
    }

    // Strip whitespace so it does not influence segmentation.
    $keys = str_replace(array(" ", " ", "\t", "\n", "\r"), '', $key);
    $so->send_text($keys);

    // get_result() hands back the words in batches and returns false once the
    // text is exhausted, so it has to be drained in a loop.
    $quoted = array();
    while ($batch = $so->get_result()) {
        foreach ($batch as $v) {
            $quoted[] = '"' . $v['word'] . '"';
        }
    }
    $so->close();

    return implode('|', $quoted);
}
/**
 * Create the SCWS handle and apply the given configuration to it.
 *
 * Every entry of $config is applied by calling the handle method named
 * "set_" followed by the entry's key, with the entry's value as the only argument.
 *
 * @param array $config Map of setting name (without the "set_" prefix) to value.
 */
public function __construct($config)
{
    $this->scws = \scws_new();

    foreach ($config as $key => $value) {
        $setter = 'set_' . $key;
        $this->scws->$setter($value);
    }
}
/**
 * Create the SCWS handle and configure it with the UTF-8 rule set and
 * dictionary found under the "scws.default.fpath" ini directory.
 */
public function __construct()
{
    $basePath = ini_get('scws.default.fpath');

    $handle = scws_new();
    $this->scws = $handle;

    $handle->set_charset('utf8');
    $handle->set_rule($basePath . '/rules.utf8.ini');
    $handle->set_dict($basePath . '/dict.utf8.xdb');
    $handle->set_duality(true);
    $handle->set_ignore(true);
}
/**
 * Create the SCWS handle, applying the SCWS_DICT and SCWS_RULE constants
 * when the application has defined them.
 */
function __construct()
{
    $this->_cws = scws_new();

    // Each constant is optional; nothing is set for one that is not defined.
    if (defined('SCWS_DICT')) {
        $this->_cws->set_dict(SCWS_DICT);
    }

    if (defined('SCWS_RULE')) {
        $this->_cws->set_rule(SCWS_RULE);
    }
}
/**
 * Store a new topic in Redis and register it in the related lists and indexes.
 *
 * Steps, in order: allocate a topic id, index the topic under its segmented
 * keywords (only when the scws extension is available), push the id onto the
 * global, board and parent-board lists, add it to the board's time-ordered
 * sorted set, and finally write the topic hash.
 *
 * @param mixed  $uid     Author id stored in the topic hash.
 * @param mixed  $bkid    Board id the topic is posted to.
 * @param string $title   Topic title.
 * @param string $content Topic body.
 * @return mixed The $bkid argument, unchanged.
 */
private function postTopicR($uid, $bkid, $title, $content)
{
    $redis = $this->redis;
    $prefix = $this->tablePre;

    // Allocate the id for the new topic.
    $topicid = $redis->hIncrBy($prefix . 'counter', 'topicid', 1);

    // Extract keywords from title and content and index the topic under each one.
    if (function_exists('scws_new')) {
        $segmenter = scws_new();
        $segmenter->set_charset('utf-8');
        $keywords = array();
        $segmenter->set_dict($this->dictPath);

        foreach (array($title, $content) as $text) {
            $segmenter->send_text($text);
            while ($batch = $segmenter->get_result()) {
                foreach ($batch as $token) {
                    // Single-character words are not indexed.
                    if (mb_strlen($token['word'], 'UTF-8') > 1) {
                        $keywords[] = $token['word'];
                    }
                }
            }
        }
        $segmenter->close();

        $keywords = array_unique($keywords);

        // Lua script that adds the topic to every keyword's sorted set in one call.
        // The last two arguments are the key prefix and the topic id.
        $script = 'local arrLength=#(ARGV) local topicid=ARGV[arrLength] local tablePre=ARGV[arrLength-1] for i=1, (arrLength-2) do redis.call(\'zadd\',tablePre..\'keywords:\'..ARGV[i]..\':topiclist\',topicid,topicid) end';
        $keywords[] = $prefix;
        $keywords[] = $topicid;
        $redis->eval($script, $keywords);
    }

    // Global list of newest topics.
    $redis->lPush($prefix . 'new_posts', $topicid);

    // Newest topics of the board itself.
    $boardKey = $prefix . 'bkid:' . $bkid;
    $redis->lPush($boardKey . ':new_posts', $topicid);

    // Record the topic on the board's parent node.
    $pid = $redis->hGet($boardKey . ':bkinfo', 'pid');
    $redis->lPush($prefix . 'bkid:' . $pid . ':child_posts', $topicid);

    // Board topics ordered by posting time.
    $postTime = time();
    $redis->zAdd($boardKey . ':posts', $postTime, $topicid);

    // Topic details.
    $topicInfo = array(
        'topicid' => $topicid,
        'bkid' => $bkid,
        'uid' => $uid,
        'title' => $title,
        'content' => $content,
        'post_time' => $postTime,
        'last_update' => 0,
        'view_num' => 0,
        'reply_num' => 0,
        'score' => 0,
        'score_uid' => 0,
        'score_comment' => 0,
    );
    $redis->hMset($prefix . 'topicid:' . $topicid . ':info', $topicInfo);

    return $bkid;
}
/**
 * Segment $content against a plain-text dictionary selected by $type.
 *
 * The dictionary path is read from configuration: the "dictionay.defaultdic"
 * entry when $type is empty, otherwise the "dictionay.<type>" entry.
 * throw_exception() is invoked when that entry is empty.
 *
 * @param string $content Text to segment.
 * @param string $type    Dictionary selector; empty selects the default dictionary.
 * @return mixed Whatever the first get_result() call returns for $content.
 */
public static function wordType($content, $type = '')
{
    $obj = scws_new();

    if (empty($type)) {
        $dicpath = C('dictionay.defaultdic');
    } else {
        $dicpath = C('dictionay.' . $type);
    }

    if (empty($dicpath)) {
        throw_exception("Load dictionary node: dic file Error");
    }

    $obj->set_charset('utf8');
    // The dictionary is loaded as a plain-text dictionary.
    $obj->set_dict($dicpath, SCWS_XDICT_TXT);
    $obj->send_text($content);

    $result = $obj->get_result();
    $obj->close();

    return $result;
}
/**
 * Create the SCWS handle and apply the dictionary and rule files configured
 * under "search.scws.dict" and "search.scws.rule".
 *
 * Each setting is optional and is applied only when its own value is set.
 */
public function __construct()
{
    $this->_cws = scws_new();

    $scws_dict = config::get('search.scws.dict', null);
    $scws_rule = config::get('search.scws.rule', null);

    if ($scws_dict) {
        $this->_cws->set_dict($scws_dict);
    }

    // Guard on the rule value itself. The previous check tested the dictionary
    // setting, so a configured rule file was skipped whenever no dictionary was
    // configured, and a null rule was passed whenever only the dictionary was.
    if ($scws_rule) {
        $this->_cws->set_rule($scws_rule);
    }

    // Multi-segmentation mode (1-15), currently disabled:
    //$this->_cws->set_multi(8);
}
/**
 * Create and configure the segmenter, then hand it the initial text.
 *
 * The native scws extension is used when available; otherwise the bundled
 * pure-PHP PSCWS4 library is loaded and used in its place.
 *
 * @param string $text Text passed on to setText().
 */
public function init($text = '')
{
    if (!function_exists('scws_new')) {
        // Extension not enabled on this server: fall back to the PHP implementation.
        require_cache('./addons/libs/scws/pscws4.class.php');
        $this->scws = new PSCWS4('utf8');
    } else {
        $this->scws = scws_new('utf8');
    }

    $segmenter = $this->scws;
    $segmenter->set_charset('utf8');
    $segmenter->set_dict($this->dict);
    $segmenter->set_rule($this->rule);

    $this->setText($text);
}
/**
 * Lazily create and configure the shared SCWS handle.
 *
 * Does nothing when the handle already exists. Otherwise the handle is
 * created and configured from DBConfig::$SCWS.
 *
 * @return bool True when a handle is available, false when scws_new() did not return one.
 */
private static function _init()
{
    if (self::$_SCWS) {
        return true;
    }

    self::$_SCWS = scws_new();
    if (!self::$_SCWS) {
        return false;
    }

    $handle = self::$_SCWS;
    $settings = DBConfig::$SCWS;

    $handle->set_charset('utf8');
    // UTF-8 rule file.
    $handle->set_rule($settings['rule']);
    // Ignore punctuation.
    $handle->set_ignore(true);
    // Default dictionary, followed by the custom plain-text dictionary.
    $handle->set_dict($settings['default_dict']);
    $handle->add_dict($settings['my_dict'], SCWS_XDICT_TXT);

    return true;
}
/**
 * Segment $text and render the result as space-separated strings.
 *
 * Two renderings are built: the plain words, and the words each followed by
 * Segmentor::$speech_separator and the filtered part-of-speech attribute.
 * Both are passed through _ignore_stopword() before being returned.
 *
 * @param string    $text        Text to segment.
 * @param bool|null $with_speech Null returns both renderings, exactly FALSE returns
 *                               only the plain text, any other value returns only
 *                               the rendering with part-of-speech tags.
 * @return array|string array('text' => ..., 'speech' => ...) when $with_speech is null,
 *                      otherwise the selected string.
 */
public function text_to_segment($text, $with_speech = NULL)
{
    $cws = scws_new();
    $cws->set_charset($this->charset);
    $cws->set_dict($this->dict);
    $cws->set_rule($this->rule);
    $cws->send_text($text);

    $outputText = "";
    $outputSpeech = "";
    // get_result() returns the words in batches until the text is exhausted.
    while ($tmp = $cws->get_result()) {
        foreach ($tmp as $w) {
            $outputText .= $w["word"] . " ";
            $outputSpeech .= $w['word'] . Segmentor::$speech_separator . $this->_speech_filter($w['attr']) . ' ';
        }
    }
    // Release the handle explicitly instead of waiting for it to go out of scope.
    $cws->close();

    $outputText = $this->_ignore_stopword($outputText);
    $outputSpeech = $this->_ignore_stopword($outputSpeech);

    if (is_null($with_speech)) {
        return array('text' => $outputText, 'speech' => $outputSpeech);
    }

    return (FALSE === $with_speech) ? $outputText : $outputSpeech;
}
} // 是否只看统计表 if (isset($_REQUEST['stats']) && !strcmp($_REQUEST['stats'], 'yes')) { $stats = true; $checked_stats = ' checked'; } $xattr =& $_REQUEST['xattr']; if (!isset($xattr)) { $xattr = '~v'; } $limit =& $_REQUEST['limit']; if (!isset($limit)) { $limit = 10; } // do the segment $cws = scws_new(); $cws->set_charset('utf8'); // // use default dictionary & rules // $cws->set_rule(ini_get('scws.default.fpath') . '/rules_cht.utf8.ini'); $cws->set_dict(ini_get('scws.default.fpath') . '/dict_cht.utf8.xdb'); $cws->set_ignore($ignore); $cws->send_text($mydata); ?> <html> <head> <meta http-equiv="Content-type" content="text/html; charset=utf-8"> <title>PHP简易中文分词(SCWS) 第4版在线演示 (by hightman)</title> <style type="text/css"> <!--
/**
 * Extract the top keywords of a text with SCWS.
 *
 * @param string $words Text to analyse.
 * @return array Result of get_tops(): at most 10 entries restricted to the listed word attributes.
 */
public static function scws($words)
{
    $basePath = ini_get('scws.default.fpath');

    $so = scws_new();
    $so->set_charset('utf-8');

    // Default dictionary.
    $so->set_dict($basePath . '/dict.utf8.xdb');
    // Optional extra plain-text dictionary (disabled):
    // $extra_path = ini_get('scws.default.fpath') . '/dict_extra.txt';
    // if(file_exists($extra_path)){
    //     $so->add_dict($extra_path,SCWS_XDICT_TXT);
    // }

    // Default rule set.
    $so->set_rule($basePath . '/rules.utf8.ini');

    // Drop special punctuation from the results.
    $so->set_ignore(true);

    // Compound splitting, e.g. a three-character word may also yield its parts.
    // The argument is a bitwise OR of 1 | 2 | 4 | 8, meaning
    // short words | duality | main single characters | all single characters
    // (SCWS_MULTI_SHORT, SCWS_MULTI_DUALITY, SCWS_MULTI_ZMAIN, SCWS_MULTI_ZALL).
    $so->set_multi(7);

    // Do not merge stray single characters into two-character words.
    $so->set_duality(false);

    $so->send_text($words);

    // Take the top 10 words whose attribute is in the list.
    $result = $so->get_tops(10, "XB,XZ,XS,nt,nz,an,ng,vn,i,j,n,v");
    $so->close();

    return $result;
}
/**
 * Split Chinese text into words.
 *
 * @param string $content Text to split.
 * @param int    $mode    Compound-splitting mode passed to set_multi(); one of
 *                        SCWS_MULTI_SHORT, SCWS_MULTI_DUALITY, SCWS_MULTI_ZMAIN,
 *                        SCWS_MULTI_ZALL (short words, duality, main single
 *                        characters, all single characters).
 *
 * @return array|mixed List of words; $content itself is returned unchanged when it
 *                     is empty or when the scws extension is not available.
 */
public static function cnSplit($content, $mode = SCWS_MULTI_ZALL)
{
    if (empty($content)) {
        return $content;
    }
    if (!function_exists('scws_new')) {
        return $content;
    }

    $segmenter = scws_new();
    $segmenter->set_charset('utf8');
    $segmenter->send_text($content);
    $segmenter->set_multi($mode);

    $splits = array();
    while ($batch = $segmenter->get_result()) {
        foreach ($batch as $token) {
            $splits[] = $token['word'];
        }
    }
    $segmenter->close();

    return $splits;
}
/**
 * Segment a string and return every token record produced by SCWS.
 *
 * No charset, dictionary or rule file is set here, so the handle runs with
 * whatever defaults it was created with.
 *
 * @param string $str Text to segment.
 * @return array List of the token records returned by get_result(), in order.
 */
function cws($str)
{
    $segmenter = scws_new();
    // NOTE(review): the SCWS documentation describes set_multi() modes as 1-15;
    // confirm what 16 is meant to select here.
    $segmenter->set_multi(16);
    $segmenter->set_ignore(true);
    $segmenter->set_duality(true);
    $segmenter->send_text($str);

    $tokens = array();
    while ($batch = $segmenter->get_result()) {
        foreach ($batch as $token) {
            $tokens[] = $token;
        }
    }

    return $tokens;
}
/**
 * Respond with up to 10 top-level posts whose content shares keywords with a given post.
 *
 * The post id is read from the "id" GET parameter. The post content is stripped
 * of tags and segmented; distinct words longer than one character whose attribute
 * is "en", "n" or "r" are used as keywords. A search is only performed when more
 * than two keywords were found; otherwise an empty success response is sent.
 */
public function talk_similarity_get()
{
    $this->check_token();
    $mp = M("member_post");

    $scws = scws_new();
    $scws->set_charset('utf8');
    $scws->set_dict(ini_get('scws.default.fpath') . '/dict.utf8.xdb');
    $scws->set_rule(ini_get('scws.default.fpath') . '/rules.utf8.ini');
    $scws->set_multi(1);
    $scws->set_ignore(true);

    $id = I('get.id', 0, 'intval');
    $centent = strip_tags($mp->getFieldById($id, "content"));
    $scws->send_text($centent);

    // Collect the distinct keywords from all result batches.
    $words = array();
    $wantedAttrs = array("en", "n", "r");
    while ($tmp = $scws->get_result()) {
        foreach ($tmp as $t) {
            // Strict comparison so numeric-looking words such as "10" and "1e1"
            // are not treated as duplicates of each other.
            if (in_array($t['attr'], $wantedAttrs, true)
                && !in_array($t['word'], $words, true)
                && mb_strlen($t['word'], "utf-8") > 1
            ) {
                $words[] = $t['word'];
            }
        }
    }

    if (count($words) > 2) {
        $order = array();
        $where = array("id" => array('neq', $id), "pid" => 0);
        $_where = array();
        foreach ($words as $w) {
            // mysql_escape_string() was removed in PHP 7; keep using it where it
            // still exists and fall back to addslashes() elsewhere so the words
            // are always escaped before being placed inside the SQL literals.
            $w = function_exists('mysql_escape_string') ? mysql_escape_string($w) : addslashes($w);
            $order[] = "(case when LOCATE('{$w}',content) > 0 then 1 else 0 end)";
            $_where[] = "content like '%{$w}%'";
        }
        // Newest first, then by the number of keywords the content contains.
        $order = "add_time desc,(" . implode(" + ", $order) . ") desc";
        $_where = "(" . implode(" or ", $_where) . ")";
        $where["_string"] = $_where;
        $data = $mp->where($where)->field("id,member_nickname,content")->order($order)->limit(10)->select();
        $this->success($data);
    } else {
        $this->success();
    }
}
/**
 * Split text into words with SCWS (see http://www.xunsearch.com/scws/docs.php#pscws23).
 *
 * Segmentation here is only a simple example; any segmentation service could be used instead.
 *
 * @param string $text Text to segment.
 *
 * @return array List of the words found, in order.
 */
private function segment($text)
{
    // Create the handle and select the character set.
    $segmenter = scws_new();
    $segmenter->set_charset('utf8');

    // Feed the text, then drain the result batches, keeping only each word.
    $segmenter->send_text($text);
    $words = array();
    while ($batch = $segmenter->get_result()) {
        foreach ($batch as $token) {
            $words[] = $token['word'];
        }
    }

    $segmenter->close();

    return $words;
}
/**
 * Segment the content and return its keywords.
 *
 * @param string $content Text to analyse.
 * @return array List of the "word" values from get_words('@'); empty when it returns no array.
 * @author yangguofeng
 */
public static function getKeywords($content)
{
    $scws = scws_new();
    $scws->set_charset('utf8');
    // set_dict / set_rule are not called here, so the dictionary and rule
    // files are whatever the handle uses by default.
    $scws->send_text($content);
    // Additional plain-text dictionary.
    $scws->add_dict(ini_get("scws.default.fpath") . "/lianchuang.txt", SCWS_XDICT_TXT);
    $scws->set_ignore(TRUE);
    // No compound splitting.
    $scws->set_multi(false);
    // Two-character aggregation of stray characters (disabled):
    // $scws->set_duality(true);

    $result = $scws->get_words('@');

    $words = [];
    if (is_array($result)) {
        foreach ($result as $entry) {
            $words[] = $entry['word'];
        }
    }

    $scws->close();

    return $words;
}