/**
 * Extract the top keywords from a piece of text with the PSCWS4 segmenter.
 *
 * The raw filter is normalised by makeFilter() and passed to
 * PSCWS4::get_tops() together with a limit of 10; the outcome is then run
 * through makeAttr(). When the normalised filter is non-empty and is not
 * just "~", only entries whose 'attr' value appears in the comma-separated
 * filter list (with every "~" stripped) are kept.
 *
 * @param string $text   Text to analyse.
 * @param string $filter Filter condition; its accepted format is defined by makeFilter().
 * @return array Entries as produced by makeAttr() (returned unchanged when no
 *               attribute filter applies); an empty array when a filter is
 *               active and no entry matches it.
 */
public function keyword($text, $filter = '')
{
    $filter = $this->makeFilter($filter);

    vendor('pscws4.class#pscws4');
    $pscws = new PSCWS4();
    $pscws->set_charset('utf-8');
    $pscws->send_text($text);
    $some = $pscws->get_tops(10, $filter);
    $_result = $this->makeAttr($some);

    // No filter, or a bare "~": hand back everything makeAttr() produced.
    if (empty($filter) || $filter == "~") {
        return $_result;
    }

    // Start from an empty list so that "nothing matched" yields array()
    // rather than an undefined variable.
    $_res = array();
    $filter_reg = explode(',', str_replace('~', '', $filter));
    foreach ($_result as $v) {
        // NOTE(review): loose comparison kept as in the original because the
        // type of 'attr' coming out of makeAttr() is not visible here.
        if (in_array($v['attr'], $filter_reg)) {
            $_res[] = $v;
        }
    }

    return $_res;
}
发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 毛泽东北京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值 EOF; // require 'pscws4.class.php'; $cws = new PSCWS4(); $cws->set_charset('utf8'); $cws->set_dict('etc/dict.utf8.xdb'); $cws->set_rule('etc/rules.utf8.ini'); //$cws->set_multi(3); //$cws->set_ignore(true); //$cws->set_debug(true); //$cws->set_duality(true); $cws->send_text($text); if (php_sapi_name() != 'cli') { header('Content-Type: text/plain'); } echo "pscws version: " . $cws->version() . "\n"; echo "Segment result:\n\n"; while ($tmp = $cws->get_result()) { $line = ''; foreach ($tmp as $w) {
// Per the original note on this call: only Chinese, English letters and
// digits may be searched.
$keyword = Helper_Archive::pregReplace($keyword, 6);
$typeid = Helper_Archive::pregReplace($typeid, 2);
if (isset($totalresult)) {
    $pageno = Helper_Archive::pregReplace($pageno, 2);
    $totalresult = Helper_Archive::pregReplace($totalresult, 2);
}
addSearchkey($keyword); // record the keyword as a hot-search term

require dirname(__FILE__) . "/cloudsearch/pscws4.class.php";
$pscws = new PSCWS4('utf-8');
// Configure the segmenter. set_dict is required; set_rule is needed for
// smart recognition of person names and the like. Other available options
// include set_charset, set_ignore, set_multi, set_debug and set_duality.
$pscws->set_charset('utf-8');
$pscws->set_rule(dirname(__FILE__) . '/cloudsearch/rules.utf8.ini');
$pscws->set_dict(dirname(__FILE__) . '/cloudsearch/dict.utf8.xdb');
$pscws->send_text($keyword);

// Make sure the word list exists even when the segmenter yields nothing,
// otherwise the foreach below iterates over an undefined variable.
// Guarded with isset() because earlier parts of the script are not visible
// here and may already have populated it.
if (!isset($words)) {
    $words = array();
}
while ($some = $pscws->get_result()) {
    foreach ($some as $word) {
        $words[] = $word['word'];
    }
}

$where = "ishidden=0";
// Same reasoning as for $words: avoid appending to an undefined variable
// without clobbering a value set earlier in the script.
if (!isset($whereor)) {
    $whereor = '';
}
// NOTE(review): the segmented words are interpolated straight into the LIKE
// clauses. This relies entirely on the pregReplace() sanitisation of
// $keyword above; confirm it strips quotes and LIKE wildcards, or switch to
// parameterised queries.
foreach ($words as $k => $v) {
    $where .= " and title like '%{$v}%'";
    // Only words longer than one character contribute to the OR variant.
    if (mb_strlen($v, 'utf-8') > 1) {
        $whereor .= " or title like '%{$v}%'";
    }
}
/**
 * Segment a text with PSCWS4 using the Traditional Chinese UTF-8 dictionary
 * and rule set, dumping every resulting token with var_dump().
 *
 * According to the library usage notes that accompany this code:
 *  - the constructor argument is the character set (default gbk), which can
 *    also be changed afterwards through set_charset();
 *  - set_dict is mandatory, and set_rule is needed for smart recognition of
 *    person names and similar; further options are set_ignore, set_multi,
 *    set_debug and set_duality;
 *  - after send_text(), get_result() is called repeatedly until it returns
 *    false; each returned word is an associative array holding: word (the
 *    token), idf (inverse frequency weight), off (offset in the text),
 *    len (length) and attr (part of speech);
 *  - after send_text(), get_tops() can instead return the top N words by
 *    weight (commonly used for keyword extraction), each with word, weight,
 *    times and attr.
 *
 * Simplified Chinese would use the dict.sc.utf8.xdb / rules.utf8.ini pair
 * (UTF-8) or dict.sc.gbk.xdb / rules.ini (GBK) instead.
 *
 * @param string $text Text to segment.
 * @return void
 */
public function chinese_split($text)
{
    $segmenter = new PSCWS4('utf8');
    $segmenter->set_charset('utf8');
    // Traditional Chinese, UTF-8 dictionary and rules.
    $segmenter->set_dict('./pscws4/etc/dict.ct.utf8.xdb');
    $segmenter->set_rule('./pscws4/etc/rules_cht.utf8.ini');

    $segmenter->send_text($text);

    // Pull batches of tokens until the segmenter reports it has no more.
    $batch = $segmenter->get_result();
    while ($batch) {
        foreach ($batch as $token) {
            // Per-token processing hook; currently just dumps the token.
            var_dump($token);
        }
        $batch = $segmenter->get_result();
    }
}