/**
 * Configure the SCWS handle stored in $this->tool for UTF-8 segmentation.
 *
 * Loads the dictionary and rule files from the "etc/" directory under
 * self::ROOT, turns on punctuation ignoring, selects the
 * self::SCWS_MULTI_DUALITY multi-segmentation mode and switches the
 * duality option off.
 */
public function init()
{
    $handle = $this->tool;
    $etcDir = self::ROOT . "etc/";

    scws_set_charset($handle, "utf8");
    scws_set_dict($handle, $etcDir . "dict.utf8.xdb");
    scws_set_rule($handle, $etcDir . "rules.utf8.ini");

    // Strip punctuation from the results.
    scws_set_ignore($handle, true);

    // Alternative mode kept for reference: self::SCWS_MULTI_SHORT.
    scws_set_multi($handle, self::SCWS_MULTI_DUALITY);
    scws_set_duality($handle, false);
}
/**
 * Return the top-ranked words of a text as reported by scws_get_tops().
 *
 * The dictionary path is read from configuration: 'dictionay.<type>' when
 * $type is given, otherwise 'dictionay.defaultdic'. The rule file always comes
 * from 'dictionay.defaultrule'. ("dictionay" is the spelling of the existing
 * configuration keys and is kept as-is on purpose.)
 *
 * @param string $content input text to analyse
 * @param string $type    optional dictionary name; empty selects the default dictionary
 * @param int    $tops    maximum number of words to return
 * @return mixed the value returned by scws_get_tops()
 */
public static function keywords($content, $type = '', $tops = 10)
{
    // Resolve the dictionary and rule paths before opening a handle, so a
    // configuration error cannot leave an open SCWS handle behind.
    $dicpath = empty($type) ? C('dictionay.defaultdic') : C('dictionay.' . $type);
    $rules = C('dictionay.defaultrule');
    if (empty($dicpath) || empty($rules)) {
        throw_exception("Load dictionary node: dic file Error");
    }

    $obj = scws_open();
    scws_set_charset($obj, 'utf8');
    scws_set_dict($obj, $dicpath, SCWS_XDICT_TXT);
    scws_set_duality($obj, true);
    scws_set_rule($obj, $rules);
    scws_send_text($obj, $content);
    $result = scws_get_tops($obj, $tops);

    // Release the handle; the original never closed it.
    scws_close($obj);

    return $result;
}
/**
 * Collect the top words of several texts into one de-duplicated list.
 *
 * Each text in $arr is segmented on its own (php_scws extension when
 * available, otherwise the bundled PSCWS4 class) and its top $num words are
 * taken. The per-text results are then walked rank by rank (all first-ranked
 * words, then all second-ranked words, ...) and the first $num distinct words
 * in that order are returned.
 *
 * @param array $arr list of texts to segment
 * @param int   $num maximum words taken per text and maximum words returned
 * @return array list of words (strings); empty when $arr is empty
 */
public static function segments($arr, $num = 10)
{
    $list = array();
    // The original tested the undefined $text here, which made the function
    // return an empty list for every input.
    if (empty($arr)) {
        return $list;
    }

    $dictFile = APP_ROOT_PATH . 'system/scws/dict.utf8.xdb';
    $ruleFile = APP_ROOT_PATH . 'system/scws/rules.utf8.ini';
    $words = array();

    // Prefer the php_scws extension when it is installed.
    if (function_exists("scws_open")) {
        $sh = scws_open();
        scws_set_charset($sh, 'utf8');
        scws_set_dict($sh, $dictFile);
        scws_set_rule($sh, $ruleFile);
        scws_set_ignore($sh, true);
        foreach ($arr as $text) {
            scws_send_text($sh, $text);
            $words[] = scws_get_tops($sh, $num);
        }
        scws_close($sh);
    } else {
        require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
        $pscws = new PSCWS4();
        $pscws->set_dict($dictFile);
        $pscws->set_rule($ruleFile);
        $pscws->set_ignore(true);
        foreach ($arr as $text) {
            $pscws->send_text($text);
            $words[] = $pscws->get_tops($num);
        }
        $pscws->close();
    }

    // Tally words rank-major so the array's insertion order reflects rank.
    for ($i = 0; $i < $num; $i++) {
        foreach ($words as $item) {
            if (isset($item[$i])) {
                $word = $item[$i]['word'];
                if (isset($list[$word])) {
                    $list[$word]++;
                } else {
                    $list[$word] = 1;
                }
            }
        }
    }

    // Extract the keys before slicing: array_slice() renumbers integer keys,
    // so a numeric word such as "2012" would otherwise come back as 0, 1, ...
    // PHP stores such keys as integers, hence the cast back to string.
    return array_map('strval', array_slice(array_keys($list), 0, $num));
}
/**
 * Segment a text into words.
 *
 * Uses the php_scws extension when it is installed and falls back to the
 * bundled PSCWS4 class otherwise. Both back ends are configured with the same
 * dictionary and rule files and with punctuation ignoring enabled.
 *
 * @param string $text text to segment
 * @return array list of words in the order the segmenter produced them;
 *               empty when $text is empty
 */
public static function segmentAll($text)
{
    $list = array();
    if (empty($text)) {
        return $list;
    }

    // Single definition of both paths. The extension branch used to load the
    // rules from 'system/rules.utf8.ini', which disagreed with the fallback
    // branch and with the location of the dictionary file.
    $dictFile = APP_ROOT_PATH . 'system/scws/dict.utf8.xdb';
    $ruleFile = APP_ROOT_PATH . 'system/scws/rules.utf8.ini';

    // Prefer the php_scws extension when it is installed.
    if (function_exists("scws_open")) {
        $sh = scws_open();
        scws_set_charset($sh, 'utf8');
        scws_set_dict($sh, $dictFile);
        scws_set_rule($sh, $ruleFile);
        scws_set_ignore($sh, true);
        scws_send_text($sh, $text);
        // Results arrive in batches until a falsy value is returned.
        while ($words = scws_get_result($sh)) {
            foreach ($words as $word) {
                $list[] = $word['word'];
            }
        }
        scws_close($sh);
    } else {
        require_once APP_ROOT_PATH . 'system/scws/pscws4.class.php';
        $pscws = new PSCWS4();
        $pscws->set_dict($dictFile);
        $pscws->set_rule($ruleFile);
        $pscws->set_ignore(true);
        $pscws->send_text($text);
        while ($words = $pscws->get_result()) {
            foreach ($words as $word) {
                $list[] = $word['word'];
            }
        }
        $pscws->close();
    }

    return $list;
}
<?php
/*
 * Keyword index builder.
 *
 * Repeatedly fetches up to 10 rows of ve123_links whose key_status is 0,
 * extracts the top 20 words of each row's combined text with SCWS (GBK),
 * stores every word in ve123_links_keys and marks the row with key_status=1.
 * Stops with a completion message once no unprocessed rows remain.
 */
require 'global.php';
require '../include/db_config.php';

set_time_limit(0);

$con = @mysql_connect($dbhost, $dbuser, $dbpw) or die('error');
mysql_select_db($dbname);
// mysql_set_charset() replaces the raw "set names gbk" query: it also informs
// the client library of the charset, which mysql_real_escape_string() needs in
// order to escape multi-byte GBK input safely.
mysql_set_charset('gbk', $con);

while (1) {
    $sql = 'select link_id,title,keywords,description,fulltxt from ve123_links where key_status=0 limit 0,10';
    $result = mysql_query($sql);
    if ($result === false) {
        // A failed query used to fall through to the "finished" message.
        die('error');
    }
    if (mysql_num_rows($result) == 0) {
        die('“创建索引” 完成');
    }

    while ($row = mysql_fetch_assoc($result)) {
        $content = strip_tags(htmlspecialchars_decode($row['title'] . $row['keywords'] . $row['description'] . $row['fulltxt']));
        $content = preg_replace("/[\\s ]+/i", '', $content);
        $content = str_replace(' ', '', $content);

        $sh = scws_open();
        scws_set_charset($sh, 'gbk');
        scws_send_text($sh, $content);
        $top = scws_get_tops($sh, 20);
        // Release the handle; one is opened for every row.
        scws_close($sh);

        $linkId = mysql_real_escape_string($row['link_id'], $con);

        // Skip the inserts when the segmenter returned no list.
        if (is_array($top)) {
            foreach ($top as $keys) {
                // The words come from crawled page text, so escape them.
                $word = mysql_real_escape_string($keys['word'], $con);
                $sql2 = "insert into ve123_links_keys(keywords,link_id) values('" . $word . "','" . $linkId . "')";
                mysql_query($sql2);
            }
        }

        // If the row cannot be marked as processed the outer loop would
        // select it again forever, so stop instead.
        if (mysql_query("update ve123_links set key_status=1 where link_id='" . $linkId . "'") === false) {
            die('error');
        }
    }

    mysql_free_result($result);
}
/**
 * Segment a string and return its highest-frequency words, optionally
 * filtered by part of speech.
 *
 * All whitespace is removed from the input before it is segmented.
 *
 * @param string      $str   text to segment
 * @param int         $limit optional maximum number of words to return (default 10)
 * @param string|null $attr  optional comma-separated list of part-of-speech tags
 *                           a returned word must carry; a leading "~" inverts
 *                           the list (the word must NOT carry any of them).
 *                           null returns every part of speech unfiltered.
 * @return array|false false when the scws extension is not loaded; an empty
 *                     array when nothing is left after whitespace removal;
 *                     otherwise the result of scws_get_tops()
 */
function scwsTop($str, $limit = 10, $attr = null)
{
    if (function_exists('scws_open') === false) {
        return false;
    }

    $text = preg_replace("/[\\s\t\r\n]+/", '', $str);
    if (empty($text)) {
        return array();
    }

    $handle = scws_open();
    scws_set_charset($handle, 'utf8');
    scws_set_ignore($handle, true);
    scws_set_multi($handle, SCWS_MULTI_SHORT | SCWS_MULTI_DUALITY);
    scws_set_duality($handle, true);
    scws_send_text($handle, $text);
    $tops = scws_get_tops($handle, $limit, $attr);
    scws_close($handle);

    return $tops;
}
// SCWS demo script excerpt (UTF-8 variant). The if/else that this line
// finishes is opened outside the visible excerpt.
//  1. $str becomes either the result of $function($module) or a
//     "Module ... is not compiled into PHP" message, and is echoed.
//  2. $text is a heredoc holding a sample Chinese passage.
//  3. An SCWS handle is opened with charset "utf8"; the dictionary and rule
//     files are read from the directory named by the ini setting
//     scws.default.fpath. The scws_set_ignore()/scws_set_multi() calls are
//     commented out.
//  4. A table header is printed, then one row per entry returned by
//     scws_get_tops($cws, 10, "~v"): word, attr, weight and times.
//     A leading "~" negates the attribute filter; presumably "v" is the verb
//     tag, i.e. verbs are excluded - TODO confirm.
// NOTE(review): no scws_close($cws) is visible in this excerpt - confirm it
// follows later in the script.
$str = $function($module); } else { $str = "Module {$module} is not compiled into PHP"; } echo "{$str}\n\n"; $text = <<<EOF 陈凯歌并不是《无极》的唯一著作权人,一部电影的整体版权归电影制片厂所有。 一部电影的作者包括导演、摄影、编剧等创作人员,这些创作人员对他们的创作是有版权的。不经过制片人授权,其他人不能对电影做拷贝、发行、反映,不能通过网络来传播,既不能把电影改编成小说、连环画等其他艺术形式发表,也不能把一部几个小时才能放完的电影改编成半个小时就能放完的短片。 著作权和版权在我国是同一个概念,是法律赋予作品创作者的专有权利。所谓专有权利就是没有经过权利人许可又不是法律规定的例外,要使用这个作品,就必须经过作者授权,没有授权就是侵权。 一九八零年春天 EOF; $cws = scws_open(); scws_set_charset($cws, "utf8"); scws_set_dict($cws, ini_get('scws.default.fpath') . '/dict.utf8.xdb'); scws_set_rule($cws, ini_get('scws.default.fpath') . '/rules.utf8.ini'); //scws_set_ignore($cws, true); //scws_set_multi($cws, true); scws_send_text($cws, $text); echo "\n"; // top words printf("No. WordString Attr Weight(times)\n"); printf("-------------------------------------------------\n"); $list = scws_get_tops($cws, 10, "~v"); $cnt = 1; foreach ($list as $tmp) { printf("%02d. %-24.24s %-4.2s %.2f(%d)\n", $cnt, $tmp['word'], $tmp['attr'], $tmp['weight'], $tmp['times']); $cnt++; }
// SCWS demo script excerpt (GBK variant). The if/else that this line
// finishes is opened outside the visible excerpt.
//  1. $str becomes either the result of $function($module) or a
//     "Module ... is not compiled into PHP" message, and is echoed.
//  2. $text is a heredoc holding a sample Chinese passage.
//  3. An SCWS handle is opened with charset "gbk"; dict.xdb and rules.ini are
//     read from the directory named by the ini setting scws.default.fpath.
//     The scws_set_ignore()/scws_set_multi() calls are commented out.
//  4. "<pre>" is echoed, a table header is printed, then one row per entry
//     returned by scws_get_tops($cws, 10, "~v"): word, attr, weight, times.
//     A leading "~" negates the attribute filter; presumably "v" is the verb
//     tag, i.e. verbs are excluded - TODO confirm.
// NOTE(review): the heredoc is handed to SCWS as GBK, which assumes this
// script file itself is saved in GBK - confirm the file encoding.
// NOTE(review): neither a closing "</pre>" nor scws_close($cws) is visible in
// this excerpt - confirm they follow later in the script.
$str = $function($module); } else { $str = "Module {$module} is not compiled into PHP"; } echo "{$str}\n\n"; $text = <<<EOF 陈凯歌并不是《无极》的唯一著作权人,一部电影的整体版权归电影制片厂所有。 一部电影的作者包括导演、摄影、编剧等创作人员,这些创作人员对他们的创作是有版权的。不经过制片人授权,其他人不能对电影做拷贝、发行、反映,不能通过网络来传播,既不能把电影改编成小说、连环画等其他艺术形式发表,也不能把一部几个小时才能放完的电影改编成半个小时就能放完的短片。 著作权和版权在我国是同一个概念,是法律赋予作品创作者的专有权利。所谓专有权利就是没有经过权利人许可又不是法律规定的例外,要使用这个作品,就必须经过作者授权,没有授权就是侵权。 一九八零年春天 EOF; $cws = scws_open(); scws_set_charset($cws, "gbk"); scws_set_dict($cws, ini_get('scws.default.fpath') . '/dict.xdb'); scws_set_rule($cws, ini_get('scws.default.fpath') . '/rules.ini'); //scws_set_ignore($cws, true); //scws_set_multi($cws, true); scws_send_text($cws, $text); echo "<pre>\n"; // top words printf("No. WordString Attr Weight(times)\n"); printf("-------------------------------------------------\n"); $list = scws_get_tops($cws, 10, "~v"); $cnt = 1; foreach ($list as $tmp) { printf("%02d. %-24.24s %-4.2s %.2f(%d)\n", $cnt, $tmp['word'], $tmp['attr'], $tmp['weight'], $tmp['times']); $cnt++; }