/**
 * Static method extractTags
 *
 * Scores the tokens of $content by term frequency multiplied by an IDF
 * weight and returns the highest-scoring ones.
 *
 * Tokens come from Jieba::cut(). Each token is trimmed, and tokens shorter
 * than two characters (counted as UTF-8) are ignored. The term frequency is
 * the token's count divided by the number of tokens that were kept. The IDF
 * weight is read from self::$idf_freq, falling back to self::$max_idf for
 * tokens that have no entry there.
 *
 * @param string $content # text to extract tags from
 * @param int    $top_k   # maximum number of tags to return
 * @param array  $options # merged over array('mode' => 'default'); not read afterwards
 *
 * @return array $tags # token => score, ordered by descending score
 */
public static function extractTags($content, $top_k = 20, $options = array())
{
    // Kept for interface compatibility; no option is consulted below.
    $options = array_merge(array('mode' => 'default'), $options);

    // Count how often each usable token occurs.
    $counts = array();
    $kept = 0.0;
    foreach (Jieba::cut($content) as $token) {
        $token = trim($token);
        if (mb_strlen($token, 'UTF-8') >= 2) {
            $counts[$token] = (isset($counts[$token]) ? $counts[$token] : 0.0) + 1.0;
            $kept += 1.0;
        }
    }

    // $kept is non-zero whenever $counts has entries, so the division is safe.
    $scores = array();
    foreach ($counts as $term => $count) {
        $weight = isset(self::$idf_freq[$term]) ? self::$idf_freq[$term] : self::$max_idf;
        $scores[$term] = ($count / $kept) * $weight;
    }

    // Highest score first, keeping the token keys intact.
    arsort($scores);

    return array_slice($scores, 0, $top_k, true);
}
/**
 * Static method __cutDAG
 *
 * Splits $sentence into tagged words by walking Jieba::$route.
 *
 * At each character offset, the first key of Jieba::$route[offset] plus one
 * is taken as the end of the next segment. One-character segments are
 * collected in a buffer. A longer segment first flushes that buffer and is
 * then emitted with its tag from self::$word_tag, or "x" when it has none.
 * Flushing emits a one-character buffer with its own tag lookup and hands a
 * longer buffer to self::__cutDetail(), appending whatever that returns.
 *
 * @param string $sentence # input sentence
 * @param array  $options  # merged over array('mode' => 'default'); not read afterwards
 *
 * @return array $words # list of array("word" => ..., "tag" => ...) entries
 */
public static function __cutDAG($sentence, $options = array())
{
    // Kept for interface compatibility; no option is consulted below.
    $options = array_merge(array('mode' => 'default'), $options);

    $length = mb_strlen($sentence, 'UTF-8');
    $dag = Jieba::getDAG($sentence);
    // Presumably fills Jieba::$route for this sentence; confirm against Jieba::calc.
    Jieba::calc($sentence, $dag);

    $result = array();
    $pending = '';

    for ($pos = 0; $pos < $length; $pos = $end) {
        $routeKeys = array_keys(Jieba::$route[$pos]);
        $end = $routeKeys[0] + 1;
        $segment = mb_substr($sentence, $pos, $end - $pos, 'UTF-8');

        // Single characters are buffered so that runs of them can be
        // re-examined together later.
        if ($end - $pos == 1) {
            $pending .= $segment;
            continue;
        }

        // A multi-character word ends the current run: flush it first.
        $pendingLength = mb_strlen($pending, 'UTF-8');
        if ($pendingLength == 1) {
            $result[] = array(
                "word" => $pending,
                "tag" => isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : "x",
            );
        } elseif ($pendingLength > 1) {
            foreach (self::__cutDetail($pending) as $piece) {
                $result[] = $piece;
            }
        }
        $pending = '';

        $result[] = array(
            "word" => $segment,
            "tag" => isset(self::$word_tag[$segment]) ? self::$word_tag[$segment] : "x",
        );
    }

    // Flush whatever single characters remain after the last word.
    $pendingLength = mb_strlen($pending, 'UTF-8');
    if ($pendingLength == 1) {
        $result[] = array(
            "word" => $pending,
            "tag" => isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : "x",
        );
    } elseif ($pendingLength > 1) {
        foreach (self::__cutDetail($pending) as $piece) {
            $result[] = $piece;
        }
    }

    return $result;
}