Esempio n. 1
0
 /**
  * Rank the words of a text by TF-IDF weight and return the best ones.
  *
  * The content is segmented with Jieba::cut(); each token is trimmed and
  * tokens shorter than two characters are ignored. A word's term frequency
  * is its count divided by the number of kept tokens, and it is multiplied
  * by the word's entry in self::$idf_freq (self::$max_idf when the word has
  * no entry).
  *
  * @param string  $content  # text to extract tags from
  * @param int     $top_k    # maximum number of tags to return
  * @param array   $options  # merged over array('mode' => 'default'); not otherwise read here
  *
  * @return array $tags # word => weight, highest weight first
  */
 public static function extractTags($content, $top_k = 20, $options = array())
 {
     $options = array_merge(array('mode' => 'default'), $options);

     // Count every kept token and remember how many were kept overall.
     $counts = array();
     $kept = 0.0;
     foreach (Jieba::cut($content) as $token) {
         $token = trim($token);
         if (mb_strlen($token, 'UTF-8') >= 2) {
             $counts[$token] = (isset($counts[$token]) ? $counts[$token] : 0.0) + 1.0;
             $kept += 1.0;
         }
     }

     // Weight = (count / kept tokens) * idf. When nothing was kept the
     // loop body never runs, so $kept is never used as a zero divisor.
     $weights = array();
     foreach ($counts as $term => $count) {
         $idf = isset(self::$idf_freq[$term]) ? self::$idf_freq[$term] : self::$max_idf;
         $weights[$term] = ($count / $kept) * $idf;
     }

     // Descending by weight, keeping the word keys through sort and slice.
     arsort($weights);

     return array_slice($weights, 0, $top_k, true);
 }
Esempio n. 2
0
 /**
  * Split a sentence into tagged words by walking Jieba::$route.
  *
  * Segments longer than one character are emitted directly, tagged from
  * self::$word_tag ("x" when the word has no entry). Runs of consecutive
  * one-character segments are buffered: a buffer of exactly one character
  * is tagged the same way, and a longer buffer is handed to
  * self::__cutDetail(), whose results are appended unchanged.
  *
  * @param string $sentence # sentence to segment
  * @param array  $options  # merged over array('mode' => 'default'); not otherwise read here
  *
  * @return array $words # list of entries; those built here are array("word" => ..., "tag" => ...)
  */
 public static function __cutDAG($sentence, $options = array())
 {
     $options = array_merge(array('mode' => 'default'), $options);

     $length = mb_strlen($sentence, 'UTF-8');
     $graph = Jieba::getDAG($sentence);
     // Return value is not used; presumably this fills Jieba::$route,
     // which the loop below reads -- confirm against Jieba::calc.
     Jieba::calc($sentence, $graph);

     $result = array();
     $pending = '';
     for ($start = 0; $start < $length; $start = $end) {
         // The first key of the route entry is the index of the segment's last character.
         $route_keys = array_keys(Jieba::$route[$start]);
         $end = $route_keys[0] + 1;
         $segment = mb_substr($sentence, $start, $end - $start, 'UTF-8');

         if ($end - $start == 1) {
             // Single characters are accumulated and resolved later as a group.
             $pending .= $segment;
             continue;
         }

         // A multi-character segment ends the current run: flush the buffer first.
         $pending_length = mb_strlen($pending, 'UTF-8');
         if ($pending_length > 0) {
             if ($pending_length > 1) {
                 foreach (self::__cutDetail($pending) as $piece) {
                     $result[] = $piece;
                 }
             } else {
                 $pending_tag = isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : "x";
                 $result[] = array("word" => $pending, "tag" => $pending_tag);
             }
             $pending = '';
         }

         $segment_tag = isset(self::$word_tag[$segment]) ? self::$word_tag[$segment] : "x";
         $result[] = array("word" => $segment, "tag" => $segment_tag);
     }

     // Flush whatever single characters remain after the last segment.
     $pending_length = mb_strlen($pending, 'UTF-8');
     if ($pending_length > 1) {
         foreach (self::__cutDetail($pending) as $piece) {
             $result[] = $piece;
         }
     } elseif ($pending_length > 0) {
         $pending_tag = isset(self::$word_tag[$pending]) ? self::$word_tag[$pending] : "x";
         $result[] = array("word" => $pending, "tag" => $pending_tag);
     }

     return $result;
 }