/**
 * Static method __cutDAG
 *
 * Segments a sentence in three steps:
 *   1. Build a DAG: for every start offset $i, collect the end offsets $j
 *      for which the characters $i..$j form a key in self::$trie that is
 *      marked with an "end" entry.
 *   2. Hand the sentence and the DAG to self::calc(). The code then reads
 *      self::$route, so calc() is expected to populate it.
 *   3. Walk self::$route from offset 0. Multi-character words are emitted
 *      directly; runs of consecutive single characters are buffered and
 *      passed to Finalseg::__cut() (a run of exactly one character is
 *      emitted unchanged).
 *
 * @param string $sentence # input sentence (handled as UTF-8)
 * @param array  $options  # other options; merged over the default
 *                           array('mode' => 'default') but not otherwise
 *                           read by this method
 *
 * @return array $words # word strings in sentence order
 */
public static function __cutDAG($sentence, $options = array())
{
    // The merged options are not consulted below; the merge is kept so the
    // method reacts to a malformed $options argument exactly as before.
    $defaults = array('mode' => 'default');
    $options = array_merge($defaults, $options);

    $words = array();
    $N = mb_strlen($sentence, 'UTF-8');

    // ---- Step 1: build the DAG -------------------------------------
    $i = 0;             // start offset of the candidate word
    $j = 0;             // offset of the character currently being appended
    $DAG = array();
    $word_c = array();  // characters of the candidate word so far

    while ($i < $N) {
        $c = mb_substr($sentence, $j, 1, 'UTF-8');

        // Trie keys are the word's characters joined with '.'.
        if (count($word_c) == 0) {
            $next_word_key = $c;
        } else {
            $next_word_key = implode('.', $word_c) . '.' . $c;
        }

        if (!self::$trie->exists($next_word_key)) {
            // Nothing in the trie extends this prefix: move to the next start.
            $word_c = array();
            $i += 1;
            $j = $i;
            continue;
        }

        $word_c[] = $c;
        $node = self::$trie->get($next_word_key);

        // A node carrying an "end" entry marks a complete word. The loose
        // array comparison is kept as-is because it also matches a node
        // whose "end" value is null, which isset() alone would miss.
        $is_word_end = $node == array("end" => "")
            || isset($node["end"])
            || isset($node[0]["end"]);
        if ($is_word_end) {
            $DAG[$i][] = $j;
        }

        $j += 1;
        if ($j >= $N) {
            // Ran off the end of the sentence: restart from the next offset.
            $word_c = array();
            $i += 1;
            $j = $i;
        }
    }

    // Every offset must have at least one edge: fall back to the single
    // character at that offset.
    for ($i = 0; $i < $N; $i++) {
        if (!isset($DAG[$i])) {
            $DAG[$i] = array($i);
        }
    }

    // ---- Step 2: compute the route ---------------------------------
    self::calc($sentence, $DAG);

    // ---- Step 3: walk the route and emit words ---------------------
    $x = 0;
    $buf = '';  // pending run of consecutive single-character segments

    while ($x < $N) {
        // The first key of self::$route[$x] is used as the inclusive end
        // offset of the word that starts at $x.
        $current_route_keys = array_keys(self::$route[$x]);
        $y = $current_route_keys[0] + 1;
        $l_word = mb_substr($sentence, $x, $y - $x, 'UTF-8');

        if ($y - $x == 1) {
            $buf .= $l_word;
        } else {
            foreach (self::flushSingleCharBuffer($buf) as $word) {
                $words[] = $word;
            }
            $buf = '';
            $words[] = $l_word;
        }

        $x = $y;
    }

    // Emit whatever single-character run is still pending at the end.
    foreach (self::flushSingleCharBuffer($buf) as $word) {
        $words[] = $word;
    }

    return $words;
}

/**
 * Turns a buffered run of single-character segments into words.
 *
 * @param string $buf # concatenated single characters (UTF-8), may be empty
 *
 * @return array # empty array for an empty buffer, the buffer itself for a
 *                 one-character buffer, otherwise the result of
 *                 Finalseg::__cut($buf)
 */
private static function flushSingleCharBuffer($buf)
{
    $length = mb_strlen($buf, 'UTF-8');

    if ($length == 0) {
        return array();
    }
    if ($length == 1) {
        return array($buf);
    }

    return Finalseg::__cut($buf);
}
#!/usr/bin/php
<?php
/**
 * demo_extract_tags.php
 *
 * Command-line demo: initialises Jieba, Finalseg and JiebaAnalyse, reads
 * dict/lyric.txt and dumps the top tags that JiebaAnalyse::extractTags()
 * returns for it.
 *
 * PHP version 5
 *
 * @category PHP
 * @package  /src/cmd/
 * @author   Fukuball Lin <*****@*****.**>
 * @license  MIT Licence
 * @version  GIT: <fukuball/jieba-php>
 * @link     https://github.com/fukuball/jieba-php
 */

// Raise the memory limit before the libraries are initialised.
ini_set('memory_limit', '600M');

// This script sits one directory below the project source root.
$project_root = dirname(dirname(__FILE__));

require_once $project_root . "/vendor/multi-array/MultiArray.php";
require_once $project_root . "/vendor/multi-array/Factory/MultiArrayFactory.php";
require_once $project_root . "/class/Jieba.php";
require_once $project_root . "/class/Finalseg.php";
require_once $project_root . "/class/JiebaAnalyse.php";

use Fukuball\Jieba;
use Fukuball\Finalseg;
use Fukuball\JiebaAnalyse;

// NOTE(review): the original passed 'samll', evidently a misspelling of
// 'small'. Confirm that 'small' is a dictionary name Jieba::init() accepts.
Jieba::init(array('mode' => 'test', 'dict' => 'small'));
Finalseg::init();
JiebaAnalyse::init();

// Number of tags requested from extractTags().
$top_k = 10;

// file_get_contents() takes a boolean $use_include_path as its second
// argument, not an fopen()-style mode string, so the stray "r" is dropped.
$lyric_path = $project_root . "/dict/lyric.txt";
$content = file_get_contents($lyric_path);

if ($content === false) {
    // Stop here instead of passing `false` on to the tag extractor.
    fwrite(STDERR, "Unable to read " . $lyric_path . PHP_EOL);
    exit(1);
}

$tags = JiebaAnalyse::extractTags($content, $top_k);

var_dump($tags);
/**
 * Finalseg::cut() must split the sample sentence into the expected
 * segments; the trailing "!" of the input is not part of the expected list.
 *
 * @return void
 */
public function testFinalsegCut()
{
    $actual_segments = Finalseg::cut("怜香惜玉也得要看对象啊!");

    $expected_segments = array(
        "怜香惜",
        "玉",
        "也",
        "得",
        "要",
        "看",
        "对象",
        "啊",
    );

    $this->assertEquals($expected_segments, $actual_segments);
}