Enhance default splitter function to handle UTF-8 characters.
public static split ( string $str, integer $length = 1, boolean $yoon = false ) : array | ||
$str | string | The string to split. |
$length | integer | (optional) The length of each substring. Defaults to 1. |
$yoon | boolean | (optional) Whether to treat a base syllable and the yoon character that follows it as a single character. Defaults to false. |
return | array | An array of strings. |
/**
 * Helper::split() called with only the string argument must return the
 * mixed-script fixture as one array element per character.
 */
public function testSplitSequenceOfCharacters()
{
    $actual = Helper::split($this->mixCharacters);

    $expected = array(
        '今', '日', '、',
        'J', 'o', 'o',
        '「', 'ジ', 'ョ', 'オ', '」',
        'は', '学', '校', 'に', 'い', 'ま', 'す', '。',
    );

    $this->assertSame($expected, $actual);
}
/**
 * Segment a string into words using the scoring tables held on this instance.
 *
 * The input is split into characters with Helper::split(). The character list
 * is padded with the markers B3, B2, B1 in front and E1, E2, E3 behind (all of
 * type 'O') so that a six-position window can be read around every character.
 * The first character seeds the current word; for every following character a
 * score is accumulated from $this->BIAS and the getScore() lookups on the
 * previous boundary decisions, the surrounding characters and their getType()
 * classes. A score strictly greater than zero closes the current word and
 * starts a new one at that character.
 *
 * @param string $input The text to segment.
 *
 * @return array The words in input order. An empty array is returned when
 *               $input is null or an empty string, or when Helper::split()
 *               yields no characters.
 */
public function segment($input)
{
    if ($input === null || $input === '') {
        return array();
    }

    $parts = Helper::split($input);

    // With no characters, $segments[3] below would be the 'E1' end marker and
    // it would be returned to the caller as if it were a word.
    if (empty($parts)) {
        return array();
    }

    // Leading padding so the window can look three positions back.
    $segments = array('B3', 'B2', 'B1');
    $types = array('O', 'O', 'O');

    foreach ($parts as $part) {
        $segments[] = $part;
        $types[] = $this->getType($part);
    }

    // Trailing padding so the window can look two positions ahead.
    array_push($segments, 'E1', 'E2', 'E3');
    array_push($types, 'O', 'O', 'O');

    $results = array();

    // The first real character always opens the first word.
    $word = $segments[3];

    // Boundary decisions for the three previous positions: 'U' until a
    // decision exists, then 'B' (a word started there) or 'O' (it did not).
    $p1 = 'U';
    $p2 = 'U';
    $p3 = 'U';

    // Stop before the three trailing markers; the bound does not change
    // while looping.
    $end = count($segments) - 3;

    for ($i = 4; $i < $end; ++$i) {
        $score = $this->BIAS;

        // Window of characters: three before, the current one, two after.
        $w1 = $segments[$i - 3];
        $w2 = $segments[$i - 2];
        $w3 = $segments[$i - 1];
        $w4 = $segments[$i];
        $w5 = $segments[$i + 1];
        $w6 = $segments[$i + 2];

        // Matching window of character types.
        $c1 = $types[$i - 3];
        $c2 = $types[$i - 2];
        $c3 = $types[$i - 1];
        $c4 = $types[$i];
        $c5 = $types[$i + 1];
        $c6 = $types[$i + 2];

        // Previous boundary decisions.
        $score += $this->getScore($this->UP1, $p1);
        $score += $this->getScore($this->UP2, $p2);
        $score += $this->getScore($this->UP3, $p3);
        $score += $this->getScore($this->BP1, $p1 . $p2);
        $score += $this->getScore($this->BP2, $p2 . $p3);

        // Single characters, then pairs and triples of adjacent characters.
        $score += $this->getScore($this->UW1, $w1);
        $score += $this->getScore($this->UW2, $w2);
        $score += $this->getScore($this->UW3, $w3);
        $score += $this->getScore($this->UW4, $w4);
        $score += $this->getScore($this->UW5, $w5);
        $score += $this->getScore($this->UW6, $w6);
        $score += $this->getScore($this->BW1, $w2 . $w3);
        $score += $this->getScore($this->BW2, $w3 . $w4);
        $score += $this->getScore($this->BW3, $w4 . $w5);
        $score += $this->getScore($this->TW1, $w1 . $w2 . $w3);
        $score += $this->getScore($this->TW2, $w2 . $w3 . $w4);
        $score += $this->getScore($this->TW3, $w3 . $w4 . $w5);
        $score += $this->getScore($this->TW4, $w4 . $w5 . $w6);

        // Single character types, then pairs and triples of adjacent types.
        $score += $this->getScore($this->UC1, $c1);
        $score += $this->getScore($this->UC2, $c2);
        $score += $this->getScore($this->UC3, $c3);
        $score += $this->getScore($this->UC4, $c4);
        $score += $this->getScore($this->UC5, $c5);
        $score += $this->getScore($this->UC6, $c6);
        $score += $this->getScore($this->BC1, $c2 . $c3);
        $score += $this->getScore($this->BC2, $c3 . $c4);
        $score += $this->getScore($this->BC3, $c4 . $c5);
        $score += $this->getScore($this->TC1, $c1 . $c2 . $c3);
        $score += $this->getScore($this->TC2, $c2 . $c3 . $c4);
        $score += $this->getScore($this->TC3, $c3 . $c4 . $c5);
        $score += $this->getScore($this->TC4, $c4 . $c5 . $c6);

        // Previous decisions combined with character types.
        $score += $this->getScore($this->UQ1, $p1 . $c1);
        $score += $this->getScore($this->UQ2, $p2 . $c2);
        // NOTE(review): this looks up UQ1 (not UQ3) for $p3 . $c3. Kept
        // unchanged because switching tables would alter the scores; confirm
        // against the model the tables came from before changing it.
        $score += $this->getScore($this->UQ1, $p3 . $c3);
        $score += $this->getScore($this->BQ1, $p2 . $c2 . $c3);
        $score += $this->getScore($this->BQ2, $p2 . $c3 . $c4);
        $score += $this->getScore($this->BQ3, $p3 . $c2 . $c3);
        $score += $this->getScore($this->BQ4, $p3 . $c3 . $c4);
        $score += $this->getScore($this->TQ1, $p2 . $c1 . $c2 . $c3);
        $score += $this->getScore($this->TQ2, $p2 . $c2 . $c3 . $c4);
        $score += $this->getScore($this->TQ3, $p3 . $c1 . $c2 . $c3);
        $score += $this->getScore($this->TQ4, $p3 . $c2 . $c3 . $c4);

        $p = 'O';
        if ($score > 0) {
            // Positive score: the word collected so far ends before this
            // character.
            $results[] = $word;
            $word = '';
            $p = 'B';
        }

        // Shift the decision history by one position.
        $p1 = $p2;
        $p2 = $p3;
        $p3 = $p;

        $word .= $segments[$i];
    }

    // Flush the word still being collected when the input ran out.
    $results[] = $word;

    return $results;
}