/**
 * Build a regular-expression fragment that matches $str and the Hangul
 * syllables that "complete" its trailing character.
 *
 * How a Hangul character is expanded (code points U+AC00..U+D7A3 and
 * U+3130..U+318F; everything else is copied to the rule literally):
 *  - a lone initial consonant (U+1100..U+1112 after decomposition) becomes
 *    the range "initial+ㅏ" .. "initial+ㅣ+ㅎ";
 *  - an initial+vowel syllable becomes the range "syllable" ..
 *    "syllable+final ㅎ";
 *  - a syllable with a final consonant becomes
 *    "(syllable|syllable-without-final<range for the final used as initial>)",
 *    e.g. 종각 => 종(각|가[가-깋]).
 *
 * @param string $str          UTF-8 search text.
 * @param mixed  $lastchar     When truthy and $str has more than one code
 *                             point, only the last character is expanded and
 *                             the preceding text is copied verbatim (it is
 *                             NOT regex-quoted here).
 * @param bool   $use_unicode  true: emit character classes with raw UTF-8
 *                             characters ("[가-깋]"); false: emit byte-wise
 *                             "\xHH" escapes instead (presumably for regex
 *                             engines without UTF-8 support -- confirm with
 *                             callers).
 *
 * @return string Regular-expression fragment (no delimiters).
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';

    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // Expand only the last character; keep everything before it as-is.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }

    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];

        $wch = array();
        $ustart = array();
        $uend = array();
        if (($ch >= 0xac00 and $ch <= 0xd7a3) or ($ch >= 0x3130 and $ch <= 0x318f)) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: copy literally.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }

        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);

            // Re-read the detached final consonant as a standalone letter so
            // that it can start the following syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);

            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposes into several jamo: no range is built,
                // only the two literal alternatives are offered.
                $rule .= ')';
                continue;
            }
        }

        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: initial+ㅏ .. initial+ㅣ+ㅎ
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } elseif ($wlen == 2) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial+vowel: syllable .. syllable+final ㅎ
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch);
                continue;
            }
        }

        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            // Byte-wise range over three-byte UTF-8 sequences.
            // The start and end syllables do not always share the same lead
            // byte (e.g. 까 = EA B9 8C .. 낗 = EB 82 97), so one alternative
            // is produced per lead byte. With a single lead byte the output
            // is "\xLL" followed by the second/third byte rule.
            $firstLead = ord($ustart[0]);
            $lastLead = ord($uend[0]);

            $alts = array();
            for ($lead = $firstLead; $lead <= $lastLead; $lead++) {
                // Continuation-byte bounds valid under this lead byte.
                $sch = ($lead == $firstLead) ? ord($ustart[1]) : 0x80;
                $slo = ($lead == $firstLead) ? ord($ustart[2]) : 0x80;
                $ech = ($lead == $lastLead) ? ord($uend[1]) : 0xBF;
                $ehi = ($lead == $lastLead) ? ord($uend[2]) : 0xBF;

                $alt = sprintf("\\x%02X", $lead);
                if ($sch == $ech) {
                    $alt .= sprintf("\\x%02X", $sch);
                    $alt .= sprintf("[\\x%02X-\\x%02X]", $slo, $ehi);
                } else {
                    $subrule = array();
                    // first second-byte: from the start's third byte upwards
                    $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $sch, $slo);
                    // full second-bytes strictly between start and end
                    if ($sch + 1 == $ech - 1) {
                        $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $sch + 1);
                    } elseif ($sch + 1 < $ech) {
                        $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $sch + 1, $ech - 1);
                    }
                    // last second-byte: up to the end's third byte
                    $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", $ech, $ehi);
                    $alt .= '(' . implode('|', $subrule) . ')';
                }
                $alts[] = $alt;
            }

            if (sizeof($alts) == 1) {
                $crule = $alts[0];
            } else {
                $crule = '(' . implode('|', $alts) . ')';
            }
        }

        $rule .= $crule . $ket;
    }

    return $rule;
}
/**
 * Undo irregular conjugations and contractions at the boundary between a
 * verb stem and its ending.
 *
 * The last syllable of $stem and the first syllable of $match[1] are
 * decomposed into jamo and, depending on the pattern found, jamo/syllables
 * are moved between stem and ending (e.g. 혔 -> 히 + 었, 흘-러 -> 흐르 + 러).
 *
 * Jamo code points used below (Unicode Hangul Jamo block):
 *   initials: 0x1100 ㄱ, 0x1101 ㄲ, 0x1102 ㄴ, 0x1103 ㄷ, 0x1104 ㄸ, 0x1105 ㄹ,
 *             0x1106 ㅁ, 0x1107 ㅂ, 0x1109 ㅅ, 0x110a ㅆ, 0x110b ㅇ, 0x110c ㅈ,
 *             0x110e ㅊ, 0x110f ㅋ, 0x1110 ㅌ, 0x1111 ㅍ, 0x1112 ㅎ
 *   vowels:   0x1161 ㅏ, 0x1162 ㅐ, 0x1165 ㅓ, 0x1166 ㅔ, 0x1167 ㅕ, 0x116a ㅘ,
 *             0x116e ㅜ, 0x116f ㅝ, 0x1173 ㅡ, 0x1175 ㅣ
 *   finals:   0x11ab ㄴ, 0x11ae ㄷ, 0x11af ㄹ, 0x11b8 ㅂ, 0x11ba ㅅ, 0x11bb ㅆ,
 *             0x11c2 ㅎ
 *
 * @param string $stem   UTF-8 stem candidate.
 * @param array  $match  Passed by reference; $match[1] holds the UTF-8 ending
 *                       and is rewritten with the adjusted ending.
 *
 * @return string The adjusted stem (or $stem unchanged when its last
 *                character is not Hangul according to $this->isHangul()).
 */
function verbIrr($stem, &$match)
{
    # Handle the various regular and irregular conjugation rules.
    $ustem = utf8_to_unicode($stem);
    $uend = utf8_to_unicode($match[1]);

    $ch = array_pop($ustem);    // last syllable of the stem
    $ed = $uend[0];             // first syllable of the ending
    $save = '';

    if ($this->isHangul($ch)) {
        $j = hangul_to_jamo($ch);
        $ej = hangul_to_jamo($ed);

        $sj = sizeof($j);
        if ($sj == 3 and $j[2] == 0x11bb) {
            // Stem syllable ends in final ㅆ: 랐-다, 었-다, 겠-다, 였-다
            if (in_array($j[1], array(0x1161, 0x1165, 0x1166, 0x1167))) {
                if ($j[0] == 0x1105 and in_array($j[1], array(0x1161, 0x1165, 0x1167))) {
                    // 랐, 렀, 렸 (e.g. 갈렸-다): left untouched.
                } elseif (in_array($j[0], array(0x1100, 0x110b, 0x110c))) {
                    # 겠, 았: the whole syllable belongs to the ending.
                    array_unshift($uend, $ch);
                    unset($ch);
                } elseif ($j[1] == 0x1167 and in_array($j[0], array(0x1101, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112))) {
                    # ㅕ conversion: 혔 -> ㅎ+었 -> 히+었
                    $j[1] = 0x1165;
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    /* 혔 -> 히+었, 폈 -> 피+었 */
                    $j[1] = 0x1175;
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                } elseif (in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111, 0x1112))) {
                    # 우-irregular
                    /* 떴 -> ㄸ + 었 */
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    /* ㄸ -> 뜨 */
                    $j[1] = 0x1173; /* ㅡ */
                    if ($j[0] == 0x1111) {
                        $j[1] = 0x116e; /* 펐 -> 푸+었 */
                    }
                    // The rebuilt stem syllable must be kept; previously the
                    // return value was dropped and $ch received the ending
                    // syllable built above.
                    $syll = jamo_to_syllable(array($j[0], $j[1])); /* 쓰 */
                    $ch = $syll[0];
                }
            } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                /* 했 -> 하 + 였 */
                array_push($ustem, 0xd558); /* 하 */
                $syll = jamo_to_syllable(array(0x110b, 0x1167, 0x11bb));
                array_unshift($uend, $syll[0]);
                unset($ch);
            } else {
                /* Detach the final ㅆ and move it to the ending. */
                $syll = jamo_to_syllable(array($j[0], $j[1]));
                array_unshift($uend, $j[2]);
                $ch = $syll[0];
                unset($j[2]);
            }

            // $ch was consumed above: continue with the previous stem syllable.
            if (empty($ch)) {
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }
            $ed = $uend[0];
            $ej = hangul_to_jamo($ed);
        } elseif (!empty($j[2]) and in_array($j[2], array(0x11ab, 0x11af, 0x11b8))) {
            // 합-시다 갑-시다 갈-래
            // 하-ㅂ시다 가-ㅂ시다 가-ㄹ래
            if ($j[2] == 0x11af and $ej[0] == 0x1105) {
                // 르-irregular: 흘-러 -> 흐르 + 러
                // (a stricter variant that also required the vowel ㅡ was
                // present as commented-out code in the original)
                unset($j[2]);
                $syll = jamo_to_syllable($j);
                array_push($ustem, $syll[0]); /* 흐 */
                $j[0] = $ej[0];
                $j[1] = 0x1173;
                $syll = jamo_to_syllable($j); /* 르 */
                $ch = $syll[0];
            } else {
                // Move the final consonant to the front of the ending.
                array_unshift($uend, $j[2]);
                $syll = jamo_to_syllable(array($j[0], $j[1]));
                $ch = $syll[0];
                $ed = $j[2];
                unset($j[2]);
            }
        }

        // ㄷ-irregular
        // 들-어 -> 듣-다
        $sj = sizeof($j);
        if ($sj == 3 and $j[2] == 0x11af and in_array($ej[0], array(0x110b, 0x1105))) {
            // The while is used as an "if" that can be left early via break.
            while (in_array($ej[1], array(0x1161, 0x1165, 0x1173))) {
                // endings starting with 아/어/으 or 라/러/르
                $se = sizeof($ej);
                if ($se == 3) {
                    // With vowel ㅡ only 은/을 (final ㄴ or ㄹ) qualify.
                    if ($ej[1] == 0x1173 and !in_array($ej[2], array(0x11ab, 0x11af))) {
                        break;
                    }
                } else {
                    if ($j[2] == 0x11af and sizeof($ej) == 2 and $ej[0] == 0x1105) {
                        break;
                    }
                }
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11ae));
                $ch = $syll[0];
                break;
            }
        }

        // ㅅ-irregular
        // * 지-어:짓-어
        // * 이-어:잇-어
        if (sizeof($ej) == 2) {
            if ($ej[0] == 0x110b) {
                $j[2] = 0x11ba;
                $syll = jamo_to_syllable($j); /* + ㅅ */
                $ch = $syll[0];
                $sj = 3;
            }
        }

        if ($sj == 2) {
            if (in_array($j[0], array(0x110c)) and in_array($j[1], array(0x116e, 0x1175))) {
                /* 주, 지: move to the ending and step back one syllable */
                array_unshift($uend, $ch);
                unset($ch);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            if ($j[1] == 0x1165 and in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111))) {
                /* 꺼, 떠, 써, 퍼 */
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); /* 어 */
                array_unshift($uend, $syll[0]);
                if ($j[0] == 0x1111) {
                    $syll = jamo_to_syllable(array($j[0], 0x116e)); /* 푸 */
                } else {
                    $syll = jamo_to_syllable(array($j[0], 0x1173)); /* 쓰 */
                }
                array_push($ustem, $syll[0]);
                unset($ch);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // Vowel contraction
            if (in_array($j[0], array(0x1105, 0x1112)) and $j[1] == 0x1162) {
                // ㅎ-irregular (ending): 파랗+아서 -> 파라+아서 -> 파래서
                /* 파래-서 -> 파라-아서 */
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); /* 래 -> 라 + 아 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1161)); /* 아 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1161;
            } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                // 해-서 = 하-여서
                // NOTE(review): this branch cannot be reached because the
                // condition above already matches ㅎ + ㅐ; confirm which of
                // the two treatments of 해 is intended.
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); /* 해 -> 하 + 여 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1167)); /* 여 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1167;
            } elseif (in_array($j[0], array(0x1105, 0x1109)) and in_array($j[1], array(0x1167))) {
                // 하셔-서 = 하시-어서
                // 가려-서 = 가리-어서
                $j[1] = 0x1175; /* ㅣ */
                $syll = jamo_to_syllable($j); /* ㅕ -> ㅣ + 어 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); /* 어 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1165;
            }

            if ($j[0] == 0x1109 and $j[1] == 0x1175) {
                /* 시: honorific marker, belongs to the ending */
                array_unshift($uend, $ch);
                $ej = $j;
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // ㅎ-irregular
            if (in_array($j[0], array(0x1105, 0x1106)) and in_array($j[1], array(0x1161, 0x1165))) {
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11c2)); /* 랗, 렇 */
                array_push($ustem, $syll[0]);
                unset($ch);
                unset($j);
            }
        }

        // Runs at most once (always ends in break). $j may have been unset
        // by the ㅎ-irregular step above, hence the isset() guard.
        while ($sj == 2 and isset($j[0], $j[1]) and $j[0] == 0x110b and in_array($j[1], array(0x116a, 0x116e, 0x116f)) and sizeof($ustem) >= 1) {
            // XXX
            // 그리워: 그리우+어 -> 그립+워
            /* 와, 우, 워 */
            $ch1 = array_pop($ustem);
            $jamo = hangul_to_jamo($ch1);
            if (sizeof($jamo) == 2) {
                if ($jamo[1] != 0x1175) {
                    $syll = jamo_to_syllable(array($jamo[0], $jamo[1], 0x11b8));
                    array_push($ustem, $syll[0]); /* add ㅂ */
                } else {
                    array_push($ustem, $ch1);
                }
                array_unshift($uend, $ch);
                unset($ch);
            } else {
                array_push($ustem, $ch1);
            }
            break;
        }

        // Put the (possibly rewritten) last stem syllable back, unless it
        // was moved to the ending.
        if (!empty($ch)) {
            array_push($ustem, $ch);
        }

        $match[1] = unicode_to_utf8($uend);
        return unicode_to_utf8($ustem);
    }

    $match[1] = $save . $match[1];
    return $stem;
}
/**
 * Build a regular-expression fragment that matches $str and the Hangul
 * syllables that "complete" its characters.
 *
 * Two modes:
 *  - $lastchar truthy and $str longer than one code point: the text before
 *    the last character is copied verbatim (not regex-quoted here) and only
 *    the last character is expanded.
 *  - otherwise: every character before the last one that is a Hangul
 *    compatibility jamo (U+3130..U+318F) decomposing to an initial consonant
 *    is expanded to "(jamo|<range of syllables with that initial>)",
 *    e.g. ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣]); the last character is then expanded
 *    like in the first mode.
 *
 * Expansion of the last character:
 *  - lone initial consonant: range "initial+ㅏ" .. "initial+ㅣ+ㅎ";
 *  - initial+vowel syllable: range "syllable" .. "syllable+final ㅎ";
 *  - syllable with a final consonant:
 *    "(syllable|syllable-without-final<range for the final as initial>)",
 *    e.g. 종각 => 종(각|가[가-깋]).
 *
 * The textual form of each range is produced by hangul_regex_range().
 *
 * @param string $str          UTF-8 search text.
 * @param mixed  $lastchar     See the two modes above.
 * @param bool   $use_unicode  Passed through to hangul_regex_range().
 *
 * @return string Regular-expression fragment (no delimiters); an empty
 *                string for empty input.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';

    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($len == 0) {
        // Nothing to match. Without this guard the else-branch below would
        // pop null from the empty array and feed it to the main loop.
        return $rule;
    }

    if ($lastchar and $len > 1) {
        // Expand only the last character; keep everything before it as-is.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    } else {
        // Make a regex for consonant-only letters:
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])

        // Set the last character aside; it is handled by the main loop.
        $last = array_pop($val);
        $len = sizeof($val);

        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];

            if ($ch >= 0x3130 and $ch <= 0x318f) {
                $wch = hangul_to_jamo(array($ch));
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // initial+ㅏ .. initial+ㅣ+ㅎ
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    // Not an initial consonant: emit the decomposed jamo.
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }

                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }

            $rule .= $crule;
        }

        // Continue with the last character only.
        $val = array($last);
        $len = sizeof($val);
    }

    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];

        $wch = array();
        $ustart = array();
        $uend = array();
        if (($ch >= 0xac00 and $ch <= 0xd7a3) or ($ch >= 0x3130 and $ch <= 0x318f)) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: copy literally.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }

        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);

            // Re-read the detached final consonant as a standalone letter so
            // that it can start the following syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);

            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposes into several jamo: no range is built,
                // only the two literal alternatives are offered.
                $rule .= ')';
                continue;
            }
        }

        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: initial+ㅏ .. initial+ㅣ+ㅎ
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } elseif ($wlen == 2) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial+vowel: syllable .. syllable+final ㅎ
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch);
                continue;
            }
        }

        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }

    return $rule;
}