Example #1
0
/**
 * Build a regular-expression fragment that matches $str plus the Hangul
 * syllables a user could still reach by continuing to type its last character.
 *
 * Expansion rules applied to each processed code point:
 *  - a syllable with a final consonant becomes a group, e.g.
 *    종각 => 종(각|가[가-깋]): either the syllable itself, or the syllable without
 *    its final consonant followed by any syllable starting with that consonant;
 *  - a lone initial consonant becomes the range "consonant + ㅏ" .. "consonant + ㅣ + ㅎ";
 *  - an initial consonant + vowel becomes the range "syllable" .. "syllable + ㅎ";
 *  - anything that is not Hangul (U+AC00..U+D7A3 or U+3130..U+318F) is copied as is.
 *
 * The conversion helpers (utf8_to_unicode, unicode_to_utf8, hangul_to_jamo,
 * jamo_to_syllable, hangul_jongseong_to_cjamo) are defined elsewhere.
 * NOTE(review): literal characters are not regex-escaped here; presumably the
 * caller quotes $str beforehand - confirm.
 *
 * @param string $str         UTF-8 search term
 * @param mixed  $lastchar    when truthy and $str has more than one character,
 *                            only the last character is expanded and the rest
 *                            is copied literally; otherwise every character is expanded
 * @param bool   $use_unicode true: emit a "[start-end]" class of UTF-8 characters;
 *                            false: emit "\xHH" byte escapes
 * @return string regular-expression fragment
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    if (empty($val)) {
        // Empty input (or a non-array result from the decoder): nothing to expand.
        return $rule;
    }
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // Keep the prefix literal and expand only the last character.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        // First / last syllable of the range, as UTF-8 strings ('' = no range).
        $ustart = '';
        $uend = '';
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            // Re-read the detached final consonant as the start of the next syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // Compound final consonant: no single range, just close the group.
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: from consonant+ㅏ to consonant+ㅣ+ㅎ.
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Initial consonant + vowel: from the syllable to syllable+ㅎ.
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        if ($ustart === '' or $uend === '') {
            // No range could be derived (e.g. the decomposition came back empty):
            // close an already opened group, otherwise keep the character literal.
            $rule .= ($ket !== '') ? $ket : unicode_to_utf8(array($ch));
            continue;
        }
        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            // Hangul syllables are 3-byte UTF-8 sequences. A range may cross a
            // lead-byte boundary (e.g. 까..낗 is EA B9 8C .. EB 82 97), so build
            // one alternative per lead byte instead of assuming a shared lead.
            $lead_s = ord($ustart[0]);
            $lead_e = ord($uend[0]);
            $alts = array();
            for ($lead = $lead_s; $lead <= $lead_e; $lead++) {
                // Bounds of the two continuation bytes under this lead byte.
                $sch = ($lead == $lead_s) ? ord($ustart[1]) : 0x80;
                $slow = ($lead == $lead_s) ? ord($ustart[2]) : 0x80;
                $ech = ($lead == $lead_e) ? ord($uend[1]) : 0xBF;
                $ehigh = ($lead == $lead_e) ? ord($uend[2]) : 0xBF;
                $part = sprintf("\\x%02X", $lead);
                if ($sch == $ech) {
                    $part .= sprintf("\\x%02X", $sch);
                    $part .= sprintf("[\\x%02X-\\x%02X]", $slow, $ehigh);
                } else {
                    $subrule = array();
                    // Partial first block, full middle blocks, partial last block.
                    $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $sch, $slow);
                    if ($sch + 1 == $ech - 1) {
                        $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $sch + 1);
                    } else {
                        if ($sch + 1 != $ech) {
                            $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $sch + 1, $ech - 1);
                        }
                    }
                    $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", $ech, $ehigh);
                    $part .= '(' . implode('|', $subrule) . ')';
                }
                $alts[] = $part;
            }
            $crule = (sizeof($alts) == 1) ? $alts[0] : '(' . implode('|', $alts) . ')';
        }
        $rule .= $crule . $ket;
    }
    return $rule;
}
Example #2
0
 /**
  * Undo irregular conjugations and contractions at the stem/ending boundary.
  *
  * The last syllable of $stem and the first syllable of the ending $match[1]
  * are decomposed into jamo. Depending on the jamo pattern, syllables are
  * moved between stem and ending or rewritten (e.g. 했 => 하 + 였,
  * 혔 => 히 + 었, 떴 => 뜨 + 었, 흘-러 => 흐르 + 러).
  *
  * The helpers utf8_to_unicode, unicode_to_utf8, hangul_to_jamo and
  * jamo_to_syllable are defined elsewhere; they are used here as
  * "string <-> code point array" and "syllable <-> jamo array" converters.
  *
  * @param string $stem  UTF-8 stem candidate
  * @param array  $match match array, passed by reference; $match[1] holds the
  *                      ending and is replaced by the adjusted ending
  * @return string the adjusted stem; $stem unchanged when its last character
  *                is not Hangul
  */
 function verbIrr($stem, &$match)
 {
     $ustem = utf8_to_unicode($stem);
     $uend = utf8_to_unicode($match[1]);
     $ch = array_pop($ustem);
     $ed = $uend[0];
     $save = '';
     if (!$this->isHangul($ch)) {
         $match[1] = $save . $match[1];
         return $stem;
     }

     $j = hangul_to_jamo($ch);
     $ej = hangul_to_jamo($ed);
     $sj = sizeof($j);
     if ($sj == 3 and $j[2] == 0x11bb) {
         // Stem syllable ends in ㅆ (past tense / 겠): 랐-다, 었-다, 겠-다, 였-다
         if (in_array($j[1], array(0x1161, 0x1165, 0x1166, 0x1167))) {
             if ($j[0] == 0x1105 and in_array($j[1], array(0x1161, 0x1165, 0x1167))) {
                 // 랐, 렀, 렸 (e.g. 갈렸-다): left untouched.
             } elseif (in_array($j[0], array(0x1100, 0x110b, 0x110c))) {
                 // 겠, 았, ...: the whole syllable belongs to the ending.
                 array_unshift($uend, $ch);
                 $ch = null;
             } elseif ($j[1] == 0x1167 and in_array($j[0], array(0x1101, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112))) {
                 // ㅕ contraction: 혔 => 히 + 었, 폈 => 피 + 었
                 $j[1] = 0x1165;
                 $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                 array_unshift($uend, $syll[0]);
                 $j[1] = 0x1175;
                 $syll = jamo_to_syllable(array($j[0], $j[1]));
                 $ch = $syll[0];
             } elseif (in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111, 0x1112))) {
                 // Dropped ㅡ/ㅜ: 떴 => 뜨 + 었, 펐 => 푸 + 었
                 $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                 array_unshift($uend, $syll[0]);
                 $j[1] = 0x1173; // ㅡ
                 if ($j[0] == 0x1111) {
                     $j[1] = 0x116e; // ㅍ takes ㅜ
                 }
                 // Rebuild the stem syllable; the result must be captured,
                 // otherwise $ch would receive the ending syllable built above.
                 $syll = jamo_to_syllable(array($j[0], $j[1]));
                 $ch = $syll[0];
             }
         } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
             // 했 => 하 + 였
             array_push($ustem, 0xd558); // 하
             $syll = jamo_to_syllable(array(0x110b, 0x1167, 0x11bb));
             array_unshift($uend, $syll[0]);
             $ch = null;
         } else {
             // Detach the final ㅆ and move it to the front of the ending.
             $syll = jamo_to_syllable(array($j[0], $j[1]));
             array_unshift($uend, $j[2]);
             $ch = $syll[0];
             unset($j[2]);
         }
         if (!$ch) {
             // The syllable was consumed by the ending: continue with the previous one.
             $ch = array_pop($ustem);
             $j = hangul_to_jamo($ch);
         }
         $ed = $uend[0];
         $ej = hangul_to_jamo($ed);
     } elseif (!empty($j[2]) and in_array($j[2], array(0x11ab, 0x11af, 0x11b8))) {
         // Final ㄴ/ㄹ/ㅂ that belongs to the ending:
         // 합-시다 => 하-ㅂ시다, 갑-시다 => 가-ㅂ시다, 갈-래 => 가-ㄹ래
         if ($j[2] == 0x11af and $ej[0] == 0x1105) {
             // 르 irregular: 흘-러 => 흐르 + 러
             unset($j[2]);
             $syll = jamo_to_syllable($j);
             array_push($ustem, $syll[0]); // 흐
             $j[0] = $ej[0];
             $j[1] = 0x1173;
             $syll = jamo_to_syllable($j); // 르
             $ch = $syll[0];
         } else {
             array_unshift($uend, $j[2]);
             $syll = jamo_to_syllable(array($j[0], $j[1]));
             $ch = $syll[0];
             $ed = $j[2];
             unset($j[2]);
         }
     }

     // ㄷ irregular: 들-어 => 듣-다
     $sj = sizeof($j);
     if ($sj == 3 and $j[2] == 0x11af and in_array($ej[0], array(0x110b, 0x1105))) {
         if (in_array($ej[1], array(0x1161, 0x1165, 0x1173))) {
             // Ending starts with 아/어/으 or 라/러/르.
             if (sizeof($ej) == 3) {
                 // With ㅡ only 은/을 (final ㄴ/ㄹ) qualify.
                 $skip = ($ej[1] == 0x1173 && !in_array($ej[2], array(0x11ab, 0x11af)));
             } else {
                 $skip = ($j[2] == 0x11af && sizeof($ej) == 2 && $ej[0] == 0x1105);
             }
             if (!$skip) {
                 $syll = jamo_to_syllable(array($j[0], $j[1], 0x11ae));
                 $ch = $syll[0];
             }
         }
     }

     // ㅅ irregular: 지-어 => 짓-어, 이-어 => 잇-어
     // NOTE(review): this also overwrites an existing final consonant in $j
     // (there is no check on $sj); confirm whether it should be limited to
     // stems without a final consonant.
     if (sizeof($ej) == 2) {
         if ($ej[0] == 0x110b) {
             $j[2] = 0x11ba;
             $syll = jamo_to_syllable($j); // + ㅅ
             $ch = $syll[0];
             $sj = 3;
         }
     }

     if ($sj == 2) {
         if (in_array($j[0], array(0x110c)) and in_array($j[1], array(0x116e, 0x1175))) {
             // 주, 지: move to the ending and continue with the previous syllable.
             array_unshift($uend, $ch);
             $ch = array_pop($ustem);
             $j = hangul_to_jamo($ch);
         }
         if ($j[1] == 0x1165 and in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111))) {
             // 꺼, 떠, 써, 퍼 => 끄/뜨/쓰/푸 + 어
             $syll = jamo_to_syllable(array(0x110b, 0x1165)); // 어
             array_unshift($uend, $syll[0]);
             if ($j[0] == 0x1111) {
                 $syll = jamo_to_syllable(array($j[0], 0x116e));
             } else {
                 $syll = jamo_to_syllable(array($j[0], 0x1173));
             }
             // The restored syllable becomes the current stem syllable.
             $ch = $syll[0];
             $j = hangul_to_jamo($ch);
         }

         // Vowel contractions
         if (in_array($j[0], array(0x1105, 0x1112)) and $j[1] == 0x1162) {
             // ㅎ irregular: 파랗+아서 => 파래서, so 래 => 라 + 아
             $j[1] = 0x1161;
             $syll = jamo_to_syllable($j);
             $ch = $syll[0];
             $syll = jamo_to_syllable(array(0x110b, 0x1161)); // 아
             $ed = $syll[0];
             array_unshift($uend, $ed);
             // Keep the ending jamo in step with the new first syllable.
             $ej[0] = 0x110b;
             $ej[1] = 0x1161;
         } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
             // 해-서 => 하-여서
             // NOTE(review): unreachable as written - the condition above already
             // matches ㅎ+ㅐ; confirm which branch is meant to handle 해.
             $j[1] = 0x1161;
             $syll = jamo_to_syllable($j);
             $ch = $syll[0];
             $syll = jamo_to_syllable(array(0x110b, 0x1167)); // 여
             $ed = $syll[0];
             array_unshift($uend, $ed);
             $ej[0] = 0x110b;
             $ej[1] = 0x1167;
         } elseif (in_array($j[0], array(0x1105, 0x1109)) and in_array($j[1], array(0x1167))) {
             // 하셔-서 => 하시-어서, 가려-서 => 가리-어서
             $j[1] = 0x1175; // ㅣ
             $syll = jamo_to_syllable($j);
             $ch = $syll[0];
             $syll = jamo_to_syllable(array(0x110b, 0x1165)); // 어
             $ed = $syll[0];
             array_unshift($uend, $ed);
             $ej[0] = 0x110b;
             $ej[1] = 0x1165;
         }

         if ($j[0] == 0x1109 and $j[1] == 0x1175) {
             // Honorific 시: move it to the ending.
             array_unshift($uend, $ch);
             $ej = $j;
             $ch = array_pop($ustem);
             $j = hangul_to_jamo($ch);
         }

         // ㅎ irregular: restore the dropped final ㅎ (랗, 렇, ...)
         if (in_array($j[0], array(0x1105, 0x1106)) and in_array($j[1], array(0x1161, 0x1165))) {
             $syll = jamo_to_syllable(array($j[0], $j[1], 0x11c2));
             array_push($ustem, $syll[0]);
             // The syllable is already back on the stem; nothing left to process.
             $ch = null;
             $j = array();
             $sj = 0;
         }
     }

     if ($sj == 2 and $j[0] == 0x110b and in_array($j[1], array(0x116a, 0x116e, 0x116f)) and sizeof($ustem) >= 1) {
         // ㅂ irregular (와, 우, 워): 그리워 => 그립 + 워
         $ch1 = array_pop($ustem);
         $jamo = hangul_to_jamo($ch1);
         if (sizeof($jamo) == 2) {
             if ($jamo[1] != 0x1175) {
                 // Add final ㅂ to the preceding syllable.
                 $syll = jamo_to_syllable(array($jamo[0], $jamo[1], 0x11b8));
                 array_push($ustem, $syll[0]);
             } else {
                 array_push($ustem, $ch1);
             }
             array_unshift($uend, $ch);
             $ch = null;
         } else {
             array_push($ustem, $ch1);
         }
     }

     if ($ch) {
         array_push($ustem, $ch);
     }
     $match[1] = unicode_to_utf8($uend);
     return unicode_to_utf8($ustem);
 }
Example #3
0
/**
 * Build a regular-expression fragment that matches $str plus the Hangul
 * syllables a user could still reach by continuing to type.
 *
 * Two modes, selected by $lastchar and the length of $str:
 *  - $lastchar truthy and more than one character: the prefix is copied
 *    literally and only the last character is expanded;
 *  - otherwise: every consonant-only letter (U+3130..U+318F) before the last
 *    character is expanded too, e.g. ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣]); other
 *    prefix characters are copied literally.
 *
 * The last character is expanded as follows:
 *  - a syllable with a final consonant becomes a group, e.g.
 *    종각 => 종(각|가[가-깋]);
 *  - a lone initial consonant becomes the range "consonant + ㅏ" .. "consonant + ㅣ + ㅎ";
 *  - an initial consonant + vowel becomes the range "syllable" .. "syllable + ㅎ";
 *  - anything outside U+AC00..U+D7A3 and U+3130..U+318F is copied as is.
 *
 * Ranges are rendered by hangul_regex_range(), defined elsewhere, as are the
 * conversion helpers (utf8_to_unicode, unicode_to_utf8, hangul_to_jamo,
 * jamo_to_syllable, hangul_jongseong_to_cjamo).
 *
 * @param string $str         UTF-8 search term
 * @param mixed  $lastchar    see the two modes above
 * @param bool   $use_unicode passed through to hangul_regex_range()
 * @return string regular-expression fragment; '' for empty input
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    if (empty($val)) {
        // Empty input: without this guard the else-branch below would pop
        // null off the empty array and process it as a code point.
        return $rule;
    }
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // Keep the prefix literal and expand only the last character.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    } else {
        // Build a regex for consonant-only letters:
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])
        // Set the last character aside; it is handled by the loop further down.
        $last = array_pop($val);
        $len = sizeof($val);
        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];
            if ($ch >= 0x3130 and $ch <= 0x318f) {
                $wch = hangul_to_jamo(array($ch));
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Range from consonant+ㅏ to consonant+ㅣ+ㅎ.
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
                // Either the bare consonant or any syllable starting with it.
                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }
            $rule .= $crule;
        }
        // Continue with the last character only.
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        // First / last syllable of the range, as UTF-8 strings ('' = no range).
        $ustart = '';
        $uend = '';
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            // Re-read the detached final consonant as the start of the next syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // Compound final consonant: no single range, just close the group.
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: from consonant+ㅏ to consonant+ㅣ+ㅎ.
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Initial consonant + vowel: from the syllable to syllable+ㅎ.
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        if ($ustart === '' or $uend === '') {
            // No range could be derived (e.g. the decomposition came back empty):
            // close an already opened group, otherwise keep the character literal.
            $rule .= ($ket !== '') ? $ket : unicode_to_utf8(array($ch));
            continue;
        }
        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }
    return $rule;
}