/**
 * Build a regular-expression fragment that matches $str and the Hangul
 * syllables that could continue it.
 *
 * Code points in U+AC00..U+D7A3 or U+3130..U+318F are decomposed with
 * hangul_to_jamo() and expanded into a range; every other code point is
 * copied into the rule literally.
 *
 * Example (last character only): 종각 => 종(각|가[가-깋])
 *
 * @param string $str         UTF-8 search text.
 * @param mixed  $lastchar    When truthy and $str has more than one code
 *                            point, only the last code point is expanded and
 *                            the preceding ones are copied verbatim.
 * @param bool   $use_unicode True: emit a literal UTF-8 class "[start-end]".
 *                            False: emit "\xHH" byte escapes that match the
 *                            UTF-8 encoding of the same range.
 * @return string The rule; no delimiters are added.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);

    if ($lastchar and $len > 1) {
        // Keep the leading characters as-is and expand only the last one.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }

    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();

        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: goes into the rule unchanged.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }

        $wlen = sizeof($wch);
        $ket = '';

        if ($wlen >= 3) {
            // Syllable with a final consonant, e.g. 각 => (각|가[가-깋]):
            // either the syllable itself, or the syllable without its final
            // followed by any syllable that starts with that consonant.
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);

            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);

            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposed into several jamo: no range is added.
                $rule .= ')';
                continue;
            }
        }

        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Lone leading consonant: range from (lead + U+1161) to
                // (lead + U+1175 + U+11C2).
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } elseif ($wlen == 2) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Lead + vowel: range from the bare syllable to the same
                // syllable with the final U+11C2.
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch);
                continue;
            }
        }

        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            $crule = _utf8_hangul_byte_range($ustart, $uend);
        }
        $rule .= $crule . $ket;
    }

    return $rule;
}

/**
 * Byte-escaped regex fragment matching every three-byte UTF-8 sequence from
 * $ustart to $uend inclusive.
 *
 * The range is split per lead byte, because a range such as U+AE4C..U+B097
 * (EA B9 8C .. EB 82 97) does not share its first byte. When both ends share
 * the lead byte the output is "\xLL" followed by the second/third-byte rule.
 *
 * @param string $ustart Three-byte UTF-8 encoding of the first code point.
 * @param string $uend   Three-byte UTF-8 encoding of the last code point.
 * @return string
 */
function _utf8_hangul_byte_range($ustart, $uend)
{
    $lo = array(ord($ustart[0]), ord($ustart[1]), ord($ustart[2]));
    $hi = array(ord($uend[0]), ord($uend[1]), ord($uend[2]));

    if ($lo[0] == $hi[0]) {
        return sprintf("\\x%02X", $lo[0])
            . _utf8_hangul_byte_tail($lo[1], $lo[2], $hi[1], $hi[2]);
    }

    $alts = array();
    for ($lead = $lo[0]; $lead <= $hi[0]; $lead++) {
        // Only the first and last lead bytes are bounded by the endpoints;
        // lead bytes in between cover their full continuation range.
        $sb = ($lead == $lo[0]) ? $lo[1] : 0x80;
        $st = ($lead == $lo[0]) ? $lo[2] : 0x80;
        $eb = ($lead == $hi[0]) ? $hi[1] : 0xBF;
        $et = ($lead == $hi[0]) ? $hi[2] : 0xBF;
        $alts[] = sprintf("\\x%02X", $lead) . _utf8_hangul_byte_tail($sb, $st, $eb, $et);
    }

    return '(' . implode('|', $alts) . ')';
}

/**
 * Second/third-byte part of a byte-escaped range under a single lead byte.
 *
 * @param int $sb Second byte of the range start.
 * @param int $st Third byte of the range start.
 * @param int $eb Second byte of the range end.
 * @param int $et Third byte of the range end.
 * @return string
 */
function _utf8_hangul_byte_tail($sb, $st, $eb, $et)
{
    if ($sb == $eb) {
        return sprintf("\\x%02X", $sb) . sprintf("[\\x%02X-\\x%02X]", $st, $et);
    }

    $subrule = array();
    $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $sb, $st);
    if ($sb + 1 == $eb - 1) {
        // Exactly one full second-byte value lies between the two ends.
        $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $sb + 1);
    } elseif ($sb + 1 != $eb) {
        $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $sb + 1, $eb - 1);
    }
    $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", $eb, $et);

    return '(' . implode('|', $subrule) . ')';
}
/**
 * Rewrite a word whose last Hangul syllable ends in one of five final
 * consonants so that the final is split off as a separate jamo.
 *
 * The word is returned unchanged unless its last code point lies in
 * U+AC00..U+D7A3 or U+3130..U+318F, hangul_to_jamo() yields at least three
 * jamo for it, and the third is U+11AB, U+11AF, U+11B7, U+11B8 or U+11BB.
 *
 * @param string $word     UTF-8 word.
 * @param mixed  $lastchar When == 1 the lead and vowel are recombined with
 *                         jamo_to_syllable(); otherwise they are emitted
 *                         separately via hangul_choseong_to_cjamo() and
 *                         hangul_jungseong_to_cjamo().
 * @return string The rewritten word, or $word itself when no rule applies.
 */
function getWordRule($word, $lastchar = 1)
{
    $codes = utf8_to_unicode($word);
    if (count($codes) < 1) {
        return $word;
    }

    // Only the last code point is inspected; the rest become the prefix.
    $tail = array_pop($codes);
    $inSyllables = ($tail >= 0xac00 && $tail <= 0xd7a3);
    $inCompatJamo = ($tail >= 0x3130 && $tail <= 0x318f);
    if (!$inSyllables && !$inCompatJamo) {
        return $word;
    }

    $parts = hangul_to_jamo(array($tail));
    if (count($parts) < 3) {
        return $word;
    }

    $splittable = array(0x11ab, 0x11af, 0x11b7, 0x11b8, 0x11bb);
    if (!in_array($parts[2], $splittable)) {
        return $word;
    }

    $pieces = array(unicode_to_utf8($codes));
    if ($lastchar == 1) {
        $pieces[] = unicode_to_utf8(jamo_to_syllable(array($parts[0], $parts[1])));
    } else {
        $pieces[] = unicode_to_utf8(array(hangul_choseong_to_cjamo($parts[0])));
        $pieces[] = unicode_to_utf8(array(hangul_jungseong_to_cjamo($parts[1])));
    }
    $pieces[] = unicode_to_utf8(array(hangul_jongseong_to_cjamo($parts[2])));

    return implode('', $pieces);
}
/**
 * Build a regular-expression fragment that matches $str and the Hangul
 * syllables that could continue it.
 *
 * Two modes, selected by $lastchar and the length of $str:
 *  - $lastchar truthy and more than one code point: the leading code points
 *    are copied verbatim and only the last one is expanded,
 *    e.g. 종각 => 종(각|가[가-깋]).
 *  - otherwise: every leading code point in U+3130..U+318F whose
 *    hangul_to_jamo() result starts with a lead in U+1100..U+1112 becomes
 *    "(jamo|range)", e.g. ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣]); the last code
 *    point is then expanded as in the first mode.
 *
 * Range fragments are produced by hangul_regex_range(), which is defined
 * outside this block and receives $use_unicode unchanged.
 *
 * @param string $str         UTF-8 search text.
 * @param mixed  $lastchar    See the mode description above.
 * @param bool   $use_unicode Passed through to hangul_regex_range().
 * @return string The rule; an empty string for empty input.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);

    // Empty input: without this guard the else-branch below would
    // array_pop() an empty array and treat the resulting null as the
    // "last character", putting a bogus code point into the rule.
    if ($len == 0) {
        return $rule;
    }

    if ($lastchar and $len > 1) {
        // Keep the leading characters as-is and expand only the last one.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    } else {
        // Expand consonant-only letters in the leading part:
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])
        // The last character is set aside and handled by the loop below.
        $last = array_pop($val);
        $len = sizeof($val);

        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];

            if ($ch >= 0x3130 and $ch <= 0x318f) {
                $wch = hangul_to_jamo(array($ch));
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Range from (lead + U+1161) to (lead + U+1175 + U+11C2).
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }

                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }

            $rule .= $crule;
        }

        // Continue with the last character only.
        $val = array($last);
        $len = sizeof($val);
    }

    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();

        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: goes into the rule unchanged.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }

        $wlen = sizeof($wch);
        $ket = '';

        if ($wlen >= 3) {
            // Syllable with a final consonant, e.g. 각 => (각|가[가-깋]):
            // either the syllable itself, or the syllable without its final
            // followed by any syllable that starts with that consonant.
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);

            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);

            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposed into several jamo: no range is added.
                $rule .= ')';
                continue;
            }
        }

        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Lone leading consonant: range from (lead + U+1161) to
                // (lead + U+1175 + U+11C2).
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } elseif ($wlen == 2) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Lead + vowel: range from the bare syllable to the same
                // syllable with the final U+11C2.
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch);
                continue;
            }
        }

        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }

    return $rule;
}