コード例 #1
0
ファイル: unicode.php プロジェクト: reviforks/moniwiki
/**
 * Build a regular-expression fragment that matches a (possibly partially
 * typed) Hangul search string.
 *
 * Characters outside the Hangul Syllables block (U+AC00..U+D7A3) and the
 * Hangul Compatibility Jamo block (U+3130..U+318F) are copied to the result
 * verbatim. They are NOT regex-escaped here, so callers that accept arbitrary
 * input must take care of regex metacharacters themselves.
 *
 * Each expanded Hangul character becomes a range of syllables that could
 * still be produced by typing more jamo:
 *   - a lone initial consonant  => every syllable starting with it,
 *   - a syllable without final  => that syllable with any (or no) final,
 *   - a syllable with a final   => the syllable itself, or the syllable
 *                                  without the final followed by a syllable
 *                                  that starts with that consonant,
 *                                  e.g. 종각 => 종(각|가[가-깋]).
 *
 * @param string $str         UTF-8 encoded search string.
 * @param mixed  $lastchar    When truthy and $str has more than one code
 *                            point, only the last character is expanded and
 *                            everything before it is copied literally.
 *                            Otherwise every character is expanded.
 * @param bool   $use_unicode true: emit character classes made of literal
 *                            UTF-8 characters ("[가-깋]"); presumably meant
 *                            for patterns run in PCRE UTF-8 mode - confirm
 *                            at the call site.
 *                            false: emit byte-level "\xHH" escapes instead.
 *
 * @return string Regex fragment without delimiters.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // Only the last character is expanded; the prefix is kept literally.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: copy the character as is.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // Syllable with a final consonant: 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            // Re-interpret the final consonant as the start of the next syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposed into several jamo: no range is built.
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: from <initial + U+1161> up to
                // <initial + U+1175 + U+11C2>.
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Initial + vowel: the bare syllable up to the same
                    // syllable with final U+11C2.
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        if (!is_string($ustart) or !is_string($uend)) {
            // No range could be derived (e.g. the decomposition came back
            // empty). Close an open group, or fall back to the literal
            // character, instead of interpolating an array into the rule.
            $rule .= ($ket != '') ? $ket : unicode_to_utf8(array($ch));
            continue;
        }
        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            // Byte-level rule. Hangul syllables are three bytes long in UTF-8.
            $s0 = ord($ustart[0]);
            $s1 = ord($ustart[1]);
            $s2 = ord($ustart[2]);
            $e0 = ord($uend[0]);
            $e1 = ord($uend[1]);
            $e2 = ord($uend[2]);
            if ($s0 == $e0) {
                // Common case: both ends share the lead byte.
                $rule .= sprintf("\\x%02X", $s0);
                if ($s1 == $e1) {
                    $crule = sprintf("\\x%02X[\\x%02X-\\x%02X]", $s1, $s2, $e2);
                } else {
                    $subrule = array();
                    $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $s1, $s2);
                    if ($s1 + 1 == $e1 - 1) {
                        $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $s1 + 1);
                    } elseif ($s1 + 1 != $e1) {
                        $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $s1 + 1, $e1 - 1);
                    }
                    $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", $e1, $e2);
                    $crule = '(' . implode('|', $subrule) . ')';
                }
            } else {
                // The range crosses a lead-byte boundary (for instance
                // U+AE4C..U+B097 is EA B9 8C .. EB 82 97), so every
                // alternative has to spell out all three bytes.
                $subrule = array();
                $subrule[] = sprintf("\\x%02X\\x%02X[\\x%02X-\\xBF]", $s0, $s1, $s2);
                if ($s1 < 0xBF) {
                    $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF][\\x80-\\xBF]", $s0, $s1 + 1);
                }
                if ($s0 + 1 < $e0) {
                    $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF][\\x80-\\xBF]", $s0 + 1, $e0 - 1);
                }
                if ($e1 > 0x80) {
                    $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X][\\x80-\\xBF]", $e0, $e1 - 1);
                }
                $subrule[] = sprintf("\\x%02X\\x%02X[\\x80-\\x%02X]", $e0, $e1, $e2);
                $crule = '(' . implode('|', $subrule) . ')';
            }
        }
        $rule .= $crule . $ket;
    }
    return $rule;
}
コード例 #2
0
ファイル: stemmer.ko.php プロジェクト: ahastudio/moniwiki
 /**
  * Rewrite the last syllable of a word so that its final consonant is
  * written as a separate compatibility jamo.
  *
  * The word is returned unchanged unless its last character is Hangul,
  * decomposes into at least three jamo, and the third one is one of
  * U+11AB, U+11AF, U+11B7, U+11B8 or U+11BB.
  *
  * @param string $word     UTF-8 encoded word.
  * @param mixed  $lastchar 1: keep initial + vowel composed as one syllable;
  *                         anything else: emit them as separate
  *                         compatibility jamo as well.
  *
  * @return string The original word or the rewritten form.
  */
 function getWordRule($word, $lastchar = 1)
 {
     $codes = utf8_to_unicode($word);
     if (sizeof($codes) < 1) {
         return $word;
     }

     // Split off the last code point; $codes keeps the untouched prefix.
     $tail = array_pop($codes);
     $isSyllable = $tail >= 0xac00 && $tail <= 0xd7a3;
     $isCompatJamo = $tail >= 0x3130 && $tail <= 0x318f;
     if (!$isSyllable && !$isCompatJamo) {
         return $word;
     }

     $parts = hangul_to_jamo(array($tail));
     if (sizeof($parts) < 3) {
         return $word;
     }

     // Only these final consonants are split off.
     $splittable = array(0x11ab, 0x11af, 0x11b7, 0x11b8, 0x11bb);
     if (!in_array($parts[2], $splittable)) {
         return $word;
     }

     $pieces = array(unicode_to_utf8($codes));
     if ($lastchar == 1) {
         $pieces[] = unicode_to_utf8(jamo_to_syllable(array($parts[0], $parts[1])));
     } else {
         $pieces[] = unicode_to_utf8(array(hangul_choseong_to_cjamo($parts[0])));
         $pieces[] = unicode_to_utf8(array(hangul_jungseong_to_cjamo($parts[1])));
     }
     $pieces[] = unicode_to_utf8(array(hangul_jongseong_to_cjamo($parts[2])));

     return implode('', $pieces);
 }
コード例 #3
0
ファイル: unicode.php プロジェクト: ahastudio/moniwiki
/**
 * Build a regular-expression fragment that matches a (possibly partially
 * typed) Hangul search string.
 *
 * Two modes, selected by $lastchar:
 *   - truthy and more than one code point: the prefix is copied literally
 *     and only the last character is expanded;
 *   - otherwise: every Hangul Compatibility Jamo (U+3130..U+318F) before the
 *     last character is expanded as well, e.g. ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣]),
 *     and then the last character is expanded.
 *
 * Expansion of the last character:
 *   - a lone initial consonant  => every syllable starting with it,
 *   - a syllable without final  => that syllable with any (or no) final,
 *   - a syllable with a final   => the syllable itself, or the syllable
 *                                  without the final followed by a syllable
 *                                  that starts with that consonant,
 *                                  e.g. 종각 => 종(각|가[가-깋]).
 *
 * Non-Hangul characters are copied verbatim and are NOT regex-escaped here.
 *
 * @param string $str         UTF-8 encoded search string.
 * @param mixed  $lastchar    See the mode description above.
 * @param bool   $use_unicode Passed through to hangul_regex_range(), which
 *                            builds the actual range expression.
 *
 * @return string Regex fragment without delimiters; '' for empty input.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($len == 0) {
        // Nothing to expand. Without this guard the branch below would pop
        // null from the empty array and treat it as the last character.
        return $rule;
    }
    if ($lastchar and $len > 1) {
        // Only the last character is expanded; the prefix is kept literally.
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    } else {
        // Expand consonant-only letters in the prefix:
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])
        // The last character is set aside for the main loop below.
        $last = array_pop($val);
        $len = sizeof($val);
        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];
            if ($ch >= 0x3130 and $ch <= 0x318f) {
                $wch = hangul_to_jamo(array($ch));
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // From <initial + U+1161> up to <initial + U+1175 + U+11C2>.
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
                // Either the jamo itself or any syllable starting with it.
                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }
            $rule .= $crule;
        }
        // Hand the last character over to the main loop.
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // Not Hangul: copy the character as is.
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        $ket = '';
        if ($wlen >= 3) {
            // Syllable with a final consonant: 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            // Re-interpret the final consonant as the start of the next syllable.
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // The final decomposed into several jamo: no range is built.
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                // Initial consonant only: from <initial + U+1161> up to
                // <initial + U+1175 + U+11C2>.
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // Initial + vowel: the bare syllable up to the same
                    // syllable with final U+11C2.
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }
    return $rule;
}