Esempio n. 1
0
/**
 * Convert a GBK/GB2312 encoded byte string to UTF-8.
 *
 * The GB-to-Unicode mapping is loaded lazily into the global $CODETABLE from
 * CODETABLEDIR . 'gb-unicode.table' the first time the table is empty. For
 * every table line, the 6 characters at offset 0 (parsed as hex) become the
 * key and the 6 characters at offset 7 become the value.
 *
 * @param string $gbstr GBK encoded input
 * @return string converted string; bytes <= 0x80 are copied through unchanged
 */
function gbk_to_utf8($gbstr)
{
    global $CODETABLE;
    if (empty($CODETABLE)) {
        $filename = CODETABLEDIR . 'gb-unicode.table';
        $fp = fopen($filename, 'rb');
        // fopen() already reports the failure; never hand false to fgets()/fclose().
        if ($fp !== false) {
            while ($l = fgets($fp, 15)) {
                $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6);
            }
            fclose($fp);
        }
    }
    $gbstr = (string) $gbstr;
    $len = strlen($gbstr);
    $ret = '';
    $pos = 0;
    // Walk the input by offset. Testing the remaining string for truthiness
    // would stop early as soon as the remainder is the single character "0".
    while ($pos < $len) {
        if (ord($gbstr[$pos]) > 0x80) {
            // Lead byte of a double-byte character: consume two bytes.
            $thisW = substr($gbstr, $pos, 2);
            $pos += 2;
            $utf8 = '';
            // Errors stay suppressed on purpose: unmapped pairs and the table's
            // value format would otherwise raise notices for every character.
            @($utf8 = unicode_to_utf8(hexdec($CODETABLE[hexdec(bin2hex($thisW)) - 0x8080])));
            if ($utf8 != '') {
                // The result is consumed as a run of 3-digit decimal byte values.
                for ($i = 0; $i < strlen($utf8); $i += 3) {
                    $ret .= chr(substr($utf8, $i, 3));
                }
            }
        } else {
            $ret .= $gbstr[$pos];
            $pos++;
        }
    }
    return $ret;
}
/**
 * Build a UTF-8 string from a list of decimal Unicode code points.
 *
 * @param string $strDecVal one decimal code point, or several separated by
 *                          commas; space characters are ignored
 * @return string the unicode_to_utf8() results for each code point, concatenated
 */
function convToUTF8($strDecVal)
{
    $compact = str_replace(' ', '', $strDecVal);
    $pieces = array();
    foreach (explode(',', $compact) as $decimal) {
        $pieces[] = unicode_to_utf8(intval($decimal));
    }
    return implode('', $pieces);
}
Esempio n. 3
0
/**
 * Decode a Punycode-style encoded label (see RFC 3492) into UTF-8.
 *
 * The algorithm parameters are read from the globals $base, $tmin, $tmax,
 * $initial_bias, $initial_n, $prefix and $delim; digit values come from
 * decode_digit() and bias adaptation from adapt().
 *
 * @param string $text encoded label
 * @return string decoded UTF-8 string. $text is returned unchanged when it does
 *                not start with $prefix, or when the encoded part ends in the
 *                middle of a variable-length integer (truncated input).
 */
function decode($text)
{
    global $base, $tmin, $tmax, $skew, $damp, $initial_bias, $initial_n, $prefix, $delim;
    $n = $initial_n;
    $i = 0;
    $bias = $initial_bias;
    $output = array();
    $prefix_len = strlen($prefix);
    if (substr($text, 0, $prefix_len) != $prefix) {
        return $text;
    }
    $encoded = $text;
    // Strip only the leading prefix; str_replace() would also delete later
    // occurrences of the prefix inside the label.
    $text = (string) substr($text, $prefix_len);
    // Everything before the last delimiter is literal basic code points.
    $delim_pos = strrpos($text, $delim);
    if ($delim_pos !== false) {
        for ($j = 0; $j < $delim_pos; $j++) {
            array_push($output, $text[$j]);
        }
        $text = (string) substr($text, $delim_pos + 1);
    }
    while ($text !== '') {
        $oldi = $i;
        $w = 1;
        // Decode one generalized variable-length integer into $i.
        for ($k = $base; 1; $k = $k + $base) {
            if ($text === '') {
                // Input ended in the middle of an integer: malformed, give up.
                return $encoded;
            }
            $digit = decode_digit($text[0]);
            $text = (string) substr($text, 1);
            $i = $i + $digit * $w;
            if ($k <= $bias + $tmin) {
                $t = $tmin;
            } elseif ($k >= $bias + $tmax) {
                $t = $tmax;
            } else {
                $t = $k - $bias;
            }
            if ($digit < $t) {
                break;
            }
            $w = $w * ($base - $t);
        }
        $slots = sizeof($output) + 1;
        $bias = adapt($i - $oldi, $slots, $oldi == 0);
        $n = $n + floor($i / $slots);
        $i = $i % $slots;
        // Insert the decoded code point at position $i.
        array_splice($output, $i, 0, array(unicode_to_utf8($n)));
        $i++;
    }
    return implode($output);
}
Esempio n. 4
0
/**
 * Drop every character of a UTF-8 string whose code point is not listed in
 * the global $UTF8_ALPHA_CHARS.
 *
 * @param string $string UTF-8 input
 * @return string result of unicode_to_utf8() on the remaining code points
 */
function utf8_keepalphanum($string)
{
    global $UTF8_ALPHA_CHARS;
    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);
    for ($idx = 0; $idx < $total; $idx++) {
        if (in_array($codepoints[$idx], $UTF8_ALPHA_CHARS)) {
            continue;
        }
        // Not whitelisted: remove it (the remaining keys are not renumbered).
        unset($codepoints[$idx]);
    }
    return unicode_to_utf8($codepoints);
}
Esempio n. 5
0
/**
 * Drop every character of a UTF-8 string whose code point is not in the
 * built-in whitelist below.
 *
 * @param string $string UTF-8 input
 * @return string result of unicode_to_utf8() on the remaining code points
 */
function utf8_keepalphanum($string)
{
    // a-z A-Z 0-9 . - _ and space, extended latin chars, Cyrillic and Greek
    static $UTF8_ALPHA_CHARS = array(0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2e, 0x2d, 0x5f, 0x20, 0xc1, 0xe1, 0x106, 0x107, 0xc9, 0xe9, 0xcd, 0xed, 0x139, 0x13a, 0x143, 0x144, 0xd3, 0xf3, 0x154, 0x155, 0x15a, 0x15b, 0xda, 0xfa, 0xdd, 0xfd, 0x179, 0x17a, 0x10f, 0x13d, 0x13e, 0x165, 0x102, 0x103, 0x11e, 0x11f, 0x16c, 0x16d, 0x10c, 0x10d, 0x10e, 0x11a, 0x11b, 0x147, 0x148, 0x158, 0x159, 0x160, 0x161, 0x164, 0x17d, 0x17e, 0xc7, 0xe7, 0x122, 0x123, 0x136, 0x137, 0x13b, 0x13c, 0x145, 0x146, 0x156, 0x157, 0x15e, 0x15f, 0x162, 0x163, 0xc2, 0xe2, 0x108, 0x109, 0xca, 0xea, 0x11c, 0x11d, 0x124, 0x125, 0xce, 0xee, 0x134, 0x135, 0xd4, 0xf4, 0x15c, 0x15d, 0xdb, 0xfb, 0x174, 0x175, 0x176, 0x177, 0xc4, 0xe4, 0xcb, 0xeb, 0xcf, 0xef, 0xd6, 0xf6, 0xdc, 0xfc, 0x178, 0xff, 0x10a, 0x10b, 0x116, 0x117, 0x120, 0x121, 0x130, 0x131, 0x17b, 0x17c, 0x150, 0x151, 0x170, 0x171, 0xc0, 0xe0, 0xc8, 0xe8, 0xcc, 0xec, 0xd2, 0xf2, 0xd9, 0xf9, 0x1a0, 0x1a1, 0x1af, 0x1b0, 0x100, 0x101, 0x112, 0x113, 0x12a, 0x12b, 0x14c, 0x14d, 0x16a, 0x16b, 0x104, 0x105, 0x118, 0x119, 0x12e, 0x12f, 0x172, 0x173, 0xc5, 0xe5, 0x16e, 0x16f, 0x110, 0x111, 0x126, 0x127, 0x141, 0x142, 0xd8, 0xf8, 0xc3, 0xe3, 0xd1, 0xf1, 0xd5, 0xf5, 0xc6, 0xe6, 0x152, 0x153, 0xd0, 0xf0, 0xde, 0xfe, 0xdf, 0x17f, 0x391, 0x392, 0x393, 0x394, 0x395, 0x396, 0x397, 0x398, 0x399, 0x39a, 0x39b, 0x39c, 0x39d, 0x39e, 0x39f, 0x3a0, 0x3a1, 0x3a3, 0x3a4, 0x3a5, 0x3a6, 0x3a7, 0x3a8, 0x3a9, 0x386, 0x388, 0x389, 0x38a, 0x38c, 0x38e, 0x38f, 0x3aa, 0x3ab, 0x3b1, 0x3b2, 0x3b3, 0x3b4, 0x3b5, 0x3b6, 0x3b7, 0x3b8, 0x3b9, 0x3ba, 0x3bb, 0x3bc, 0x3bd, 0x3be, 0x3bf, 0x3c0, 0x3c1, 0x3c3, 0x3c2, 0x3c4, 0x3c5, 0x3c6, 0x3c7, 0x3c8, 0x3c9,
        0x3ac, 0x3ad, 0x3ae, 0x3af, 0x3cc, 0x3cd, 0x3ce, 0x3ca, 0x3cb, 0x390, 0x3b0, 0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x401, 0x416, 0x417, 0x406, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x40e, 0x424, 0x425, 0x426, 0x427, 0x428, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x451, 0x436, 0x437, 0x456, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e, 0x43f, 0x440, 0x441, 0x442, 0x443, 0x45e, 0x444, 0x445, 0x446, 0x447, 0x448, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f, 0x418, 0x429, 0x42a, 0x438, 0x449, 0x44a, 0x403, 0x405, 0x408, 0x409, 0x40a, 0x40c, 0x40f, 0x453, 0x455, 0x458, 0x459, 0x45a, 0x45c, 0x45f, 0x402, 0x40b, 0x452, 0x45b, 0x490, 0x404, 0x407, 0x491, 0x454, 0x457, 0x4e8, 0x4ae, 0x4e9, 0x4af);
    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);
    for ($idx = 0; $idx < $total; $idx++) {
        if (in_array($codepoints[$idx], $UTF8_ALPHA_CHARS)) {
            continue;
        }
        // Not whitelisted: remove it (the remaining keys are not renumbered).
        unset($codepoints[$idx]);
    }
    return unicode_to_utf8($codepoints);
}
 /**
  * Upper-case a UTF-8 string using a fixed lower-to-upper code point table
  * (Latin, Greek and Cyrillic letters).
  *
  * @param string $string UTF-8 input
  * @return string|false the converted string, or false when utf8_to_unicode()
  *                      yields an empty/falsy result (this includes '')
  */
 function utf8_strtoupper($string)
 {
     static $lower_to_upper;
     if ($lower_to_upper == null) {
         // Built once on first use; keys are lower-case code points.
         $lower_to_upper = array(0x61 => 0x41, 0x3c6 => 0x3a6, 0x163 => 0x162, 0xe5 => 0xc5, 0x62 => 0x42, 0x13a => 0x139, 0xe1 => 0xc1, 0x142 => 0x141, 0x3cd => 0x38e, 0x101 => 0x100, 0x491 => 0x490, 0x3b4 => 0x394, 0x15b => 0x15a, 0x64 => 0x44, 0x3b3 => 0x393, 0xf4 => 0xd4, 0x44a => 0x42a, 0x439 => 0x419, 0x113 => 0x112, 0x43c => 0x41c, 0x15f => 0x15e, 0x144 => 0x143, 0xee => 0xce, 0x45e => 0x40e, 0x44f => 0x42f, 0x3ba => 0x39a, 0x155 => 0x154, 0x69 => 0x49, 0x73 => 0x53, 0x1e1f => 0x1e1e, 0x135 => 0x134, 0x447 => 0x427, 0x3c0 => 0x3a0, 0x438 => 0x418, 0xf3 => 0xd3, 0x440 => 0x420, 0x454 => 0x404, 0x435 => 0x415, 0x449 => 0x429, 0x14b => 0x14a, 0x431 => 0x411, 0x459 => 0x409, 0x1e03 => 0x1e02, 0xf6 => 0xd6, 0xf9 => 0xd9, 0x6e => 0x4e, 0x451 => 0x401, 0x3c4 => 0x3a4, 0x443 => 0x423, 0x15d => 0x15c, 0x453 => 0x403, 0x3c8 => 0x3a8, 0x159 => 0x158, 0x67 => 0x47, 0xe4 => 0xc4, 0x3ac => 0x386, 0x3ae => 0x389, 0x167 => 0x166, 0x3be => 0x39e, 0x165 => 0x164, 0x117 => 0x116, 0x109 => 0x108, 0x76 => 0x56, 0xfe => 0xde, 0x157 => 0x156, 0xfa => 0xda, 0x1e61 => 0x1e60, 0x1e83 => 0x1e82, 0xe2 => 0xc2, 0x119 => 0x118, 0x146 => 0x145, 0x70 => 0x50, 0x151 => 0x150, 0x44e => 0x42e, 0x129 => 0x128, 0x3c7 => 0x3a7, 0x13e => 0x13d, 0x442 => 0x422, 0x7a => 0x5a, 0x448 => 0x428, 0x3c1 => 0x3a1, 0x1e81 => 0x1e80, 0x16d => 0x16c, 0xf5 => 0xd5, 0x75 => 0x55, 0x177 => 0x176, 0xfc => 0xdc, 0x1e57 => 0x1e56, 0x3c3 => 0x3a3, 0x43a => 0x41a, 0x6d => 0x4d, 0x16b => 0x16a, 0x171 => 0x170, 0x444 => 0x424, 0xec => 0xcc, 0x169 => 0x168, 0x3bf => 0x39f, 0x6b => 0x4b, 0xf2 => 0xd2, 0xe0 => 0xc0, 0x434 => 0x414, 0x3c9 => 0x3a9, 0x1e6b => 0x1e6a, 0xe3 => 0xc3, 0x44d => 0x42d, 0x436 => 0x416, 0x1a1 => 0x1a0, 0x10d => 0x10c, 0x11d => 0x11c, 0xf0 => 0xd0, 0x13c => 0x13b, 0x45f => 0x40f, 0x45a => 0x40a, 0xe8 => 0xc8, 0x3c5 => 0x3a5, 0x66 => 0x46, 0xfd => 0xdd, 0x63 => 0x43, 0x21b => 0x21a, 0xea => 0xca, 0x3b9 => 0x399, 0x17a => 0x179, 0xef => 0xcf, 0x1b0 => 0x1af, 0x65 => 0x45, 0x3bb => 0x39b, 0x3b8 =>
             0x398, 0x3bc => 0x39c, 0x45c => 0x40c, 0x43f => 0x41f, 0x44c => 0x42c, 0xfe => 0xde, 0xf0 => 0xd0, 0x1ef3 => 0x1ef2, 0x68 => 0x48, 0xeb => 0xcb, 0x111 => 0x110, 0x433 => 0x413, 0x12f => 0x12e, 0xe6 => 0xc6, 0x78 => 0x58, 0x161 => 0x160, 0x16f => 0x16e, 0x3b1 => 0x391, 0x457 => 0x407, 0x173 => 0x172, 0xff => 0x178, 0x6f => 0x4f, 0x43b => 0x41b, 0x3b5 => 0x395, 0x445 => 0x425, 0x121 => 0x120, 0x17e => 0x17d, 0x17c => 0x17b, 0x3b6 => 0x396, 0x3b2 => 0x392, 0x3ad => 0x388, 0x1e85 => 0x1e84, 0x175 => 0x174, 0x71 => 0x51, 0x437 => 0x417, 0x1e0b => 0x1e0a, 0x148 => 0x147, 0x105 => 0x104, 0x458 => 0x408, 0x14d => 0x14c, 0xed => 0xcd, 0x79 => 0x59, 0x10b => 0x10a, 0x3ce => 0x38f, 0x72 => 0x52, 0x430 => 0x410, 0x455 => 0x405, 0x452 => 0x402, 0x127 => 0x126, 0x137 => 0x136, 0x12b => 0x12a, 0x3af => 0x38a, 0x44b => 0x42b, 0x6c => 0x4c, 0x3b7 => 0x397, 0x125 => 0x124, 0x219 => 0x218, 0xfb => 0xdb, 0x11f => 0x11e, 0x43e => 0x41e, 0x1e41 => 0x1e40, 0x3bd => 0x39d, 0x107 => 0x106, 0x3cb => 0x3ab, 0x446 => 0x426, 0xfe => 0xde, 0xe7 => 0xc7, 0x3ca => 0x3aa, 0x441 => 0x421, 0x432 => 0x412, 0x10f => 0x10e, 0xf8 => 0xd8, 0x77 => 0x57, 0x11b => 0x11a, 0x74 => 0x54, 0x6a => 0x4a, 0x45b => 0x40b, 0x456 => 0x406, 0x103 => 0x102, 0x3bb => 0x39b, 0xf1 => 0xd1, 0x43d => 0x41d, 0x3cc => 0x38c, 0xe9 => 0xc9, 0xf0 => 0xd0, 0x457 => 0x407, 0x123 => 0x122);
     }
     $codepoints = utf8_to_unicode($string);
     if (!$codepoints) {
         return false;
     }
     $idx = 0;
     while ($idx < count($codepoints)) {
         $current = $codepoints[$idx];
         if (isset($lower_to_upper[$current])) {
             $codepoints[$idx] = $lower_to_upper[$current];
         }
         $idx++;
     }
     return unicode_to_utf8($codepoints);
 }
Esempio n. 7
0
 /**
  * UTF-16BE to UTF-8 conversion.
  *
  * The input is unpacked as a sequence of unsigned 16-bit big-endian values and
  * each value is handed to unicode_to_utf8() as one code point, so this may
  * effectively be UCS-2 rather than full UTF-16 (surrogate pairs are not
  * combined here).
  *
  * @param string $str UTF-16BE bytes (taken by reference, not modified here)
  * @return string result of unicode_to_utf8() on the unpacked values
  */
 protected function utf16be_to_utf8(&$str)
 {
     return unicode_to_utf8(unpack('n*', $str));
 }
Esempio n. 8
0
/**
 * Convert a string of 2-byte units to the ANSI encoding produced by Utf8ToWin().
 *
 * @param string $string input bytes; its length must be even
 * @return string converted text, or '' when the input is empty or has an odd
 *                number of bytes
 */
function unicode_to_ansi($string)
{
    $byte_count = strlen($string);
    // Reject empty input and input that cannot consist of whole 2-byte units.
    if ($byte_count == 0 || $byte_count % 2 != 0) {
        return '';
    }
    return Utf8ToWin(unicode_to_utf8($string));
    // alternative
    //return mb_convert_encoding($string, "cp1251", "UTF-16LE");
}
Esempio n. 9
0
/**
 * Strip (or replace) special, non-alphanumeric characters in a UTF-8 string.
 *
 * The characters removed are those listed in the global $UTF8_SPECIAL_CHARS
 * plus the bytes 0x00 to 0x19, which are not part of that list. Any special
 * character that appears in $repl must also be passed in $keep, otherwise it
 * will not survive.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string         The string with the special chars replaced
 */
function utf8_stripspecials($string, $repl = '', $keep = '')
{
    global $UTF8_SPECIAL_CHARS;
    $strip = $UTF8_SPECIAL_CHARS;
    if ($keep != '') {
        // Exempt the code points the caller wants to keep.
        $strip = array_diff($strip, utf8_to_unicode($keep));
    }
    // Turn the code point list into an escaped character-class body.
    $class_body = preg_quote(unicode_to_utf8($strip), '/');
    $pattern = '/[\\x00-\\x19' . $class_body . ']/u';
    return preg_replace($pattern, $repl, $string);
}
Esempio n. 10
0
/**
 * Convert a GBK/GB2312 encoded byte string to UTF-8.
 *
 * The GB-to-Unicode mapping is read from
 * EXTENSION_DIR . 'encoding' . DIRECTORY_SEPARATOR . 'gb-unicode.table'.
 * It is parsed once per request and only when the input actually contains a
 * byte above 0x80. For every table line, the 6 characters at offset 0 (parsed
 * as hex) become the key and the 6 characters at offset 7 become the value.
 *
 * @param string $gbstr GBK encoded input
 * @return string converted string; bytes <= 0x80 are copied through unchanged
 */
function gbk_to_utf8($gbstr)
{
    // Parsed table, kept between calls so the file is not re-read every time.
    static $table = null;
    $gbstr = (string) $gbstr;
    // Bytes <= 0x80 are copied verbatim, so such input needs no table at all.
    if (!preg_match('/[\x81-\xFF]/', $gbstr)) {
        return $gbstr;
    }
    if ($table === null) {
        $filename = EXTENSION_DIR . 'encoding' . DIRECTORY_SEPARATOR . 'gb-unicode.table';
        $fp = fopen($filename, 'rb');
        // fopen() already reports the failure; never hand false to fgets()/fclose().
        // The cache stays unset in that case so a later call can retry.
        if ($fp !== false) {
            $table = array();
            while ($l = fgets($fp, 15)) {
                $table[hexdec(substr($l, 0, 6))] = substr($l, 7, 6);
            }
            fclose($fp);
        }
    }
    $CODETABLE = $table === null ? array() : $table;
    $len = strlen($gbstr);
    $ret = '';
    $pos = 0;
    // Walk the input by offset. Testing the remaining string for truthiness
    // would stop early as soon as the remainder is the single character "0".
    while ($pos < $len) {
        if (ord($gbstr[$pos]) > 0x80) {
            // Lead byte of a double-byte character: consume two bytes.
            $thisW = substr($gbstr, $pos, 2);
            $pos += 2;
            $utf8 = '';
            // Errors stay suppressed on purpose: unmapped pairs and the table's
            // value format would otherwise raise notices for every character.
            @($utf8 = unicode_to_utf8(hexdec($CODETABLE[hexdec(bin2hex($thisW)) - 0x8080])));
            if ($utf8 != '') {
                // The result is consumed as a run of 3-digit decimal byte values.
                for ($i = 0; $i < strlen($utf8); $i += 3) {
                    $ret .= chr(substr($utf8, $i, 3));
                }
            }
        } else {
            $ret .= $gbstr[$pos];
            $pos++;
        }
    }
    return $ret;
}
Esempio n. 11
0
 /**
  *  Decode a 'safe' encoded ASCII filename back to UTF-8.
  *
  *  The input is lower-cased, converted to code points by
  *  self::safe_to_unicode() and then encoded with unicode_to_utf8().
  *
  *  Decoding process (as described for safe_to_unicode):
  *    - split the string into substrings at any occurrence of pre or post
  *      indicator characters
  *    - check the first character of each substring
  *      - if it is not a pre_indicator character
  *        - if the previous character was converted, skip over the
  *          post_indicator character
  *        - copy codepoint values of the remaining characters to the output array
  *        - clear any converted flag
  *        (continue to next substring)
  *      - else (it is a pre_indicator character)
  *        - if the string length is 1, copy the post_indicator character to
  *          the output array
  *          (continue to next substring)
  *        - else (string length > 1)
  *          - skip the pre_indicator character and convert the remaining
  *            string from base36 to base10
  *          - increase the codepoint value for non-printable ASCII characters
  *            (add 0x20)
  *          - append the codepoint to the output array
  *          (continue to next substring)
  *
  * @param    string    $filename     a 'safe' encoded ASCII string,
  * @return   string    decoded utf8 representation of $filename
  *
  * @author   Christopher Smith <*****@*****.**>
  */
 public function decode($filename)
 {
     $lowered = strtolower($filename);
     $codepoints = self::safe_to_unicode($lowered);
     return unicode_to_utf8($codepoints);
 }
Esempio n. 12
0
 // Replace every '$'-prefixed symbol in $buffer using the $translation map.
 // Each pass scans all symbols; the outer loop repeats while a '$' is left.
 //$i = 0;
 while (strpos($buffer, '$') !== false) {
     //echo $i++ . ": strpos=". (string)strpos($buffer, '$') ."\nbuffer: $buffer\n";
     $replaced = false;
     foreach ($translation as $symbol => $character) {
         $sym_pos = strpos($buffer, $symbol);
         if ($sym_pos !== false) {
             $sym_length = strlen($symbol);
             // text before the symbol is kept unchanged
             $piece1 = substr($buffer, 0, $sym_pos);
             if ($character['switch']) {
                 // the character after the special character needs to come before it
                 $partnerchar = utf8_encode($buffer[$sym_pos + $sym_length]);
                 $piece2 = unicode_to_utf8(array_merge(utf8_to_unicode($partnerchar), $character['unicode']));
                 // the partner character was consumed too, so skip one more byte
                 $piece3start = $sym_pos + $sym_length + 1;
             } else {
                 $piece2 = unicode_to_utf8($character['unicode']);
                 $piece3start = $sym_pos + $sym_length;
             }
             // normalize (NFKC), then convert the replacement from UTF-8 to ISO-8859-1
             $piece2 = utf8_decode(UtfNormal::NFKC($piece2));
             // strip out any ? characters, which are characters not existing in ISO-8859-1
             $piece2 = str_replace('?', '', $piece2);
             $piece3 = substr($buffer, $piece3start);
             $buffer = $piece1 . $piece2 . $piece3;
             $replaced = true;
             // NOTE(review): this is the last statement of the foreach body, so
             // the continue has no effect; only the first occurrence of each
             // symbol is replaced per pass.
             continue;
         }
     }
     if (!$replaced) {
         // we've encountered some character that we have no translation for
         echo "unable to find a translation to transform this buffer, the untranslatable code will be stripped out:\n{$buffer}\n";
         // split once at the first '$' followed by optional digits
         $pieces = preg_split('/\\$\\d*/', $buffer, 2);
<?php

// Downloads the Unicode EmojiSources table and writes it out as JSON.
// Each output entry holds the code point sequence, its unicode_to_utf8()
// conversion and the three carrier Shift-JIS codes from the source line.
define('UNICODE_EMOJI_PATH', 'http://unicode.org/Public/UNIDATA/EmojiSources.txt');
define('JSON_WRITE_PATH', 'EmojiSources.json');
$contents = file_get_contents(UNICODE_EMOJI_PATH);
// Data line format: <code point(s)>;<docomo>;<kddi>;<softbank>
$pattern = '/^([0-9A-F\\s]+);([0-9A-F]+)?;([0-9A-F]+)?;([0-9A-F]+)?$/m';
$emojiList = array();
if ($contents === false) {
    // Report a failed download as such instead of as a parse error.
    printf("[ERROR] Failed read file. [%s]\n", UNICODE_EMOJI_PATH);
} elseif (preg_match_all($pattern, $contents, $matches)) {
    foreach ($matches[1] as $i => $sequence) {
        $unicode = trim($sequence);
        // A sequence may hold several space-separated code points; convert
        // every one of them, not just the first two.
        $utf8hex = '';
        foreach (explode(' ', $unicode) as $codepoint) {
            $utf8hex .= unicode_to_utf8($codepoint);
        }
        $emojiList[] = array(
            'unicode' => $unicode,
            'utf8hex' => $utf8hex,
            'sjis_docomo' => $matches[2][$i],
            'sjis_kddi' => $matches[3][$i],
            'sjis_softbank' => $matches[4][$i],
        );
    }
    $jsonData = json_encode($emojiList);
    // Only announce success when the file was really written.
    if ($jsonData === false || file_put_contents(JSON_WRITE_PATH, $jsonData) === false) {
        printf("[ERROR] Failed write file. [%s]\n", JSON_WRITE_PATH);
    } else {
        printf("Created a file. [%s]\n", JSON_WRITE_PATH);
    }
} else {
    printf("[ERROR] Failed parse file. [%s]\n", UNICODE_EMOJI_PATH);
}
Esempio n. 14
0
/**
 * Build a regular-expression fragment that matches a Korean search term and
 * the syllables it can expand to.
 *
 * The string is converted to code points. When $lastchar is truthy and there
 * is more than one code point, everything except the last one is emitted
 * literally and only the last one is expanded. A code point in
 * 0xAC00-0xD7A3 or 0x3130-0x318F is decomposed with hangul_to_jamo(); any
 * other code point is emitted literally.
 *
 * @param string $str         UTF-8 search term
 * @param mixed  $lastchar    truthy: expand only the last character
 * @param bool   $use_unicode true: emit a UTF-8 character range "[start-end]";
 *                            false: emit byte-level \xNN escapes instead
 * @return string the regex fragment
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // make a regex using with the last char
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // not Hangul: match the character literally
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        // closing parenthesis to append if a group was opened below
        $ket = '';
        if ($wlen >= 3) {
            // e.g. 종각 => 종(각|가[가-깋])
            // alternatives: the syllable itself, or the syllable without its
            // third jamo followed by whatever that jamo can start
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // the detached jamo decomposes into more than one jamo: no range is built
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            // a single jamo in 0x1100-0x1112: range from jamo+0x1161 to jamo+0x1175+0x11c2
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                // two jamo: range from the bare syllable to the same syllable with 0x11c2 appended
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
            // NOTE(review): any other $wlen (e.g. 0) falls through with $ustart/$uend
            // still being empty arrays - confirm hangul_to_jamo() never returns that.
        }
        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            // byte-wise range; the code indexes bytes 0..2 of $ustart and $uend
            // and emits the first byte of $ustart only
            $rule .= sprintf("\\x%02X", ord($ustart[0]));
            $crule = '';
            if ($ustart[1] == $uend[1]) {
                // same second byte: only the third byte varies
                $crule .= sprintf("\\x%02X", ord($ustart[1]));
                $crule .= sprintf("[\\x%02X-\\x%02X]", ord($ustart[2]), ord($uend[2]));
            } else {
                // second byte differs: first partial block, full middle blocks, last partial block
                $sch = ord($ustart[1]);
                $ech = ord($uend[1]);
                $subrule = array();
                $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $sch, ord($ustart[2]));
                if ($sch + 1 == $ech - 1) {
                    $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $sch + 1);
                } else {
                    if ($sch + 1 != $ech) {
                        $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $sch + 1, $ech - 1);
                    }
                }
                $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", ord($uend[1]), ord($uend[2]));
                $crule .= '(' . implode('|', $subrule) . ')';
            }
        }
        $rule .= $crule . $ket;
    }
    return $rule;
}
 /**
  * Pass-through to unicode_to_utf8().
  *
  * NOTE(review): presumably this is a class method forwarding to the global
  * function of the same name; as a free function it would call itself without
  * end - confirm against the enclosing class.
  *
  * @param mixed $str value forwarded unchanged
  * @return mixed whatever unicode_to_utf8() returns
  */
 function unicode_to_utf8($str)
 {
     return unicode_to_utf8($str);
 }
Esempio n. 16
0
 /**
  * unicode_to_utf8() must encode code point 1048577 (0x100001) as the
  * expected UTF-8 character.
  */
 function test_to_4byte()
 {
     $codepoints = array(1048577);
     $expected = "􀀁";
     $actual = unicode_to_utf8($codepoints);
     $this->assertEqual($actual, $expected);
 }
Esempio n. 17
0
 /**
  * Undo irregular conjugation and contraction at the boundary between a
  * Korean verb stem and its ending.
  *
  * The last syllable of $stem and the first syllable of the ending in
  * $match[1] are decomposed into jamo (hangul_to_jamo), rewritten by the rules
  * below and recomposed (jamo_to_syllable). Syllables may move from the stem
  * to the front of the ending.
  *
  * @param string $stem  UTF-8 stem candidate
  * @param array  $match match array; element 1 holds the UTF-8 ending and is
  *                      overwritten with the adjusted ending (by reference)
  * @return string the adjusted stem in UTF-8, or $stem unchanged when its last
  *                character is not Hangul according to $this->isHangul()
  */
 function verbIrr($stem, &$match)
 {
     // handle the various regular and irregular conjugations
     $ustem = utf8_to_unicode($stem);
     $uend = utf8_to_unicode($match[1]);
     $ch = array_pop($ustem);
     $ed = $uend[0];
     $save = '';
     if ($this->isHangul($ch)) {
         $j = hangul_to_jamo($ch);
         $ej = hangul_to_jamo($ed);
         $sj = sizeof($j);
         if ($sj == 3 and $j[2] == 0x11bb) {
             // final ㅆ: 랐-다, 었-다, 겠-다, 였-다
             if (in_array($j[1], array(0x1161, 0x1165, 0x1166, 0x1167))) {
                 if ($j[0] == 0x1105 and in_array($j[1], array(0x1161, 0x1165, 0x1167))) {
                     // 랐, 렀, 렸 (e.g. 갈렸-다): left as is
                 } elseif (in_array($j[0], array(0x1100, 0x110b, 0x110c))) {
                     // 겠, 았: the whole syllable belongs to the ending
                     array_unshift($uend, $ch);
                     $ch = null;
                 } elseif ($j[1] == 0x1167 and in_array($j[0], array(0x1101, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112))) {
                     // ㅕ contraction: 혔 -> ㅎ+었 -> 히+었
                     $j[1] = 0x1165;
                     $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                     array_unshift($uend, $syll[0]);
                     // 혔 -> 히+었, 폈 -> 피+었
                     $j[1] = 0x1175;
                     $syll = jamo_to_syllable(array($j[0], $j[1]));
                     $ch = $syll[0];
                 } elseif (in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111, 0x1112))) {
                     // 우 irregular: 떴 -> ㄸ + 었
                     $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                     array_unshift($uend, $syll[0]);
                     // ㄸ -> 뜨 (vowel ㅡ)
                     $j[1] = 0x1173;
                     if ($j[0] == 0x1111) {
                         // 펐 -> 푸+었 (vowel ㅜ)
                         $j[1] = 0x116e;
                     }
                     // Keep the recomposed stem syllable (e.g. 쓰); the result
                     // used to be discarded, leaving $ch equal to the ending.
                     $syll = jamo_to_syllable(array($j[0], $j[1]));
                     $ch = $syll[0];
                 }
             } else {
                 if ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                     // 했: stem gets 하, ending gets 였 (해 -> 하 + 여)
                     array_push($ustem, 0xd558);
                     $syll = jamo_to_syllable(array(0x110b, 0x1167, 0x11bb));
                     array_unshift($uend, $syll[0]);
                     $ch = null;
                 } else {
                     // detach the final ㅆ and move it to the ending
                     $syll = jamo_to_syllable(array($j[0], $j[1]));
                     array_unshift($uend, $j[2]);
                     $ch = $syll[0];
                     unset($j[2]);
                 }
             }
             if (!$ch) {
                 // the syllable moved to the ending: continue with the previous one
                 $ch = array_pop($ustem);
                 $j = hangul_to_jamo($ch);
             }
             $ed = $uend[0];
             $ej = hangul_to_jamo($ed);
         } elseif (!empty($j[2]) and in_array($j[2], array(0x11ab, 0x11af, 0x11b8))) {
             // 합-시다   갑-시다   갈-래
             // 하-ㅂ시다 가-ㅂ시다 가-ㄹ래
             if ($j[2] == 0x11af and $ej[0] == 0x1105) {
                 // 르 irregular
                 // 흘-러: 흐르+러
                 unset($j[2]);
                 $syll = jamo_to_syllable($j);
                 // 흐
                 array_push($ustem, $syll[0]);
                 $j[0] = $ej[0];
                 $j[1] = 0x1173;
                 $syll = jamo_to_syllable($j);
                 // 르
                 $ch = $syll[0];
             } else {
                 // move the final consonant to the front of the ending
                 array_unshift($uend, $j[2]);
                 $syll = jamo_to_syllable(array($j[0], $j[1]));
                 $ch = $syll[0];
                 $ed = $j[2];
                 unset($j[2]);
             }
         }
         // ㄷ irregular
         // 들-어 -> 듣-다
         $sj = sizeof($j);
         if ($sj == 3 and $j[2] == 0x11af and in_array($ej[0], array(0x110b, 0x1105))) {
             // a while used as a breakable if: it runs at most once
             while (in_array($ej[1], array(0x1161, 0x1165, 0x1173))) {
                 // 아 어 으
                 // 라 러 르
                 $se = sizeof($ej);
                 if ($se == 3) {
                     // with vowel ㅡ only the endings 은 / 을 qualify; the
                     // haystack must be an array for in_array() to work
                     if ($ej[1] == 0x1173 and !in_array($ej[2], array(0x11ab, 0x11af))) {
                         break;
                     }
                 } else {
                     if ($j[2] == 0x11af and sizeof($ej) == 2 and $ej[0] == 0x1105) {
                         break;
                     }
                 }
                 $syll = jamo_to_syllable(array($j[0], $j[1], 0x11ae));
                 $ch = $syll[0];
                 break;
             }
         }
         // ㅅ irregular
         // * 지-어: 짓-어
         // * 이-어: 잇-어
         if (sizeof($ej) == 2) {
             if ($ej[0] == 0x110b) {
                 $j[2] = 0x11ba;
                 // add final ㅅ
                 $syll = jamo_to_syllable($j);
                 $ch = $syll[0];
                 $sj = 3;
             }
         }
         if ($sj == 2) {
             if (in_array($j[0], array(0x110c)) and in_array($j[1], array(0x116e, 0x1175))) {
                 // 주, 지: belongs to the ending; continue with the previous syllable
                 array_unshift($uend, $ch);
                 $ch = array_pop($ustem);
                 $j = hangul_to_jamo($ch);
             }
             if ($j[1] == 0x1165 and in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111))) {
                 // 꺼, 떠, 써, 퍼: split off 어
                 $syll = jamo_to_syllable(array(0x110b, 0x1165));
                 array_unshift($uend, $syll[0]);
                 if ($j[0] == 0x1111) {
                     $syll = jamo_to_syllable(array($j[0], 0x116e));
                 } else {
                     $syll = jamo_to_syllable(array($j[0], 0x1173));
                 }
                 // e.g. 쓰
                 array_push($ustem, $syll[0]);
                 $ch = array_pop($ustem);
                 $j = hangul_to_jamo($ch);
             }
             // phonological contraction
             if (in_array($j[0], array(0x1105, 0x1112)) and $j[1] == 0x1162) {
                 // ㅎ irregular (ending): 파랗+아서 -> 파라+아서 -> 파래서
                 // 파래-서 -> 파라-아서
                 $j[1] = 0x1161;
                 // 래 -> 라 + 아
                 $syll = jamo_to_syllable($j);
                 $ch = $syll[0];
                 // 아
                 $syll = jamo_to_syllable(array(0x110b, 0x1161));
                 $ed = $syll[0];
                 array_unshift($uend, $ed);
                 $ej[0] = 0x110b;
                 $ej[1] = 0x1161;
             } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                 // NOTE(review): never reached - ㅎ + ㅐ is already handled by
                 // the branch above; confirm which rule is intended for 해.
                 // 해-서 = 하-여서
                 $j[1] = 0x1161;
                 // 해 -> 하 + 여
                 $syll = jamo_to_syllable($j);
                 $ch = $syll[0];
                 // 여
                 $syll = jamo_to_syllable(array(0x110b, 0x1167));
                 $ed = $syll[0];
                 array_unshift($uend, $ed);
                 $ej[0] = 0x110b;
                 $ej[1] = 0x1167;
             } elseif (in_array($j[0], array(0x1105, 0x1109)) and in_array($j[1], array(0x1167))) {
                 // 하셔-서 = 하시-어서
                 // 가려-서 = 가리-어서
                 // vowel ㅣ
                 $j[1] = 0x1175;
                 // ㅕ -> 이-어
                 $syll = jamo_to_syllable($j);
                 $ch = $syll[0];
                 // 어
                 $syll = jamo_to_syllable(array(0x110b, 0x1165));
                 $ed = $syll[0];
                 array_unshift($uend, $ed);
                 $ej[0] = 0x110b;
                 $ej[1] = 0x1165;
             }
             if ($j[0] == 0x1109 and $j[1] == 0x1175) {
                 // 시: honorific marker, belongs to the ending
                 array_unshift($uend, $ch);
                 $ej = $j;
                 $ch = array_pop($ustem);
                 $j = hangul_to_jamo($ch);
             }
             // ㅎ irregular
             if (in_array($j[0], array(0x1105, 0x1106)) and in_array($j[1], array(0x1161, 0x1165))) {
                 // 랗, 렇: restore the final ㅎ
                 $syll = jamo_to_syllable(array($j[0], $j[1], 0x11c2));
                 array_push($ustem, $syll[0]);
                 // nothing left to examine; cleared instead of unset so the
                 // checks below do not read undefined variables
                 $ch = null;
                 $j = array();
             }
         }
         // a while used as a breakable if: it runs at most once
         while ($sj == 2 and isset($j[0], $j[1]) and $j[0] == 0x110b and in_array($j[1], array(0x116a, 0x116e, 0x116f)) and sizeof($ustem) >= 1) {
             // XXX
             // 그리워: 그리우+어 -> 그립+워
             // 와, 우, 워
             $ch1 = array_pop($ustem);
             $jamo = hangul_to_jamo($ch1);
             if (sizeof($jamo) == 2) {
                 if ($jamo[1] != 0x1175) {
                     // add final ㅂ
                     $syll = jamo_to_syllable(array($jamo[0], $jamo[1], 0x11b8));
                     array_push($ustem, $syll[0]);
                 } else {
                     array_push($ustem, $ch1);
                 }
                 array_unshift($uend, $ch);
                 $ch = null;
             } else {
                 array_push($ustem, $ch1);
             }
             break;
         }
         if ($ch) {
             array_push($ustem, $ch);
         }
         $match[1] = unicode_to_utf8($uend);
         return unicode_to_utf8($ustem);
     }
     $match[1] = $save . $match[1];
     return $stem;
 }
 /**
  * Read one typed item from the stream.
  *
  * A type byte is read first and selects how the payload is read. Lists are
  * read recursively until a list-end item is found.
  *
  * @param bool $l set to false when the item read was a list-end marker,
  *                true for every other item (left untouched when the reader
  *                state is already invalid)
  * @return mixed the item value; '' for items without a value (list end,
  *               skipped DateTime/Int64, string lists, unknown types) and
  *               when the reader state is invalid
  */
 function read_data(&$l)
 {
     $item_type = $this->read_byte();
     if (!$this->state) {
         return '';
     }
     $l = true;
     switch ($item_type) {
         case 0:
             // List End
             $l = false;
             return '';
         case 1:
             // List Start
             $more = true;
             $items = array();
             while ($this->state) {
                 $element = $this->read_data($more);
                 if (!$more || !$this->state) {
                     break;
                 }
                 $items[] = $element;
             }
             return $items;
         case 2:
             return $this->read_byte();
         case 3:
             return $this->read_word();
         case 4:
             // GUESS
             return $this->read_dword();
         case 5:
             // DateTime
             $this->read_skip(10);
             return '';
         case 6:
         case 7:
             return $this->read_ace_str();
         case 8:
             return true;
         case 9:
             return false;
         case 11:
             // String List: consume strings until an empty one or a failure
             while ($this->state && $this->read_ace_str()) {
             }
             return '';
         case 10:
         case 12:
             return $this->read_ace_data();
         case 18:
             // UNICODE
             return Utf8ToWin(unicode_to_utf8($this->read_ace_uni_str()));
         case 19:
             // Int64
             $this->read_skip(8);
             return '';
         case 20:
             // UTF-8
             return Utf8ToWin($this->read_ace_data());
         default:
             $this->push_error('ERR_UNK_ACEFTP_ITEM_TYPE: ' . $item_type);
             return '';
     }
 }
Esempio n. 19
0
 /**
  * Encode a single byte as UTF-8, treating its ordinal value as a code point.
  *
  * @param string $c a one-byte string (only the first byte is used)
  * @return string result of unicode_to_utf8() for that single code point
  */
 function makeutf8($c)
 {
     $codepoint = ord($c);
     return unicode_to_utf8(array($codepoint));
 }
Esempio n. 20
0
/**
 * Build a search-pattern fragment from a (possibly incomplete) Hangul string.
 *
 * The input is decoded to code points with utf8_to_unicode(). All code points
 * except the last form the "leading part"; the last one is always expanded so
 * that it also matches syllables it could be the beginning of. Example taken
 * from the original notes: 종각 => 종(각|가[가-깋]) (the exact range syntax
 * is produced by hangul_regex_range()).
 *
 * @param string $str         UTF-8 input text.
 * @param mixed  $lastchar    When truthy and the input has more than one code
 *                            point, the leading part is copied verbatim.
 *                            Otherwise compatibility-jamo consonants in the
 *                            leading part are expanded as well, roughly
 *                            ㄱ => (ㄱ|[가-깋]).
 * @param bool   $use_unicode Passed through unchanged to hangul_regex_range().
 * @return string The assembled pattern fragment; '' when the input decodes
 *                to no code points.
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    // Guard: with nothing decoded, array_pop() below would return null and
    // that null would be encoded into the rule as if it were a character.
    if (!is_array($val) || !$val) {
        return $rule;
    }
    $len = count($val);
    // Both modes treat the final code point separately.
    $last = array_pop($val);
    if ($lastchar && $len > 1) {
        // Leading part is emitted literally; only the last char is expanded.
        $rule = unicode_to_utf8($val);
    } else {
        // Expand consonant-only letters found in the leading part.
        $len = count($val);
        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];
            // U+3130..U+318F: Hangul Compatibility Jamo block.
            if ($ch >= 0x3130 && $ch <= 0x318f) {
                $wch = hangul_to_jamo(array($ch));
                // U+1100..U+1112: modern leading consonants (choseong).
                if ($wch[0] >= 0x1100 && $wch[0] <= 0x1112) {
                    // First syllable of the range: consonant + U+1161 (A).
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    // Last syllable: consonant + U+1175 (I) + U+11C2 (final HIEUH).
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    // Not a leading consonant: emit the jamo as-is.
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }
            $rule .= $crule;
        }
    }

    // ---- Expansion of the last code point ----
    $ch = $last;
    $ustart = array();
    $uend = array();
    // Only precomposed syllables (U+AC00..U+D7A3) and compatibility jamo
    // (U+3130..U+318F) are expanded; anything else is appended literally.
    if (($ch >= 0xac00 && $ch <= 0xd7a3) || ($ch >= 0x3130 && $ch <= 0x318f)) {
        $wch = hangul_to_jamo(array($ch));
    } else {
        return $rule . unicode_to_utf8(array($ch));
    }
    $wlen = count($wch);
    $ket = '';
    if ($wlen >= 3) {
        // Syllable with a final consonant, e.g. 종각 => 종(각|가[가-깋]):
        // match the syllable itself, or the syllable without its final
        // followed by any syllable starting with that consonant.
        $mrule = array();
        $mrule[] = unicode_to_utf8(array($ch));
        $save = $wch[2];
        unset($wch[2]);
        $tmp = jamo_to_syllable($wch);
        $mrule[] = unicode_to_utf8($tmp);
        // Re-interpret the detached final consonant as jamo for the range
        // step below (helper semantics are defined outside this block).
        $save = hangul_jongseong_to_cjamo($save);
        $wch = hangul_to_jamo($save);
        $wlen = count($wch);
        $rule .= '(' . implode('|', $mrule);
        $ket = ')';
        if ($wlen > 1) {
            // The final decomposes into several jamo: no range is added.
            return $rule . ')';
        }
    }
    if ($wlen == 1) {
        if ($wch[0] >= 0x1100 && $wch[0] <= 0x1112) {
            // Lone leading consonant: range from consonant+A to
            // consonant+I+final HIEUH.
            $wch[1] = 0x1161;
            $start = jamo_to_syllable($wch);
            $ustart = unicode_to_utf8($start);
            $wch[1] = 0x1175;
            $wch[2] = 0x11c2;
            $end = jamo_to_syllable($wch);
            $uend = unicode_to_utf8($end);
        } else {
            return $rule . unicode_to_utf8($wch) . $ket;
        }
    } elseif ($wlen == 2) {
        if ($wch[0] >= 0x1100 && $wch[0] <= 0x1112) {
            // Consonant + vowel: range from the open syllable to the same
            // syllable closed with final HIEUH (U+11C2).
            $start = jamo_to_syllable($wch);
            $ustart = unicode_to_utf8($start);
            $wch[2] = 0x11c2;
            $end = jamo_to_syllable($wch);
            $uend = unicode_to_utf8($end);
        } else {
            return $rule . unicode_to_utf8($wch);
        }
    }
    $crule = hangul_regex_range($ustart, $uend, $use_unicode);
    return $rule . $crule . $ket;
}