/**
 * Convert a GBK-encoded byte string to UTF-8.
 *
 * The GBK -> Unicode mapping is read once from CODETABLEDIR . 'gb-unicode.table'
 * and cached in the global $CODETABLE (key: hexdec of columns 0-5 of each line,
 * value: columns 7-12 of the same line).
 *
 * Bytes <= 0x80 are copied through unchanged. A byte > 0x80 is treated as the
 * lead byte of a two-byte sequence, which is looked up in the table and passed
 * to unicode_to_utf8().
 * NOTE(review): the result of unicode_to_utf8() is consumed here as a string of
 * 3-digit decimal byte values (each chunk goes through chr()) - confirm that
 * this matches the unicode_to_utf8() implementation used by this project.
 *
 * @param string $gbstr GBK-encoded input
 * @return string converted string
 */
function gbk_to_utf8($gbstr)
{
    global $CODETABLE;

    if (empty($CODETABLE)) {
        $filename = CODETABLEDIR . 'gb-unicode.table';
        $fp = fopen($filename, 'rb');
        if ($fp === false) {
            // Without the table only single-byte characters can be converted.
            trigger_error('gbk_to_utf8: unable to open code table ' . $filename, E_USER_WARNING);
        } else {
            while ($l = fgets($fp, 15)) {
                $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6);
            }
            fclose($fp);
        }
    }

    $gbstr = (string) $gbstr;
    $length = strlen($gbstr);
    $ret = '';

    // Walk by offset instead of testing the truthiness of the remaining string:
    // a remainder of "0" is falsy in PHP and used to end the loop early.
    $pos = 0;
    while ($pos < $length) {
        if (ord($gbstr[$pos]) > 0x80) {
            $thisW = substr($gbstr, $pos, 2);
            $pos += 2;

            $key = hexdec(bin2hex($thisW)) - 0x8080;
            // Unmapped sequences fall back to '' (hexdec('') === 0), as before.
            $mapped = isset($CODETABLE[$key]) ? $CODETABLE[$key] : '';

            // Suppression kept on purpose: conversion is best-effort and the
            // table values may contain characters hexdec() complains about.
            $utf8 = '';
            @($utf8 = unicode_to_utf8(hexdec($mapped)));

            if ($utf8 != '') {
                $utf8Length = strlen($utf8);
                for ($i = 0; $i < $utf8Length; $i += 3) {
                    $ret .= chr(substr($utf8, $i, 3));
                }
            }
        } else {
            $ret .= $gbstr[$pos];
            $pos++;
        }
    }

    return $ret;
}
/**
 * Convert a list of decimal Unicode values to UTF-8 encoded characters.
 *
 * Spaces are stripped from the input, the remainder is split on commas and
 * every piece is converted to an integer and handed to unicode_to_utf8();
 * the individual results are concatenated in input order.
 *
 * @param string $strDecVal [comma separated list of] decimal character value(s)
 * @return string
 */
function convToUTF8($strDecVal)
{
    $compact = str_replace(' ', '', $strDecVal);

    $pieces = array();
    foreach (explode(',', $compact) as $decimal) {
        $pieces[] = unicode_to_utf8((int) $decimal);
    }

    return implode('', $pieces);
}
/**
 * Decode a prefixed, Punycode-style encoded label.
 *
 * Reads its parameters from the globals $base, $tmin, $tmax, $initial_bias,
 * $initial_n, $prefix and $delim, and relies on the helpers decode_digit(),
 * adapt() and unicode_to_utf8() defined elsewhere.
 *
 * - Input that does not start with $prefix is returned unchanged.
 * - Only the leading $prefix is removed; later occurrences of the same
 *   sequence are part of the label and are kept.
 * - Everything before the last $delim is copied to the output as-is; the rest
 *   is decoded as variable-length integers that give the code point and the
 *   insertion position of each remaining character.
 * - If the input ends in the middle of a variable-length integer it is treated
 *   as malformed and the original input is returned unchanged.
 *
 * @param string $text encoded label
 * @return string decoded string, or $text itself when it is not decodable
 */
function decode($text)
{
    global $base, $tmin, $tmax, $initial_bias, $initial_n, $prefix, $delim;

    $prefixLength = strlen($prefix);
    if (strncmp($text, $prefix, $prefixLength) !== 0) {
        return $text;
    }

    $input = $text;
    // Strip the leading prefix only (str_replace would remove every occurrence).
    $text = (string) substr($text, $prefixLength);

    $n = $initial_n;
    $i = 0;
    $bias = $initial_bias;
    $output = array();

    // Basic code points: everything before the last delimiter is literal.
    $delim_pos = strrpos($text, $delim);
    if ($delim_pos !== false) {
        for ($j = 0; $j < $delim_pos; $j++) {
            $output[] = $text[$j];
        }
        $text = (string) substr($text, $delim_pos + 1);
    }

    while ($text !== '') {
        $oldi = $i;
        $w = 1;

        // Decode one generalized variable-length integer into $i.
        for ($k = $base; ; $k += $base) {
            if ($text === '') {
                // Ran out of digits mid-integer: malformed input.
                return $input;
            }

            $digit = decode_digit($text[0]);
            $text = (string) substr($text, 1);
            $i += $digit * $w;

            if ($k <= $bias + $tmin) {
                $t = $tmin;
            } elseif ($k >= $bias + $tmax) {
                $t = $tmax;
            } else {
                $t = $k - $bias;
            }

            if ($digit < $t) {
                break;
            }
            $w *= $base - $t;
        }

        $slots = count($output) + 1;
        $bias = adapt($i - $oldi, $slots, $oldi == 0);
        // Keep the code point an integer; floor() alone yields a float.
        $n += (int) floor($i / $slots);
        $i %= $slots;

        // Insert the decoded character at position $i.
        array_splice($output, $i, 0, array(unicode_to_utf8($n)));
        $i++;
    }

    return implode('', $output);
}
/**
 * Drop every character whose code point is not listed in the global
 * $UTF8_ALPHA_CHARS table.
 *
 * The string is converted to code points with utf8_to_unicode(), filtered in
 * place (original indexes are kept, removed entries leave gaps) and converted
 * back with unicode_to_utf8().
 *
 * @param string $string UTF-8 input
 * @return string result of unicode_to_utf8() on the remaining code points
 */
function utf8_keepalphanum($string)
{
    global $UTF8_ALPHA_CHARS;

    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);

    $pos = 0;
    while ($pos < $total) {
        $allowed = in_array($codepoints[$pos], $UTF8_ALPHA_CHARS);
        if (!$allowed) {
            unset($codepoints[$pos]);
        }
        ++$pos;
    }

    return unicode_to_utf8($codepoints);
}
/**
 * Drop every character whose code point is not in the built-in whitelist.
 *
 * The whitelist holds a-z, A-Z, 0-9, '.', '-', '_', the space character,
 * extended Latin letters, Greek and Cyrillic. The string is converted to code
 * points with utf8_to_unicode(), filtered in place (original indexes are kept,
 * removed entries leave gaps) and converted back with unicode_to_utf8().
 *
 * @param string $string UTF-8 input
 * @return string result of unicode_to_utf8() on the remaining code points
 */
function utf8_keepalphanum($string)
{
    static $UTF8_ALPHA_CHARS = array(
        // A-Z
        0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d,
        0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
        // a-z
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
        0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
        // 0-9 . - _ space
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2e, 0x2d, 0x5f, 0x20,
        // extended Latin
        0xc1, 0xe1, 0x106, 0x107, 0xc9, 0xe9, 0xcd, 0xed, 0x139, 0x13a, 0x143, 0x144,
        0xd3, 0xf3, 0x154, 0x155, 0x15a, 0x15b, 0xda, 0xfa, 0xdd, 0xfd, 0x179, 0x17a,
        0x10f, 0x13d, 0x13e, 0x165, 0x102, 0x103, 0x11e, 0x11f, 0x16c, 0x16d, 0x10c, 0x10d,
        0x10e, 0x11a, 0x11b, 0x147, 0x148, 0x158, 0x159, 0x160, 0x161, 0x164, 0x17d, 0x17e,
        0xc7, 0xe7, 0x122, 0x123, 0x136, 0x137, 0x13b, 0x13c, 0x145, 0x146, 0x156, 0x157,
        0x15e, 0x15f, 0x162, 0x163,
        0xc2, 0xe2, 0x108, 0x109, 0xca, 0xea, 0x11c, 0x11d, 0x124, 0x125, 0xce, 0xee,
        0x134, 0x135, 0xd4, 0xf4, 0x15c, 0x15d, 0xdb, 0xfb, 0x174, 0x175, 0x176, 0x177,
        0xc4, 0xe4, 0xcb, 0xeb, 0xcf, 0xef, 0xd6, 0xf6, 0xdc, 0xfc, 0x178, 0xff,
        0x10a, 0x10b, 0x116, 0x117, 0x120, 0x121, 0x130, 0x131, 0x17b, 0x17c,
        0x150, 0x151, 0x170, 0x171,
        0xc0, 0xe0, 0xc8, 0xe8, 0xcc, 0xec, 0xd2, 0xf2, 0xd9, 0xf9,
        0x1a0, 0x1a1, 0x1af, 0x1b0,
        0x100, 0x101, 0x112, 0x113, 0x12a, 0x12b, 0x14c, 0x14d, 0x16a, 0x16b,
        0x104, 0x105, 0x118, 0x119, 0x12e, 0x12f, 0x172, 0x173,
        0xc5, 0xe5, 0x16e, 0x16f, 0x110, 0x111, 0x126, 0x127, 0x141, 0x142, 0xd8, 0xf8,
        0xc3, 0xe3, 0xd1, 0xf1, 0xd5, 0xf5, 0xc6, 0xe6, 0x152, 0x153,
        0xd0, 0xf0, 0xde, 0xfe, 0xdf, 0x17f,
        // Greek
        0x391, 0x392, 0x393, 0x394, 0x395, 0x396, 0x397, 0x398, 0x399, 0x39a, 0x39b, 0x39c,
        0x39d, 0x39e, 0x39f, 0x3a0, 0x3a1, 0x3a3, 0x3a4, 0x3a5, 0x3a6, 0x3a7, 0x3a8, 0x3a9,
        0x386, 0x388, 0x389, 0x38a, 0x38c, 0x38e, 0x38f, 0x3aa, 0x3ab,
        0x3b1, 0x3b2, 0x3b3, 0x3b4, 0x3b5, 0x3b6, 0x3b7, 0x3b8, 0x3b9, 0x3ba, 0x3bb, 0x3bc,
        0x3bd, 0x3be, 0x3bf, 0x3c0, 0x3c1, 0x3c3, 0x3c2, 0x3c4, 0x3c5, 0x3c6, 0x3c7, 0x3c8, 0x3c9,
        0x3ac, 0x3ad, 0x3ae, 0x3af, 0x3cc, 0x3cd, 0x3ce, 0x3ca, 0x3cb, 0x390, 0x3b0,
        // Cyrillic
        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x401, 0x416, 0x417, 0x406, 0x419, 0x41a,
        0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x40e, 0x424, 0x425,
        0x426, 0x427, 0x428, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f,
        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x451, 0x436, 0x437, 0x456, 0x439, 0x43a,
        0x43b, 0x43c, 0x43d, 0x43e, 0x43f, 0x440, 0x441, 0x442, 0x443, 0x45e, 0x444, 0x445,
        0x446, 0x447, 0x448, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f,
        0x418, 0x429, 0x42a, 0x438, 0x449, 0x44a,
        0x403, 0x405, 0x408, 0x409, 0x40a, 0x40c, 0x40f,
        0x453, 0x455, 0x458, 0x459, 0x45a, 0x45c, 0x45f,
        0x402, 0x40b, 0x452, 0x45b, 0x490, 0x404, 0x407, 0x491, 0x454, 0x457,
        0x4e8, 0x4ae, 0x4e9, 0x4af
    );

    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);

    $pos = 0;
    while ($pos < $total) {
        $allowed = in_array($codepoints[$pos], $UTF8_ALPHA_CHARS);
        if (!$allowed) {
            unset($codepoints[$pos]);
        }
        ++$pos;
    }

    return unicode_to_utf8($codepoints);
}
/**
 * Upper-case a UTF-8 string using a built-in code point table.
 *
 * The string is split into code points with utf8_to_unicode(); every code
 * point that has an entry in the lower -> upper table is replaced, everything
 * else is left alone, and the list is re-encoded with unicode_to_utf8().
 * The table is built once and kept in a static variable.
 *
 * @param string $string UTF-8 input
 * @return string|false the converted string, or false when utf8_to_unicode()
 *                      yields an empty/falsy result (this includes '')
 */
function utf8_strtoupper($string)
{
    static $lower_to_upper;

    if ($lower_to_upper === null) {
        $lower_to_upper = array(
            0x61 => 0x41, 0x3c6 => 0x3a6, 0x163 => 0x162, 0xe5 => 0xc5, 0x62 => 0x42, 0x13a => 0x139,
            0xe1 => 0xc1, 0x142 => 0x141, 0x3cd => 0x38e, 0x101 => 0x100, 0x491 => 0x490, 0x3b4 => 0x394,
            0x15b => 0x15a, 0x64 => 0x44, 0x3b3 => 0x393, 0xf4 => 0xd4, 0x44a => 0x42a, 0x439 => 0x419,
            0x113 => 0x112, 0x43c => 0x41c, 0x15f => 0x15e, 0x144 => 0x143, 0xee => 0xce, 0x45e => 0x40e,
            0x44f => 0x42f, 0x3ba => 0x39a, 0x155 => 0x154, 0x69 => 0x49, 0x73 => 0x53, 0x1e1f => 0x1e1e,
            0x135 => 0x134, 0x447 => 0x427, 0x3c0 => 0x3a0, 0x438 => 0x418, 0xf3 => 0xd3, 0x440 => 0x420,
            0x454 => 0x404, 0x435 => 0x415, 0x449 => 0x429, 0x14b => 0x14a, 0x431 => 0x411, 0x459 => 0x409,
            0x1e03 => 0x1e02, 0xf6 => 0xd6, 0xf9 => 0xd9, 0x6e => 0x4e, 0x451 => 0x401, 0x3c4 => 0x3a4,
            0x443 => 0x423, 0x15d => 0x15c, 0x453 => 0x403, 0x3c8 => 0x3a8, 0x159 => 0x158, 0x67 => 0x47,
            0xe4 => 0xc4, 0x3ac => 0x386, 0x3ae => 0x389, 0x167 => 0x166, 0x3be => 0x39e, 0x165 => 0x164,
            0x117 => 0x116, 0x109 => 0x108, 0x76 => 0x56, 0xfe => 0xde, 0x157 => 0x156, 0xfa => 0xda,
            0x1e61 => 0x1e60, 0x1e83 => 0x1e82, 0xe2 => 0xc2, 0x119 => 0x118, 0x146 => 0x145, 0x70 => 0x50,
            0x151 => 0x150, 0x44e => 0x42e, 0x129 => 0x128, 0x3c7 => 0x3a7, 0x13e => 0x13d, 0x442 => 0x422,
            0x7a => 0x5a, 0x448 => 0x428, 0x3c1 => 0x3a1, 0x1e81 => 0x1e80, 0x16d => 0x16c, 0xf5 => 0xd5,
            0x75 => 0x55, 0x177 => 0x176, 0xfc => 0xdc, 0x1e57 => 0x1e56, 0x3c3 => 0x3a3, 0x43a => 0x41a,
            0x6d => 0x4d, 0x16b => 0x16a, 0x171 => 0x170, 0x444 => 0x424, 0xec => 0xcc, 0x169 => 0x168,
            0x3bf => 0x39f, 0x6b => 0x4b, 0xf2 => 0xd2, 0xe0 => 0xc0, 0x434 => 0x414, 0x3c9 => 0x3a9,
            0x1e6b => 0x1e6a, 0xe3 => 0xc3, 0x44d => 0x42d, 0x436 => 0x416, 0x1a1 => 0x1a0, 0x10d => 0x10c,
            0x11d => 0x11c, 0xf0 => 0xd0, 0x13c => 0x13b, 0x45f => 0x40f, 0x45a => 0x40a, 0xe8 => 0xc8,
            0x3c5 => 0x3a5, 0x66 => 0x46, 0xfd => 0xdd, 0x63 => 0x43, 0x21b => 0x21a, 0xea => 0xca,
            0x3b9 => 0x399, 0x17a => 0x179, 0xef => 0xcf, 0x1b0 => 0x1af, 0x65 => 0x45, 0x3bb => 0x39b,
            0x3b8 => 0x398, 0x3bc => 0x39c, 0x45c => 0x40c, 0x43f => 0x41f, 0x44c => 0x42c, 0xfe => 0xde,
            0xf0 => 0xd0, 0x1ef3 => 0x1ef2, 0x68 => 0x48, 0xeb => 0xcb, 0x111 => 0x110, 0x433 => 0x413,
            0x12f => 0x12e, 0xe6 => 0xc6, 0x78 => 0x58, 0x161 => 0x160, 0x16f => 0x16e, 0x3b1 => 0x391,
            0x457 => 0x407, 0x173 => 0x172, 0xff => 0x178, 0x6f => 0x4f, 0x43b => 0x41b, 0x3b5 => 0x395,
            0x445 => 0x425, 0x121 => 0x120, 0x17e => 0x17d, 0x17c => 0x17b, 0x3b6 => 0x396, 0x3b2 => 0x392,
            0x3ad => 0x388, 0x1e85 => 0x1e84, 0x175 => 0x174, 0x71 => 0x51, 0x437 => 0x417, 0x1e0b => 0x1e0a,
            0x148 => 0x147, 0x105 => 0x104, 0x458 => 0x408, 0x14d => 0x14c, 0xed => 0xcd, 0x79 => 0x59,
            0x10b => 0x10a, 0x3ce => 0x38f, 0x72 => 0x52, 0x430 => 0x410, 0x455 => 0x405, 0x452 => 0x402,
            0x127 => 0x126, 0x137 => 0x136, 0x12b => 0x12a, 0x3af => 0x38a, 0x44b => 0x42b, 0x6c => 0x4c,
            0x3b7 => 0x397, 0x125 => 0x124, 0x219 => 0x218, 0xfb => 0xdb, 0x11f => 0x11e, 0x43e => 0x41e,
            0x1e41 => 0x1e40, 0x3bd => 0x39d, 0x107 => 0x106, 0x3cb => 0x3ab, 0x446 => 0x426, 0xfe => 0xde,
            0xe7 => 0xc7, 0x3ca => 0x3aa, 0x441 => 0x421, 0x432 => 0x412, 0x10f => 0x10e, 0xf8 => 0xd8,
            0x77 => 0x57, 0x11b => 0x11a, 0x74 => 0x54, 0x6a => 0x4a, 0x45b => 0x40b, 0x456 => 0x406,
            0x103 => 0x102, 0x3bb => 0x39b, 0xf1 => 0xd1, 0x43d => 0x41d, 0x3cc => 0x38c, 0xe9 => 0xc9,
            0xf0 => 0xd0, 0x457 => 0x407, 0x123 => 0x122
        );
    }

    $codepoints = utf8_to_unicode($string);
    if (!$codepoints) {
        return false;
    }

    $total = count($codepoints);
    for ($pos = 0; $pos < $total; ++$pos) {
        $current = $codepoints[$pos];
        if (!isset($lower_to_upper[$current])) {
            continue;
        }
        $codepoints[$pos] = $lower_to_upper[$current];
    }

    return unicode_to_utf8($codepoints);
}
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is unpacked as a sequence of big-endian unsigned 16-bit values
 * (unpack format 'n*') and that list is handed to unicode_to_utf8(). The
 * 16-bit units are passed on one by one, without combining any of them.
 *
 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 */
protected function utf16be_to_utf8(&$str)
{
    return unicode_to_utf8(unpack('n*', $str));
}
/**
 * Convert a two-bytes-per-unit "unicode" byte string to the ANSI encoding
 * produced by Utf8ToWin().
 *
 * An empty input, or one whose byte length is odd, yields ''. Otherwise the
 * string goes through unicode_to_utf8() and then Utf8ToWin().
 *
 * @param string $string raw input bytes
 * @return string converted string, or '' for empty / odd-length input
 */
function unicode_to_ansi($string)
{
    $byteCount = strlen($string);

    // nothing to convert, or not a whole number of 2-byte units
    if (!$byteCount || $byteCount % 2 != 0) {
        return '';
    }

    $utf8 = unicode_to_utf8($string);

    return Utf8ToWin($utf8);
    // alternative
    //return mb_convert_encoding($string, "cp1251", "UTF-16LE");
}
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS list of
 * code points. Anything passed in $keep is taken out of that list first, so
 * every special character used inside $repl must also be listed in $keep or
 * it will be stripped again.
 *
 * In addition the control characters 0x00 to 0x19 are always stripped (they
 * are not part of $UTF8_SPECIAL_CHARS).
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param string $string The UTF8 string to strip of special chars
 * @param string $repl   Replace special with this string
 * @param string $keep   Special chars to keep (in UTF8)
 * @return string|null   result of preg_replace()
 */
function utf8_stripspecials($string, $repl = '', $keep = '')
{
    global $UTF8_SPECIAL_CHARS;

    $codepoints = ($keep != '')
        ? array_diff($UTF8_SPECIAL_CHARS, utf8_to_unicode($keep))
        : $UTF8_SPECIAL_CHARS;

    // escape the characters so they are literal inside the character class
    $charClass = preg_quote(unicode_to_utf8($codepoints), '/');
    $pattern = '/[\\x00-\\x19' . $charClass . ']/u';

    return preg_replace($pattern, $repl, $string);
}
/**
 * Convert a GBK-encoded byte string to UTF-8.
 *
 * The GBK -> Unicode mapping is read from
 * EXTENSION_DIR . 'encoding' . DIRECTORY_SEPARATOR . 'gb-unicode.table'
 * (key: hexdec of columns 0-5 of each line, value: columns 7-12). The parsed
 * table is kept in a static variable so the file is read once per process
 * rather than on every call.
 *
 * Bytes <= 0x80 are copied through unchanged. A byte > 0x80 is treated as the
 * lead byte of a two-byte sequence, which is looked up in the table and passed
 * to unicode_to_utf8().
 * NOTE(review): the result of unicode_to_utf8() is consumed here as a string of
 * 3-digit decimal byte values (each chunk goes through chr()) - confirm that
 * this matches the unicode_to_utf8() implementation used by this project.
 *
 * @param string $gbstr GBK-encoded input
 * @return string converted string
 */
function gbk_to_utf8($gbstr)
{
    static $cachedTable = null;

    if ($cachedTable !== null) {
        $CODETABLE = $cachedTable;
    } else {
        $CODETABLE = array();
        $filename = EXTENSION_DIR . 'encoding' . DIRECTORY_SEPARATOR . 'gb-unicode.table';
        $fp = fopen($filename, 'rb');
        if ($fp === false) {
            // Not cached, so a later call can retry; only single-byte
            // characters can be converted without the table.
            trigger_error('gbk_to_utf8: unable to open code table ' . $filename, E_USER_WARNING);
        } else {
            while ($l = fgets($fp, 15)) {
                $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6);
            }
            fclose($fp);
            $cachedTable = $CODETABLE;
        }
    }

    $gbstr = (string) $gbstr;
    $length = strlen($gbstr);
    $ret = '';

    // Walk by offset instead of testing the truthiness of the remaining string:
    // a remainder of "0" is falsy in PHP and used to end the loop early.
    $pos = 0;
    while ($pos < $length) {
        if (ord($gbstr[$pos]) > 0x80) {
            $thisW = substr($gbstr, $pos, 2);
            $pos += 2;

            $key = hexdec(bin2hex($thisW)) - 0x8080;
            // Unmapped sequences fall back to '' (hexdec('') === 0), as before.
            $mapped = isset($CODETABLE[$key]) ? $CODETABLE[$key] : '';

            // Suppression kept on purpose: conversion is best-effort and the
            // table values may contain characters hexdec() complains about.
            $utf8 = '';
            @($utf8 = unicode_to_utf8(hexdec($mapped)));

            if ($utf8 != '') {
                $utf8Length = strlen($utf8);
                for ($i = 0; $i < $utf8Length; $i += 3) {
                    $ret .= chr(substr($utf8, $i, 3));
                }
            }
        } else {
            $ret .= $gbstr[$pos];
            $pos++;
        }
    }

    return $ret;
}
/**
 * Decode a 'safe' encoded ASCII file name back to UTF-8.
 *
 * The input is lower-cased, turned into code points by self::safe_to_unicode()
 * and re-encoded with unicode_to_utf8().
 *
 * Decoding process:
 * - split the string into substrings at any occurrence of pre or post
 *   indicator characters
 * - check the first character of the substring
 * - if it is not a pre_indicator character
 *   - if the previous character was converted, skip over the post_indicator
 *     character
 *   - copy codepoint values of the remaining characters to the output array
 *   - clear any converted flag
 *   (continue to next substring)
 * - else (it is a pre_indicator character)
 *   - if string length is 1, copy the post_indicator character to the output
 *     array
 *     (continue to next substring)
 *   - else (string length > 1)
 *     - skip the pre-indicator character and convert the remaining string
 *       from base36 to base10
 *     - increase the codepoint value for non-printable ASCII characters
 *       (add 0x20)
 *     - append the codepoint to the output array
 *     (continue to next substring)
 *
 * @param string $filename a 'safe' encoded ASCII string,
 * @return string decoded utf8 representation of $filename
 *
 * @author Christopher Smith <*****@*****.**>
 */
public function decode($filename)
{
    $lowercased = strtolower($filename);
    $codepoints = self::safe_to_unicode($lowercased);

    return unicode_to_utf8($codepoints);
}
//$i = 0; while (strpos($buffer, '$') !== false) { //echo $i++ . ": strpos=". (string)strpos($buffer, '$') ."\nbuffer: $buffer\n"; $replaced = false; foreach ($translation as $symbol => $character) { $sym_pos = strpos($buffer, $symbol); if ($sym_pos !== false) { $sym_length = strlen($symbol); $piece1 = substr($buffer, 0, $sym_pos); if ($character['switch']) { // the character after the special charater needs to come before it $partnerchar = utf8_encode($buffer[$sym_pos + $sym_length]); $piece2 = unicode_to_utf8(array_merge(utf8_to_unicode($partnerchar), $character['unicode'])); $piece3start = $sym_pos + $sym_length + 1; } else { $piece2 = unicode_to_utf8($character['unicode']); $piece3start = $sym_pos + $sym_length; } $piece2 = utf8_decode(UtfNormal::NFKC($piece2)); // strip out any ? characters, which are characters not existing in ISO-8859-1 $piece2 = str_replace('?', '', $piece2); $piece3 = substr($buffer, $piece3start); $buffer = $piece1 . $piece2 . $piece3; $replaced = true; continue; } } if (!$replaced) { // we've encountered some character that we have no translation for echo "unable to find a translation to transform this buffer, the untranslatable code will be stripped out:\n{$buffer}\n"; $pieces = preg_split('/\\$\\d*/', $buffer, 2);
<?php
/**
 * Builds EmojiSources.json from the Unicode EmojiSources.txt data file.
 *
 * Every data line has the form "<code point(s)>;<docomo>;<kddi>;<softbank>".
 * For each line one map is emitted with the keys 'unicode', 'utf8hex',
 * 'sjis_docomo', 'sjis_kddi' and 'sjis_softbank'. 'utf8hex' is the
 * concatenation of unicode_to_utf8() applied to every code point of the entry
 * (unicode_to_utf8() must be provided by the including code).
 *
 * A message is printed for each outcome: download failure, parse failure,
 * write failure, or success.
 */
define('UNICODE_EMOJI_PATH', 'http://unicode.org/Public/UNIDATA/EmojiSources.txt');
define('JSON_WRITE_PATH', 'EmojiSources.json');

$contents = file_get_contents(UNICODE_EMOJI_PATH);
$pattern = '/^([0-9A-F\\s]+);([0-9A-F]+)?;([0-9A-F]+)?;([0-9A-F]+)?$/m';
$emojiList = array();

if ($contents === false) {
    // Report the real cause instead of falling through to the parse error.
    printf("[ERROR] Failed to read file. [%s]\n", UNICODE_EMOJI_PATH);
} elseif (preg_match_all($pattern, $contents, $matches)) {
    $j = count($matches[1]);
    for ($i = 0; $i < $j; $i++) {
        $unicode = trim($matches[1][$i]);

        // An entry may be a sequence of code points; convert each of them,
        // not only the first two.
        $utf8hex = '';
        foreach (preg_split('/\s+/', $unicode) as $codePoint) {
            $utf8hex .= unicode_to_utf8($codePoint);
        }

        $emojiList[] = array(
            'unicode'       => $unicode,
            'utf8hex'       => $utf8hex,
            'sjis_docomo'   => $matches[2][$i],
            'sjis_kddi'     => $matches[3][$i],
            'sjis_softbank' => $matches[4][$i],
        );
    }

    $jsonData = json_encode($emojiList);
    // Only claim success when the file was really written.
    if ($jsonData === false || file_put_contents(JSON_WRITE_PATH, $jsonData) === false) {
        printf("[ERROR] Failed to write file. [%s]\n", JSON_WRITE_PATH);
    } else {
        printf("Created a file. [%s]\n", JSON_WRITE_PATH);
    }
} else {
    printf("[ERROR] Failed parse file. [%s]\n", UNICODE_EMOJI_PATH);
}
/**
 * Build a search rule (regular-expression fragment) for a Hangul string.
 *
 * When $lastchar is truthy and the string has more than one character, all
 * characters but the last are copied literally and only the last one is
 * expanded; otherwise every character is expanded.
 *
 * Expansion applies to code points in 0xAC00-0xD7A3 and 0x3130-0x318F; all
 * other characters are copied literally. The character is decomposed with
 * hangul_to_jamo() and a range between a first and a last syllable (built
 * with jamo_to_syllable()) is emitted, e.g. 종각 => 종(각|가[가-깋]).
 *
 * @param string $str         UTF-8 search string
 * @param mixed  $lastchar    truthy: expand only the last character
 * @param bool   $use_unicode true: emit the range as a UTF-8 character class,
 *                            false: emit it as \xHH byte escapes
 * @return string the search rule
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $pattern = '';
    $codes = utf8_to_unicode($str);
    $total = sizeof($codes);

    if ($lastchar && $total > 1) {
        // make a regex using with the last char
        $tail = array_pop($codes);
        $pattern = unicode_to_utf8($codes);
        $codes = array($tail);
        $total = sizeof($codes);
    }

    for ($idx = 0; $idx < $total; $idx++) {
        $code = $codes[$idx];
        $first = array();
        $final = array();

        $isSyllable = $code >= 0xac00 && $code <= 0xd7a3;
        $isCompatJamo = $code >= 0x3130 && $code <= 0x318f;
        if (!($isSyllable || $isCompatJamo)) {
            $pattern .= unicode_to_utf8(array($code));
            continue;
        }

        $jamo = hangul_to_jamo(array($code));
        $jamoCount = sizeof($jamo);
        $closing = '';

        if ($jamoCount >= 3) {
            // 종각 => 종(각|가[가-깋])
            $alternatives = array();
            $alternatives[] = unicode_to_utf8(array($code));
            $trailing = $jamo[2];
            unset($jamo[2]);
            $alternatives[] = unicode_to_utf8(jamo_to_syllable($jamo));
            $jamo = hangul_to_jamo(hangul_jongseong_to_cjamo($trailing));
            $jamoCount = sizeof($jamo);
            $pattern .= '(' . implode('|', $alternatives);
            $closing = ')';
            if ($jamoCount > 1) {
                $pattern .= ')';
                continue;
            }
        }

        if ($jamoCount == 1) {
            if (!($jamo[0] >= 0x1100 && $jamo[0] <= 0x1112)) {
                $pattern .= unicode_to_utf8($jamo) . $closing;
                continue;
            }
            $jamo[1] = 0x1161;
            $first = unicode_to_utf8(jamo_to_syllable($jamo));
            $jamo[1] = 0x1175;
            $jamo[2] = 0x11c2;
            $final = unicode_to_utf8(jamo_to_syllable($jamo));
        } elseif ($jamoCount == 2) {
            if (!($jamo[0] >= 0x1100 && $jamo[0] <= 0x1112)) {
                $pattern .= unicode_to_utf8($jamo);
                continue;
            }
            $first = unicode_to_utf8(jamo_to_syllable($jamo));
            $jamo[2] = 0x11c2;
            $final = unicode_to_utf8(jamo_to_syllable($jamo));
        }

        if ($use_unicode) {
            $range = '[' . $first . '-' . $final . ']';
        } else {
            // byte-wise form: first byte literal, then ranges for bytes 2 and 3
            $pattern .= sprintf("\\x%02X", ord($first[0]));
            $range = '';
            if ($first[1] == $final[1]) {
                $range .= sprintf("\\x%02X", ord($first[1]));
                $range .= sprintf("[\\x%02X-\\x%02X]", ord($first[2]), ord($final[2]));
            } else {
                $lowByte = ord($first[1]);
                $highByte = ord($final[1]);
                $parts = array();
                $parts[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $lowByte, ord($first[2]));
                if ($lowByte + 1 == $highByte - 1) {
                    $parts[] = sprintf("\\x%02X[\\x80-\\xBF]", $lowByte + 1);
                } elseif ($lowByte + 1 != $highByte) {
                    $parts[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $lowByte + 1, $highByte - 1);
                }
                $parts[] = sprintf("\\x%02X[\\x80-\\x%02X]", ord($final[1]), ord($final[2]));
                $range .= '(' . implode('|', $parts) . ')';
            }
        }

        $pattern .= $range . $closing;
    }

    return $pattern;
}
/**
 * Thin wrapper: passes $str to unicode_to_utf8() and returns the result
 * unchanged.
 *
 * NOTE(review): read as a free function this calls itself without a base
 * case. Presumably it is a class method delegating to a global function of
 * the same name - confirm against the enclosing file.
 *
 * @param mixed $str value forwarded as-is
 * @return mixed whatever unicode_to_utf8() returns
 */
function unicode_to_utf8($str) { return unicode_to_utf8($str); }
/**
 * unicode_to_utf8() must encode a code point above U+FFFF as four bytes.
 *
 * 1048577 is U+100001, whose UTF-8 encoding is F4 80 80 81. The expected
 * value is written with byte escapes so it cannot be lost when the file is
 * re-encoded or stripped of non-BMP characters.
 */
function test_to_4byte()
{
    $out = "\xF4\x80\x80\x81";
    $in = array(1048577);
    $this->assertEqual(unicode_to_utf8($in), $out);
}
/**
 * Undo regular/irregular Korean verb conjugation at the stem/ending boundary.
 *
 * Takes the last syllable of $stem and the first syllable of the ending in
 * $match[1], decomposes both with hangul_to_jamo() and, depending on the jamo
 * found, moves jamo or syllables between stem and ending or restores the
 * dictionary form of the stem syllable.
 *
 * When the last stem character is Hangul (per $this->isHangul()), $match[1]
 * is overwritten with the adjusted ending and the adjusted stem is returned.
 * Otherwise $match[1] is left as it was (cast to string) and $stem is
 * returned unchanged.
 *
 * @param string $stem  UTF-8 stem
 * @param array  $match match array; element 1 holds the ending and is updated
 * @return string adjusted stem (UTF-8)
 */
function verbIrr($stem, &$match)
{
    // handle the various regular and irregular conjugations
    $ustem = utf8_to_unicode($stem);
    $uend = utf8_to_unicode($match[1]);
    $ch = array_pop($ustem);
    $ed = $uend[0];
    $save = '';

    if ($this->isHangul($ch)) {
        $j = hangul_to_jamo($ch);
        $ej = hangul_to_jamo($ed);
        $sj = sizeof($j);

        if ($sj == 3 && $j[2] == 0x11bb) {
            // past/future marker with final ㅆ: 랐-다, 었-다, 겠-다, 였-다
            if (in_array($j[1], array(0x1161, 0x1165, 0x1166, 0x1167))) {
                if ($j[0] == 0x1105 && in_array($j[1], array(0x1161, 0x1165, 0x1167))) {
                    // 랐, 렀, 렸 (e.g. 갈렸-다): left untouched
                } elseif (in_array($j[0], array(0x1100, 0x110b, 0x110c))) {
                    // 겠, 았: the whole syllable belongs to the ending
                    array_unshift($uend, $ch);
                    $ch = null;
                } elseif ($j[1] == 0x1167 && in_array($j[0], array(0x1101, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112))) {
                    // ㅕ contraction: 혔 -> ㅎ+었 -> 히+었
                    $j[1] = 0x1165;
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    /* 혔 -> 히+었, 폈 -> 피+었 */
                    $j[1] = 0x1175;
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                } elseif (in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111, 0x1112))) {
                    // 우/으 irregular: 떴 -> ㄸ + 었
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    /* ㄸ -> 뜨 */
                    $j[1] = 0x1173; /* ㅡ */
                    if ($j[0] == 0x1111) {
                        $j[1] = 0x116e; /* 펐 -> 푸+었 */
                    }
                    // The rebuilt stem syllable (e.g. 쓰) must be captured;
                    // it used to be discarded, leaving $ch set to 었.
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                }
            } elseif ($j[0] == 0x1112 && in_array($j[1], array(0x1162))) {
                /* 해 -> 하 + 여 */
                array_push($ustem, 0xd558); /* 하 */
                $syll = jamo_to_syllable(array(0x110b, 0x1167, 0x11bb));
                array_unshift($uend, $syll[0]);
                $ch = null;
            } else {
                /* strip the final ㅆ and move it to the ending */
                $syll = jamo_to_syllable(array($j[0], $j[1]));
                array_unshift($uend, $j[2]);
                $ch = $syll[0];
                unset($j[2]);
            }

            if (!$ch) {
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }
            $ed = $uend[0];
            $ej = hangul_to_jamo($ed);
        } elseif (!empty($j[2]) && in_array($j[2], array(0x11ab, 0x11af, 0x11b8))) {
            // 합-시다 갑-시다 갈-래
            // 하-ㅂ시다 가-ㅂ시다 가-ㄹ래
            if ($j[2] == 0x11af && $ej[0] == 0x1105) {
                // 르 irregular: 흘-러 : 흐르+러
                unset($j[2]);
                $syll = jamo_to_syllable($j);
                array_push($ustem, $syll[0]); /* 흐 */
                $j[0] = $ej[0];
                $j[1] = 0x1173;
                $syll = jamo_to_syllable($j); /* 르 */
                $ch = $syll[0];
            } else {
                array_unshift($uend, $j[2]);
                $syll = jamo_to_syllable(array($j[0], $j[1]));
                $ch = $syll[0];
                $ed = $j[2];
                unset($j[2]);
            }
        }

        // ㄷ irregular: 들-어 -> 듣-다
        $sj = sizeof($j);
        if ($sj == 3 && $j[2] == 0x11af && in_array($ej[0], array(0x110b, 0x1105))) {
            // single-pass loop used as an "if" with early exits
            while (in_array($ej[1], array(0x1161, 0x1165, 0x1173))) {
                // vowels 아 어 으 / 라 러 르
                $se = sizeof($ej);
                if ($se == 3) {
                    // 은, 을: with ㅡ only the finals ㄴ / ㄹ qualify.
                    // in_array() needs the candidates as an array.
                    if ($ej[1] == 0x1173 && !in_array($ej[2], array(0x11ab, 0x11af))) {
                        break;
                    }
                } elseif ($j[2] == 0x11af && sizeof($ej) == 2 && $ej[0] == 0x1105) {
                    break;
                }
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11ae));
                $ch = $syll[0];
                break;
            }
        }

        // ㅅ irregular
        // * 지-어 : 짓-어
        // * 이-어 : 잇-어
        if (sizeof($ej) == 2 && $ej[0] == 0x110b) {
            $j[2] = 0x11ba;
            $syll = jamo_to_syllable($j); /* +ㅅ */
            $ch = $syll[0];
            $sj = 3;
        }

        if ($sj == 2) {
            if (in_array($j[0], array(0x110c)) && in_array($j[1], array(0x116e, 0x1175))) {
                /* 주, 지 */
                array_unshift($uend, $ch);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            if ($j[1] == 0x1165 && in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111))) {
                /* 꺼, 떠, 써, 퍼 */
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); /* 어 */
                array_unshift($uend, $syll[0]);
                if ($j[0] == 0x1111) {
                    $syll = jamo_to_syllable(array($j[0], 0x116e));
                } else {
                    $syll = jamo_to_syllable(array($j[0], 0x1173)); /* 쓰 */
                }
                array_push($ustem, $syll[0]);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // phonological contraction
            if (in_array($j[0], array(0x1105, 0x1112)) && $j[1] == 0x1162) {
                // ㅎ irregular (ending): 파랗+아서 -> 파라+아서 -> 파래서
                /* 파래-서 -> 파라-아서 */
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); /* 래 -> 라 + 아 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1161)); /* 아 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                // keep the ending jamo in step: initial and vowel, not the
                // initial twice
                $ej[0] = 0x110b;
                $ej[1] = 0x1161;
            } elseif ($j[0] == 0x1112 && in_array($j[1], array(0x1162))) {
                // NOTE(review): unreachable - the branch above already matches
                // 0x1112 + 0x1162. If 해 -> 하 + 여 is wanted this test has to
                // come first; order left unchanged to preserve behaviour.
                // 해-서 = 하-여서
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); /* 해 -> 하 + 여 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1167)); /* 여 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1167;
            } elseif (in_array($j[0], array(0x1105, 0x1109)) && in_array($j[1], array(0x1167))) {
                // 하셔-서 = 하시-어서
                // 가려-서 = 가리-어서
                $j[1] = 0x1175; /* ㅣ */
                $syll = jamo_to_syllable($j); /* ㅕ -> 이-어 */
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); /* 어 */
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1165;
            }

            if ($j[0] == 0x1109 && $j[1] == 0x1175) {
                /* 시: honorific marker, belongs to the ending */
                array_unshift($uend, $ch);
                $ej = $j;
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // ㅎ irregular
            if (in_array($j[0], array(0x1105, 0x1106)) && in_array($j[1], array(0x1161, 0x1165))) {
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11c2)); /* 랗, 렇 */
                array_push($ustem, $syll[0]);
                // cleared (not unset) so the checks below read defined values
                $ch = null;
                $j = array();
            }
        }

        // single-pass loop used as an "if" with early exit
        while ($sj == 2 && isset($j[0], $j[1]) && $j[0] == 0x110b && in_array($j[1], array(0x116a, 0x116e, 0x116f)) && sizeof($ustem) >= 1) {
            // XXX
            // 그리워: 그리우+어 -> 그립+워
            /* 와 우 워 */
            $ch1 = array_pop($ustem);
            $jamo = hangul_to_jamo($ch1);
            if (sizeof($jamo) == 2) {
                if ($jamo[1] != 0x1175) {
                    $syll = jamo_to_syllable(array($jamo[0], $jamo[1], 0x11b8));
                    array_push($ustem, $syll[0]); /* add ㅂ */
                } else {
                    array_push($ustem, $ch1);
                }
                array_unshift($uend, $ch);
                $ch = null;
            } else {
                array_push($ustem, $ch1);
            }
            break;
        }

        if ($ch) {
            array_push($ustem, $ch);
        }
        $match[1] = unicode_to_utf8($uend);

        return unicode_to_utf8($ustem);
    }

    $match[1] = $save . $match[1];

    return $stem;
}
/**
 * Read one typed item from the stream and return its value.
 *
 * The first byte read selects the item type. $l is set to true for every item
 * except the list-end marker (type 0), for which it is set to false; if the
 * reader state is already bad after reading the type byte, $l is not touched.
 *
 * Types that are only skipped (5, 11, 19), the list-end marker and unknown
 * types yield ''. Unknown types also record an error via push_error().
 *
 * @param bool $l set by reference: false when a list-end marker was read
 * @return mixed the decoded value, an array for lists, or ''
 */
function read_data(&$l)
{
    $item_type = $this->read_byte();
    if (!$this->state) {
        return '';
    }

    $l = true;

    switch ($item_type) {
        case 0:
            // List End
            $l = false;
            return '';

        case 1:
            // List Start: collect items until the end marker or a read failure
            $more = true;
            $items = array();
            while ($this->state) {
                $value = $this->read_data($more);
                if (!$more || !$this->state) {
                    break;
                }
                array_push($items, $value);
            }
            return $items;

        case 2:
            return $this->read_byte();

        case 3:
            return $this->read_word();

        case 4:
            // GUESS
            return $this->read_dword();

        case 5:
            // DateTime
            $this->read_skip(10);
            return '';

        case 6:
        case 7:
            return $this->read_ace_str();

        case 8:
            return true;

        case 9:
            return false;

        case 11:
            // String List: consume strings until an empty one or a read failure
            while ($this->state && $this->read_ace_str()) {
            }
            return '';

        case 10:
        case 12:
            return $this->read_ace_data();

        case 18:
            // UNICODE
            return Utf8ToWin(unicode_to_utf8($this->read_ace_uni_str()));

        case 19:
            // Int64
            $this->read_skip(8);
            return '';

        case 20:
            // UTF-8
            return Utf8ToWin($this->read_ace_data());
    }

    $this->push_error('ERR_UNK_ACEFTP_ITEM_TYPE: ' . $item_type);

    return '';
}
/**
 * Encode the first byte of $c as UTF-8.
 *
 * The byte value obtained with ord() is wrapped in a one-element array and
 * handed to unicode_to_utf8().
 *
 * @param string $c input character (only its first byte is used)
 * @return mixed whatever unicode_to_utf8() returns for that value
 */
function makeutf8($c)
{
    $byteValue = ord($c);
    $codepoints = array($byteValue);

    return unicode_to_utf8($codepoints);
}
/**
 * Build a search rule (regular-expression fragment) for a Hangul string.
 *
 * - $lastchar truthy and more than one character: all characters but the last
 *   are copied literally; only the last one is expanded.
 * - otherwise: every character before the last that is a compatibility jamo
 *   (0x3130-0x318F) with an initial consonant is expanded to
 *   "(jamo|range)", e.g. ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣]); other characters
 *   are copied literally. The last character is then expanded as below.
 *
 * Expanding the last character applies to code points in 0xAC00-0xD7A3 and
 * 0x3130-0x318F: it is decomposed with hangul_to_jamo() and a range between a
 * first and a last syllable is produced by hangul_regex_range(), e.g.
 * 종각 => 종(각|가[가-깋]).
 *
 * An empty input yields an empty rule.
 *
 * @param string $str         UTF-8 search string
 * @param mixed  $lastchar    truthy: expand only the last character
 * @param bool   $use_unicode forwarded to hangul_regex_range()
 * @return string the search rule
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true)
{
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);

    // Nothing to match. Without this guard the code below pops null from the
    // empty list and emits unicode_to_utf8(array(null)).
    if ($len == 0) {
        return $rule;
    }

    if ($lastchar && $len > 1) {
        // make a regex using with the last char
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
    } else {
        // make regex for consonant only letters
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])
        // save the last char
        $last = array_pop($val);
        $len = sizeof($val);
        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];
            if (!($ch >= 0x3130 && $ch <= 0x318f)) {
                $rule .= unicode_to_utf8(array($ch));
                continue;
            }

            $wch = hangul_to_jamo(array($ch));
            if (!($wch[0] >= 0x1100 && $wch[0] <= 0x1112)) {
                $rule .= unicode_to_utf8($wch);
                continue;
            }

            $wch[1] = 0x1161;
            $ustart = unicode_to_utf8(jamo_to_syllable($wch));
            $wch[1] = 0x1175;
            $wch[2] = 0x11c2;
            $uend = unicode_to_utf8(jamo_to_syllable($wch));

            $crule = '(' . unicode_to_utf8(array($ch)) . '|';
            $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
            $crule .= ')';
            $rule .= $crule;
        }
    }

    // from here on only the last character is left to expand
    $val = array($last);
    $len = sizeof($val);

    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $ustart = array();
        $uend = array();

        $isSyllable = $ch >= 0xac00 && $ch <= 0xd7a3;
        $isCompatJamo = $ch >= 0x3130 && $ch <= 0x318f;
        if (!($isSyllable || $isCompatJamo)) {
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }

        $wch = hangul_to_jamo(array($ch));
        $wlen = sizeof($wch);
        $ket = '';

        if ($wlen >= 3) {
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $mrule[] = unicode_to_utf8(jamo_to_syllable($wch));
            $wch = hangul_to_jamo(hangul_jongseong_to_cjamo($save));
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                $rule .= ')';
                continue;
            }
        }

        if ($wlen == 1) {
            if (!($wch[0] >= 0x1100 && $wch[0] <= 0x1112)) {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
            $wch[1] = 0x1161;
            $ustart = unicode_to_utf8(jamo_to_syllable($wch));
            $wch[1] = 0x1175;
            $wch[2] = 0x11c2;
            $uend = unicode_to_utf8(jamo_to_syllable($wch));
        } elseif ($wlen == 2) {
            if (!($wch[0] >= 0x1100 && $wch[0] <= 0x1112)) {
                $rule .= unicode_to_utf8($wch);
                continue;
            }
            $ustart = unicode_to_utf8(jamo_to_syllable($wch));
            $wch[2] = 0x11c2;
            $uend = unicode_to_utf8(jamo_to_syllable($wch));
        }

        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }

    return $rule;
}