/**
 * UTF-8 aware strpos(): character position of the first occurrence of a substring.
 *
 * Delegates to mb_strpos() when the mbstring extension is available and the
 * UTF8_NOMBSTRING constant is not defined. Otherwise both strings are decoded
 * with utf8_to_unicode() and compared code point by code point.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle   UTF-8 string to search for
 * @param int    $offset   character (not byte) position to start searching at; in the
 *                         fallback a negative value counts back from the end of the
 *                         haystack and is clamped to 0
 * @return int|false position of the first match in characters, or false when there is
 *                   none (the fallback also returns false for an empty needle)
 */
function utf8_strpos($haystack, $needle, $offset = 0)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos')) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }

    $haystack = utf8_to_unicode($haystack);
    $needle = utf8_to_unicode($needle);

    // NOTE(review): some utf8_to_unicode() implementations return false on
    // invalid input; treat anything that is not an array as "not found".
    if (!is_array($haystack) || !is_array($needle)) {
        return false;
    }

    $haystackLength = count($haystack);
    $needleLength = count($needle);
    if ($needleLength === 0) {
        return false;
    }

    $offset = (int) $offset;
    if ($offset < 0) {
        $offset = max(0, $haystackLength + $offset);
    }

    // Never start a comparison that would run past the end of the haystack.
    $lastStart = $haystackLength - $needleLength;
    for ($position = $offset; $position <= $lastStart; $position++) {
        $i = 0;
        while ($i < $needleLength && $needle[$i] == $haystack[$position + $i]) {
            $i++;
        }
        if ($i === $needleLength) {
            return $position;
        }
    }

    return false;
}
/**
 * Checks that utf8_to_unicode() decodes the mixed ASCII / Latin-1 sample
 * word into the expected list of code points.
 */
function testString()
{
    $expected = array(
        73, 241, 116, 235, 114, 110, 226, 116, 105, 244,
        110, 224, 108, 105, 122, 230, 116, 105, 248, 110,
    );

    $decoded = utf8_to_unicode('Iñtërnâtiônàlizætiøn');

    $this->assertEqual($decoded, $expected);
}
/**
 * Tell whether every character of a string is accepted by detect_CJK().
 *
 * The string is decoded with utf8_to_unicode() and each resulting code point
 * is passed to detect_CJK(); the first rejected code point ends the scan.
 * A string that decodes to no code points yields TRUE.
 *
 * @param string $str UTF-8 encoded string
 * @return bool TRUE when no code point was rejected, FALSE otherwise
 */
function is_chinese($str)
{
    $allAccepted = TRUE;

    foreach (utf8_to_unicode($str) as $codepoint) {
        if (detect_CJK($codepoint)) {
            continue;
        }
        $allAccepted = FALSE;
        break;
    }

    return $allAccepted;
}
/**
 * Strip every character that is not listed in the global $UTF8_ALPHA_CHARS table.
 *
 * @param string $string UTF-8 encoded input
 * @return string result of unicode_to_utf8() on the code points that were kept
 */
function utf8_keepalphanum($string)
{
    global $UTF8_ALPHA_CHARS;

    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);

    $index = 0;
    while ($index < $total) {
        $isAllowed = in_array($codepoints[$index], $UTF8_ALPHA_CHARS);
        if (!$isAllowed) {
            // Dropped entries leave a gap in the keys; the array is not re-indexed.
            unset($codepoints[$index]);
        }
        ++$index;
    }

    return unicode_to_utf8($codepoints);
}
/**
 * Strip every character that is not in the built-in whitelist.
 *
 * The whitelist holds A-Z, a-z, 0-9, ".", "-", "_", the space character and a
 * selection of extended Latin, Greek and Cyrillic letters.
 *
 * @param string $string UTF-8 encoded input
 * @return string result of unicode_to_utf8() on the code points that were kept
 */
function utf8_keepalphanum($string)
{
    static $UTF8_ALPHA_CHARS = array(
        // A-Z
        0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d,
        0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
        // a-z
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
        0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
        // 0-9
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
        // . - _ space
        0x2e, 0x2d, 0x5f, 0x20,
        // extended Latin
        0xc1, 0xe1, 0x106, 0x107, 0xc9, 0xe9, 0xcd, 0xed, 0x139, 0x13a, 0x143, 0x144,
        0xd3, 0xf3, 0x154, 0x155, 0x15a, 0x15b, 0xda, 0xfa, 0xdd, 0xfd, 0x179, 0x17a,
        0x10f, 0x13d, 0x13e, 0x165, 0x102, 0x103, 0x11e, 0x11f, 0x16c, 0x16d, 0x10c, 0x10d,
        0x10e, 0x11a, 0x11b, 0x147, 0x148, 0x158, 0x159, 0x160, 0x161, 0x164, 0x17d, 0x17e,
        0xc7, 0xe7, 0x122, 0x123, 0x136, 0x137, 0x13b, 0x13c, 0x145, 0x146, 0x156, 0x157,
        0x15e, 0x15f, 0x162, 0x163, 0xc2, 0xe2, 0x108, 0x109, 0xca, 0xea, 0x11c, 0x11d,
        0x124, 0x125, 0xce, 0xee, 0x134, 0x135, 0xd4, 0xf4, 0x15c, 0x15d, 0xdb, 0xfb,
        0x174, 0x175, 0x176, 0x177, 0xc4, 0xe4, 0xcb, 0xeb, 0xcf, 0xef, 0xd6, 0xf6,
        0xdc, 0xfc, 0x178, 0xff, 0x10a, 0x10b, 0x116, 0x117, 0x120, 0x121, 0x130, 0x131,
        0x17b, 0x17c, 0x150, 0x151, 0x170, 0x171, 0xc0, 0xe0, 0xc8, 0xe8, 0xcc, 0xec,
        0xd2, 0xf2, 0xd9, 0xf9, 0x1a0, 0x1a1, 0x1af, 0x1b0, 0x100, 0x101, 0x112, 0x113,
        0x12a, 0x12b, 0x14c, 0x14d, 0x16a, 0x16b, 0x104, 0x105, 0x118, 0x119, 0x12e, 0x12f,
        0x172, 0x173, 0xc5, 0xe5, 0x16e, 0x16f, 0x110, 0x111, 0x126, 0x127, 0x141, 0x142,
        0xd8, 0xf8, 0xc3, 0xe3, 0xd1, 0xf1, 0xd5, 0xf5, 0xc6, 0xe6, 0x152, 0x153,
        0xd0, 0xf0, 0xde, 0xfe, 0xdf, 0x17f,
        // Greek
        0x391, 0x392, 0x393, 0x394, 0x395, 0x396, 0x397, 0x398, 0x399, 0x39a, 0x39b, 0x39c,
        0x39d, 0x39e, 0x39f, 0x3a0, 0x3a1, 0x3a3, 0x3a4, 0x3a5, 0x3a6, 0x3a7, 0x3a8, 0x3a9,
        0x386, 0x388, 0x389, 0x38a, 0x38c, 0x38e, 0x38f, 0x3aa, 0x3ab, 0x3b1, 0x3b2, 0x3b3,
        0x3b4, 0x3b5, 0x3b6, 0x3b7, 0x3b8, 0x3b9, 0x3ba, 0x3bb, 0x3bc, 0x3bd, 0x3be, 0x3bf,
        0x3c0, 0x3c1, 0x3c3, 0x3c2, 0x3c4, 0x3c5, 0x3c6, 0x3c7, 0x3c8, 0x3c9, 0x3ac, 0x3ad,
        0x3ae, 0x3af, 0x3cc, 0x3cd, 0x3ce, 0x3ca, 0x3cb, 0x390, 0x3b0,
        // Cyrillic
        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x401, 0x416, 0x417, 0x406, 0x419, 0x41a,
        0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x40e, 0x424, 0x425,
        0x426, 0x427, 0x428, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431, 0x432, 0x433,
        0x434, 0x435, 0x451, 0x436, 0x437, 0x456, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e,
        0x43f, 0x440, 0x441, 0x442, 0x443, 0x45e, 0x444, 0x445, 0x446, 0x447, 0x448, 0x44b,
        0x44c, 0x44d, 0x44e, 0x44f, 0x418, 0x429, 0x42a, 0x438, 0x449, 0x44a, 0x403, 0x405,
        0x408, 0x409, 0x40a, 0x40c, 0x40f, 0x453, 0x455, 0x458, 0x459, 0x45a, 0x45c, 0x45f,
        0x402, 0x40b, 0x452, 0x45b, 0x490, 0x404, 0x407, 0x491, 0x454, 0x457, 0x4e8, 0x4ae,
        0x4e9, 0x4af
    );

    $codepoints = utf8_to_unicode($string);
    $total = count($codepoints);

    $index = 0;
    while ($index < $total) {
        $isAllowed = in_array($codepoints[$index], $UTF8_ALPHA_CHARS);
        if (!$isAllowed) {
            // Dropped entries leave a gap in the keys; the array is not re-indexed.
            unset($codepoints[$index]);
        }
        ++$index;
    }

    return unicode_to_utf8($codepoints);
}
/**
 * UTF-8 aware alternative to strtoupper
 * Make a string uppercase
 * Note: The concept of a character's "case" only exists in some alphabets
 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
 * not exist in the Chinese alphabet, for example. See Unicode Standard
 * Annex #21: Case Mappings
 * Note: requires utf8_to_unicode and utf8_from_unicode, and the global
 * $UTF8_LOWER_TO_UPPER map (lowercase code point => uppercase code point)
 * @author Andreas Gohr <*****@*****.**>
 * @param string $string UTF-8 encoded input
 * @return mixed either the string in uppercase or FALSE if utf8_to_unicode()
 *               yields nothing for a non-empty input (invalid UTF-8)
 * @see http://www.php.net/strtoupper
 * @see utf8_to_unicode
 * @see utf8_from_unicode
 * @see http://www.unicode.org/reports/tr21/tr21-5.html
 * @see http://dev.splitbrain.org/view/darcs/dokuwiki/inc/utf8.php
 * @package utf8
 * @subpackage strings
 */
function utf8_strtoupper($string)
{
    global $UTF8_LOWER_TO_UPPER;

    // An empty string is valid UTF-8. Without this guard it decodes to an
    // empty array and would be reported as invalid by the check below.
    if ($string === '') {
        return '';
    }

    $uni = utf8_to_unicode($string);
    if (!$uni) {
        return FALSE;
    }

    $cnt = count($uni);
    for ($i = 0; $i < $cnt; $i++) {
        if (isset($UTF8_LOWER_TO_UPPER[$uni[$i]])) {
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }

    return utf8_from_unicode($uni);
}
/**
 * UTF-8 aware alternative to strtoupper().
 *
 * Decodes the string with utf8_to_unicode(), replaces every code point that
 * has an entry in the built-in lowercase => uppercase table, and re-encodes
 * the result with unicode_to_utf8().
 *
 * @param string $string UTF-8 encoded input
 * @return string|false the uppercased string, or false when utf8_to_unicode()
 *                      yields nothing for a non-empty input
 */
function utf8_strtoupper($string)
{
    static $lower_to_upper;
    if ($lower_to_upper === null) {
        $lower_to_upper = array(
            0x61 => 0x41, 0x3c6 => 0x3a6, 0x163 => 0x162, 0xe5 => 0xc5, 0x62 => 0x42, 0x13a => 0x139,
            0xe1 => 0xc1, 0x142 => 0x141, 0x3cd => 0x38e, 0x101 => 0x100, 0x491 => 0x490, 0x3b4 => 0x394,
            0x15b => 0x15a, 0x64 => 0x44, 0x3b3 => 0x393, 0xf4 => 0xd4, 0x44a => 0x42a, 0x439 => 0x419,
            0x113 => 0x112, 0x43c => 0x41c, 0x15f => 0x15e, 0x144 => 0x143, 0xee => 0xce, 0x45e => 0x40e,
            0x44f => 0x42f, 0x3ba => 0x39a, 0x155 => 0x154, 0x69 => 0x49, 0x73 => 0x53, 0x1e1f => 0x1e1e,
            0x135 => 0x134, 0x447 => 0x427, 0x3c0 => 0x3a0, 0x438 => 0x418, 0xf3 => 0xd3, 0x440 => 0x420,
            0x454 => 0x404, 0x435 => 0x415, 0x449 => 0x429, 0x14b => 0x14a, 0x431 => 0x411, 0x459 => 0x409,
            0x1e03 => 0x1e02, 0xf6 => 0xd6, 0xf9 => 0xd9, 0x6e => 0x4e, 0x451 => 0x401, 0x3c4 => 0x3a4,
            0x443 => 0x423, 0x15d => 0x15c, 0x453 => 0x403, 0x3c8 => 0x3a8, 0x159 => 0x158, 0x67 => 0x47,
            0xe4 => 0xc4, 0x3ac => 0x386, 0x3ae => 0x389, 0x167 => 0x166, 0x3be => 0x39e, 0x165 => 0x164,
            0x117 => 0x116, 0x109 => 0x108, 0x76 => 0x56, 0xfe => 0xde, 0x157 => 0x156, 0xfa => 0xda,
            0x1e61 => 0x1e60, 0x1e83 => 0x1e82, 0xe2 => 0xc2, 0x119 => 0x118, 0x146 => 0x145, 0x70 => 0x50,
            0x151 => 0x150, 0x44e => 0x42e, 0x129 => 0x128, 0x3c7 => 0x3a7, 0x13e => 0x13d, 0x442 => 0x422,
            0x7a => 0x5a, 0x448 => 0x428, 0x3c1 => 0x3a1, 0x1e81 => 0x1e80, 0x16d => 0x16c, 0xf5 => 0xd5,
            0x75 => 0x55, 0x177 => 0x176, 0xfc => 0xdc, 0x1e57 => 0x1e56, 0x3c3 => 0x3a3, 0x43a => 0x41a,
            0x6d => 0x4d, 0x16b => 0x16a, 0x171 => 0x170, 0x444 => 0x424, 0xec => 0xcc, 0x169 => 0x168,
            0x3bf => 0x39f, 0x6b => 0x4b, 0xf2 => 0xd2, 0xe0 => 0xc0, 0x434 => 0x414, 0x3c9 => 0x3a9,
            0x1e6b => 0x1e6a, 0xe3 => 0xc3, 0x44d => 0x42d, 0x436 => 0x416, 0x1a1 => 0x1a0, 0x10d => 0x10c,
            0x11d => 0x11c, 0xf0 => 0xd0, 0x13c => 0x13b, 0x45f => 0x40f, 0x45a => 0x40a, 0xe8 => 0xc8,
            0x3c5 => 0x3a5, 0x66 => 0x46, 0xfd => 0xdd, 0x63 => 0x43, 0x21b => 0x21a, 0xea => 0xca,
            0x3b9 => 0x399, 0x17a => 0x179, 0xef => 0xcf, 0x1b0 => 0x1af, 0x65 => 0x45, 0x3bb => 0x39b,
            0x3b8 => 0x398, 0x3bc => 0x39c, 0x45c => 0x40c, 0x43f => 0x41f, 0x44c => 0x42c, 0xfe => 0xde,
            0xf0 => 0xd0, 0x1ef3 => 0x1ef2, 0x68 => 0x48, 0xeb => 0xcb, 0x111 => 0x110, 0x433 => 0x413,
            0x12f => 0x12e, 0xe6 => 0xc6, 0x78 => 0x58, 0x161 => 0x160, 0x16f => 0x16e, 0x3b1 => 0x391,
            0x457 => 0x407, 0x173 => 0x172, 0xff => 0x178, 0x6f => 0x4f, 0x43b => 0x41b, 0x3b5 => 0x395,
            0x445 => 0x425, 0x121 => 0x120, 0x17e => 0x17d, 0x17c => 0x17b, 0x3b6 => 0x396, 0x3b2 => 0x392,
            0x3ad => 0x388, 0x1e85 => 0x1e84, 0x175 => 0x174, 0x71 => 0x51, 0x437 => 0x417, 0x1e0b => 0x1e0a,
            0x148 => 0x147, 0x105 => 0x104, 0x458 => 0x408, 0x14d => 0x14c, 0xed => 0xcd, 0x79 => 0x59,
            0x10b => 0x10a, 0x3ce => 0x38f, 0x72 => 0x52, 0x430 => 0x410, 0x455 => 0x405, 0x452 => 0x402,
            0x127 => 0x126, 0x137 => 0x136, 0x12b => 0x12a, 0x3af => 0x38a, 0x44b => 0x42b, 0x6c => 0x4c,
            0x3b7 => 0x397, 0x125 => 0x124, 0x219 => 0x218, 0xfb => 0xdb, 0x11f => 0x11e, 0x43e => 0x41e,
            0x1e41 => 0x1e40, 0x3bd => 0x39d, 0x107 => 0x106, 0x3cb => 0x3ab, 0x446 => 0x426, 0xfe => 0xde,
            0xe7 => 0xc7, 0x3ca => 0x3aa, 0x441 => 0x421, 0x432 => 0x412, 0x10f => 0x10e, 0xf8 => 0xd8,
            0x77 => 0x57, 0x11b => 0x11a, 0x74 => 0x54, 0x6a => 0x4a, 0x45b => 0x40b, 0x456 => 0x406,
            0x103 => 0x102, 0x3bb => 0x39b, 0xf1 => 0xd1, 0x43d => 0x41d, 0x3cc => 0x38c, 0xe9 => 0xc9,
            0xf0 => 0xd0, 0x457 => 0x407, 0x123 => 0x122
        );
    }

    // An empty string is valid UTF-8. Without this guard it decodes to an
    // empty array and would be reported as invalid by the check below.
    if ($string === '') {
        return '';
    }

    $unicode = utf8_to_unicode($string);
    if (!$unicode) {
        return false;
    }

    $length = count($unicode);
    for ($i = 0; $i < $length; $i++) {
        if (isset($lower_to_upper[$unicode[$i]])) {
            $unicode[$i] = $lower_to_upper[$unicode[$i]];
        }
    }

    return unicode_to_utf8($unicode);
}
/**
 * UTF-8 aware alternative to strtolower().
 *
 * Decodes the string with utf8_to_unicode(), replaces every code point that
 * has an entry in the built-in uppercase => lowercase table, and re-encodes
 * the result with static::fromUnicode().
 *
 * @param string $string UTF-8 encoded input
 *
 * @note Port of phputf8's utf8_strtolower()
 *
 * @return string|false the lowercased string, or false when utf8_to_unicode()
 *                      yields nothing for a non-empty input
 */
public static function utf8strtolower($string)
{
    static $UTF8_UPPER_TO_LOWER = null;
    if (is_null($UTF8_UPPER_TO_LOWER)) {
        $UTF8_UPPER_TO_LOWER = array(
            0x41 => 0x61, 0x3a6 => 0x3c6, 0x162 => 0x163, 0xc5 => 0xe5, 0x42 => 0x62, 0x139 => 0x13a,
            0xc1 => 0xe1, 0x141 => 0x142, 0x38e => 0x3cd, 0x100 => 0x101, 0x490 => 0x491, 0x394 => 0x3b4,
            0x15a => 0x15b, 0x44 => 0x64, 0x393 => 0x3b3, 0xd4 => 0xf4, 0x42a => 0x44a, 0x419 => 0x439,
            0x112 => 0x113, 0x41c => 0x43c, 0x15e => 0x15f, 0x143 => 0x144, 0xce => 0xee, 0x40e => 0x45e,
            0x42f => 0x44f, 0x39a => 0x3ba, 0x154 => 0x155, 0x49 => 0x69, 0x53 => 0x73, 0x1e1e => 0x1e1f,
            0x134 => 0x135, 0x427 => 0x447, 0x3a0 => 0x3c0, 0x418 => 0x438, 0xd3 => 0xf3, 0x420 => 0x440,
            0x404 => 0x454, 0x415 => 0x435, 0x429 => 0x449, 0x14a => 0x14b, 0x411 => 0x431, 0x409 => 0x459,
            0x1e02 => 0x1e03, 0xd6 => 0xf6, 0xd9 => 0xf9, 0x4e => 0x6e, 0x401 => 0x451, 0x3a4 => 0x3c4,
            0x423 => 0x443, 0x15c => 0x15d, 0x403 => 0x453, 0x3a8 => 0x3c8, 0x158 => 0x159, 0x47 => 0x67,
            0xc4 => 0xe4, 0x386 => 0x3ac, 0x389 => 0x3ae, 0x166 => 0x167, 0x39e => 0x3be, 0x164 => 0x165,
            0x116 => 0x117, 0x108 => 0x109, 0x56 => 0x76, 0xde => 0xfe, 0x156 => 0x157, 0xda => 0xfa,
            0x1e60 => 0x1e61, 0x1e82 => 0x1e83, 0xc2 => 0xe2, 0x118 => 0x119, 0x145 => 0x146, 0x50 => 0x70,
            0x150 => 0x151, 0x42e => 0x44e, 0x128 => 0x129, 0x3a7 => 0x3c7, 0x13d => 0x13e, 0x422 => 0x442,
            0x5a => 0x7a, 0x428 => 0x448, 0x3a1 => 0x3c1, 0x1e80 => 0x1e81, 0x16c => 0x16d, 0xd5 => 0xf5,
            0x55 => 0x75, 0x176 => 0x177, 0xdc => 0xfc, 0x1e56 => 0x1e57, 0x3a3 => 0x3c3, 0x41a => 0x43a,
            0x4d => 0x6d, 0x16a => 0x16b, 0x170 => 0x171, 0x424 => 0x444, 0xcc => 0xec, 0x168 => 0x169,
            0x39f => 0x3bf, 0x4b => 0x6b, 0xd2 => 0xf2, 0xc0 => 0xe0, 0x414 => 0x434, 0x3a9 => 0x3c9,
            0x1e6a => 0x1e6b, 0xc3 => 0xe3, 0x42d => 0x44d, 0x416 => 0x436, 0x1a0 => 0x1a1, 0x10c => 0x10d,
            0x11c => 0x11d, 0xd0 => 0xf0, 0x13b => 0x13c, 0x40f => 0x45f, 0x40a => 0x45a, 0xc8 => 0xe8,
            0x3a5 => 0x3c5, 0x46 => 0x66, 0xdd => 0xfd, 0x43 => 0x63, 0x21a => 0x21b, 0xca => 0xea,
            0x399 => 0x3b9, 0x179 => 0x17a, 0xcf => 0xef, 0x1af => 0x1b0, 0x45 => 0x65, 0x39b => 0x3bb,
            0x398 => 0x3b8, 0x39c => 0x3bc, 0x40c => 0x45c, 0x41f => 0x43f, 0x42c => 0x44c, 0xde => 0xfe,
            0xd0 => 0xf0, 0x1ef2 => 0x1ef3, 0x48 => 0x68, 0xcb => 0xeb, 0x110 => 0x111, 0x413 => 0x433,
            0x12e => 0x12f, 0xc6 => 0xe6, 0x58 => 0x78, 0x160 => 0x161, 0x16e => 0x16f, 0x391 => 0x3b1,
            0x407 => 0x457, 0x172 => 0x173, 0x178 => 0xff, 0x4f => 0x6f, 0x41b => 0x43b, 0x395 => 0x3b5,
            0x425 => 0x445, 0x120 => 0x121, 0x17d => 0x17e, 0x17b => 0x17c, 0x396 => 0x3b6, 0x392 => 0x3b2,
            0x388 => 0x3ad, 0x1e84 => 0x1e85, 0x174 => 0x175, 0x51 => 0x71, 0x417 => 0x437, 0x1e0a => 0x1e0b,
            0x147 => 0x148, 0x104 => 0x105, 0x408 => 0x458, 0x14c => 0x14d, 0xcd => 0xed, 0x59 => 0x79,
            0x10a => 0x10b, 0x38f => 0x3ce, 0x52 => 0x72, 0x410 => 0x430, 0x405 => 0x455, 0x402 => 0x452,
            0x126 => 0x127, 0x136 => 0x137, 0x12a => 0x12b, 0x38a => 0x3af, 0x42b => 0x44b, 0x4c => 0x6c,
            0x397 => 0x3b7, 0x124 => 0x125, 0x218 => 0x219, 0xdb => 0xfb, 0x11e => 0x11f, 0x41e => 0x43e,
            0x1e40 => 0x1e41, 0x39d => 0x3bd, 0x106 => 0x107, 0x3ab => 0x3cb, 0x426 => 0x446, 0xde => 0xfe,
            0xc7 => 0xe7, 0x3aa => 0x3ca, 0x421 => 0x441, 0x412 => 0x432, 0x10e => 0x10f, 0xd8 => 0xf8,
            0x57 => 0x77, 0x11a => 0x11b, 0x54 => 0x74, 0x4a => 0x6a, 0x40b => 0x45b, 0x406 => 0x456,
            0x102 => 0x103, 0x39b => 0x3bb, 0xd1 => 0xf1, 0x41d => 0x43d, 0x38c => 0x3cc, 0xc9 => 0xe9,
            0xd0 => 0xf0, 0x407 => 0x457, 0x122 => 0x123
        );
    }

    // An empty string is valid UTF-8. Without this guard it decodes to an
    // empty array and would be reported as invalid by the check below.
    if ($string === '') {
        return '';
    }

    $uni = utf8_to_unicode($string);
    if (!$uni) {
        return false;
    }

    $cnt = count($uni);
    for ($i = 0; $i < $cnt; $i++) {
        if (isset($UTF8_UPPER_TO_LOWER[$uni[$i]])) {
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }

    return static::fromUnicode($uni);
}
/**
 * Convert a UTF-8 string to GBK using the gb-unicode.table mapping file.
 *
 * The mapping is loaded once into the global $UC2GBTABLE (Unicode code point
 * => GB code) from CODETABLEDIR . "gb-unicode.table". Each table line is read
 * as a hex GB code in columns 0-5 and a hex Unicode code point in columns 7-12.
 * ASCII bytes are copied through. Every multi-byte character is decoded with
 * utf8_to_unicode() and either written as two GBK bytes (table value + 0x8080)
 * or, when missing from the table, as a decimal "&#N;" reference. The result
 * is trim()med.
 *
 * If the table file cannot be opened the table stays empty, so every
 * non-ASCII character is emitted as a "&#N;" reference.
 *
 * @param string $_obfuscate_lEJkeU8 UTF-8 encoded input
 * @return string converted string
 */
function utf8_to_gbk($_obfuscate_lEJkeU8)
{
    global $UC2GBTABLE;

    $utfstr = $_obfuscate_lEJkeU8;

    // Load the table only once. The previous check tested an unrelated,
    // never-assigned variable, so the file was re-read on every call.
    if (empty($UC2GBTABLE)) {
        $filename = CODETABLEDIR . "gb-unicode.table";
        $fp = fopen($filename, "rb");
        if ($fp !== false) {
            while ($line = fgets($fp, 15)) {
                $UC2GBTABLE[hexdec(substr($line, 7, 6))] = hexdec(substr($line, 0, 6));
            }
            fclose($fp);
        }
    }

    $okstr = "";
    $ulen = strlen($utfstr);
    for ($i = 0; $i < $ulen; ++$i) {
        $c = $utfstr[$i];
        $lead = ord($c);

        if ($lead < 0x80) {
            // Plain ASCII byte.
            $okstr .= $c;
            continue;
        }

        // Number of continuation bytes announced by the lead byte. Previously
        // this was always 2, which mis-parsed 2- and 4-byte sequences.
        if (($lead & 0xE0) === 0xC0) {
            $extra = 1;
        } elseif (($lead & 0xF0) === 0xE0) {
            $extra = 2;
        } elseif (($lead & 0xF8) === 0xF0) {
            $extra = 3;
        } else {
            // Stray continuation or invalid lead byte: handed on as a single byte.
            // NOTE(review): confirm utf8_to_unicode() copes with a lone byte.
            $extra = 0;
        }

        // Stop at the end of the input if the sequence is truncated.
        for ($j = 0; $j < $extra && $i + 1 < $ulen; ++$j) {
            ++$i;
            $c .= $utfstr[$i];
        }

        $c = utf8_to_unicode($c);
        if (isset($UC2GBTABLE[$c])) {
            // Split the four hex digits of the GBK code into its two bytes.
            $c = dechex($UC2GBTABLE[$c] + 32896);
            $okstr .= chr(hexdec($c[0] . $c[1])) . chr(hexdec($c[2] . $c[3]));
        } else {
            $okstr .= "&#" . $c . ";";
        }
    }

    return trim($okstr);
}
/**
 * Convert a UTF-8 string to GBK using the gb-unicode.table mapping file.
 *
 * The mapping is loaded once into the global $UC2GBTABLE (Unicode code point
 * => GB code) from CODETABLEDIR . 'gb-unicode.table'. Each table line is read
 * as a hex GB code in columns 0-5 and a hex Unicode code point in columns 7-12.
 * ASCII bytes are copied through. Every multi-byte character is decoded with
 * utf8_to_unicode() and either written as two GBK bytes (table value + 0x8080)
 * or, when missing from the table, as a decimal '&#N;' reference. The result
 * is trim()med.
 *
 * If the table file cannot be opened the table stays empty, so every
 * non-ASCII character is emitted as a '&#N;' reference.
 *
 * @param string $utfstr UTF-8 encoded input
 * @return string converted string
 */
function utf8_to_gbk($utfstr)
{
    global $UC2GBTABLE;

    if (empty($UC2GBTABLE)) {
        $filename = CODETABLEDIR . 'gb-unicode.table';
        $fp = fopen($filename, 'rb');
        if ($fp !== false) {
            while ($l = fgets($fp, 15)) {
                $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));
            }
            fclose($fp);
        }
    }

    $okstr = '';
    $ulen = strlen($utfstr);
    for ($i = 0; $i < $ulen; $i++) {
        $c = $utfstr[$i];
        $lead = ord($c);

        if ($lead < 0x80) {
            // Plain ASCII byte.
            $okstr .= $c;
            continue;
        }

        // Number of continuation bytes announced by the lead byte. Previously
        // this was always 2, which mis-parsed 2- and 4-byte sequences.
        if (($lead & 0xE0) === 0xC0) {
            $csize = 1;
        } elseif (($lead & 0xF0) === 0xE0) {
            $csize = 2;
        } elseif (($lead & 0xF8) === 0xF0) {
            $csize = 3;
        } else {
            // Stray continuation or invalid lead byte: handed on as a single byte.
            // NOTE(review): confirm utf8_to_unicode() copes with a lone byte.
            $csize = 0;
        }

        // Stop at the end of the input if the sequence is truncated.
        for ($j = 0; $j < $csize && $i + 1 < $ulen; $j++) {
            $i++;
            $c .= $utfstr[$i];
        }

        $c = utf8_to_unicode($c);
        if (isset($UC2GBTABLE[$c])) {
            // Split the four hex digits of the GBK code into its two bytes.
            $c = dechex($UC2GBTABLE[$c] + 0x8080);
            $okstr .= chr(hexdec($c[0] . $c[1])) . chr(hexdec($c[2] . $c[3]));
        } else {
            $okstr .= '&#' . $c . ';';
        }
    }

    return trim($okstr);
}
/**
 * Detects which encoding is needed for a particular text.
 *
 * @param string|array $text      UTF-8 string, or an array that is used as-is
 *                                in place of the utf8_to_unicode() result
 * @param array        &$ex_chars receives the entries of $text that also occur in
 *                                int_gsm_7bit_ex_map(); it is left untouched when
 *                                UTF16 is returned
 * @return mixed one of self::GSM_7BIT, self::GSM_7BIT_EX, self::UTF16
 */
public static function detect_encoding($text, &$ex_chars)
{
    $codepoints = is_array($text) ? $text : utf8_to_unicode($text);

    // Anything outside the combined GSM 7-bit map forces UTF-16.
    $outsideGsm = array_diff($codepoints, self::int_gsm_7bit_combined_map());
    if (count($outsideGsm) > 0) {
        return self::UTF16;
    }

    $ex_chars = array_intersect($codepoints, self::int_gsm_7bit_ex_map());

    return count($ex_chars) > 0 ? self::GSM_7BIT_EX : self::GSM_7BIT;
}
/**
 * Undo Korean irregular conjugation at the boundary between a stem and its ending.
 *
 * The last syllable of $stem and the first syllable of the ending in $match[1]
 * are decomposed into jamo with hangul_to_jamo(). A sequence of rewrite rules
 * then moves jamo/syllables between stem and ending: splitting a final ㅆ,
 * the 르/ㄷ/ㅅ/ㅎ/우 irregulars, vowel contraction, and the honorific 시.
 * The rules depend on each other's side effects; their order matters.
 *
 * Fixes relative to the previous version:
 *  - in_array() in the ㄷ-irregular rule was called with an int as haystack
 *    (a TypeError on PHP 8); it now receives array(0x11ab, 0x11af).
 *  - in the 우-irregular rule the rebuilt stem syllable was computed but never
 *    assigned, so $ch received the ending syllable instead.
 *  - $ej[0] was assigned twice where $ej[0]/$ej[1] were meant.
 *  - reads of $ch / $j after unset() now go through empty()/isset().
 *
 * @param string $stem  UTF-8 stem candidate
 * @param array  $match array whose element 1 holds the UTF-8 ending; it is rewritten
 *                      in place (presumably a regex match array - confirm with caller)
 * @return string the rewritten stem as UTF-8, or $stem unchanged when its last
 *                character is not Hangul according to $this->isHangul()
 */
function verbIrr($stem, &$match)
{
    // Handle the various regular and irregular conjugation rules.
    $ustem = utf8_to_unicode($stem);
    $uend = utf8_to_unicode($match[1]);
    $ch = array_pop($ustem);   // last syllable of the stem
    $ed = $uend[0];            // first syllable of the ending
    $save = '';

    if ($this->isHangul($ch)) {
        $j = hangul_to_jamo($ch);
        $ej = hangul_to_jamo($ed);
        $sj = sizeof($j);

        if ($sj == 3 and $j[2] == 0x11bb) {
            // Stem syllable ends in final ㅆ, e.g. 랐-다, 었-다, 겠-다, 였-다
            if (in_array($j[1], array(0x1161, 0x1165, 0x1166, 0x1167))) {
                if ($j[0] == 0x1105 and in_array($j[1], array(0x1161, 0x1165, 0x1167))) {
                    // 랐, 렀, 렸 (e.g. 갈렸-다): left untouched
                } elseif (in_array($j[0], array(0x1100, 0x110b, 0x110c))) {
                    // 겠, 았: the whole syllable belongs to the ending
                    array_unshift($uend, $ch);
                    unset($ch);
                } elseif ($j[1] == 0x1167 and in_array($j[0], array(0x1101, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112))) {
                    // 여 conversion: 혔 -> ㅎ+었 -> 히+었
                    $j[1] = 0x1165;
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    // 혔 -> 히+었, 폈 -> 피+었
                    $j[1] = 0x1175;
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                } elseif (in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111, 0x1112))) {
                    // 우-irregular: 떴 -> ㄸ + 었
                    $syll = jamo_to_syllable(array(0x110b, $j[1], $j[2]));
                    array_unshift($uend, $syll[0]);
                    // ㄸ -> 뜨 (vowel ㅡ); for ㅍ the vowel is ㅜ: 펐 -> 푸+었
                    $j[1] = 0x1173;
                    if ($j[0] == 0x1111) {
                        $j[1] = 0x116e;
                    }
                    // Fixed: the rebuilt syllable (e.g. 쓰) is now actually assigned.
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                }
            } else {
                if ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                    // 했 -> 하 + 였
                    array_push($ustem, 0xd558); // 하
                    $syll = jamo_to_syllable(array(0x110b, 0x1167, 0x11bb));
                    array_unshift($uend, $syll[0]);
                    unset($ch);
                } else {
                    // Strip off the final ㅆ and hand it to the ending.
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    array_unshift($uend, $j[2]);
                    $ch = $syll[0];
                    unset($j[2]);
                }
            }

            if (empty($ch)) {
                // The syllable moved to the ending; continue with the previous one.
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }
            $ed = $uend[0];
            $ej = hangul_to_jamo($ed);
        } else {
            if (!empty($j[2]) and in_array($j[2], array(0x11ab, 0x11af, 0x11b8))) {
                // 합-시다 갑-시다 갈-래  ->  하-ㅂ시다 가-ㅂ시다 가-ㄹ래
                if ($j[2] == 0x11af and $ej[0] == 0x1105) {
                    // 르-irregular: 흘-러 : 흐르+러
                    unset($j[2]);
                    $syll = jamo_to_syllable($j);
                    array_push($ustem, $syll[0]); // 흐
                    $j[0] = $ej[0];
                    $j[1] = 0x1173;
                    $syll = jamo_to_syllable($j); // 르
                    $ch = $syll[0];
                } else {
                    // Move the final consonant over to the ending.
                    array_unshift($uend, $j[2]);
                    $syll = jamo_to_syllable(array($j[0], $j[1]));
                    $ch = $syll[0];
                    $ed = $j[2];
                    unset($j[2]);
                }
            }
        }

        // ㄷ-irregular: 들-어 -> 듣-다
        $sj = sizeof($j);
        if ($sj == 3 and $j[2] == 0x11af and in_array($ej[0], array(0x110b, 0x1105))) {
            // Single-pass loop, used so that the guards below can bail out with break.
            while (in_array($ej[1], array(0x1161, 0x1165, 0x1173))) {
                // ending starts with 아/어/으 or 라/러/르
                $se = sizeof($ej);
                if ($se == 3) {
                    // with ㅡ only the endings 은 / 을 qualify
                    if ($ej[1] == 0x1173 and !in_array($ej[2], array(0x11ab, 0x11af))) {
                        break;
                    }
                } else {
                    if ($j[2] == 0x11af and sizeof($ej) == 2 and $ej[0] == 0x1105) {
                        break;
                    }
                }
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11ae));
                $ch = $syll[0];
                break;
            }
        }

        // ㅅ-irregular: 지-어 : 짓-어, 이-어 : 잇-어
        if (sizeof($ej) == 2) {
            if ($ej[0] == 0x110b) {
                $j[2] = 0x11ba;
                $syll = jamo_to_syllable($j); // add final ㅅ
                $ch = $syll[0];
                $sj = 3;
            }
        }

        if ($sj == 2) {
            if (in_array($j[0], array(0x110c)) and in_array($j[1], array(0x116e, 0x1175))) {
                // 주, 지: belongs to the ending
                array_unshift($uend, $ch);
                unset($ch);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            if ($j[1] == 0x1165 and in_array($j[0], array(0x1101, 0x1104, 0x110a, 0x1111))) {
                // 꺼, 떠, 써, 퍼: split off 어 and restore the stem vowel
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); // 어
                array_unshift($uend, $syll[0]);
                if ($j[0] == 0x1111) {
                    $syll = jamo_to_syllable(array($j[0], 0x116e));
                } else {
                    $syll = jamo_to_syllable(array($j[0], 0x1173)); // e.g. 쓰
                }
                array_push($ustem, $syll[0]);
                unset($ch);
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // Vowel contraction
            if (in_array($j[0], array(0x1105, 0x1112)) and $j[1] == 0x1162) {
                // ㅎ-irregular (ending): 파랗+아서 -> 파라+아서 -> 파래서
                // 파래-서 -> 파라-아서
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); // 래 -> 라 + 아
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1161)); // 아
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1161;
            } elseif ($j[0] == 0x1112 and in_array($j[1], array(0x1162))) {
                // 해-서 = 하-여서
                // NOTE(review): unreachable - the branch above already matches ㅎ+ㅐ.
                // Left in place because changing the priority would alter results.
                $j[1] = 0x1161;
                $syll = jamo_to_syllable($j); // 해 -> 하 + 여
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1167)); // 여
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1167;
            } elseif (in_array($j[0], array(0x1105, 0x1109)) and in_array($j[1], array(0x1167))) {
                // 하셔-서 = 하시-어서, 가려-서 = 가리-어서
                $j[1] = 0x1175; // ㅣ
                $syll = jamo_to_syllable($j); // ㅕ -> 이-어
                $ch = $syll[0];
                $syll = jamo_to_syllable(array(0x110b, 0x1165)); // 어
                $ed = $syll[0];
                array_unshift($uend, $ed);
                $ej[0] = 0x110b;
                $ej[1] = 0x1165;
            }

            if ($j[0] == 0x1109 and $j[1] == 0x1175) {
                // 시: honorific marker, belongs to the ending
                array_unshift($uend, $ch);
                $ej = $j;
                $ch = array_pop($ustem);
                $j = hangul_to_jamo($ch);
            }

            // ㅎ-irregular
            if (in_array($j[0], array(0x1105, 0x1106)) and in_array($j[1], array(0x1161, 0x1165))) {
                $syll = jamo_to_syllable(array($j[0], $j[1], 0x11c2)); // 랗, 렇
                array_push($ustem, $syll[0]);
                unset($ch);
                unset($j);
            }
        }

        // Single-pass loop (always ends in break), kept as in the original.
        while ($sj == 2 and isset($j) and $j[0] == 0x110b and in_array($j[1], array(0x116a, 0x116e, 0x116f)) and sizeof($ustem) >= 1) {
            // XXX 그리워: 그리우+어 -> 그립+워   (와 / 우 / 워)
            $ch1 = array_pop($ustem);
            $jamo = hangul_to_jamo($ch1);
            if (sizeof($jamo) == 2) {
                if ($jamo[1] != 0x1175) {
                    $syll = jamo_to_syllable(array($jamo[0], $jamo[1], 0x11b8));
                    array_push($ustem, $syll[0]); // add final ㅂ
                } else {
                    array_push($ustem, $ch1);
                }
                array_unshift($uend, $ch);
                unset($ch);
            } else {
                array_push($ustem, $ch1);
            }
            break;
        }

        if (!empty($ch)) {
            array_push($ustem, $ch);
        }
        $match[1] = unicode_to_utf8($uend);
        return unicode_to_utf8($ustem);
    }

    // Last stem character is not Hangul: nothing to rewrite.
    $match[1] = $save . $match[1];
    return $stem;
}
/**
 * Convert a UTF-8 string to a safe ASCII string.
 *
 * The string is decoded with utf8_to_unicode() and the resulting code points
 * are passed to self::unicode_to_safe(), which performs the conversion.
 * As described for that conversion:
 * - a plain or post_indicator character is emitted as its ASCII byte; if the
 *   previous character was "converted", a post_indicator is emitted first and
 *   the "converted" flag is cleared
 * - a pre_indicator character is emitted as its ASCII byte and sets the
 *   "converted" flag
 * - any other character has its code point reduced to skip the non-printable
 *   ASCII range (0x00 - 0x1f, so that space becomes zero), is written in
 *   base36 (0-9a-z) after a pre_indicator character, and sets the
 *   "converted" flag
 *
 * @param string $filename a utf8 string, should only include printable characters - not 0x00-0x1f
 * @return string an encoded representation of $filename using only 'safe' ASCII characters
 *
 * @author Christopher Smith <*****@*****.**>
 */
public function encode($filename)
{
    $codepoints = utf8_to_unicode($filename);

    return self::unicode_to_safe($codepoints);
}
/**
 * Send one SMS through the HTTP gateway configured in the global $clktl_param
 * and record the resulting delivery status.
 *
 * Fixes relative to the previous version:
 *  - the response was parsed with split(), which no longer exists in PHP 7+,
 *    and was given the array returned by file(); the first response line is
 *    now parsed with explode()
 *  - $clktl_param keys were unquoted barewords (an error on PHP 8)
 *
 * NOTE(review): in the reviewed copy the user/password expressions were masked
 * ("******"). They are reconstructed here as $clktl_param['username'] and
 * $clktl_param['password'] - confirm the key names against the gateway config.
 *
 * @param string $mobile_sender sender number, used when $gateway_number is not set
 * @param string $sms_to        destination number
 * @param string $sms_msg       message text
 * @param string $gp_code       unused in this function
 * @param string $uid           user id passed to setsmsdeliverystatus()
 * @param string $smslog_id     log id used to store the API message id and status
 * @param string $msg_type      "text" (default), "flash", "logo", "picture", "ringtone" or "rtttl"
 * @param string $unicode       truthy to send $sms_msg through utf8_to_unicode() instead of rawurlencode()
 * @return bool true when the gateway returned any response, false otherwise
 */
function gw_send_sms($mobile_sender, $sms_to, $sms_msg, $gp_code = "", $uid = "", $smslog_id = "", $msg_type = "text", $unicode = "0")
{
    global $clktl_param;
    global $gateway_number;

    $sms_from = $gateway_number ? $gateway_number : $mobile_sender;

    switch ($msg_type) {
        case "flash":
            $sms_type = "SMS_FLASH";
            break;
        case "logo":
            $sms_type = "SMS_NOKIA_OLOGO";
            break;
        case "picture":
            $sms_type = "SMS_NOKIA_PICTURE";
            break;
        case "ringtone":
        case "rtttl":
            $sms_type = "SMS_NOKIA_RTTTL";
            break;
        case "text":
        default:
            $sms_type = "SMS_TEXT";
    }

    // NOTE(review): utf8_to_unicode() must return a URL-safe string here,
    // because its result goes into the query unescaped - confirm.
    if ($unicode) {
        $text = utf8_to_unicode($sms_msg);
    } else {
        $text = rawurlencode($sms_msg);
    }

    // The "concat" parameter is intentionally not sent.
    $query_string = "sendmsg?api_id=" . $clktl_param['api_id']
        . "&user=" . $clktl_param['username']
        . "&password=" . $clktl_param['password']
        . "&to={$sms_to}&msg_type={$sms_type}&text={$text}"
        . "&deliv_ack=1&callback=3&unicode={$unicode}&from=" . rawurlencode($sms_from);
    $url = $clktl_param['send_url'] . "/" . $query_string;

    $fd = file($url);

    $ok = false;
    $p_status = DLR_FAILED;
    if ($fd) {
        // Only the first response line is inspected; it is split at ":".
        $response = explode(":", trim($fd[0]));
        $apimsgid = isset($response[1]) ? trim($response[1]) : '';

        if (strtoupper(trim($response[0])) == "ID") {
            if ($apimsgid) {
                clktl_setsmsapimsgid($smslog_id, $apimsgid);
                list($c_sms_credit, $c_sms_status) = clktl_getsmsstatus($smslog_id);
                $p_status = $c_sms_status ? $c_sms_status : DLR_PENDING;
            } else {
                $p_status = DLR_SENT;
            }
        }
        // Any response counts as "request went through", as before.
        $ok = true;
    }

    setsmsdeliverystatus($smslog_id, $uid, $p_status);

    return $ok;
}
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when mbstring is available and UTF8_NOMBSTRING is
 * not defined. The fallback decodes with utf8_to_unicode() and packs each code
 * point as a big-endian 16-bit unit; code points above U+FFFF are written as
 * a surrogate pair instead of being truncated.
 *
 * @param string $str UTF-8 encoded input (taken by reference, not modified)
 * @param bool   $bom prepend the UTF-16BE byte order mark (0xFE 0xFF) when true
 * @return string UTF-16BE encoded bytes
 */
function utf8_to_utf16be(&$str, $bom = false)
{
    // Written as byte escapes so the BOM survives any re-encoding of this file.
    $out = $bom ? "\xFE\xFF" : '';

    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')) {
        return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
    }

    $uni = utf8_to_unicode($str);
    foreach ($uni as $cp) {
        if ($cp > 0xFFFF) {
            // Supplementary plane: high surrogate followed by low surrogate.
            $cp -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
        } else {
            $out .= pack('n', $cp);
        }
    }

    return $out;
}
/**
 * UTF-8 aware alternative to strtoupper().
 *
 * Decodes the string with utf8_to_unicode(), replaces every code point that
 * has an entry in the built-in lowercase => uppercase table, and re-encodes
 * the result with utf8_from_unicode().
 *
 * @param string $string UTF-8 encoded input
 * @return string|false the uppercased string, or false when utf8_to_unicode()
 *                      yields nothing for a non-empty input
 */
function utf8_strtoupper($string)
{
    static $UTF8_LOWER_TO_UPPER = NULL;

    if (is_null($UTF8_LOWER_TO_UPPER)) {
        $UTF8_LOWER_TO_UPPER = array(
            0x0061 => 0x0041, 0x03C6 => 0x03A6, 0x0163 => 0x0162, 0x00E5 => 0x00C5, 0x0062 => 0x0042,
            0x013A => 0x0139, 0x00E1 => 0x00C1, 0x0142 => 0x0141, 0x03CD => 0x038E, 0x0101 => 0x0100,
            0x0491 => 0x0490, 0x03B4 => 0x0394, 0x015B => 0x015A, 0x0064 => 0x0044, 0x03B3 => 0x0393,
            0x00F4 => 0x00D4, 0x044A => 0x042A, 0x0439 => 0x0419, 0x0113 => 0x0112, 0x043C => 0x041C,
            0x015F => 0x015E, 0x0144 => 0x0143, 0x00EE => 0x00CE, 0x045E => 0x040E, 0x044F => 0x042F,
            0x03BA => 0x039A, 0x0155 => 0x0154, 0x0069 => 0x0049, 0x0073 => 0x0053, 0x1E1F => 0x1E1E,
            0x0135 => 0x0134, 0x0447 => 0x0427, 0x03C0 => 0x03A0, 0x0438 => 0x0418, 0x00F3 => 0x00D3,
            0x0440 => 0x0420, 0x0454 => 0x0404, 0x0435 => 0x0415, 0x0449 => 0x0429, 0x014B => 0x014A,
            0x0431 => 0x0411, 0x0459 => 0x0409, 0x1E03 => 0x1E02, 0x00F6 => 0x00D6, 0x00F9 => 0x00D9,
            0x006E => 0x004E, 0x0451 => 0x0401, 0x03C4 => 0x03A4, 0x0443 => 0x0423, 0x015D => 0x015C,
            0x0453 => 0x0403, 0x03C8 => 0x03A8, 0x0159 => 0x0158, 0x0067 => 0x0047, 0x00E4 => 0x00C4,
            0x03AC => 0x0386, 0x03AE => 0x0389, 0x0167 => 0x0166, 0x03BE => 0x039E, 0x0165 => 0x0164,
            0x0117 => 0x0116, 0x0109 => 0x0108, 0x0076 => 0x0056, 0x00FE => 0x00DE, 0x0157 => 0x0156,
            0x00FA => 0x00DA, 0x1E61 => 0x1E60, 0x1E83 => 0x1E82, 0x00E2 => 0x00C2, 0x0119 => 0x0118,
            0x0146 => 0x0145, 0x0070 => 0x0050, 0x0151 => 0x0150, 0x044E => 0x042E, 0x0129 => 0x0128,
            0x03C7 => 0x03A7, 0x013E => 0x013D, 0x0442 => 0x0422, 0x007A => 0x005A, 0x0448 => 0x0428,
            0x03C1 => 0x03A1, 0x1E81 => 0x1E80, 0x016D => 0x016C, 0x00F5 => 0x00D5, 0x0075 => 0x0055,
            0x0177 => 0x0176, 0x00FC => 0x00DC, 0x1E57 => 0x1E56, 0x03C3 => 0x03A3, 0x043A => 0x041A,
            0x006D => 0x004D, 0x016B => 0x016A, 0x0171 => 0x0170, 0x0444 => 0x0424, 0x00EC => 0x00CC,
            0x0169 => 0x0168, 0x03BF => 0x039F, 0x006B => 0x004B, 0x00F2 => 0x00D2, 0x00E0 => 0x00C0,
            0x0434 => 0x0414, 0x03C9 => 0x03A9, 0x1E6B => 0x1E6A, 0x00E3 => 0x00C3, 0x044D => 0x042D,
            0x0436 => 0x0416, 0x01A1 => 0x01A0, 0x010D => 0x010C, 0x011D => 0x011C, 0x00F0 => 0x00D0,
            0x013C => 0x013B, 0x045F => 0x040F, 0x045A => 0x040A, 0x00E8 => 0x00C8, 0x03C5 => 0x03A5,
            0x0066 => 0x0046, 0x00FD => 0x00DD, 0x0063 => 0x0043, 0x021B => 0x021A, 0x00EA => 0x00CA,
            0x03B9 => 0x0399, 0x017A => 0x0179, 0x00EF => 0x00CF, 0x01B0 => 0x01AF, 0x0065 => 0x0045,
            0x03BB => 0x039B, 0x03B8 => 0x0398, 0x03BC => 0x039C, 0x045C => 0x040C, 0x043F => 0x041F,
            0x044C => 0x042C, 0x00FE => 0x00DE, 0x00F0 => 0x00D0, 0x1EF3 => 0x1EF2, 0x0068 => 0x0048,
            0x00EB => 0x00CB, 0x0111 => 0x0110, 0x0433 => 0x0413, 0x012F => 0x012E, 0x00E6 => 0x00C6,
            0x0078 => 0x0058, 0x0161 => 0x0160, 0x016F => 0x016E, 0x03B1 => 0x0391, 0x0457 => 0x0407,
            0x0173 => 0x0172, 0x00FF => 0x0178, 0x006F => 0x004F, 0x043B => 0x041B, 0x03B5 => 0x0395,
            0x0445 => 0x0425, 0x0121 => 0x0120, 0x017E => 0x017D, 0x017C => 0x017B, 0x03B6 => 0x0396,
            0x03B2 => 0x0392, 0x03AD => 0x0388, 0x1E85 => 0x1E84, 0x0175 => 0x0174, 0x0071 => 0x0051,
            0x0437 => 0x0417, 0x1E0B => 0x1E0A, 0x0148 => 0x0147, 0x0105 => 0x0104, 0x0458 => 0x0408,
            0x014D => 0x014C, 0x00ED => 0x00CD, 0x0079 => 0x0059, 0x010B => 0x010A, 0x03CE => 0x038F,
            0x0072 => 0x0052, 0x0430 => 0x0410, 0x0455 => 0x0405, 0x0452 => 0x0402, 0x0127 => 0x0126,
            0x0137 => 0x0136, 0x012B => 0x012A, 0x03AF => 0x038A, 0x044B => 0x042B, 0x006C => 0x004C,
            0x03B7 => 0x0397, 0x0125 => 0x0124, 0x0219 => 0x0218, 0x00FB => 0x00DB, 0x011F => 0x011E,
            0x043E => 0x041E, 0x1E41 => 0x1E40, 0x03BD => 0x039D, 0x0107 => 0x0106, 0x03CB => 0x03AB,
            0x0446 => 0x0426, 0x00FE => 0x00DE, 0x00E7 => 0x00C7, 0x03CA => 0x03AA, 0x0441 => 0x0421,
            0x0432 => 0x0412, 0x010F => 0x010E, 0x00F8 => 0x00D8, 0x0077 => 0x0057, 0x011B => 0x011A,
            0x0074 => 0x0054, 0x006A => 0x004A, 0x045B => 0x040B, 0x0456 => 0x0406, 0x0103 => 0x0102,
            0x03BB => 0x039B, 0x00F1 => 0x00D1, 0x043D => 0x041D, 0x03CC => 0x038C, 0x00E9 => 0x00C9,
            0x00F0 => 0x00D0, 0x0457 => 0x0407, 0x0123 => 0x0122
        );
    }

    // An empty string is valid UTF-8. Without this guard it decodes to an
    // empty array and would be reported as invalid by the check below.
    if ($string === '') {
        return '';
    }

    $unicode = utf8_to_unicode($string);
    if (!$unicode) {
        return false;
    }

    $count = count($unicode);
    for ($i = 0; $i < $count; $i++) {
        if (isset($UTF8_LOWER_TO_UPPER[$unicode[$i]])) {
            $unicode[$i] = $UTF8_LOWER_TO_UPPER[$unicode[$i]];
        }
    }

    return utf8_from_unicode($unicode);
}
/**
 * Forwards $str to utf8_to_unicode() and returns its result unchanged.
 *
 * NOTE(review): the enclosing scope is not visible here. If this is a class
 * method, the call resolves to the global utf8_to_unicode() function and this
 * is a plain delegating wrapper. If it is itself the global function, the
 * call is unconditional self-recursion and never terminates - confirm.
 *
 * @param mixed $str value passed through to utf8_to_unicode()
 * @return mixed whatever utf8_to_unicode() returns
 */
function utf8_to_unicode($str) { return utf8_to_unicode($str); }
/**
 * Build a regular-expression fragment from a UTF-8 string in which Hangul
 * characters are expanded into alternatives / character ranges.
 *
 * Non-Hangul characters are copied into the rule as they are. For a code point
 * in U+AC00-U+D7A3 or U+3130-U+318F the character is decomposed with
 * hangul_to_jamo() and a range from a "start" syllable to an "end" syllable
 * (both built with jamo_to_syllable()) is appended.
 *
 * @param string $str         UTF-8 search string
 * @param mixed  $lastchar    when truthy and $str has more than one character, only
 *                            the last character is expanded; the rest is copied verbatim
 * @param bool   $use_unicode true: emit a "[start-end]" class of UTF-8 characters;
 *                            false: emit byte-level "\xHH" escapes built from the
 *                            first three bytes of the start/end syllables
 * @return string the assembled rule (no delimiters are added)
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true) {
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // build the regex from the last character only; everything before it
        // goes into the rule unchanged
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();
        // U+AC00-U+D7A3 or U+3130-U+318F: decompose; anything else is copied as-is
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        // closing bracket for the alternation opened in the $wlen >= 3 case
        $ket = '';
        if ($wlen >= 3) {
            // Syllable with a final consonant, e.g. 종각 => 종(각|가[가-깋]):
            // alternative 1 is the syllable itself, alternative 2 is the syllable
            // without its final, followed by a range derived from that final.
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // the final decomposed into more than one jamo: no range, close the group
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            // Single jamo in U+1100-U+1112: range from jamo+0x1161 to jamo+0x1175+0x11c2
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                // Two jamo: range from the bare syllable to the same syllable
                // with the final 0x11c2 added
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        if ($use_unicode) {
            $crule = '[' . $ustart . '-' . $uend . ']';
        } else {
            // Byte-wise variant: byte 0 of the start syllable is emitted literally,
            // then the range is expressed over bytes 1 and 2.
            $rule .= sprintf("\\x%02X", ord($ustart[0]));
            $crule = '';
            if ($ustart[1] == $uend[1]) {
                // same second byte: a single range over the third byte
                $crule .= sprintf("\\x%02X", ord($ustart[1]));
                $crule .= sprintf("[\\x%02X-\\x%02X]", ord($ustart[2]), ord($uend[2]));
            } else {
                // different second bytes: first partial block, any full blocks in
                // between, then the last partial block
                $sch = ord($ustart[1]);
                $ech = ord($uend[1]);
                $subrule = array();
                $subrule[] = sprintf("\\x%02X[\\x%02X-\\xBF]", $sch, ord($ustart[2]));
                if ($sch + 1 == $ech - 1) {
                    $subrule[] = sprintf("\\x%02X[\\x80-\\xBF]", $sch + 1);
                } else {
                    if ($sch + 1 != $ech) {
                        $subrule[] = sprintf("[\\x%02X-\\x%02X][\\x80-\\xBF]", $sch + 1, $ech - 1);
                    }
                }
                $subrule[] = sprintf("\\x%02X[\\x80-\\x%02X]", ord($uend[1]), ord($uend[2]));
                $crule .= '(' . implode('|', $subrule) . ')';
            }
        }
        $rule .= $crule . $ket;
    }
    return $rule;
}
/**
 * Extract the fields of one status XML fragment and hand them to insertDB().
 *
 * Creation date, status id, text, source, reply-to ids and the raw <user>
 * block are pulled out of $currElement with return_between()/split_string().
 * The text is passed through utf8_to_unicode() and
 * unicode_to_entities_preserving_ascii() before it is searched for a URL.
 * When an http(s) URL is found, the first match is expanded with
 * expandTinyURL(); otherwise the hyperlink stays null.
 *
 * @param string $currElement XML of a single status element
 * @return void
 */
function crawlStatus($currElement) {
    $hyperlink = null;

    $createdAt = return_between($currElement, "<created_at>", "</created_at>", EXCL);
    // format the date to the database datetime type (for date based comparisons)
    $dtFormat = dateFormat($createdAt);

    // the status id is the first <id> element following </created_at>
    $tempsid = split_string($currElement, "</created_at>", AFTER, EXCL);
    $tempsid = split_string($tempsid, "</id>", BEFORE, EXCL);
    $sid = split_string($tempsid, "<id>", AFTER, EXCL);

    $text = return_between($currElement, "<text>", "</text>", EXCL);
    // these two calls handle unicode characters / non-English text
    $text = utf8_to_unicode($text);
    $text = unicode_to_entities_preserving_ascii($text);

    // extract a URL from the tweet, if present; the pattern can be refined for better handling
    $found = preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $text, $matches);
    // preg_match() returns 1 on a match, 0 on no match and false on error.
    // The previous "$do = true" was an assignment, so this branch always ran
    // and read $matches[0] even when nothing had matched.
    if ($found === 1) {
        // tweets usually contain shortened URLs, so expansion is needed
        $hyperlink = expandTinyURL(htmlentities($matches[0]));
    }

    $src = return_between($currElement, "<source>", "</source>", EXCL);
    $src = strip_tags($src);

    // reply-to information, if the tweet is a reply
    $rts = return_between($currElement, "<in_reply_to_status_id>", "</in_reply_to_status_id>", EXCL);
    $rtu = return_between($currElement, "<in_reply_to_user_id>", "</in_reply_to_user_id>", EXCL);

    // user information is passed on as the raw <user> block
    $userprofile = return_between($currElement, "<user>", "</user>", EXCL);

    insertDB($sid, $text, $hyperlink, $dtFormat, $rts, $rtu, $src, $userprofile);
}
// Read the input handle line by line and replace every known symbol from
// $translation in the current line. (This excerpt ends inside the loop body.)
while (!feof($handlein)) {
    $buffer = fgets($handlein, 8192);
    $encoding_replaced = false;
    //$i = 0;
    // keep going for as long as the line still contains a '$'
    while (strpos($buffer, '$') !== false) {
        //echo $i++ . ": strpos=". (string)strpos($buffer, '$') ."\nbuffer: $buffer\n";
        $replaced = false;
        foreach ($translation as $symbol => $character) {
            $sym_pos = strpos($buffer, $symbol);
            if ($sym_pos !== false) {
                $sym_length = strlen($symbol);
                // text before the symbol
                $piece1 = substr($buffer, 0, $sym_pos);
                if ($character['switch']) {
                    // the character after the special character needs to come before it
                    $partnerchar = utf8_encode($buffer[$sym_pos + $sym_length]);
                    $piece2 = unicode_to_utf8(array_merge(utf8_to_unicode($partnerchar), $character['unicode']));
                    // the partner character has been consumed as well
                    $piece3start = $sym_pos + $sym_length + 1;
                } else {
                    $piece2 = unicode_to_utf8($character['unicode']);
                    $piece3start = $sym_pos + $sym_length;
                }
                // normalise (NFKC) and convert back to ISO-8859-1
                $piece2 = utf8_decode(UtfNormal::NFKC($piece2));
                // strip out any ? characters, which are characters not existing in ISO-8859-1
                $piece2 = str_replace('?', '', $piece2);
                // text after the symbol (and after the partner character, if any)
                $piece3 = substr($buffer, $piece3start);
                $buffer = $piece1 . $piece2 . $piece3;
                $replaced = true;
                // last statement of the foreach body, so this simply moves on to the next symbol
                continue;
            }
        }
        if (!$replaced) {
/**
 * utf8_to_unicode() called with true as its second argument must return
 * false for every input supplied by the data provider.
 *
 * @dataProvider providerFailingUtf8ToUnicode
 */
public function testFailingUtf8ToUnicodeReturnValue($str) {
    // '@' silences any diagnostic raised by the call; only the return value is checked
    $decoded = @utf8_to_unicode($str, true);

    $this->assertFalse($decoded);
}
/**
 * Punycode-style encoding of a UTF-8 string.
 *
 * The text is decoded with utf8_to_unicode(). Code points below $initial_n
 * are copied verbatim into the basic part; every other code point is encoded
 * as variable-length digits via encode_digit(), with the bias recomputed by
 * adapt() after each one.
 *
 * Result layout:
 *  - only basic code points:    the basic string, unchanged
 *  - basic and extended:        $prefix . basic . $delim . extended
 *  - only extended code points: $prefix . extended
 *  - empty input:               '' (previously an undefined variable)
 *
 * Relies on the globals $base, $tmin, $tmax, $initial_bias, $initial_n,
 * $prefix and $delim (presumably the RFC 3492 parameters - confirm where
 * they are defined).
 *
 * @param string $text UTF-8 input
 * @return string encoded string
 */
function encode($text) {
    global $base, $tmin, $tmax, $initial_bias, $initial_n, $prefix, $delim;

    $text = utf8_to_unicode($text);
    $length = sizeof($text);

    $codecount = 0;
    $basic_string = "";
    $extended_string = "";

    // basic code points are copied through as-is
    foreach ($text as $c) {
        if ($c < $initial_n) {
            $basic_string .= chr($c);
            $codecount++;
        }
    }

    $n = $initial_n;
    $delta = 0;
    $bias = $initial_bias;
    $h = $codecount; // number of code points handled so far

    while ($h < $length) {
        // Smallest code point >= $n that is still unhandled. The ceiling must
        // lie above every possible code point: the former fixed value of 100000
        // made input such as U+1F600 loop forever.
        $m = PHP_INT_MAX;
        foreach ($text as $c) {
            if ($c >= $n && $c < $m) {
                $m = $c;
            }
        }

        $delta += ($m - $n) * ($h + 1);
        $n = $m;

        foreach ($text as $c) {
            if ($c < $n) {
                $delta++;
            } elseif ($c == $n) {
                // emit $delta as a generalized variable-length integer
                $q = $delta;
                for ($k = $base; ; $k += $base) {
                    if ($k <= $bias + $tmin) {
                        $t = $tmin;
                    } elseif ($k >= $bias + $tmax) {
                        $t = $tmax;
                    } else {
                        $t = $k - $bias;
                    }
                    if ($q < $t) {
                        break;
                    }
                    $extended_string .= encode_digit($t + ($q - $t) % ($base - $t));
                    // keep $q an integer; floor() alone would hand a float onward
                    $q = (int) floor(($q - $t) / ($base - $t));
                }
                $extended_string .= encode_digit($q);
                $bias = adapt($delta, $h + 1, $h == $codecount);
                $delta = 0;
                $h++;
            }
        }

        $delta++;
        $n++;
    }

    $has_basic = $basic_string !== "";
    $has_extended = $extended_string !== "";

    if ($has_extended) {
        return $has_basic
            ? $prefix . $basic_string . $delim . $extended_string
            : $prefix . $extended_string;
    }

    // no extended part: either the plain basic string or '' for empty input
    return $basic_string;
}
/**
 * A four-byte UTF-8 sequence must decode to its single code point.
 */
function test_from_4byte() {
    // U+100001 (decimal 1048577) as UTF-8 bytes F4 80 80 81. Written with
    // escapes so the input cannot be lost to editor or transport re-encoding;
    // the literal was previously empty, which cannot decode to 1048577.
    $in = "\xF4\x80\x80\x81";
    $out = array(1048577);
    $this->assertEqual(utf8_to_unicode($in), $out);
}
/**
 * Split a UTF-8 string into per-character tokens joined by a separator.
 *
 * Each character that is not skipped as whitespace becomes the token
 * 'ux' followed by the result of utf8_to_unicode() for that character.
 *
 * @param string $str    String to convert
 * @param string $depart Separator between tokens; defaults to a space, i.e. one character per word
 * @return string
 */
function str_to_unicode_word($str, $depart = ' ') {
    $tokens = array();
    $total = mb_strlen($str, 'utf-8');
    $pos = 0;

    while ($pos < $total) {
        $char = mb_substr($str, $pos, 1, 'utf-8');
        $pos++;

        // NOTE(review): both literals look like a plain space here; the second
        // may originally have been another whitespace character - confirm
        if ($char == ' ' || $char == ' ') {
            continue;
        }

        $tokens[] = 'ux' . utf8_to_unicode($char);
    }

    return implode($depart, $tokens);
}
// Counters/buffers for the response. (This excerpt ends inside the retry loop.)
$total_words = 0;
$responseitem = "";

//
// Get the text from the client (POST or GET queries accepted)
//
$stxt = isset($_REQUEST['txt']) ? $_REQUEST['txt'] : FALSE;

//
// If the user submitted text...
//
if ($stxt !== FALSE) {
    //
    // This turns all extended unicode characters into periods.
    // It makes the spellchecker ignore unicode characters without
    // upsetting the character indexes for spellchecker results.
    //
    $stxt = unicode_to_periods(utf8_to_unicode($stxt));

    //
    // Fire up ASpell and load the English dictionary.
    //
    // Possible reasons for this error:
    // - The dictionary you're trying to load is not installed (see aspell.sourceforge.net)
    // - Some other error beyond our control
    //   (PHP/Win32 sometimes has problems initialising the library, trying again usually works.)
    //
    // Added 26-03-2005 - Retry feature.
    // Try loading the library up to 3 times before failing. Seems to work well,
    // except it increases the processing time for the script (the failed tries take some time.)
    //
    $psp = FALSE;
    $loaded_psp = FALSE;
    // at most three attempts
    for ($i = 1; $i <= 3; $i++) {
/**
 * Run an obedit document through the XML-based tag filter.
 *
 * The content is URL-decoded, wrapped in a <parserdocument> root and fed to
 * an XML parser whose element/character handlers build the output document.
 * If parsing fails or a handler aborted filtering, the output is replaced by
 * the submitted content converted with utf8_to_unicode()/unicode_to_entities().
 *
 * @param string $codeContent URL-encoded document as sent by obedit
 * @return string the final HTML document, as returned by getDoc()
 */
function filterHTML($codeContent) {
    // Holds information about the current state of parsing
    // (current tag, last tag, ...); reset on every call.
    global $state;
    $state = array();

    // Clear the document in case the function is called several times on one page
    clearDoc();

    // State defaults, applied in this order
    $defaults = array(
        'abort_filtering'  => false, // set to abort filtering on an unsupported tag
        'current_tag'      => "",    // tag currently being processed
        'in_list'          => false, // inside a list (<ul>, <ol>, ...)?
        'is_ordered_list'  => false, // is the current list an <ol>?
        'last_tag'         => "",    // tag processed last
        'css'              => "",    // css styles to add to the next p tag (textformat attributes)
        'empty'            => true,  // is the element empty?
        'depth'            => 0,     // depth of the current node
        'style_depth'      => 0,     // depth of the current styling node (redundant style removal)
        'last_style_depth' => 0,     // companion to style_depth (redundant style removal)
        'last_depth'       => 0,     // last depth processed
        'last_font'        => "",    // last font family (avoids redundant styles)
        'last_size'        => "",    // last font size (avoids redundant styles)
        'last_color'       => "",    // last foreground color (avoids redundant styles)
    );
    foreach ($defaults as $key => $value) {
        setState($key, $value);
    }

    // obedit url-encodes the content before sending it
    $codeContent = urldecode($codeContent);

    // Set up the XML parser and its handler functions
    $parser = xml_parser_create();
    xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
    xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, "UTF-8");
    xml_set_element_handler($parser, "startElement", "endElement");
    xml_set_character_data_handler($parser, "characterData");

    // The wrapper keeps the XML parser from rejecting the fragment; it is
    // ignored by the filter and does not show up in the output.
    $wrapped = "<?xml version='1.0'?><parserdocument>" . $codeContent . "</parserdocument>";
    $parsedOk = @xml_parse($parser, $wrapped, true);

    if (!$parsedOk || filterWasAborted()) {
        // Malformed or not obedit-generated (e.g. hand-coded) HTML: emit the
        // submitted HTML, still performing the unicode conversion.
        clearDoc();
        add2Doc(unicode_to_entities(utf8_to_unicode($codeContent)));
    } elseif (isInsideList()) {
        // Successfully filtered, but the document ended inside a list:
        // close it with the matching tag.
        add2Doc(isOrderedList() ? "\n</ol>" : "\n</ul>");
        setInsideList(false);
    }

    // Free up memory
    xml_parser_free($parser);

    return getDoc();
}
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is truthy. Otherwise the
 * string is decoded with utf8_to_unicode() and each code point is packed as
 * a big-endian 16-bit unit; code points above 0xFFFF are written as a
 * surrogate pair instead of being truncated to their low 16 bits.
 * Without mbstring the result is still bounded by what utf8_to_unicode()
 * is able to decode.
 *
 * @param string $str UTF-8 string (taken by reference, not modified here)
 * @param bool   $bom prepend the big-endian byte order mark (FE FF)
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    // Escapes rather than raw bytes: a raw FE FF literal is not valid UTF-8
    // and gets replaced by U+FFFD when the source file is re-encoded.
    $out = $bom ? "\xFE\xFF" : '';

    if (UTF8_MBSTRING) {
        return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
    }

    $uni = utf8_to_unicode($str);
    foreach ($uni as $cp) {
        if ($cp > 0xFFFF) {
            // supplementary plane: high surrogate, then low surrogate
            $cp -= 0x10000;
            $out .= pack('n', 0xD800 | ($cp >> 10));
            $out .= pack('n', 0xDC00 | ($cp & 0x3FF));
        } else {
            $out .= pack('n', $cp);
        }
    }
    return $out;
}
/**
 * Build a regular-expression fragment for searching Hangul text.
 *
 * The input is decoded with utf8_to_unicode() and split into a leading part
 * and a last code point:
 *  - $lastchar truthy and more than one code point: the leading part is
 *    copied into the rule unchanged;
 *  - otherwise: each leading code point in 0x3130-0x318F (compatibility jamo)
 *    whose first jamo is a leading consonant is turned into
 *    "(jamo|range of syllables starting with it)"; other code points are copied.
 * In both cases the last code point is then expanded by the final loop.
 *
 * Ranges are rendered by hangul_regex_range().
 *
 * @param string $str         UTF-8 encoded search string
 * @param mixed  $lastchar    truthy: expand only the final code point (default 1)
 * @param bool   $use_unicode passed through to hangul_regex_range()
 * @return string the assembled rule
 */
function utf8_hangul_getSearchRule($str, $lastchar = 1, $use_unicode = true) {
    $rule = '';
    $val = utf8_to_unicode($str);
    $len = sizeof($val);
    if ($lastchar and $len > 1) {
        // make a regex using only the last char; the leading part is kept literally
        $last = array_pop($val);
        $rule = unicode_to_utf8($val);
        $val = array($last);
        $len = sizeof($val);
    } else {
        // make regex for consonant only letters
        // ㄱㅎ => (ㄱ|[가-깋])(ㅎ|[하-힣])
        // save the last char
        // NOTE(review): for an empty input array_pop() yields null here - confirm callers never pass ''
        $last = array_pop($val);
        $len = sizeof($val);
        for ($i = 0; $i < $len; $i++) {
            $ch = $val[$i];
            if ($ch >= 0x3130 and $ch <= 0x318f) {
                // presumably returns the list of jamo making up the character - confirm in hangul_to_jamo()
                $wch = hangul_to_jamo(array($ch));
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    // first syllable of the range: consonant + vowel 0x1161
                    $wch[1] = 0x1161;
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    // last syllable of the range: consonant + vowel 0x1175 + final 0x11c2
                    $wch[1] = 0x1175;
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    // not a leading consonant: append the jamo as-is
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
                // the jamo itself, or any syllable in the range
                $crule = '(' . unicode_to_utf8(array($ch)) . '|';
                $crule .= hangul_regex_range($ustart, $uend, $use_unicode);
                $crule .= ')';
            } else {
                $crule = unicode_to_utf8(array($ch));
            }
            $rule .= $crule;
        }
        // lastchar: hand the saved code point to the loop below
        $val = array($last);
        $len = sizeof($val);
    }
    for ($i = 0; $i < $len; $i++) {
        $ch = $val[$i];
        $wch = array();
        $ustart = array();
        $uend = array();
        // "and" binds tighter than "or": (syllable range) or (compatibility jamo range)
        if ($ch >= 0xac00 and $ch <= 0xd7a3 or $ch >= 0x3130 and $ch <= 0x318f) {
            $wch = hangul_to_jamo(array($ch));
        } else {
            // not Hangul: copy through untouched
            $rule .= unicode_to_utf8(array($ch));
            continue;
        }
        $wlen = sizeof($wch);
        // closing bracket to append once an alternation group has been opened
        $ket = '';
        if ($wlen >= 3) {
            // syllable with a final consonant, e.g.
            // 종각 => 종(각|가[가-깋])
            $mrule = array();
            $mrule[] = unicode_to_utf8(array($ch));
            $save = $wch[2];
            unset($wch[2]);
            $tmp = jamo_to_syllable($wch);
            $mrule[] = unicode_to_utf8($tmp);
            // re-decompose the detached final so it can be handled below as a leading consonant
            // NOTE(review): exact return shapes of these helpers are not visible here - confirm
            $save = hangul_jongseong_to_cjamo($save);
            $wch = hangul_to_jamo($save);
            $wlen = sizeof($wch);
            $rule .= '(' . implode('|', $mrule);
            $ket = ')';
            if ($wlen > 1) {
                // the final decomposed into several jamo: no range is generated, just close the group
                $rule .= ')';
                continue;
            }
        }
        if ($wlen == 1) {
            // a single jamo: only leading consonants (0x1100-0x1112) get a syllable range
            if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                $wch[1] = 0x1161;
                $start = jamo_to_syllable($wch);
                $ustart = unicode_to_utf8($start);
                $wch[1] = 0x1175;
                $wch[2] = 0x11c2;
                $end = jamo_to_syllable($wch);
                $uend = unicode_to_utf8($end);
            } else {
                $rule .= unicode_to_utf8($wch) . $ket;
                continue;
            }
        } else {
            if ($wlen == 2) {
                // consonant + vowel: range runs from the bare syllable to the same syllable with final 0x11c2
                if ($wch[0] >= 0x1100 and $wch[0] <= 0x1112) {
                    $start = jamo_to_syllable($wch);
                    $ustart = unicode_to_utf8($start);
                    $wch[2] = 0x11c2;
                    $end = jamo_to_syllable($wch);
                    $uend = unicode_to_utf8($end);
                } else {
                    $rule .= unicode_to_utf8($wch);
                    continue;
                }
            }
        }
        // render the start..end range, then close any open alternation group
        $crule = hangul_regex_range($ustart, $uend, $use_unicode);
        $rule .= $crule . $ket;
    }
    return $rule;
}