/**
 * Check that phutil_utf8v_codepoints() turns each fixture string into the
 * expected list of integer codepoints.
 *
 * Every multi-byte fixture is written as explicit UTF-8 byte escapes rather
 * than as a literal character. Array keys must stay distinct: if a key such
 * as the U+FFFF noncharacter is stripped or mangled down to an empty string,
 * it collides with the '' fixture and PHP silently keeps only the last
 * value for that key, so one case stops being tested and the other gets the
 * wrong expectation.
 */
public function testUTF8vCodepoints() {
  $strings = array(
    // Empty and plain ASCII input.
    ''                  => array(),
    'x'                 => array(0x78),
    'quack'             => array(0x71, 0x75, 0x61, 0x63, 0x6b),

    // U+6771 (three bytes) between two ASCII letters.
    "x\xE6\x9D\xB1y"    => array(0x78, 0x6771, 0x79),

    // U+00BB (two bytes).
    "\xC2\xBB"          => array(0xbb),

    // U+2603 (three bytes).
    "\xE2\x98\x83"      => array(0x2603),

    // U+FFFF (three bytes, noncharacter).
    "\xEF\xBF\xBF"      => array(0xffff),

    // U+1F4A9 (four bytes, outside the Basic Multilingual Plane).
    "\xF0\x9F\x92\xA9"  => array(0x1f4a9),

    // U+0360 is a combining character placed between two ASCII letters.
    "x\xCD\xA0y"        => array(0x78, 0x360, 0x79),
  );

  foreach ($strings as $str => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8v_codepoints($str),
      'Codepoint Vector of ' . $str);
  }
}
/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters, or zero-width combining characters.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * NOTE: Combining characters are treated as zero-width only when they fall in
 * one of these blocks: Combining Diacritical Marks (0300 - 036F), Combining
 * Diacritical Marks Supplement (1DC0 - 1DFF), Combining Diacritical Marks for
 * Symbols (20D0 - 20FF) and Combining Half Marks (FE20 - FE2F). Any other
 * zero-width character is still counted as one column.
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: Strings made only of bytes 0x01 - 0x7F are measured directly. Any
 * other string is decoded codepoint by codepoint, which is slow.
 *
 * @param string A valid UTF-8 string.
 * @return int The console display length of the string.
 * @group utf8
 */
function phutil_utf8_console_strlen($string) {
  // Fast path: each byte in 0x01 - 0x7F is one single-width character, so the
  // byte length is the display length. NUL is left out of the class because
  // the loop below counts it as zero-width. Non-string input, or a failed
  // match, simply falls through to the general path.
  if (is_string($string) && preg_match('/^[\x01-\x7F]*\z/', $string)) {
    return strlen($string);
  }

  $string_v = phutil_utf8v_codepoints($string);

  $len = 0;
  foreach ($string_v as $c) {
    // NUL occupies no column.
    if ($c == 0) {
      continue;
    }

    // Combining marks are drawn on top of the preceding character and do not
    // advance the cursor.
    $is_combining =
      ($c >= 0x0300 && $c <= 0x036f) ||
      ($c >= 0x1dc0 && $c <= 0x1dff) ||
      ($c >= 0x20d0 && $c <= 0x20ff) ||
      ($c >= 0xfe20 && $c <= 0xfe2f);
    if ($is_combining) {
      continue;
    }

    // Double-width ranges, following the wide-character test in wcwidth.c.
    $is_wide =
      ($c >= 0x1100 &&
        ($c <= 0x115f ||
          $c == 0x2329 ||
          $c == 0x232a ||
          ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) ||
          ($c >= 0xac00 && $c <= 0xd7a3) ||
          ($c >= 0xf900 && $c <= 0xfaff) ||
          ($c >= 0xfe10 && $c <= 0xfe19) ||
          ($c >= 0xfe30 && $c <= 0xfe6f) ||
          ($c >= 0xff00 && $c <= 0xff60) ||
          ($c >= 0xffe0 && $c <= 0xffe6) ||
          ($c >= 0x20000 && $c <= 0x2fffd) ||
          ($c >= 0x30000 && $c <= 0x3fffd)));

    $len += $is_wide ? 2 : 1;
  }

  return $len;
}
/**
 * Test whether a character is a Unicode combining character.
 *
 * The input is decoded with phutil_utf8v_codepoints() and every resulting
 * codepoint is checked against a fixed set of combining-mark blocks. The
 * result is true as soon as any codepoint falls inside one of them.
 *
 * @param string A single unicode character.
 * @return boolean True if a combining codepoint was found, false otherwise.
 */
function phutil_utf8_is_combining_character($character) {
  // Inclusive (first, last) codepoint bounds of each recognized block.
  $combining_blocks = array(
    array(0x300, 0x36f),    // Combining Diacritical Marks.
    array(0x1dc0, 0x1dff),  // Combining Diacritical Marks Supplement.
    array(0x20d0, 0x20ff),  // Combining Diacritical Marks for Symbols.
    array(0xfe20, 0xfe2f),  // Combining Half Marks.
  );

  foreach (phutil_utf8v_codepoints($character) as $point) {
    foreach ($combining_blocks as $block) {
      if ($point >= $block[0] && $point <= $block[1]) {
        return true;
      }
    }
  }

  return false;
}