/**
 * Check that phutil_utf8v_codepoints() turns each sample string into the
 * expected list of code points. The samples cover the empty string, plain
 * ASCII, and multi-byte characters (including one above U+FFFF and one
 * combining mark).
 */
public function testUTF8vCodepoints()
 {
     // Maps each input string to the code point list it should produce.
     $cases = array(
         '' => array(),
         'x' => array(0x78),
         'quack' => array(0x71, 0x75, 0x61, 0x63, 0x6b),
         "x東y" => array(0x78, 0x6771, 0x79),
         "»" => array(0xbb),
         "☃" => array(0x2603),
         "￿" => array(0xffff),
         "💩" => array(0x1f4a9),
         "x͠y" => array(0x78, 0x360, 0x79),
     );

     foreach ($cases as $input => $codepoints) {
         $actual = phutil_utf8v_codepoints($input);
         $this->assertEqual($codepoints, $actual, 'Codepoint Vector of ' . $input);
     }
 }
Beispiel #2
0
/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters, or zero-width combining characters.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * NOTE: Only code points in the Combining Diacritical Marks (U+0300-U+036F),
 * Combining Diacritical Marks Supplement (U+1DC0-U+1DFF), Combining Diacritical
 * Marks for Symbols (U+20D0-U+20FF) and Combining Half Marks (U+FE20-U+FE2F)
 * blocks are treated as zero-width. Other zero-width or control characters
 * are still counted as one column.
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: This function is VERY slow.
 *
 * @param   string  A valid UTF-8 string.
 * @return  int     The console display length of the string.
 * @group   utf8
 */
function phutil_utf8_console_strlen($string)
{
    $string_v = phutil_utf8v_codepoints($string);
    $len = 0;
    foreach ($string_v as $c) {
        // NUL does not occupy a column.
        if ($c == 0) {
            continue;
        }

        // Combining marks are rendered on top of the preceding character, so
        // they must not advance the column count.
        $is_combining =
            ($c >= 0x300 && $c <= 0x36f) ||
            ($c >= 0x1dc0 && $c <= 0x1dff) ||
            ($c >= 0x20d0 && $c <= 0x20ff) ||
            ($c >= 0xfe20 && $c <= 0xfe2f);
        if ($is_combining) {
            continue;
        }

        // Double-width ranges, taken from wcwidth.c. Nothing below U+1100 is
        // wide, so that comparison short-circuits the common case.
        $is_wide = $c >= 0x1100 && (
            $c <= 0x115f ||
            $c == 0x2329 ||
            $c == 0x232a ||
            ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) ||
            ($c >= 0xac00 && $c <= 0xd7a3) ||
            ($c >= 0xf900 && $c <= 0xfaff) ||
            ($c >= 0xfe10 && $c <= 0xfe19) ||
            ($c >= 0xfe30 && $c <= 0xfe6f) ||
            ($c >= 0xff00 && $c <= 0xff60) ||
            ($c >= 0xffe0 && $c <= 0xffe6) ||
            ($c >= 0x20000 && $c <= 0x2fffd) ||
            ($c >= 0x30000 && $c <= 0x3fffd)
        );

        $len += $is_wide ? 2 : 1;
    }
    return $len;
}
Beispiel #3
0
/**
 * Report whether a unicode character is a combining character.
 *
 * The input is decoded with phutil_utf8v_codepoints() and every resulting
 * code point is checked against the combining mark blocks listed in the
 * function body. The result is true as soon as one code point falls inside
 * any of those blocks.
 *
 * @param   string              A single unicode character.
 * @return  boolean             True if a decoded code point lies in a
 *                              combining block, false otherwise.
 */
function phutil_utf8_is_combining_character($character)
{
    // Inclusive (first, last) code point bounds of each combining block.
    $combining_blocks = array(
        // Combining Diacritical Marks (0300 - 036F).
        array(0x300, 0x36f),
        // Combining Diacritical Marks Supplement (1DC0 - 1DFF).
        array(0x1dc0, 0x1dff),
        // Combining Diacritical Marks for Symbols (20D0 - 20FF).
        array(0x20d0, 0x20ff),
        // Combining Half Marks (FE20 - FE2F).
        array(0xfe20, 0xfe2f),
    );

    foreach (phutil_utf8v_codepoints($character) as $cp) {
        foreach ($combining_blocks as $block) {
            list($first, $last) = $block;
            if ($cp >= $first && $cp <= $last) {
                return true;
            }
        }
    }

    return false;
}