Exemple #1
0
/**
 * Decodes a UTF-8 byte string into a list of Unicode code points.
 *
 * By default malformed input is skipped silently: octets that are neither
 * ASCII nor a sequence identifier, sequences cut off by the end of the
 * string, octets that fail utf8_is_trail() inside a sequence, code points
 * rejected by unicode_is_valid(), and code points whose
 * utf8_sequence_length() differs from the length announced by the
 * identifier octet. In strict mode the first such problem is reported
 * through trigger_error() and the function returns false.
 *
 * @param  string      $str    The UTF-8 encoded string
 * @param  bool        $strict Report the first malformed sequence and give up
 *                             instead of skipping it
 * @return array|false List of code points; false only in strict mode, after
 *                     an error has been triggered
 */
function utf8_to_unicode($str, $strict = false)
{
    $codePoints = array();
    $total = strlen($str);
    $pos = 0;

    while ($pos < $total) {
        // Remember where this character starts, then step past its first octet.
        $start = $pos;
        $octet = ord($str[$pos]);
        $pos++;

        if (utf8_is_ascii($octet)) {
            $codePoints[] = $octet;
            continue;
        }

        if (!utf8_is_identifier($octet, $length)) {
            // Not a valid first octet: fatal in strict mode, otherwise dropped.
            if ($strict) {
                trigger_error(sprintf('utf8_to_unicode: Illegal sequence ' . 'identifier in UTF-8 at octet: %d, value: 0x%X', $start, $octet));
                return false;
            }
            continue;
        }

        $codePoint = utf8_decode_identifier($octet, $length);
        $shift = $length - 2;

        // $pos + $shift is the index of the final octet of the sequence;
        // make sure it lies inside the string before reading anything.
        if ($pos + $shift >= $total) {
            if (!$strict) {
                continue;
            }
            trigger_error(sprintf('utf8_to_unicode: Incomplete ' . 'multi-octet sequence in UTF-8 at octet: %d', $start));
            return false;
        }

        // Merge the remaining octets; $shift counts down to 0 for the last one.
        do {
            $octet = ord($str[$pos]);
            if (!utf8_is_trail($octet)) {
                if (!$strict) {
                    // $pos is left on this octet so the outer loop examines
                    // it again as the possible start of a new character.
                    continue 2;
                }
                trigger_error(sprintf('utf8_to_unicode: Unexpected ' . 'value at octet: %d, value: 0x%X', $pos, $octet));
                return false;
            }
            $codePoint |= utf8_decode_trail($octet, $shift);
            $pos++;
        } while ($shift--);

        if (!unicode_is_valid($codePoint)) {
            if (!$strict) {
                continue;
            }
            if (unicode_is_surrogate($codePoint)) {
                $reason = 'Illegal surrogate in UTF-8';
            } else {
                $reason = 'Codepoint out of Unicode range';
            }
            trigger_error(sprintf('utf8_to_unicode: %s at octet: %d, ' . 'value: 0x%X', $reason, $start, $codePoint));
            return false;
        }

        // The decoded value must need exactly as many octets as were used.
        if (utf8_sequence_length($codePoint) != $length) {
            if (!$strict) {
                continue;
            }
            trigger_error(sprintf('utf8_to_unicode: Illegal sequence ' . 'identifier in UTF-8 at octet: %d, value: 0x%X', $start, $codePoint));
            return false;
        }

        $codePoints[] = $codePoint;
    }

    return $codePoints;
}
Exemple #2
0
 /**
  * Checks the return value of utf8_is_identifier() and the sequence length
  * it reports through its second argument.
  *
  * @dataProvider providerUtf8IsIdentifier
  *
  * @param mixed $ord     Value passed to utf8_is_identifier() as the octet
  * @param mixed $rv      Expected return value of utf8_is_identifier()
  * @param mixed $rseqlen Expected value of $seqlen after the call
  */
 public function testUtf8IsIdentifier($ord, $rv, $rseqlen)
 {
     // assertEquals() takes ($expected, $actual); keeping that order makes a
     // failure message name the right value as "expected".
     $this->assertEquals($rv, utf8_is_identifier($ord, $seqlen));
     $this->assertEquals($rseqlen, $seqlen);
 }
Exemple #3
0
/**
 * Returns the last well-formed UTF-8 character that starts at or before
 * position $i, scanning the string backwards one octet at a time.
 *
 * $i is updated by reference: after a successful call it holds the position
 * just before the first octet of the returned character, so calling the
 * function repeatedly with the same $i walks the string from end to start.
 * Octets that do not start a well-formed character are skipped.
 *
 * This function may return boolean false, but may also return the string "0",
 * which evaluates to false. Use the === operator for testing the return value
 * of this function.
 *
 * @param  string       $str The UTF-8 encoded string
 * @param  int|null     $i   The position to start scanning from; null means
 *                           the last octet of the string
 * @param  int|null     $rcp If passed, this will be set to the Unicode code
 *                           point of the returned character
 * @return string|false The character, or false once the start of the string
 *                      has been passed
 */
function utf8_get_last_char($str, &$i = null, &$rcp = null)
{
    if ($i === null) {
        $i = strlen($str) - 1;
    }
    // The lower bound must be explicit: since PHP 7.1 negative string offsets
    // count from the end, so isset($str[-1]) is true for any non-empty string
    // and the scan would wrap around and return the same characters again.
    while ($i >= 0 && isset($str[$i])) {
        // $n is the octet examined in this round; $i already points to the
        // previous octet for the next round (or for the next call).
        $n = $i--;
        // Read the octet and move $n on to the octet that follows it.
        $char = $str[$n++];
        $ord = ord($char);
        if (utf8_is_ascii($ord)) {
            $rcp = $ord;
            return $char;
        }
        if (utf8_is_identifier($ord, $seqlen)) {
            $cp = utf8_decode_identifier($ord, $seqlen);
            $offset = $seqlen - 2;
            // $n + $offset is the index of the last octet of the sequence;
            // if it does not exist the sequence is truncated.
            if (!isset($str[$n + $offset])) {
                continue;
            }
            do {
                $ord = ord($str[$n]);
                if (!utf8_is_trail($ord)) {
                    // Malformed sequence: give up on it and keep scanning
                    // backwards from $i.
                    continue 2;
                }
                $cp |= utf8_decode_trail($ord, $offset);
                $char .= $str[$n++];
            } while ($offset--);
            if (unicode_is_valid($cp) && utf8_sequence_length($cp) == $seqlen) {
                $rcp = $cp;
                return $char;
            }
        }
    }
    return false;
}