/**
 * Decodes a UTF-8 encoded byte string into a list of Unicode code points.
 *
 * Malformed input is handled according to $strict. In non-strict mode the
 * offending octets are dropped and decoding carries on; in strict mode an
 * error is raised with trigger_error() and false is returned immediately.
 * Input is treated as malformed when a multi-octet sequence is cut short by
 * the end of the string, when an expected trail octet is missing, when the
 * decoded value is rejected by unicode_is_valid(), when the sequence length
 * announced by the lead octet differs from utf8_sequence_length() of the
 * decoded value, or when an octet is neither ASCII nor a sequence identifier.
 *
 * @param string $str    The UTF-8 encoded string
 * @param bool   $strict Raise an error and abort on the first malformed octet
 * @return array|false Code points in input order, or false on failure
 *                     (false is only returned in strict mode)
 */
function utf8_to_unicode($str, $strict = false)
{
    $codePoints = array();
    $length = strlen($str);
    $pos = 0;

    while ($pos < $length) {
        // Remember where this character starts; error messages refer to it.
        $start = $pos;
        $lead = ord($str[$pos]);
        $pos++;

        // Single-octet character: the octet value is the code point.
        if (utf8_is_ascii($lead)) {
            $codePoints[] = $lead;
            continue;
        }

        // Neither ASCII nor the start of a multi-octet sequence.
        if (!utf8_is_identifier($lead, $seqlen)) {
            if ($strict) {
                trigger_error(sprintf(
                    'utf8_to_unicode: Illegal sequence identifier in UTF-8 at octet: %d, value: 0x%X',
                    $start,
                    $lead
                ));

                return false;
            }
            continue;
        }

        $cp = utf8_decode_identifier($lead, $seqlen);

        // Index of the current trail octet counted from the end of the
        // sequence; the last trail octet has index 0.
        $shift = $seqlen - 2;

        // The last trail octet would sit at $pos + $shift; bail out early
        // when the string ends before that.
        if ($length <= $pos + $shift) {
            if ($strict) {
                trigger_error(sprintf(
                    'utf8_to_unicode: Incomplete multi-octet sequence in UTF-8 at octet: %d',
                    $start
                ));

                return false;
            }
            continue;
        }

        do {
            $trail = ord($str[$pos]);

            if (!utf8_is_trail($trail)) {
                if ($strict) {
                    trigger_error(sprintf(
                        'utf8_to_unicode: Unexpected value at octet: %d, value: 0x%X',
                        $pos,
                        $trail
                    ));

                    return false;
                }

                // $pos was not advanced, so the outer loop examines this
                // octet again as the possible start of a new character.
                continue 2;
            }

            $pos++;
            $cp |= utf8_decode_trail($trail, $shift);
        } while ($shift-- != 0);

        if (!unicode_is_valid($cp)) {
            if (!$strict) {
                continue;
            }

            if (unicode_is_surrogate($cp)) {
                $reason = 'Illegal surrogate in UTF-8';
            } else {
                $reason = 'Codepoint out of Unicode range';
            }
            trigger_error(sprintf(
                'utf8_to_unicode: %s at octet: %d, value: 0x%X',
                $reason,
                $start,
                $cp
            ));

            return false;
        }

        // The lead octet announced a different length than this code point
        // requires according to utf8_sequence_length().
        if (utf8_sequence_length($cp) != $seqlen) {
            if (!$strict) {
                continue;
            }

            trigger_error(sprintf(
                'utf8_to_unicode: Illegal sequence identifier in UTF-8 at octet: %d, value: 0x%X',
                $start,
                $cp
            ));

            return false;
        }

        $codePoints[] = $cp;
    }

    return $codePoints;
}
/**
 * Checks both outputs of utf8_is_identifier(): its return value and the
 * sequence length it reports through its by-reference second argument.
 *
 * @dataProvider providerUtf8IsIdentifier
 *
 * @param mixed $ord     Octet value handed to utf8_is_identifier()
 * @param mixed $rv      Expected return value
 * @param mixed $rseqlen Expected value of the by-reference sequence length
 */
public function testUtf8IsIdentifier($ord, $rv, $rseqlen)
{
    // Filled in by reference; initialised so the variable is defined even
    // when the function leaves it untouched.
    $seqlen = null;
    $actual = utf8_is_identifier($ord, $seqlen);

    // PHPUnit's signature is assertEquals($expected, $actual); keeping that
    // order makes failure messages label the two values correctly.
    $this->assertEquals($rv, $actual);
    $this->assertEquals($rseqlen, $seqlen);
}
/**
 * Returns the last well-formed UTF-8 character found at or before a byte
 * position, scanning the string backwards.
 *
 * Starting at $i, each octet is tested as the possible start of a character.
 * Octets that do not begin a complete, valid sequence are skipped. $i is
 * decremented for every octet examined, so when a character is returned $i
 * holds the offset just before that character's first octet. Passing the
 * same variable again therefore walks the string backwards one character at
 * a time.
 *
 * This function may return boolean false, but may also return the string
 * "0", which evaluates to false. Use the === operator for testing the return
 * value of this function.
 *
 * @param string   $str The UTF-8 encoded string
 * @param int|null $i   Byte offset to start scanning from; null means the
 *                      last octet of the string. Updated as described above
 * @param int|null $rcp If passed, this will be set to the Unicode code point
 *                      of the returned character; left untouched when false
 *                      is returned
 * @return string|false The character, or false once the start of the string
 *                      has been passed without finding one
 */
function utf8_get_last_char($str, &$i = null, &$rcp = null)
{
    if ($i === null) {
        $i = strlen($str) - 1;
    }

    // The explicit lower bound matters: since PHP 7.1 negative string
    // offsets count from the end, so isset($str[-1]) is true and the scan
    // would wrap around to the end of the string instead of stopping.
    while ($i >= 0 && isset($str[$i])) {
        $n = $i--;          // Save current position and move to the previous octet
        $char = $str[$n++]; // Take the candidate lead octet; $n now points past it
        $ord = ord($char);

        if (utf8_is_ascii($ord)) {
            $rcp = $ord;

            return $char;
        }

        if (utf8_is_identifier($ord, $seqlen)) {
            $cp = utf8_decode_identifier($ord, $seqlen);
            $offset = $seqlen - 2;

            // The last octet of the sequence would sit at $n + $offset; if it
            // does not exist the sequence is truncated, so keep scanning.
            if (!isset($str[$n + $offset])) {
                continue;
            }

            do {
                $ord = ord($str[$n]);
                if (!utf8_is_trail($ord)) {
                    continue 2; // Skip this sequence
                }
                $cp |= utf8_decode_trail($ord, $offset);
                $char .= $str[$n++];
            } while ($offset--);

            // Accept only values that pass unicode_is_valid() and whose
            // required length matches the length the lead octet announced.
            if (unicode_is_valid($cp) && utf8_sequence_length($cp) == $seqlen) {
                $rcp = $cp;

                return $char;
            }
        }
    }

    return false;
}