/**
 * Produce the normal form C (canonical composition) of a UTF-8 string.
 *
 * The work is delegated to the first available backend, checked in this
 * order:
 *  - NORMALIZE_INTL set: normalizer_normalize() with Normalizer::FORM_C;
 *  - NORMALIZE_ICU set: utf8_normalize() with UNORM_NFC;
 *  - otherwise the string is handed back untouched when
 *    UtfNormal::quickIsNFC() accepts it, and is run through
 *    UtfNormal::NFC() when it does not.
 *
 * Fast return for pure ASCII strings; some lesser optimizations for
 * strings containing only known-good characters.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
static function toNFC($string)
{
    if (NORMALIZE_INTL) {
        return normalizer_normalize($string, Normalizer::FORM_C);
    }

    if (NORMALIZE_ICU) {
        return utf8_normalize($string, UNORM_NFC);
    }

    // Pure-PHP path: skip the full normalization when the quick check passes.
    return UtfNormal::quickIsNFC($string)
        ? $string
        : UtfNormal::NFC($string);
}
/**
 * Feed three-byte sequences through UtfNormal::cleanUp() and verify the output.
 *
 * Every candidate string is built as
 * $head . <first byte> . <second byte> . <third byte> . $tail, with the first
 * byte running over 0xC0-0xFF and the second byte over 0x80-0xFF. Exactly one
 * assertion is made per candidate; the expected output is chosen from how the
 * three bytes classify (see the inline comments on each branch).
 *
 * @param $head String: bytes placed in front of the sequence under test.
 * @param $tail String: bytes placed after the sequence under test.
 */
function doTestTripleBytes($head, $tail)
{
    foreach (range(0xc0, 0xff) as $first) {
        foreach (range(0x80, 0xff) as $second) {
            // A full sweep of the third byte (0x80-0xFF) used to live here but
            // is disabled; only 0x80 is exercised -- presumably to keep the
            // run time down.
            foreach (range(0x80, 0x80) as $third) {
                $pair = chr($first) . chr($second);
                $triplet = $pair . chr($third);

                $char = $head . $triplet . $tail;
                $clean = UtfNormal::cleanUp($char);
                $x = sprintf("%02X,%02X,%02X", $first, $second, $third);

                // Bytes below 0xC0 (in this 0x80+ range) are continuation bytes.
                $secondIsTail = $second < 0xc0;
                $thirdIsTail = $third < 0xc0;

                if ($first >= 0xe0 && $first < 0xf0 && $secondIsTail && $thirdIsTail) {
                    // Structurally a 3-byte sequence: lead 0xE0-0xEF plus two tails.
                    if ($first == 0xe0 && $second < 0xa0) {
                        $expected = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Overlong triplet {$x} should be rejected";
                    } elseif ($first == 0xed && $triplet >= UTF8_SURROGATE_FIRST) {
                        $expected = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Surrogate triplet {$x} should be rejected";
                    } else {
                        $expected = UtfNormal::NFC($char);
                        $message = "Triplet {$x} should be intact";
                    }
                } elseif ($first > 0xc1 && $first < 0xe0 && $secondIsTail) {
                    // First two bytes form a 2-byte sequence; the third is a stray tail.
                    $expected = UtfNormal::NFC($head . $pair) . UTF8_REPLACEMENT . $tail;
                    $message = "Valid 2-byte {$x} + broken tail";
                } elseif ($second > 0xc1 && $second < 0xe0 && $thirdIsTail) {
                    // First byte is junk; the last two bytes form a 2-byte sequence.
                    $expected = $head . UTF8_REPLACEMENT . UtfNormal::NFC(chr($second) . chr($third) . $tail);
                    $message = "Broken head + valid 2-byte {$x}";
                } elseif (
                    ($first > 0xfd || $second > 0xfd)
                    && (
                        ($second > 0xbf && $third > 0xbf)
                        || ($secondIsTail && $thirdIsTail)
                        || $second > 0xfd
                        || $third > 0xfd
                    )
                ) {
                    // fe and ff are not legal head bytes -- expect three replacement chars
                    $expected = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet {$x} should be rejected";
                } elseif ($first > 0xc2 && $secondIsTail && $thirdIsTail) {
                    // Lead byte followed by two tails: expect a single replacement char.
                    $expected = $head . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet {$x} should be rejected";
                } else {
                    // Everything else: expect two replacement chars.
                    $expected = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden triplet {$x} should be rejected";
                }

                // Compare hex dumps so that failures print readable byte values.
                $this->assertEquals(bin2hex($expected), bin2hex($clean), $message);
            }
        }
    }
}