예제 #1
0
/**
 * Build a UTF-8 string from Unicode code points written as
 * space-separated hexadecimal numbers (for example "0041 00E9").
 * Every number is decoded with hexdec(), encoded with codepointToUtf8(),
 * and the encoded characters are joined in input order.
 * Used by UTF-8 data generation and testing routines.
 *
 * @param $sequence String Space-separated hexadecimal code points
 * @return String The concatenated UTF-8 characters
 * @private
 */
function hexSequenceToUtf8($sequence)
{
    $encoded = array();
    foreach (explode(' ', $sequence) as $hexCodepoint) {
        $encoded[] = codepointToUtf8(hexdec($hexCodepoint));
    }
    return implode('', $encoded);
}
 /**
  * Build the radical part of a sort key for one code point.
  *
  * Looks the code point up in $data->radicalStrokeCounts. The entry's first
  * element is a key into $data->radicals and its second element is a number
  * (presumably the residual stroke count -- confirm against the data loader).
  * The result is the radical's UTF-8 character followed by that number,
  * zero-padded to three digits.
  *
  * @param $data Object exposing the radicalStrokeCounts and radicals arrays
  * @param $chcp Code point used as the lookup key
  * @return String The sort key, or '' when the code point has no entry
  */
 function onCategoryMultisortSortkeys_getRadical($data, $chcp)
 {
     if (!array_key_exists($chcp, $data->radicalStrokeCounts)) {
         return '';
     }
     $entry = $data->radicalStrokeCounts[$chcp];
     $radicalId = $entry[0];
     $rest = $entry[1];
     $radicalChar = codepointToUtf8($data->radicals[$radicalId]);
     return sprintf('%s%03d', $radicalChar, $rest);
 }
 /**
  * Run every code point from 0 up to (but not including) UNICODE_MAX
  * through UtfNormal::cleanUp() and check the outcome:
  *  - code points outside the accepted ranges must become UTF8_REPLACEMENT;
  *  - accepted code points listed in the canonical composition or
  *    decomposition tables must equal their NFC form;
  *  - all other accepted code points must come back unchanged.
  *
  * This test is *very* expensive!
  * @todo document
  */
 function XtestAllChars()
 {
     $replacement = UTF8_REPLACEMENT;
     for ($cp = 0x0; $cp < UNICODE_MAX; $cp++) {
         $char = codepointToUtf8($cp);
         $clean = UtfNormal::cleanUp($char);
         $x = sprintf("%04X", $cp);
         // Progress marker once per 0x1000 code points.
         if ($cp % 0x1000 == 0) {
             echo "U+{$x}\n";
         }
         // Tab, LF, CR, and everything above the C0 controls except the
         // surrogate block and U+FFFE / U+FFFF.
         $isAccepted = $cp == 0x9
             || $cp == 0xa
             || $cp == 0xd
             || ($cp > 0x1f && $cp < UNICODE_SURROGATE_FIRST)
             || ($cp > UNICODE_SURROGATE_LAST && $cp < 0xfffe)
             || ($cp > 0xffff && $cp <= UNICODE_MAX);
         if (!$isAccepted) {
             $this->assertEquals(bin2hex($replacement), bin2hex($clean), $x);
             continue;
         }
         $hasCanonicalMapping = isset(UtfNormal::$utfCanonicalComp[$char])
             || isset(UtfNormal::$utfCanonicalDecomp[$char]);
         if ($hasCanonicalMapping) {
             $comp = UtfNormal::NFC($char);
             $this->assertEquals(bin2hex($comp), bin2hex($clean), "U+{$x} should be decomposed");
         } else {
             $this->assertEquals(bin2hex($char), bin2hex($clean), "U+{$x} should be intact");
         }
     }
 }
예제 #4
0
        print "{$total} ";
    }
}
fclose($in);
$ok = reportResults($total, $success, $failure) && $ok;
$in = fopen("UnicodeData.txt", "rt");
if (!$in) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}
print "Now testing invariants...\n";
while (false !== ($line = fgets($in))) {
    $cols = explode(';', $line);
    $char = codepointToUtf8(hexdec($cols[0]));
    $desc = $cols[0] . ": " . $cols[1];
    if ($char < " " || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST) {
        # Can't check NULL with the ICU plugin, as null bytes fail in C land.
        # Skip other control characters, as we strip them for XML safety.
        # Surrogates are illegal on their own or in UTF-8, ignore.
        continue;
    }
    if (empty($testedChars[$char])) {
        $total++;
        if (testInvariant($normalizer, $char, $desc)) {
            $success++;
        } else {
            $failure++;
        }
        if ($total % 100 == 0) {
예제 #5
0
 /**
  * Generate the list of "first letter" header characters and write it,
  * serialized, to {$IP}/serialized/first-letters-root.ser.
  *
  * Steps:
  *  1. Read {$this->dataDir}/allkeys.txt and replace the entries already
  *     present in $this->weights with the primary weight string found
  *     there, remembering which code points have a tertiary weight of
  *     exactly ".0008" or ".000E".
  *  2. Group code points by primary weight into $this->groups.
  *  3. Drop groups whose weight starts with another group's complete weight.
  *  4. For each remaining group pick a header character using an ICU
  *     Collator and count how many headers are out of primary order.
  *
  * Exits with status 1 when either file cannot be opened.
  */
 function generateFirstChars()
 {
     $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
     if (!$file) {
         $this->error("Unable to open allkeys.txt");
         exit(1);
     }
     global $IP;
     $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
     if (!$outFile) {
         $this->error("Unable to open output file first-letters-root.ser");
         exit(1);
     }
     $goodTertiaryChars = array();
     // For each character with an entry in allkeys.txt, overwrite the implicit
     // entry in $this->weights that came from the UCD.
     // Also gather a list of tertiary weights, for use in selecting the group header
     while (false !== ($line = fgets($file))) {
         // We're only interested in single-character weights, pick them out with a regex
         $line = trim($line);
         if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
             continue;
         }
         $cp = hexdec($m[1]);
         $allWeights = trim($m[2]);
         $primary = '';
         $tertiary = '';
         if (!isset($this->weights[$cp])) {
             // Non-printable, ignore
             continue;
         }
         foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
             // $m is reused here; $cp and $allWeights were already extracted.
             preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $m);
             if (!empty($m[1])) {
                 if ($m[1][0] !== '0000') {
                     $primary .= '.' . $m[1][0];
                 }
                 // Only read the tertiary level when the element actually has
                 // one, so a short element cannot trigger an undefined-offset.
                 if (isset($m[1][2]) && $m[1][2] !== '0000') {
                     $tertiary .= '.' . $m[1][2];
                 }
             }
         }
         $this->weights[$cp] = $primary;
         if ($tertiary === '.0008' || $tertiary === '.000E') {
             $goodTertiaryChars[$cp] = true;
         }
     }
     fclose($file);
     // Identify groups of characters with the same primary weight
     $this->groups = array();
     asort($this->weights, SORT_STRING);
     $prevWeight = reset($this->weights);
     $group = array();
     foreach ($this->weights as $cp => $weight) {
         if ($weight !== $prevWeight) {
             $this->groups[$prevWeight] = $group;
             $prevWeight = $weight;
             if (isset($this->groups[$weight])) {
                 $group = $this->groups[$weight];
             } else {
                 $group = array();
             }
         }
         $group[] = $cp;
     }
     // Flush the last group, which the loop above never stores.
     if ($group) {
         $this->groups[$prevWeight] = $group;
     }
     // If one character has a given primary weight sequence, and a second
     // character has a longer primary weight sequence with an initial
     // portion equal to the first character, then remove the second
     // character. This avoids having characters like U+A732 (double A)
     // polluting the basic latin sort area.
     foreach ($this->groups as $weight => $group) {
         if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
             if (isset($this->groups[$m[1]])) {
                 unset($this->groups[$weight]);
             }
         }
     }
     ksort($this->groups, SORT_STRING);
     // Identify the header character in each group
     $headerChars = array();
     $prevChar = "";
     $tertiaryCollator = new Collator('root');
     $primaryCollator = new Collator('root');
     $primaryCollator->setStrength(Collator::PRIMARY);
     $numOutOfOrder = 0;
     foreach ($this->groups as $weight => $group) {
         $uncomposedChars = array();
         $goodChars = array();
         foreach ($group as $cp) {
             if (isset($goodTertiaryChars[$cp])) {
                 $goodChars[] = $cp;
             }
             if (!isset($this->mappedChars[$cp])) {
                 $uncomposedChars[] = $cp;
             }
         }
         // Preference order: good-tertiary and unmapped, then any unmapped
         // character, then anything in the group.
         $x = array_intersect($goodChars, $uncomposedChars);
         if (!$x) {
             $x = $uncomposedChars;
             if (!$x) {
                 $x = $group;
             }
         }
         // Use ICU to pick the lowest sorting character in the selection
         $tertiaryCollator->sort($x);
         $cp = $x[0];
         $char = codepointToUtf8($cp);
         $headerChars[] = $char;
         if ($primaryCollator->compare($char, $prevChar) <= 0) {
             $numOutOfOrder++;
             /*
             				printf( "Out of order: U+%05X > U+%05X\n",
             					utf8ToCodepoint( $prevChar ),
             					utf8ToCodepoint( $char ) );
             */
         }
         $prevChar = $char;
         if ($this->debugOutFile) {
             fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('codepointToUtf8', $group))));
         }
     }
     print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";
     fwrite($outFile, serialize($headerChars));
     // Release the output handle explicitly instead of relying on script exit.
     fclose($outFile);
 }
예제 #6
0
 /**
  * Build a byte => UTF-8 translation table for Windows Code Page 1252.
  *
  * Bytes 0x80-0x9F are mapped through the table below; every other byte
  * value maps to the code point with the same number. This allows text
  * that was miscoded as Windows-1252 by naughty user-agents to be
  * converted without relying on an outside iconv library.
  *
  * @return array Map of single-byte string => UTF-8 string, 256 entries
  * @access private
  */
 function prepareWindows1252()
 {
     # Mappings from:
     # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
     static $cp1252 = array(
         0x80 => 0x20ac, 0x81 => 0xfffd, 0x82 => 0x201a, 0x83 => 0x192,
         0x84 => 0x201e, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
         0x88 => 0x2c6, 0x89 => 0x2030, 0x8a => 0x160, 0x8b => 0x2039,
         0x8c => 0x152, 0x8d => 0xfffd, 0x8e => 0x17d, 0x8f => 0xfffd,
         0x90 => 0xfffd, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201c,
         0x94 => 0x201d, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
         0x98 => 0x2dc, 0x99 => 0x2122, 0x9a => 0x161, 0x9b => 0x203a,
         0x9c => 0x153, 0x9d => 0xfffd, 0x9e => 0x17e, 0x9f => 0x178
     );
     $pairs = array();
     foreach (range(0, 0xff) as $byte) {
         $codepoint = $byte;
         if (isset($cp1252[$byte])) {
             $codepoint = $cp1252[$byte];
         }
         $pairs[chr($byte)] = codepointToUtf8($codepoint);
     }
     return $pairs;
 }
}
// Simple 1:1 case mappings, keyed by the source character in UTF-8.
$wikiUpperChars = array();
$wikiLowerChars = array();
print "Reading character definitions...\n";
// Each line read from $in is split on ';'. Field 0 is the code point in hex,
// field 1 the name, fields 12 and 13 the simple upper and lower case mappings.
while (false !== ($line = fgets($in))) {
    $columns = explode(';', $line);
    $codepoint = $columns[0];
    $name = $columns[1];
    $simpleUpper = $columns[12];
    $simpleLower = $columns[13];
    $source = codepointToUtf8(hexdec($codepoint));
    if ($simpleUpper) {
        $wikiUpperChars[$source] = codepointToUtf8(hexdec($simpleUpper));
    }
    if ($simpleLower) {
        $wikiLowerChars[$source] = codepointToUtf8(hexdec($simpleLower));
    }
}
fclose($in);
$out = fopen("Utf8Case.php", "wt");
if (!$out) {
    // Nothing can be written; report and stop with a failure status.
    print "Can't create file Utf8Case.php\n";
    exit(-1);
}
$outUpperChars = escapeArray($wikiUpperChars);
$outLowerChars = escapeArray($wikiLowerChars);
// The generated file is a PHP source file that defines both arrays.
$outdata = "<" . "?php\n/**\n * Simple 1:1 upper/lowercase switching arrays for utf-8 text.\n * Won't get context-sensitive things yet.\n *\n * Hack for bugs in ucfirst() and company\n *\n * These are pulled from memcached if possible, as this is faster than filling\n * up a big array manually.\n *\n * @file\n * @ingroup Language\n */\n\n/**\n * Translation array to get upper case character\n */\n\$wikiUpperChars = {$outUpperChars};\n\n/**\n * Translation array to get lower case character\n */\n\$wikiLowerChars = {$outLowerChars};\n";
fwrite($out, $outdata);
fclose($out);
print "Wrote out Utf8Case.php\n";
예제 #8
0
 /**
  * Callback that replaces a matched Unicode notation with its UTF-8
  * character. The first two characters of the full match are skipped and
  * the remainder is read as a hexadecimal code point.
  * @param	array	$matches preg matches; only element 0 is used
  * @return	string	replacement string
  */
 private function getItemParsedCallback($matches)
 {
     require_once "include/Unicode/UtfNormalUtil.php";
     $hexDigits = substr($matches[0], 2);
     $codepoint = hexdec($hexDigits);
     return codepointToUtf8($codepoint);
 }
    /**
     * Read equivset.in from this script's directory and generate three files
     * next to it:
     *  - equivset.txt: one set of equivalent characters per line;
     *  - equivset.php: a PHP array mapping each character to its set name;
     *  - equivset.ser: the same map keyed and valued by code point, serialized.
     *
     * Each non-comment input line has the form
     *   HEXLEFT CHARLEFT => [HEXRIGHT] CHARRIGHT [# comment]
     * A line whose last character before the newline is a null character
     * maps the left character to the empty string. Lines that fail to parse,
     * or whose hex number does not match the character, are reported and
     * skipped, and the final status passed to $this->error() becomes 1.
     */
    public function execute()
    {
        $dir = __DIR__;
        $endl = "\n";
        $lines = file("{$dir}/equivset.in");
        if (!$lines) {
            $this->error("Unable to open equivset.in\n", 1);
        }
        $setsFile = fopen("{$dir}/equivset.txt", 'w');
        if (!$setsFile) {
            $this->error("Unable to open equivset.txt for writing\n", 1);
        }
        fwrite($setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
);
        $outputFile = fopen("{$dir}/equivset.php", 'w');
        if (!$outputFile) {
            $this->error("Unable to open equivset.php for writing\n", 1);
        }
        fwrite($outputFile, "<?" . "php{$endl}" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
);
        $serializedFile = fopen("{$dir}/equivset.ser", 'w');
        if (!$serializedFile) {
            $this->error("Unable to open equivset.ser for writing\n", 1);
        }
        # \s matches \xa0 in non-unicode mode, which is not what we want
        # So we need to make our own whitespace class
        $sp = '[\\ \\t]';
        $lineNum = 0;
        $setsByChar = array();
        $sets = array();
        $exitStatus = 0;
        foreach ($lines as $line) {
            ++$lineNum;
            # Whether the line ends with a null character (immediately before
            # the newline). This must be tested before trim(), which strips
            # null bytes. The needle has to be the NUL byte itself: an empty
            # needle never identifies the intended position.
            $mapToEmpty = strpos($line, "\0") === strlen($line) - 2;
            $line = trim($line);
            # Filter comments
            if (!$line || $line[0] == '#') {
                continue;
            }
            # Process line
            if (!preg_match("/^(?P<hexleft> [A-F0-9]+) {$sp}+ (?P<charleft> .+?) {$sp}+ => {$sp}+ (?:(?P<hexright> [A-F0-9]+) {$sp}+|) (?P<charright> .+?) {$sp}* (?: \\#.*|) \$ /x", $line, $m)) {
                $this->output("Error: invalid entry at line {$lineNum}: {$line}\n");
                $exitStatus = 1;
                continue;
            }
            $error = false;
            if ($mapToEmpty) {
                $m['charright'] = '';
            } else {
                # The hex number on each side must encode the character next to it.
                if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
                    $actual = utf8ToCodepoint($m['charleft']);
                    if ($actual === false) {
                        $this->output("Bytes: " . strlen($m['charleft']) . "\n");
                        $this->output(bin2hex($line) . "\n");
                        $hexForm = bin2hex($m['charleft']);
                        $this->output("Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                    } else {
                        $this->output("Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                    }
                    $error = true;
                }
                if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
                    $actual = utf8ToCodepoint($m['charright']);
                    if ($actual === false) {
                        $hexForm = bin2hex($m['charright']);
                        # Report the right-hand character: it is the one whose
                        # bytes are shown in $hexForm.
                        $this->output("Invalid UTF-8 character \"{$m['charright']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                    } else {
                        $this->output("Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                    }
                    $error = true;
                }
                if ($error) {
                    $exitStatus = 1;
                    continue;
                }
            }
            # Find the set for the right character, add a new one if necessary
            if (isset($setsByChar[$m['charright']])) {
                $setName = $setsByChar[$m['charright']];
            } else {
                # New set
                $setName = $m['charright'];
                $sets[$setName] = array($m['charright']);
                $setsByChar[$setName] = $setName;
            }
            # Add the left character to the set
            $sets[$setName][] = $m['charleft'];
            $setsByChar[$m['charleft']] = $setName;
        }
        # Sets output
        foreach ($sets as $members) {
            fwrite($setsFile, implode(' ', $members) . $endl);
        }
        # Map output
        $output = var_export($setsByChar, true);
        $output = str_replace("\n", $endl, $output);
        fwrite($outputFile, '$equivset = ' . "{$output};{$endl}");
        # Serialized codepoint map
        $codepointMap = array();
        foreach ($setsByChar as $char => $setName) {
            $key = $char === '' ? '' : utf8ToCodepoint($char);
            $value = $setName === '' ? '' : utf8ToCodepoint($setName);
            $codepointMap[$key] = $value;
        }
        fwrite($serializedFile, serialize($codepointMap));
        fclose($setsFile);
        fclose($outputFile);
        fclose($serializedFile);
        $text = 'Finished';
        if ($exitStatus > 0) {
            $text .= ' with errors';
        }
        $this->error($text, $exitStatus);
    }
 /**
  * Build the error result for a disallowed character.
  *
  * @param $msgId Message key for the overall error text
  * @param $point Code point of the offending character
  * @return array Two elements: the string "ERROR" and the rendered message
  */
 private static function badCharErr($msgId, $point)
 {
     $code = sprintf('U+%04X', $point);
     $symbol = codepointToUtf8($point);
     // Combining marks are combined with the previous character. If abusing character is a
     // combining mark, prepend it with space to show them correctly.
     if (self::getScriptCode($point) == "SCRIPT_COMBINING_MARKS") {
         $symbol = ' ' . $symbol;
     }
     // A symbol consisting of a single character in Unicode category C is
     // described by its code only; anything else is shown alongside the code.
     $isNonPrintable = preg_match('/\\A\\p{C}\\z/u', $symbol);
     $char = $isNonPrintable
         ? wfMessage('antispoof-bad-char-non-printable', $code)->text()
         : wfMessage('antispoof-bad-char', $symbol, $code)->text();
     return array("ERROR", wfMessage($msgId, $char)->text());
 }
/**
 * Convert a string escaped with Javascript's escape() function back into a
 * string in the specified charset (default is UTF-8).
 * Modified function from http://pure-essence.net/stuff/code/utf8RawUrlDecode.phps
 *
 * Two escape forms are decoded:
 *  - "%XX" (two hex digits) becomes the single byte with that value;
 *  - "%uXXXX" (four hex digits) becomes the UTF-8 encoding of that code
 *    unit. A high surrogate immediately followed by a "%uXXXX" low surrogate
 *    is combined into one supplementary code point, because escape() emits
 *    UTF-16 code units.
 * A '%' that does not start a well-formed escape (truncated or with non-hex
 * digits) is copied to the output unchanged rather than being decoded to a
 * NUL byte.
 *
 * @param $source String escaped with Javascript's escape() function
 * @param $iconv_to String destination character set will be used as second parameter
 * in the iconv function. Default is UTF-8.
 * @return string
 */
function js_unescape($source, $iconv_to = 'UTF-8')
{
    $decodedStr = '';
    $pos = 0;
    $len = strlen($source);
    while ($pos < $len) {
        $charAt = $source[$pos];
        if ($charAt !== '%') {
            $decodedStr .= $charAt;
            $pos++;
            continue;
        }
        if ($pos + 1 < $len && $source[$pos + 1] === 'u') {
            // Candidate "%uXXXX" escape.
            $unicodeHexVal = substr($source, $pos + 2, 4);
            if (strlen($unicodeHexVal) === 4 && ctype_xdigit($unicodeHexVal)) {
                $unicode = hexdec($unicodeHexVal);
                $pos += 6;
                // Combine a high surrogate with a directly following low one.
                if ($unicode >= 0xD800 && $unicode <= 0xDBFF && substr($source, $pos, 2) === '%u') {
                    $lowHexVal = substr($source, $pos + 2, 4);
                    if (strlen($lowHexVal) === 4 && ctype_xdigit($lowHexVal)) {
                        $low = hexdec($lowHexVal);
                        if ($low >= 0xDC00 && $low <= 0xDFFF) {
                            $unicode = 0x10000 + (($unicode - 0xD800) << 10) + ($low - 0xDC00);
                            $pos += 6;
                        }
                    }
                }
                $decodedStr .= codepointToUtf8($unicode);
                continue;
            }
        } else {
            // Candidate "%XX" escape.
            $hexVal = substr($source, $pos + 1, 2);
            if (strlen($hexVal) === 2 && ctype_xdigit($hexVal)) {
                $decodedStr .= chr(hexdec($hexVal));
                $pos += 3;
                continue;
            }
        }
        // Malformed escape: keep the '%' and continue with the next character.
        $decodedStr .= '%';
        $pos++;
    }
    if ($iconv_to != "UTF-8") {
        $decodedStr = iconv("utf-8", $iconv_to, $decodedStr);
    }
    return $decodedStr;
}
예제 #12
0
 /**
  * Encode each code point of a list with codepointToUtf8() and join the
  * results, in order, into one string.
  *
  * @param $list array Code points
  * @return string
  */
 public static function listToString($list)
 {
     $chars = array();
     foreach ($list as $codepoint) {
         $chars[] = codepointToUtf8($codepoint);
     }
     return implode('', $chars);
 }
예제 #13
0
/**
 * Convert a code point written in hexadecimal to its UTF-8 encoding.
 * Diagnostics raised by either conversion step are suppressed.
 *
 * @param $hexcp String Hexadecimal code point
 * @return String Whatever codepointToUtf8() returns for that code point
 */
function hexUnicodeToUtf8($hexcp)
{
    $codepoint = @hexdec($hexcp);
    return @codepointToUtf8($codepoint);
}
예제 #14
0
 /**
  * Resolve a named entity to its UTF-8 character.
  *
  * The name is first translated through $wgHtmlEntityAliases. If the
  * resulting name is listed in $wgHtmlEntities, the UTF-8 encoding of its
  * code point is returned; otherwise the pseudo-entity source (eg &foo;)
  * is returned, built from the alias-resolved name.
  *
  * @param string $name
  * @return string
  */
 static function decodeEntity($name)
 {
     global $wgHtmlEntities, $wgHtmlEntityAliases;
     $resolved = isset($wgHtmlEntityAliases[$name]) ? $wgHtmlEntityAliases[$name] : $name;
     if (!isset($wgHtmlEntities[$resolved])) {
         return "&{$resolved};";
     }
     return codepointToUtf8($wgHtmlEntities[$resolved]);
 }
예제 #15
0
 /**
  * Encode a code point as UTF-8 when SGString::validateCodepoint() accepts
  * it; otherwise return UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
  * @param $codepoint Integer
  * @return String
  */
 static function decodeChar($codepoint)
 {
     return SGString::validateCodepoint($codepoint)
         ? codepointToUtf8($codepoint)
         : UTF8_REPLACEMENT;
 }
예제 #16
0
 /**
  * Resolve a named entity to its UTF-8 character.
  *
  * The name is first translated through self::$htmlEntityAliases. If the
  * resulting name is listed in self::$htmlEntities, the UTF-8 encoding of
  * its code point is returned; otherwise the pseudo-entity source
  * (eg &foo;) is returned, built from the alias-resolved name.
  *
  * @param $name String
  * @return String
  */
 static function decodeEntity($name)
 {
     $resolved = isset(self::$htmlEntityAliases[$name]) ? self::$htmlEntityAliases[$name] : $name;
     if (!isset(self::$htmlEntities[$resolved])) {
         return "&{$resolved};";
     }
     return codepointToUtf8(self::$htmlEntities[$resolved]);
 }
예제 #17
0
}
$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;
print "Reading character definitions...\n";
while (false !== ($line = fgets($in))) {
    $columns = explode(';', $line);
    $codepoint = $columns[0];
    $name = $columns[1];
    $canonicalCombiningClass = $columns[3];
    $decompositionMapping = $columns[5];
    $source = codepointToUtf8(hexdec($codepoint));
    if ($canonicalCombiningClass != 0) {
        $combiningClass[$source] = intval($canonicalCombiningClass);
    }
    if ($decompositionMapping === '') {
        continue;
    }
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # Compatibility decomposition
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
예제 #18
0
 /**
  * Reverse the previously applied transliteration of non-ASCII characters
  * back to UTF-8. Used to protect data from corruption by broken web browsers
  * as listed in $wgBrowserBlackList.
  *
  * Every "&#x<hex>;" reference whose first digit is not '0' is replaced by
  * the UTF-8 character for that code point. References that fail the sanity
  * checks (missing ';', more than six digits, or non-hex digits) are copied
  * through unchanged. Finally every "&#x0" is turned back into "&#x",
  * undoing the escaping applied to pre-existing references.
  *
  * @param string $invalue
  * @return string
  * @access private
  */
 function unmakesafe($invalue)
 {
     $result = "";
     $length = strlen($invalue);
     for ($i = 0; $i < $length; $i++) {
         // Check the bounds before reading offset $i + 3 so that an input
         // ending in "&#x" does not read past the end of the string.
         if ($i + 3 < $length && substr($invalue, $i, 3) == "&#x" && $invalue[$i + 3] != '0') {
             $i += 3;
             $hexstring = "";
             do {
                 $hexstring .= $invalue[$i];
                 $i++;
                 // Bounds test first: the offset is only read when it exists.
             } while ($i < $length && ctype_xdigit($invalue[$i]));
             // Do some sanity checks. These aren't needed for reversability,
             // but should help keep the breakage down if the editor
             // breaks one of the entities whilst editing.
             // ctype_xdigit() covers the first character, which the loop
             // above appends without inspecting it.
             if (substr($invalue, $i, 1) == ";" && strlen($hexstring) <= 6 && ctype_xdigit($hexstring)) {
                 $codepoint = hexdec($hexstring);
                 $result .= codepointToUtf8($codepoint);
             } else {
                 $result .= "&#x" . $hexstring . substr($invalue, $i, 1);
             }
         } else {
             $result .= substr($invalue, $i, 1);
         }
     }
     // reverse the transform that we made for reversability reasons.
     return strtr($result, array("&#x0" => "&#x"));
 }
예제 #19
0
     continue;
 }
 $error = false;
 if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
     $actual = utf8ToCodepoint($m['charleft']);
     if ($actual === false) {
         print "Bytes: " . strlen($m['charleft']) . "\n";
         print bin2hex($line) . "\n";
         $hexForm = bin2hex($m['charleft']);
         print "Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n";
     } else {
         print "Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n";
     }
     $error = true;
 }
 if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
     $actual = utf8ToCodepoint($m['charright']);
     if ($actual === false) {
         $hexForm = bin2hex($m['charright']);
         print "Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n";
     } else {
         print "Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n";
     }
     $error = true;
 }
 if ($error) {
     continue;
 }
 # Find the set for the right character, add a new one if necessary
 if (isset($setsByChar[$m['charright']])) {
     $setName = $setsByChar[$m['charright']];
예제 #20
0
 /**
  * Generate HTML for a spacer image: an arrow with an empty first argument
  * and a non-breaking space (U+00A0) as its second argument.
  * @return String: HTML <img> tag
  */
 protected function spacerArrow()
 {
     // non-breaking space
     $nbsp = codepointToUtf8(0xa0);
     return $this->arrow('', $nbsp);
 }
예제 #21
0
# UTF-8 encoded forms of the matching UNICODE_* code point constants.
define('UTF8_HANGUL_LEND', codepointToUtf8(UNICODE_HANGUL_LEND));
define('UTF8_HANGUL_VEND', codepointToUtf8(UNICODE_HANGUL_VEND));
define('UTF8_HANGUL_TEND', codepointToUtf8(UNICODE_HANGUL_TEND));
define('UTF8_SURROGATE_FIRST', codepointToUtf8(UNICODE_SURROGATE_FIRST));
define('UTF8_SURROGATE_LAST', codepointToUtf8(UNICODE_SURROGATE_LAST));
define('UTF8_MAX', codepointToUtf8(UNICODE_MAX));
define('UTF8_REPLACEMENT', codepointToUtf8(UNICODE_REPLACEMENT));
#define( 'UTF8_REPLACEMENT', '!' );
# Highest overlong (non-shortest-form) byte sequence of each length:
# 2 bytes encoding U+007F, 3 bytes encoding U+07FF, 4 bytes encoding U+FFFF.
# These are deliberately invalid UTF-8, so they are written as byte escapes;
# raw bytes in the source do not survive re-encoding of the file.
define('UTF8_OVERLONG_A', "\xc1\xbf");
define('UTF8_OVERLONG_B', "\xe0\x9f\xbf");
define('UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf");
# These two ranges are illegal
define('UTF8_FDD0', codepointToUtf8(0xfdd0));
define('UTF8_FDEF', codepointToUtf8(0xfdef));
define('UTF8_FFFE', codepointToUtf8(0xfffe));
define('UTF8_FFFF', codepointToUtf8(0xffff));
define('UTF8_HEAD', false);
define('UTF8_TAIL', true);
/**
 * For using the ICU wrapper
 */
define('UNORM_NONE', 1);
define('UNORM_NFD', 2);
define('UNORM_NFKD', 3);
define('UNORM_NFC', 4);
define('UNORM_DEFAULT', UNORM_NFC);
define('UNORM_NFKC', 5);
define('UNORM_FCD', 6);
# True when the utf8_normalize() function is available.
define('NORMALIZE_ICU', function_exists('utf8_normalize'));
/**
 *