/**
 * Convert a list of space-separated hexadecimal Unicode code points
 * into the UTF-8 string made of the corresponding characters.
 * Used by UTF-8 data generation and testing routines.
 *
 * @param $sequence String
 * @return String
 * @private
 */
function hexSequenceToUtf8($sequence)
{
    // Split on single spaces and turn every hex token into an integer code point.
    $codepoints = array_map('hexdec', explode(' ', $sequence));

    // Encode each code point and glue the pieces together in order.
    return implode('', array_map('codepointToUtf8', $codepoints));
}
/**
 * Build the radical sort key for a character code point.
 *
 * Looks $chcp up in $data->radicalStrokeCounts; each entry is unpacked as a
 * (radical id, remaining count) pair. The result is the UTF-8 form of the
 * radical's code point (from $data->radicals) followed by the second value
 * zero-padded to three digits.
 *
 * @param $data Object exposing ->radicalStrokeCounts and ->radicals
 * @param $chcp Integer code point to look up
 * @return String sort key, or '' when the code point has no entry
 */
function onCategoryMultisortSortkeys_getRadical($data, $chcp)
{
    // Unknown character: no key can be produced.
    if (!array_key_exists($chcp, $data->radicalStrokeCounts)) {
        return '';
    }

    list($radicalId, $rest) = $data->radicalStrokeCounts[$chcp];
    $radicalChar = codepointToUtf8($data->radicals[$radicalId]);

    return sprintf('%s%03d', $radicalChar, $rest);
}
/**
 * Runs every code point from 0 up to (but not including) UNICODE_MAX
 * through UtfNormal::cleanUp() and checks the outcome.
 *
 * Tab, LF, CR and everything above U+001F that is outside the surrogate
 * range and is not U+FFFE/U+FFFF must either come back unchanged or, when
 * the character appears in the canonical composition/decomposition tables,
 * equal its NFC form. Every other code point must come back as
 * UTF8_REPLACEMENT.
 *
 * This test is *very* expensive!
 */
function XtestAllChars()
{
    $replacement = UTF8_REPLACEMENT;
    $codepoint = 0x0;

    while ($codepoint < UNICODE_MAX) {
        $original = codepointToUtf8($codepoint);
        $cleaned = UtfNormal::cleanUp($original);
        $x = sprintf("%04X", $codepoint);

        // Progress marker every 0x1000 code points.
        if ($codepoint % 0x1000 == 0) {
            echo "U+{$x}\n";
        }

        $isAllowedControl = $codepoint == 0x9 || $codepoint == 0xa || $codepoint == 0xd;
        $isBelowSurrogates = $codepoint > 0x1f && $codepoint < UNICODE_SURROGATE_FIRST;
        $isUpperBmp = $codepoint > UNICODE_SURROGATE_LAST && $codepoint < 0xfffe;
        $isSupplementary = $codepoint > 0xffff && $codepoint <= UNICODE_MAX;

        if (!($isAllowedControl || $isBelowSurrogates || $isUpperBmp || $isSupplementary)) {
            // Disallowed code point: cleanUp() must substitute the replacement character.
            $this->assertEquals(bin2hex($replacement), bin2hex($cleaned), $x);
        } elseif (isset(UtfNormal::$utfCanonicalComp[$original]) || isset(UtfNormal::$utfCanonicalDecomp[$original])) {
            // Character takes part in normalization: expect the NFC form.
            $composed = UtfNormal::NFC($original);
            $this->assertEquals(bin2hex($composed), bin2hex($cleaned), "U+{$x} should be decomposed");
        } else {
            $this->assertEquals(bin2hex($original), bin2hex($cleaned), "U+{$x} should be intact");
        }

        $codepoint++;
    }
}
print "{$total} "; } } fclose($in); $ok = reportResults($total, $success, $failure) && $ok; $in = fopen("UnicodeData.txt", "rt"); if (!$in) { print "Can't open UnicodeData.txt for reading.\n"; print "If necessary, fetch this file from the internet:\n"; print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n"; exit(-1); } print "Now testing invariants...\n"; while (false !== ($line = fgets($in))) { $cols = explode(';', $line); $char = codepointToUtf8(hexdec($cols[0])); $desc = $cols[0] . ": " . $cols[1]; if ($char < " " || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST) { # Can't check NULL with the ICU plugin, as null bytes fail in C land. # Skip other control characters, as we strip them for XML safety. # Surrogates are illegal on their own or in UTF-8, ignore. continue; } if (empty($testedChars[$char])) { $total++; if (testInvariant($normalizer, $char, $desc)) { $success++; } else { $failure++; } if ($total % 100 == 0) {
/**
 * Generate the serialized list of group header ("first letter") characters.
 *
 * Reads {$this->dataDir}/allkeys.txt, replaces the entries of $this->weights
 * with the primary weight string of each listed character, groups characters
 * by primary weight into $this->groups, picks one header character per group
 * and writes the serialized array of header characters to
 * {$IP}/serialized/first-letters-root.ser.
 *
 * Exits with status 1 when either file cannot be opened.
 */
function generateFirstChars()
{
    $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
    if (!$file) {
        $this->error("Unable to open allkeys.txt");
        exit(1);
    }
    global $IP;
    $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
    if (!$outFile) {
        $this->error("Unable to open output file first-letters-root.ser");
        exit(1);
    }

    $goodTertiaryChars = array();

    // For each character with an entry in allkeys.txt, overwrite the implicit
    // entry in $this->weights that came from the UCD.
    // Also gather a list of tertiary weights, for use in selecting the group header
    while (false !== ($line = fgets($file))) {
        // We're only interested in single-character weights, pick them out with a regex
        $line = trim($line);
        if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
            continue;
        }

        $cp = hexdec($m[1]);
        $allWeights = trim($m[2]);
        $primary = '';
        $tertiary = '';

        if (!isset($this->weights[$cp])) {
            // Non-printable, ignore
            continue;
        }
        foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
            // $m is reused here; the outer match is no longer needed at this point.
            preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $m);
            if (!empty($m[1])) {
                if ($m[1][0] !== '0000') {
                    $primary .= '.' . $m[1][0];
                }
                if ($m[1][2] !== '0000') {
                    $tertiary .= '.' . $m[1][2];
                }
            }
        }
        $this->weights[$cp] = $primary;
        if ($tertiary === '.0008' || $tertiary === '.000E') {
            $goodTertiaryChars[$cp] = true;
        }
    }
    fclose($file);

    // Identify groups of characters with the same primary weight
    $this->groups = array();
    asort($this->weights, SORT_STRING);
    $prevWeight = reset($this->weights);
    $group = array();
    foreach ($this->weights as $cp => $weight) {
        if ($weight !== $prevWeight) {
            $this->groups[$prevWeight] = $group;
            $prevWeight = $weight;
            if (isset($this->groups[$weight])) {
                $group = $this->groups[$weight];
            } else {
                $group = array();
            }
        }
        $group[] = $cp;
    }
    if ($group) {
        $this->groups[$prevWeight] = $group;
    }

    // If one character has a given primary weight sequence, and a second
    // character has a longer primary weight sequence with an initial
    // portion equal to the first character, then remove the second
    // character. This avoids having characters like U+A732 (double A)
    // polluting the basic latin sort area.
    foreach ($this->groups as $weight => $group) {
        if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
            if (isset($this->groups[$m[1]])) {
                unset($this->groups[$weight]);
            }
        }
    }

    ksort($this->groups, SORT_STRING);

    // Identify the header character in each group
    $headerChars = array();
    $prevChar = "";
    $tertiaryCollator = new Collator('root');
    $primaryCollator = new Collator('root');
    $primaryCollator->setStrength(Collator::PRIMARY);
    $numOutOfOrder = 0;
    foreach ($this->groups as $weight => $group) {
        $uncomposedChars = array();
        $goodChars = array();
        foreach ($group as $cp) {
            if (isset($goodTertiaryChars[$cp])) {
                $goodChars[] = $cp;
            }
            if (!isset($this->mappedChars[$cp])) {
                $uncomposedChars[] = $cp;
            }
        }
        // Preference order: good tertiary weight AND not in mappedChars,
        // then anything not in mappedChars, then the whole group.
        $x = array_intersect($goodChars, $uncomposedChars);
        if (!$x) {
            $x = $uncomposedChars;
            if (!$x) {
                $x = $group;
            }
        }

        // Use ICU to pick the lowest sorting character in the selection
        $tertiaryCollator->sort($x);
        $cp = $x[0];

        $char = codepointToUtf8($cp);
        $headerChars[] = $char;
        // Count headers that do not sort strictly after the previous header.
        if ($primaryCollator->compare($char, $prevChar) <= 0) {
            $numOutOfOrder++;
        }
        $prevChar = $char;

        if ($this->debugOutFile) {
            fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('codepointToUtf8', $group))));
        }
    }

    print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";

    fwrite($outFile, serialize($headerChars));
    // Release the output handle explicitly instead of relying on script shutdown.
    fclose($outFile);
}
/**
 * Build the byte => UTF-8 translation table for Windows Code Page 1252.
 * Lets text that naughty user-agents miscoded as Windows-1252 be converted
 * properly without relying on an outside iconv library.
 *
 * Bytes 0x80-0x9f are mapped through the table below; every other byte
 * maps to the code point with the same numeric value.
 *
 * @return array keyed by single-byte string, values are UTF-8 strings
 * @access private
 */
function prepareWindows1252()
{
    # Mappings from:
    # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
    static $cp1252 = array(
        0x80 => 0x20ac, 0x81 => 0xfffd, 0x82 => 0x201a, 0x83 => 0x192,
        0x84 => 0x201e, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
        0x88 => 0x2c6, 0x89 => 0x2030, 0x8a => 0x160, 0x8b => 0x2039,
        0x8c => 0x152, 0x8d => 0xfffd, 0x8e => 0x17d, 0x8f => 0xfffd,
        0x90 => 0xfffd, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201c,
        0x94 => 0x201d, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
        0x98 => 0x2dc, 0x99 => 0x2122, 0x9a => 0x161, 0x9b => 0x203a,
        0x9c => 0x153, 0x9d => 0xfffd, 0x9e => 0x17e, 0x9f => 0x178,
    );

    $table = array();
    foreach (range(0, 0xff) as $byte) {
        if (isset($cp1252[$byte])) {
            $target = $cp1252[$byte];
        } else {
            // Outside the remapped range the byte value is the code point.
            $target = $byte;
        }
        $table[chr($byte)] = codepointToUtf8($target);
    }

    return $table;
}
} $wikiUpperChars = array(); $wikiLowerChars = array(); print "Reading character definitions...\n"; while (false !== ($line = fgets($in))) { $columns = explode(';', $line); $codepoint = $columns[0]; $name = $columns[1]; $simpleUpper = $columns[12]; $simpleLower = $columns[13]; $source = codepointToUtf8(hexdec($codepoint)); if ($simpleUpper) { $wikiUpperChars[$source] = codepointToUtf8(hexdec($simpleUpper)); } if ($simpleLower) { $wikiLowerChars[$source] = codepointToUtf8(hexdec($simpleLower)); } } fclose($in); $out = fopen("Utf8Case.php", "wt"); if ($out) { $outUpperChars = escapeArray($wikiUpperChars); $outLowerChars = escapeArray($wikiLowerChars); $outdata = "<" . "?php\n/**\n * Simple 1:1 upper/lowercase switching arrays for utf-8 text.\n * Won't get context-sensitive things yet.\n *\n * Hack for bugs in ucfirst() and company\n *\n * These are pulled from memcached if possible, as this is faster than filling\n * up a big array manually.\n *\n * @file\n * @ingroup Language\n */\n\n/**\n * Translation array to get upper case character\n */\n\$wikiUpperChars = {$outUpperChars};\n\n/**\n * Translation array to get lower case character\n */\n\$wikiLowerChars = {$outLowerChars};\n"; fputs($out, $outdata); fclose($out); print "Wrote out Utf8Case.php\n"; } else { print "Can't create file Utf8Case.php\n"; exit(-1); }
/**
 * Callback used when replacing unicode notations: drops the first two
 * characters of the full match, reads the remainder as a hexadecimal
 * code point and returns that character encoded as UTF-8.
 *
 * @param array preg matches
 * @return string replacement string
 */
private function getItemParsedCallback($matches)
{
    require_once "include/Unicode/UtfNormalUtil.php";

    $hexDigits = substr($matches[0], 2);
    $codepoint = hexdec($hexDigits);

    return codepointToUtf8($codepoint);
}
/**
 * Generate equivset.txt, equivset.php and equivset.ser from equivset.in.
 *
 * Every non-comment line of equivset.in has the form
 *   HEXLEFT CHARLEFT => [HEXRIGHT] CHARRIGHT [# comment]
 * and states that CHARLEFT is equivalent to CHARRIGHT. A line whose last
 * byte before the newline is a NUL maps CHARLEFT to the empty string.
 * Lines that fail to parse, or whose hex number does not match the
 * accompanying character, are reported and skipped, and the final status
 * becomes 1.
 *
 * Output:
 *  - equivset.txt: one set of equivalent characters per line
 *  - equivset.php: var_export() of the character => set-name map
 *  - equivset.ser: the same map keyed and valued by code point, serialized
 */
public function execute()
{
    $dir = __DIR__;
    $endl = "\n";

    $lines = file("{$dir}/equivset.in");
    if (!$lines) {
        $this->error("Unable to open equivset.in\n", 1);
    }

    $setsFile = fopen("{$dir}/equivset.txt", 'w');
    if (!$setsFile) {
        $this->error("Unable to open equivset.txt for writing\n", 1);
    }

    // The header must end with a newline so the first set does not end up
    // on the comment line.
    fwrite($setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
    );

    $outputFile = fopen("{$dir}/equivset.php", 'w');
    if (!$outputFile) {
        $this->error("Unable to open equivset.php for writing\n", 1);
    }
    fwrite($outputFile, "<?" . "php{$endl}" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
    );

    $serializedFile = fopen("{$dir}/equivset.ser", 'w');
    if (!$serializedFile) {
        $this->error("Unable to open equivset.ser for writing\n", 1);
    }

    # \s matches \xa0 in non-unicode mode, which is not what we want
    # So we need to make our own whitespace class
    $sp = '[\\ \\t]';

    // Extended (/x) pattern: literal whitespace in the pattern is ignored.
    $pattern = "/^(?P<hexleft> [A-F0-9]+) {$sp}+ (?P<charleft> .+?) {$sp}+ => {$sp}+ " .
        "(?:(?P<hexright> [A-F0-9]+) {$sp}+|) (?P<charright> .+?) {$sp}* (?: \\#.*|) \$ /x";

    $lineNum = 0;
    $setsByChar = array();
    $sets = array();
    $exitStatus = 0;

    foreach ($lines as $line) {
        ++$lineNum;

        # Whether the line ends with a null character
        # (file() keeps the trailing newline, hence "length - 2")
        $mapToEmpty = strpos($line, "\0") === strlen($line) - 2;

        $line = trim($line);

        # Filter comments
        if (!$line || $line[0] == '#') {
            continue;
        }

        # Process line
        if (!preg_match($pattern, $line, $m)) {
            $this->output("Error: invalid entry at line {$lineNum}: {$line}\n");
            $exitStatus = 1;
            continue;
        }

        $error = false;
        if ($mapToEmpty) {
            $m['charright'] = '';
        } else {
            if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
                $actual = utf8ToCodepoint($m['charleft']);
                if ($actual === false) {
                    $this->output("Bytes: " . strlen($m['charleft']) . "\n");
                    $this->output(bin2hex($line) . "\n");
                    $hexForm = bin2hex($m['charleft']);
                    $this->output("Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " .
                        "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
                $actual = utf8ToCodepoint($m['charright']);
                if ($actual === false) {
                    $hexForm = bin2hex($m['charright']);
                    // Report the right-hand character: it is the one that failed to decode.
                    $this->output("Invalid UTF-8 character \"{$m['charright']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: right number ({$m['hexright']}) does not match right character ({$actual}) " .
                        "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            if ($error) {
                $exitStatus = 1;
                continue;
            }
        }

        # Find the set for the right character, add a new one if necessary
        if (isset($setsByChar[$m['charright']])) {
            $setName = $setsByChar[$m['charright']];
        } else {
            # New set
            $setName = $m['charright'];
            $sets[$setName] = array($m['charright']);
            $setsByChar[$setName] = $setName;
        }

        # Add the left character to the set
        $sets[$setName][] = $m['charleft'];
        $setsByChar[$m['charleft']] = $setName;
    }

    # Sets output
    foreach ($sets as $members) {
        fwrite($setsFile, implode(' ', $members) . $endl);
    }

    # Map output
    $output = var_export($setsByChar, true);
    $output = str_replace("\n", $endl, $output);
    fwrite($outputFile, '$equivset = ' . "{$output};{$endl}");

    # Serialized codepoint map
    $codepointMap = array();
    foreach ($setsByChar as $char => $setName) {
        // PHP turns decimal-digit string keys into integers; cast back so
        // the lookup always receives the original one-character string.
        $char = (string)$char;
        $key = $char === '' ? '' : utf8ToCodepoint($char);
        $value = $setName === '' ? '' : utf8ToCodepoint($setName);
        $codepointMap[$key] = $value;
    }
    fwrite($serializedFile, serialize($codepointMap));

    fclose($setsFile);
    fclose($outputFile);
    fclose($serializedFile);

    $text = 'Finished';
    if ($exitStatus > 0) {
        $text .= ' with errors';
    }
    $this->error($text, $exitStatus);
}
/**
 * Build the error tuple for a disallowed character.
 *
 * @param $msgId String message key for the outer error message
 * @param $point Integer code point of the offending character
 * @return array ( "ERROR", rendered message text )
 */
private static function badCharErr($msgId, $point)
{
    $code = sprintf('U+%04X', $point);
    $symbol = codepointToUtf8($point);

    // Combining marks are combined with the previous character. If abusing character is a
    // combining mark, prepend it with space to show them correctly.
    if (self::getScriptCode($point) == "SCRIPT_COMBINING_MARKS") {
        $symbol = ' ' . $symbol;
    }

    // A symbol consisting of exactly one \p{C} character gets the code-only message.
    $isNonPrintable = preg_match('/\\A\\p{C}\\z/u', $symbol);
    $charMessage = $isNonPrintable
        ? wfMessage('antispoof-bad-char-non-printable', $code)
        : wfMessage('antispoof-bad-char', $symbol, $code);

    return array("ERROR", wfMessage($msgId, $charMessage->text())->text());
}
/**
 * Function converts an Javascript escaped string back into a string with
 * specified charset (default is UTF-8).
 * Modified function from http://pure-essence.net/stuff/code/utf8RawUrlDecode.phps
 *
 * Recognised escapes are %XX (two hex digits, emitted as the raw byte) and
 * %uXXXX (four hex digits, emitted as UTF-8). A %uD800-%uDBFF escape that is
 * immediately followed by a %uDC00-%uDFFF escape is treated as a UTF-16
 * surrogate pair and decoded to a single code point. A '%' that does not
 * start a well-formed escape is copied through unchanged.
 *
 * @param $source String escaped with Javascript's escape() function
 * @param $iconv_to String destination character set will be used as second parameter
 * in the iconv function. Default is UTF-8.
 * @return string
 */
function js_unescape($source, $iconv_to = 'UTF-8')
{
    $decodedStr = '';
    $pos = 0;
    $len = strlen($source);

    while ($pos < $len) {
        $charAt = substr($source, $pos, 1);

        if ($charAt != '%') {
            $decodedStr .= $charAt;
            $pos++;
            continue;
        }

        if (substr($source, $pos + 1, 1) == 'u') {
            // %uXXXX: a UTF-16 code unit
            $unicodeHexVal = substr($source, $pos + 2, 4);
            if (strlen($unicodeHexVal) == 4 && ctype_xdigit($unicodeHexVal)) {
                $unicode = hexdec($unicodeHexVal);
                $pos += 6;

                // escape() writes characters above U+FFFF as two %u escapes
                // (high + low surrogate); merge them into one code point.
                if ($unicode >= 0xD800 && $unicode <= 0xDBFF && substr($source, $pos, 2) == '%u') {
                    $lowHexVal = substr($source, $pos + 2, 4);
                    if (strlen($lowHexVal) == 4 && ctype_xdigit($lowHexVal)) {
                        $low = hexdec($lowHexVal);
                        if ($low >= 0xDC00 && $low <= 0xDFFF) {
                            $unicode = 0x10000 + (($unicode - 0xD800) << 10) + ($low - 0xDC00);
                            $pos += 6;
                        }
                    }
                }

                $decodedStr .= codepointToUtf8($unicode);
                continue;
            }
        } else {
            // %XX: an escaped single byte
            $hexVal = substr($source, $pos + 1, 2);
            if (strlen($hexVal) == 2 && ctype_xdigit($hexVal)) {
                $decodedStr .= chr(hexdec($hexVal));
                $pos += 3;
                continue;
            }
        }

        // Malformed or truncated escape: keep the '%' literally and move on.
        $decodedStr .= '%';
        $pos++;
    }

    if ($iconv_to != "UTF-8") {
        $decodedStr = iconv("utf-8", $iconv_to, $decodedStr);
    }

    return $decodedStr;
}
/**
 * Encode each code point in the list as UTF-8 and concatenate the results
 * in list order.
 *
 * @param $list array of code points
 * @return string
 */
public static function listToString($list)
{
    $pieces = array();
    foreach ($list as $codepoint) {
        $pieces[] = codepointToUtf8($codepoint);
    }

    return implode('', $pieces);
}
/**
 * Convert a hexadecimal code point string to its UTF-8 encoding.
 * Diagnostics raised by either conversion step are suppressed with "@".
 *
 * @param $hexcp String hexadecimal code point
 * @return String result of codepointToUtf8()
 */
function hexUnicodeToUtf8($hexcp)
{
    $codepoint = @hexdec($hexcp);

    return @codepointToUtf8($codepoint);
}
/**
 * Resolve a named entity to its UTF-8 character.
 *
 * The name is first translated through $wgHtmlEntityAliases, then looked up
 * in $wgHtmlEntities. An unknown name yields the pseudo-entity source
 * (eg &foo;), built from the name after alias translation.
 *
 * @param string $name
 * @return string
 */
static function decodeEntity($name)
{
    global $wgHtmlEntities, $wgHtmlEntityAliases;

    $resolved = isset($wgHtmlEntityAliases[$name]) ? $wgHtmlEntityAliases[$name] : $name;

    if (!isset($wgHtmlEntities[$resolved])) {
        return "&{$resolved};";
    }

    return codepointToUtf8($wgHtmlEntities[$resolved]);
}
/**
 * Encode a code point as UTF-8 when SGString::validateCodepoint() accepts
 * it as a valid character reference; otherwise return
 * U+FFFD REPLACEMENT CHARACTER.
 *
 * @param $codepoint Integer
 * @return String
 */
static function decodeChar($codepoint)
{
    return SGString::validateCodepoint($codepoint)
        ? codepointToUtf8($codepoint)
        : UTF8_REPLACEMENT;
}
/**
 * Resolve a named entity to its UTF-8 character.
 *
 * The name is first translated through self::$htmlEntityAliases, then
 * looked up in self::$htmlEntities. An unknown name yields the
 * pseudo-entity source (eg &foo;), built from the name after alias
 * translation.
 *
 * @param $name String
 * @return String
 */
static function decodeEntity($name)
{
    $resolved = isset(self::$htmlEntityAliases[$name]) ? self::$htmlEntityAliases[$name] : $name;

    if (!isset(self::$htmlEntities[$resolved])) {
        return "&{$resolved};";
    }

    return codepointToUtf8(self::$htmlEntities[$resolved]);
}
} $compatibilityDecomp = array(); $canonicalDecomp = array(); $canonicalComp = array(); $combiningClass = array(); $total = 0; $compat = 0; $canon = 0; print "Reading character definitions...\n"; while (false !== ($line = fgets($in))) { $columns = explode(';', $line); $codepoint = $columns[0]; $name = $columns[1]; $canonicalCombiningClass = $columns[3]; $decompositionMapping = $columns[5]; $source = codepointToUtf8(hexdec($codepoint)); if ($canonicalCombiningClass != 0) { $combiningClass[$source] = intval($canonicalCombiningClass); } if ($decompositionMapping === '') { continue; } if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) { # Compatibility decomposition $canonical = false; $decompositionMapping = $matches[2]; $compat++; } else { $canonical = true; $canon++; }
/**
 * Reverse the previously applied transliteration of non-ASCII characters
 * back to UTF-8. Used to protect data from corruption by broken web browsers
 * as listed in $wgBrowserBlackList.
 *
 * Hexadecimal references whose first digit is not '0' ("&#x1234;") are
 * decoded to the character they name. References that start with "&#x0"
 * are treated as escaped literals: they are left alone by the decoder and
 * have the extra '0' removed at the end, so "&#x026;" becomes "&#x26;".
 * Anything that does not look like a complete reference is copied through
 * unchanged.
 *
 * @param string $invalue
 * @return string
 * @access private
 */
function unmakesafe($invalue)
{
    $result = "";
    $length = strlen($invalue);

    for ($i = 0; $i < $length; $i++) {
        // substr() instead of direct indexing: no out-of-range offset when
        // the input ends right after "&#x".
        if (substr($invalue, $i, 3) == "&#x" && substr($invalue, $i + 3, 1) !== '0') {
            $i += 3;
            $hexstring = "";
            do {
                $hexstring .= substr($invalue, $i, 1);
                $i++;
                // Bounds check first, so the string is never indexed past its end.
            } while ($i < $length && ctype_xdigit($invalue[$i]));

            // Do some sanity checks. These aren't needed for reversability,
            // but should help keep the breakage down if the editor
            // breaks one of the entities whilst editing.
            if (substr($invalue, $i, 1) == ";" && strlen($hexstring) <= 6 && ctype_xdigit($hexstring)) {
                $codepoint = hexdec($hexstring);
                $result .= codepointToUtf8($codepoint);
            } else {
                // Not a well-formed reference: reproduce what was consumed.
                $result .= "&#x" . $hexstring . substr($invalue, $i, 1);
            }
        } else {
            $result .= substr($invalue, $i, 1);
        }
    }

    // reverse the transform that we made for reversability reasons.
    return strtr($result, array("&#x0" => "&#x"));
}
continue; } $error = false; if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) { $actual = utf8ToCodepoint($m['charleft']); if ($actual === false) { print "Bytes: " . strlen($m['charleft']) . "\n"; print bin2hex($line) . "\n"; $hexForm = bin2hex($m['charleft']); print "Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n"; } else { print "Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n"; } $error = true; } if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) { $actual = utf8ToCodepoint($m['charright']); if ($actual === false) { $hexForm = bin2hex($m['charright']); print "Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n"; } else { print "Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n"; } $error = true; } if ($error) { continue; } # Find the set for the right character, add a new one if necessary if (isset($setsByChar[$m['charright']])) { $setName = $setsByChar[$m['charright']];
/**
 * Generate HTML for a spacer image: an arrow with an empty first argument
 * and a non-breaking space (U+00A0) as its second argument.
 *
 * @return String: HTML <img> tag
 */
protected function spacerArrow()
{
    $nonBreakingSpace = codepointToUtf8(0xa0);

    return $this->arrow('', $nonBreakingSpace);
}
define('UTF8_HANGUL_LEND', codepointToUtf8(UNICODE_HANGUL_LEND)); define('UTF8_HANGUL_VEND', codepointToUtf8(UNICODE_HANGUL_VEND)); define('UTF8_HANGUL_TEND', codepointToUtf8(UNICODE_HANGUL_TEND)); define('UTF8_SURROGATE_FIRST', codepointToUtf8(UNICODE_SURROGATE_FIRST)); define('UTF8_SURROGATE_LAST', codepointToUtf8(UNICODE_SURROGATE_LAST)); define('UTF8_MAX', codepointToUtf8(UNICODE_MAX)); define('UTF8_REPLACEMENT', codepointToUtf8(UNICODE_REPLACEMENT)); #define( 'UTF8_REPLACEMENT', '!' ); define('UTF8_OVERLONG_A', "��"); define('UTF8_OVERLONG_B', "���"); define('UTF8_OVERLONG_C', "����"); # These two ranges are illegal define('UTF8_FDD0', codepointToUtf8(0xfdd0)); define('UTF8_FDEF', codepointToUtf8(0xfdef)); define('UTF8_FFFE', codepointToUtf8(0xfffe)); define('UTF8_FFFF', codepointToUtf8(0xffff)); define('UTF8_HEAD', false); define('UTF8_TAIL', true); /** * For using the ICU wrapper */ define('UNORM_NONE', 1); define('UNORM_NFD', 2); define('UNORM_NFKD', 3); define('UNORM_NFC', 4); define('UNORM_DEFAULT', UNORM_NFC); define('UNORM_NFKC', 5); define('UNORM_FCD', 6); define('NORMALIZE_ICU', function_exists('utf8_normalize')); /** *