/**
 * Take a UTF-8 string and return a space-separated series of hex
 * numbers representing Unicode code points. For debugging.
 *
 * Each character is rendered with sprintf( '%04x' ), so code points are
 * lower-case hex, zero-padded to at least four digits.
 *
 * @param string $str UTF-8 string.
 * @return string Hex sequence; '' for an empty string or when $str cannot
 *   be split as UTF-8.
 * @private
 */
function utf8ToHexSequence($str) {
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        // preg_split() fails on malformed UTF-8 when the /u modifier is set;
        // bail out instead of passing false to foreach.
        return '';
    }

    $buf = '';
    foreach ($chars as $cp) {
        $buf .= sprintf('%04x ', utf8ToCodepoint($cp));
    }

    // Drop the separator left behind after the final code point.
    return rtrim($buf);
}
/**
 * Return the leading character of a string.
 *
 * A three-byte character whose code point lies in 0xAC00..0xD7A3 (the
 * precomposed Hangul syllables) is reduced to its initial consonant.
 *
 * @param string $s
 * @return string The leading character, the initial jamo for a Hangul
 *   syllable, or '' when $s does not start with a UTF-8 shaped sequence.
 */
function firstChar($s) {
    $found = array();
    preg_match('/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/', $s, $found);

    if (!isset($found[1])) {
        return '';
    }

    $char = $found[1];

    // Only three-byte sequences can encode a Hangul syllable.
    if (strlen($char) != 3) {
        return $char;
    }

    // Break down Hangul syllables to grab the first jamo
    $code = utf8ToCodepoint($char);
    if ($code < 0xac00 || 0xd7a4 <= $code) {
        return $char;
    }

    // Exclusive upper bound of each initial-consonant group, in ascending order.
    $upperBounds = array(
        0xb098 => "ㄱ",
        0xb2e4 => "ㄴ",
        0xb77c => "ㄷ",
        0xb9c8 => "ㄹ",
        0xbc14 => "ㅁ",
        0xc0ac => "ㅂ",
        0xc544 => "ㅅ",
        0xc790 => "ㅇ",
        0xcc28 => "ㅈ",
        0xce74 => "ㅊ",
        0xd0c0 => "ㅋ",
        0xd30c => "ㅌ",
        0xd558 => "ㅍ",
    );
    foreach ($upperBounds as $limit => $jamo) {
        if ($code < $limit) {
            return $jamo;
        }
    }

    return "ㅎ";
}
/**
 * Get the Unicode index (code point) of an item.
 *
 * An item written as "U+" or "u+" followed only by hex digits is parsed as
 * a hexadecimal number. Any other item is handed to utf8ToCodepoint().
 *
 * @param string $a_item
 * @return int
 */
private function getItemCodepoint($a_item) {
    $isHexNotation = preg_match('/^[uU]\\+[0-9a-fA-F]+$/', $a_item);

    if (!$isHexNotation) {
        // Not in U+XXXX form: let the UTF-8 helper work out the code point.
        require_once "include/Unicode/UtfNormalUtil.php";
        return (int) utf8ToCodepoint($a_item);
    }

    // Skip the two-character "U+" prefix and parse the remaining digits.
    $hexDigits = substr($a_item, 2);
    return (int) hexdec($hexDigits);
}
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss($value) {
    // Decode character references like &#123;
    $value = Sanitizer::decodeCharReferences($value);

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if (!$decodeRegex) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        // Extended (/x) pattern: each "#" comment must end at a real newline,
        // otherwise it would swallow the rest of the expression.
        $decodeRegex = "/ {$backslash}
            (?:
                ({$nl}) |  # 1. Line continuation
                ([0-9A-Fa-f]{1,6}){$space}? |  # 2. character number
                (.) | # 3. backslash cancelling special meaning
                () | # 4. backslash at end of string
            )/xu";
    }
    $value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);

    // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
    // The class covers U+FF01..U+FF3B and U+FF3D..U+FF5A, i.e. the fullwidth
    // forms of "!".."[" and "]".."z". U+FF3C (fullwidth backslash) is left
    // out, so this step does not produce a backslash. Code points are spelled
    // as \x{...} escapes so the pattern does not depend on how this source
    // file is encoded.
    $value = preg_replace_callback('/[\\x{FF01}-\\x{FF3B}\\x{FF3D}-\\x{FF5A}]/u', function ($matches) {
        $cp = utf8ToCodepoint($matches[0]);
        if ($cp === false) {
            return '';
        }
        return chr($cp - 65248); // ASCII range \x21-\x7A
    }, $value);

    // Convert more characters IE6 might treat as ascii
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $value = str_replace(array('ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍'), array('r', 'n', 'n', 'l', 'i', '(', '('), $value);

    // Let the value through if it's nothing but a single comment, to
    // allow other functions which may reject it to pass some error
    // message through.
    if (!preg_match('! ^ \\s* /\\* [^*\\/]* \\*/ \\s* $ !x', $value)) {
        // Remove any comments; IE gets token splitting wrong
        // This must be done AFTER decoding character references and
        // escape sequences, because those steps can introduce comments
        // This step cannot introduce character references or escape
        // sequences, because it replaces comments with spaces rather
        // than removing them completely.
        $value = StringUtils::delimiterReplace('/*', '*/', ' ', $value);

        // Remove anything after a comment-start token, to guard against
        // incorrect client implementations.
        $commentPos = strpos($value, '/*');
        if ($commentPos !== false) {
            $value = substr($value, 0, $commentPos);
        }
    }

    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss".
    // The marks are matched as raw UTF-8 byte sequences; the pattern is in
    // extended mode, so every alternative sits on its own line ahead of its
    // "#" comment.
    $value = preg_replace('/s(?:
            \\xE3\\x80\\xB1 | # U+3031
            \\xE3\\x82\\x9D | # U+309D
            \\xE3\\x83\\xBC | # U+30FC
            \\xE3\\x83\\xBD | # U+30FD
            \\xEF\\xB9\\xBC | # U+FE7C
            \\xEF\\xB9\\xBD | # U+FE7D
            \\xEF\\xBD\\xB0   # U+FF70
        )/ix', 'ss', $value);

    return $value;
}
/**
 * Normalize Unicode U+FF01 to U+FF5A
 *
 * Subtracts 65248 (0xFEE0) from the code point of the matched character,
 * which maps U+FF01..U+FF5A onto \x21-\x7A.
 *
 * @param array $matches Match array; only element 0 (the matched character)
 *   is read. Presumably supplied by preg_replace_callback() - confirm
 *   against the caller.
 * @return string Single character in ASCII range \x21-\x7A, or '' when
 *   utf8ToCodepoint() returns false for the match.
 */
static function cssNormalizeUnicodeWidth($matches) {
    $codepoint = utf8ToCodepoint($matches[0]);

    return $codepoint === false
        ? ''
        : chr($codepoint - 65248); // ASCII range \x21-\x7A
}
} # Find the set for the right character, add a new one if necessary if (isset($setsByChar[$m['charright']])) { $setName = $setsByChar[$m['charright']]; } else { # New set $setName = $m['charright']; $sets[$setName] = array($m['charright']); $setsByChar[$setName] = $setName; } # Add the left character to the set $sets[$setName][] = $m['charleft']; $setsByChar[$m['charleft']] = $setName; } # Sets output foreach ($sets as $setName => $members) { fwrite($setsFile, implode(' ', $members) . $endl); } # Map output $output = var_export($setsByChar, true); $output = str_replace("\n", $endl, $output); fwrite($outputFile, '$equivset = ' . "{$output}{$endl}?" . ">{$endl}"); # Serialized codepoint map $codepointMap = array(); foreach ($setsByChar as $char => $setName) { $codepointMap[utf8ToCodepoint($char)] = utf8ToCodepoint($setName); } fwrite($serializedFile, serialize($codepointMap)); fclose($setsFile); fclose($outputFile); fclose($serializedFile);
/**
 * Work out the letter a string should be filed under.
 *
 * @param mixed $string Converted with strval() before use.
 * @return string '' for an empty string or one sorting before the first
 *   letter; the first character itself when self::isCjk() accepts it;
 *   otherwise the result of getLetterByIndex() for the matching entry.
 */
function getFirstLetter($string) {
    $string = strval($string);
    if ($string === '') {
        return '';
    }

    // Check for CJK
    $lead = mb_substr($string, 0, 1, 'UTF-8');
    $isMultibyte = ord($lead) > 0x7f;
    if ($isMultibyte && self::isCjk(utf8ToCodepoint($lead))) {
        return $lead;
    }

    $sortKey = $this->getPrimarySortKey($string);

    // Do a binary search to find the correct letter to sort under
    $index = $this->findLowerBound(
        array($this, 'getSortKeyByLetterIndex'),
        $this->getFirstLetterCount(),
        'strcmp',
        $sortKey
    );

    // false means: before the first letter
    return $index === false ? '' : $this->getLetterByIndex($index);
}
/**
 * Build the equivalence-set data files from equivset.in.
 *
 * Reads equivset.in from this script's directory and writes three files
 * next to it:
 *  - equivset.txt: one set of equivalent characters per line
 *  - equivset.php: a PHP assignment of the character map to $equivset
 *  - equivset.ser: a serialized map keyed and valued by code point
 *
 * Invalid or inconsistent input lines are reported through output() and
 * skipped. The run ends with a call to error() carrying a "Finished"
 * message and the accumulated exit status (1 if any line was rejected).
 */
public function execute() {
    $dir = __DIR__;
    $endl = "\n";

    $lines = file("{$dir}/equivset.in");
    if (!$lines) {
        $this->error("Unable to open equivset.in\n", 1);
    }

    $setsFile = fopen("{$dir}/equivset.txt", 'w');
    if (!$setsFile) {
        $this->error("Unable to open equivset.txt for writing\n", 1);
    }

    // The blank line before the terminator makes the header end with a
    // newline, so the first generated line starts on its own row.
    fwrite($setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
    );

    $outputFile = fopen("{$dir}/equivset.php", 'w');
    if (!$outputFile) {
        $this->error("Unable to open equivset.php for writing\n", 1);
    }

    fwrite($outputFile, "<?" . "php{$endl}" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
    );

    $serializedFile = fopen("{$dir}/equivset.ser", 'w');
    if (!$serializedFile) {
        $this->error("Unable to open equivset.ser for writing\n", 1);
    }

    # \s matches \xa0 in non-unicode mode, which is not what we want
    # So we need to make our own whitespace class
    $sp = '[\\ \\t]';

    # Entry format: HEX CHAR => [HEX] CHAR [# comment]
    # Extended mode, so the layout whitespace below is not significant.
    $entryPattern = "/^(?P<hexleft> [A-F0-9]+) {$sp}+ (?P<charleft> .+?) {$sp}+
        => {$sp}+
        (?:(?P<hexright> [A-F0-9]+) {$sp}+|) (?P<charright> .+?)
        {$sp}* (?: \\#.*|) \$ /x";

    $lineNum = 0;
    $setsByChar = array();
    $sets = array();
    $exitStatus = 0;

    foreach ($lines as $line) {
        ++$lineNum;

        # Whether the line ends with a null character
        # (checked before trim(), i.e. NUL directly before the line terminator)
        $mapToEmpty = strpos($line, "\0") === strlen($line) - 2;

        $line = trim($line);

        # Filter comments
        if (!$line || $line[0] == '#') {
            continue;
        }

        # Process line
        if (!preg_match($entryPattern, $line, $m)) {
            $this->output("Error: invalid entry at line {$lineNum}: {$line}\n");
            $exitStatus = 1;
            continue;
        }

        $error = false;
        if ($mapToEmpty) {
            $m['charright'] = '';
        } else {
            # The hex number on each side must agree with the literal character
            if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
                $actual = utf8ToCodepoint($m['charleft']);
                if ($actual === false) {
                    $this->output("Bytes: " . strlen($m['charleft']) . "\n");
                    $this->output(bin2hex($line) . "\n");
                    $hexForm = bin2hex($m['charleft']);
                    $this->output("Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
                $actual = utf8ToCodepoint($m['charright']);
                if ($actual === false) {
                    $hexForm = bin2hex($m['charright']);
                    # Report the right-hand character, matching the hex dump beside it
                    $this->output("Invalid UTF-8 character \"{$m['charright']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                } else {
                    $this->output("Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                }
                $error = true;
            }
            if ($error) {
                $exitStatus = 1;
                continue;
            }
        }

        # Find the set for the right character, add a new one if necessary
        if (isset($setsByChar[$m['charright']])) {
            $setName = $setsByChar[$m['charright']];
        } else {
            # New set
            $setName = $m['charright'];
            $sets[$setName] = array($m['charright']);
            $setsByChar[$setName] = $setName;
        }

        # Add the left character to the set
        $sets[$setName][] = $m['charleft'];
        $setsByChar[$m['charleft']] = $setName;
    }

    # Sets output
    foreach ($sets as $members) {
        fwrite($setsFile, implode(' ', $members) . $endl);
    }

    # Map output
    $output = var_export($setsByChar, true);
    $output = str_replace("\n", $endl, $output);
    fwrite($outputFile, '$equivset = ' . "{$output};{$endl}");

    # Serialized codepoint map
    $codepointMap = array();
    foreach ($setsByChar as $char => $setName) {
        # The empty string (map-to-empty target) has no code point; keep it as ''
        $key = $char === '' ? '' : utf8ToCodepoint($char);
        $value = $setName === '' ? '' : utf8ToCodepoint($setName);
        $codepointMap[$key] = $value;
    }
    fwrite($serializedFile, serialize($codepointMap));

    fclose($setsFile);
    fclose($outputFile);
    fclose($serializedFile);

    $text = 'Finished';
    if ($exitStatus > 0) {
        $text .= ' with errors';
    }
    $this->error($text, $exitStatus);
}
/**
 * Convert string into array of Unicode code points as integers
 *
 * The string is split with the pattern /./us and every piece is passed
 * through utf8ToCodepoint().
 *
 * @param string $str
 * @return array One entry per matched character, in order; an empty array
 *   when nothing matches or matching fails.
 */
public static function stringToList($str) {
    $matches = array();
    $matched = preg_match_all('/./us', $str, $matches);
    if (!$matched) {
        return array();
    }

    return array_map('utf8ToCodepoint', $matches[0]);
}
/**
 * Get the first character of a string.
 *
 * A three-byte character whose code point lies in 0xAC00..0xD7A3 (the
 * precomposed Hangul syllables) is replaced by its initial consonant.
 *
 * @param string $s
 * @return string The first character, the initial jamo for a Hangul
 *   syllable, or '' when $s does not start with a UTF-8 shaped sequence.
 */
function firstChar( $s ) {
    $matches = array();
    preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );

    if ( !isset( $matches[1] ) ) {
        return '';
    }

    $first = $matches[1];
    if ( strlen( $first ) != 3 ) {
        // Hangul syllables are always three bytes long in UTF-8.
        return $first;
    }

    // Break down Hangul syllables to grab the first jamo
    $code = utf8ToCodepoint( $first );
    if ( $code < 0xac00 || 0xd7a4 <= $code ) {
        return $first;
    }

    // Pairs of ( exclusive upper bound, initial jamo ), in ascending order.
    $groups = array(
        array( 0xb098, "\xe3\x84\xb1" ),
        array( 0xb2e4, "\xe3\x84\xb4" ),
        array( 0xb77c, "\xe3\x84\xb7" ),
        array( 0xb9c8, "\xe3\x84\xb9" ),
        array( 0xbc14, "\xe3\x85\x81" ),
        array( 0xc0ac, "\xe3\x85\x82" ),
        array( 0xc544, "\xe3\x85\x85" ),
        array( 0xc790, "\xe3\x85\x87" ),
        array( 0xcc28, "\xe3\x85\x88" ),
        array( 0xce74, "\xe3\x85\x8a" ),
        array( 0xd0c0, "\xe3\x85\x8b" ),
        array( 0xd30c, "\xe3\x85\x8c" ),
        array( 0xd558, "\xe3\x85\x8d" ),
    );
    foreach ( $groups as $group ) {
        if ( $code < $group[0] ) {
            return $group[1];
        }
    }

    // Everything from 0xd558 up to the end of the syllable range.
    return "\xe3\x85\x8e";
}
/**
 * Mangle XML-invalid names to be valid in XML
 *
 * Names listed in $preserveKeys and names that already match the allowed
 * character classes are returned untouched. The empty name becomes '_'.
 * Anything else gets a '_' prefix, every '.' rewritten as '.2E.', and each
 * remaining disallowed character rewritten as '.<HEX>.', where <HEX> is the
 * upper-case hex value returned by utf8ToCodepoint().
 *
 * @param string $name
 * @param array $preserveKeys Names to not mangle
 * @return string Mangled name
 */
private static function mangleName($name, $preserveKeys = array()) {
    // Character classes are built once and cached across calls.
    static $nsc = null, $nc = null;

    if (in_array($name, $preserveKeys, true)) {
        return $name;
    }

    if ($name === '') {
        return '_';
    }

    if ($nsc === null) {
        // Note we omit ':' from $nsc and $nc because it's reserved for XML
        // namespacing, and we omit '_' from $nsc (but not $nc) because we
        // reserve it.
        $nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
        $nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
    }

    $isValidName = preg_match("/^[{$nsc}][{$nc}]*\$/uS", $name);
    if ($isValidName) {
        return $name;
    }

    // Escape literal dots first, since '.' delimits the hex escapes below.
    $dotsEscaped = str_replace('.', '.2E.', $name);
    $encoded = preg_replace_callback("/[^{$nc}]/uS", function ($m) {
        return sprintf('.%X.', utf8ToCodepoint($m[0]));
    }, $dotsEscaped);

    return '_' . $encoded;
}
/**
 * Build a sort key by concatenating one fixed-width chunk per character.
 *
 * For each piece returned by onCategoryMultisortSortkeys_splitString(), the
 * chunk is the radical lookup result padded to 6 bytes, followed by the
 * character itself padded to 4 bytes.
 *
 * @param mixed $data Passed through to onCategoryMultisortSortkeys_getRadical().
 * @param string $str
 * @return string
 */
function onCategoryMultisortSortkeys_buildRadicalSortkey($data, $str) {
    $chunks = array();

    foreach ($this->onCategoryMultisortSortkeys_splitString($str) as $ch) {
        $codepoint = utf8ToCodepoint($ch);

        # One radical-stroke entry always has 3 (radical) + 3 (stroke) = 6 bytes, or blank if unavailable.
        $radicalStroke = str_pad($this->onCategoryMultisortSortkeys_getRadical($data, $codepoint), 6);

        # One UTF-8 character can have 4 bytes max.
        $paddedChar = str_pad($ch, 4);

        $chunks[] = $radicalStroke . $paddedChar;
    }

    return implode('', $chunks);
}