Пример #1
0
/**
 * Render a UTF-8 string as the hexadecimal code points of its characters.
 *
 * Every character is converted with utf8ToCodepoint() and formatted as a
 * hex number zero-padded to at least four digits; the numbers are joined
 * with single spaces. Intended as a debugging aid.
 *
 * @param string $str UTF-8 string.
 * @return string Space-separated hex code points ('' for an empty string).
 * @private
 */
function utf8ToHexSequence($str)
{
    $hexParts = array();
    // The empty pattern with the "u" modifier splits between characters
    // rather than between bytes.
    $characters = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($characters as $character) {
        $hexParts[] = sprintf('%04x', utf8ToCodepoint($character));
    }
    return implode(' ', $hexParts);
}
Пример #2
0
 /**
  * Return the leading character of a byte string.
  *
  * A precomposed Hangul syllable (U+AC00..U+D7A3) is not returned as-is:
  * it is replaced by the compatibility jamo of its initial consonant, so
  * that all syllables starting with the same consonant share one result.
  *
  * @param $s string
  * @return string The first character, a jamo for Hangul syllables, or ''
  *   when the string does not start with a sequence the pattern accepts.
  */
 function firstChar($s)
 {
     $found = array();
     $leadingCharPattern = '/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/';
     preg_match($leadingCharPattern, $s, $found);
     if (!isset($found[1])) {
         return '';
     }
     $char = $found[1];
     // Only three-byte sequences are inspected further; the Hangul range
     // tested below encodes to three bytes in UTF-8.
     if (strlen($char) != 3) {
         return $char;
     }
     $code = utf8ToCodepoint($char);
     if ($code < 0xac00 || 0xd7a4 <= $code) {
         // Not a precomposed Hangul syllable
         return $char;
     }
     // Exclusive upper bound of each initial-consonant group => jamo.
     // Must stay in ascending order: the first bound above $code wins.
     $jamoByUpperBound = array(
         0xb098 => "ㄱ",
         0xb2e4 => "ㄴ",
         0xb77c => "ㄷ",
         0xb9c8 => "ㄹ",
         0xbc14 => "ㅁ",
         0xc0ac => "ㅂ",
         0xc544 => "ㅅ",
         0xc790 => "ㅇ",
         0xcc28 => "ㅈ",
         0xce74 => "ㅊ",
         0xd0c0 => "ㅋ",
         0xd30c => "ㅌ",
         0xd558 => "ㅍ",
     );
     foreach ($jamoByUpperBound as $upperBound => $jamo) {
         if ($code < $upperBound) {
             return $jamo;
         }
     }
     return "ㅎ";
 }
Пример #3
0
 /**
  * Resolve an item to a Unicode code point.
  *
  * An item written as "U+XXXX" (case-insensitive prefix, hex digits) is
  * parsed numerically; any other item is passed to utf8ToCodepoint().
  *
  * @param string $a_item
  * @return int
  */
 private function getItemCodepoint($a_item)
 {
     $isHexNotation = preg_match('/^[uU]\\+[0-9a-fA-F]+$/', $a_item);
     if (!$isHexNotation) {
         // Loaded lazily; presumably this file provides utf8ToCodepoint() -- verify
         require_once "include/Unicode/UtfNormalUtil.php";
         return (int) utf8ToCodepoint($a_item);
     }
     // Drop the two-character "U+" prefix and parse the remaining hex digits
     $hexDigits = substr($a_item, 2);
     return (int) hexdec($hexDigits);
 }
Пример #4
0
    /**
     * Normalize CSS into a format we can easily search for hostile input
     *  - decode character references
     *  - decode escape sequences
     *  - convert characters that IE6 interprets into ascii
     *  - remove comments, unless the entire value is one single comment
     *
     * @param string $value the css string
     * @return string normalized css
     */
    public static function normalizeCss($value)
    {
        // Decode character references like &#123;
        $value = Sanitizer::decodeCharReferences($value);
        // Decode escape sequences and line continuation
        // See the grammar in the CSS 2 spec, appendix D.
        // This has to be done AFTER decoding character references.
        // This means it isn't possible for this function to return
        // unsanitized escape sequences. It is possible to manufacture
        // input that contains character references that decode to
        // escape sequences that decode to character references, but
        // it's OK for the return value to contain character references
        // because the caller is supposed to escape those anyway.
        static $decodeRegex;
        if (!$decodeRegex) {
            $space = '[\\x20\\t\\r\\n\\f]';
            $nl = '(?:\\n|\\r\\n|\\r|\\f)';
            $backslash = '\\\\';
            $decodeRegex = "/ {$backslash}\n\t\t\t\t(?:\n\t\t\t\t\t({$nl}) |  # 1. Line continuation\n\t\t\t\t\t([0-9A-Fa-f]{1,6}){$space}? |  # 2. character number\n\t\t\t\t\t(.) | # 3. backslash cancelling special meaning\n\t\t\t\t\t() | # 4. backslash at end of string\n\t\t\t\t)/xu";
        }
        $value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);
        // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
        // The class is spelled with explicit code points (U+FF01..U+FF3B and
        // U+FF3D..U+FF5A) so it cannot be mistaken for, or degraded into, the
        // look-alike ASCII range; U+FF3C is not part of either range.
        $value = preg_replace_callback('/[\\x{FF01}-\\x{FF3B}\\x{FF3D}-\\x{FF5A}]/u', function ($matches) {
            $cp = utf8ToCodepoint($matches[0]);
            if ($cp === false) {
                return '';
            }
            // 65248 (0xFEE0) is the distance from a fullwidth form to its
            // ASCII counterpart, giving a result in the range \x21-\x7A
            return chr($cp - 65248);
        }, $value);
        // Convert more characters IE6 might treat as ascii
        // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
        $value = str_replace(array('ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍'), array('r', 'n', 'n', 'l', 'i', '(', '('), $value);
        // Let the value through if it's nothing but a single comment, to
        // allow other functions which may reject it to pass some error
        // message through.
        if (!preg_match('! ^ \\s* /\\* [^*\\/]* \\*/ \\s* $ !x', $value)) {
            // Remove any comments; IE gets token splitting wrong
            // This must be done AFTER decoding character references and
            // escape sequences, because those steps can introduce comments
            // This step cannot introduce character references or escape
            // sequences, because it replaces comments with spaces rather
            // than removing them completely.
            $value = StringUtils::delimiterReplace('/*', '*/', ' ', $value);
            // Remove anything after a comment-start token, to guard against
            // incorrect client implementations.
            $commentPos = strpos($value, '/*');
            if ($commentPos !== false) {
                $value = substr($value, 0, $commentPos);
            }
        }
        // S followed by repeat, iteration, or prolonged sound marks,
        // which IE will treat as "ss"
        $value = preg_replace('/s(?:
				\\xE3\\x80\\xB1 | # U+3031
				\\xE3\\x82\\x9D | # U+309D
				\\xE3\\x83\\xBC | # U+30FC
				\\xE3\\x83\\xBD | # U+30FD
				\\xEF\\xB9\\xBC | # U+FE7C
				\\xEF\\xB9\\xBD | # U+FE7D
				\\xEF\\xBD\\xB0   # U+FF70
			)/ix', 'ss', $value);
        return $value;
    }
Пример #5
0
 /**
  * Regex callback mapping a matched character to the character 65248
  * (0xFEE0) code points below it, i.e. U+FF01..U+FF5A to ASCII \x21-\x7A.
  *
  * @param array $matches Match array; only element 0 (the full match) is read
  * @return string The shifted single-byte character, or '' when
  *   utf8ToCodepoint() returns false for the match
  */
 static function cssNormalizeUnicodeWidth($matches)
 {
     $codepoint = utf8ToCodepoint($matches[0]);
     return $codepoint === false ? '' : chr($codepoint - 65248);
 }
Пример #6
0
    }
    # Find the set for the right character, add a new one if necessary
    if (isset($setsByChar[$m['charright']])) {
        $setName = $setsByChar[$m['charright']];
    } else {
        # New set
        $setName = $m['charright'];
        $sets[$setName] = array($m['charright']);
        $setsByChar[$setName] = $setName;
    }
    # Add the left character to the set
    $sets[$setName][] = $m['charleft'];
    $setsByChar[$m['charleft']] = $setName;
}
# Sets output: one line per set, members separated by single spaces
foreach ($sets as $setName => $members) {
    fwrite($setsFile, implode(' ', $members) . $endl);
}
# Map output: dump the character => set-name map as PHP source
$output = var_export($setsByChar, true);
# Use the configured line ending inside the exported array text
$output = str_replace("\n", $endl, $output);
# The closing tag is assembled from two literals, presumably so that this
# script's own source never contains a literal closing tag -- TODO confirm
fwrite($outputFile, '$equivset = ' . "{$output}{$endl}?" . ">{$endl}");
# Serialized codepoint map: the same mapping, with both the character and
# its set name passed through utf8ToCodepoint()
$codepointMap = array();
foreach ($setsByChar as $char => $setName) {
    $codepointMap[utf8ToCodepoint($char)] = utf8ToCodepoint($setName);
}
fwrite($serializedFile, serialize($codepointMap));
# Release all three output handles
fclose($setsFile);
fclose($outputFile);
fclose($serializedFile);
Пример #7
0
 /**
  * Determine the letter under which a string is sorted.
  *
  * A string whose first character is reported as CJK by isCjk() is filed
  * under that character itself. Otherwise the string's primary sort key is
  * located among the known first letters with a lower-bound binary search.
  *
  * @param mixed $string Value to classify; converted with strval()
  * @return string The letter, or '' for an empty string or one that sorts
  *   before the first known letter
  */
 function getFirstLetter($string)
 {
     $string = strval($string);
     if ($string === '') {
         return '';
     }
     // CJK check: ord() looks at the first byte only, so this cheaply skips
     // the code point lookup when the string starts with an ASCII byte
     $firstChar = mb_substr($string, 0, 1, 'UTF-8');
     $startsWithHighByte = ord($firstChar) > 0x7f;
     if ($startsWithHighByte && self::isCjk(utf8ToCodepoint($firstChar))) {
         return $firstChar;
     }
     $sortKey = $this->getPrimarySortKey($string);
     // Binary search for the letter this sort key belongs to
     $letterIndex = $this->findLowerBound(
         array($this, 'getSortKeyByLetterIndex'),
         $this->getFirstLetterCount(),
         'strcmp',
         $sortKey
     );
     // false means the key sorts before the first letter
     return $letterIndex === false ? '' : $this->getLetterByIndex($letterIndex);
 }
    /**
     * Generate equivset.txt, equivset.php and equivset.ser from equivset.in.
     *
     * Each non-comment input line maps a left character to a right
     * character. Characters are grouped into sets named after the right
     * character; a line ending in a null character maps its left character
     * to the empty string. Lines that fail to parse, or whose hex numbers
     * disagree with the characters given, are reported and skipped, and the
     * final status passed to error() becomes 1.
     *
     * NOTE(review): error() with a non-zero second argument presumably
     * terminates the script -- verify against the base class.
     */
    public function execute()
    {
        $dir = __DIR__;
        $endl = "\n";
        $lines = file("{$dir}/equivset.in");
        if (!$lines) {
            $this->error("Unable to open equivset.in\n", 1);
        }
        $setsFile = fopen("{$dir}/equivset.txt", 'w');
        if (!$setsFile) {
            $this->error("Unable to open equivset.txt for writing\n", 1);
        }
        fwrite($setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
);
        $outputFile = fopen("{$dir}/equivset.php", 'w');
        if (!$outputFile) {
            $this->error("Unable to open equivset.php for writing\n", 1);
        }
        fwrite($outputFile, "<?" . "php{$endl}" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
);
        $serializedFile = fopen("{$dir}/equivset.ser", 'w');
        if (!$serializedFile) {
            $this->error("Unable to open equivset.ser for writing\n", 1);
        }
        # \s matches \xa0 in non-unicode mode, which is not what we want
        # So we need to make our own whitespace class
        $sp = '[\\ \\t]';
        $lineNum = 0;
        $setsByChar = array();
        $sets = array();
        $exitStatus = 0;
        foreach ($lines as $line) {
            ++$lineNum;
            # Whether the line ends with a null character, i.e. the null is
            # the second-to-last byte, followed by the line terminator that
            # file() leaves in place. The needle must be an explicit "\0":
            # an empty needle can never detect this.
            $mapToEmpty = strpos($line, "\0") === strlen($line) - 2;
            $line = trim($line);
            # Filter comments
            if (!$line || $line[0] == '#') {
                continue;
            }
            # Process line
            if (!preg_match("/^(?P<hexleft> [A-F0-9]+) {$sp}+ (?P<charleft> .+?) {$sp}+ => {$sp}+ (?:(?P<hexright> [A-F0-9]+) {$sp}+|) (?P<charright> .+?) {$sp}* (?: \\#.*|) \$ /x", $line, $m)) {
                $this->output("Error: invalid entry at line {$lineNum}: {$line}\n");
                $exitStatus = 1;
                continue;
            }
            $error = false;
            if ($mapToEmpty) {
                $m['charright'] = '';
            } else {
                # Cross-check the left hex number against the left character
                if (codepointToUtf8(hexdec($m['hexleft'])) != $m['charleft']) {
                    $actual = utf8ToCodepoint($m['charleft']);
                    if ($actual === false) {
                        $this->output("Bytes: " . strlen($m['charleft']) . "\n");
                        $this->output(bin2hex($line) . "\n");
                        $hexForm = bin2hex($m['charleft']);
                        $this->output("Invalid UTF-8 character \"{$m['charleft']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                    } else {
                        $this->output("Error: left number ({$m['hexleft']}) does not match left character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                    }
                    $error = true;
                }
                # Same check for the right side; the hex number is optional there
                if (!empty($m['hexright']) && codepointToUtf8(hexdec($m['hexright'])) != $m['charright']) {
                    $actual = utf8ToCodepoint($m['charright']);
                    if ($actual === false) {
                        $hexForm = bin2hex($m['charright']);
                        # Report the right character: it is the one whose bytes are shown
                        $this->output("Invalid UTF-8 character \"{$m['charright']}\" ({$hexForm}) at line {$lineNum}: {$line}\n");
                    } else {
                        $this->output("Error: right number ({$m['hexright']}) does not match right character ({$actual}) " . "at line {$lineNum}: {$line}\n");
                    }
                    $error = true;
                }
                if ($error) {
                    $exitStatus = 1;
                    continue;
                }
            }
            # Find the set for the right character, add a new one if necessary
            if (isset($setsByChar[$m['charright']])) {
                $setName = $setsByChar[$m['charright']];
            } else {
                # New set
                $setName = $m['charright'];
                $sets[$setName] = array($m['charright']);
                $setsByChar[$setName] = $setName;
            }
            # Add the left character to the set
            $sets[$setName][] = $m['charleft'];
            $setsByChar[$m['charleft']] = $setName;
        }
        # Sets output
        foreach ($sets as $members) {
            fwrite($setsFile, implode(' ', $members) . $endl);
        }
        # Map output
        $output = var_export($setsByChar, true);
        $output = str_replace("\n", $endl, $output);
        fwrite($outputFile, '$equivset = ' . "{$output};{$endl}");
        # Serialized codepoint map; the empty string has no code point and is
        # kept as '' on either side
        $codepointMap = array();
        foreach ($setsByChar as $char => $setName) {
            $key = $char === '' ? '' : utf8ToCodepoint($char);
            $value = $setName === '' ? '' : utf8ToCodepoint($setName);
            $codepointMap[$key] = $value;
        }
        fwrite($serializedFile, serialize($codepointMap));
        fclose($setsFile);
        fclose($outputFile);
        fclose($serializedFile);
        $text = 'Finished';
        if ($exitStatus > 0) {
            $text .= ' with errors';
        }
        $this->error($text, $exitStatus);
    }
 /**
  * Split a string into characters and map each through utf8ToCodepoint().
  *
  * @param $str
  * @return array One utf8ToCodepoint() result per character, in order;
  *   empty when the pattern matches nothing or matching fails
  */
 public static function stringToList($str)
 {
     $matches = array();
     // "u" matches whole characters, "s" lets "." match newlines as well
     $matchCount = preg_match_all('/./us', $str, $matches);
     if (!$matchCount) {
         return array();
     }
     return array_map('utf8ToCodepoint', $matches[0]);
 }
Пример #10
0
	/**
	 * Return the leading character of a byte string.
	 *
	 * A precomposed Hangul syllable (U+AC00..U+D7A3) is replaced by the
	 * compatibility jamo of its initial consonant; any other character is
	 * returned unchanged.
	 *
	 * @param $s string
	 * @return string The first character, a jamo for Hangul syllables, or
	 *   '' when the string does not start with a sequence the pattern accepts
	 */
	function firstChar( $s ) {
		$matches = array();
		preg_match(
			'/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
				'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/',
			$s,
			$matches
		);

		if ( !isset( $matches[1] ) ) {
			return '';
		}

		$first = $matches[1];
		// Only three-byte sequences are inspected further; the Hangul range
		// tested below encodes to three bytes in UTF-8.
		if ( strlen( $first ) != 3 ) {
			return $first;
		}

		$code = utf8ToCodepoint( $first );
		if ( $code < 0xac00 || 0xd7a4 <= $code ) {
			// Not a precomposed Hangul syllable
			return $first;
		}

		// Pairs of ( exclusive upper bound, jamo ) in ascending order; the
		// first bound above $code decides the result.
		$groups = array(
			array( 0xb098, "\xe3\x84\xb1" ),
			array( 0xb2e4, "\xe3\x84\xb4" ),
			array( 0xb77c, "\xe3\x84\xb7" ),
			array( 0xb9c8, "\xe3\x84\xb9" ),
			array( 0xbc14, "\xe3\x85\x81" ),
			array( 0xc0ac, "\xe3\x85\x82" ),
			array( 0xc544, "\xe3\x85\x85" ),
			array( 0xc790, "\xe3\x85\x87" ),
			array( 0xcc28, "\xe3\x85\x88" ),
			array( 0xce74, "\xe3\x85\x8a" ),
			array( 0xd0c0, "\xe3\x85\x8b" ),
			array( 0xd30c, "\xe3\x85\x8c" ),
			array( 0xd558, "\xe3\x85\x8d" ),
		);
		foreach ( $groups as $group ) {
			if ( $code < $group[0] ) {
				return $group[1];
			}
		}

		return "\xe3\x85\x8e";
	}
Пример #11
0
 /**
  * Turn a name into one that is valid in XML.
  *
  * Names listed in $preserveKeys and names that already match the allowed
  * character classes are returned unchanged. Otherwise the result is '_'
  * followed by the name with each '.' written as '.2E.' and every other
  * disallowed character written as '.HEX.', HEX being its code point.
  *
  * @param string $name
  * @param array $preserveKeys Names to not mangle
  * @return string Mangled name ('_' for the empty name)
  */
 private static function mangleName($name, $preserveKeys = array())
 {
     // Character-class bodies, built once per request
     static $nsc = null, $nc = null;
     if (in_array($name, $preserveKeys, true)) {
         return $name;
     }
     if ($name === '') {
         return '_';
     }
     if ($nsc === null) {
         // Note we omit ':' from $nsc and $nc because it's reserved for XML
         // namespacing, and we omit '_' from $nsc (but not $nc) because we
         // reserve it.
         $nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
         $nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
     }
     $alreadyValid = preg_match("/^[{$nsc}][{$nc}]*\$/uS", $name);
     if ($alreadyValid) {
         return $name;
     }
     $encodeChar = function ($m) {
         return sprintf('.%X.', utf8ToCodepoint($m[0]));
     };
     // Dots are escaped first; '.' itself is in $nc, so the dots that
     // delimit the escapes are not touched by the second pass.
     $dotsEscaped = str_replace('.', '.2E.', $name);
     return '_' . preg_replace_callback("/[^{$nc}]/uS", $encodeChar, $dotsEscaped);
 }
 /**
  * Build a sort key by concatenating one fixed-width entry per character
  * of the string: the character's radical-stroke value followed by the
  * character itself.
  *
  * @param mixed $data Passed through to onCategoryMultisortSortkeys_getRadical()
  * @param string $str
  * @return string
  */
 function onCategoryMultisortSortkeys_buildRadicalSortkey($data, $str)
 {
     $entries = array();
     foreach ($this->onCategoryMultisortSortkeys_splitString($str) as $ch) {
         $codepoint = utf8ToCodepoint($ch);
         $radicalStroke = $this->onCategoryMultisortSortkeys_getRadical($data, $codepoint);
         # One radical-stroke entry always has 3 (radical) + 3 (stroke) = 6 bytes, or blank if unavailable.
         # One UTF-8 character can have 4 bytes max.
         $entries[] = str_pad($radicalStroke, 6) . str_pad($ch, 4);
     }
     return implode('', $entries);
 }