/**
 * Lazily build the DOM for the stored HTML and cache it on the instance.
 *
 * The first call parses $this->html into $this->doc; later calls return the
 * cached document unchanged.
 *
 * @return DOMDocument DOM to manipulate
 */
public function getDoc() {
	if (!$this->doc) {
		// DOMDocument::loadHTML apparently isn't very good with encodings, so
		// convert input to ASCII by encoding everything from U+0080 upwards
		// as character references.
		// NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding is
		// deprecated in recent PHP releases — confirm the supported PHP range.
		if (function_exists('mb_convert_encoding')) {
			$html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
		} else {
			$html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
				return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
			}, $this->html);
		}

		// Workaround for bug that caused spaces before references
		// to disappear during processing: https://phabricator.wikimedia.org/T55086
		// The space has to be written as a character reference; replacing
		// ' <' with a literal ' <' would leave the markup untouched.
		// TODO: Please replace with a better fix if one can be found.
		$html = str_replace(' <', '&#32;<', $html);

		// Collect libxml parse errors internally while loading, and switch
		// that mode back off afterwards.
		libxml_use_internal_errors(true);
		// Disable the entity loader for the duration of the parse, then hand
		// the previous setting back.
		$loader = libxml_disable_entity_loader();
		$this->doc = new DOMDocument();
		$this->doc->strictErrorChecking = false;
		$this->doc->loadHTML($html);
		libxml_disable_entity_loader($loader);
		libxml_use_internal_errors(false);
		$this->doc->encoding = 'UTF-8';
	}

	return $this->doc;
}
/**
 * Return the leading character of a string.
 *
 * A precomposed Hangul syllable (U+AC00..U+D7A3) is reduced to the jamo it
 * is filed under; every other character is returned as-is.
 *
 * @param string $s
 * @return string The first character, or '' when the string does not start
 *  with a 1–4 byte sequence the pattern recognises (e.g. empty input)
 */
function firstChar($s) {
	$found = array();
	preg_match('/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/', $s, $found);

	if (!isset($found[1])) {
		return '';
	}

	$char = $found[1];

	// Only three-byte sequences can be Hangul syllables.
	if (strlen($char) != 3) {
		return $char;
	}

	// Break down Hangul syllables to grab the first jamo
	$code = UtfNormal\Utils::utf8ToCodepoint($char);
	if ($code < 0xac00 || 0xd7a4 <= $code) {
		return $char;
	}

	// Exclusive upper bound of each jamo's syllable range, in ascending order.
	$jamoBounds = array(
		0xb098 => "ㄱ",
		0xb2e4 => "ㄴ",
		0xb77c => "ㄷ",
		0xb9c8 => "ㄹ",
		0xbc14 => "ㅁ",
		0xc0ac => "ㅂ",
		0xc544 => "ㅅ",
		0xc790 => "ㅇ",
		0xcc28 => "ㅈ",
		0xce74 => "ㅊ",
		0xd0c0 => "ㅋ",
		0xd30c => "ㅌ",
		0xd558 => "ㅍ",
	);
	foreach ($jamoBounds as $upperBound => $jamo) {
		if ($code < $upperBound) {
			return $jamo;
		}
	}

	return "ㅎ";
}
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss($value) {
	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences($value);

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if (!$decodeRegex) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		// Built by concatenation, with the notes kept as PHP comments, so the
		// pattern does not depend on line breaks surviving inside the string.
		// Capture group numbers are passed to cssDecodeCallback and are
		// kept in this order.
		$decodeRegex = '/' . $backslash . '(?:'
			. '(' . $nl . ')|' // 1. Line continuation
			. '([0-9A-Fa-f]{1,6})' . $space . '?|' // 2. character number
			. '(.)|' // 3. backslash cancelling special meaning
			. '()' // 4. backslash at end of string
			. ')/u';
	}
	$value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The class covers U+FF01..U+FF5A, written as code point escapes so it
	// cannot be folded to ASCII by an editor or transcoder. U+FF3C (the
	// fullwidth backslash) is left out, matching the gap between '[' and ']'
	// in the mapped ASCII range — presumably so this step cannot create a
	// new escape sequence after escapes were decoded above.
	$value = preg_replace_callback('/[\\x{FF01}-\\x{FF3B}\\x{FF3D}-\\x{FF5A}]/u', function ($matches) {
		$cp = UtfNormal\Utils::utf8ToCodepoint($matches[0]);
		if ($cp === false) {
			return '';
		}
		// 65248 = 0xFEE0, the offset between the fullwidth form and ASCII.
		return chr($cp - 65248); // ASCII range \x21-\x7A
	}, $value);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(array('ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍'), array('r', 'n', 'n', 'l', 'i', '(', '('), $value);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if (!preg_match('! ^ \\s* /\\* [^*\\/]* \\*/ \\s* $ !x', $value)) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace('/*', '*/', ' ', $value);

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos($value, '/*');
		if ($commentPos !== false) {
			$value = substr($value, 0, $commentPos);
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss".
	// The marks are matched byte-wise (no /u). The pattern is assembled
	// without x-mode comments: an in-pattern '#' comment runs to the next
	// newline and would swallow the rest of a single-line pattern.
	$value = preg_replace('/s(?:'
		. '\\xE3\\x80\\xB1|' // U+3031
		. '\\xE3\\x82\\x9D|' // U+309D
		. '\\xE3\\x83\\xBC|' // U+30FC
		. '\\xE3\\x83\\xBD|' // U+30FD
		. '\\xEF\\xB9\\xBC|' // U+FE7C
		. '\\xEF\\xB9\\xBD|' // U+FE7D
		. '\\xEF\\xBD\\xB0' // U+FF70
		. ')/i', 'ss', $value);

	return $value;
}
/**
 * Work out which heading letter a string is filed under.
 *
 * @param string $string
 * @return string The string's own first character when that character is
 *  CJK; otherwise the letter located via the sort-key search. '' for empty
 *  input or when the string sorts before the first letter.
 */
function getFirstLetter($string) {
	$string = (string)$string;
	if ($string === '') {
		return '';
	}

	// CJK characters act as their own heading. The lead byte is tested
	// first so that ASCII input never reaches the code point lookup.
	$lead = mb_substr($string, 0, 1, 'UTF-8');
	$isNonAscii = ord($lead) > 0x7f;
	if ($isNonAscii && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
		return $lead;
	}

	$key = $this->getPrimarySortKey($string);

	// Do a binary search to find the correct letter to sort under
	$index = ArrayUtils::findLowerBound(
		array($this, 'getSortKeyByLetterIndex'),
		$this->getFirstLetterCount(),
		'strcmp',
		$key
	);

	// false means the key sorts before the first letter.
	return $index === false ? '' : $this->getLetterByIndex($index);
}
/**
 * Work out which heading letter a string is filed under.
 *
 * @param string $string
 * @return string The string's own first character when that character is
 *  CJK; otherwise the letter located via the sort-key search, with digit
 *  letters replaced by the localized numerals header when numeric collation
 *  is enabled. '' for empty input or when the string sorts before the first
 *  letter.
 */
public function getFirstLetter($string) {
	$string = (string)$string;
	if ($string === '') {
		return '';
	}

	// If the first character is a CJK character, just return that character.
	// The lead byte is tested first so that ASCII input never reaches the
	// code point lookup.
	$lead = mb_substr($string, 0, 1, 'UTF-8');
	$isNonAscii = ord($lead) > 0x7f;
	if ($isNonAscii && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
		return $lead;
	}

	$key = $this->getPrimarySortKey($string);

	// Do a binary search to find the correct letter to sort under
	$index = ArrayUtils::findLowerBound(
		[$this, 'getSortKeyByLetterIndex'],
		$this->getFirstLetterCount(),
		'strcmp',
		$key
	);
	if ($index === false) {
		// Before the first letter
		return '';
	}

	$letter = $this->getLetterByIndex($index);
	if (!$this->useNumericCollation) {
		return $letter;
	}

	// If the sort letter is a number, return '0–9' (or localized equivalent).
	// ASCII value of 0 is 48. ASCII value of 9 is 57.
	// Note that this also applies to non-Arabic numerals since they are
	// mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
	$leadByte = ord($letter);
	if ($leadByte >= 48 && $leadByte <= 57) {
		$letter = wfMessage('category-header-numerals')->numParams(0, 9)->text();
	}

	return $letter;
}
/**
 * Mangle XML-invalid names to be valid in XML
 *
 * A name that is listed in $preserveKeys, or that already matches the
 * allowed start/continuation character sets, is returned unchanged. The
 * empty string becomes '_'. Anything else is prefixed with '_', has every
 * literal '.' rewritten to '.2E.', and has every character outside the
 * continuation set rewritten to '.HEX.' (its code point in uppercase hex).
 *
 * @param string $name
 * @param array $preserveKeys Names to not mangle
 * @return string Mangled name
 */
private static function mangleName($name, $preserveKeys = []) {
	// Character classes are built once and reused across calls.
	static $nsc = null, $nc = null;

	if (in_array($name, $preserveKeys, true)) {
		return $name;
	}

	if ($name === '') {
		return '_';
	}

	if ($nsc === null) {
		// Note we omit ':' from $nsc and $nc because it's reserved for XML
		// namespacing, and we omit '_' from $nsc (but not $nc) because we
		// reserve it.
		$nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
		$nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
	}

	// Anchor with \z rather than $: without the D modifier, $ also matches
	// just before a trailing newline, which would let a name such as "foo\n"
	// through as if it were valid.
	if (preg_match("/^[{$nsc}][{$nc}]*\\z/uS", $name)) {
		return $name;
	}

	// Dots are escaped first because '.' is itself a legal continuation
	// character and is used as the delimiter of the '.HEX.' escapes.
	return '_' . preg_replace_callback("/[^{$nc}]/uS", function ($m) {
		return sprintf('.%X.', UtfNormal\Utils::utf8ToCodepoint($m[0]));
	}, str_replace('.', '.2E.', $name));
}