Пример #1
0
 /**
  * Lazily parse the stored HTML into a DOM tree and return it.
  *
  * The document is built from $this->html on the first call and cached in
  * $this->doc, so subsequent calls return the same instance. While parsing,
  * libxml warnings are collected internally instead of being emitted; the
  * global libxml settings in effect before the call are restored afterwards.
  *
  * @return DOMDocument DOM to manipulate
  */
 public function getDoc()
 {
     if (!$this->doc) {
         // DOMDocument::loadHTML apparently isn't very good with encodings, so
         // convert input to ASCII by encoding everything above 128 as entities.
         if (function_exists('mb_convert_encoding')) {
             $html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
         } else {
             $html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
                 return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
             }, $this->html);
         }
         // Workaround for bug that caused spaces before references
         // to disappear during processing: https://phabricator.wikimedia.org/T55086
         // TODO: Please replace with a better fix if one can be found.
         $html = str_replace(' <', '&#32;<', $html);

         // libxml_use_internal_errors() returns the previous setting; keep it
         // so the caller's error-handling mode can be put back afterwards
         // instead of being forced to "off".
         $previousErrorMode = libxml_use_internal_errors(true);
         // libxml_disable_entity_loader() is deprecated as of PHP 8.0 (external
         // entity loading is already off by default with libxml >= 2.9.0), so
         // only toggle it on older runtimes to avoid a deprecation notice.
         $toggleEntityLoader = PHP_VERSION_ID < 80000;
         $previousLoaderState = false;
         if ($toggleEntityLoader) {
             $previousLoaderState = libxml_disable_entity_loader(true);
         }

         $this->doc = new DOMDocument();
         $this->doc->strictErrorChecking = false;
         $this->doc->loadHTML($html);

         if ($toggleEntityLoader) {
             libxml_disable_entity_loader($previousLoaderState);
         }
         // Drop the parse warnings gathered above so they neither pile up nor
         // leak to a caller that had internal error collection switched on.
         libxml_clear_errors();
         libxml_use_internal_errors($previousErrorMode);
         $this->doc->encoding = 'UTF-8';
     }
     return $this->doc;
 }
Пример #2
0
 /**
  * Extract the leading UTF-8 character of a string.
  *
  * A character is recognised by its lead byte followed by the matching
  * number of continuation bytes (one to four bytes in total). When the
  * character is a three-byte sequence whose code point lies in the range
  * 0xAC00..0xD7A3, the jamo associated with that sub-range is returned
  * instead of the character itself.
  *
  * @param string $s
  * @return string The first character (or its jamo), or '' when the string
  *   does not begin with a recognised sequence
  */
 function firstChar($s)
 {
     $found = array();
     $leadingCharPattern = '/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/';
     preg_match($leadingCharPattern, $s, $found);

     // Nothing recognisable at the start of the string.
     if (!isset($found[1])) {
         return '';
     }

     $char = $found[1];
     // Only three-byte sequences can fall in the syllable range handled below.
     if (strlen($char) != 3) {
         return $char;
     }

     // Break down Hangul syllables to grab the first jamo
     $code = UtfNormal\Utils::utf8ToCodepoint($char);
     if ($code < 0xac00 || 0xd7a4 <= $code) {
         return $char;
     }

     // Exclusive upper bound of each range => jamo returned for that range,
     // listed in ascending order so the first bound above $code wins.
     $jamoByUpperBound = array(
         0xb098 => "ㄱ",
         0xb2e4 => "ㄴ",
         0xb77c => "ㄷ",
         0xb9c8 => "ㄹ",
         0xbc14 => "ㅁ",
         0xc0ac => "ㅂ",
         0xc544 => "ㅅ",
         0xc790 => "ㅇ",
         0xcc28 => "ㅈ",
         0xce74 => "ㅊ",
         0xd0c0 => "ㅋ",
         0xd30c => "ㅌ",
         0xd558 => "ㅍ",
     );
     foreach ($jamoByUpperBound as $upperBound => $jamo) {
         if ($code < $upperBound) {
             return $jamo;
         }
     }

     // Everything from 0xd558 up to the end of the range.
     return "ㅎ";
 }
Пример #3
0
    /**
     * Normalize CSS into a format we can easily search for hostile input
     *  - decode character references
     *  - decode escape sequences
     *  - convert characters that IE6 interprets into ascii
     *  - remove comments, unless the entire value is one single comment
     *
     * The steps run in the order listed above; later steps rely on earlier
     * ones having already decoded the input.
     *
     * @param string $value the css string
     * @return string normalized css
     */
    public static function normalizeCss($value)
    {
        // Decode character references like &#123;
        $value = Sanitizer::decodeCharReferences($value);
        // Decode escape sequences and line continuation
        // See the grammar in the CSS 2 spec, appendix D.
        // This has to be done AFTER decoding character references.
        // This means it isn't possible for this function to return
        // unsanitized escape sequences. It is possible to manufacture
        // input that contains character references that decode to
        // escape sequences that decode to character references, but
        // it's OK for the return value to contain character references
        // because the caller is supposed to escape those anyway.
        static $decodeRegex;
        if (!$decodeRegex) {
            // Built once and cached in the static for subsequent calls.
            $space = '[\\x20\\t\\r\\n\\f]';
            $nl = '(?:\\n|\\r\\n|\\r|\\f)';
            $backslash = '\\\\';
            $decodeRegex = "/ {$backslash}\n\t\t\t\t(?:\n\t\t\t\t\t({$nl}) |  # 1. Line continuation\n\t\t\t\t\t([0-9A-Fa-f]{1,6}){$space}? |  # 2. character number\n\t\t\t\t\t(.) | # 3. backslash cancelling special meaning\n\t\t\t\t\t() | # 4. backslash at end of string\n\t\t\t\t)/xu";
        }
        $value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);
        // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
        // The class covers U+FF01-U+FF3B and U+FF3D-U+FF5A (U+FF3C, the fullwidth
        // backslash, is skipped). It is written with \x{...} escapes rather than
        // literal fullwidth characters so that re-encoding or Unicode
        // normalization of this file cannot silently turn the range into ASCII.
        $value = preg_replace_callback('/[\\x{FF01}-\\x{FF3B}\\x{FF3D}-\\x{FF5A}]/u', function ($matches) {
            $cp = UtfNormal\Utils::utf8ToCodepoint($matches[0]);
            if ($cp === false) {
                return '';
            }
            // 65248 is 0xFEE0, the distance between a fullwidth character and
            // its ASCII counterpart; the result lands in \x21-\x7A.
            return chr($cp - 65248);
        }, $value);
        // Convert more characters IE6 might treat as ascii
        // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
        $value = str_replace(array('ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍'), array('r', 'n', 'n', 'l', 'i', '(', '('), $value);
        // Let the value through if it's nothing but a single comment, to
        // allow other functions which may reject it to pass some error
        // message through.
        if (!preg_match('! ^ \\s* /\\* [^*\\/]* \\*/ \\s* $ !x', $value)) {
            // Remove any comments; IE gets token splitting wrong
            // This must be done AFTER decoding character references and
            // escape sequences, because those steps can introduce comments
            // This step cannot introduce character references or escape
            // sequences, because it replaces comments with spaces rather
            // than removing them completely.
            $value = StringUtils::delimiterReplace('/*', '*/', ' ', $value);
            // Remove anything after a comment-start token, to guard against
            // incorrect client implementations.
            $commentPos = strpos($value, '/*');
            if ($commentPos !== false) {
                $value = substr($value, 0, $commentPos);
            }
        }
        // S followed by repeat, iteration, or prolonged sound marks,
        // which IE will treat as "ss"
        $value = preg_replace('/s(?:
				\\xE3\\x80\\xB1 | # U+3031
				\\xE3\\x82\\x9D | # U+309D
				\\xE3\\x83\\xBC | # U+30FC
				\\xE3\\x83\\xBD | # U+30FD
				\\xEF\\xB9\\xBC | # U+FE7C
				\\xEF\\xB9\\xBD | # U+FE7D
				\\xEF\\xBD\\xB0   # U+FF70
			)/ix', 'ss', $value);
        return $value;
    }
Пример #4
0
 /**
  * Work out the heading letter under which a string is filed.
  *
  * @param string $string Value to classify; it is cast to a string first
  * @return string The first character itself when it is a non-ASCII
  *   character that self::isCjk() accepts, otherwise the letter selected by
  *   the sort-key lookup; '' for empty input or when the lookup finds nothing
  */
 function getFirstLetter($string)
 {
     $string = strval($string);
     if ($string === '') {
         return '';
     }

     // Check for CJK: such characters are returned as their own heading.
     $initial = mb_substr($string, 0, 1, 'UTF-8');
     $startsWithHighByte = ord($initial) > 0x7f;
     if ($startsWithHighByte && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($initial))) {
         return $initial;
     }

     $key = $this->getPrimarySortKey($string);

     // Do a binary search to find the correct letter to sort under
     $index = ArrayUtils::findLowerBound(
         array($this, 'getSortKeyByLetterIndex'),
         $this->getFirstLetterCount(),
         'strcmp',
         $key
     );

     // A false result means the string sorts before the first letter.
     return $index === false ? '' : $this->getLetterByIndex($index);
 }
Пример #5
0
 /**
  * Work out the heading letter under which a string is filed.
  *
  * @param string $string Value to classify; it is cast to a string first
  * @return string The first character itself when it is a non-ASCII
  *   character that self::isCjk() accepts; otherwise the letter selected by
  *   the sort-key lookup, replaced by the 'category-header-numerals' message
  *   text when numeric collation is on and that letter starts with an ASCII
  *   digit; '' for empty input or when the lookup finds nothing
  */
 public function getFirstLetter($string)
 {
     $string = strval($string);
     if ($string === '') {
         return '';
     }

     $initial = mb_substr($string, 0, 1, 'UTF-8');
     // If the first character is a CJK character, just return that character.
     $startsWithHighByte = ord($initial) > 0x7f;
     if ($startsWithHighByte && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($initial))) {
         return $initial;
     }

     $key = $this->getPrimarySortKey($string);

     // Do a binary search to find the correct letter to sort under
     $index = ArrayUtils::findLowerBound(
         [$this, 'getSortKeyByLetterIndex'],
         $this->getFirstLetterCount(),
         'strcmp',
         $key
     );
     if ($index === false) {
         // Before the first letter
         return '';
     }

     $letter = $this->getLetterByIndex($index);
     if (!$this->useNumericCollation) {
         return $letter;
     }

     // If the sort letter is a number, return '0–9' (or localized equivalent).
     // ASCII value of 0 is 48. ASCII value of 9 is 57.
     // Note that this also applies to non-Arabic numerals since they are
     // mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
     $leadingByte = ord($letter);
     if ($leadingByte >= 48 && $leadingByte <= 57) {
         return wfMessage('category-header-numerals')->numParams(0, 9)->text();
     }

     return $letter;
 }
Пример #6
0
 /**
  * Turn an arbitrary name into one that is valid as an XML name.
  *
  * Names listed in $preserveKeys, and names that already match the allowed
  * pattern, are returned untouched. The empty string becomes '_'. Anything
  * else is prefixed with '_' and has each '.' replaced by '.2E.' and each
  * remaining disallowed character replaced by '.<HEX>.', where <HEX> is the
  * character's code point in upper-case hexadecimal.
  *
  * @param string $name
  * @param array $preserveKeys Names to not mangle
  * @return string Mangled name
  */
 private static function mangleName($name, $preserveKeys = [])
 {
     // Character-class bodies, built on first use and kept for later calls:
     // $startChars for the first character, $nameChars for the rest.
     static $startChars = null, $nameChars = null;

     if (in_array($name, $preserveKeys, true)) {
         return $name;
     }
     if ($name === '') {
         return '_';
     }

     if ($startChars === null) {
         // Note we omit ':' from both classes because it's reserved for XML
         // namespacing, and we omit '_' from $startChars (but not from
         // $nameChars) because we reserve it.
         $startChars = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
         $nameChars = $startChars . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
     }

     // Already a valid name: nothing to do.
     if (preg_match("/^[{$startChars}][{$nameChars}]*\$/uS", $name)) {
         return $name;
     }

     // Escape literal dots first so they cannot be confused with the dots
     // that delimit the hex escapes produced below.
     $dotEscaped = str_replace('.', '.2E.', $name);
     $toHexEscape = function ($m) {
         return sprintf('.%X.', UtfNormal\Utils::utf8ToCodepoint($m[0]));
     };

     return '_' . preg_replace_callback("/[^{$nameChars}]/uS", $toHexEscape, $dotEscaped);
 }