Exemple #1
0
 /**
  * Returns the named character reference table, loading it on first use.
  *
  * The table is unserialized from the "named-character-references.ser"
  * file that sits in the same directory as this source file, and is then
  * stored in self::$namedCharacterReferences. Later calls reuse the stored
  * value for as long as it is truthy.
  *
  * @return mixed Whatever unserialize() produced for the data file.
  */
 public static function getNamedCharacterReferences()
 {
     // Already loaded: hand back the stored table.
     if (self::$namedCharacterReferences) {
         return self::$namedCharacterReferences;
     }
     $dataFile = dirname(__FILE__) . '/named-character-references.ser';
     $serialized = file_get_contents($dataFile);
     self::$namedCharacterReferences = unserialize($serialized);
     return self::$namedCharacterReferences;
 }
Exemple #2
0
 /**
  * Returns the greatest byte length found among the keys of the named
  * character reference table.
  *
  * The value is computed once and stored in
  * self::$namedCharacterReferenceMaxLength; it is recomputed only while the
  * stored value is falsy.
  *
  * NOTE(review): only the top-level keys of the table are measured. If the
  * table is a nested per-character structure this will not be the length of
  * the longest reference name - confirm against the data file.
  *
  * @return mixed The result of max() over the key lengths.
  */
 public static function getNamedCharacterReferenceMaxLength()
 {
     if (self::$namedCharacterReferenceMaxLength) {
         return self::$namedCharacterReferenceMaxLength;
     }
     $names = array_keys(self::getNamedCharacterReferences());
     self::$namedCharacterReferenceMaxLength = max(array_map('strlen', $names));
     return self::$namedCharacterReferenceMaxLength;
 }
 /**
  * Returns the named character reference table, loading it on first use.
  *
  * The serialized table is fetched over HTTP from the active theme's
  * "/core/inc/HTML5/named-character-references.ser" and stored in
  * self::$namedCharacterReferences. Later calls reuse the stored value for
  * as long as it is truthy.
  *
  * The previous implementation passed the response body through
  * json_decode() before unserialize(). A PHP-serialized payload is not valid
  * JSON, so json_decode() produced null and the table could never be loaded.
  * The body is now unserialized directly; a body that is a JSON-encoded
  * string wrapping the serialized data is still accepted.
  *
  * NOTE(review): this calls unserialize() on data received over HTTP.
  * unserialize() must not be fed untrusted input; reading the file from the
  * local filesystem would avoid both the request and that risk.
  *
  * @return mixed The unserialized table, or an empty array when the file
  *               could not be fetched or decoded.
  */
 public static function getNamedCharacterReferences()
 {
     if (!self::$namedCharacterReferences) {
         $url = get_template_directory_uri() . '/core/inc/HTML5/named-character-references.ser';
         $response = wp_remote_get(esc_url_raw($url));
         // wp_remote_retrieve_body() yields an empty string when the
         // request failed, so no separate error check is needed here.
         $body = wp_remote_retrieve_body($response);
         // Accept a JSON string wrapping the serialized data (what the old
         // code expected); otherwise treat the body itself as the payload.
         $decoded = json_decode($body, true);
         $serialized = is_string($decoded) ? $decoded : $body;
         $references = $serialized === '' ? false : unserialize($serialized);
         // An empty array is falsy, so a failed load is retried on the
         // next call, exactly as a failed load was before.
         self::$namedCharacterReferences = $references !== false ? $references : array();
     }
     return self::$namedCharacterReferences;
 }
 /**
  * Consumes a character reference from the input stream and returns the
  * text it stands for.
  *
  * The method reads the characters that follow an ampersand. When they do
  * not form a usable reference it returns a literal "&" followed by
  * whatever it consumed, so the caller can emit the text unchanged. Parse
  * errors are reported through emitToken() with type self::PARSEERROR.
  *
  * @param mixed $allowed An additional character that, when it is the next
  *                       character, means "not a character reference".
  *                       False for none.
  * @param bool  $inattr  True when the reference is inside an attribute
  *                       value; enables the rule that leaves a
  *                       semicolon-less named reference undecoded when an
  *                       ASCII letter or digit follows it.
  *
  * @return string The decoded character(s), or "&" plus the consumed text.
  */
 private function consumeCharacterReference($allowed = false, $inattr = false)
 {
     $chars = $this->stream->char();
     // EOF is tested first so that $chars[0] is never evaluated on false
     // (which raises a notice/warning on PHP 7.4 and later).
     if ($chars === false || $chars[0] === "\t" || $chars[0] === "\n" || $chars[0] === "\f" || $chars[0] === " " || $chars[0] === '<' || $chars[0] === '&' || $chars[0] === $allowed) {
         // Not a character reference: put the character back.
         $this->stream->unget();
         return '&';
     } elseif ($chars[0] === '#') {
         // Numeric reference: look at the character after "#".
         $chars .= $this->stream->char();
         if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
             $char_class = self::HEX;
             $hex = true;
         } else {
             // Not an "x": put it back and keep only the "#".
             $chars = $chars[0];
             $this->stream->unget();
             $char_class = self::DIGIT;
             $hex = false;
         }
         $consumed = $this->stream->charsWhile($char_class);
         if ($consumed === '' || $consumed === false) {
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'expected-numeric-entity'));
             return '&' . $chars;
         } else {
             // A trailing ";" is consumed; its absence is a parse error.
             if ($this->stream->char() !== ';') {
                 $this->stream->unget();
                 $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'numeric-entity-without-semicolon'));
             }
             $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
             // A truthy result is used as the replacement codepoint.
             $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
             if ($new_codepoint) {
                 $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'illegal-windows-1252-entity'));
                 return HTML5_Data::utf8chr($new_codepoint);
             } else {
                 if ($codepoint > 0x10ffff) {
                     $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'overlong-character-entity'));
                     return "�";
                 }
                 // These codepoints are still returned, but flagged as a
                 // parse error. Note that && binds tighter than ||.
                 if ($codepoint >= 0x0 && $codepoint <= 0x8 || $codepoint === 0xb || $codepoint >= 0xe && $codepoint <= 0x1f || $codepoint >= 0x7f && $codepoint <= 0x9f || $codepoint >= 0xd800 && $codepoint <= 0xdfff || $codepoint >= 0xfdd0 && $codepoint <= 0xfdef || ($codepoint & 0xfffe) === 0xfffe || $codepoint == 0x10ffff || $codepoint == 0x10fffe) {
                     $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'illegal-codepoint-for-numeric-entity'));
                 }
                 return HTML5_Data::utf8chr($codepoint);
             }
         }
     } else {
         // Named reference: descend through the reference table one
         // character at a time. $chars collects everything consumed, $id
         // is the longest prefix that has a codepoint entry.
         $refs = HTML5_Data::getNamedCharacterReferences();
         $codepoint = false;
         $char = $chars;
         while ($char !== false && isset($refs[$char])) {
             $refs = $refs[$char];
             if (isset($refs['codepoint'])) {
                 $id = $chars;
                 $codepoint = $refs['codepoint'];
             }
             $chars .= $char = $this->stream->char();
         }
         // Put back the character that ended the loop and, unless it was
         // EOF (which appended nothing), drop it from $chars.
         $this->stream->unget();
         if ($char !== false) {
             $chars = substr($chars, 0, -1);
         }
         if (!$codepoint) {
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'expected-named-entity'));
             return '&' . $chars;
         }
         $semicolon = true;
         if (substr($id, -1) !== ';') {
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'named-entity-without-semicolon'));
             $semicolon = false;
         }
         if ($inattr && !$semicolon) {
             // The next character is either the one after $id inside
             // $chars, or the next one in the stream (peeked).
             if (strlen($chars) > strlen($id)) {
                 $next = substr($chars, strlen($id), 1);
             } else {
                 $next = $this->stream->char();
                 $this->stream->unget();
             }
             // EOF (false) must be excluded explicitly: comparing a string
             // with a bool converts both sides to bool, which would make
             // false pass the '0'..'9' range test.
             if ($next !== false && ('0' <= $next && $next <= '9' || 'A' <= $next && $next <= 'Z' || 'a' <= $next && $next <= 'z')) {
                 return '&' . $chars;
             }
         }
         // Decoded character plus anything consumed past the matched name.
         return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
     }
 }
Exemple #5
0
 /**
  * Consumes a character reference from the input stream and returns the
  * text it stands for.
  *
  * The method reads the characters that follow the U+0026 AMPERSAND. When
  * they do not form a usable reference it returns a literal "&" followed by
  * whatever it consumed, so the caller can emit the text unchanged. Parse
  * errors are reported through emitToken() with type self::PARSEERROR.
  *
  * @param mixed $allowed The additional allowed character, if there is
  *                       one; when it is the next character the input is
  *                       not treated as a character reference. False for
  *                       none.
  * @param bool  $inattr  True when the reference is being consumed as part
  *                       of an attribute.
  *
  * @return string The decoded character(s), or "&" plus the consumed text.
  */
 private function consumeCharacterReference($allowed = false, $inattr = false)
 {
     // This goes quite far against spec, and is far closer to the Python
     // impl., mainly because we don't do the large unconsuming the spec
     // requires.
     // All consumed characters.
     $chars = $this->stream->char();
     /* This section defines how to consume a character
             reference. This definition is used when parsing character
             references in text and in attributes.
     
             The behavior depends on the identity of the next character
             (the one immediately after the U+0026 AMPERSAND character): */
     // EOF is tested first so that $chars[0] is never evaluated on false
     // (which raises a notice/warning on PHP 7.4 and later).
     if ($chars === false || $chars[0] === "\t" || $chars[0] === "\n" || $chars[0] === "\f" || $chars[0] === " " || $chars[0] === '<' || $chars[0] === '&' || $chars[0] === $allowed) {
         /* U+0009 CHARACTER TABULATION
               U+000A LINE FEED (LF)
               U+000C FORM FEED (FF)
               U+0020 SPACE
               U+003C LESS-THAN SIGN
               U+0026 AMPERSAND
               EOF
               The additional allowed character, if there is one
            Not a character reference. No characters are consumed,
            and nothing is returned. (This is not an error, either.) */
         // We already consumed, so unconsume.
         $this->stream->unget();
         return '&';
     } elseif ($chars[0] === '#') {
         /* Consume the U+0023 NUMBER SIGN. */
         // Um, yeah, we already did that.
         /* The behavior further depends on the character after
            the U+0023 NUMBER SIGN: */
         $chars .= $this->stream->char();
         if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
             /* U+0078 LATIN SMALL LETTER X
                U+0058 LATIN CAPITAL LETTER X */
             /* Consume the X. */
             // Um, yeah, we already did that.
             /* Follow the steps below, but using the range of
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
                NINE, U+0061 LATIN SMALL LETTER A through to U+0066
                LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
                A, through to U+0046 LATIN CAPITAL LETTER F (in other
                words, 0123456789, ABCDEF, abcdef). */
             $char_class = self::HEX;
             /* When it comes to interpreting the
                number, interpret it as a hexadecimal number. */
             $hex = true;
         } else {
             /* Anything else */
             // Unconsume because we shouldn't have consumed this.
             $chars = $chars[0];
             $this->stream->unget();
             /* Follow the steps below, but using the range of
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
                NINE (i.e. just 0123456789). */
             $char_class = self::DIGIT;
             /* When it comes to interpreting the number,
                interpret it as a decimal number. */
             $hex = false;
         }
         /* Consume as many characters as match the range of characters given above. */
         $consumed = $this->stream->charsWhile($char_class);
         if ($consumed === '' || $consumed === false) {
             /* If no characters match the range, then don't consume
                any characters (and unconsume the U+0023 NUMBER SIGN
                character and, if appropriate, the X character). This
                is a parse error; nothing is returned. */
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'expected-numeric-entity'));
             return '&' . $chars;
         } else {
             /* Otherwise, if the next character is a U+003B SEMICOLON,
                consume that too. If it isn't, there is a parse error. */
             if ($this->stream->char() !== ';') {
                 $this->stream->unget();
                 $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'numeric-entity-without-semicolon'));
             }
             /* If one or more characters match the range, then take
                them all and interpret the string of characters as a number
                (either hexadecimal or decimal as appropriate). */
             $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
             /* If that number is one of the numbers in the first column
                of the following table, then this is a parse error. Find the
                row with that number in the first column, and return a
                character token for the Unicode character given in the
                second column of that row. */
             $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
             if ($new_codepoint) {
                 $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'illegal-windows-1252-entity'));
                 return HTML5_Data::utf8chr($new_codepoint);
             } else {
                 /* Otherwise, if the number is greater than 0x10FFFF, then
                  * this is a parse error. Return a U+FFFD REPLACEMENT
                  * CHARACTER. */
                 if ($codepoint > 0x10ffff) {
                     $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'overlong-character-entity'));
                     return "�";
                 }
                 /* Otherwise, return a character token for the Unicode
                  * character whose code point is that number.  If the
                  * number is in the range 0x0001 to 0x0008,    0x000E to
                  * 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
                  * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                  * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
                  * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                  * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
                  * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                  * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
                  * or 0x10FFFF, then this is a parse error. */
                 // && has higher precedence than ||
                 if ($codepoint >= 0x0 && $codepoint <= 0x8 || $codepoint === 0xb || $codepoint >= 0xe && $codepoint <= 0x1f || $codepoint >= 0x7f && $codepoint <= 0x9f || $codepoint >= 0xd800 && $codepoint <= 0xdfff || $codepoint >= 0xfdd0 && $codepoint <= 0xfdef || ($codepoint & 0xfffe) === 0xfffe || $codepoint == 0x10ffff || $codepoint == 0x10fffe) {
                     $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'illegal-codepoint-for-numeric-entity'));
                 }
                 return HTML5_Data::utf8chr($codepoint);
             }
         }
     } else {
         /* Anything else */
         /* Consume the maximum number of characters possible,
            with the consumed characters matching one of the
            identifiers in the first column of the named character
            references table (in a case-sensitive manner). */
         // What we actually do here is consume as much as we can while it
         // matches the start of one of the identifiers in the first column.
         $refs = HTML5_Data::getNamedCharacterReferences();
         // Get the longest string which is the start of an identifier
         // ($chars) as well as the longest identifier which matches ($id)
         // and its codepoint ($codepoint).
         $codepoint = false;
         $char = $chars;
         while ($char !== false && isset($refs[$char])) {
             $refs = $refs[$char];
             if (isset($refs['codepoint'])) {
                 $id = $chars;
                 $codepoint = $refs['codepoint'];
             }
             $chars .= $char = $this->stream->char();
         }
         // Unconsume the one character we just took which caused the while
         // statement to fail. This could be anything and could cause state
         // changes (as if it matches the while loop it must be
         // alphanumeric so we can just concat it to whatever we get later).
         $this->stream->unget();
         if ($char !== false) {
             $chars = substr($chars, 0, -1);
         }
         /* If no match can be made, then this is a parse error.
            No characters are consumed, and nothing is returned. */
         if (!$codepoint) {
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'expected-named-entity'));
             return '&' . $chars;
         }
         /* If the last character matched is not a U+003B SEMICOLON
            (;), there is a parse error. */
         $semicolon = true;
         if (substr($id, -1) !== ';') {
             $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'named-entity-without-semicolon'));
             $semicolon = false;
         }
         /* If the character reference is being consumed as part of
            an attribute, and the last character matched is not a
            U+003B SEMICOLON (;), and the next character is in the
            range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
            LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
            or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
            then, for historical reasons, all the characters that were
            matched after the U+0026 AMPERSAND (&) must be unconsumed,
            and nothing is returned. */
         if ($inattr && !$semicolon) {
             // The next character is either the next character in $chars or in the stream.
             if (strlen($chars) > strlen($id)) {
                 $next = substr($chars, strlen($id), 1);
             } else {
                 $next = $this->stream->char();
                 $this->stream->unget();
             }
             // EOF (false) must be excluded explicitly: comparing a string
             // with a bool converts both sides to bool, which would make
             // false pass the '0'..'9' range test.
             if ($next !== false && ('0' <= $next && $next <= '9' || 'A' <= $next && $next <= 'Z' || 'a' <= $next && $next <= 'z')) {
                 return '&' . $chars;
             }
         }
         /* Otherwise, return a character token for the character
            corresponding to the character reference name (as given
            by the second column of the named character references table). */
         return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
     }
 }