/**
 * Returns the named character reference table.
 *
 * The table is unserialized from "named-character-references.ser", located
 * in the same directory as this file, the first time it is requested. The
 * result is stored in self::$namedCharacterReferences and reused by later
 * calls for as long as that stored value is truthy.
 *
 * @return mixed Whatever unserialize() produced for the data file.
 */
public static function getNamedCharacterReferences()
{
    // Fast path: the table has already been loaded.
    if (self::$namedCharacterReferences) {
        return self::$namedCharacterReferences;
    }

    $dataFile = dirname(__FILE__) . '/named-character-references.ser';
    $serialized = file_get_contents($dataFile);
    self::$namedCharacterReferences = unserialize($serialized);

    return self::$namedCharacterReferences;
}
/**
 * Returns the byte length of the longest top-level key in the named
 * character reference table.
 *
 * The value is computed once and stored in
 * self::$namedCharacterReferenceMaxLength; later calls return the stored
 * value as long as it is truthy.
 *
 * NOTE(review): only the keys of the outermost array are measured. If the
 * table is nested (keyed character by character), this is not the length of
 * the longest reference name - confirm against the data file.
 *
 * @return mixed Result of max() over the strlen() of every top-level key.
 */
public static function getNamedCharacterReferenceMaxLength()
{
    // Fast path: the maximum has already been computed.
    if (self::$namedCharacterReferenceMaxLength) {
        return self::$namedCharacterReferenceMaxLength;
    }

    $names = array_keys(self::getNamedCharacterReferences());
    self::$namedCharacterReferenceMaxLength = max(array_map('strlen', $names));

    return self::$namedCharacterReferenceMaxLength;
}
/**
 * Returns the named character reference table, loading it on first use.
 *
 * The table is unserialized from
 * "core/inc/HTML5/named-character-references.ser" inside the active
 * template directory and stored in self::$namedCharacterReferences so the
 * file is only read once per request.
 *
 * @return array The reference table, or an empty array when the data file
 *               is missing, unreadable, or does not unserialize to an array.
 */
public static function getNamedCharacterReferences()
{
    if (!self::$namedCharacterReferences) {
        // Read the bundled file straight from disk. The payload is consumed
        // by unserialize(), so it must not be run through json_decode()
        // first (serialized PHP is not valid JSON), and unserialize() should
        // never be fed the body of an HTTP response.
        $path = get_template_directory() . '/core/inc/HTML5/named-character-references.ser';

        $serialized = is_readable($path) ? file_get_contents($path) : false;
        if ($serialized === false) {
            return array();
        }

        $references = unserialize($serialized);
        if (!is_array($references)) {
            return array();
        }

        self::$namedCharacterReferences = $references;
    }

    return self::$namedCharacterReferences;
}
/**
 * Consumes a character reference from the input stream.
 *
 * Reading starts at the character that follows an ampersand; every
 * "not a reference" result is returned with a leading "&" so the caller can
 * emit it as literal text.
 *
 * @param mixed $allowed Additional character which, when it is the first
 *                       character read, means "not a character reference";
 *                       false when there is none.
 * @param bool  $inattr  Whether the reference is being consumed as part of
 *                       an attribute value.
 *
 * @return string The decoded character (possibly followed by extra consumed
 *                text), or "&" plus the consumed text when nothing could be
 *                decoded.
 */
private function consumeCharacterReference($allowed = false, $inattr = false)
{
    // All consumed characters.
    $chars = $this->stream->char();

    // EOF must be tested before any $chars[0] access: indexing into false
    // raises a notice/warning on PHP >= 7.4.
    if ($chars === false
        || $chars[0] === "\t"
        || $chars[0] === "\n"
        || $chars[0] === "\f"
        || $chars[0] === " "
        || $chars[0] === '<'
        || $chars[0] === '&'
        || $chars[0] === $allowed
    ) {
        // Not a character reference: put the character back.
        $this->stream->unget();
        return '&';
    }

    if ($chars[0] === '#') {
        // Numeric reference: peek at the next character to pick the radix.
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            $char_class = self::HEX;
            $hex = true;
        } else {
            // Not an "x": give the peeked character back.
            $chars = $chars[0];
            $this->stream->unget();
            $char_class = self::DIGIT;
            $hex = false;
        }

        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            return '&' . $chars;
        }

        // A trailing semicolon is consumed when present; its absence is a
        // parse error but the reference is still decoded.
        if ($this->stream->char() !== ';') {
            $this->stream->unget();
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'numeric-entity-without-semicolon'
            ));
        }

        $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

        // Some numbers are remapped to a different code point.
        $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
        if ($new_codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-windows-1252-entity'
            ));
            return HTML5_Data::utf8chr($new_codepoint);
        }

        if ($codepoint > 0x10ffff) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'overlong-character-entity'
            ));
            return "�";
        }

        // Control characters, surrogates and noncharacters are a parse
        // error but are still returned. The 0xfffe mask matches U+xFFFE and
        // U+xFFFF in every plane, including 0x10FFFE and 0x10FFFF.
        if (($codepoint >= 0x0 && $codepoint <= 0x8)
            || $codepoint === 0xb
            || ($codepoint >= 0xe && $codepoint <= 0x1f)
            || ($codepoint >= 0x7f && $codepoint <= 0x9f)
            || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)
            || ($codepoint >= 0xfdd0 && $codepoint <= 0xfdef)
            || ($codepoint & 0xfffe) === 0xfffe
        ) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-codepoint-for-numeric-entity'
            ));
        }

        return HTML5_Data::utf8chr($codepoint);
    }

    // Named reference: walk the reference table one character at a time,
    // remembering the longest complete identifier seen so far ($id) and its
    // code point.
    $refs = HTML5_Data::getNamedCharacterReferences();
    $codepoint = false;
    $id = '';
    $char = $chars;
    while ($char !== false && isset($refs[$char])) {
        $refs = $refs[$char];
        if (isset($refs['codepoint'])) {
            $id = $chars;
            $codepoint = $refs['codepoint'];
        }
        $chars .= $char = $this->stream->char();
    }

    // Give back the character that ended the walk; at EOF nothing was
    // appended to $chars, so there is nothing to trim.
    $this->stream->unget();
    if ($char !== false) {
        $chars = substr($chars, 0, -1);
    }

    if (!$codepoint) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'expected-named-entity'
        ));
        return '&' . $chars;
    }

    $semicolon = substr($id, -1) === ';';
    if (!$semicolon) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'named-entity-without-semicolon'
        ));
    }

    if ($inattr && !$semicolon) {
        // The character after the identifier is either already in $chars or
        // still in the stream.
        if (strlen($chars) > strlen($id)) {
            $next = substr($chars, strlen($id), 1);
        } else {
            $next = $this->stream->char();
            $this->stream->unget();
        }

        // $next is false at EOF. Without the explicit guard the loose
        // comparisons below are true for false ('0' <= false, false <= '9'),
        // which would wrongly leave the reference undecoded.
        if ($next !== false
            && (('0' <= $next && $next <= '9')
                || ('A' <= $next && $next <= 'Z')
                || ('a' <= $next && $next <= 'z'))
        ) {
            return '&' . $chars;
        }
    }

    // Decoded character plus anything consumed beyond the identifier.
    return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
}
/**
 * Consumes a character reference from the input stream.
 *
 * Reading starts at the character immediately after the U+0026 AMPERSAND;
 * every "not a reference" result is returned with a leading "&" so the
 * caller can emit it as literal text.
 *
 * @param mixed $allowed The additional allowed character, if there is one
 *                       (false otherwise). When it is the first character
 *                       read, the input is not a character reference.
 * @param bool  $inattr  Whether the reference is being consumed as part of
 *                       an attribute.
 *
 * @return string The decoded character (possibly followed by extra consumed
 *                text), or "&" plus the consumed text when nothing could be
 *                decoded.
 */
private function consumeCharacterReference($allowed = false, $inattr = false)
{
    // This goes quite far against spec, and is far closer to the Python
    // impl., mainly because we don't do the large unconsuming the spec
    // requires.

    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character reference. This
    definition is used when parsing character references in text and in
    attributes.

    The behavior depends on the identity of the next character (the
    one immediately after the U+0026 AMPERSAND character): */

    // The EOF test comes first: indexing into false raises a notice/warning
    // on PHP >= 7.4.
    if ($chars === false
        || $chars[0] === "\t"
        || $chars[0] === "\n"
        || $chars[0] === "\f"
        || $chars[0] === " "
        || $chars[0] === '<'
        || $chars[0] === '&'
        || $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
           U+000A LINE FEED (LF)
           U+000C FORM FEED (FF)
           U+0020 SPACE
           U+003C LESS-THAN SIGN
           U+0026 AMPERSAND
           EOF
           The additional allowed character, if there is one
        Not a character reference. No characters are consumed, and nothing
        is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    }

    if ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
               U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of characters
            U+0030 DIGIT ZERO through to U+0039 DIGIT NINE, U+0061 LATIN
            SMALL LETTER A through to U+0066 LATIN SMALL LETTER F, and
            U+0041 LATIN CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
            LETTER F (in other words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the number, interpret it as a
            hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of characters
            U+0030 DIGIT ZERO through to U+0039 DIGIT NINE (i.e. just
            0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number, interpret it as a
            decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters
        given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume any
            characters (and unconsume the U+0023 NUMBER SIGN character and,
            if appropriate, the X character). This is a parse error; nothing
            is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            return '&' . $chars;
        }

        /* Otherwise, if the next character is a U+003B SEMICOLON, consume
        that too. If it isn't, there is a parse error. */
        if ($this->stream->char() !== ';') {
            $this->stream->unget();
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'numeric-entity-without-semicolon'
            ));
        }

        /* If one or more characters match the range, then take them all
        and interpret the string of characters as a number (either
        hexadecimal or decimal as appropriate). */
        $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

        /* If that number is one of the numbers in the first column of the
        following table, then this is a parse error. Find the row with that
        number in the first column, and return a character token for the
        Unicode character given in the second column of that row. */
        $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
        if ($new_codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-windows-1252-entity'
            ));
            return HTML5_Data::utf8chr($new_codepoint);
        }

        /* Otherwise, if the number is greater than 0x10FFFF, then this is
        a parse error. Return a U+FFFD REPLACEMENT CHARACTER. */
        if ($codepoint > 0x10ffff) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'overlong-character-entity'
            ));
            return "�";
        }

        /* Otherwise, return a character token for the Unicode character
        whose code point is that number. If the number is in the range
        0x0001 to 0x0008, 0x000E to 0x001F, 0x007F to 0x009F, 0xD800 to
        0xDFFF, 0xFDD0 to 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF,
        0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
        0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF,
        0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF,
        0xFFFFE, 0xFFFFF, 0x10FFFE, or 0x10FFFF, then this is a parse
        error. */
        // The 0xfffe mask matches xFFFE and xFFFF in every plane, which
        // includes 0x10FFFE and 0x10FFFF.
        if (($codepoint >= 0x0 && $codepoint <= 0x8)
            || $codepoint === 0xb
            || ($codepoint >= 0xe && $codepoint <= 0x1f)
            || ($codepoint >= 0x7f && $codepoint <= 0x9f)
            || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)
            || ($codepoint >= 0xfdd0 && $codepoint <= 0xfdef)
            || ($codepoint & 0xfffe) === 0xfffe
        ) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-codepoint-for-numeric-entity'
            ));
        }

        return HTML5_Data::utf8chr($codepoint);
    }

    /* Anything else */

    /* Consume the maximum number of characters possible, with the consumed
    characters matching one of the identifiers in the first column of the
    named character references table (in a case-sensitive manner). */
    // What we actually do here is consume as much as we can while it
    // matches the start of one of the identifiers in the first column.
    $refs = HTML5_Data::getNamedCharacterReferences();

    // Get the longest string which is the start of an identifier
    // ($chars) as well as the longest identifier which matches ($id)
    // and its codepoint ($codepoint).
    $codepoint = false;
    $id = '';
    $char = $chars;
    while ($char !== false && isset($refs[$char])) {
        $refs = $refs[$char];
        if (isset($refs['codepoint'])) {
            $id = $chars;
            $codepoint = $refs['codepoint'];
        }
        $chars .= $char = $this->stream->char();
    }

    // Unconsume the one character we just took which caused the while
    // statement to fail. This could be anything and could cause state
    // changes (as if it matches the while loop it must be
    // alphanumeric so we can just concat it to whatever we get later).
    $this->stream->unget();
    if ($char !== false) {
        $chars = substr($chars, 0, -1);
    }

    /* If no match can be made, then this is a parse error. No characters
    are consumed, and nothing is returned. */
    if (!$codepoint) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'expected-named-entity'
        ));
        return '&' . $chars;
    }

    /* If the last character matched is not a U+003B SEMICOLON (;), there
    is a parse error. */
    $semicolon = substr($id, -1) === ';';
    if (!$semicolon) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'named-entity-without-semicolon'
        ));
    }

    /* If the character reference is being consumed as part of an
    attribute, and the last character matched is not a U+003B SEMICOLON
    (;), and the next character is in the range U+0030 DIGIT ZERO to
    U+0039 DIGIT NINE, U+0041 LATIN CAPITAL LETTER A to U+005A LATIN
    CAPITAL LETTER Z, or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL
    LETTER Z, then, for historical reasons, all the characters that were
    matched after the U+0026 AMPERSAND (&) must be unconsumed, and nothing
    is returned. */
    if ($inattr && !$semicolon) {
        // The next character is either the next character in $chars or in
        // the stream.
        if (strlen($chars) > strlen($id)) {
            $next = substr($chars, strlen($id), 1);
        } else {
            $next = $this->stream->char();
            $this->stream->unget();
        }

        // $next is false at EOF. Without the explicit guard the loose
        // comparisons below are true for false ('0' <= false, false <= '9'),
        // which would wrongly leave the reference undecoded.
        if ($next !== false
            && (('0' <= $next && $next <= '9')
                || ('A' <= $next && $next <= 'Z')
                || ('a' <= $next && $next <= 'z'))
        ) {
            return '&' . $chars;
        }
    }

    /* Otherwise, return a character token for the character corresponding
    to the character reference name (as given by the second column of the
    named character references table). */
    return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
}