/**
 * Filter an element's attributes, supplied as a name => value array.
 *
 * This is a convenience wrapper: it looks up the attribute whitelist for
 * $element via Sanitizer::attributeWhitelist() and passes it, together
 * with $attribs, to Sanitizer::validateAttributes(), which does the
 * actual filtering. Whatever that method returns is returned unchanged.
 *
 * NOTE(review): the precise rules (whitelist filtering, style checks,
 * id re-encoding) live in validateAttributes(), not here - confirm there.
 *
 * @param array $attribs Attribute names mapped to their values
 * @param string $element Element name used to select the whitelist
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateTagAttributes($attribs, $element) {
    $whitelist = Sanitizer::attributeWhitelist($element);

    return Sanitizer::validateAttributes($attribs, $whitelist);
}
/**
 * Filter an element's attributes, supplied as a name => value array.
 *
 * - Attributes whose name is not on the element's whitelist are dropped
 * - A style attribute is passed through Sanitizer::checkCss() and dropped
 *   when that call returns false
 * - An id attribute is passed through Sanitizer::escapeId()
 * - The result holds at most one entry per attribute name
 *
 * @param array $attribs Attribute names mapped to their values
 * @param string $element Element name used to select the whitelist
 * @return array The surviving attributes, name => value
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateTagAttributes($attribs, $element) {
    // Flip the whitelist so that membership is a cheap isset() lookup.
    $allowed = array_flip(Sanitizer::attributeWhitelist($element));
    $clean = array();

    foreach ($attribs as $name => $val) {
        $permitted = isset($allowed[$name]);

        if ($permitted && $name == 'style') {
            // Strip javascript "expression" from stylesheets.
            // http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            $val = Sanitizer::checkCss($val);
            // checkCss() signals a rejected value by returning false.
            $permitted = ($val !== false);
        } elseif ($permitted && $name === 'id') {
            $val = Sanitizer::escapeId($val);
        }

        if ($permitted) {
            // Keyed by name, so a repeated attribute simply replaces the
            // earlier value.
            $clean[$name] = $val;
        }
    }

    return $clean;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Attribute names are lowercased
 * - Attributes not on the whitelist for the given element are discarded
 * - Values are extracted by Sanitizer::getTagAttributeCallback() and
 *   normalized by Sanitizer::normalizeAttributeValue()
 * - style attributes matching the unsafe-CSS pattern are discarded
 * - Characters and words that later wikitext parsing could expand are
 *   replaced by numeric character references
 * - Every value is emitted inside double quotes
 * - When an attribute is repeated, the last occurrence wins
 * - A leading space is prepended if at least one attribute survives
 *
 * @param string $text Raw attribute text taken from inside the tag
 * @param string $element Element name used to select the whitelist
 * @return string Either '' or ' name="value" name="value" ...'
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes($text, $element) {
    global $wgUrlProtocols;

    if (trim($text) === '') {
        return '';
    }

    // MW_ATTRIBS_REGEX splits the text into one match set per attribute;
    // unquoted values can be anything distinguishable from the end of
    // the attribute, since everything is re-quoted below.
    $pairs = array();
    if (!preg_match_all(MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER)) {
        return '';
    }

    // Flip the whitelist so that membership is a cheap isset() lookup.
    $whitelist = array_flip(Sanitizer::attributeWhitelist($element));
    $attribs = array();

    foreach ($pairs as $set) {
        $attribute = strtolower($set[1]);
        if (!isset($whitelist[$attribute])) {
            continue;
        }

        $raw = Sanitizer::getTagAttributeCallback($set);
        $value = Sanitizer::normalizeAttributeValue($raw);

        // Strip javascript "expression" from stylesheets.
        // http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        // The pattern is tested against the entity-decoded value and also
        // rejects "url(" and anything of the form "...tp://" / "...tps://".
        if ($attribute === 'style'
            && preg_match(
                '/(expression|tps*:\\/\\/|url\\s*\\().*/is',
                Sanitizer::decodeCharReferences($value)
            )
        ) {
            // Unsafe style: drop the whole attribute.
            continue;
        }

        // Templates and links may be expanded in later parsing,
        // creating invalid or dangerous output. Suppress this by turning
        // the trigger characters into numeric character references, which
        // render the same in HTML but are inert as wikitext.
        $value = strtr($value, array(
            '{'    => '&#123;',
            '['    => '&#91;',
            "''"   => '&#39;&#39;',
            'ISBN' => '&#73;SBN',
            'RFC'  => '&#82;FC',
            'PMID' => '&#80;MID',
        ));

        // Stupid hack: armor anything that looks like a URL protocol.
        // NOTE(review): assumes $wgUrlProtocols is a ready-made regex
        // alternation that is safe inside a '/'-delimited pattern - confirm.
        $value = preg_replace_callback(
            '/(' . $wgUrlProtocols . ')/',
            array('Sanitizer', 'armorLinksCallback'),
            $value
        );

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $attribs[$attribute] = "{$attribute}=\"{$value}\"";
    }

    if (empty($attribs)) {
        return '';
    }

    return ' ' . implode(' ', $attribs);
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Attribute names are lowercased
 * - Attributes not on the whitelist for the given element are discarded
 * - Values are extracted by Sanitizer::getTagAttributeCallback() and
 *   normalized by Sanitizer::normalizeAttributeValue()
 * - style values have CSS comments removed and are HTML-escaped; the
 *   attribute is discarded if, after CSS escapes and backslashes are
 *   resolved, it matches the unsafe-CSS pattern
 * - id values are passed through Sanitizer::escapeId()
 * - Markup characters and words that later wikitext parsing could expand
 *   are replaced by character references
 * - Every value is emitted inside double quotes
 * - When an attribute is repeated, the last occurrence wins
 * - A leading space is prepended if at least one attribute survives
 *
 * @param string $text Raw attribute text taken from inside the tag
 * @param string $element Element name used to select the whitelist
 * @return string Either '' or ' name="value" name="value" ...'
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes($text, $element) {
    if (trim($text) === '') {
        return '';
    }

    // MW_ATTRIBS_REGEX splits the text into one match set per attribute;
    // unquoted values can be anything distinguishable from the end of
    // the attribute, since everything is re-quoted below.
    $pairs = array();
    if (!preg_match_all(MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER)) {
        return '';
    }

    // Flip the whitelist so that membership is a cheap isset() lookup.
    $whitelist = array_flip(Sanitizer::attributeWhitelist($element));
    $attribs = array();

    foreach ($pairs as $set) {
        $attribute = strtolower($set[1]);
        if (!isset($whitelist[$attribute])) {
            continue;
        }

        $raw = Sanitizer::getTagAttributeCallback($set);
        $value = Sanitizer::normalizeAttributeValue($raw);

        // Strip javascript "expression" from stylesheets.
        // http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ($attribute === 'style') {
            $stripped = Sanitizer::decodeCharReferences($value);

            // Remove any comments; IE gets token splitting wrong
            $stripped = preg_replace('!/\\*.*?\\*/!S', ' ', $stripped);

            // The comment-free text is what gets emitted ...
            $value = htmlspecialchars($stripped);

            // ... while the checks continue on a further-decoded copy:
            // resolve CSS hex escapes (backslash, 1-6 hex digits, optional
            // trailing whitespace character), then drop leftover backslashes.
            // A callback is used instead of the /e modifier, which PHP
            // deprecated in 5.5 and removed in 7.0.
            $stripped = preg_replace_callback(
                '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
                array('Sanitizer', 'decodeCssHexEscapeCallback'),
                $stripped
            );
            $stripped = str_replace('\\', '', $stripped);

            if (preg_match('/(expression|tps*:\\/\\/|url\\s*\\().*/is', $stripped)) {
                // Unsafe style: drop the whole attribute.
                continue;
            }
        }

        if ($attribute === 'id') {
            $value = Sanitizer::escapeId($value);
        }

        // Templates and links may be expanded in later parsing,
        // creating invalid or dangerous output. Suppress this by turning
        // markup and wikitext trigger characters into character
        // references, which render the same in HTML but are inert as
        // wikitext and cannot terminate the quoted attribute.
        $value = strtr($value, array(
            '<'    => '&lt;',
            '>'    => '&gt;',
            '"'    => '&quot;',
            '{'    => '&#123;',
            '['    => '&#91;',
            "''"   => '&#39;&#39;',
            'ISBN' => '&#73;SBN',
            'RFC'  => '&#82;FC',
            'PMID' => '&#80;MID',
        ));

        // Stupid hack: armor anything that looks like a URL protocol.
        // NOTE(review): assumes wfUrlProtocols() returns a regex
        // alternation that is safe inside a '/'-delimited pattern - confirm.
        $value = preg_replace_callback(
            '/(' . wfUrlProtocols() . ')/',
            array('Sanitizer', 'armorLinksCallback'),
            $value
        );

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $attribs[$attribute] = "{$attribute}=\"{$value}\"";
    }

    return count($attribs) ? ' ' . implode(' ', $attribs) : '';
}

/**
 * preg_replace_callback() handler used by fixTagAttributes(): convert one
 * CSS hexadecimal escape into the string codepointToUtf8() yields for it.
 *
 * @param array $matches Match data; $matches[1] holds the hex digits
 * @return string
 */
static function decodeCssHexEscapeCallback($matches) {
    return codepointToUtf8(hexdec($matches[1]));
}