/**
 * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
 *
 * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
 * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
 *
 * This code does five things:
 * - Removes characters and constructs that can trick browsers.
 * - Makes sure all HTML entities are well-formed.
 * - Makes sure all HTML tags and attributes are well-formed.
 * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
 *   javascript:).
 * - Marks the sanitized, XSS-safe version of $string as safe markup for
 *   rendering.
 *
 * Entity handling works in two passes: every ampersand is first escaped to
 * "&amp;", and then only entities matching one of the well-formed patterns
 * (decimal, hexadecimal, named) are converted back.
 *
 * @param $string
 *   The string with raw HTML in it. It will be stripped of everything that
 *   can cause an XSS attack.
 * @param array $html_tags
 *   An array of HTML tags.
 * @param bool $mode
 *   (optional) Defaults to FILTER_MODE_WHITELIST ($html_tags is used as a
 *   whitelist of allowed tags), but can also be set to FILTER_MODE_BLACKLIST
 *   ($html_tags is used as a blacklist of disallowed tags).
 *
 * @return string
 *   An XSS safe version of $string, or an empty string if $string is not
 *   valid UTF-8.
 *
 * @see \Drupal\Component\Utility\Unicode::validateUtf8()
 * @see \Drupal\Component\Utility\SafeMarkup
 *
 * @ingroup sanitization
 */
public static function filter($string, $html_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd'), $mode = Xss::FILTER_MODE_WHITELIST) {
  // Only operate on valid UTF-8 strings. This is necessary to prevent cross
  // site scripting issues on Internet Explorer 6.
  if (!Unicode::validateUtf8($string)) {
    return '';
  }
  // Remove NULL characters (ignored by some browsers).
  $string = str_replace(chr(0), '', $string);
  // Remove Netscape 4 JS entities.
  $string = preg_replace('%&\\s*\\{[^}]*(\\}\\s*;?|$)%', '', $string);

  // Defuse all HTML entities by escaping every ampersand.
  $string = str_replace('&', '&amp;', $string);
  // Change back only well-formed entities in our whitelist:
  // Decimal numeric entities.
  $string = preg_replace('/&amp;#([0-9]+;)/', '&#\\1', $string);
  // Hexadecimal numeric entities. Leading zeros are dropped and the "x" is
  // normalized to lower case.
  $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\\1', $string);
  // Named entities.
  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\\1', $string);

  // Flip the tag list so that tag names become array keys.
  $html_tags = array_flip($html_tags);
  // Each matched fragment (first capture group) is handed to split() together
  // with the tag list and the filter mode.
  $splitter = function ($matches) use ($html_tags, $mode) {
    return static::split($matches[1], $html_tags, $mode);
  };

  // The pattern uses the "x" (extended) modifier, so every "#" comment runs
  // to the end of its line; the line breaks inside the pattern are required.
  return SafeMarkup::set(preg_replace_callback('%
    (
    <(?=[^a-zA-Z!/])  # a lone <
    |                 # or
    <!--.*?-->        # a comment
    |                 # or
    <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
    |                 # or
    >                 # just a >
    )%x', $splitter, $string));
}
/**
 * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
 *
 * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
 * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
 *
 * This code does four things:
 * - Removes characters and constructs that can trick browsers.
 * - Makes sure all HTML entities are well-formed.
 * - Makes sure all HTML tags and attributes are well-formed.
 * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
 *   javascript:).
 *
 * Entity handling works in two passes: every ampersand is first escaped to
 * "&amp;", and then only entities matching one of the well-formed patterns
 * (decimal, hexadecimal, named) are converted back.
 *
 * @param $string
 *   The string with raw HTML in it. It will be stripped of everything that
 *   can cause an XSS attack.
 * @param array $html_tags
 *   (optional) An array of HTML tags. When omitted or NULL, the class's
 *   static $htmlTags list is used.
 *
 * @return string
 *   An XSS safe version of $string, or an empty string if $string is not
 *   valid UTF-8.
 *
 * @see \Drupal\Component\Utility\Unicode::validateUtf8()
 *
 * @ingroup sanitization
 */
public static function filter($string, array $html_tags = NULL) {
  // Fall back to the class-level default tag list.
  if (is_null($html_tags)) {
    $html_tags = static::$htmlTags;
  }
  // Only operate on valid UTF-8 strings. This is necessary to prevent cross
  // site scripting issues on Internet Explorer 6.
  if (!Unicode::validateUtf8($string)) {
    return '';
  }
  // Remove NULL characters (ignored by some browsers).
  $string = str_replace(chr(0), '', $string);
  // Remove Netscape 4 JS entities.
  $string = preg_replace('%&\\s*\\{[^}]*(\\}\\s*;?|$)%', '', $string);

  // Defuse all HTML entities by escaping every ampersand.
  $string = str_replace('&', '&amp;', $string);
  // Change back only well-formed entities in our whitelist:
  // Decimal numeric entities.
  $string = preg_replace('/&amp;#([0-9]+;)/', '&#\\1', $string);
  // Hexadecimal numeric entities. Leading zeros are dropped and the "x" is
  // normalized to lower case.
  $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\\1', $string);
  // Named entities.
  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\\1', $string);

  // Flip the tag list so that tag names become array keys.
  $html_tags = array_flip($html_tags);
  // Late static binding does not work inside anonymous functions.
  $class = get_called_class();
  // Each matched fragment (first capture group) is handed to the called
  // class's split() together with the tag list and the class name.
  $splitter = function ($matches) use ($html_tags, $class) {
    return $class::split($matches[1], $html_tags, $class);
  };

  // Strip any tags that are not in the whitelist. The result is returned as a
  // plain string; this method does not mark it as safe markup.
  // The pattern uses the "x" (extended) modifier, so every "#" comment runs
  // to the end of its line; the line breaks inside the pattern are required.
  return preg_replace_callback('%
    (
    <(?=[^a-zA-Z!/])  # a lone <
    |                 # or
    <!--.*?-->        # a comment
    |                 # or
    <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
    |                 # or
    >                 # just a >
    )%x', $splitter, $string);
}