/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece is matched against
 * self::ELEMENT_BITS_REGEX; pieces naming a recognised element are re-emitted
 * with their attributes passed through Sanitizer::validateTag() and
 * Sanitizer::fixTagAttributes(). Every other piece is emitted as literal
 * text, with its leading '<' written as '&lt;' and any '>' as '&gt;'.
 *
 * When MWTidy is not enabled, a stack of open elements is also maintained so
 * that mismatched close tags are rejected and elements still open at the end
 * of the input are closed.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values. It is called with the attribute
 *   string (by reference) and $args.
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null,
    $args = array(), $extratags = array(), $removetags = array()
) {
    # Brings the tag lookup tables used below into local scope: $htmlpairs,
    # $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist,
    # $listtags, $htmlsingleallowed and $htmlelements.
    # NOTE(review): getRecognizedTagData() is defined outside this view;
    # confirm that it returns exactly these keys, each as a name => index map.
    extract(self::getRecognizedTagData($extratags, $removetags));

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Everything before the first '<' is plain text: only '>' needs escaping.
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!MWTidy::isEnabled()) {
        # $tagstack holds the elements currently open; $tablestack saves the
        # enclosing $tagstack each time a <table> is opened.
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    # A close tag for an element that may only be empty
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    MediaWiki\suppressWarnings();
                    $ot = array_pop($tagstack);
                    MediaWiki\restoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            MediaWiki\suppressWarnings();
                            $ot = array_pop($tagstack);
                            MediaWiki\restoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($tagstack);
                                MediaWiki\restoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($optstack);
                                MediaWiki\restoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    MediaWiki\suppressWarnings();
                                    $ot = array_pop($optstack);
                                    MediaWiki\restoreWarnings();
                                }
                            }
                        } else {
                            # Not the element we expected: restore the stack
                            MediaWiki\suppressWarnings();
                            array_push($tagstack, $ot);
                            MediaWiki\restoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: resume the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    # Close tags never carry attributes
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Start a fresh stack for the table's contents
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = ($brace == '/>' && !$slash) ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }

            # Unrecognised or rejected markup: emit it as literal text so it
            # can never be interpreted as a tag.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }

            # Unrecognised or rejected markup: emit it as literal text.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Pieces naming a recognised element are re-emitted
 * with their attributes passed through Sanitizer::validateTag() and
 * Sanitizer::fixTagAttributes(). Every other piece is emitted as literal
 * text, with its leading '<' written as '&lt;' and any '>' as '&gt;'.
 *
 * When $wgUseTidy is false, a stack of open elements is also maintained so
 * that mismatched close tags are rejected and elements still open at the end
 * of the input are closed.
 *
 * @private
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values. It is called with the attribute
 *   string (by reference) and $args.
 * @param array $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
static function removeHTMLtags($text, $processCallback = null,
    $args = array(), $extratags = array(), $removetags = array()
) {
    global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    wfProfileIn(__METHOD__);

    // Base our staticInitialised variable off of the global config state so that if the globals
    // are changed (like in the screwed up test system) we will re-initialise the settings.
    $globalContext = implode('-', compact('wgAllowMicrodataAttributes', 'wgAllowImageTag'));
    if (!$staticInitialised || $staticInitialised != $globalContext) {
        # Elements that must be written as an open/close pair
        $htmlpairsStatic = array(
            'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
            'kbd', 'samp', 'data', 'time', 'mark'
        );
        # Elements whose close tag is optional
        $htmlsingle = array(
            'br', 'wbr', 'hr', 'li', 'dt', 'dd'
        );
        # Elements that can never have a close tag
        $htmlsingleonly = array(
            'br', 'wbr', 'hr'
        );
        if ($wgAllowMicrodataAttributes) {
            $htmlsingle[] = $htmlsingleonly[] = 'meta';
            $htmlsingle[] = $htmlsingleonly[] = 'link';
        }
        # Elements that may appear inside an element of the same name
        $htmlnest = array(
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
            'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
        );
        # Elements only accepted while a <table> is open
        $tabletags = array(
            'td', 'th', 'tr',
        );
        $htmllist = array(
            'ul', 'ol',
        );
        $listtags = array(
            'li',
        );

        if ($wgAllowImageTag) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
        $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));

        # Convert them all to hashtables for faster lookup
        $vars = array('htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic');
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = $globalContext;
    }

    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip($extratags);
    $removetags = array_flip($removetags);
    $htmlpairs = array_merge($extratags, $htmlpairsStatic);
    $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Everything before the first '<' is plain text: only '>' needs escaping.
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!$wgUseTidy) {
        # $tagstack holds the elements currently open; $tablestack saves the
        # enclosing $tagstack each time a <table> is opened.
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match('!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    # A close tag for an element that may only be empty
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    wfSuppressWarnings();
                    $ot = array_pop($tagstack);
                    wfRestoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            wfSuppressWarnings();
                            $ot = array_pop($tagstack);
                            wfRestoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                wfSuppressWarnings();
                                $ot = array_pop($tagstack);
                                wfRestoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                wfSuppressWarnings();
                                $ot = array_pop($optstack);
                                wfRestoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    wfSuppressWarnings();
                                    $ot = array_pop($optstack);
                                    wfRestoreWarnings();
                                }
                            }
                        } else {
                            # Not the element we expected: restore the stack
                            wfSuppressWarnings();
                            array_push($tagstack, $ot);
                            wfRestoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: resume the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    # Close tags never carry attributes
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Start a fresh stack for the table's contents
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = ($brace == '/>' && !$slash) ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }

            # Unrecognised or rejected markup: emit it as literal text so it
            # can never be interpreted as a tag.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            # Only destructure when the piece really looks like a tag; a
            # non-matching piece falls straight through to the escaped output.
            if (preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }

            # Unrecognised or rejected markup: emit it as literal text.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    wfProfileOut(__METHOD__);

    return $text;
}