예제 #1
0
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments
  * @param string $text
  * @param callable $processCallback Callback to do any variable or parameter
  *   replacements in HTML attribute values
  * @param array|bool $args Arguments for the processing callback
  * @param array $extratags For any extra tags to include
  * @param array $removetags For any tags (default or extra) to exclude
  * @return string
  */
 public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array())
 {
     extract(self::getRecognizedTagData($extratags, $removetags));
     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments($text);
     $bits = explode('<', $text);
     $text = str_replace('>', '&gt;', array_shift($bits));
     if (!MWTidy::isEnabled()) {
         $tagstack = $tablestack = array();
         foreach ($bits as $x) {
             $regs = array();
             # $slash: Does the current element start with a '/'?
             # $t: Current element name
             # $params: String between element name and >
             # $brace: Ending '>' or '/>'
             # $rest: Everything until the next element of $bits
             if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                 list(, $slash, $t, $params, $brace, $rest) = $regs;
             } else {
                 $slash = $t = $params = $brace = $rest = null;
             }
             $badtag = false;
             if (isset($htmlelements[$t = strtolower($t)])) {
                 # Check our stack
                 if ($slash && isset($htmlsingleonly[$t])) {
                     $badtag = true;
                 } elseif ($slash) {
                     # Closing a tag... is it the one we just opened?
                     MediaWiki\suppressWarnings();
                     $ot = array_pop($tagstack);
                     MediaWiki\restoreWarnings();
                     if ($ot != $t) {
                         if (isset($htmlsingleallowed[$ot])) {
                             # Pop all elements with an optional close tag
                             # and see if we find a match below them
                             $optstack = array();
                             array_push($optstack, $ot);
                             MediaWiki\suppressWarnings();
                             $ot = array_pop($tagstack);
                             MediaWiki\restoreWarnings();
                             while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                 array_push($optstack, $ot);
                                 MediaWiki\suppressWarnings();
                                 $ot = array_pop($tagstack);
                                 MediaWiki\restoreWarnings();
                             }
                             if ($t != $ot) {
                                 # No match. Push the optional elements back again
                                 $badtag = true;
                                 MediaWiki\suppressWarnings();
                                 $ot = array_pop($optstack);
                                 MediaWiki\restoreWarnings();
                                 while ($ot) {
                                     array_push($tagstack, $ot);
                                     MediaWiki\suppressWarnings();
                                     $ot = array_pop($optstack);
                                     MediaWiki\restoreWarnings();
                                 }
                             }
                         } else {
                             MediaWiki\suppressWarnings();
                             array_push($tagstack, $ot);
                             MediaWiki\restoreWarnings();
                             # <li> can be nested in <ul> or <ol>, skip those cases:
                             if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                 $badtag = true;
                             }
                         }
                     } else {
                         if ($t == 'table') {
                             $tagstack = array_pop($tablestack);
                         }
                     }
                     $newparams = '';
                 } else {
                     # Keep track for later
                     if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                         $badtag = true;
                     } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                         $badtag = true;
                         # Is it a self closed htmlpair ? (bug 5487)
                     } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                         $badtag = true;
                     } elseif (isset($htmlsingleonly[$t])) {
                         # Hack to force empty tag for unclosable elements
                         $brace = '/>';
                     } elseif (isset($htmlsingle[$t])) {
                         # Hack to not close $htmlsingle tags
                         $brace = null;
                         # Still need to push this optionally-closed tag to
                         # the tag stack so that we can match end tags
                         # instead of marking them as bad.
                         array_push($tagstack, $t);
                     } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                         // New table tag but forgot to close the previous one
                         $text .= "</{$t}>";
                     } else {
                         if ($t == 'table') {
                             array_push($tablestack, $tagstack);
                             $tagstack = array();
                         }
                         array_push($tagstack, $t);
                     }
                     # Replace any variables or template parameters with
                     # plaintext results.
                     if (is_callable($processCallback)) {
                         call_user_func_array($processCallback, array(&$params, $args));
                     }
                     if (!Sanitizer::validateTag($params, $t)) {
                         $badtag = true;
                     }
                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes($params, $t);
                 }
                 if (!$badtag) {
                     $rest = str_replace('>', '&gt;', $rest);
                     $close = $brace == '/>' && !$slash ? ' /' : '';
                     $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                     continue;
                 }
             }
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
         # Close off any remaining tags
         while (is_array($tagstack) && ($t = array_pop($tagstack))) {
             $text .= "</{$t}>\n";
             if ($t == 'table') {
                 $tagstack = array_pop($tablestack);
             }
         }
     } else {
         # this might be possible using tidy itself
         foreach ($bits as $x) {
             if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                 list(, $slash, $t, $params, $brace, $rest) = $regs;
                 $badtag = false;
                 if (isset($htmlelements[$t = strtolower($t)])) {
                     if (is_callable($processCallback)) {
                         call_user_func_array($processCallback, array(&$params, $args));
                     }
                     if (!Sanitizer::validateTag($params, $t)) {
                         $badtag = true;
                     }
                     $newparams = Sanitizer::fixTagAttributes($params, $t);
                     if (!$badtag) {
                         $rest = str_replace('>', '&gt;', $rest);
                         $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                         continue;
                     }
                 }
             }
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
     }
     return $text;
 }
예제 #2
0
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments
  * @private
  * @param string $text
  * @param callable $processCallback Callback to do any variable or parameter
  *   replacements in HTML attribute values
  * @param array $args Arguments for the processing callback
  * @param array $extratags For any extra tags to include
  * @param array $removetags For any tags (default or extra) to exclude
  * @return string
  */
 static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array())
 {
     global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
     static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
     wfProfileIn(__METHOD__);
     // Base our staticInitialised variable off of the global config state so that if the globals
     // are changed (like in the screwed up test system) we will re-initialise the settings.
     $globalContext = implode('-', compact('wgAllowMicrodataAttributes', 'wgAllowImageTag'));
     if (!$staticInitialised || $staticInitialised != $globalContext) {
         $htmlpairsStatic = array('b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn', 'kbd', 'samp', 'data', 'time', 'mark');
         $htmlsingle = array('br', 'wbr', 'hr', 'li', 'dt', 'dd');
         $htmlsingleonly = array('br', 'wbr', 'hr');
         if ($wgAllowMicrodataAttributes) {
             $htmlsingle[] = $htmlsingleonly[] = 'meta';
             $htmlsingle[] = $htmlsingleonly[] = 'link';
         }
         $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo');
         $tabletags = array('td', 'th', 'tr');
         $htmllist = array('ul', 'ol');
         $listtags = array('li');
         if ($wgAllowImageTag) {
             $htmlsingle[] = 'img';
             $htmlsingleonly[] = 'img';
         }
         $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
         $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));
         # Convert them all to hashtables for faster lookup
         $vars = array('htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic');
         foreach ($vars as $var) {
             ${$var} = array_flip(${$var});
         }
         $staticInitialised = $globalContext;
     }
     # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
     $extratags = array_flip($extratags);
     $removetags = array_flip($removetags);
     $htmlpairs = array_merge($extratags, $htmlpairsStatic);
     $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);
     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments($text);
     $bits = explode('<', $text);
     $text = str_replace('>', '&gt;', array_shift($bits));
     if (!$wgUseTidy) {
         $tagstack = $tablestack = array();
         foreach ($bits as $x) {
             $regs = array();
             # $slash: Does the current element start with a '/'?
             # $t: Current element name
             # $params: String between element name and >
             # $brace: Ending '>' or '/>'
             # $rest: Everything until the next element of $bits
             if (preg_match('!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                 list(, $slash, $t, $params, $brace, $rest) = $regs;
             } else {
                 $slash = $t = $params = $brace = $rest = null;
             }
             $badtag = false;
             if (isset($htmlelements[$t = strtolower($t)])) {
                 # Check our stack
                 if ($slash && isset($htmlsingleonly[$t])) {
                     $badtag = true;
                 } elseif ($slash) {
                     # Closing a tag... is it the one we just opened?
                     $ot = @array_pop($tagstack);
                     if ($ot != $t) {
                         if (isset($htmlsingleallowed[$ot])) {
                             # Pop all elements with an optional close tag
                             # and see if we find a match below them
                             $optstack = array();
                             array_push($optstack, $ot);
                             wfSuppressWarnings();
                             $ot = array_pop($tagstack);
                             wfRestoreWarnings();
                             while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                 array_push($optstack, $ot);
                                 wfSuppressWarnings();
                                 $ot = array_pop($tagstack);
                                 wfRestoreWarnings();
                             }
                             if ($t != $ot) {
                                 # No match. Push the optional elements back again
                                 $badtag = true;
                                 wfSuppressWarnings();
                                 $ot = array_pop($optstack);
                                 wfRestoreWarnings();
                                 while ($ot) {
                                     array_push($tagstack, $ot);
                                     wfSuppressWarnings();
                                     $ot = array_pop($optstack);
                                     wfRestoreWarnings();
                                 }
                             }
                         } else {
                             @array_push($tagstack, $ot);
                             # <li> can be nested in <ul> or <ol>, skip those cases:
                             if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                 $badtag = true;
                             }
                         }
                     } else {
                         if ($t == 'table') {
                             $tagstack = array_pop($tablestack);
                         }
                     }
                     $newparams = '';
                 } else {
                     # Keep track for later
                     if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                         $badtag = true;
                     } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                         $badtag = true;
                         # Is it a self closed htmlpair ? (bug 5487)
                     } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                         $badtag = true;
                     } elseif (isset($htmlsingleonly[$t])) {
                         # Hack to force empty tag for unclosable elements
                         $brace = '/>';
                     } elseif (isset($htmlsingle[$t])) {
                         # Hack to not close $htmlsingle tags
                         $brace = null;
                         # Still need to push this optionally-closed tag to
                         # the tag stack so that we can match end tags
                         # instead of marking them as bad.
                         array_push($tagstack, $t);
                     } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                         // New table tag but forgot to close the previous one
                         $text .= "</{$t}>";
                     } else {
                         if ($t == 'table') {
                             array_push($tablestack, $tagstack);
                             $tagstack = array();
                         }
                         array_push($tagstack, $t);
                     }
                     # Replace any variables or template parameters with
                     # plaintext results.
                     if (is_callable($processCallback)) {
                         call_user_func_array($processCallback, array(&$params, $args));
                     }
                     if (!Sanitizer::validateTag($params, $t)) {
                         $badtag = true;
                     }
                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes($params, $t);
                 }
                 if (!$badtag) {
                     $rest = str_replace('>', '&gt;', $rest);
                     $close = $brace == '/>' && !$slash ? ' /' : '';
                     $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                     continue;
                 }
             }
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
         # Close off any remaining tags
         while (is_array($tagstack) && ($t = array_pop($tagstack))) {
             $text .= "</{$t}>\n";
             if ($t == 'table') {
                 $tagstack = array_pop($tablestack);
             }
         }
     } else {
         # this might be possible using tidy itself
         foreach ($bits as $x) {
             preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs);
             @(list(, $slash, $t, $params, $brace, $rest) = $regs);
             $badtag = false;
             if (isset($htmlelements[$t = strtolower($t)])) {
                 if (is_callable($processCallback)) {
                     call_user_func_array($processCallback, array(&$params, $args));
                 }
                 if (!Sanitizer::validateTag($params, $t)) {
                     $badtag = true;
                 }
                 $newparams = Sanitizer::fixTagAttributes($params, $t);
                 if (!$badtag) {
                     $rest = str_replace('>', '&gt;', $rest);
                     $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                     continue;
                 }
             }
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
     }
     wfProfileOut(__METHOD__);
     return $text;
 }