/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as a recognised element
 * and passes the checks below is re-emitted with sanitised attributes.
 * Every other piece is emitted as text, with its leading '<' written as
 * "&lt;" and any '>' written as "&gt;" so that it cannot form a tag.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array())
{
    # Brings the tag lookup tables read below into local scope:
    # $htmlelements, $htmlpairs, $htmlsingle, $htmlsingleonly,
    # $htmlsingleallowed, $htmlnest, $tabletags, $htmllist, $listtags.
    # They are only ever probed with isset(), i.e. keyed by tag name.
    extract(self::getRecognizedTagData($extratags, $removetags));

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Text before the first '<' can hold no tag; only stray '>' need escaping
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!MWTidy::isEnabled()) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    # Elements that can never have content have no closing tag
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    # NOTE(review): warnings are suppressed around the stack
                    # operations, presumably because $tagstack can be null
                    # after popping an empty $tablestack - confirm.
                    MediaWiki\suppressWarnings();
                    $ot = array_pop($tagstack);
                    MediaWiki\restoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            MediaWiki\suppressWarnings();
                            $ot = array_pop($tagstack);
                            MediaWiki\restoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($tagstack);
                                MediaWiki\restoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                # NOTE(review): the last non-matching element
                                # popped above is not pushed back here and
                                # looks like it is dropped from the stack.
                                $badtag = true;
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($optstack);
                                MediaWiki\restoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    MediaWiki\suppressWarnings();
                                    $ot = array_pop($optstack);
                                    MediaWiki\restoreWarnings();
                                }
                            }
                        } else {
                            # Not the element we opened last: put it back
                            MediaWiki\suppressWarnings();
                            array_push($tagstack, $ot);
                            MediaWiki\restoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Each table gets its own tag stack
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            # Unrecognised or rejected tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        # No nesting checks in this branch; only per-tag validation
        foreach ($bits as $x) {
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }
            # Unrecognised or rejected tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    return $text;
}
/**
 * Separate the summary of a wikilog article from its contents.
 *
 * When the parser output carries a wikilog summary, that summary is used.
 * Otherwise the text preceding the first heading element (<h1>..<h6>) of
 * the content is taken as the summary, if such a heading exists.
 *
 * @param $parserOutput ParserOutput object.
 * @return Two-element array with summary and content. Summary may be
 *   NULL if nonexistent.
 */
public static function splitSummaryContent( $parserOutput ) {
	global $wgUseTidy;

	$content = Sanitizer::removeHTMLcomments( $parserOutput->getText() );

	# Parser output contains wikilog output and summary, use it.
	$hasParsedSummary = isset( $parserOutput->mExtWikilog )
		&& $parserOutput->mExtWikilog->mSummary;
	if ( $hasParsedSummary ) {
		$parsedSummary = Sanitizer::removeHTMLcomments( $parserOutput->mExtWikilog->mSummary );
		return array( $parsedSummary, $content );
	}

	# Try to extract summary from the content text.
	$pieces = preg_split( '/<(h[1-6]).*?>.*?<\\/\\1>/i', $content, 2 );
	if ( count( $pieces ) <= 1 ) {
		# Short article with a single section, use no summary and
		# leave to the caller to decide what to do.
		return array( null, $content );
	}

	# Long article with multiple sections, use only the first one.
	$leadSection = $pieces[0];

	# The split may happen on a heading that is not a child of the root
	# element (e.g. <div><h2>...</h2></div>), which leaves an open <div>
	# in the lead section. Tidy is used to repair that when it is enabled.
	if ( $wgUseTidy ) {
		$leadSection = MWTidy::tidy( $leadSection );
	}

	return array( $leadSection, $content );
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as an allowed element
 * and passes the checks below is re-emitted with sanitised attributes.
 * Every other piece is emitted as text, with its leading '<' written as
 * "&lt;" and any '>' written as "&gt;" so that it cannot form a tag.
 *
 * @private
 * @param $text String
 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 * @param $args Array for the processing callback
 * @param $extratags Array for any extra tags to include
 * @param $removetags Array for any tags (default or extra) to exclude
 * @return string
 */
static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array())
{
    global $wgUseTidy;

    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    wfProfileIn(__METHOD__);

    # The tag tables are built once per request and kept in static variables
    if (!$staticInitialised) {
        $htmlpairsStatic = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn', 'kbd', 'samp', 'thead', 'tbody', 'tfoot');
        $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'hr');
        $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
        $tabletags = array('td', 'th', 'tr');
        $htmllist = array('ul', 'ol');
        $listtags = array('li');

        global $wgAllowImageTag;
        if ($wgAllowImageTag) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
        $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));

        # Convert them all to hashtables for faster lookup
        $vars = array('htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic');
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = true;
    }

    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip($extratags);
    $removetags = array_flip($removetags);
    $htmlpairs = array_merge($extratags, $htmlpairsStatic);
    $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Text before the first '<' can hold no tag; only stray '>' need escaping
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    # Elements that can never have content have no closing tag
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    $ot = @array_pop($tagstack);
                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            $ot = @array_pop($tagstack);
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                $ot = @array_pop($tagstack);
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                # NOTE(review): the last non-matching element
                                # popped above is not pushed back here and
                                # looks like it is dropped from the stack.
                                $badtag = true;
                                while ($ot = @array_pop($optstack)) {
                                    array_push($tagstack, $ot);
                                }
                            }
                        } else {
                            # Not the element we opened last: put it back
                            @array_push($tagstack, $ot);
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Each table gets its own tag stack
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            # Unrecognised or rejected tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        # No nesting checks in this branch; only the attributes are cleaned
        foreach ($bits as $x) {
            $regs = array();
            # A piece that does not look like a tag at all is treated exactly
            # like a disallowed tag, without relying on warning suppression.
            if (preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    $rest = str_replace('>', '&gt;', $rest);
                    $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                    continue;
                }
            }
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Return the text of a template, after recursively
 * replacing any variables or templates within the template.
 *
 * The steps run in a fixed order, each one only if nothing has produced a
 * result yet: variable substitution, the SUBST/MSGNW/MSG/RAW modifiers,
 * parser functions, the in-memory template cache, and finally loading the
 * template through its title. If none of them yields text, the original
 * matched text is returned unchanged.
 *
 * @param array $piece The parts of the template
 *  $piece['text']: matched text
 *  $piece['title']: the title, i.e. the part before the |
 *  $piece['parts']: the parameter array
 *  $piece['lineStart']: read to decide whether a newline is prepended
 * @return string the text of the template
 * @private
 */
function braceSubstitution($piece)
{
    global $wgContLang, $wgLang, $wgAllowDisplayTitle, $wgNonincludableNamespaces;
    $fname = __METHOD__;
    wfProfileIn($fname);
    wfProfileIn(__METHOD__ . '-setup');

    # Flags
    $found = false;             # $text has been filled
    $nowiki = false;            # wiki markup in $text should be escaped
    $noparse = false;           # Unsafe HTML tags should not be stripped, etc.
    $noargs = false;            # Don't replace triple-brace arguments in $text
    $replaceHeadings = false;   # Make the edit section links go to the template not the article
    $headingOffset = 0;         # Skip headings when number, to account for those that weren't transcluded.
    $isHTML = false;            # $text is HTML, armour it against wikitext transformation
    $forceRawInterwiki = false; # Force interwiki transclusion to be done in raw mode not rendered

    # Title object, where $text came from
    $title = NULL;

    $linestart = '';

    # $part1 is the bit before the first |, and must contain only title characters
    # $args is a list of arguments, starting from index 0, not including $part1
    $titleText = $part1 = $piece['title'];

    # If the third subpattern matched anything, it will start with |
    if (null == $piece['parts']) {
        # No parameters at all: this may be a plain variable
        $replaceWith = $this->variableSubstitution(array($piece['text'], $piece['title']));
        if ($replaceWith != $piece['text']) {
            $text = $replaceWith;
            $found = true;
            $noparse = true;
            $noargs = true;
        }
    }

    $args = null == $piece['parts'] ? array() : $piece['parts'];
    wfProfileOut(__METHOD__ . '-setup');

    # SUBST
    wfProfileIn(__METHOD__ . '-modifiers');
    if (!$found) {
        $mwSubst =& MagicWord::get('subst');
        if ($mwSubst->matchStartAndRemove($part1) xor $this->ot['wiki']) {
            # One of two possibilities is true:
            # 1) Found SUBST but not in the PST phase
            # 2) Didn't find SUBST and in the PST phase
            # In either case, return without further processing
            $text = $piece['text'];
            $found = true;
            $noparse = true;
            $noargs = true;
        }
    }

    # MSG, MSGNW and RAW
    if (!$found) {
        # Check for MSGNW:
        $mwMsgnw =& MagicWord::get('msgnw');
        if ($mwMsgnw->matchStartAndRemove($part1)) {
            $nowiki = true;
        } else {
            # Remove obsolete MSG:
            $mwMsg =& MagicWord::get('msg');
            $mwMsg->matchStartAndRemove($part1);
        }

        # Check for RAW:
        $mwRaw =& MagicWord::get('raw');
        if ($mwRaw->matchStartAndRemove($part1)) {
            $forceRawInterwiki = true;
        }
    }
    wfProfileOut(__METHOD__ . '-modifiers');

    //save path level before recursing into functions & templates.
    $lastPathLevel = $this->mTemplatePath;

    # Parser functions
    if (!$found) {
        wfProfileIn(__METHOD__ . '-pfunc');

        $colonPos = strpos($part1, ':');
        if ($colonPos !== false) {
            # Case sensitive functions
            $function = substr($part1, 0, $colonPos);
            if (isset($this->mFunctionSynonyms[1][$function])) {
                $function = $this->mFunctionSynonyms[1][$function];
            } else {
                # Case insensitive functions
                $function = strtolower($function);
                if (isset($this->mFunctionSynonyms[0][$function])) {
                    $function = $this->mFunctionSynonyms[0][$function];
                } else {
                    $function = false;
                }
            }
            if ($function) {
                # The hook receives the parser, the text after the colon,
                # and then every trimmed template argument
                $funcArgs = array_map('trim', $args);
                $funcArgs = array_merge(array(&$this, trim(substr($part1, $colonPos + 1))), $funcArgs);
                $result = call_user_func_array($this->mFunctionHooks[$function], $funcArgs);
                $found = true;

                // The text is usually already parsed, doesn't need triple-brace tags expanded, etc.
                //$noargs = true;
                //$noparse = true;

                if (is_array($result)) {
                    if (isset($result[0])) {
                        $text = $linestart . $result[0];
                        unset($result[0]);
                    }

                    // Extract flags into the local scope
                    // This allows callers to set flags such as nowiki, noparse, found, etc.
                    // NOTE(review): extract() can overwrite ANY local variable of
                    // this method, not just the flags; hooks are trusted here.
                    extract($result);
                } else {
                    $text = $linestart . $result;
                }
            }
        }
        wfProfileOut(__METHOD__ . '-pfunc');
    }

    # Template table test

    # Did we encounter this template already? If yes, it is in the cache
    # and we need to check for loops.
    if (!$found && isset($this->mTemplates[$piece['title']])) {
        $found = true;

        # Infinite loop test
        if (isset($this->mTemplatePath[$part1])) {
            $noparse = true;
            $noargs = true;
            $found = true;
            $text = $linestart . "[[{$part1}]]<!-- WARNING: template loop detected -->";
            wfDebug(__METHOD__ . ": template loop broken at '{$part1}'\n");
        } else {
            # set $text to cached message.
            $text = $linestart . $this->mTemplates[$piece['title']];

            #treat title for cached page the same as others
            $ns = NS_TEMPLATE;
            $subpage = '';
            $part1 = $this->maybeDoSubpageLink($part1, $subpage);
            if ($subpage !== '') {
                $ns = $this->mTitle->getNamespace();
            }
            $title = Title::newFromText($part1, $ns);

            //used by include size checking
            $titleText = $title->getPrefixedText();

            //used by edit section links
            $replaceHeadings = true;
        }
    }

    # Load from database
    if (!$found) {
        wfProfileIn(__METHOD__ . '-loadtpl');
        $ns = NS_TEMPLATE;

        # declaring $subpage directly in the function call
        # does not work correctly with references and breaks
        # {{/subpage}}-style inclusions
        $subpage = '';
        $part1 = $this->maybeDoSubpageLink($part1, $subpage);
        if ($subpage !== '') {
            $ns = $this->mTitle->getNamespace();
        }
        $title = Title::newFromText($part1, $ns);

        if (!is_null($title)) {
            $titleText = $title->getPrefixedText();

            # Check for language variants if the template is not found
            if ($wgContLang->hasVariants() && $title->getArticleID() == 0) {
                $wgContLang->findVariantLink($part1, $title);
            }

            if (!$title->isExternal()) {
                if ($title->getNamespace() == NS_SPECIAL && $this->mOptions->getAllowSpecialInclusion() && $this->ot['html']) {
                    $text = SpecialPage::capturePath($title);
                    if (is_string($text)) {
                        $found = true;
                        $noparse = true;
                        $noargs = true;
                        $isHTML = true;
                        $this->disableCache();
                    }
                } else {
                    if ($wgNonincludableNamespaces && in_array($title->getNamespace(), $wgNonincludableNamespaces)) {
                        $found = false; //access denied
                        wfDebug("{$fname}: template inclusion denied for " . $title->getPrefixedDBkey());
                    } else {
                        list($articleContent, $title) = $this->fetchTemplateAndtitle($title);
                        if ($articleContent !== false) {
                            $found = true;
                            $text = $articleContent;
                            $replaceHeadings = true;
                        }
                    }
                }

                # If the title is valid but undisplayable, make a link to it
                if (!$found && ($this->ot['html'] || $this->ot['pre'])) {
                    $text = "[[:{$titleText}]]";
                    $found = true;
                }
            } elseif ($title->isTrans()) {
                // Interwiki transclusion
                if ($this->ot['html'] && !$forceRawInterwiki) {
                    $text = $this->interwikiTransclude($title, 'render');
                    $isHTML = true;
                    $noparse = true;
                } else {
                    $text = $this->interwikiTransclude($title, 'raw');
                    $replaceHeadings = true;
                }
                $found = true;
            }

            # Template cache array insertion
            # Use the original $piece['title'] not the mangled $part1, so that
            # modifiers such as RAW: produce separate cache entries
            if ($found) {
                if ($isHTML) {
                    // A special page; don't store it in the template cache.
                } else {
                    $this->mTemplates[$piece['title']] = $text;
                }
                $text = $linestart . $text;
            }
        }
        wfProfileOut(__METHOD__ . '-loadtpl');
    }

    if ($found && !$this->incrementIncludeSize('pre-expand', strlen($text))) {
        # Error, oversize inclusion
        $text = $linestart . "[[{$titleText}]]<!-- WARNING: template omitted, pre-expand include size too large -->";
        $noparse = true;
        $noargs = true;
    }

    # Recursive parsing, escaping and link table handling
    # Only for HTML output
    if ($nowiki && $found && ($this->ot['html'] || $this->ot['pre'])) {
        $text = wfEscapeWikiText($text);
    } elseif (!$this->ot['msg'] && $found) {
        if ($noargs) {
            $assocArgs = array();
        } else {
            # Clean up argument array
            $assocArgs = self::createAssocArgs($args);
            # Add a new element to the templace recursion path
            $this->mTemplatePath[$part1] = 1;
        }

        if (!$noparse) {
            # If there are any <onlyinclude> tags, only include them
            if (in_string('<onlyinclude>', $text) && in_string('</onlyinclude>', $text)) {
                $replacer = new OnlyIncludeReplacer();
                StringUtils::delimiterReplaceCallback('<onlyinclude>', '</onlyinclude>', array(&$replacer, 'replace'), $text);
                $text = $replacer->output;
            }
            # Remove <noinclude> sections and <includeonly> tags
            $text = StringUtils::delimiterReplace('<noinclude>', '</noinclude>', '', $text);
            $text = strtr($text, array('<includeonly>' => '', '</includeonly>' => ''));

            if ($this->ot['html'] || $this->ot['pre']) {
                # Strip <nowiki>, <pre>, etc.
                $text = $this->strip($text, $this->mStripState);
                if ($this->ot['html']) {
                    $text = Sanitizer::removeHTMLtags($text, array(&$this, 'replaceVariables'), $assocArgs);
                } elseif ($this->ot['pre'] && $this->mOptions->getRemoveComments()) {
                    $text = Sanitizer::removeHTMLcomments($text);
                }
            }
            $text = $this->replaceVariables($text, $assocArgs);

            # If the template begins with a table or block-level
            # element, it should be treated as beginning a new line.
            if (!$piece['lineStart'] && preg_match('/^(?:{\\||:|;|#|\\*)/', $text)) {
                $text = "\n" . $text;
            }
        } elseif (!$noargs) {
            # $noparse and !$noargs
            # Just replace the arguments, not any double-brace items
            # This is used for rendered interwiki transclusion
            $text = $this->replaceVariables($text, $assocArgs, true);
        }
    }

    # Prune lower levels off the recursion check path
    $this->mTemplatePath = $lastPathLevel;

    if ($found && !$this->incrementIncludeSize('post-expand', strlen($text))) {
        # Error, oversize inclusion
        $text = $linestart . "[[{$titleText}]]<!-- WARNING: template omitted, post-expand include size too large -->";
        $noparse = true;
        $noargs = true;
    }

    # Nothing matched: hand the original text back untouched.
    # Past this guard $found is always true.
    if (!$found) {
        wfProfileOut($fname);
        return $piece['text'];
    }

    wfProfileIn(__METHOD__ . '-placeholders');
    if ($isHTML) {
        # Replace raw HTML by a placeholder
        # Add a blank line preceding, to prevent it from mucking up
        # immediately preceding headings
        $text = "\n\n" . $this->insertStripItem($text, $this->mStripState);
    } else {
        # replace ==section headers==
        # XXX this needs to go away once we have a better parser.
        if (!$this->ot['wiki'] && !$this->ot['pre'] && $replaceHeadings) {
            if (!is_null($title)) {
                $encodedname = base64_encode($title->getPrefixedDBkey());
            } else {
                $encodedname = base64_encode("");
            }
            # With PREG_SPLIT_DELIM_CAPTURE the even indexes hold ordinary
            # text and the odd indexes hold the heading lines
            $m = preg_split('/(^={1,6}.*?={1,6}\\s*?$)/m', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
            $text = '';
            $nsec = $headingOffset;

            for ($i = 0; $i < count($m); $i += 2) {
                $text .= $m[$i];
                if (!isset($m[$i + 1]) || $m[$i + 1] == "") {
                    continue;
                }
                $hl = $m[$i + 1];
                if (strstr($hl, "<!--MWTEMPLATESECTION")) {
                    # Already tagged by a nested template; keep as is
                    $text .= $hl;
                    continue;
                }
                $m2 = array();
                preg_match('/^(={1,6})(.*?)(={1,6}\\s*?)$/m', $hl, $m2);
                $text .= $m2[1] . $m2[2] . "<!--MWTEMPLATESECTION=" . $encodedname . "&" . base64_encode("{$nsec}") . "-->" . $m2[3];

                $nsec++;
            }
        }
    }
    wfProfileOut(__METHOD__ . '-placeholders');

    # Prune lower levels off the recursion check path
    $this->mTemplatePath = $lastPathLevel;

    wfProfileOut($fname);
    return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as an allowed element
 * and passes the checks below is re-emitted with sanitised attributes.
 * Every other piece is emitted as text, with its leading '<' written as
 * "&lt;" and any '>' written as "&gt;" so that it cannot form a tag.
 * When $wgUserHtml is off the allowed-element list is empty, so every tag
 * is escaped.
 *
 * @access private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags($text, $processCallback = null, $args = array())
{
    global $wgUseTidy, $wgUserHtml;
    $fname = 'Parser::removeHTMLtags';
    wfProfileIn($fname);

    if ($wgUserHtml) {
        $htmlpairs = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span');
        $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'hr');
        $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
        $tabletags = array('td', 'th', 'tr', 'tbody');
    } else {
        $htmlpairs = array();
        $htmlsingle = array();
        # Defined in both branches so it is never read while undefined
        $htmlsingleonly = array();
        $htmlnest = array();
        $tabletags = array();
    }

    # Table cells and rows are treated as optionally-closed elements
    $htmlsingle = array_merge($tabletags, $htmlsingle);
    $htmlelements = array_merge($htmlsingle, $htmlpairs);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    $text = array_shift($bits);

    if (!$wgUseTidy) {
        $tagstack = array();
        $tablestack = array();
        foreach ($bits as $x) {
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            # An explicit match check replaces the error_reporting() toggle
            # that used to hide the notices of a failed match.
            $regs = array();
            if (preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = 0;
            if (in_array($t = strtolower($t), $htmlelements)) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (in_array($t, $htmlsingleonly)) {
                        $badtag = 1;
                    } elseif (!in_array($t, $htmlsingle) && ($ot = @array_pop($tagstack)) != $t) {
                        # Not the element we opened last: put it back
                        @array_push($tagstack, $ot);
                        $badtag = 1;
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                        $newparams = '';
                    }
                } else {
                    # Keep track for later
                    if (in_array($t, $tabletags) && !in_array('table', $tagstack)) {
                        $badtag = 1;
                    } elseif (in_array($t, $tagstack) && !in_array($t, $htmlnest)) {
                        $badtag = 1;
                    } elseif (in_array($t, $htmlsingleonly)) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (!in_array($t, $htmlsingle)) {
                        if ($t == 'table') {
                            # Each table gets its own tag stack
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    # A closing tag never gets the self-closing slash
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            # Unrecognised or rejected tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        # No nesting checks in this branch; only the attributes are cleaned
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
                if (in_array($t = strtolower($t), $htmlelements)) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    $rest = str_replace('>', '&gt;', $rest);
                    $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                    continue;
                }
            }
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    wfProfileOut($fname);
    return $text;
}
/**
 * Build the syndication feed entry for one wikilog comment.
 *
 * The entry gets a title, the comment's update time and URL, the rendered
 * comment text (if the comment has a revision and the text is non-empty),
 * one author and the publication timestamp.
 *
 * @param $row The wikilog comment database entry.
 * @return A new WlSyndicationEntry object.
 */
function formatFeedEntry( $row ) {
	global $wgMimeType;

	# Create comment object. In single-item mode every row belongs to
	# that one item, otherwise the item is built from the row itself.
	if ( $this->mSingleItem ) {
		$item = $this->mSingleItem;
	} else {
		$item = WikilogItem::newFromRow( $row );
	}
	$comment = WikilogComment::newFromRow( $item, $row );

	# Prepare some strings.
	$usertext = $comment->mUserID
		? $comment->mUserText
		: wfMsgForContent( 'wikilog-comment-anonsig', $comment->mUserText, ''/*talk*/, $comment->mAnonName );

	$title = $this->mSingleItem
		? wfMsgForContent( 'wikilog-comment-feed-title1', $comment->mID, $usertext )
		: wfMsgForContent( 'wikilog-comment-feed-title2', $comment->mID, $usertext, $comment->mItem->mName );

	# Create new syndication entry.
	$entryId = self::makeEntryId( $comment );
	$updated = $comment->mUpdated;
	$commentUrl = $comment->getCommentArticleTitle()->getFullUrl();
	$entry = new WlSyndicationEntry( $entryId, $title, $updated, $commentUrl );

	# Comment text.
	if ( $comment->mCommentRev ) {
		list( $article, $parserOutput ) = WikilogUtils::parsedArticle( $comment->mCommentTitle, true );
		$html = Sanitizer::removeHTMLcomments( $parserOutput->getText() );
		if ( $html ) {
			$entry->setContent( new WlTextConstruct( 'html', $html ) );
		}
	}

	# Author. The user page URL is only given when that page exists.
	$userPage = Title::makeTitle( NS_USER, $comment->mUserText );
	if ( $userPage->exists() ) {
		$authorUri = $userPage->getFullUrl();
	} else {
		$authorUri = null;
	}
	$entry->addAuthor( $usertext, $authorUri );

	# Timestamp
	$entry->setPublished( $comment->mTimestamp );

	return $entry;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as an allowed element
 * and passes the checks below is re-emitted with sanitised attributes.
 * Every other piece is emitted as text, with its leading '<' written as
 * "&lt;" and any '>' written as "&gt;" so that it cannot form a tag.
 * When $wgUserHtml is off the allowed-element list is empty, so every tag
 * is escaped.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 * @param array $args for the processing callback
 * @return string
 */
static function removeHTMLtags($text, $processCallback = null, $args = array())
{
    global $wgUseTidy, $wgUserHtml;

    static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

    wfProfileIn(__METHOD__);

    # The tag tables are built once per request and kept in static variables
    if (!$staticInitialised) {
        if ($wgUserHtml) {
            $htmlpairs = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span');
            $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
            $htmlsingleonly = array('br', 'hr');
            $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
            $tabletags = array('td', 'th', 'tr');
            $htmllist = array('ul', 'ol');
            $listtags = array('li');
        } else {
            # Every table must be an array here: each one is passed to
            # array_flip() below, which does not accept null.
            $htmlpairs = array();
            $htmlsingle = array();
            $htmlsingleonly = array();
            $htmlnest = array();
            $tabletags = array();
            $htmllist = array();
            $listtags = array();
        }

        $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
        $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);

        # Convert them all to hashtables for faster lookup
        $vars = array('htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements');
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = true;
    }

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    $text = array_shift($bits);

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            # An explicit match check replaces the error_reporting() toggle
            # that used to hide the notices of a failed match.
            $regs = array();
            if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = 0;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (isset($htmlsingleonly[$t])) {
                        $badtag = 1;
                    } elseif (($ot = @array_pop($tagstack)) != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            while (($ot = @array_pop($tagstack)) != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                # NOTE(review): the last non-matching element
                                # popped above is not pushed back here and
                                # looks like it is dropped from the stack.
                                $badtag = 1;
                                while ($ot = @array_pop($optstack)) {
                                    array_push($tagstack, $ot);
                                }
                            }
                        } else {
                            # Not the element we opened last: put it back
                            @array_push($tagstack, $ot);
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
                                $badtag = 1;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = 1;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = 1;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = 1;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = NULL;
                    } else {
                        if ($t == 'table') {
                            # Each table gets its own tag stack
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    # A closing tag never gets the self-closing slash
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            # Unrecognised or rejected tag: emit it as escaped text
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        # No nesting checks in this branch; only the attributes are cleaned
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    $rest = str_replace('>', '&gt;', $rest);
                    $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                    continue;
                }
            }
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every piece that parses as a whitelisted
 * element and passes Sanitizer::validateTag() is re-emitted with its
 * attributes filtered through Sanitizer::fixTagAttributes(); every other
 * piece is kept as literal text, with its leading '<' written as &lt;
 * and any '>' as &gt;.
 *
 * When $wgUseTidy is off, open elements are also tracked on a stack:
 * close tags that do not match are escaped, and elements still open at
 * the end of the text are closed. When $wgUseTidy is on, only the
 * whitelist, tag validation and attribute filtering are applied.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values. It is called with the attribute
 *   string (by reference) and $args.
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array()) {
    global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    // Base our staticInitialised variable off of the global config state so that if the globals
    // are changed (for example by a test harness) we will re-initialise the settings.
    $globalContext = implode('-', compact('wgAllowMicrodataAttributes', 'wgAllowImageTag'));
    if ($staticInitialised !== $globalContext) {
        $htmlpairsStatic = array(
            'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub',
            'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em',
            's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote',
            'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rb', 'rp',
            'rt', 'rtc', 'p', 'span', 'abbr', 'dfn', 'kbd', 'samp', 'data',
            'time', 'mark'
        );
        $htmlsingle = array('br', 'wbr', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'wbr', 'hr');
        if ($wgAllowMicrodataAttributes) {
            $htmlsingle[] = 'meta';
            $htmlsingleonly[] = 'meta';
            $htmlsingle[] = 'link';
            $htmlsingleonly[] = 'link';
        }
        $htmlnest = array(
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'li',
            'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
            'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
        );
        $tabletags = array('td', 'th', 'tr');
        $htmllist = array('ul', 'ol');
        $listtags = array('li');
        if ($wgAllowImageTag) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }
        $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
        $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));

        # Convert them all to hashtables for faster lookup
        $htmlpairsStatic = array_flip($htmlpairsStatic);
        $htmlsingle = array_flip($htmlsingle);
        $htmlsingleonly = array_flip($htmlsingleonly);
        $htmlnest = array_flip($htmlnest);
        $tabletags = array_flip($tabletags);
        $htmllist = array_flip($htmllist);
        $listtags = array_flip($listtags);
        $htmlsingleallowed = array_flip($htmlsingleallowed);
        $htmlelementsStatic = array_flip($htmlelementsStatic);

        $staticInitialised = $globalContext;
    }

    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip($extratags);
    $removetags = array_flip($removetags);
    $htmlpairs = array_merge($extratags, $htmlpairsStatic);
    $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Text before the first '<' can only contain stray '>' characters
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            if (!preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                # Does not even look like a tag: keep it as literal text
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
                continue;
            }
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            list(, $slash, $t, $params, $brace, $rest) = $regs;
            $t = strtolower($t);

            if (!isset($htmlelements[$t])) {
                # Not a whitelisted element
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
                continue;
            }

            $badtag = false;
            # Check our stack
            if ($slash && isset($htmlsingleonly[$t])) {
                # Elements that are always empty cannot have a close tag
                $badtag = true;
            } elseif ($slash) {
                # Closing a tag... is it the one we just opened?
                # Warnings are suppressed around $tagstack operations because
                # $tagstack is reassigned from array_pop( $tablestack ) below.
                wfSuppressWarnings();
                $ot = array_pop($tagstack);
                wfRestoreWarnings();

                if ($ot === $t) {
                    if ($t === 'table') {
                        # Back to the stack saved when the table was opened
                        $tagstack = array_pop($tablestack);
                    }
                } elseif (isset($htmlsingleallowed[$ot])) {
                    # Pop all elements with an optional close tag
                    # and see if we find a match below them
                    $optstack = array($ot);
                    wfSuppressWarnings();
                    $ot = array_pop($tagstack);
                    wfRestoreWarnings();
                    while ($ot !== $t && isset($htmlsingleallowed[$ot])) {
                        $optstack[] = $ot;
                        wfSuppressWarnings();
                        $ot = array_pop($tagstack);
                        wfRestoreWarnings();
                    }
                    if ($ot !== $t) {
                        # No match. Push the optional elements back again
                        $badtag = true;
                        while ($ot = array_pop($optstack)) {
                            array_push($tagstack, $ot);
                        }
                    }
                } else {
                    # Put the popped element back
                    wfSuppressWarnings();
                    array_push($tagstack, $ot);
                    wfRestoreWarnings();

                    # <li> can be nested in <ul> or <ol>, skip those cases:
                    if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                        $badtag = true;
                    }
                }
                $newparams = '';
            } else {
                # Opening a tag. Keep track for later
                if (isset($tabletags[$t]) && !in_array('table', $tagstack, true)) {
                    # Table cell/row outside of any table
                    $badtag = true;
                } elseif (in_array($t, $tagstack, true) && !isset($htmlnest[$t])) {
                    # Already open and not allowed to nest
                    $badtag = true;
                } elseif ($brace === '/>' && isset($htmlpairs[$t])) {
                    # Is it a self closed htmlpair ? (bug 5487)
                    $badtag = true;
                } elseif (isset($htmlsingleonly[$t])) {
                    # Hack to force empty tag for unclosable elements
                    $brace = '/>';
                } elseif (isset($htmlsingle[$t])) {
                    # Hack to not close $htmlsingle tags
                    $brace = null;
                    # Still need to push this optionally-closed tag to
                    # the tag stack so that we can match end tags
                    # instead of marking them as bad.
                    array_push($tagstack, $t);
                } elseif (isset($tabletags[$t]) && in_array($t, $tagstack, true)) {
                    // New table tag but forgot to close the previous one
                    $text .= "</{$t}>";
                } else {
                    if ($t === 'table') {
                        # A table starts with a fresh stack; the outer
                        # one is restored when the table is closed
                        array_push($tablestack, $tagstack);
                        $tagstack = array();
                    }
                    array_push($tagstack, $t);
                }

                # Replace any variables or template parameters with
                # plaintext results.
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array(&$params, $args));
                }

                if (!Sanitizer::validateTag($params, $t)) {
                    $badtag = true;
                }

                # Strip non-approved attributes from the tag
                $newparams = Sanitizer::fixTagAttributes($params, $t);
            }

            if ($badtag) {
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
                continue;
            }

            $rest = str_replace('>', '&gt;', $rest);
            # Only opening tags may be written in self-closing form
            $close = ($brace === '/>' && !$slash) ? ' /' : '';
            $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            $regs = array();
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
                $t = strtolower($t);
                if (isset($htmlelements[$t])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }
                    $badtag = !Sanitizer::validateTag($params, $t);
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }
            # Not a tag, not whitelisted, or rejected by validateTag()
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }

    return $text;
}