/**
 * Set up the output markup for one list mode.
 *
 * Stores the mode name and the section settings on the object, then
 * chooses the list / item / heading delimiters that belong to $listmode.
 * Delimiters that a mode does not assign are left untouched.
 *
 * @param string $listmode           'inline', 'ordered', 'unordered', 'definition',
 *                                   'H2', 'H3', 'H4' or 'userformat'
 * @param mixed  $secseparators      stored as-is in sSectionTags
 * @param mixed  $multisecseparators stored as-is in aMultiSecSeparators
 * @param string $inlinetext         text kept in sInline for the 'inline' and 'userformat' modes
 * @param string $listattr           raw attributes for the list element ('' for none)
 * @param string $itemattr           raw attributes for the item element ('' for none)
 * @param array  $listseparators     'userformat' only: list start, item start, item end, list end
 * @param int    $iOffset            'ordered' only: numbering starts at $iOffset + 1
 * @param int    $dominantSection    1-based section number, stored 0-based
 */
function __construct($listmode, $secseparators, $multisecseparators, $inlinetext, $listattr = '', $itemattr = '', $listseparators, $iOffset, $dominantSection) {
    // Every mode except 'userformat' falls back to a default inline separator.
    if ($inlinetext == '' && $listmode != 'userformat') {
        $inlinetext = ' - ';
    }

    $this->name = $listmode;

    // Sanitised attribute strings; each carries a leading blank when non-empty.
    $listAttrHtml = '';
    if ($listattr != '') {
        $listAttrHtml = ' ' . Sanitizer::fixTagAttributes($listattr, 'ul');
    }
    $itemAttrHtml = '';
    if ($itemattr != '') {
        $itemAttrHtml = ' ' . Sanitizer::fixTagAttributes($itemattr, 'li');
    }

    $this->sSectionTags = $secseparators;
    $this->aMultiSecSeparators = $multisecseparators;
    // Callers count sections from 1; the stored index is 0 based.
    $this->iDominantSection = $dominantSection - 1;

    // Shared by the ordered and unordered modes.
    $liOpen = '<LI' . $itemAttrHtml . '>';

    switch ($listmode) {
        case 'inline':
            // A <BR /> in the separator means one item per line (pseudo-inline),
            // so the items get a block-level wrapper.
            if (stristr($inlinetext, '<BR />')) {
                $this->sListStart = '<DIV' . $listAttrHtml . '>';
                $this->sListEnd = '</DIV>';
            }
            $this->sItemStart = '<SPAN' . $itemAttrHtml . '>';
            $this->sItemEnd = '</SPAN>';
            $this->sInline = $inlinetext;
            break;

        case 'ordered':
            $firstNumber = ($iOffset == 0) ? 1 : $iOffset + 1;
            $this->sListStart = '<OL start=' . $firstNumber . ' ' . $listAttrHtml . '>';
            $this->sListEnd = '</OL>';
            $this->sItemStart = $liOpen;
            $this->sItemEnd = '</LI>';
            break;

        case 'unordered':
            $this->sListStart = '<UL' . $listAttrHtml . '>';
            $this->sListEnd = '</UL>';
            $this->sItemStart = $liOpen;
            $this->sItemEnd = '</LI>';
            break;

        case 'definition':
            $this->sListStart = '<DL' . $listAttrHtml . '>';
            $this->sListEnd = '</DL>';
            // Open question kept from the original: should the item
            // attributes go on the dt element or on the dd element?
            $this->sHeadingStart = '<DT>';
            $this->sHeadingEnd = '</DT><DD>';
            $this->sItemEnd = '</DD>';
            break;

        case 'H2':
        case 'H3':
        case 'H4':
            $this->sListStart = '<DIV' . $listAttrHtml . '>';
            $this->sListEnd = '</DIV>';
            $this->sHeadingStart = '<' . $listmode . '>';
            $this->sHeadingEnd = '</' . $listmode . '>';
            break;

        case 'userformat':
            // With n separators (1..4) the first n of
            // [list start, item start, item end, list end] are taken over.
            // Any other count assigns nothing.
            $separatorCount = count($listseparators);
            if ($separatorCount >= 1 && $separatorCount <= 4) {
                if ($separatorCount == 4) {
                    $this->sListEnd = $listseparators[3];
                }
                if ($separatorCount >= 3) {
                    $this->sItemEnd = $listseparators[2];
                }
                if ($separatorCount >= 2) {
                    $this->sItemStart = $listseparators[1];
                }
                $this->sListStart = $listseparators[0];
            }
            $this->sInline = $inlinetext;
            break;
    }
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every piece that looks like an allowed tag is
 * re-emitted with its attributes passed through Sanitizer::fixTagAttributes();
 * every other piece is emitted as text, with its '<' and '>' escaped as
 * '&lt;' and '&gt;' so it cannot act as markup.
 *
 * @private
 * @param $text String
 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 * @param $args Array for the processing callback
 * @param $extratags Array for any extra tags to include
 * @param $removetags Array for any tags (default or extra) to exclude
 * @return string
 */
static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array()) {
    global $wgUseTidy;
    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
    wfProfileIn(__METHOD__);

    # The tag tables are built once per request and kept in static variables
    if (!$staticInitialised) {
        $htmlpairsStatic = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn', 'kbd', 'samp', 'thead', 'tbody', 'tfoot');
        $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'hr');
        $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
        $tabletags = array('td', 'th', 'tr');
        $htmllist = array('ul', 'ol');
        $listtags = array('li');

        global $wgAllowImageTag;
        if ($wgAllowImageTag) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
        $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));

        # Convert them all to hashtables for faster lookup
        $vars = array('htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic');
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = true;
    }

    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip($extratags);
    $removetags = array_flip($removetags);
    $htmlpairs = array_merge($extratags, $htmlpairsStatic);
    $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Text before the first '<' cannot contain a tag; escape any stray '>'
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    $ot = @array_pop($tagstack);
                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            $ot = @array_pop($tagstack);
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                $ot = @array_pop($tagstack);
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                while ($ot = @array_pop($optstack)) {
                                    array_push($tagstack, $ot);
                                }
                            }
                        } else {
                            @array_push($tagstack, $ot);
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Each table gets a fresh tag stack; the outer one is saved
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }

            # Not an acceptable tag: output it as escaped text, never as markup
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs);
            @(list(, $slash, $t, $params, $brace, $rest) = $regs);
            if (isset($htmlelements[$t = strtolower($t)])) {
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array(&$params, $args));
                }
                $newparams = Sanitizer::fixTagAttributes($params, $t);
                $rest = str_replace('>', '&gt;', $rest);
                $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
            } else {
                # Not an acceptable tag: output it as escaped text
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
            }
        }
    }

    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Runs one attribute string through Sanitizer::fixTagAttributes() for the
 * given tag and compares the outcome with the expected string.
 *
 * @dataProvider provideAttributeSupport
 */
function testAttributeSupport($tag, $attributes, $expected, $message) {
    $actual = Sanitizer::fixTagAttributes($attributes, $tag);

    $this->assertEquals($expected, $actual, $message);
}
/**
 * parse the wiki syntax used to render tables
 *
 * Works line by line. One entry per currently open (possibly nested) table
 * is kept on each of the history stacks below, so that cells, rows and
 * tables can be closed in the right order.
 *
 * @private
 * @param string $text wikitext
 * @return string text with the table syntax turned into HTML table markup
 */
function doTableStuff($text) {
    wfProfileIn(__METHOD__);

    # RTE (Rich Text Editor) - begin
    # Used to determine whether the Parser running in RTE mode or not
    global $wgRTEParserEnabled;
    # RTE - end

    $lines = StringUtils::explode("\n", $text);
    $out = '';
    $td_history = array(); # Is currently a td tag open?
    $last_tag_history = array(); # Save history of last tag activated (td, th or caption)
    $tr_history = array(); # Is currently a tr tag open?
    $tr_attributes = array(); # history of tr attributes
    $has_opened_tr = array(); # Did this table open a <tr> element?
    $indent_level = 0; # indent level of the table

    foreach ($lines as $outLine) {
        $line = trim($outLine);

        # RTE (Rich Text Editor) - begin
        # @author: Inez Korczyński
        # Initialize this variable regardless of the RTE mode being on/off,
        # then it can be used in next batch of code without checking RTE mode.
        $RTEcomment = null;
        if (!empty($wgRTEParserEnabled)) {
            # Check if there is a wikitext comment placeholder at the beginning of given line,
            # then cut it off - to have proper MediaWiki table processing - and store in variable for later recovery
            $RTEdataIdx = RTEMarker::getDataIdx(RTEMarker::PLACEHOLDER, $line, false);
            # NOTE(review): loose comparison - an index of 0 would be skipped here; confirm indexes start above 0
            if ($RTEdataIdx != null) {
                $RTEdata = RTEData::get('placeholder', $RTEdataIdx);
                if ($RTEdata && $RTEdata['type'] == 'comment') {
                    # The first 9 bytes of the line are taken to be the placeholder
                    $RTEcomment = substr($line, 0, 9);
                    $line = substr($line, 9);
                }
            }
        }
        # RTE - end

        if ($line === '') {
            # empty line, go to next line
            $out .= $outLine . "\n";
            continue;
        }

        $first_character = $line[0];
        $matches = array();

        if (preg_match('/^(:*)\\{\\|(.*)$/', $line, $matches)) {
            # First check if we are starting a new table
            $indent_level = strlen($matches[1]);

            $attributes = $this->mStripState->unstripBoth($matches[2]);

            # RTE (Rich Text Editor) - begin
            # @author: Inez Korczyński
            if (!empty($wgRTEParserEnabled)) {
                # Throw an RTE edgecase if there is RTE marker (\x7f) in table attributes
                # Example: {| {{some template call}}
                # The needle is spelled as an escape sequence: an empty needle
                # would warn on PHP < 8 and match every string on PHP 8.
                if (strpos($attributes, "\x7f") !== false) {
                    RTE::$edgeCases[] = 'COMPLEX.04';
                }
            }
            # RTE - end

            $attributes = Sanitizer::fixTagAttributes($attributes, 'table');

            $outLine = str_repeat('<dl><dd>', $indent_level) . "<table{$attributes}>";

            # RTE (Rich Text Editor) - begin
            # Put the comment placeholder back in front of the table and mark it as flushed
            $outLine = $RTEcomment . $outLine;
            $RTEcomment = null;
            # RTE - end

            array_push($td_history, false);
            array_push($last_tag_history, '');
            array_push($tr_history, false);
            array_push($tr_attributes, '');
            array_push($has_opened_tr, false);
        } elseif (count($td_history) == 0) {
            # Don't do any of the following
            $out .= $outLine . "\n";
            continue;
        } elseif (substr($line, 0, 2) === '|}') {
            # We are ending a table
            $line = '</table>' . substr($line, 2);
            $last_tag = array_pop($last_tag_history);

            if (!array_pop($has_opened_tr)) {
                # A table without any row gets one empty row
                $line = "<tr><td></td></tr>{$line}";
            }

            if (array_pop($tr_history)) {
                $line = "</tr>{$line}";
            }

            if (array_pop($td_history)) {
                $line = "</{$last_tag}>{$line}";
            }
            array_pop($tr_attributes);
            $outLine = $line . str_repeat('</dd></dl>', $indent_level);
        } elseif (substr($line, 0, 2) === '|-') {
            # Now we have a table row
            $line = preg_replace('#^\\|-+#', '', $line);

            # Whats after the tag is now only attributes
            $attributes = $this->mStripState->unstripBoth($line);

            # RTE (Rich Text Editor) - begin
            # @author: Inez Korczyński
            if (!empty($wgRTEParserEnabled)) {
                # Throw an RTE edgecase if there is RTE marker (\x7f) in row attributes
                if (strpos($attributes, "\x7f") !== false) {
                    RTE::$edgeCases[] = 'COMPLEX.05';
                }
            }
            # RTE - end

            $attributes = Sanitizer::fixTagAttributes($attributes, 'tr');
            # The attributes are held back until the first cell opens the <tr>
            array_pop($tr_attributes);
            array_push($tr_attributes, $attributes);

            $line = '';
            $last_tag = array_pop($last_tag_history);
            array_pop($has_opened_tr);
            array_push($has_opened_tr, true);

            if (array_pop($tr_history)) {
                $line = '</tr>';
            }

            if (array_pop($td_history)) {
                $line = "</{$last_tag}>{$line}";
            }

            $outLine = $line;
            array_push($tr_history, false);
            array_push($td_history, false);
            array_push($last_tag_history, '');
        } elseif ($first_character === '|' || $first_character === '!' || substr($line, 0, 2) === '|+') {
            # This might be cell elements, td, th or captions
            if (substr($line, 0, 2) === '|+') {
                $first_character = '+';
                $line = substr($line, 1);
            }

            $line = substr($line, 1);

            if ($first_character === '!') {
                $line = str_replace('!!', '||', $line);
            }

            # Split up multiple cells on the same line.
            # FIXME : This can result in improper nesting of tags processed
            # by earlier parser steps, but should avoid splitting up eg
            # attribute values containing literal "||".
            $cells = StringUtils::explodeMarkup('||', $line);

            $outLine = '';

            # Loop through each table cell
            foreach ($cells as $cell) {
                $previous = '';
                if ($first_character !== '+') {
                    # Captions live outside rows; every other cell needs an open <tr>
                    $tr_after = array_pop($tr_attributes);
                    if (!array_pop($tr_history)) {
                        $previous = "<tr{$tr_after}>\n";
                    }
                    array_push($tr_history, true);
                    array_push($tr_attributes, '');
                    array_pop($has_opened_tr);
                    array_push($has_opened_tr, true);
                }

                $last_tag = array_pop($last_tag_history);

                if (array_pop($td_history)) {
                    $previous = "</{$last_tag}>{$previous}";
                }

                if ($first_character === '|') {
                    $last_tag = 'td';
                } elseif ($first_character === '!') {
                    $last_tag = 'th';
                } elseif ($first_character === '+') {
                    $last_tag = 'caption';
                } else {
                    $last_tag = '';
                }

                array_push($last_tag_history, $last_tag);

                # A cell could contain both parameters and data
                $cell_data = explode('|', $cell, 2);

                # Bug 553: Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if (strpos($cell_data[0], '[[') !== false) {
                    $cell = "{$previous}<{$last_tag}>{$cell}";
                } elseif (count($cell_data) == 1) {
                    $cell = "{$previous}<{$last_tag}>{$cell_data[0]}";
                } else {
                    $attributes = $this->mStripState->unstripBoth($cell_data[0]);
                    $attributes = Sanitizer::fixTagAttributes($attributes, $last_tag);
                    $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}";
                }

                $outLine .= $cell;
                array_push($td_history, true);
            }
        } else {
            # RTE (Rich Text Editor) - begin
            # @author: Inez Korczyński
            if (!empty($wgRTEParserEnabled)) {
                if (empty($td_history[0]) || $last_tag == 'caption') {
                    # NOTE(review): the other RTE checks in this method look for the
                    # \x7f marker; confirm whether this needle should start with it too.
                    if (strpos($outLine, "-comment-") !== false) {
                        RTE::$edgeCases[] = 'COMPLEX.06';
                    }
                }
            }
            # RTE - end
        }

        # RTE (Rich Text Editor) - begin
        # @author: Inez Korczyński
        if (!empty($RTEcomment)) {
            # Throw an edgecase if $RTEcomment did not get flushed (nulled) yet
            RTE::$edgeCases[] = 'COMPLEX.10';
        }
        # RTE - end

        $out .= $outLine . "\n";
    }

    # Closing open td, tr && table
    while (count($td_history) > 0) {
        if (array_pop($td_history)) {
            $out .= "</td>\n";
        }
        if (array_pop($tr_history)) {
            $out .= "</tr>\n";
        }
        if (!array_pop($has_opened_tr)) {
            $out .= "<tr><td></td></tr>\n";
        }

        $out .= "</table>\n";
    }

    # Remove trailing line-ending (b/c)
    if (substr($out, -1) === "\n") {
        $out = substr($out, 0, -1);
    }

    # special case: don't return empty table
    if ($out === "<table>\n<tr><td></td></tr>\n</table>") {
        $out = '';
    }

    wfProfileOut(__METHOD__);

    return $out;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every piece that looks like an allowed tag is
 * re-emitted with its attributes passed through Sanitizer::fixTagAttributes();
 * every other piece is emitted as text, with its '<' and '>' escaped as
 * '&lt;' and '&gt;' so it cannot act as markup. When $wgUserHtml is off
 * no tag is allowed at all.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 * @param array $args for the processing callback
 * @return string
 */
static function removeHTMLtags($text, $processCallback = null, $args = array()) {
    global $wgUseTidy, $wgUserHtml;
    static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
    wfProfileIn(__METHOD__);

    # The tag tables are built once per request and kept in static variables
    if (!$staticInitialised) {
        if ($wgUserHtml) {
            $htmlpairs = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'u');
            $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
            $htmlsingleonly = array('br', 'hr');
            $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
            $tabletags = array('td', 'th', 'tr');
            $htmllist = array('ul', 'ol');
            $listtags = array('li');
        } else {
            # Every table must be an array, otherwise the array_flip()
            # calls below are handed null.
            $htmlpairs = array();
            $htmlsingle = array();
            $htmlsingleonly = array();
            $htmlnest = array();
            $tabletags = array();
            $htmllist = array();
            $listtags = array();
        }

        $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
        $htmlelements = array_merge($htmlsingle, $htmlpairs, $htmlnest);

        # Convert them all to hashtables for faster lookup
        $vars = array('htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements');
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = true;
    }

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    $text = array_shift($bits);

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            # Pieces that do not look like a tag leave $regs short; the
            # resulting notices are silenced around the list() assignment.
            $prev = error_reporting(E_ALL & ~(E_NOTICE | E_WARNING));
            preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs);
            list($qbar, $slash, $t, $params, $brace, $rest) = $regs;
            error_reporting($prev);

            $badtag = 0;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (isset($htmlsingleonly[$t])) {
                        $badtag = 1;
                    } elseif (($ot = @array_pop($tagstack)) != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            while (($ot = @array_pop($tagstack)) != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = 1;
                                while ($ot = @array_pop($optstack)) {
                                    array_push($tagstack, $ot);
                                }
                            }
                        } else {
                            @array_push($tagstack, $ot);
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!(isset($htmllist[$ot]) && isset($listtags[$t]))) {
                                $badtag = 1;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = 1;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = 1;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = 1;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = NULL;
                    } else {
                        if ($t == 'table') {
                            # Each table gets a fresh tag stack; the outer one is saved
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }

            # Not an acceptable tag: output it as escaped text, never as markup
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs);
            @(list($qbar, $slash, $t, $params, $brace, $rest) = $regs);
            if (isset($htmlelements[$t = strtolower($t)])) {
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array(&$params, $args));
                }
                $newparams = Sanitizer::fixTagAttributes($params, $t);
                $rest = str_replace('>', '&gt;', $rest);
                $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
            } else {
                # Not an acceptable tag: output it as escaped text
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
            }
        }
    }

    wfProfileOut(__METHOD__);
    return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every piece that looks like an allowed tag is
 * re-emitted with its attributes passed through Sanitizer::fixTagAttributes();
 * every other piece is emitted as text, with its '<' and '>' escaped as
 * '&lt;' and '&gt;' so it cannot act as markup. When $wgUserHtml is off
 * no tag is allowed at all.
 *
 * @access private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags($text, $processCallback = null, $args = array()) {
    global $wgUseTidy, $wgUserHtml;
    $fname = 'Parser::removeHTMLtags';
    wfProfileIn($fname);

    if ($wgUserHtml) {
        $htmlpairs = array('b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span');
        $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'hr');
        $htmlnest = array('table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span');
        $tabletags = array('td', 'th', 'tr', 'tbody');
    } else {
        $htmlpairs = array();
        $htmlsingle = array();
        # Defined here too so that every table exists in both configurations
        $htmlsingleonly = array();
        $htmlnest = array();
        $tabletags = array();
    }

    # Table tags may be left unclosed, just like the other single tags
    $htmlsingle = array_merge($tabletags, $htmlsingle);
    $htmlelements = array_merge($htmlsingle, $htmlpairs);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    $text = array_shift($bits);

    if (!$wgUseTidy) {
        $tagstack = array();
        $tablestack = array();
        foreach ($bits as $x) {
            # Pieces that do not look like a tag leave $regs short; the
            # resulting notices are silenced around the list() assignment.
            $prev = error_reporting(E_ALL & ~(E_NOTICE | E_WARNING));
            preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs);
            list($qbar, $slash, $t, $params, $brace, $rest) = $regs;
            error_reporting($prev);

            $badtag = 0;
            if (in_array($t = strtolower($t), $htmlelements)) {
                # Check our stack
                if ($slash) {
                    # Closing a tag...
                    if (in_array($t, $htmlsingleonly)) {
                        $badtag = 1;
                    } elseif (!in_array($t, $htmlsingle) && ($ot = @array_pop($tagstack)) != $t) {
                        # Not the tag that was opened last: put it back and reject
                        @array_push($tagstack, $ot);
                        $badtag = 1;
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                        $newparams = '';
                    }
                } else {
                    # Keep track for later
                    if (in_array($t, $tabletags) && !in_array('table', $tagstack)) {
                        $badtag = 1;
                    } elseif (in_array($t, $tagstack) && !in_array($t, $htmlnest)) {
                        $badtag = 1;
                    } elseif (in_array($t, $htmlsingleonly)) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif (!in_array($t, $htmlsingle)) {
                        if ($t == 'table') {
                            # Each table gets a fresh tag stack; the outer one is saved
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }

            # Not an acceptable tag: output it as escaped text, never as markup
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs);
            @(list($qbar, $slash, $t, $params, $brace, $rest) = $regs);
            if (in_array($t = strtolower($t), $htmlelements)) {
                if (is_callable($processCallback)) {
                    call_user_func_array($processCallback, array(&$params, $args));
                }
                $newparams = Sanitizer::fixTagAttributes($params, $t);
                $rest = str_replace('>', '&gt;', $rest);
                $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
            } else {
                # Not an acceptable tag: output it as escaped text
                $text .= '&lt;' . str_replace('>', '&gt;', $x);
            }
        }
    }

    wfProfileOut($fname);
    return $text;
}
/**
 * parse the wiki syntax used to render tables
 *
 * Goes through the text line by line and rewrites, in place, the lines
 * that carry table syntax ({|, |}, |-, |, !, |+). All other lines are
 * left exactly as they came in. Five stacks hold one entry per table
 * that is currently open, so nested tables unwind correctly.
 *
 * @private
 */
function doTableStuff($t) {
    wfProfileIn('Parser::doTableStuff');

    $rows = explode("\n", $t);

    $cellOpen = array();      # Is currently a td tag open?
    $cellTag = array();       # Was it TD or TH?
    $rowOpen = array();       # Is currently a tr tag open?
    $rowAttrs = array();      # tr attributes
    $has_opened_tr = array(); # Did this table open a <tr> element?
    $indent_level = 0;        # indent level of the table

    foreach ($rows as $idx => $raw) {
        $cur = trim($raw);
        $lead = substr($cur, 0, 1);

        if (preg_match('/^(:*)\\{\\|(.*)$/', $cur, $matches)) {
            # Start of a (possibly indented) table
            $indent_level = strlen($matches[1]);
            $attributes = $this->unstripForHTML($matches[2]);
            $rows[$idx] = str_repeat('<dl><dd>', $indent_level)
                . '<table' . Sanitizer::fixTagAttributes($attributes, 'table') . '>';

            $cellOpen[] = false;
            $cellTag[] = '';
            $rowOpen[] = false;
            $rowAttrs[] = '';
            $has_opened_tr[] = false;
        } elseif (count($cellOpen) == 0) {
            # Outside of any table: nothing to do for this line
        } elseif (substr($cur, 0, 2) == '|}') {
            # End of table: close whatever is still open, innermost first
            $html = '</table>' . substr($cur, 2);
            $tag = array_pop($cellTag);
            if (!array_pop($has_opened_tr)) {
                $html = '<tr><td></td></tr>' . $html;
            }
            if (array_pop($rowOpen)) {
                $html = '</tr>' . $html;
            }
            if (array_pop($cellOpen)) {
                $html = '</' . $tag . '>' . $html;
            }
            array_pop($rowAttrs);
            $rows[$idx] = $html . str_repeat('</dd></dl>', $indent_level);
        } elseif (substr($cur, 0, 2) == '|-') {
            # New row. Allows for |---------------
            $cur = substr($cur, 1);
            while ($cur != '' && substr($cur, 0, 1) == '-') {
                $cur = substr($cur, 1);
            }

            $html = '';
            $tag = array_pop($cellTag);
            array_pop($has_opened_tr);
            $has_opened_tr[] = true;
            if (array_pop($rowOpen)) {
                $html = '</tr>' . $html;
            }
            if (array_pop($cellOpen)) {
                $html = '</' . $tag . '>' . $html;
            }
            array_pop($rowAttrs);
            $rows[$idx] = $html;

            $rowOpen[] = false;
            $cellOpen[] = false;
            $cellTag[] = '';

            # What is left of the line are the attributes of the coming <tr>
            $attributes = $this->unstripForHTML($cur);
            $rowAttrs[] = Sanitizer::fixTagAttributes($attributes, 'tr');
        } elseif ($lead == '|' || $lead == '!' || substr($cur, 0, 2) == '|+') {
            # Cells (td / th) or a caption
            if (substr($cur, 0, 2) == '|+') {
                $lead = '+';
                $cur = substr($cur, 1);
            }
            $remainder = substr($cur, 1);
            if ($lead == '!') {
                $remainder = str_replace('!!', '||', $remainder);
            }

            // Split up multiple cells on the same line.
            // FIXME: This can result in improper nesting of tags processed
            // by earlier parser steps, but should avoid splitting up eg
            // attribute values containing literal "||".
            $cells = wfExplodeMarkup('||', $remainder);

            $rows[$idx] = '';

            # Loop through each table cell
            foreach ($cells as $cellText) {
                $html = '';
                if ($lead != '+') {
                    # Anything but a caption lives inside a row
                    $pendingAttrs = array_pop($rowAttrs);
                    if (!array_pop($rowOpen)) {
                        $html = '<tr' . $pendingAttrs . ">\n";
                    }
                    $rowOpen[] = true;
                    $rowAttrs[] = '';
                    array_pop($has_opened_tr);
                    $has_opened_tr[] = true;
                }

                $tag = array_pop($cellTag);
                if (array_pop($cellOpen)) {
                    $html = '</' . $tag . '>' . $html;
                }

                if ($lead == '|') {
                    $tag = 'td';
                } elseif ($lead == '!') {
                    $tag = 'th';
                } elseif ($lead == '+') {
                    $tag = 'caption';
                } else {
                    $tag = '';
                }
                $cellTag[] = $tag;

                # Cell parameters
                $parts = explode('|', $cellText, 2);

                # Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if (strpos($parts[0], '[[') !== false) {
                    $parts = array($cellText);
                }

                if (count($parts) == 1) {
                    $piece = $html . '<' . $tag . '>' . $parts[0];
                } else {
                    $attributes = $this->unstripForHTML($parts[0]);
                    $piece = $html . '<' . $tag
                        . Sanitizer::fixTagAttributes($attributes, $tag)
                        . '>' . $parts[1];
                }

                $rows[$idx] .= $piece;
                $cellOpen[] = true;
            }
        }
    }

    # Closing open td, tr && table
    while (count($cellOpen) > 0) {
        array_pop($cellTag);
        if (array_pop($cellOpen)) {
            $rows[] = '</td>';
        }
        if (array_pop($rowOpen)) {
            $rows[] = '</tr>';
        }
        if (!array_pop($has_opened_tr)) {
            $rows[] = '<tr><td></td></tr>';
        }
        $rows[] = '</table>';
    }

    $result = implode("\n", $rows);

    # special case: don't return empty table
    if ($result == "<table>\n<tr><td></td></tr>\n</table>") {
        $result = '';
    }

    wfProfileOut('Parser::doTableStuff');
    return $result;
}
/**
 * With $wgCleanupPresentationalAttributes switched off, a deprecated
 * presentational attribute must come back unchanged from
 * Sanitizer::fixTagAttributes().
 *
 * The global is left set to false after the test.
 */
function testDeprecatedAttributesDisabled() {
    global $wgCleanupPresentationalAttributes;

    $wgCleanupPresentationalAttributes = false;

    $this->assertEquals(
        ' clear="left"',
        Sanitizer::fixTagAttributes('clear="left"', 'br'),
        // The message describes the disabled case this test covers
        'Deprecated attributes are not converted to styles when disabled.'
    );
}
/**
 * Parse tables
 *
 * Converts wiki table syntax ({|, |}, |-, |, !, |+) into HTML table
 * markup, one line at a time. Lines outside a table and blank lines are
 * copied through unchanged. Tables still open at the end are closed.
 *
 * @param string Content
 * @return string Content
 */
function parse_tables($content) {
    // One entry per currently open (possibly nested) table on each stack.
    $cellOpen = array();  // Is a cell tag currently open?
    $cellTag = array();   // Last cell tag used (td, th or caption)
    $rowOpen = array();   // Is a tr tag currently open?
    $rowAttrs = array();  // Attributes waiting for the next <tr>
    $rowSeen = array();   // Did this table open a <tr> element?
    $indent = 0;          // Indent level of the table

    // Which element a cell line produces, keyed by its marker character.
    $tagForMarker = array('|' => 'td', '!' => 'th', '+' => 'caption');

    $html = '';

    foreach (explode("\n", $content) as $rawLine) {
        $line = trim($rawLine);

        if ($line === '') {
            // Blank line: copy it through as it came in
            $html .= $rawLine . "\n";
            continue;
        }

        $marker = $line[0];
        $prefix = substr($line, 0, 2);
        $tableStart = array();

        if (preg_match('/^(:*)\\{\\|(.*)$/', $line, $tableStart)) {
            // A new table starts; leading colons indent it
            $indent = strlen($tableStart[1]);
            $attributes = Sanitizer::fixTagAttributes($tableStart[2], 'table');
            $rawLine = str_repeat('<dl><dd>', $indent) . '<table' . $attributes . '>';

            $cellOpen[] = false;
            $cellTag[] = '';
            $rowOpen[] = false;
            $rowAttrs[] = '';
            $rowSeen[] = false;
        } elseif (count($cellOpen) == 0) {
            // Not inside a table: nothing below applies
            $html .= $rawLine . "\n";
            continue;
        } elseif ($prefix === '|}') {
            // The table ends; close what is still open, innermost first
            $closing = '</table>' . substr($line, 2);
            $openTag = array_pop($cellTag);
            if (!array_pop($rowSeen)) {
                $closing = '<tr><td></td></tr>' . $closing;
            }
            if (array_pop($rowOpen)) {
                $closing = '</tr>' . $closing;
            }
            if (array_pop($cellOpen)) {
                $closing = '</' . $openTag . '>' . $closing;
            }
            array_pop($rowAttrs);
            $rawLine = $closing . str_repeat('</dd></dl>', $indent);
        } elseif ($prefix === '|-') {
            // A new row; the text after the dashes is the row's attributes
            $rowAttrText = preg_replace('#^\\|-+#', '', $line);
            $attributes = Sanitizer::fixTagAttributes($rowAttrText, 'tr');
            array_pop($rowAttrs);
            $rowAttrs[] = $attributes;

            $openTag = array_pop($cellTag);
            array_pop($rowSeen);
            $rowSeen[] = true;

            $closing = array_pop($rowOpen) ? '</tr>' : '';
            if (array_pop($cellOpen)) {
                $closing = '</' . $openTag . '>' . $closing;
            }
            $rawLine = $closing;

            $rowOpen[] = false;
            $cellOpen[] = false;
            $cellTag[] = '';
        } elseif ($marker === '|' || $marker === '!' || $prefix === '|+') {
            // Cells: td, th or a caption
            if ($prefix === '|+') {
                $marker = '+';
                $line = substr($line, 1);
            }
            $line = substr($line, 1);
            if ($marker === '!') {
                $line = str_replace('!!', '||', $line);
            }

            $tag = isset($tagForMarker[$marker]) ? $tagForMarker[$marker] : '';
            $rawLine = '';

            // Several cells may share one line, separated by "||"
            foreach (explode('||', $line) as $cell) {
                $opening = '';
                if ($marker !== '+') {
                    // Anything but a caption needs an open row
                    $pendingAttrs = array_pop($rowAttrs);
                    if (!array_pop($rowOpen)) {
                        $opening = '<tr' . $pendingAttrs . ">\n";
                    }
                    $rowOpen[] = true;
                    $rowAttrs[] = '';
                    array_pop($rowSeen);
                    $rowSeen[] = true;
                }

                $openTag = array_pop($cellTag);
                if (array_pop($cellOpen)) {
                    $opening = '</' . $openTag . ">\n" . $opening;
                }
                $cellTag[] = $tag;

                // A cell may be "attributes|data" or just "data". A '[['
                // ahead of the first '|' means the bar belongs to a link,
                // so the whole cell is data.
                $parts = explode('|', $cell, 2);
                if (count($parts) == 1 || strpos($parts[0], '[[') !== false) {
                    $rawLine .= $opening . '<' . $tag . '>' . $cell;
                } else {
                    $attributes = Sanitizer::fixTagAttributes($parts[0], $tag);
                    $rawLine .= $opening . '<' . $tag . $attributes . '>' . $parts[1];
                }

                $cellOpen[] = true;
            }
        }

        $html .= $rawLine . "\n";
    }

    // Closing open td, tr && table
    while (count($cellOpen) > 0) {
        if (array_pop($cellOpen)) {
            $html .= "</td>\n";
        }
        if (array_pop($rowOpen)) {
            $html .= "</tr>\n";
        }
        if (!array_pop($rowSeen)) {
            $html .= "<tr><td></td></tr>\n";
        }
        $html .= "</table>\n";
    }

    // Remove trailing line-ending (b/c)
    if (substr($html, -1) === "\n") {
        $html = substr($html, 0, -1);
    }

    // special case: don't return empty table
    if ($html === "<table>\n<tr><td></td></tr>\n</table>") {
        $html = '';
    }

    return $html;
}
/**
 * Each provided attribute must come back from
 * Sanitizer::fixTagAttributes() unchanged, preceded by a single blank.
 *
 * @dataProvider provideDeprecatedAttributes
 */
function testDeprecatedAttributesUnaltered($inputAttr, $inputEl) {
    $expected = ' ' . $inputAttr;
    $actual = Sanitizer::fixTagAttributes($inputAttr, $inputEl);

    $this->assertEquals($expected, $actual);
}
/**
 * Checks how Sanitizer::fixTagAttributes() treats deprecated presentational
 * attributes depending on $wgCleanupPresentationalAttributes: rewritten to
 * inline styles while the setting is true, passed through unchanged while it
 * is false.
 *
 * The previous state of the global is put back before the method returns or
 * rethrows, so a failing assertion cannot leave the setting switched on for
 * whatever test runs next.
 */
function testDeprecatedAttributes() {
    $setting = 'wgCleanupPresentationalAttributes';
    $wasDefined = array_key_exists($setting, $GLOBALS);
    $previous = $wasDefined ? $GLOBALS[$setting] : null;

    // Each entry: array( input attributes, element, expected output, failure message )
    $enabledCases = array(
        array('clear="left"', 'br', ' style="clear: left;"', 'Deprecated attributes are converted to styles when enabled.'),
        array('clear="all"', 'br', ' style="clear: both;"', 'clear=all is converted to clear: both; not clear: all;'),
        array('CLEAR="ALL"', 'br', ' style="clear: both;"', 'clear=ALL is not treated differently from clear=all'),
        array('width="100"', 'td', ' style="width: 100px;"', 'Numeric sizes use pixels instead of numbers.'),
        array('width="100%"', 'td', ' style="width: 100%;"', 'Units are allowed in sizes.'),
        array('WIDTH="100%"', 'td', ' style="width: 100%;"', 'Uppercase WIDTH is treated as lowercase width.'),
        array('WiDTh="100%"', 'td', ' style="width: 100%;"', 'Mixed case does not break WiDTh.'),
        array('nowrap="true"', 'td', ' style="white-space: nowrap;"', 'nowrap attribute is output as white-space: nowrap; not something else.'),
        array('nowrap=""', 'td', ' style="white-space: nowrap;"', 'nowrap="" is considered true, not false'),
        array('NOWRAP="true"', 'td', ' style="white-space: nowrap;"', 'nowrap attribute works when uppercase.'),
        array('NoWrAp="true"', 'td', ' style="white-space: nowrap;"', 'nowrap attribute works when mixed-case.'),
    );

    // A failure is captured and rethrown after the clean-up below, which
    // keeps the restore logic in one place without relying on "finally".
    $failure = null;
    try {
        $GLOBALS[$setting] = true;
        foreach ($enabledCases as $case) {
            list($input, $element, $expected, $message) = $case;
            // assertEquals() takes the expected value first; the other order
            // produces misleading "expected/actual" output on failure.
            $this->assertEquals($expected, Sanitizer::fixTagAttributes($input, $element), $message);
        }

        $GLOBALS[$setting] = false;
        $this->assertEquals(
            ' clear="left"',
            Sanitizer::fixTagAttributes('clear="left"', 'br'),
            'Deprecated attributes are not converted to styles when disabled.'
        );
    } catch (Exception $e) {
        $failure = $e;
    }

    if ($wasDefined) {
        $GLOBALS[$setting] = $previous;
    } else {
        unset($GLOBALS[$setting]);
    }

    if ($failure !== null) {
        throw $failure;
    }
}
/**
 * Rewrites wiki table markup found in $t into LaTeX "tabularx" markup.
 *
 * The text is processed line by line. A line starting a table ("{|") is
 * replaced by "\begin{tabularx}..." once the matching end line ("|}") has
 * been seen, because only then the number of columns is known. Row lines
 * ("|-") end the previous row with "\tabularnewline \hline"; cell lines
 * ("|", "!") are joined with " & "; heading cells are wrapped in \textbf;
 * cells carrying attributes are emitted as \multicolumn. Caption lines
 * ("|+") are collected and appended after the end of the table.
 *
 * The table attributes "latexfmt" and "latexwidth" override the column
 * format and the table width; the defaults are one "Y" column per cell and
 * "\linewidth".
 *
 * Row, cell and end-of-table markup that appears while no table is open is
 * left untouched.
 *
 * @param string $t Text that may contain wiki table markup
 * @return string The text with the tables converted
 */
private function externalTableHelper($t) {
    $latexformat = '';
    $t = trim($t);
    $t = explode("\n", $t);

    // One entry per currently open (possibly nested) table in each stack.
    $ltd = array(); # Cell separator of the last cell; only pushed/popped, the popped value is never read
    $tr = array(); # Is currently a row open?
    $ltr = array(); # tr attributes
    $has_opened_tr = array(); # Did this table open a row at all?

    // Indexed by nesting depth ($in_table).
    $cellcount_max = array();
    $cellcount_current = array();
    $tableheader = array();
    $thkr = array(); # Line indexes of the table start lines (stack)

    $firstCellOfRow = true;
    $ltx_caption = '';
    $in_table = 0;

    foreach ($t as $k => $x) {
        $x = trim($x);
        if ($x == '') {
            // empty line, go to next line
            continue;
        }
        $fc = substr($x, 0, 1);

        if (preg_match('/^(:*)\\{\\|(.*)$/', $x, $matches)) {
            if ($in_table == 0) {
                /* new top-level table, initialise arrays */
                $latexformat = '';
                $cellcount_max = array();
                $cellcount_current = array();
                $tableheader = array();
                $thkr = array();
            }
            $in_table++;
            array_push($ltd, '');
            array_push($tr, false);
            array_push($ltr, '');
            array_push($has_opened_tr, false);

            // Start of table: Extract LaTeX tips from attributes, make header.
            $attributes = $this->unstripForHTML($matches[2]);
            $this->debugMessage('Table: Attributes: ', $attributes);
            $attributes = str_replace($this->sc['backslash'], '\\', $attributes);
            $attributes_test = $this->parseAttrString($attributes);

            if (array_key_exists('latexfmt', $attributes_test)) {
                $latexformat = $attributes_test['latexfmt'];
                $latexformat = str_replace("\\", "", $latexformat);
                $this->debugMessage('Table: latexfmt: ', $latexformat);
            }

            if (array_key_exists('latexwidth', $attributes_test)) {
                $latexwidth = $attributes_test['latexwidth'];
                $latexwidth = str_replace('\\(\\backslash{}\\)', '\\', $latexwidth);
                $this->debugMessage('Table: latexwidth: ', $latexwidth);
            } else {
                $latexwidth = '\\linewidth';
            }

            // start-of-table: remember where the header has to be written later
            array_push($thkr, $k);
            $tableheader[$in_table]['width'] = $latexwidth;
            $tableheader[$in_table]['format'] = $latexformat;
            $cellcount_max[$in_table] = 0;
            // start-of-row
            $cellcount_current[$in_table] = 0;
            $this->addPackageDependency('tabularx');
            $firstCellOfRow = true;
        } elseif ($in_table == 0) {
            // Not inside a table: "|}", "|-", "|" and "!" are ordinary text
            // here. Processing them would pop empty stacks, write to
            // $t[$k - 1] / $t[null] and drive $in_table below zero.
            continue;
        } elseif ('|}' == substr($x, 0, 2) || '|\\}' == substr($x, 0, 3)) {
            // End of table. Pop stacks and print latex ending.
            array_pop($ltd);
            if (!array_pop($has_opened_tr)) {
                $t[$k - 1] = $t[$k - 1] . "\\tabularnewline \\hline";
            }
            if (array_pop($tr)) {
                $t[$k - 1] = $t[$k - 1] . '\\tabularnewline \\hline';
            }
            array_pop($ltr);

            // end-of-row code
            $cellcount_max[$in_table] = max($cellcount_max[$in_table], $cellcount_current[$in_table]);

            // end-of-table
            $thk = array_pop($thkr);
            $latexwidth = $tableheader[$in_table]['width'];
            if ($tableheader[$in_table]['format'] == '') {
                // No explicit format: one "Y" column per cell of the widest row.
                $latexformat = array();
                for ($i = 0; $i < $cellcount_max[$in_table]; $i++) {
                    array_push($latexformat, 'Y');
                }
                $latexformat = '|' . implode('|', $latexformat) . '|';
            } else {
                $latexformat = $tableheader[$in_table]['format'];
            }

            if ($in_table > 1) {
                // Nested table: wrapped in an extra brace group, no hooks.
                $t[$thk] = "{\\begin{tabularx}{{$latexwidth}}{{$latexformat}}\\hline";
                $t[$k] = "\\end{tabularx}}" . trim($ltx_caption);
            } else {
                // This table is not nested
                $this->debugMessage('Table: inserted latexfmt: ', $latexformat);
                $this->debugMessage('Table: inserted latexwidth ', $latexwidth);
                wfRunHooks("w2lTableLaTeXAttributes", array(&$this, &$latexformat, &$latexwidth));
                $table_head = "\\begin{tabularx}{{$latexwidth}}{{$latexformat}}\\hline";
                $table_foot = "\\end{tabularx}\n" . trim($ltx_caption);
                wfRunHooks("w2lTableHead", array(&$this, &$table_head));
                wfRunHooks("w2lTableFoot", array(&$this, &$table_foot));
                $t[$thk] = $table_head;
                $t[$k] = $table_foot;
                unset($table_head, $table_foot);
            }
            $in_table--;
            $ltx_caption = '';
        } elseif ('|-' == substr($x, 0, 2)) {
            # Allows for |---------------
            // "|----" (four or more dashes) adds one more \hline to the row end.
            if (strpos($x, '----') === 1) {
                $add_hline = '\\hline';
            } else {
                $add_hline = '';
            }

            // Strip the "|" and all dashes; what is left are row attributes.
            $x = substr($x, 1);
            while ($x != '' && substr($x, 0, 1) == '-') {
                $x = substr($x, 1);
            }

            array_pop($ltd);
            array_pop($has_opened_tr);
            array_push($has_opened_tr, true);
            if (array_pop($tr)) {
                $t[$k - 1] = $t[$k - 1] . '\\tabularnewline \\hline' . $add_hline;
            }
            array_pop($ltr);
            $t[$k] = '';
            array_push($tr, false);
            array_push($ltd, '');

            // end-of-row
            $cellcount_max[$in_table] = max($cellcount_max[$in_table], $cellcount_current[$in_table]);
            // start-of-row
            $cellcount_current[$in_table] = 0;

            $attributes = $this->unstripForHTML($x);
            array_push($ltr, Sanitizer::fixTagAttributes($attributes, 'tr'));
            $firstCellOfRow = true;
        } elseif ('|' === $fc || '!' === $fc || '|+' === substr($x, 0, 2)) {
            # $x is a table row (cells) or a caption
            if ('|+' == substr($x, 0, 2)) {
                $fc = '+';
                $x = substr($x, 1);
            }
            $after = substr($x, 1);
            if ($fc == '!') {
                $after = str_replace('!!', '||', $after);
            }

            // Split up multiple cells on the same line.
            // FIXME: This can result in improper nesting of tags processed
            // by earlier parser steps, but should avoid splitting up eg
            // attribute values containing literal "||".
            $cells = StringUtils::explodeMarkup('||', $after);
            $t[$k] = '';

            # Loop through each table cell
            foreach ($cells as $theline) {
                $z = '';
                if ($fc != '+') {
                    array_pop($ltr);
                    if (!array_pop($tr)) {
                        // First cell of a new row starts on its own line.
                        $z = "\n";
                    }
                    array_push($tr, true);
                    array_push($ltr, '');
                    // current-row-cell
                    $cellcount_current[$in_table]++;
                    array_pop($has_opened_tr);
                    array_push($has_opened_tr, true);
                }

                array_pop($ltd);
                // heading cells and normal cells are equal in LaTeX:
                if (($fc == '|' || $fc == '!') && !$firstCellOfRow) {
                    $l = ' & ';
                } elseif ($fc == '+') {
                    // Captions are only collected here and emitted after the
                    // end of the table. Missing support for caption here!
                    $ltx_caption .= $theline;
                    continue;
                } else {
                    $l = '';
                }
                array_push($ltd, $l);

                # Cell parameters
                $y = explode('|', $theline, 2);
                # Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if (strpos($y[0], '[[') !== false) {
                    $y = array($theline);
                }

                if (count($y) == 1) {
                    $y[0] = $this->fixContentforTableCells($y[0]);
                    if ($fc == '!') {
                        // Heading cell highlighting
                        $y = "{$z}{$l}" . "\\textbf{" . "{$y[0]}}";
                    } else {
                        $y = "{$z}{$l}{$y[0]}";
                    }
                } else {
                    // Cell with attributes: always rendered as \multicolumn.
                    $attributes = $this->unstripForHTML($y[0]);
                    $multi_col = $this->checkColspan($attributes);
                    if ($firstCellOfRow == false) {
                        $addSep = '&';
                    } else {
                        $addSep = '';
                    }
                    $y = "{$z}" . $addSep . '\\multicolumn{' . $multi_col['colspan'] . '}{' . $multi_col['latexfmt'] . '}{' . $y[1] . '}';
                }
                $firstCellOfRow = false;
                $t[$k] .= $y;
            }
        }
    }

    $t = implode("\n", $t);
    return $t;
}
/**
 * Checks that, with $wgCleanupPresentationalAttributes switched on,
 * Sanitizer::fixTagAttributes() turns the provided input into the expected
 * attribute string.
 *
 * The previous state of the global is put back before the method returns or
 * rethrows, so the setting does not leak into tests that run afterwards.
 *
 * @dataProvider provideDeprecatedAttributes
 */
function testDeprecatedAttributes($input, $tag, $expected, $message = null) {
    $setting = 'wgCleanupPresentationalAttributes';
    $wasDefined = array_key_exists($setting, $GLOBALS);
    $previous = $wasDefined ? $GLOBALS[$setting] : null;

    // A failure is captured and rethrown after the clean-up below, which
    // keeps the restore logic in one place without relying on "finally".
    $failure = null;
    try {
        $GLOBALS[$setting] = true;
        $this->assertEquals(
            $expected,
            Sanitizer::fixTagAttributes($input, $tag),
            // The provider may omit the message; hand PHPUnit a string anyway.
            $message === null ? '' : $message
        );
    } catch (Exception $e) {
        $failure = $e;
    }

    if ($wasDefined) {
        $GLOBALS[$setting] = $previous;
    } else {
        unset($GLOBALS[$setting]);
    }

    if ($failure !== null) {
        throw $failure;
    }
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments
 *
 * Tags that are not recognised, that fail validation or that are badly
 * nested are not dropped silently: their markup is kept as visible text,
 * with "<" and ">" converted to entities so it cannot act as HTML. Stray
 * ">" characters in the text between tags are converted as well.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array()) {
    // Brings the tag lookup tables used below into local scope. This
    // function reads $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest,
    // $tabletags, $htmllist, $listtags, $htmlsingleallowed and $htmlelements,
    // none of which is defined anywhere else in this method.
    extract(self::getRecognizedTagData($extratags, $removetags));

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    // Everything before the first "<" is plain text.
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!MWTidy::isEnabled()) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    MediaWiki\suppressWarnings();
                    $ot = array_pop($tagstack);
                    MediaWiki\restoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            MediaWiki\suppressWarnings();
                            $ot = array_pop($tagstack);
                            MediaWiki\restoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($tagstack);
                                MediaWiki\restoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($optstack);
                                MediaWiki\restoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    MediaWiki\suppressWarnings();
                                    $ot = array_pop($optstack);
                                    MediaWiki\restoreWarnings();
                                }
                            }
                        } else {
                            MediaWiki\suppressWarnings();
                            array_push($tagstack, $ot);
                            MediaWiki\restoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            // Leaving a table: continue with the tag stack
                            // that was saved when the table was opened.
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            // Each table gets a tag stack of its own.
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            // Rejected markup: neutralise it instead of passing it through.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }
            // Rejected markup: neutralise it instead of passing it through.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }
    return $text;
}
/**
 * Asserts that Sanitizer::fixTagAttributes() hands the given attribute
 * string back unchanged, prefixed by a single space.
 *
 * @dataProvider provideDeprecatedAttributes
 * @covers Sanitizer::fixTagAttributes
 */
public function testDeprecatedAttributesUnaltered($inputAttr, $inputEl, $message = '') {
    // The sanitizer output starts with the blank that separates the
    // attributes from the element name.
    $expected = ' ' . $inputAttr;
    $actual = Sanitizer::fixTagAttributes($inputAttr, $inputEl);

    $this->assertEquals($expected, $actual, $message);
}
/**
 * parse the wiki syntax used to render tables
 *
 * The text is handled line by line. For every open (possibly nested) table
 * one entry is kept on each of the history stacks below; they record whether
 * a row or a cell is currently open and which cell element (td, th or
 * caption) was opened last, so that the right closing tags can be emitted
 * when the next row, cell or the end of the table is reached.
 *
 * Tables still open at the end of the text are closed automatically.
 *
 * @private
 * @param string $text Wikitext
 * @return string Text with the table markup replaced by HTML
 */
function doTableStuff($text) {
    wfProfileIn(__METHOD__);

    $lines = StringUtils::explode("\n", $text);
    $out = '';
    $td_history = array(); # Is currently a td tag open?
    $last_tag_history = array(); # Save history of last tag activated (td, th or caption)
    $tr_history = array(); # Is currently a tr tag open?
    $tr_attributes = array(); # history of tr attributes
    $has_opened_tr = array(); # Did this table open a <tr> element?
    $indent_level = 0; # indent level of the table

    foreach ($lines as $outLine) {
        $line = trim($outLine);

        if ($line === '') {
            # empty line, go to next line
            $out .= $outLine . "\n";
            continue;
        }

        $first_character = $line[0];
        $matches = array();

        if (preg_match('/^(:*)\\{\\|(.*)$/', $line, $matches)) {
            # First check if we are starting a new table
            $indent_level = strlen($matches[1]);

            $attributes = $this->mStripState->unstripBoth($matches[2]);
            $attributes = Sanitizer::fixTagAttributes($attributes, 'table');

            // Leading colons indent the table by wrapping it in <dl><dd>.
            $outLine = str_repeat('<dl><dd>', $indent_level) . "<table{$attributes}>";
            array_push($td_history, false);
            array_push($last_tag_history, '');
            array_push($tr_history, false);
            array_push($tr_attributes, '');
            array_push($has_opened_tr, false);
        } elseif (count($td_history) == 0) {
            # Don't do any of the following
            $out .= $outLine . "\n";
            continue;
        } elseif (substr($line, 0, 2) === '|}') {
            # We are ending a table
            $line = '</table>' . substr($line, 2);
            $last_tag = array_pop($last_tag_history);

            // A table without any row gets an empty one.
            if (!array_pop($has_opened_tr)) {
                $line = "<tr><td></td></tr>{$line}";
            }

            if (array_pop($tr_history)) {
                $line = "</tr>{$line}";
            }

            if (array_pop($td_history)) {
                $line = "</{$last_tag}>{$line}";
            }
            array_pop($tr_attributes);
            $outLine = $line . str_repeat('</dd></dl>', $indent_level);
        } elseif (substr($line, 0, 2) === '|-') {
            # Now we have a table row
            $line = preg_replace('#^\\|-+#', '', $line);

            # Whats after the tag is now only attributes
            $attributes = $this->mStripState->unstripBoth($line);
            $attributes = Sanitizer::fixTagAttributes($attributes, 'tr');
            // The <tr> itself is only written when the first cell arrives;
            // until then its attributes wait on the stack.
            array_pop($tr_attributes);
            array_push($tr_attributes, $attributes);

            $line = '';
            $last_tag = array_pop($last_tag_history);
            array_pop($has_opened_tr);
            array_push($has_opened_tr, true);

            if (array_pop($tr_history)) {
                $line = '</tr>';
            }

            if (array_pop($td_history)) {
                $line = "</{$last_tag}>{$line}";
            }

            $outLine = $line;
            array_push($tr_history, false);
            array_push($td_history, false);
            array_push($last_tag_history, '');
        } elseif ($first_character === '|' || $first_character === '!' || substr($line, 0, 2) === '|+') {
            # This might be cell elements, td, th or captions
            if (substr($line, 0, 2) === '|+') {
                $first_character = '+';
                $line = substr($line, 1);
            }

            $line = substr($line, 1);

            if ($first_character === '!') {
                $line = str_replace('!!', '||', $line);
            }

            # Split up multiple cells on the same line.
            # FIXME : This can result in improper nesting of tags processed
            # by earlier parser steps, but should avoid splitting up eg
            # attribute values containing literal "||".
            $cells = StringUtils::explodeMarkup('||', $line);

            $outLine = '';

            # Loop through each table cell
            foreach ($cells as $cell) {
                $previous = '';
                if ($first_character !== '+') {
                    $tr_after = array_pop($tr_attributes);
                    if (!array_pop($tr_history)) {
                        $previous = "<tr{$tr_after}>\n";
                    }
                    array_push($tr_history, true);
                    array_push($tr_attributes, '');
                    array_pop($has_opened_tr);
                    array_push($has_opened_tr, true);
                }

                $last_tag = array_pop($last_tag_history);

                if (array_pop($td_history)) {
                    $previous = "</{$last_tag}>\n{$previous}";
                }

                if ($first_character === '|') {
                    $last_tag = 'td';
                } elseif ($first_character === '!') {
                    $last_tag = 'th';
                } elseif ($first_character === '+') {
                    $last_tag = 'caption';
                } else {
                    $last_tag = '';
                }

                array_push($last_tag_history, $last_tag);

                # A cell could contain both parameters and data
                $cell_data = explode('|', $cell, 2);

                # Bug 553: Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if (strpos($cell_data[0], '[[') !== false) {
                    $cell = "{$previous}<{$last_tag}>{$cell}";
                } elseif (count($cell_data) == 1) {
                    $cell = "{$previous}<{$last_tag}>{$cell_data[0]}";
                } else {
                    $attributes = $this->mStripState->unstripBoth($cell_data[0]);
                    $attributes = Sanitizer::fixTagAttributes($attributes, $last_tag);
                    $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}";
                }

                $outLine .= $cell;
                array_push($td_history, true);
            }
        }
        $out .= $outLine . "\n";
    }

    # Closing open td, tr && table
    while (count($td_history) > 0) {
        // The open cell may be a th or a caption, so close the element that
        // was actually opened instead of assuming a td.
        $last_tag = array_pop($last_tag_history);
        if (array_pop($td_history)) {
            $out .= "</{$last_tag}>\n";
        }
        if (array_pop($tr_history)) {
            $out .= "</tr>\n";
        }
        if (!array_pop($has_opened_tr)) {
            $out .= "<tr><td></td></tr>\n";
        }

        $out .= "</table>\n";
    }

    # Remove trailing line-ending (b/c)
    if (substr($out, -1) === "\n") {
        $out = substr($out, 0, -1);
    }

    # special case: don't return empty table
    if ($out === "<table>\n<tr><td></td></tr>\n</table>") {
        $out = '';
    }

    wfProfileOut(__METHOD__);

    return $out;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments
 *
 * Tags that are not recognised, that fail validation or that are badly
 * nested are not dropped silently: their markup is kept as visible text,
 * with "<" and ">" converted to entities so it cannot act as HTML. Stray
 * ">" characters in the text between tags are converted as well.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array()) {
    global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

    // Lookup tables, built once and cached between calls.
    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    // Base our staticInitialised variable off of the global config state so that if the globals
    // are changed (like in the screwed up test system) we will re-initialise the settings.
    $globalContext = implode('-', compact('wgAllowMicrodataAttributes', 'wgAllowImageTag'));
    if (!$staticInitialised || $staticInitialised != $globalContext) {
        $htmlpairsStatic = array(
            'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
            'kbd', 'samp', 'data', 'time', 'mark'
        );
        $htmlsingle = array('br', 'wbr', 'hr', 'li', 'dt', 'dd');
        $htmlsingleonly = array('br', 'wbr', 'hr');
        if ($wgAllowMicrodataAttributes) {
            $htmlsingle[] = $htmlsingleonly[] = 'meta';
            $htmlsingle[] = $htmlsingleonly[] = 'link';
        }
        $htmlnest = array(
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
            'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
        );
        $tabletags = array('td', 'th', 'tr');
        $htmllist = array('ul', 'ol');
        $listtags = array('li');

        if ($wgAllowImageTag) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        $htmlsingleallowed = array_unique(array_merge($htmlsingle, $tabletags));
        $htmlelementsStatic = array_unique(array_merge($htmlsingle, $htmlpairsStatic, $htmlnest));

        # Convert them all to hashtables for faster lookup
        $vars = array(
            'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic'
        );
        foreach ($vars as $var) {
            ${$var} = array_flip(${$var});
        }
        $staticInitialised = $globalContext;
    }

    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip($extratags);
    $removetags = array_flip($removetags);
    $htmlpairs = array_merge($extratags, $htmlpairsStatic);
    $htmlelements = array_diff_key(array_merge($extratags, $htmlelementsStatic), $removetags);

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    // Everything before the first "<" is plain text.
    $text = str_replace('>', '&gt;', array_shift($bits));

    if (!$wgUseTidy) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    wfSuppressWarnings();
                    $ot = array_pop($tagstack);
                    wfRestoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            wfSuppressWarnings();
                            $ot = array_pop($tagstack);
                            wfRestoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                wfSuppressWarnings();
                                $ot = array_pop($tagstack);
                                wfRestoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                wfSuppressWarnings();
                                $ot = array_pop($optstack);
                                wfRestoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    wfSuppressWarnings();
                                    $ot = array_pop($optstack);
                                    wfRestoreWarnings();
                                }
                            }
                        } else {
                            wfSuppressWarnings();
                            array_push($tagstack, $ot);
                            wfRestoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            // Leaving a table: continue with the tag stack
                            // that was saved when the table was opened.
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            // Each table gets a tag stack of its own.
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }

                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            // Rejected markup: neutralise it instead of passing it through.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }

        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }
            // Rejected markup: neutralise it instead of passing it through.
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }
    return $text;
}