/**
 * Runs the parent fixture setup, then skips the current test when
 * MWTidy reports that it is not enabled.
 */
protected function setUp() {
    parent::setUp();

    if (MWTidy::isEnabled()) {
        return;
    }

    $this->markTestSkipped('Tidy not found');
}
/**
 * Helper function for parse() that transforms half-parsed HTML into fully
 * parsed HTML.
 *
 * Steps, in order: unstrip general markers, apply typographic fix-ups,
 * build block-level structure, replace link holders, run language
 * conversion (unless disabled), unstrip nowiki markers, replace
 * transparent tags, normalize character references, and finally tidy the
 * markup (via MWTidy when enabled and requested, otherwise with a small
 * set of regex-based nesting repairs).
 *
 * When $isMain is true the ParserAfterUnstrip, ParserBeforeTidy and
 * ParserAfterTidy hooks are run at the corresponding points.
 *
 * @param string $text
 * @param bool $isMain
 * @param bool $linestart
 * @return string
 */
private function internalParseHalfParsed($text, $isMain = true, $linestart = true) {
    # Hooks take the parser by reference. PHP >= 7.1 refuses to create a
    # reference to $this, so hand them a local alias of the same object.
    $parser = $this;

    $text = $this->mStripState->unstripGeneral($text);

    if ($isMain) {
        Hooks::run('ParserAfterUnstrip', array(&$parser, &$text));
    }

    # Clean up special characters, only run once, next-to-last before doBlockLevels
    $fixtags = array(
        # A space before ? : ; ! % or a closing guillemet becomes a
        # non-breaking space, but only if something precedes the space.
        # The replacement is spelled as the ASCII entity so that it does not
        # depend on an invisible character or on the file encoding.
        '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&#160;',
        # Same for the space following an opening guillemet.
        '/(\\302\\253) /' => '\\1&#160;',
        # Undo the first rule in front of CSS "!important".
        '/&#160;(!\\s*important)/' => ' \\1',
    );
    $text = preg_replace(array_keys($fixtags), array_values($fixtags), $text);

    $text = $this->doBlockLevels($text, $linestart);

    $this->replaceLinkHolders($text);

    /**
     * The input doesn't get language converted if
     * a) It's disabled
     * b) Content isn't converted
     * c) It's a conversion table
     * d) it is an interface message (which is in the user language)
     */
    if (!($this->mOptions->getDisableContentConversion()
        || isset($this->mDoubleUnderscores['nocontentconvert']))
    ) {
        if (!$this->mOptions->getInterfaceMessage()) {
            # The position of the convert() call should not be changed. it
            # assumes that the links are all replaced and the only thing left
            # is the <nowiki> mark.
            $text = $this->getConverterLanguage()->convert($text);
        }
    }

    $text = $this->mStripState->unstripNoWiki($text);

    if ($isMain) {
        Hooks::run('ParserBeforeTidy', array(&$parser, &$text));
    }

    $text = $this->replaceTransparentTags($text);
    $text = $this->mStripState->unstripGeneral($text);

    $text = Sanitizer::normalizeCharReferences($text);

    if (MWTidy::isEnabled() && $this->mOptions->getTidy()) {
        $text = MWTidy::tidy($text);
    } else {
        # attempt to sanitize at least some nesting problems
        # (bug #2702 and quite a few others)
        $tidyregs = array(
            '/(<([bi])>)(<([bi])>)?([^<]*)(<\\/?a[^<]*>)([^<]*)(<\\/\\4>)?(<\\/\\2>)/' =>
                '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
            '/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\\/a>(.*)<\\/a>/' =>
                '\\1\\2</a>\\3</a>\\1\\4</a>',
            '/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\\/div>)([^<]*)(<\\/\\2>)/' =>
                '\\1\\3<div\\5>\\6</div>\\8\\9',
            # Drop empty <b></b> / <i></i> pairs
            '/<([bi])><\\/\\1>/' => '',
        );

        $text = preg_replace(array_keys($tidyregs), array_values($tidyregs), $text);
    }

    if ($isMain) {
        Hooks::run('ParserAfterTidy', array(&$parser, &$text));
    }

    return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each fragment that parses as a recognized
 * element and passes the validation checks is re-emitted as markup with
 * its attributes filtered; every other fragment is emitted as plain text
 * with its angle brackets escaped ('&lt;' / '&gt;'), so rejected tags can
 * never reach the output as live markup.
 *
 * When MWTidy is not enabled the function additionally tracks a tag stack
 * to reject badly nested or unbalanced tags and to close tags left open
 * at the end of the input; when MWTidy is enabled that balancing is
 * skipped.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return string
 */
public static function removeHTMLtags($text, $processCallback = null, $args = array(),
    $extratags = array(), $removetags = array()
) {
    # Brings the tag lookup tables used below into local scope:
    # $htmlelements, $htmlsingleonly, $htmlsingleallowed, $htmlsingle,
    # $htmlpairs, $htmlnest, $htmllist, $listtags, $tabletags.
    extract(self::getRecognizedTagData($extratags, $removetags));

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments($text);
    $bits = explode('<', $text);
    # Everything before the first '<' is plain text: escape stray '>'
    $text = str_replace('>', '&gt;', array_shift($bits));
    if (!MWTidy::isEnabled()) {
        $tagstack = $tablestack = array();
        foreach ($bits as $x) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if (isset($htmlelements[$t = strtolower($t)])) {
                # Check our stack
                if ($slash && isset($htmlsingleonly[$t])) {
                    $badtag = true;
                } elseif ($slash) {
                    # Closing a tag... is it the one we just opened?
                    MediaWiki\suppressWarnings();
                    $ot = array_pop($tagstack);
                    MediaWiki\restoreWarnings();

                    if ($ot != $t) {
                        if (isset($htmlsingleallowed[$ot])) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push($optstack, $ot);
                            MediaWiki\suppressWarnings();
                            $ot = array_pop($tagstack);
                            MediaWiki\restoreWarnings();
                            while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                array_push($optstack, $ot);
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($tagstack);
                                MediaWiki\restoreWarnings();
                            }
                            if ($t != $ot) {
                                # No match. Push the optional elements back again
                                $badtag = true;
                                MediaWiki\suppressWarnings();
                                $ot = array_pop($optstack);
                                MediaWiki\restoreWarnings();
                                while ($ot) {
                                    array_push($tagstack, $ot);
                                    MediaWiki\suppressWarnings();
                                    $ot = array_pop($optstack);
                                    MediaWiki\restoreWarnings();
                                }
                            }
                        } else {
                            MediaWiki\suppressWarnings();
                            array_push($tagstack, $ot);
                            MediaWiki\restoreWarnings();

                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ($t == 'table') {
                            # Leaving a table: restore the stack saved when it was opened
                            $tagstack = array_pop($tablestack);
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                        $badtag = true;
                    } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                        $badtag = true;
                    } elseif (isset($htmlsingleonly[$t])) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif (isset($htmlsingle[$t])) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push($tagstack, $t);
                    } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                        // New table tag but forgot to close the previous one
                        $text .= "</{$t}>";
                    } else {
                        if ($t == 'table') {
                            # Entering a table: set the current stack aside
                            # and start a fresh one for the table's contents
                            array_push($tablestack, $tagstack);
                            $tagstack = array();
                        }
                        array_push($tagstack, $t);
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                }
                if (!$badtag) {
                    $rest = str_replace('>', '&gt;', $rest);
                    $close = $brace == '/>' && !$slash ? ' /' : '';
                    $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                    continue;
                }
            }
            # Unrecognized or rejected tag: emit it as escaped text, never as markup
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
        # Close off any remaining tags
        while (is_array($tagstack) && ($t = array_pop($tagstack))) {
            $text .= "</{$t}>\n";
            if ($t == 'table') {
                $tagstack = array_pop($tablestack);
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ($bits as $x) {
            if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                list(, $slash, $t, $params, $brace, $rest) = $regs;

                $badtag = false;
                if (isset($htmlelements[$t = strtolower($t)])) {
                    if (is_callable($processCallback)) {
                        call_user_func_array($processCallback, array(&$params, $args));
                    }

                    if (!Sanitizer::validateTag($params, $t)) {
                        $badtag = true;
                    }

                    $newparams = Sanitizer::fixTagAttributes($params, $t);
                    if (!$badtag) {
                        $rest = str_replace('>', '&gt;', $rest);
                        $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                        continue;
                    }
                }
            }
            # Unrecognized or rejected tag: emit it as escaped text, never as markup
            $text .= '&lt;' . str_replace('>', '&gt;', $x);
        }
    }
    return $text;
}
/**
 * Asserts that the given string is a valid HTML document.
 *
 * The document is passed to MWTidy::checkErrors() and every line of the
 * resulting report that is not an ignorable warning counts as a failure.
 *
 * @since 1.23
 *
 * @note Will mark the test as skipped if the "tidy" module is not installed.
 * @note This ignores $wgUseTidy, so we can check for valid HTML even (and especially)
 *        when automatic tidying is disabled.
 *
 * @param string $html A complete HTML document
 */
protected function assertValidHtmlDocument($html) {
    // Validation only happens when the tidy PHP extension is available.
    // With wgTidyInternal set to false, MWTidy would fall back to the
    // command line version of tidy, and then a failing validation could not
    // be told apart from tidy simply not being installed as a command line
    // tool, which would make every HTML assertion fail on such systems.
    $canValidate = $GLOBALS['wgTidyInternal'] && MWTidy::isEnabled();
    if (!$canValidate) {
        $this->markTestSkipped('Tidy extension not installed');
    }

    $tidyReport = '';
    MWTidy::checkErrors($html, $tidyReport);

    // Tidy warnings that are not useful here: it often complains about
    // missing attributes that have been deprecated since HTML4, and about
    // trimming empty elements. Blank lines are ignored as well.
    $ignorable = '/^(.*Warning: (trimming empty|.* lacks ".*?" attribute).*|\\s*)$/m';

    $reportLines = preg_split('/[\\r\\n]+/', $tidyReport);
    $relevant = preg_grep($ignorable, $reportLines, PREG_GREP_INVERT);

    $this->assertEmpty($relevant, implode("\n", $relevant));
}