Ejemplo n.º 1
0
 /**
  * Runs the parent fixture setup, then skips the test unless MWTidy
  * reports itself as enabled.
  */
 protected function setUp()
 {
     parent::setUp();
     $tidyAvailable = MWTidy::isEnabled();
     if ($tidyAvailable) {
         return;
     }
     $this->markTestSkipped('Tidy not found');
 }
Ejemplo n.º 2
0
 /**
  * Helper function for parse() that transforms half-parsed HTML into fully
  * parsed HTML.
  *
  * Steps, in order: unstrip general markers, apply the special-character
  * spacing fixes, run doBlockLevels(), replace link holders, run language
  * conversion (unless disabled), unstrip nowiki markers, replace transparent
  * tags, unstrip general markers again, normalize character references, and
  * finally either run MWTidy or apply a few regex-based nesting repairs.
  *
  * @param string $text
  * @param bool $isMain Whether to run the ParserAfterUnstrip,
  *   ParserBeforeTidy and ParserAfterTidy hooks
  * @param bool $linestart Passed on to doBlockLevels()
  * @return string
  */
 private function internalParseHalfParsed($text, $isMain = true, $linestart = true)
 {
     $text = $this->mStripState->unstripGeneral($text);
     // Avoid PHP 7.1 warning from passing $this by reference: the hooks
     // below receive a by-reference alias of the parser instead.
     $parser = $this;
     if ($isMain) {
         Hooks::run('ParserAfterUnstrip', array(&$parser, &$text));
     }
     # Clean up special characters, only run once, next-to-last before doBlockLevels
     # (\302\273 and \302\253 are the UTF-8 byte sequences of the right and
     # left guillemets.)
     $fixtags = array(
         '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ',
         '/(\\302\\253) /' => '\\1 ',
         '/ (!\\s*important)/' => ' \\1',
     );
     $text = preg_replace(array_keys($fixtags), array_values($fixtags), $text);
     $text = $this->doBlockLevels($text, $linestart);
     # The return value is not used here; presumably $text is updated by
     # reference -- confirm against replaceLinkHolders().
     $this->replaceLinkHolders($text);
     /**
      * The input doesn't get language converted if
      * a) It's disabled
      * b) Content isn't converted
      * c) It's a conversion table
      * d) it is an interface message (which is in the user language)
      */
     if (!($this->mOptions->getDisableContentConversion() || isset($this->mDoubleUnderscores['nocontentconvert']))) {
         if (!$this->mOptions->getInterfaceMessage()) {
             # The position of the convert() call should not be changed. it
             # assumes that the links are all replaced and the only thing left
             # is the <nowiki> mark.
             $text = $this->getConverterLanguage()->convert($text);
         }
     }
     $text = $this->mStripState->unstripNoWiki($text);
     if ($isMain) {
         Hooks::run('ParserBeforeTidy', array(&$parser, &$text));
     }
     $text = $this->replaceTransparentTags($text);
     $text = $this->mStripState->unstripGeneral($text);
     $text = Sanitizer::normalizeCharReferences($text);
     if (MWTidy::isEnabled() && $this->mOptions->getTidy()) {
         $text = MWTidy::tidy($text);
     } else {
         # attempt to sanitize at least some nesting problems
         # (bug #2702 and quite a few others)
         $tidyregs = array(
             # <b>/<i> markup that overlaps an anchor tag
             '/(<([bi])>)(<([bi])>)?([^<]*)(<\\/?a[^<]*>)([^<]*)(<\\/\\4>)?(<\\/\\2>)/' => '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
             # an anchor opened inside another anchor
             '/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\\/a>(.*)<\\/a>/' => '\\1\\2</a>\\3</a>\\1\\4</a>',
             # a <div> inside an <a>/<i>/<b> that has attributes: escape the div tags
             '/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\\/div>)([^<]*)(<\\/\\2>)/' => '\\1\\3&lt;div\\5&gt;\\6&lt;/div&gt;\\8\\9',
             # drop empty <b></b> and <i></i> pairs
             '/<([bi])><\\/\\1>/' => '',
         );
         $text = preg_replace(array_keys($tidyregs), array_values($tidyregs), $text);
     }
     if ($isMain) {
         Hooks::run('ParserAfterTidy', array(&$parser, &$text));
     }
     return $text;
 }
Ejemplo n.º 3
0
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments.
  *
  * The text is split on '<' and each fragment is matched against
  * self::ELEMENT_BITS_REGEX. Fragments that name a recognized element and
  * pass Sanitizer::validateTag() are re-emitted with their attributes run
  * through Sanitizer::fixTagAttributes(); every other fragment is emitted
  * as plain text with its leading '<' written as &lt; and any '>' as &gt;.
  *
  * When MWTidy is not enabled, open elements are additionally tracked on a
  * stack so that mismatched closing tags, misplaced table elements and
  * disallowed nesting are rejected, and closing tags are appended for
  * elements still open at the end. When MWTidy is enabled, no nesting
  * checks are performed here.
  *
  * @param string $text
  * @param callable $processCallback Callback to do any variable or parameter
  *   replacements in HTML attribute values; it is called with the attribute
  *   string (by reference) and $args
  * @param array|bool $args Arguments for the processing callback
  * @param array $extratags For any extra tags to include
  * @param array $removetags For any tags (default or extra) to exclude
  * @return string
  */
 public static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array())
 {
     # The lookup tables used below ($htmlelements, $htmlsingleonly,
     # $htmlsingleallowed, $htmlsingle, $htmlpairs, $htmlnest, $htmllist,
     # $listtags, $tabletags) are not assigned anywhere else in this method,
     # so they presumably come from this extract() call -- confirm against
     # getRecognizedTagData().
     extract(self::getRecognizedTagData($extratags, $removetags));
     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments($text);
     $bits = explode('<', $text);
     # The first piece precedes any '<', so it cannot contain a tag; only
     # stray '>' characters need escaping.
     $text = str_replace('>', '&gt;', array_shift($bits));
     if (!MWTidy::isEnabled()) {
         # $tagstack: currently open elements; $tablestack: tag stacks saved
         # while inside a <table>
         $tagstack = $tablestack = array();
         foreach ($bits as $x) {
             $regs = array();
             # $slash: Does the current element start with a '/'?
             # $t: Current element name
             # $params: String between element name and >
             # $brace: Ending '>' or '/>'
             # $rest: Everything until the next element of $bits
             if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                 list(, $slash, $t, $params, $brace, $rest) = $regs;
             } else {
                 # Not element syntax: leave everything null so that the
                 # fragment is escaped at the bottom of the loop (assuming
                 # the empty string is not a key of $htmlelements).
                 $slash = $t = $params = $brace = $rest = null;
             }
             $badtag = false;
             if (isset($htmlelements[$t = strtolower($t)])) {
                 # Check our stack
                 if ($slash && isset($htmlsingleonly[$t])) {
                     # A closing tag for an element listed in $htmlsingleonly
                     $badtag = true;
                 } elseif ($slash) {
                     # Closing a tag... is it the one we just opened?
                     # (Warnings are suppressed around each pop; $tagstack can
                     # be null after restoring from an empty $tablestack.)
                     MediaWiki\suppressWarnings();
                     $ot = array_pop($tagstack);
                     MediaWiki\restoreWarnings();
                     if ($ot != $t) {
                         if (isset($htmlsingleallowed[$ot])) {
                             # Pop all elements with an optional close tag
                             # and see if we find a match below them
                             $optstack = array();
                             array_push($optstack, $ot);
                             MediaWiki\suppressWarnings();
                             $ot = array_pop($tagstack);
                             MediaWiki\restoreWarnings();
                             while ($ot != $t && isset($htmlsingleallowed[$ot])) {
                                 array_push($optstack, $ot);
                                 MediaWiki\suppressWarnings();
                                 $ot = array_pop($tagstack);
                                 MediaWiki\restoreWarnings();
                             }
                             if ($t != $ot) {
                                 # No match. Push the optional elements back again
                                 $badtag = true;
                                 MediaWiki\suppressWarnings();
                                 $ot = array_pop($optstack);
                                 MediaWiki\restoreWarnings();
                                 while ($ot) {
                                     array_push($tagstack, $ot);
                                     MediaWiki\suppressWarnings();
                                     $ot = array_pop($optstack);
                                     MediaWiki\restoreWarnings();
                                 }
                             }
                         } else {
                             # Put the popped element back; it was not the
                             # one being closed.
                             MediaWiki\suppressWarnings();
                             array_push($tagstack, $ot);
                             MediaWiki\restoreWarnings();
                             # <li> can be nested in <ul> or <ol>, skip those cases:
                             if (!isset($htmllist[$ot]) || !isset($listtags[$t])) {
                                 $badtag = true;
                             }
                         }
                     } else {
                         # Matching close tag. Closing a table restores the
                         # tag stack that was saved when the table was opened.
                         if ($t == 'table') {
                             $tagstack = array_pop($tablestack);
                         }
                     }
                     # Closing tags are emitted without attributes
                     $newparams = '';
                 } else {
                     # Keep track for later
                     if (isset($tabletags[$t]) && !in_array('table', $tagstack)) {
                         # Table element with no <table> on the stack
                         $badtag = true;
                     } elseif (in_array($t, $tagstack) && !isset($htmlnest[$t])) {
                         # Already open and not listed in $htmlnest
                         $badtag = true;
                         # Is it a self closed htmlpair ? (bug 5487)
                     } elseif ($brace == '/>' && isset($htmlpairs[$t])) {
                         $badtag = true;
                     } elseif (isset($htmlsingleonly[$t])) {
                         # Hack to force empty tag for unclosable elements
                         $brace = '/>';
                     } elseif (isset($htmlsingle[$t])) {
                         # Hack to not close $htmlsingle tags
                         $brace = null;
                         # Still need to push this optionally-closed tag to
                         # the tag stack so that we can match end tags
                         # instead of marking them as bad.
                         array_push($tagstack, $t);
                     } elseif (isset($tabletags[$t]) && in_array($t, $tagstack)) {
                         // New table tag but forgot to close the previous one
                         $text .= "</{$t}>";
                     } else {
                         # Opening a table saves the current tag stack and
                         # starts a fresh one for the table's contents.
                         if ($t == 'table') {
                             array_push($tablestack, $tagstack);
                             $tagstack = array();
                         }
                         array_push($tagstack, $t);
                     }
                     # Replace any variables or template parameters with
                     # plaintext results.
                     if (is_callable($processCallback)) {
                         call_user_func_array($processCallback, array(&$params, $args));
                     }
                     if (!Sanitizer::validateTag($params, $t)) {
                         $badtag = true;
                     }
                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes($params, $t);
                 }
                 if (!$badtag) {
                     $rest = str_replace('>', '&gt;', $rest);
                     # Only opening tags ending in '/>' get the ' /' suffix
                     $close = $brace == '/>' && !$slash ? ' /' : '';
                     $text .= "<{$slash}{$t}{$newparams}{$close}>{$rest}";
                     continue;
                 }
             }
             # Unrecognized or rejected tag: emit the fragment as escaped text
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
         # Close off any remaining tags
         while (is_array($tagstack) && ($t = array_pop($tagstack))) {
             $text .= "</{$t}>\n";
             if ($t == 'table') {
                 $tagstack = array_pop($tablestack);
             }
         }
     } else {
         # this might be possible using tidy itself
         # No tag-stack bookkeeping in this branch: tags are only checked
         # against $htmlelements and Sanitizer::validateTag().
         foreach ($bits as $x) {
             if (preg_match(self::ELEMENT_BITS_REGEX, $x, $regs)) {
                 list(, $slash, $t, $params, $brace, $rest) = $regs;
                 $badtag = false;
                 if (isset($htmlelements[$t = strtolower($t)])) {
                     if (is_callable($processCallback)) {
                         call_user_func_array($processCallback, array(&$params, $args));
                     }
                     if (!Sanitizer::validateTag($params, $t)) {
                         $badtag = true;
                     }
                     $newparams = Sanitizer::fixTagAttributes($params, $t);
                     if (!$badtag) {
                         $rest = str_replace('>', '&gt;', $rest);
                         # $brace is re-emitted exactly as it was matched
                         $text .= "<{$slash}{$t}{$newparams}{$brace}{$rest}";
                         continue;
                     }
                 }
             }
             # Unrecognized or rejected tag: emit the fragment as escaped text
             $text .= '&lt;' . str_replace('>', '&gt;', $x);
         }
     }
     return $text;
 }
Ejemplo n.º 4
0
 /**
  * Asserts that Tidy reports no relevant problems for the given HTML
  * document.
  *
  * @since 1.23
  *
  * @note The test is marked as skipped when $wgTidyInternal is false or
  *        MWTidy is not enabled.
  * @note $wgUseTidy is ignored, so HTML can be validated even (and
  *        especially) when automatic tidying is disabled.
  *
  * @param string $html A complete HTML document
  */
 protected function assertValidHtmlDocument($html)
 {
     // Validation only happens with the tidy PHP extension. With
     // wgTidyInternal set to false, MWTidy would fall back to the command
     // line tidy, and a failed validation could then mean either malformed
     // HTML or a missing command line tool. On systems without tidy that
     // would make every HTML assertion fail, so the test is skipped instead.
     $canValidate = $GLOBALS['wgTidyInternal'] && MWTidy::isEnabled();
     if (!$canValidate) {
         $this->markTestSkipped('Tidy extension not installed');
     }

     $tidyReport = '';
     MWTidy::checkErrors($html, $tidyReport);

     // Lines matching this pattern are dropped: blank lines and Tidy
     // warnings that are not useful here. Tidy often complains, for example,
     // about missing attributes that have been deprecated since HTML4.
     $ignorable = '/^(.*Warning: (trimming empty|.* lacks ".*?" attribute).*|\\s*)$/m';
     $reportLines = preg_split('/[\\r\\n]+/', $tidyReport);
     $relevant = preg_grep($ignorable, $reportLines, PREG_GREP_INVERT);

     $this->assertEmpty($relevant, implode("\n", $relevant));
 }