示例#1
0
 /**
  * Extract parts of the text - opening, main and auxiliary.
  *
  * Fills $this->openingText, $this->allText and $this->auxText from the
  * rendered parser output. $this->allText doubles as the "already done"
  * marker: once it is non-null the method returns immediately. When the
  * rendered text is empty only $this->allText is set (to an empty string).
  */
 private function extractWikitextParts()
 {
     if (!is_null($this->allText)) {
         // Already extracted on a previous call.
         return;
     }
     // Edit-section tokens and the table of contents are switched off on the
     // parser output before its text is fetched.
     $this->parserOutput->setEditSectionTokens(false);
     $this->parserOutput->setTOCEnabled(false);
     $text = $this->parserOutput->getText();
     if (strlen($text) == 0) {
         $this->allText = "";
         // empty text - nothing to seek here
         return;
     }
     // The opening text is taken from the unmodified rendered text, before the
     // <br> spacing tweak and the element stripping below.
     $this->openingText = $this->extractHeadingBeforeFirstHeading($text);
     // Add extra spacing around break tags so text crammed together like<br>this
     // doesn't make one word.
     $text = str_replace('<br', "\n<br", $text);
     $formatter = new HtmlFormatter($text);
     // Strip elements from the page that we never want in the search text.
     $formatter->remove($this->excludedElementSelectors);
     $formatter->filterContent();
     // Strip elements from the page that are auxiliary text.  These will still be
     // searched but matches will be ranked lower and non-auxiliary matches will be
     // preferred in highlighting.
     $formatter->remove($this->auxiliaryElementSelectors);
     $auxiliaryElements = $formatter->filterContent();
     // What remains in the formatter is the main text.
     $this->allText = trim(Sanitizer::stripAllTags($formatter->getText()));
     // Each element returned by the second filter pass becomes one auxiliary entry.
     foreach ($auxiliaryElements as $auxiliaryElement) {
         $this->auxText[] = trim(Sanitizer::stripAllTags($formatter->getText($auxiliaryElement)));
     }
 }
示例#2
0
 /**
  * This function accomplishes several tasks:
  * 1) Auto-number headings if that option is enabled
  * 2) Add an [edit] link to sections for users who have enabled the option and can edit the page
  * 3) Add a Table of contents on the top for users who have enabled the option
  * 4) Auto-anchor headings
  *
  * It loops through all headlines, collects the necessary data, then splits up the
  * string and re-inserts the newly formatted headlines.
  *
  * Side effects visible in this method: it may enable edit section tokens, set
  * or hide the "new section" link and set the TOC HTML on $this->mOutput, it
  * may set $this->mShowToc, and it fires the 'ParserSectionCreate' hook once
  * per section. The output type is switched to OT_WIKI while the headings are
  * processed and restored afterwards.
  *
  * @param string $text text containing the <h1>..<h6> headings to process
  * @param string $origText original, untouched wikitext
  * @param bool $isMain when true, the collected section data is stored on the
  *   parser output and the TOC is inserted after the first block (unless a
  *   forced TOC position is in use)
  * @return string the text with every heading replaced by its formatted form
  * @private
  */
 function formatHeadings($text, $origText, $isMain = true)
 {
     global $wgMaxTocLevel, $wgExperimentalHtmlIds;
     # Inhibit editsection links if requested in the page
     if (isset($this->mDoubleUnderscores['noeditsection'])) {
         $maybeShowEditLink = $showEditLink = false;
     } else {
         $maybeShowEditLink = true;
         /* Actual presence will depend on ParserOptions option */
         $showEditLink = $this->mOptions->getEditSection();
     }
     if ($showEditLink) {
         $this->mOutput->setEditSectionTokens(true);
     }
     # Get all headlines for numbering them and adding funky stuff like [edit]
     # links - this is for later, but we need the number of headlines right now
     $matches = array();
     $numMatches = preg_match_all('/<H(?P<level>[1-6])(?P<attrib>.*?' . '>)\\s*(?P<header>[\\s\\S]*?)\\s*<\\/H[1-6] *>/i', $text, $matches);
     # if there are fewer than 4 headlines in the article, do not show TOC
     # unless it's been explicitly enabled.
     $enoughToc = $this->mShowToc && ($numMatches >= 4 || $this->mForceTocPosition);
     # Allow user to stipulate that a page should have a "new section"
     # link added via __NEWSECTIONLINK__
     if (isset($this->mDoubleUnderscores['newsectionlink'])) {
         $this->mOutput->setNewSection(true);
     }
     # Allow user to remove the "new section"
     # link via __NONEWSECTIONLINK__
     if (isset($this->mDoubleUnderscores['nonewsectionlink'])) {
         $this->mOutput->hideNewSection(true);
     }
     # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
     # override above conditions and always show TOC above first header
     if (isset($this->mDoubleUnderscores['forcetoc'])) {
         $this->mShowToc = true;
         $enoughToc = true;
     }
     # headline counter
     $headlineCount = 0;
     $numVisible = 0;
     # Ugh .. the TOC should have neat indentation levels which can be
     # passed to the skin functions. These are determined here
     $toc = '';
     $full = '';
     $head = array();
     $sublevelCount = array();
     $levelCount = array();
     $level = 0;
     $prevlevel = 0;
     $toclevel = 0;
     $prevtoclevel = 0;
     # Headlines may start with a marker carrying a serial number that indexes
     # into $this->mHeadings (title text + section index).
     $markerRegex = "{$this->mUniqPrefix}-h-(\\d+)-" . self::MARKER_SUFFIX;
     $baseTitleText = $this->mTitle->getPrefixedDBkey();
     $oldType = $this->mOutputType;
     $this->setOutputType(self::OT_WIKI);
     $frame = $this->getPreprocessor()->newFrame();
     $root = $this->preprocessToDom($origText);
     $node = $root->getFirstChild();
     $byteOffset = 0;
     $tocraw = array();
     # Lower-cased anchor => number of times it has been seen so far
     $refers = array();
     foreach ($matches[3] as $headline) {
         $isTemplate = false;
         $titleText = false;
         $sectionIndex = false;
         $numbering = '';
         $markerMatches = array();
         if (preg_match("/^{$markerRegex}/", $headline, $markerMatches)) {
             $serial = $markerMatches[1];
             list($titleText, $sectionIndex) = $this->mHeadings[$serial];
             # A heading whose recorded title differs from the current page is
             # treated as coming from a template
             $isTemplate = $titleText != $baseTitleText;
             $headline = preg_replace("/^{$markerRegex}\\s*/", "", $headline);
         }
         if ($toclevel) {
             $prevlevel = $level;
         }
         $level = $matches[1][$headlineCount];
         if ($level > $prevlevel) {
             # Increase TOC level
             $toclevel++;
             $sublevelCount[$toclevel] = 0;
             if ($toclevel < $wgMaxTocLevel) {
                 $prevtoclevel = $toclevel;
                 $toc .= Linker::tocIndent();
                 $numVisible++;
             }
         } elseif ($level < $prevlevel && $toclevel > 1) {
             # Decrease TOC level, find level to jump to
             for ($i = $toclevel; $i > 0; $i--) {
                 if ($levelCount[$i] == $level) {
                     # Found last matching level
                     $toclevel = $i;
                     break;
                 } elseif ($levelCount[$i] < $level) {
                     # Found first matching level below current level
                     $toclevel = $i + 1;
                     break;
                 }
             }
             # Loop ran to completion without a match: fall back to the top level
             if ($i == 0) {
                 $toclevel = 1;
             }
             if ($toclevel < $wgMaxTocLevel) {
                 if ($prevtoclevel < $wgMaxTocLevel) {
                     # Unindent only if the previous toc level was shown :p
                     $toc .= Linker::tocUnindent($prevtoclevel - $toclevel);
                     $prevtoclevel = $toclevel;
                 } else {
                     $toc .= Linker::tocLineEnd();
                 }
             }
         } else {
             # No change in level, end TOC line
             if ($toclevel < $wgMaxTocLevel) {
                 $toc .= Linker::tocLineEnd();
             }
         }
         $levelCount[$toclevel] = $level;
         # count number of headlines for each level
         $sublevelCount[$toclevel]++;
         # Build the dotted section number (e.g. "1.2.3") from the per-level counters
         $dot = 0;
         for ($i = 1; $i <= $toclevel; $i++) {
             if (!empty($sublevelCount[$i])) {
                 if ($dot) {
                     $numbering .= '.';
                 }
                 $numbering .= $this->getTargetLanguage()->formatNum($sublevelCount[$i]);
                 $dot = 1;
             }
         }
         # The safe header is a version of the header text safe to use for links
         # Remove link placeholders by the link text.
         #     <!--LINK number-->
         # turns into
         #     link text with suffix
         # Do this before unstrip since link text can contain strip markers
         $safeHeadline = $this->replaceLinkHoldersText($headline);
         # Avoid insertion of weird stuff like <math> by expanding the relevant sections
         $safeHeadline = $this->mStripState->unstripBoth($safeHeadline);
         # Strip out HTML (first regex removes any tag not allowed)
         # Allowed tags are:
         # * <sup> and <sub> (bug 8393)
         # * <i> (bug 26375)
         # * <b> (r105284)
         # * <span dir="rtl"> and <span dir="ltr"> (bug 35167)
         #
         # We strip any parameter from accepted tags (second regex), except dir="rtl|ltr" from <span>,
         # to allow setting directionality in toc items.
         $tocline = preg_replace(array('#<(?!/?(span|sup|sub|i|b)(?: [^>]*)?>).*?' . '>#', '#<(/?(?:span(?: dir="(?:rtl|ltr)")?|sup|sub|i|b))(?: .*?)?' . '>#'), array('', '<$1>'), $safeHeadline);
         $tocline = trim($tocline);
         # For the anchor, strip out HTML-y stuff period
         $safeHeadline = preg_replace('/<.*?' . '>/', '', $safeHeadline);
         $safeHeadline = Sanitizer::normalizeSectionNameWhitespace($safeHeadline);
         # Save headline for section edit hint before it's escaped
         $headlineHint = $safeHeadline;
         if ($wgExperimentalHtmlIds) {
             # For reverse compatibility, provide an id that's
             # HTML4-compatible, like we used to.
             #
             # It may be worth noting, academically, that it's possible for
             # the legacy anchor to conflict with a non-legacy headline
             # anchor on the page.  In this case likely the "correct" thing
             # would be to either drop the legacy anchors or make sure
             # they're numbered first.  However, this would require people
             # to type in section names like "abc_.D7.93.D7.90.D7.A4"
             # manually, so let's not bother worrying about it.
             $legacyHeadline = Sanitizer::escapeId($safeHeadline, array('noninitial', 'legacy'));
             $safeHeadline = Sanitizer::escapeId($safeHeadline);
             if ($legacyHeadline == $safeHeadline) {
                 # No reason to have both (in fact, we can't)
                 $legacyHeadline = false;
             }
         } else {
             $legacyHeadline = false;
             $safeHeadline = Sanitizer::escapeId($safeHeadline, 'noninitial');
         }
         # HTML names must be case-insensitively unique (bug 10721).
         # This does not apply to Unicode characters per
         # http://dev.w3.org/html5/spec/infrastructure.html#case-sensitivity-and-string-comparison
         # @todo FIXME: We may be changing them depending on the current locale.
         $arrayKey = strtolower($safeHeadline);
         if ($legacyHeadline === false) {
             $legacyArrayKey = false;
         } else {
             $legacyArrayKey = strtolower($legacyHeadline);
         }
         # count how many in assoc. array so we can track dupes in anchors
         if (isset($refers[$arrayKey])) {
             $refers[$arrayKey]++;
         } else {
             $refers[$arrayKey] = 1;
         }
         # Only track the legacy key when there actually is a legacy anchor.
         # PHP casts a boolean false array key to integer 0, which is the same
         # slot as the string key "0"; counting it unconditionally would make a
         # heading whose anchor is "0" look like a duplicate.
         if ($legacyArrayKey !== false) {
             if (isset($refers[$legacyArrayKey])) {
                 $refers[$legacyArrayKey]++;
             } else {
                 $refers[$legacyArrayKey] = 1;
             }
         }
         # Don't number the heading if it is the only one (looks silly)
         if (count($matches[3]) > 1 && $this->mOptions->getNumberHeadings()) {
             # the two are different if the line contains a link
             $headline = Html::element('span', array('class' => 'mw-headline-number'), $numbering) . ' ' . $headline;
         }
         # Create the anchor for linking from the TOC to the section
         $anchor = $safeHeadline;
         $legacyAnchor = $legacyHeadline;
         # Repeated anchors get a "_N" suffix, N being the occurrence count
         if ($refers[$arrayKey] > 1) {
             $anchor .= '_' . $refers[$arrayKey];
         }
         if ($legacyHeadline !== false && $refers[$legacyArrayKey] > 1) {
             $legacyAnchor .= '_' . $refers[$legacyArrayKey];
         }
         if ($enoughToc && (!isset($wgMaxTocLevel) || $toclevel < $wgMaxTocLevel)) {
             $toc .= Linker::tocLine($anchor, $tocline, $numbering, $toclevel, $isTemplate ? false : $sectionIndex);
         }
         # Add the section to the section tree
         # Find the DOM node for this header
         $noOffset = $isTemplate || $sectionIndex === false;
         while ($node && !$noOffset) {
             if ($node->getName() === 'h') {
                 $bits = $node->splitHeading();
                 if ($bits['i'] == $sectionIndex) {
                     break;
                 }
             }
             # NOTE(review): mb_strlen() counts characters in the active mbstring
             # encoding, not bytes, even though the value is reported as
             # 'byteoffset' - confirm what consumers expect before changing.
             $byteOffset += mb_strlen($this->mStripState->unstripBoth($frame->expand($node, PPFrame::RECOVER_ORIG)));
             $node = $node->getNextSibling();
         }
         $tocraw[] = array('toclevel' => $toclevel, 'level' => $level, 'line' => $tocline, 'number' => $numbering, 'index' => ($isTemplate ? 'T-' : '') . $sectionIndex, 'fromtitle' => $titleText, 'byteoffset' => $noOffset ? null : $byteOffset, 'anchor' => $anchor);
         # give headline the correct <h#> tag
         if ($maybeShowEditLink && $sectionIndex !== false) {
             // Output edit section links as markers with styles that can be customized by skins
             if ($isTemplate) {
                 # Put a T flag in the section identifier, to indicate to extractSections()
                 # that sections inside <includeonly> should be counted.
                 $editlinkArgs = array($titleText, "T-{$sectionIndex}");
             } else {
                 $editlinkArgs = array($this->mTitle->getPrefixedText(), $sectionIndex, $headlineHint);
             }
             // We use a bit of pseudo-xml for editsection markers. The language converter is run later on
             // Using a UNIQ style marker leads to the converter screwing up the tokens when it converts stuff
             // And trying to insert strip tags fails too. At this point all real inputted tags have already been escaped
             // so we don't have to worry about a user trying to input one of these markers directly.
             // We use a page and section attribute to stop the language converter from converting these important bits
             // of data, but put the headline hint inside a content block because the language converter is supposed to
             // be able to convert that piece of data.
             $editlink = '<mw:editsection page="' . htmlspecialchars($editlinkArgs[0]);
             $editlink .= '" section="' . htmlspecialchars($editlinkArgs[1]) . '"';
             if (isset($editlinkArgs[2])) {
                 $editlink .= '>' . $editlinkArgs[2] . '</mw:editsection>';
             } else {
                 $editlink .= '/>';
             }
         } else {
             $editlink = '';
         }
         $head[$headlineCount] = Linker::makeHeadline($level, $matches['attrib'][$headlineCount], $anchor, $headline, $editlink, $legacyAnchor);
         $headlineCount++;
     }
     $this->setOutputType($oldType);
     # Never ever show TOC if no headers
     if ($numVisible < 1) {
         $enoughToc = false;
     }
     if ($enoughToc) {
         # Close any TOC levels that are still open
         if ($prevtoclevel > 0 && $prevtoclevel < $wgMaxTocLevel) {
             $toc .= Linker::tocUnindent($prevtoclevel - 1);
         }
         $toc = Linker::tocList($toc, $this->mOptions->getUserLangObj());
         $this->mOutput->setTOCHTML($toc);
         $toc = self::TOC_START . $toc . self::TOC_END;
     }
     if ($isMain) {
         $this->mOutput->setSections($tocraw);
     }
     # split up and insert constructed headlines
     # NOTE(review): unlike the matching pattern above, this pattern does not
     # accept spaces before the closing '>' of </H#> - confirm whether the two
     # are meant to agree.
     $blocks = preg_split('/<H[1-6].*?' . '>[\\s\\S]*?<\\/H[1-6]>/i', $text);
     $i = 0;
     // build an array of document sections
     $sections = array();
     foreach ($blocks as $block) {
         // $head is zero-based, sections aren't.
         if (empty($head[$i - 1])) {
             $sections[$i] = $block;
         } else {
             $sections[$i] = $head[$i - 1] . $block;
         }
         /**
          * Send a hook, one per section.
          * The idea here is to be able to make section-level DIVs, but to do so in a
          * lower-impact, more correct way than r50769
          *
          * $this : caller
          * $section : the section number
          * &$sectionContent : ref to the content of the section
          * $showEditLinks : boolean describing whether this section has an edit link
          */
         wfRunHooks('ParserSectionCreate', array($this, $i, &$sections[$i], $showEditLink));
         $i++;
     }
     if ($enoughToc && $isMain && !$this->mForceTocPosition) {
         // Place the TOC at the end of the first block, i.e. the content
         // that precedes the first heading
         // Top anchor now in skin
         $sections[0] = $sections[0] . $toc . "\n";
     }
     $full .= join('', $sections);
     if ($this->mForceTocPosition) {
         // A forced position is marked in the text by a placeholder comment
         return str_replace('<!--MWTOC-->', $toc, $full);
     } else {
         return $full;
     }
 }