Example #1
0
 /**
  * Preprocess some wikitext and return the document tree.
  * This is the ghost of Parser::replace_variables().
  *
  * The text is scanned once, left to right, and an XML string is accumulated
  * (<root>, <template>, <tplarg>, <ext>, <h>, <comment>, <ignore>, ...). That
  * string is then loaded into a DOMDocument and wrapped in a PPNode_DOM.
  *
  * @param string $text The text to parse
  * @param int $flags Bitwise combination of:
  *          Parser::PTD_FOR_INCLUSION    Handle <noinclude>/<includeonly> as if the text is being
  *                                     included. Default is to assume a direct page view.
  *
  * The generated DOM tree must depend only on the input text and the flags.
  * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
  *
  * Any flag added to the $flags parameter here, or any other parameter liable to cause a
  * change in the DOM tree for a given text, must be passed through the section identifier
  * in the section edit link and thus back to extractSections().
  *
  * The output of this function is currently only cached in process memory, but a persistent
  * cache may be implemented at a later date which takes further advantage of these strict
  * dependency requirements.
  *
  * @return PPNode_DOM Node wrapping the root element of the generated document
  * @throws MWException If the generated XML cannot be loaded, even after UtfNormal::cleanUp()
  *
  * @private
  */
 function preprocessToObj($text, $flags = 0)
 {
     wfProfileIn(__METHOD__);
     wfProfileIn(__METHOD__ . '-makexml');
     // Bracket rules: opening character => closing character, the element name for
     // each usable bracket count (null means "literal text, no element"), and the
     // minimum/maximum number of brackets that can form a construct.
     $rules = array('{' => array('end' => '}', 'names' => array(2 => 'template', 3 => 'tplarg'), 'min' => 2, 'max' => 3), '[' => array('end' => ']', 'names' => array(2 => null), 'min' => 2, 'max' => 2));
     $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
     $xmlishElements = $this->parser->getStripList();
     $enableOnlyinclude = false;
     if ($forInclusion) {
         $ignoredTags = array('includeonly', '/includeonly');
         $ignoredElements = array('noinclude');
         $xmlishElements[] = 'noinclude';
         // <onlyinclude> handling is only enabled when both the opening and the closing tag occur
         if (strpos($text, '<onlyinclude>') !== false && strpos($text, '</onlyinclude>') !== false) {
             $enableOnlyinclude = true;
         }
     } else {
         $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude');
         $ignoredElements = array('includeonly');
         $xmlishElements[] = 'includeonly';
     }
     $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags));
     // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
     $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA";
     $stack = new PPDStack();
     // Characters that always end a literal run: the two opening brackets from $rules,
     // the start of a tag, and a newline
     $searchBase = "[{<\n";
     // Reversed copy of the input, for fast reverse searches
     $revText = strrev($text);
     // Input pointer, starts out pointing to a pseudo-newline before the start
     $i = 0;
     // Current accumulator
     $accum =& $stack->getAccum();
     $accum = '<root>';
     // True to find equals signs in arguments
     $findEquals = false;
     // True to take notice of pipe characters
     $findPipe = false;
     $headingIndex = 1;
     // True if $i is inside a possible heading
     $inHeading = false;
     // True if there are no more greater-than (>) signs right of $i
     $noMoreGT = false;
     // True to ignore all input up to the next <onlyinclude>
     $findOnlyinclude = $enableOnlyinclude;
     // Do a line-start run without outputting an LF character
     $fakeLineStart = true;
     while (true) {
         //$this->memCheck();
         if ($findOnlyinclude) {
             // Ignore all input up to the next <onlyinclude>
             $startPos = strpos($text, '<onlyinclude>', $i);
             if ($startPos === false) {
                 // Ignored section runs to the end
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $i)) . '</ignore>';
                 break;
             }
             // Past-the-end position of the opening tag
             $tagEndPos = $startPos + strlen('<onlyinclude>');
             $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i)) . '</ignore>';
             $i = $tagEndPos;
             $findOnlyinclude = false;
         }
         if ($fakeLineStart) {
             $found = 'line-start';
             $curChar = '';
         } else {
             // Find next opening brace, closing brace or pipe
             $search = $searchBase;
             if ($stack->top === false) {
                 $currentClosing = '';
             } else {
                 $currentClosing = $stack->top->close;
                 $search .= $currentClosing;
             }
             if ($findPipe) {
                 $search .= '|';
             }
             if ($findEquals) {
                 // First equals will be for the template
                 $search .= '=';
             }
             $rule = null;
             // Output literal section, advance input counter
             $literalLength = strcspn($text, $search, $i);
             if ($literalLength > 0) {
                 $accum .= htmlspecialchars(substr($text, $i, $literalLength));
                 $i += $literalLength;
             }
             if ($i >= strlen($text)) {
                 if ($currentClosing == "\n") {
                     // Do a past-the-end run to finish off the heading
                     $curChar = '';
                     $found = 'line-end';
                 } else {
                     // All done
                     break;
                 }
             } else {
                 $curChar = $text[$i];
                 if ($curChar == '|') {
                     $found = 'pipe';
                 } elseif ($curChar == '=') {
                     $found = 'equals';
                 } elseif ($curChar == '<') {
                     $found = 'angle';
                 } elseif ($curChar == "\n") {
                     if ($inHeading) {
                         $found = 'line-end';
                     } else {
                         $found = 'line-start';
                     }
                 } elseif ($curChar == $currentClosing) {
                     $found = 'close';
                 } elseif (isset($rules[$curChar])) {
                     $found = 'open';
                     $rule = $rules[$curChar];
                 } else {
                     // Some versions of PHP have a strcspn which stops on null characters
                     // Ignore and continue
                     ++$i;
                     continue;
                 }
             }
         }
         if ($found == 'angle') {
             $matches = false;
             // Handle </onlyinclude>
             if ($enableOnlyinclude && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>') {
                 $findOnlyinclude = true;
                 continue;
             }
             // Determine element name
             if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) {
                 // Element name missing or not listed
                 $accum .= '&lt;';
                 ++$i;
                 continue;
             }
             // Handle comments
             if (isset($matches[2]) && $matches[2] == '!--') {
                 // To avoid leaving blank lines, when a comment is both preceded
                 // and followed by a newline (ignoring spaces), trim leading and
                 // trailing spaces and one of the newlines.
                 // Find the end
                 $endPos = strpos($text, '-->', $i + 4);
                 if ($endPos === false) {
                     // Unclosed comment in input, runs to end
                     $inner = substr($text, $i);
                     $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                     $i = strlen($text);
                 } else {
                     // Search backwards for leading whitespace
                     $wsStart = $i ? $i - strspn($revText, ' ', strlen($text) - $i) : 0;
                     // Search forwards for trailing whitespace
                     // $wsEnd will be the position of the last space
                     $wsEnd = $endPos + 2 + strspn($text, ' ', $endPos + 3);
                     // Eat the line if possible
                     // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                     // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                     // it's a possible beneficial b/c break.
                     if ($wsStart > 0 && substr($text, $wsStart - 1, 1) == "\n" && substr($text, $wsEnd + 1, 1) == "\n") {
                         $startPos = $wsStart;
                         $endPos = $wsEnd + 1;
                         // Remove leading whitespace from the end of the accumulator
                         // Sanity check first though
                         $wsLength = $i - $wsStart;
                         if ($wsLength > 0 && substr($accum, -$wsLength) === str_repeat(' ', $wsLength)) {
                             $accum = substr($accum, 0, -$wsLength);
                         }
                         // Do a line-start run next time to look for headings after the comment
                         $fakeLineStart = true;
                     } else {
                         // No line to eat, just take the comment itself
                         $startPos = $i;
                         $endPos += 2;
                     }
                     // Record where the comment sits on the current part; the line-end
                     // handler reads commentEnd/visualEnd when looking for closing equals signs
                     if ($stack->top) {
                         $part = $stack->top->getCurrentPart();
                         if (isset($part->commentEnd) && $part->commentEnd == $wsStart - 1) {
                             // Comments abutting, no change in visual end
                             $part->commentEnd = $wsEnd;
                         } else {
                             $part->visualEnd = $wsStart;
                             $part->commentEnd = $endPos;
                         }
                     }
                     $i = $endPos + 1;
                     $inner = substr($text, $startPos, $endPos - $startPos + 1);
                     $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                 }
                 continue;
             }
             $name = $matches[1];
             $lowerName = strtolower($name);
             $attrStart = $i + strlen($name) + 1;
             // Find end of tag
             $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart);
             if ($tagEndPos === false) {
                 // Infinite backtrack
                 // Disable tag search to prevent worst-case O(N^2) performance
                 $noMoreGT = true;
                 $accum .= '&lt;';
                 ++$i;
                 continue;
             }
             // Handle ignored tags
             if (in_array($lowerName, $ignoredTags)) {
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i + 1)) . '</ignore>';
                 $i = $tagEndPos + 1;
                 continue;
             }
             $tagStartPos = $i;
             if ($text[$tagEndPos - 1] == '/') {
                 // Self-closing tag: no inner content and no close tag
                 $attrEnd = $tagEndPos - 1;
                 $inner = null;
                 $i = $tagEndPos + 1;
                 $close = '';
             } else {
                 $attrEnd = $tagEndPos;
                 // Find closing tag. The name is quoted so that any regex
                 // metacharacter (or the "/" delimiter) in it is matched literally.
                 if (preg_match("/<\\/" . preg_quote($name, '/') . "\\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)) {
                     $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1);
                     $i = $matches[0][1] + strlen($matches[0][0]);
                     $close = '<close>' . htmlspecialchars($matches[0][0]) . '</close>';
                 } else {
                     // No end tag -- let it run out to the end of the text.
                     $inner = substr($text, $tagEndPos + 1);
                     $i = strlen($text);
                     $close = '';
                 }
             }
             // <includeonly> and <noinclude> just become <ignore> tags
             if (in_array($lowerName, $ignoredElements)) {
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $tagStartPos, $i - $tagStartPos)) . '</ignore>';
                 continue;
             }
             $accum .= '<ext>';
             if ($attrEnd <= $attrStart) {
                 $attr = '';
             } else {
                 $attr = substr($text, $attrStart, $attrEnd - $attrStart);
             }
             $accum .= '<name>' . htmlspecialchars($name) . '</name>' . '<attr>' . htmlspecialchars($attr) . '</attr>';
             if ($inner !== null) {
                 $accum .= '<inner>' . htmlspecialchars($inner) . '</inner>';
             }
             $accum .= $close . '</ext>';
         } elseif ($found == 'line-start') {
             // Is this the start of a heading?
             // Line break belongs before the heading element in any case
             if ($fakeLineStart) {
                 $fakeLineStart = false;
             } else {
                 $accum .= $curChar;
                 $i++;
             }
             // Count at most six leading equals signs
             $count = strspn($text, '=', $i, 6);
             if ($count == 1 && $findEquals) {
                 // DWIM: This looks kind of like a name/value separator
                 // Let's let the equals handler have it and break the potential heading
                 // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
             } elseif ($count > 0) {
                 $piece = array('open' => "\n", 'close' => "\n", 'parts' => array(new PPDPart(str_repeat('=', $count))), 'startPos' => $i, 'count' => $count);
                 $stack->push($piece);
                 $accum =& $stack->getAccum();
                 // Refresh the local flag variables from the new top of the stack
                 extract($stack->getFlags());
                 $i += $count;
             }
         } elseif ($found == 'line-end') {
             $piece = $stack->top;
             // A heading must be open, otherwise \n wouldn't have been in the search list
             assert($piece->open == "\n");
             $part = $piece->getCurrentPart();
             // Search back through the input to see if it has a proper close
             // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
             $wsLength = strspn($revText, " \t", strlen($text) - $i);
             $searchStart = $i - $wsLength;
             if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) {
                 // Comment found at line end
                 // Search for equals signs before the comment
                 $searchStart = $part->visualEnd;
                 $searchStart -= strspn($revText, " \t", strlen($text) - $searchStart);
             }
             $count = $piece->count;
             $equalsLength = strspn($revText, '=', strlen($text) - $searchStart);
             if ($equalsLength > 0) {
                 if ($i - $equalsLength == $piece->startPos) {
                     // This is just a single string of equals signs on its own line
                     // Replicate the doHeadings behaviour /={count}(.+)={count}/
                     // First find out how many equals signs there really are (don't stop at 6)
                     $count = $equalsLength;
                     if ($count < 3) {
                         $count = 0;
                     } else {
                         $count = min(6, intval(($count - 1) / 2));
                     }
                 } else {
                     $count = min($equalsLength, $count);
                 }
                 if ($count > 0) {
                     // Normal match, output <h>
                     $element = "<h level=\"{$count}\" i=\"{$headingIndex}\">{$accum}</h>";
                     $headingIndex++;
                 } else {
                     // Single equals sign on its own line, count=0
                     $element = $accum;
                 }
             } else {
                 // No match, no <h>, just pass down the inner text
                 $element = $accum;
             }
             // Unwind the stack
             $stack->pop();
             $accum =& $stack->getAccum();
             extract($stack->getFlags());
             // Append the result to the enclosing accumulator
             $accum .= $element;
             // Note that we do NOT increment the input pointer.
             // This is because the closing linebreak could be the opening linebreak of
             // another heading. Infinite loops are avoided because the next iteration MUST
             // hit the heading open case above, which unconditionally increments the
             // input pointer.
         } elseif ($found == 'open') {
             // Count opening brace characters
             $count = strspn($text, $curChar, $i);
             // We need to add to stack only if opening brace count is enough for one of the rules
             if ($count >= $rule['min']) {
                 // Add it to the stack
                 $piece = array('open' => $curChar, 'close' => $rule['end'], 'count' => $count, 'lineStart' => $i > 0 && $text[$i - 1] == "\n");
                 $stack->push($piece);
                 $accum =& $stack->getAccum();
                 extract($stack->getFlags());
             } else {
                 // Add literal brace(s)
                 $accum .= htmlspecialchars(str_repeat($curChar, $count));
             }
             $i += $count;
         } elseif ($found == 'close') {
             $piece = $stack->top;
             // Let's check if there are enough characters for closing brace
             $maxCount = $piece->count;
             $count = strspn($text, $curChar, $i, $maxCount);
             // Check for maximum matching characters (if there are 5 closing
             // characters, we will probably need only 3 - depending on the rules)
             $matchingCount = 0;
             $rule = $rules[$piece->open];
             if ($count > $rule['max']) {
                 // The specified maximum exists in the callback array, unless the caller
                 // has made an error
                 $matchingCount = $rule['max'];
             } else {
                 // Count is less than the maximum
                 // Skip any gaps in the callback array to find the true largest match
                 // Need to use array_key_exists not isset because the callback can be null
                 $matchingCount = $count;
                 while ($matchingCount > 0 && !array_key_exists($matchingCount, $rule['names'])) {
                     --$matchingCount;
                 }
             }
             if ($matchingCount <= 0) {
                 // No matching element found in callback array
                 // Output a literal closing brace and continue
                 $accum .= htmlspecialchars(str_repeat($curChar, $count));
                 $i += $count;
                 continue;
             }
             $name = $rule['names'][$matchingCount];
             if ($name === null) {
                 // No element, just literal text
                 $element = $piece->breakSyntax($matchingCount) . str_repeat($rule['end'], $matchingCount);
             } else {
                 // Create XML element
                 // Note: $parts is already XML, does not need to be encoded further
                 $parts = $piece->parts;
                 $title = $parts[0]->out;
                 unset($parts[0]);
                 // The invocation is at the start of the line if lineStart is set in
                 // the stack, and all opening brackets are used up.
                 if ($maxCount == $matchingCount && !empty($piece->lineStart)) {
                     $attr = ' lineStart="1"';
                 } else {
                     $attr = '';
                 }
                 $element = "<{$name}{$attr}>";
                 $element .= "<title>{$title}</title>";
                 // Unnamed arguments are numbered from 1 in order of appearance
                 $argIndex = 1;
                 foreach ($parts as $partIndex => $part) {
                     if (isset($part->eqpos)) {
                         // Named argument: split the part's output at the recorded equals position
                         $argName = substr($part->out, 0, $part->eqpos);
                         $argValue = substr($part->out, $part->eqpos + 1);
                         $element .= "<part><name>{$argName}</name>=<value>{$argValue}</value></part>";
                     } else {
                         $element .= "<part><name index=\"{$argIndex}\" /><value>{$part->out}</value></part>";
                         $argIndex++;
                     }
                 }
                 $element .= "</{$name}>";
             }
             // Advance input pointer
             $i += $matchingCount;
             // Unwind the stack
             $stack->pop();
             $accum =& $stack->getAccum();
             // Re-add the old stack element if it still has unmatched opening characters remaining
             if ($matchingCount < $piece->count) {
                 $piece->parts = array(new PPDPart());
                 $piece->count -= $matchingCount;
                 // Do we still qualify for any callback with remaining count?
                 $names = $rules[$piece->open]['names'];
                 $skippedBraces = 0;
                 // Keep a handle on the enclosing accumulator: $accum is rebound below if
                 // the piece is pushed again, but skipped braces belong outside the piece
                 $enclosingAccum =& $accum;
                 while ($piece->count) {
                     if (array_key_exists($piece->count, $names)) {
                         $stack->push($piece);
                         $accum =& $stack->getAccum();
                         break;
                     }
                     --$piece->count;
                     $skippedBraces++;
                 }
                 $enclosingAccum .= str_repeat($piece->open, $skippedBraces);
             }
             extract($stack->getFlags());
             // Add XML element to the enclosing accumulator
             $accum .= $element;
         } elseif ($found == 'pipe') {
             $findEquals = true; // shortcut for getFlags()
             $stack->addPart();
             $accum =& $stack->getAccum();
             ++$i;
         } elseif ($found == 'equals') {
             $findEquals = false; // shortcut for getFlags()
             // Remember where the separator sits in the part's output so that the
             // close handler can split name from value
             $stack->getCurrentPart()->eqpos = strlen($accum);
             $accum .= '=';
             ++$i;
         }
     }
     // Output any remaining unclosed brackets
     foreach ($stack->stack as $piece) {
         $stack->rootAccum .= $piece->breakSyntax();
     }
     $stack->rootAccum .= '</root>';
     $xml = $stack->rootAccum;
     wfProfileOut(__METHOD__ . '-makexml');
     wfProfileIn(__METHOD__ . '-loadXML');
     $dom = new DOMDocument();
     wfSuppressWarnings();
     $result = $dom->loadXML($xml);
     wfRestoreWarnings();
     if (!$result) {
         // Try running the XML through UtfNormal to get rid of invalid characters
         $xml = UtfNormal::cleanUp($xml);
         $result = $dom->loadXML($xml);
         if (!$result) {
             throw new MWException(__METHOD__ . ' generated invalid XML');
         }
     }
     $obj = new PPNode_DOM($dom->documentElement);
     wfProfileOut(__METHOD__ . '-loadXML');
     wfProfileOut(__METHOD__);
     return $obj;
 }
Example #2
0
 /**
  * Report the parser flags in effect for the current stack state.
  *
  * While the stack holds at least one element the call is forwarded to the
  * top element. With an empty stack, every flag ('findEquals', 'findPipe',
  * 'inHeading') is reported as false.
  *
  * @return array
  */
 public function getFlags()
 {
     $depth = count($this->stack);
     if ($depth > 0) {
         // Something is open: the top element decides
         return $this->top->getFlags();
     }

     // Nothing is open: no flag applies
     return array('findEquals' => false, 'findPipe' => false, 'inHeading' => false);
 }
Example #3
0
 /**
  * Set up the stack with hash-flavoured element and accumulator classes.
  *
  * The statement order is deliberate: elementClass is assigned before the
  * parent constructor runs and rootAccum is replaced after it.
  * NOTE(review): presumably the parent constructor initialises rootAccum
  * itself, which is why it is overwritten afterwards; confirm against the
  * parent class.
  */
 public function __construct()
 {
     // Name of the class used for stack elements
     $this->elementClass = 'PPDStackElement_Hash';
     parent::__construct();
     // Root accumulator is a PPDAccum_Hash object, assigned after the parent constructor has run
     $this->rootAccum = new PPDAccum_Hash();
 }