/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * The text is scanned once, left to right. Literal runs are XML-escaped and
 * appended to the current accumulator; templates/template arguments, headings,
 * comments and extension-style tags are turned into XML elements. The resulting
 * XML string is then loaded into a DOMDocument.
 *
 * @param string $text The text to parse
 * @param integer $flags Bitwise combination of:
 *          Parser::PTD_FOR_INCLUSION    Handle <noinclude>/<includeonly> as if the text is being
 *                                       included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * The output of this function is currently only cached in process memory, but a persistent
 * cache may be implemented at a later date which takes further advantage of these strict
 * dependency requirements.
 *
 * @return PPNode_DOM Node wrapping the document element of the generated tree
 * @throws MWException If the generated XML cannot be loaded, even after UtfNormal::cleanUp()
 *
 * @private
 */
function preprocessToObj($text, $flags = 0) {
    wfProfileIn(__METHOD__);
    wfProfileIn(__METHOD__ . '-makexml');

    // Bracket rules: for each opening character, the closing character, the
    // element name produced for a given bracket count (null = literal text),
    // and the minimum/maximum number of brackets that form a construct.
    $rules = array(
        '{' => array(
            'end' => '}',
            'names' => array(
                2 => 'template',
                3 => 'tplarg',
            ),
            'min' => 2,
            'max' => 3,
        ),
        '[' => array(
            'end' => ']',
            'names' => array(2 => null),
            'min' => 2,
            'max' => 2,
        ),
    );

    $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;

    $xmlishElements = $this->parser->getStripList();
    $enableOnlyinclude = false;
    if ($forInclusion) {
        $ignoredTags = array('includeonly', '/includeonly');
        $ignoredElements = array('noinclude');
        $xmlishElements[] = 'noinclude';
        if (strpos($text, '<onlyinclude>') !== false && strpos($text, '</onlyinclude>') !== false) {
            $enableOnlyinclude = true;
        }
    } else {
        $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude');
        $ignoredElements = array('includeonly');
        $xmlishElements[] = 'includeonly';
    }
    $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags));

    // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
    $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA";

    $stack = new PPDStack();

    $searchBase = "[{<\n"; #}
    $revText = strrev($text); // For fast reverse searches

    $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start
    $accum =& $stack->getAccum(); # Current accumulator
    $accum = '<root>';
    // NOTE: $findEquals, $findPipe and $inHeading are overwritten by every
    // extract($stack->getFlags()) call below.
    $findEquals = false; # True to find equals signs in arguments
    $findPipe = false; # True to take notice of pipe characters
    $headingIndex = 1;
    $inHeading = false; # True if $i is inside a possible heading
    $noMoreGT = false; # True if there are no more greater-than (>) signs right of $i
    $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next <onlyinclude>
    $fakeLineStart = true; # Do a line-start run without outputting an LF character

    while (true) {
        //$this->memCheck();

        if ($findOnlyinclude) {
            // Ignore all input up to the next <onlyinclude>
            $startPos = strpos($text, '<onlyinclude>', $i);
            if ($startPos === false) {
                // Ignored section runs to the end
                $accum .= '<ignore>' . htmlspecialchars(substr($text, $i)) . '</ignore>';
                break;
            }
            $tagEndPos = $startPos + strlen('<onlyinclude>'); // past-the-end
            $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i)) . '</ignore>';
            $i = $tagEndPos;
            $findOnlyinclude = false;
        }

        if ($fakeLineStart) {
            $found = 'line-start';
            $curChar = '';
        } else {
            # Find next opening brace, closing brace or pipe
            $search = $searchBase;
            if ($stack->top === false) {
                $currentClosing = '';
            } else {
                $currentClosing = $stack->top->close;
                $search .= $currentClosing;
            }
            if ($findPipe) {
                $search .= '|';
            }
            if ($findEquals) {
                // First equals will be for the template
                $search .= '=';
            }
            $rule = null;
            # Output literal section, advance input counter
            $literalLength = strcspn($text, $search, $i);
            if ($literalLength > 0) {
                $accum .= htmlspecialchars(substr($text, $i, $literalLength));
                $i += $literalLength;
            }
            if ($i >= strlen($text)) {
                if ($currentClosing == "\n") {
                    // Do a past-the-end run to finish off the heading
                    $curChar = '';
                    $found = 'line-end';
                } else {
                    # All done
                    break;
                }
            } else {
                $curChar = $text[$i];
                if ($curChar == '|') {
                    $found = 'pipe';
                } elseif ($curChar == '=') {
                    $found = 'equals';
                } elseif ($curChar == '<') {
                    $found = 'angle';
                } elseif ($curChar == "\n") {
                    if ($inHeading) {
                        $found = 'line-end';
                    } else {
                        $found = 'line-start';
                    }
                } elseif ($curChar == $currentClosing) {
                    $found = 'close';
                } elseif (isset($rules[$curChar])) {
                    $found = 'open';
                    $rule = $rules[$curChar];
                } else {
                    # Some versions of PHP have a strcspn which stops on null characters
                    # Ignore and continue
                    ++$i;
                    continue;
                }
            }
        }

        if ($found == 'angle') {
            $matches = false;
            // Handle </onlyinclude>
            if ($enableOnlyinclude && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>') {
                $findOnlyinclude = true;
                continue;
            }

            // Determine element name
            if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) {
                // Element name missing or not listed
                $accum .= '&lt;';
                ++$i;
                continue;
            }
            // Handle comments
            if (isset($matches[2]) && $matches[2] == '!--') {
                // To avoid leaving blank lines, when a comment is both preceded
                // and followed by a newline (ignoring spaces), trim leading and
                // trailing spaces and one of the newlines.

                // Find the end
                $endPos = strpos($text, '-->', $i + 4);
                if ($endPos === false) {
                    // Unclosed comment in input, runs to end
                    $inner = substr($text, $i);
                    $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                    $i = strlen($text);
                } else {
                    // Search backwards for leading whitespace
                    $wsStart = $i ? $i - strspn($revText, ' ', strlen($text) - $i) : 0;
                    // Search forwards for trailing whitespace
                    // $wsEnd will be the position of the last space
                    $wsEnd = $endPos + 2 + strspn($text, ' ', $endPos + 3);
                    // Eat the line if possible
                    // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                    // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                    // it's a possible beneficial b/c break.
                    if ($wsStart > 0 && substr($text, $wsStart - 1, 1) == "\n"
                        && substr($text, $wsEnd + 1, 1) == "\n"
                    ) {
                        $startPos = $wsStart;
                        $endPos = $wsEnd + 1;
                        // Remove leading whitespace from the end of the accumulator
                        // Sanity check first though
                        $wsLength = $i - $wsStart;
                        if ($wsLength > 0 && substr($accum, -$wsLength) === str_repeat(' ', $wsLength)) {
                            $accum = substr($accum, 0, -$wsLength);
                        }
                        // Do a line-start run next time to look for headings after the comment
                        $fakeLineStart = true;
                    } else {
                        // No line to eat, just take the comment itself
                        $startPos = $i;
                        $endPos += 2;
                    }

                    if ($stack->top) {
                        // Record where the comment sits so the line-end handler can
                        // look for closing equals signs before it.
                        $part = $stack->top->getCurrentPart();
                        if (isset($part->commentEnd) && $part->commentEnd == $wsStart - 1) {
                            // Comments abutting, no change in visual end
                            $part->commentEnd = $wsEnd;
                        } else {
                            $part->visualEnd = $wsStart;
                            $part->commentEnd = $endPos;
                        }
                    }
                    $i = $endPos + 1;
                    $inner = substr($text, $startPos, $endPos - $startPos + 1);
                    $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                }
                continue;
            }
            $name = $matches[1];
            $lowerName = strtolower($name);
            $attrStart = $i + strlen($name) + 1;

            // Find end of tag
            $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart);
            if ($tagEndPos === false) {
                // Infinite backtrack
                // Disable tag search to prevent worst-case O(N^2) performance
                $noMoreGT = true;
                $accum .= '&lt;';
                ++$i;
                continue;
            }

            // Handle ignored tags
            if (in_array($lowerName, $ignoredTags)) {
                $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i + 1)) . '</ignore>';
                $i = $tagEndPos + 1;
                continue;
            }

            $tagStartPos = $i;
            if ($text[$tagEndPos - 1] == '/') {
                // Self-closing tag: no inner text and no close element
                $attrEnd = $tagEndPos - 1;
                $inner = null;
                $i = $tagEndPos + 1;
                $close = '';
            } else {
                $attrEnd = $tagEndPos;
                // Find closing tag. The tag name is quoted because it is matched
                // text built from the parser's strip list, which may contain
                // regex metacharacters or the pattern delimiter.
                if (preg_match("/<\\/" . preg_quote($name, '/') . "\\s*>/i",
                    $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)
                ) {
                    $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1);
                    $i = $matches[0][1] + strlen($matches[0][0]);
                    $close = '<close>' . htmlspecialchars($matches[0][0]) . '</close>';
                } else {
                    // No end tag -- let it run out to the end of the text.
                    $inner = substr($text, $tagEndPos + 1);
                    $i = strlen($text);
                    $close = '';
                }
            }
            // <includeonly> and <noinclude> just become <ignore> tags
            if (in_array($lowerName, $ignoredElements)) {
                $accum .= '<ignore>' . htmlspecialchars(substr($text, $tagStartPos, $i - $tagStartPos))
                    . '</ignore>';
                continue;
            }

            $accum .= '<ext>';
            if ($attrEnd <= $attrStart) {
                $attr = '';
            } else {
                $attr = substr($text, $attrStart, $attrEnd - $attrStart);
            }
            $accum .= '<name>' . htmlspecialchars($name) . '</name>' .
                '<attr>' . htmlspecialchars($attr) . '</attr>';
            if ($inner !== null) {
                $accum .= '<inner>' . htmlspecialchars($inner) . '</inner>';
            }
            $accum .= $close . '</ext>';
        } elseif ($found == 'line-start') {
            // Is this the start of a heading?
            // Line break belongs before the heading element in any case
            if ($fakeLineStart) {
                $fakeLineStart = false;
            } else {
                $accum .= $curChar;
                $i++;
            }

            $count = strspn($text, '=', $i, 6);
            if ($count == 1 && $findEquals) {
                // DWIM: This looks kind of like a name/value separator
                // Let's let the equals handler have it and break the potential heading
                // This is heuristic, but AFAICT the methods for completely correct
                // disambiguation are very complex.
            } elseif ($count > 0) {
                $piece = array(
                    'open' => "\n",
                    'close' => "\n",
                    'parts' => array(new PPDPart(str_repeat('=', $count))),
                    'startPos' => $i,
                    'count' => $count,
                );
                $stack->push($piece);
                $accum =& $stack->getAccum();
                extract($stack->getFlags());
                $i += $count;
            }
        } elseif ($found == 'line-end') {
            $piece = $stack->top;
            // A heading must be open, otherwise \n wouldn't have been in the search list
            assert($piece->open == "\n");
            $part = $piece->getCurrentPart();
            // Search back through the input to see if it has a proper close
            // Do this using the reversed string since the other solutions
            // (end anchor, etc.) are inefficient
            $wsLength = strspn($revText, " \t", strlen($text) - $i);
            $searchStart = $i - $wsLength;
            if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) {
                // Comment found at line end
                // Search for equals signs before the comment
                $searchStart = $part->visualEnd;
                $searchStart -= strspn($revText, " \t", strlen($text) - $searchStart);
            }
            $count = $piece->count;
            $equalsLength = strspn($revText, '=', strlen($text) - $searchStart);
            if ($equalsLength > 0) {
                // The equals run was measured backwards from $searchStart, so the
                // "whole line is one run of equals signs" test must also be made
                // relative to $searchStart. Using $i here would miss lines that
                // have trailing whitespace or a trailing comment after the run.
                if ($searchStart - $equalsLength == $piece->startPos) {
                    // This is just a single string of equals signs on its own line
                    // Replicate the doHeadings behaviour /={count}(.+)={count}/
                    // First find out how many equals signs there really are (don't stop at 6)
                    $count = $equalsLength;
                    if ($count < 3) {
                        $count = 0;
                    } else {
                        $count = min(6, intval(($count - 1) / 2));
                    }
                } else {
                    $count = min($equalsLength, $count);
                }
                if ($count > 0) {
                    // Normal match, output <h>
                    $element = "<h level=\"{$count}\" i=\"{$headingIndex}\">{$accum}</h>";
                    $headingIndex++;
                } else {
                    // Single equals sign on its own line, count=0
                    $element = $accum;
                }
            } else {
                // No match, no <h>, just pass down the inner text
                $element = $accum;
            }
            // Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();
            extract($stack->getFlags());

            // Append the result to the enclosing accumulator
            $accum .= $element;
            // Note that we do NOT increment the input pointer.
            // This is because the closing linebreak could be the opening linebreak of
            // another heading. Infinite loops are avoided because the next iteration MUST
            // hit the heading open case above, which unconditionally increments the
            // input pointer.
        } elseif ($found == 'open') {
            # count opening brace characters
            $count = strspn($text, $curChar, $i);

            # we need to add to stack only if opening brace count is enough for one of the rules
            if ($count >= $rule['min']) {
                # Add it to the stack
                $piece = array(
                    'open' => $curChar,
                    'close' => $rule['end'],
                    'count' => $count,
                    'lineStart' => $i > 0 && $text[$i - 1] == "\n",
                );

                $stack->push($piece);
                $accum =& $stack->getAccum();
                extract($stack->getFlags());
            } else {
                # Add literal brace(s)
                $accum .= htmlspecialchars(str_repeat($curChar, $count));
            }
            $i += $count;
        } elseif ($found == 'close') {
            $piece = $stack->top;
            # lets check if there are enough characters for closing brace
            $maxCount = $piece->count;
            $count = strspn($text, $curChar, $i, $maxCount);

            # check for maximum matching characters (if there are 5 closing
            # characters, we will probably need only 3 - depending on the rules)
            $matchingCount = 0;
            $rule = $rules[$piece->open];
            if ($count > $rule['max']) {
                # The specified maximum exists in the callback array, unless the caller
                # has made an error
                $matchingCount = $rule['max'];
            } else {
                # Count is less than the maximum
                # Skip any gaps in the callback array to find the true largest match
                # Need to use array_key_exists not isset because the callback can be null
                $matchingCount = $count;
                while ($matchingCount > 0 && !array_key_exists($matchingCount, $rule['names'])) {
                    --$matchingCount;
                }
            }

            if ($matchingCount <= 0) {
                # No matching element found in callback array
                # Output a literal closing brace and continue
                $accum .= htmlspecialchars(str_repeat($curChar, $count));
                $i += $count;
                continue;
            }
            $name = $rule['names'][$matchingCount];
            if ($name === null) {
                // No element, just literal text
                $element = $piece->breakSyntax($matchingCount) . str_repeat($rule['end'], $matchingCount);
            } else {
                # Create XML element
                # Note: $parts is already XML, does not need to be encoded further
                $parts = $piece->parts;
                $title = $parts[0]->out;
                unset($parts[0]);

                # The invocation is at the start of the line if lineStart is set in
                # the stack, and all opening brackets are used up.
                if ($maxCount == $matchingCount && !empty($piece->lineStart)) {
                    $attr = ' lineStart="1"';
                } else {
                    $attr = '';
                }

                $element = "<{$name}{$attr}>";
                $element .= "<title>{$title}</title>";
                $argIndex = 1;
                foreach ($parts as $partIndex => $part) {
                    if (isset($part->eqpos)) {
                        // Named argument: split the part at the recorded equals position
                        $argName = substr($part->out, 0, $part->eqpos);
                        $argValue = substr($part->out, $part->eqpos + 1);
                        $element .= "<part><name>{$argName}</name>=<value>{$argValue}</value></part>";
                    } else {
                        // Positional argument: numbered in order of appearance
                        $element .= "<part><name index=\"{$argIndex}\" /><value>{$part->out}</value></part>";
                        $argIndex++;
                    }
                }
                $element .= "</{$name}>";
            }

            # Advance input pointer
            $i += $matchingCount;

            # Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();

            # Re-add the old stack element if it still has unmatched opening characters remaining
            if ($matchingCount < $piece->count) {
                $piece->parts = array(new PPDPart());
                $piece->count -= $matchingCount;
                # do we still qualify for any callback with remaining count?
                $names = $rules[$piece->open]['names'];
                $skippedBraces = 0;
                $enclosingAccum =& $accum;
                while ($piece->count) {
                    if (array_key_exists($piece->count, $names)) {
                        $stack->push($piece);
                        $accum =& $stack->getAccum();
                        break;
                    }
                    --$piece->count;
                    $skippedBraces++;
                }
                // Opening characters that cannot start any construct become literal text
                $enclosingAccum .= str_repeat($piece->open, $skippedBraces);
            }
            extract($stack->getFlags());

            # Add XML element to the enclosing accumulator
            $accum .= $element;
        } elseif ($found == 'pipe') {
            $findEquals = true; // shortcut for getFlags()
            $stack->addPart();
            $accum =& $stack->getAccum();
            ++$i;
        } elseif ($found == 'equals') {
            $findEquals = false; // shortcut for getFlags()
            $stack->getCurrentPart()->eqpos = strlen($accum);
            $accum .= '=';
            ++$i;
        }
    }

    # Output any remaining unclosed brackets
    foreach ($stack->stack as $piece) {
        $stack->rootAccum .= $piece->breakSyntax();
    }
    $stack->rootAccum .= '</root>';
    $xml = $stack->rootAccum;

    wfProfileOut(__METHOD__ . '-makexml');
    wfProfileIn(__METHOD__ . '-loadXML');
    $dom = new DOMDocument();
    wfSuppressWarnings();
    $result = $dom->loadXML($xml);
    wfRestoreWarnings();
    if (!$result) {
        // Try running the XML through UtfNormal to get rid of invalid characters
        $xml = UtfNormal::cleanUp($xml);
        $result = $dom->loadXML($xml);
        if (!$result) {
            throw new MWException(__METHOD__ . ' generated invalid XML');
        }
    }
    $obj = new PPNode_DOM($dom->documentElement);
    wfProfileOut(__METHOD__ . '-loadXML');
    wfProfileOut(__METHOD__);
    return $obj;
}
/**
 * Get the search flags that apply at the current stack position.
 *
 * When the stack holds at least one element, the flags are delegated to the
 * top element. When it is empty, every flag is false.
 *
 * @return array Map with the keys 'findEquals', 'findPipe' and 'inHeading'
 *   in the empty-stack case; otherwise whatever the top element's getFlags()
 *   returns.
 */
public function getFlags() {
    if (count($this->stack)) {
        return $this->top->getFlags();
    }

    // Empty stack: nothing is open, so no special characters are searched for.
    return array(
        'findEquals' => false,
        'findPipe' => false,
        'inHeading' => false,
    );
}
/**
 * Set up the stack: records 'PPDStackElement_Hash' as the element class name,
 * runs the parent constructor, and then installs a new PPDAccum_Hash as the
 * root accumulator.
 */
public function __construct() {
    $this->elementClass = 'PPDStackElement_Hash';
    parent::__construct();
    // NOTE(review): rootAccum is assigned after the parent constructor,
    // presumably so the parent's own initialization cannot overwrite it --
    // confirm against the parent class before reordering.
    $this->rootAccum = new PPDAccum_Hash();
}