/**
 * Preprocess wikitext into an XML string wrapped in a <root> element.
 *
 * The text is scanned once, left to right. Literal runs are HTML-escaped and
 * appended to the current accumulator; the following constructs are turned
 * into elements:
 *  - <ignore>  for ignored include-control tags/elements and for text skipped
 *              outside <onlyinclude> sections
 *  - <comment> for HTML comments (optionally swallowing the surrounding line)
 *  - <ext>     for extension-style tags listed by $this->parser->getStripList()
 *  - <h>       for headings (lines delimited by equals signs)
 *  - elements named by $this->rules[<open char>]['names'] for bracket constructs
 *
 * @param string $text Wikitext to preprocess
 * @param int $flags Bitfield; Parser::PTD_FOR_INCLUSION selects inclusion mode,
 *   which changes which of <noinclude>/<includeonly>/<onlyinclude> are ignored
 * @return string XML text, always beginning with "<root>" and ending with "</root>"
 */
public function preprocessToXml($text, $flags = 0) {
    $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;

    $xmlishElements = $this->parser->getStripList();
    $enableOnlyinclude = false;
    if ($forInclusion) {
        $ignoredTags = array('includeonly', '/includeonly');
        $ignoredElements = array('noinclude');
        $xmlishElements[] = 'noinclude';
        // <onlyinclude> handling is only switched on when both the opening and
        // the closing tag occur somewhere in the text
        if (strpos($text, '<onlyinclude>') !== false
            && strpos($text, '</onlyinclude>') !== false
        ) {
            $enableOnlyinclude = true;
        }
    } else {
        $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude');
        $ignoredElements = array('includeonly');
        $xmlishElements[] = 'includeonly';
    }
    $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags));

    // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
    $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA";

    $stack = new PPDStack();

    $searchBase = "[{<\n"; # }
    // For fast reverse searches
    $revText = strrev($text);
    $lengthText = strlen($text);

    // Input pointer, starts out pointing to a pseudo-newline before the start
    $i = 0;
    // Current accumulator (a reference into the stack; rebound after every push/pop)
    $accum =& $stack->getAccum();
    $accum = '<root>';
    // True to find equals signs in arguments
    $findEquals = false;
    // True to take notice of pipe characters
    $findPipe = false;
    $headingIndex = 1;
    // True if $i is inside a possible heading
    $inHeading = false;
    // True if there are no more greater-than (>) signs right of $i
    $noMoreGT = false;
    // True to ignore all input up to the next <onlyinclude>
    $findOnlyinclude = $enableOnlyinclude;
    // Do a line-start run without outputting an LF character
    $fakeLineStart = true;

    while (true) {
        // $this->memCheck();

        if ($findOnlyinclude) {
            // Ignore all input up to the next <onlyinclude>
            $startPos = strpos($text, '<onlyinclude>', $i);
            if ($startPos === false) {
                // Ignored section runs to the end
                $accum .= '<ignore>' . htmlspecialchars(substr($text, $i)) . '</ignore>';
                break;
            }
            $tagEndPos = $startPos + strlen('<onlyinclude>'); // past-the-end
            $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i)) . '</ignore>';
            $i = $tagEndPos;
            $findOnlyinclude = false;
        }

        if ($fakeLineStart) {
            $found = 'line-start';
            $curChar = '';
        } else {
            # Find next opening brace, closing brace or pipe
            $search = $searchBase;
            if ($stack->top === false) {
                $currentClosing = '';
            } else {
                $currentClosing = $stack->top->close;
                $search .= $currentClosing;
            }
            if ($findPipe) {
                $search .= '|';
            }
            if ($findEquals) {
                // First equals will be for the template
                $search .= '=';
            }
            $rule = null;
            # Output literal section, advance input counter
            $literalLength = strcspn($text, $search, $i);
            if ($literalLength > 0) {
                $accum .= htmlspecialchars(substr($text, $i, $literalLength));
                $i += $literalLength;
            }
            if ($i >= $lengthText) {
                if ($currentClosing == "\n") {
                    // Do a past-the-end run to finish off the heading
                    $curChar = '';
                    $found = 'line-end';
                } else {
                    # All done
                    break;
                }
            } else {
                $curChar = $text[$i];
                if ($curChar == '|') {
                    $found = 'pipe';
                } elseif ($curChar == '=') {
                    $found = 'equals';
                } elseif ($curChar == '<') {
                    $found = 'angle';
                } elseif ($curChar == "\n") {
                    if ($inHeading) {
                        $found = 'line-end';
                    } else {
                        $found = 'line-start';
                    }
                } elseif ($curChar == $currentClosing) {
                    $found = 'close';
                } elseif (isset($this->rules[$curChar])) {
                    $found = 'open';
                    $rule = $this->rules[$curChar];
                } else {
                    # Some versions of PHP have a strcspn which stops on null characters
                    # Ignore and continue
                    ++$i;
                    continue;
                }
            }
        }

        if ($found == 'angle') {
            $matches = false;
            // Handle </onlyinclude>
            if ($enableOnlyinclude
                && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>'
            ) {
                // $i is deliberately not advanced: the next iteration emits the
                // closing tag as part of the following <ignore> section
                $findOnlyinclude = true;
                continue;
            }

            // Determine element name
            if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) {
                // Element name missing or not listed
                $accum .= '&lt;';
                ++$i;
                continue;
            }
            // Handle comments
            if (isset($matches[2]) && $matches[2] == '!--') {
                // To avoid leaving blank lines, when a sequence of
                // space-separated comments is both preceded and followed by
                // a newline (ignoring spaces), then
                // trim leading and trailing spaces and the trailing newline.

                // Find the end
                $endPos = strpos($text, '-->', $i + 4);
                if ($endPos === false) {
                    // Unclosed comment in input, runs to end
                    $inner = substr($text, $i);
                    $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                    $i = $lengthText;
                } else {
                    // Search backwards for leading whitespace
                    $wsStart = $i ? $i - strspn($revText, " \t", $lengthText - $i) : 0;

                    // Search forwards for trailing whitespace
                    // $wsEnd will be the position of the last space (or the '>' if there's none)
                    $wsEnd = $endPos + 2 + strspn($text, " \t", $endPos + 3);

                    // Keep looking forward as long as we're finding more
                    // comments.
                    $comments = array(array($wsStart, $wsEnd));
                    while (substr($text, $wsEnd + 1, 4) == '<!--') {
                        $c = strpos($text, '-->', $wsEnd + 4);
                        if ($c === false) {
                            break;
                        }
                        $c = $c + 2 + strspn($text, " \t", $c + 3);
                        $comments[] = array($wsEnd + 1, $c);
                        $wsEnd = $c;
                    }

                    // Eat the line if possible
                    // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                    // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                    // it's a possible beneficial b/c break.
                    if ($wsStart > 0
                        && substr($text, $wsStart - 1, 1) == "\n"
                        && substr($text, $wsEnd + 1, 1) == "\n"
                    ) {
                        // Remove leading whitespace from the end of the accumulator
                        // Sanity check first though
                        $wsLength = $i - $wsStart;
                        if ($wsLength > 0
                            && strspn($accum, " \t", -$wsLength) === $wsLength
                        ) {
                            $accum = substr($accum, 0, -$wsLength);
                        }

                        // Dump all but the last comment to the accumulator
                        foreach ($comments as $j => $com) {
                            $startPos = $com[0];
                            $endPos = $com[1] + 1;
                            if ($j == count($comments) - 1) {
                                // The last comment is emitted below, after the
                                // stack bookkeeping, using $startPos/$endPos
                                break;
                            }
                            $inner = substr($text, $startPos, $endPos - $startPos);
                            $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                        }

                        // Do a line-start run next time to look for headings after the comment
                        $fakeLineStart = true;
                    } else {
                        // No line to eat, just take the comment itself
                        $startPos = $i;
                        $endPos += 2;
                    }

                    if ($stack->top) {
                        $part = $stack->top->getCurrentPart();
                        if (!(isset($part->commentEnd) && $part->commentEnd == $wsStart - 1)) {
                            $part->visualEnd = $wsStart;
                        }
                        // Else comments abutting, no change in visual end
                        $part->commentEnd = $endPos;
                    }
                    $i = $endPos + 1;
                    $inner = substr($text, $startPos, $endPos - $startPos + 1);
                    $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                }
                continue;
            }
            $name = $matches[1];
            $lowerName = strtolower($name);
            $attrStart = $i + strlen($name) + 1;

            // Find end of tag
            $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart);
            if ($tagEndPos === false) {
                // Infinite backtrack
                // Disable tag search to prevent worst-case O(N^2) performance
                $noMoreGT = true;
                $accum .= '&lt;';
                ++$i;
                continue;
            }

            // Handle ignored tags
            if (in_array($lowerName, $ignoredTags)) {
                $accum .= '<ignore>'
                    . htmlspecialchars(substr($text, $i, $tagEndPos - $i + 1))
                    . '</ignore>';
                $i = $tagEndPos + 1;
                continue;
            }

            $tagStartPos = $i;
            if ($text[$tagEndPos - 1] == '/') {
                // Self-closing tag: no inner text and no close element
                $attrEnd = $tagEndPos - 1;
                $inner = null;
                $i = $tagEndPos + 1;
                $close = '';
            } else {
                $attrEnd = $tagEndPos;
                // Find closing tag
                if (preg_match("/<\\/" . preg_quote($name, '/') . "\\s*>/i",
                    $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)
                ) {
                    $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1);
                    $i = $matches[0][1] + strlen($matches[0][0]);
                    $close = '<close>' . htmlspecialchars($matches[0][0]) . '</close>';
                } else {
                    // No end tag -- let it run out to the end of the text.
                    $inner = substr($text, $tagEndPos + 1);
                    $i = $lengthText;
                    $close = '';
                }
            }
            // <includeonly> and <noinclude> just become <ignore> tags
            if (in_array($lowerName, $ignoredElements)) {
                $accum .= '<ignore>'
                    . htmlspecialchars(substr($text, $tagStartPos, $i - $tagStartPos))
                    . '</ignore>';
                continue;
            }

            $accum .= '<ext>';
            if ($attrEnd <= $attrStart) {
                $attr = '';
            } else {
                $attr = substr($text, $attrStart, $attrEnd - $attrStart);
            }
            $accum .= '<name>' . htmlspecialchars($name) . '</name>'
                . '<attr>' . htmlspecialchars($attr) . '</attr>';
            if ($inner !== null) {
                $accum .= '<inner>' . htmlspecialchars($inner) . '</inner>';
            }
            $accum .= $close . '</ext>';
        } elseif ($found == 'line-start') {
            // Is this the start of a heading?
            // Line break belongs before the heading element in any case
            if ($fakeLineStart) {
                $fakeLineStart = false;
            } else {
                $accum .= $curChar;
                $i++;
            }

            $count = strspn($text, '=', $i, 6);
            if ($count == 1 && $findEquals) {
                // DWIM: This looks kind of like a name/value separator.
                // Let's let the equals handler have it and break the
                // potential heading. This is heuristic, but AFAICT the
                // methods for completely correct disambiguation are very
                // complex.
            } elseif ($count > 0) {
                $piece = array(
                    'open' => "\n",
                    'close' => "\n",
                    'parts' => array(new PPDPart(str_repeat('=', $count))),
                    'startPos' => $i,
                    'count' => $count);
                $stack->push($piece);
                $accum =& $stack->getAccum();
                // Refresh the local search flags from the new stack top
                $flags = $stack->getFlags();
                extract($flags);
                $i += $count;
            }
        } elseif ($found == 'line-end') {
            $piece = $stack->top;
            // A heading must be open, otherwise \n wouldn't have been in the search list.
            // This is a real expression rather than a string: string assertions
            // are not evaluated by PHP 8, so a string here would never fail.
            assert($piece->open === "\n");
            $part = $piece->getCurrentPart();
            // Search back through the input to see if it has a proper close.
            // Do this using the reversed string since the other solutions
            // (end anchor, etc.) are inefficient.
            $wsLength = strspn($revText, " \t", $lengthText - $i);
            $searchStart = $i - $wsLength;
            if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) {
                // Comment found at line end
                // Search for equals signs before the comment
                $searchStart = $part->visualEnd;
                $searchStart -= strspn($revText, " \t", $lengthText - $searchStart);
            }
            $count = $piece->count;
            $equalsLength = strspn($revText, '=', $lengthText - $searchStart);
            if ($equalsLength > 0) {
                if ($searchStart - $equalsLength == $piece->startPos) {
                    // This is just a single string of equals signs on its own line
                    // Replicate the doHeadings behavior /={count}(.+)={count}/
                    // First find out how many equals signs there really are (don't stop at 6)
                    $count = $equalsLength;
                    if ($count < 3) {
                        $count = 0;
                    } else {
                        $count = min(6, intval(($count - 1) / 2));
                    }
                } else {
                    $count = min($equalsLength, $count);
                }
                if ($count > 0) {
                    // Normal match, output <h>
                    $element = "<h level=\"{$count}\" i=\"{$headingIndex}\">{$accum}</h>";
                    $headingIndex++;
                } else {
                    // Single equals sign on its own line, count=0
                    $element = $accum;
                }
            } else {
                // No match, no <h>, just pass down the inner text
                $element = $accum;
            }
            // Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();
            $flags = $stack->getFlags();
            extract($flags);

            // Append the result to the enclosing accumulator
            $accum .= $element;
            // Note that we do NOT increment the input pointer.
            // This is because the closing linebreak could be the opening linebreak of
            // another heading. Infinite loops are avoided because the next iteration MUST
            // hit the heading open case above, which unconditionally increments the
            // input pointer.
        } elseif ($found == 'open') {
            # count opening brace characters
            $count = strspn($text, $curChar, $i);

            # we need to add to stack only if opening brace count is enough for one of the rules
            if ($count >= $rule['min']) {
                # Add it to the stack
                $piece = array(
                    'open' => $curChar,
                    'close' => $rule['end'],
                    'count' => $count,
                    'lineStart' => $i > 0 && $text[$i - 1] == "\n");

                $stack->push($piece);
                $accum =& $stack->getAccum();
                $flags = $stack->getFlags();
                extract($flags);
            } else {
                # Add literal brace(s)
                $accum .= htmlspecialchars(str_repeat($curChar, $count));
            }
            $i += $count;
        } elseif ($found == 'close') {
            $piece = $stack->top;
            # lets check if there are enough characters for closing brace
            $maxCount = $piece->count;
            $count = strspn($text, $curChar, $i, $maxCount);

            # check for maximum matching characters (if there are 5 closing
            # characters, we will probably need only 3 - depending on the rules)
            $rule = $this->rules[$piece->open];
            if ($count > $rule['max']) {
                # The specified maximum exists in the callback array, unless the caller
                # has made an error
                $matchingCount = $rule['max'];
            } else {
                # Count is less than the maximum
                # Skip any gaps in the callback array to find the true largest match
                # Need to use array_key_exists not isset because the callback can be null
                $matchingCount = $count;
                while ($matchingCount > 0
                    && !array_key_exists($matchingCount, $rule['names'])
                ) {
                    --$matchingCount;
                }
            }

            if ($matchingCount <= 0) {
                # No matching element found in callback array
                # Output a literal closing brace and continue
                $accum .= htmlspecialchars(str_repeat($curChar, $count));
                $i += $count;
                continue;
            }
            $name = $rule['names'][$matchingCount];
            if ($name === null) {
                // No element, just literal text
                $element = $piece->breakSyntax($matchingCount)
                    . str_repeat($rule['end'], $matchingCount);
            } else {
                # Create XML element
                # Note: $parts is already XML, does not need to be encoded further
                $parts = $piece->parts;
                $title = $parts[0]->out;
                unset($parts[0]);

                # The invocation is at the start of the line if lineStart is set in
                # the stack, and all opening brackets are used up.
                if ($maxCount == $matchingCount && !empty($piece->lineStart)) {
                    $attr = ' lineStart="1"';
                } else {
                    $attr = '';
                }

                $element = "<{$name}{$attr}>";
                $element .= "<title>{$title}</title>";
                $argIndex = 1;
                foreach ($parts as $part) {
                    if (isset($part->eqpos)) {
                        // Named argument: split the accumulated output at the recorded '='
                        $argName = substr($part->out, 0, $part->eqpos);
                        $argValue = substr($part->out, $part->eqpos + 1);
                        $element .= "<part><name>{$argName}</name>=<value>{$argValue}</value></part>";
                    } else {
                        // Positional argument: numbered in order of appearance
                        $element .= "<part><name index=\"{$argIndex}\" /><value>{$part->out}</value></part>";
                        $argIndex++;
                    }
                }
                $element .= "</{$name}>";
            }

            # Advance input pointer
            $i += $matchingCount;

            # Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();

            # Re-add the old stack element if it still has unmatched opening characters remaining
            if ($matchingCount < $piece->count) {
                $piece->parts = array(new PPDPart());
                $piece->count -= $matchingCount;
                # do we still qualify for any callback with remaining count?
                $min = $this->rules[$piece->open]['min'];
                if ($piece->count >= $min) {
                    $stack->push($piece);
                    $accum =& $stack->getAccum();
                } else {
                    $accum .= str_repeat($piece->open, $piece->count);
                }
            }
            $flags = $stack->getFlags();
            extract($flags);

            # Add XML element to the enclosing accumulator
            $accum .= $element;
        } elseif ($found == 'pipe') {
            $findEquals = true; // shortcut for getFlags()
            $stack->addPart();
            $accum =& $stack->getAccum();
            ++$i;
        } elseif ($found == 'equals') {
            $findEquals = false; // shortcut for getFlags()
            $stack->getCurrentPart()->eqpos = strlen($accum);
            $accum .= '=';
            ++$i;
        }
    }

    # Output any remaining unclosed brackets
    foreach ($stack->stack as $piece) {
        $stack->rootAccum .= $piece->breakSyntax();
    }
    $stack->rootAccum .= '</root>';
    $xml = $stack->rootAccum;

    return $xml;
}
/**
 * Adaptor that lets a parser function invoke an extension tag.
 *
 * The first argument is the tag name, the optional second argument is the
 * tag's inner content, and any remaining named arguments become attributes.
 * Tags that are not in the parser's strip list are written back out as an
 * ordinary tag instead of being dispatched.
 *
 * @param Parser $parser
 * @param PPFrame $frame
 * @param PPNode[] $args
 * @return string
 */
public static function tagObj($parser, $frame, $args) {
    if (count($args) === 0) {
        return '';
    }

    $tagName = strtolower(trim($frame->expand(array_shift($args))));
    $inner = count($args) ? $frame->expand(array_shift($args)) : null;

    // Collect attributes from the named arguments; positional ones are skipped
    $attributes = [];
    foreach ($args as $arg) {
        $bits = $arg->splitArg();
        if (strval($bits['index']) !== '') {
            continue;
        }
        $attrName = trim($frame->expand($bits['name'], PPFrame::STRIP_COMMENTS));
        $attrValue = trim($frame->expand($bits['value']));
        // Drop one layer of surrounding quotes, if the value has them
        if (preg_match('/^(?:["\'](.+)["\']|""|\'\')$/s', $attrValue, $m)) {
            $attrValue = isset($m[1]) ? $m[1] : '';
        }
        $attributes[$attrName] = $attrValue;
    }

    if (in_array($tagName, $parser->getStripList())) {
        $params = [
            'name' => $tagName,
            'inner' => $inner,
            'attributes' => $attributes,
            'close' => "</{$tagName}>",
        ];
        return $parser->extensionSubstitution($params, $frame);
    }

    // we can't handle this tag (at least not now), so just re-emit it as an ordinary tag
    $attrText = '';
    foreach ($attributes as $attrName => $attrValue) {
        $attrText .= ' ' . htmlspecialchars($attrName) . '="' . htmlspecialchars($attrValue) . '"';
    }

    return $inner === null
        ? "<{$tagName}{$attrText}/>"
        : "<{$tagName}{$attrText}>{$inner}</{$tagName}>";
}
/**
 * Adaptor that lets a parser function invoke an extension tag.
 *
 * The first argument is the tag name, the optional second argument is the
 * tag's inner content, and any remaining named arguments become attributes.
 * A tag name that is not in the parser's strip list yields an error span.
 *
 * @param Parser $parser
 * @param PPFrame $frame
 * @param array $args
 * @return string
 */
public static function tagObj($parser, $frame, $args) {
    if (count($args) === 0) {
        return '';
    }

    $tagName = strtolower(trim($frame->expand(array_shift($args))));
    $inner = count($args) ? $frame->expand(array_shift($args)) : null;

    // Reject unknown tags before looking at any attribute arguments
    if (!in_array($tagName, $parser->getStripList())) {
        $errorText = wfMessage('unknown_extension_tag', $tagName)->inContentLanguage()->text();
        return '<span class="error">' . $errorText . '</span>';
    }

    // Collect attributes from the named arguments; positional ones are skipped
    $attributes = array();
    foreach ($args as $arg) {
        $bits = $arg->splitArg();
        if (strval($bits['index']) !== '') {
            continue;
        }
        $attrName = trim($frame->expand($bits['name'], PPFrame::STRIP_COMMENTS));
        $attrValue = trim($frame->expand($bits['value']));
        // Drop one layer of surrounding quotes, if the value has them
        if (preg_match('/^(?:["\'](.+)["\']|""|\'\')$/s', $attrValue, $m)) {
            $attrValue = isset($m[1]) ? $m[1] : '';
        }
        $attributes[$attrName] = $attrValue;
    }

    return $parser->extensionSubstitution(
        array(
            'name' => $tagName,
            'inner' => $inner,
            'attributes' => $attributes,
            'close' => "</{$tagName}>",
        ),
        $frame
    );
}
/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * The text is scanned once, left to right. Literal runs, ignored sections,
 * comments, extension tags, headings and bracket constructs are appended as
 * nodes to the accumulator of the current stack level; at the end the root
 * accumulator's node list becomes the children of a 'root' tree node.
 *
 * When $wgPreprocessorCacheThreshold is not false and the text is longer than
 * it, the result is looked up in / stored to $wgMemc, prefixed by an
 * 8-character version taken from self::CACHE_VERSION.
 *
 * @param string $text The text to parse
 * @param int $flags Bitwise combination of:
 *   Parser::PTD_FOR_INCLUSION  Handle <noinclude>/<includeonly> as if the text is being
 *                              included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * The output of this function is currently only cached in process memory, but a persistent
 * cache may be implemented at a later date which takes further advantage of these strict
 * dependency requirements.
 * NOTE(review): the body below already reads and writes $wgMemc, so the sentence above
 * looks out of date - confirm and reword.
 *
 * @return PPNode_Hash_Tree
 */
function preprocessToObj($text, $flags = 0) {
    wfProfileIn(__METHOD__);

    // Check cache.
    global $wgMemc, $wgPreprocessorCacheThreshold;

    // Only texts longer than the configured threshold are cached
    $cacheable = $wgPreprocessorCacheThreshold !== false && strlen($text) > $wgPreprocessorCacheThreshold;
    if ($cacheable) {
        wfProfileIn(__METHOD__ . '-cacheable');

        // The key depends on the text (via md5) and on the flags
        $cacheKey = wfMemcKey('preprocess-hash', md5($text), $flags);
        $cacheValue = $wgMemc->get($cacheKey);
        if ($cacheValue) {
            // First 8 characters hold the cache format version, the rest is the serialized tree
            $version = substr($cacheValue, 0, 8);
            if (intval($version) == self::CACHE_VERSION) {
                // NOTE(review): unserialize() of cache contents - presumably the cache
                // backend is trusted; confirm.
                $hash = unserialize(substr($cacheValue, 8));
                // From the cache
                wfDebugLog("Preprocessor", "Loaded preprocessor hash from memcached (key {$cacheKey})");
                wfProfileOut(__METHOD__ . '-cacheable');
                wfProfileOut(__METHOD__);
                return $hash;
            }
        }
        wfProfileIn(__METHOD__ . '-cache-miss');
    }

    // Bracket rules, keyed by opening character: closing character, node name per
    // matched bracket count (null means "literal text, no node"), and the minimum
    // and maximum bracket counts that can form a construct
    $rules = array(
        '{' => array('end' => '}', 'names' => array(2 => 'template', 3 => 'tplarg'), 'min' => 2, 'max' => 3),
        '[' => array('end' => ']', 'names' => array(2 => null), 'min' => 2, 'max' => 2));

    $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;

    $xmlishElements = $this->parser->getStripList();
    $enableOnlyinclude = false;
    if ($forInclusion) {
        $ignoredTags = array('includeonly', '/includeonly');
        $ignoredElements = array('noinclude');
        $xmlishElements[] = 'noinclude';
        // <onlyinclude> handling is only switched on when both the opening and
        // the closing tag occur somewhere in the text
        if (strpos($text, '<onlyinclude>') !== false && strpos($text, '</onlyinclude>') !== false) {
            $enableOnlyinclude = true;
        }
    } else {
        $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude');
        $ignoredElements = array('includeonly');
        $xmlishElements[] = 'includeonly';
    }
    $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags));

    // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
    $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA";

    $stack = new PPDStack_Hash();

    $searchBase = "[{<\n";
    $revText = strrev($text); // For fast reverse searches

    $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start
    $accum =& $stack->getAccum(); # Current accumulator
    $findEquals = false; # True to find equals signs in arguments
    $findPipe = false; # True to take notice of pipe characters
    $headingIndex = 1;
    $inHeading = false; # True if $i is inside a possible heading
    $noMoreGT = false; # True if there are no more greater-than (>) signs right of $i
    $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next <onlyinclude>
    $fakeLineStart = true; # Do a line-start run without outputting an LF character

    while (true) {
        //$this->memCheck();

        if ($findOnlyinclude) {
            // Ignore all input up to the next <onlyinclude>
            $startPos = strpos($text, '<onlyinclude>', $i);
            if ($startPos === false) {
                // Ignored section runs to the end
                $accum->addNodeWithText('ignore', substr($text, $i));
                break;
            }
            $tagEndPos = $startPos + strlen('<onlyinclude>'); // past-the-end
            $accum->addNodeWithText('ignore', substr($text, $i, $tagEndPos - $i));
            $i = $tagEndPos;
            $findOnlyinclude = false;
        }

        if ($fakeLineStart) {
            $found = 'line-start';
            $curChar = '';
        } else {
            # Find next opening brace, closing brace or pipe
            $search = $searchBase;
            if ($stack->top === false) {
                $currentClosing = '';
            } else {
                $currentClosing = $stack->top->close;
                $search .= $currentClosing;
            }
            if ($findPipe) {
                $search .= '|';
            }
            if ($findEquals) {
                // First equals will be for the template
                $search .= '=';
            }
            $rule = null;
            # Output literal section, advance input counter
            $literalLength = strcspn($text, $search, $i);
            if ($literalLength > 0) {
                $accum->addLiteral(substr($text, $i, $literalLength));
                $i += $literalLength;
            }
            if ($i >= strlen($text)) {
                if ($currentClosing == "\n") {
                    // Do a past-the-end run to finish off the heading
                    $curChar = '';
                    $found = 'line-end';
                } else {
                    # All done
                    break;
                }
            } else {
                // Classify the character the literal scan stopped on
                $curChar = $text[$i];
                if ($curChar == '|') {
                    $found = 'pipe';
                } elseif ($curChar == '=') {
                    $found = 'equals';
                } elseif ($curChar == '<') {
                    $found = 'angle';
                } elseif ($curChar == "\n") {
                    if ($inHeading) {
                        $found = 'line-end';
                    } else {
                        $found = 'line-start';
                    }
                } elseif ($curChar == $currentClosing) {
                    $found = 'close';
                } elseif (isset($rules[$curChar])) {
                    $found = 'open';
                    $rule = $rules[$curChar];
                } else {
                    # Some versions of PHP have a strcspn which stops on null characters
                    # Ignore and continue
                    ++$i;
                    continue;
                }
            }
        }

        if ($found == 'angle') {
            $matches = false;
            // Handle </onlyinclude>
            if ($enableOnlyinclude && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>') {
                // $i is not advanced here; the next iteration adds the closing tag
                // to the following 'ignore' node
                $findOnlyinclude = true;
                continue;
            }

            // Determine element name
            if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) {
                // Element name missing or not listed
                $accum->addLiteral('<');
                ++$i;
                continue;
            }
            // Handle comments
            if (isset($matches[2]) && $matches[2] == '!--') {
                // To avoid leaving blank lines, when a comment is both preceded
                // and followed by a newline (ignoring spaces), trim leading and
                // trailing spaces and one of the newlines.
                // NOTE(review): only ' ' is treated as whitespace here, while the
                // line-end branch below uses " \t" - confirm whether tabs should count.

                // Find the end
                $endPos = strpos($text, '-->', $i + 4);
                if ($endPos === false) {
                    // Unclosed comment in input, runs to end
                    $inner = substr($text, $i);
                    $accum->addNodeWithText('comment', $inner);
                    $i = strlen($text);
                } else {
                    // Search backwards for leading whitespace
                    $wsStart = $i ? $i - strspn($revText, ' ', strlen($text) - $i) : 0;
                    // Search forwards for trailing whitespace
                    // $wsEnd will be the position of the last space (or the '>' if there's none)
                    $wsEnd = $endPos + 2 + strspn($text, ' ', $endPos + 3);
                    // Eat the line if possible
                    // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                    // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                    // it's a possible beneficial b/c break.
                    if ($wsStart > 0 && substr($text, $wsStart - 1, 1) == "\n" && substr($text, $wsEnd + 1, 1) == "\n") {
                        // The comment node will span the leading spaces through the trailing newline
                        $startPos = $wsStart;
                        $endPos = $wsEnd + 1;
                        // Remove leading whitespace from the end of the accumulator
                        // Sanity check first though
                        $wsLength = $i - $wsStart;
                        if ($wsLength > 0 && $accum->lastNode instanceof PPNode_Hash_Text && substr($accum->lastNode->value, -$wsLength) === str_repeat(' ', $wsLength)) {
                            $accum->lastNode->value = substr($accum->lastNode->value, 0, -$wsLength);
                        }
                        // Do a line-start run next time to look for headings after the comment
                        $fakeLineStart = true;
                    } else {
                        // No line to eat, just take the comment itself
                        $startPos = $i;
                        $endPos += 2;
                    }

                    if ($stack->top) {
                        // Record where the comment sits on the current part; the
                        // line-end branch reads commentEnd/visualEnd back
                        $part = $stack->top->getCurrentPart();
                        if (!(isset($part->commentEnd) && $part->commentEnd == $wsStart - 1)) {
                            $part->visualEnd = $wsStart;
                        }
                        // Else comments abutting, no change in visual end
                        $part->commentEnd = $endPos;
                    }
                    $i = $endPos + 1;
                    $inner = substr($text, $startPos, $endPos - $startPos + 1);
                    $accum->addNodeWithText('comment', $inner);
                }
                continue;
            }
            $name = $matches[1];
            $lowerName = strtolower($name);
            $attrStart = $i + strlen($name) + 1;

            // Find end of tag
            $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart);
            if ($tagEndPos === false) {
                // Infinite backtrack
                // Disable tag search to prevent worst-case O(N^2) performance
                $noMoreGT = true;
                $accum->addLiteral('<');
                ++$i;
                continue;
            }

            // Handle ignored tags
            if (in_array($lowerName, $ignoredTags)) {
                $accum->addNodeWithText('ignore', substr($text, $i, $tagEndPos - $i + 1));
                $i = $tagEndPos + 1;
                continue;
            }

            $tagStartPos = $i;
            if ($text[$tagEndPos - 1] == '/') {
                // Short end tag
                $attrEnd = $tagEndPos - 1;
                $inner = null;
                $i = $tagEndPos + 1;
                $close = null;
            } else {
                $attrEnd = $tagEndPos;
                // Find closing tag
                if (preg_match("/<\\/" . preg_quote($name, '/') . "\\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)) {
                    $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1);
                    $i = $matches[0][1] + strlen($matches[0][0]);
                    $close = $matches[0][0];
                } else {
                    // No end tag -- let it run out to the end of the text.
                    $inner = substr($text, $tagEndPos + 1);
                    $i = strlen($text);
                    $close = null;
                }
            }
            // <includeonly> and <noinclude> just become <ignore> tags
            if (in_array($lowerName, $ignoredElements)) {
                $accum->addNodeWithText('ignore', substr($text, $tagStartPos, $i - $tagStartPos));
                continue;
            }

            if ($attrEnd <= $attrStart) {
                $attr = '';
            } else {
                // Note that the attr element contains the whitespace between name and attribute,
                // this is necessary for precise reconstruction during pre-save transform.
                $attr = substr($text, $attrStart, $attrEnd - $attrStart);
            }

            // Build the 'ext' node: name, attr, then inner and close only when present
            $extNode = new PPNode_Hash_Tree('ext');
            $extNode->addChild(PPNode_Hash_Tree::newWithText('name', $name));
            $extNode->addChild(PPNode_Hash_Tree::newWithText('attr', $attr));
            if ($inner !== null) {
                $extNode->addChild(PPNode_Hash_Tree::newWithText('inner', $inner));
            }
            if ($close !== null) {
                $extNode->addChild(PPNode_Hash_Tree::newWithText('close', $close));
            }
            $accum->addNode($extNode);
        } elseif ($found == 'line-start') {
            // Is this the start of a heading?
            // Line break belongs before the heading element in any case
            if ($fakeLineStart) {
                $fakeLineStart = false;
            } else {
                $accum->addLiteral($curChar);
                $i++;
            }

            // Count leading equals signs, capped at 6
            $count = strspn($text, '=', $i, 6);
            if ($count == 1 && $findEquals) {
                // DWIM: This looks kind of like a name/value separator
                // Let's let the equals handler have it and break the potential heading
                // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
            } elseif ($count > 0) {
                $piece = array(
                    'open' => "\n",
                    'close' => "\n",
                    'parts' => array(new PPDPart_Hash(str_repeat('=', $count))),
                    'startPos' => $i,
                    'count' => $count);
                $stack->push($piece);
                $accum =& $stack->getAccum();
                // Refresh the local search flags from the new stack top
                extract($stack->getFlags());
                $i += $count;
            }
        } elseif ($found == 'line-end') {
            $piece = $stack->top;
            // A heading must be open, otherwise \n wouldn't have been in the search list
            assert($piece->open == "\n");
            $part = $piece->getCurrentPart();
            // Search back through the input to see if it has a proper close
            // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
            $wsLength = strspn($revText, " \t", strlen($text) - $i);
            $searchStart = $i - $wsLength;
            if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) {
                // Comment found at line end
                // Search for equals signs before the comment
                $searchStart = $part->visualEnd;
                $searchStart -= strspn($revText, " \t", strlen($text) - $searchStart);
            }
            $count = $piece->count;
            $equalsLength = strspn($revText, '=', strlen($text) - $searchStart);
            if ($equalsLength > 0) {
                if ($searchStart - $equalsLength == $piece->startPos) {
                    // This is just a single string of equals signs on its own line
                    // Replicate the doHeadings behaviour /={count}(.+)={count}/
                    // First find out how many equals signs there really are (don't stop at 6)
                    $count = $equalsLength;
                    if ($count < 3) {
                        $count = 0;
                    } else {
                        $count = min(6, intval(($count - 1) / 2));
                    }
                } else {
                    $count = min($equalsLength, $count);
                }
                if ($count > 0) {
                    // Normal match, output <h>
                    // The node is named 'possible-h' here; only nodes that end up directly
                    // in the root accumulator are renamed to 'h' after the main loop
                    $element = new PPNode_Hash_Tree('possible-h');
                    $element->addChild(new PPNode_Hash_Attr('level', $count));
                    $element->addChild(new PPNode_Hash_Attr('i', $headingIndex++));
                    // Splice the accumulator's node list in after the two attribute nodes
                    $element->lastChild->nextSibling = $accum->firstNode;
                    $element->lastChild = $accum->lastNode;
                } else {
                    // Single equals sign on its own line, count=0
                    $element = $accum;
                }
            } else {
                // No match, no <h>, just pass down the inner text
                $element = $accum;
            }
            // Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();
            extract($stack->getFlags());

            // Append the result to the enclosing accumulator
            if ($element instanceof PPNode) {
                $accum->addNode($element);
            } else {
                $accum->addAccum($element);
            }
            // Note that we do NOT increment the input pointer.
            // This is because the closing linebreak could be the opening linebreak of
            // another heading. Infinite loops are avoided because the next iteration MUST
            // hit the heading open case above, which unconditionally increments the
            // input pointer.
        } elseif ($found == 'open') {
            # count opening brace characters
            $count = strspn($text, $curChar, $i);

            # we need to add to stack only if opening brace count is enough for one of the rules
            if ($count >= $rule['min']) {
                # Add it to the stack
                $piece = array(
                    'open' => $curChar,
                    'close' => $rule['end'],
                    'count' => $count,
                    'lineStart' => $i > 0 && $text[$i - 1] == "\n");

                $stack->push($piece);
                $accum =& $stack->getAccum();
                extract($stack->getFlags());
            } else {
                # Add literal brace(s)
                $accum->addLiteral(str_repeat($curChar, $count));
            }
            $i += $count;
        } elseif ($found == 'close') {
            $piece = $stack->top;
            # lets check if there are enough characters for closing brace
            $maxCount = $piece->count;
            $count = strspn($text, $curChar, $i, $maxCount);

            # check for maximum matching characters (if there are 5 closing
            # characters, we will probably need only 3 - depending on the rules)
            $rule = $rules[$piece->open];
            if ($count > $rule['max']) {
                # The specified maximum exists in the callback array, unless the caller
                # has made an error
                $matchingCount = $rule['max'];
            } else {
                # Count is less than the maximum
                # Skip any gaps in the callback array to find the true largest match
                # Need to use array_key_exists not isset because the callback can be null
                $matchingCount = $count;
                while ($matchingCount > 0 && !array_key_exists($matchingCount, $rule['names'])) {
                    --$matchingCount;
                }
            }

            if ($matchingCount <= 0) {
                # No matching element found in callback array
                # Output a literal closing brace and continue
                $accum->addLiteral(str_repeat($curChar, $count));
                $i += $count;
                continue;
            }
            $name = $rule['names'][$matchingCount];
            if ($name === null) {
                // No element, just literal text
                $element = $piece->breakSyntax($matchingCount);
                $element->addLiteral(str_repeat($rule['end'], $matchingCount));
            } else {
                # Create XML element
                # Note: $parts is already XML, does not need to be encoded further
                $parts = $piece->parts;
                $titleAccum = $parts[0]->out;
                unset($parts[0]);

                $element = new PPNode_Hash_Tree($name);

                # The invocation is at the start of the line if lineStart is set in
                # the stack, and all opening brackets are used up.
                if ($maxCount == $matchingCount && !empty($piece->lineStart)) {
                    $element->addChild(new PPNode_Hash_Attr('lineStart', 1));
                }
                // The first part becomes the 'title' node
                $titleNode = new PPNode_Hash_Tree('title');
                $titleNode->firstChild = $titleAccum->firstNode;
                $titleNode->lastChild = $titleAccum->lastNode;
                $element->addChild($titleNode);
                $argIndex = 1;
                foreach ($parts as $part) {
                    if (isset($part->eqpos)) {
                        // Find equals
                        // Walk the part's node list up to the recorded 'equals' node,
                        // remembering the node just before it
                        $lastNode = false;
                        for ($node = $part->out->firstNode; $node; $node = $node->nextSibling) {
                            if ($node === $part->eqpos) {
                                break;
                            }
                            $lastNode = $node;
                        }
                        if (!$node) {
                            throw new MWException(__METHOD__ . ': eqpos not found');
                        }
                        if ($node->name !== 'equals') {
                            throw new MWException(__METHOD__ . ': eqpos is not equals');
                        }
                        $equalsNode = $node;

                        // Construct name node
                        $nameNode = new PPNode_Hash_Tree('name');
                        if ($lastNode !== false) {
                            // Cut the list just before the equals node
                            $lastNode->nextSibling = false;
                            $nameNode->firstChild = $part->out->firstNode;
                            $nameNode->lastChild = $lastNode;
                        }

                        // Construct value node
                        $valueNode = new PPNode_Hash_Tree('value');
                        if ($equalsNode->nextSibling !== false) {
                            $valueNode->firstChild = $equalsNode->nextSibling;
                            $valueNode->lastChild = $part->out->lastNode;
                        }
                        $partNode = new PPNode_Hash_Tree('part');
                        $partNode->addChild($nameNode);
                        $partNode->addChild($equalsNode->firstChild);
                        $partNode->addChild($valueNode);
                        $element->addChild($partNode);
                    } else {
                        // Positional argument: the name node carries only a running index
                        $partNode = new PPNode_Hash_Tree('part');
                        $nameNode = new PPNode_Hash_Tree('name');
                        $nameNode->addChild(new PPNode_Hash_Attr('index', $argIndex++));
                        $valueNode = new PPNode_Hash_Tree('value');
                        $valueNode->firstChild = $part->out->firstNode;
                        $valueNode->lastChild = $part->out->lastNode;
                        $partNode->addChild($nameNode);
                        $partNode->addChild($valueNode);
                        $element->addChild($partNode);
                    }
                }
            }

            # Advance input pointer
            $i += $matchingCount;

            # Unwind the stack
            $stack->pop();
            $accum =& $stack->getAccum();

            # Re-add the old stack element if it still has unmatched opening characters remaining
            if ($matchingCount < $piece->count) {
                $piece->parts = array(new PPDPart_Hash());
                $piece->count -= $matchingCount;
                # do we still qualify for any callback with remaining count?
                $names = $rules[$piece->open]['names'];
                $skippedBraces = 0;
                // Keep a handle on the accumulator below the re-pushed piece so that
                // skipped opening characters are added there as literal text
                $enclosingAccum =& $accum;
                while ($piece->count) {
                    if (array_key_exists($piece->count, $names)) {
                        $stack->push($piece);
                        $accum =& $stack->getAccum();
                        break;
                    }
                    --$piece->count;
                    $skippedBraces++;
                }
                $enclosingAccum->addLiteral(str_repeat($piece->open, $skippedBraces));
            }

            extract($stack->getFlags());

            # Add XML element to the enclosing accumulator
            if ($element instanceof PPNode) {
                $accum->addNode($element);
            } else {
                $accum->addAccum($element);
            }
        } elseif ($found == 'pipe') {
            $findEquals = true; // shortcut for getFlags()
            $stack->addPart();
            $accum =& $stack->getAccum();
            ++$i;
        } elseif ($found == 'equals') {
            $findEquals = false; // shortcut for getFlags()
            // Add an 'equals' node and remember it on the part; the close branch
            // uses it to split name from value
            $accum->addNodeWithText('equals', '=');
            $stack->getCurrentPart()->eqpos = $accum->lastNode;
            ++$i;
        }
    }

    # Output any remaining unclosed brackets
    foreach ($stack->stack as $piece) {
        $stack->rootAccum->addAccum($piece->breakSyntax());
    }

    # Enable top-level headings
    for ($node = $stack->rootAccum->firstNode; $node; $node = $node->nextSibling) {
        if (isset($node->name) && $node->name === 'possible-h') {
            $node->name = 'h';
        }
    }

    $rootNode = new PPNode_Hash_Tree('root');
    $rootNode->firstChild = $stack->rootAccum->firstNode;
    $rootNode->lastChild = $stack->rootAccum->lastNode;

    // Cache
    if ($cacheable) {
        // Stored value is the zero-padded 8-digit version followed by the serialized tree;
        // 86400 is passed as the third argument of set() (presumably the expiry in
        // seconds, i.e. one day - confirm against the cache class)
        $cacheValue = sprintf("%08d", self::CACHE_VERSION) . serialize($rootNode);
        $wgMemc->set($cacheKey, $cacheValue, 86400);
        wfProfileOut(__METHOD__ . '-cache-miss');
        wfProfileOut(__METHOD__ . '-cacheable');
        wfDebugLog("Preprocessor", "Saved preprocessor Hash to memcached (key {$cacheKey})");
    }

    wfProfileOut(__METHOD__);
    return $rootNode;
}
/**
 * Return the parent's strip list extended with the include-control tags
 * and the 'references' tag.
 *
 * @return array
 */
public function getStripList() {
    // The cast guards against a parent that returns a non-array value
    $inherited = (array) parent::getStripList();
    $extraTags = array('noinclude', 'includeonly', 'onlyinclude', 'references');

    return array_merge($inherited, $extraTags);
}