/**
 * @brief Performs a search for the given pattern past the given index.
 *
 * Results are memoised per pattern in $this->cache. A cached entry is either
 * false (the pattern has no further matches) or array(offset, groups). A
 * cached hit is reused as long as its offset is not before $index; otherwise
 * the pattern is re-run against $this->string starting at $index.
 *
 * @param $search the pattern to search for
 * @param $index the minimum string index (offset) of a result
 * @param $matches a reference to the return location of the match groups
 * @return the index or false if no match is found.
 * @throw Exception if preg_match() fails and LUMINOUS_DEBUG is truthy
 */
public function match($search, $index, &$matches)
{
    if (isset($this->cache[$search])) {
        $cached = $this->cache[$search];
        if ($cached === false) {
            // no more results
            return false;
        }
        $r = $cached[0];
        // the out-parameter is populated from the cache even when the entry
        // turns out to be stale; callers have always seen this behaviour
        $matches = $cached[1];
        assert($matches !== null);
        if ($r >= $index) {
            // cache is good!
            return $r;
        }
    }

    // cache not set, or out of date, we have to perform the match
    $ret = preg_match($search, $this->string, $matches_, PREG_OFFSET_CAPTURE, $index);
    if (!$ret) {
        if ($ret === false && LUMINOUS_DEBUG) {
            throw new Exception('preg_match returned false for pattern: "' . $search . '", with code: ' . LuminousUtils::pcre_error_decode(preg_last_error()));
        }
        $this->cache[$search] = false;
        return false;
    }

    $r = $matches_[0][1];
    // strip the offsets from the match groups. This is done by value into a
    // fresh array: iterating by reference would leave a dangling reference
    // inside the array that is then stored in the cache and returned.
    $groups = array();
    foreach ($matches_ as $i => $v) {
        $groups[$i] = $v[0];
    }
    $this->cache[$search] = array($r, $groups);
    $matches = $groups;
    return $r;
}
/**
 * @brief Checks LuminousUtils::escape_token() against a single token.
 *
 * Asserts that the name (index 0) is unchanged, that the text (index 1) is
 * left alone when index 2 of the input is truthy and otherwise equals
 * LuminousUtils::escape_string() of the input text, and that index 2 of the
 * result is truthy.
 *
 * @param $token the token array to run through escape_token()
 */
function _test_escape_token($token)
{
    $result = LuminousUtils::escape_token($token);

    // name should be unchanged
    assert($token[0] === $result[0]);

    // text is only expected to change when the input's flag was not set
    if ($token[2]) {
        $want = $token[1];
    } else {
        $want = LuminousUtils::escape_string($token[1]);
    }
    assert($result[1] === $want);

    // the flag must be set on the way out
    assert($result[2]);
}
/**
 * @brief Tags tilde sequences inside a string token as INTERPOLATION.
 *
 * A sequence is a '~' followed by either a run of digits or any single
 * character. Tokens without a '~' are returned untouched (and unescaped).
 *
 * @param $token the token array; index 1 holds the text
 * @return the token, escaped and with tilde sequences tagged if any exist
 */
static function str_filter($token)
{
    // strpos() returns 0 for a match at the very start of the text, so the
    // "not found" test has to be a strict comparison against false.
    if (strpos($token[1], '~') === false) {
        return $token;
    }
    $token = LuminousUtils::escape_token($token);
    $token[1] = preg_replace('/~(?:\\d+|.)/', '<INTERPOLATION>$0</INTERPOLATION>', $token[1]);
    return $token;
}
/**
 * @brief Adds STRING, DOCTAG and DOCSTR markup inside a comment token.
 *
 * The token is escaped first. Double-quoted runs whose opening quote is not
 * the first character are wrapped in STRING. If the text contains a colon, a
 * capitalised word sequence directly after a leading '"' (and any non-word
 * characters) is wrapped in DOCTAG and the text after the colon in DOCSTR.
 *
 * @param $token the token array; index 1 holds the comment text
 * @return the escaped token with the extra markup applied
 */
static function comment_filter($token)
{
    $token = LuminousUtils::escape_token($token);
    $text = $token[1];

    // It pays to run the strpos checks first.
    $has_inner_quote = strpos(substr($text, 1), '"') !== false;
    if ($has_inner_quote) {
        $text = preg_replace('/(?<!^)"(?>[^"]*)"/', "<STRING>\$0</STRING>", $text);
    }

    $has_colon = strpos($text, ':') !== false;
    if ($has_colon) {
        $text = preg_replace('/(?<=^")((?>\\W*))((?>[A-Z]\\w+(?>(?>\\s+\\w+)*)))(:\\s*)(.*)/', '$1<DOCTAG>$2</DOCTAG>$3<DOCSTR>$4</DOCSTR>', $text);
    }

    $token[1] = $text;
    return $token;
}
/**
 * @brief Tags interpolated variables inside a string token as VARIABLE.
 *
 * Only tokens whose text starts with a double quote, or whose name is
 * 'HEREDOC', are considered, and only if the text contains a '$'. Anything
 * else is returned untouched (and unescaped).
 *
 * @param $token the token array; index 0 is the name, index 1 the text
 * @return the token, escaped and with variables tagged where applicable
 */
static function str_filter($token)
{
    $interpolates = $token[1][0] === '"' || $token[0] === 'HEREDOC';
    if (!$interpolates || strpos($token[1], '$') === false) {
        return $token;
    }

    $escaped = LuminousUtils::escape_token($token);
    // matches $var, ${var} and {$var} syntax
    $escaped[1] = preg_replace('/ (?: \\$\\{ | \\{\\$ ) [^}]++ \\} | \\$\\$?[a-zA-Z_]\\w* /x', '<VARIABLE>$0</VARIABLE>', $escaped[1]);
    return $escaped;
}
/**
 * @brief Escapes a string token and tags variables inside it as VARIABLE.
 *
 * A variable is a '$' followed by either a run of word characters or a
 * brace-enclosed group that contains no '}' or newline.
 *
 * @param $token the token array; index 1 holds the text
 * @return the escaped token with variables tagged
 */
public static function string_filter($token)
{
    $escaped = LuminousUtils::escape_token($token);
    $pattern = "/\\\$(?:\\w+|\\{[^}\n]+\\})/";
    $replacement = '<VARIABLE>$0</VARIABLE>';
    $escaped[1] = preg_replace($pattern, $replacement, $escaped[1]);
    return $escaped;
}
/**
 * @brief Handles a quote-like operator whose opening has already been matched.
 *
 * Advances the scanner past the whole match, records it as a DELIMITER and
 * then consumes the delimited body. The operator name ($matches[1]) decides
 * the token type: m, qr, s, tr and y give REGEX, qw gives SPLIT_STRING and
 * anything else gives STRING. $matches[3] is the opening delimiter.
 *
 * @param $matches match groups for the operator opening
 */
function str_override($matches)
{
    $this->pos($this->pos() + strlen($matches[0]));
    $this->record($matches[0], 'DELIMITER');

    $operator = $matches[1];
    $opening = $matches[3];
    $takes_two = in_array($operator, array('s', 'tr', 'y'), true);

    if ($takes_two || $operator === 'm' || $operator === 'qr') {
        $type = 'REGEX';
    } elseif ($operator === 'qw') {
        $type = 'SPLIT_STRING';
    } else {
        $type = 'STRING';
    }
    $this->consume_string($opening, $type);

    if ($takes_two) {
        // s/tr/y take two strings, e.g. s/something/somethingelse/, so the
        // second string has to be consumed as well.
        $this->skip_whitespace();
        if (LuminousUtils::balance_delimiter($opening) !== $opening) {
            // balanced delimiters: a fresh delimiter opens the second part,
            // e.g. s[something][somethingelse]
            $second = $this->scan('/[^a-zA-Z0-9]/');
            if ($second !== null) {
                $this->record($second, 'DELIMITER');
                $this->consume_string($second, 'STRING');
            }
        } else {
            // same delimiter on both sides: the second part follows directly
            $this->consume_string($opening, 'STRING');
        }
    }

    // trailing modifier letters after a regex
    if ($type === 'REGEX' && $this->scan('/[cgimosxpe]+/')) {
        $this->record($this->match(), 'KEYWORD');
    }
}
/**
 * @brief Scans a diff, recording one token per line or per code block.
 *
 * Each iteration starts at the beginning of a line and classifies it as a
 * 'diff' command line, a range marker, a separator, a file header or a block
 * of changed/unchanged source lines. File headers select a child scanner
 * (via get_child_scanner()) which is then used to highlight the source code
 * embedded in subsequent code blocks.
 */
function main()
{
    // we're aiming to handle context, unified and normal diff all at once here
    // because it doesn't really seem that hard.
    $child = null;
    $last_index = -1;
    while (!$this->eos()) {
        // every pass must make progress and must start on a line boundary
        $index = $this->pos();
        assert($index > $last_index);
        $last_index = $index;
        assert($this->bol());

        $tok = null;
        if ($this->scan('/diff\\s.*$/m') !== null) {
            $tok = 'KEYWORD';
        } elseif ($this->scan($this->patterns['range']) !== null) {
            $tok = 'DIFF_RANGE';
        } elseif ($this->scan("/-{3}[ \t]*\$/m")) {
            $tok = null;
        } elseif ($this->scan('/(?:\\**|=*|\\w.*)$/m') !== null) {
            $tok = 'KEYWORD';
        } elseif ($this->scan("@[+\\-\\*]{3}(\\s+([^\\s]*)([ \t]|\$))?.*@m") !== null) {
            $m = $this->match_groups();
            // unified uses +++, context uses *
            if ($m[0][0] === '+' || $m[0][0] === '*') {
                $tok = 'DIFF_HEADER_NEW';
            } else {
                $tok = 'DIFF_HEADER_OLD';
            }
            if (isset($m[2])) {
                $filename = preg_replace('@.*\\\\/@', '', $m[2]);
                $child = self::get_child_scanner($filename);
            }
        } elseif ($this->scan('/\\\\.*/') !== null) {
            $tok = null;
        } elseif ($this->scan($this->patterns['codeblock']) !== null) {
            // this is actual source code.
            // we're going to format this here.
            // we're going to extract the block, and try to re-assemble it as
            // verbatim code, then highlight it via a child scanner, then split up
            // the lines, re-apply the necessary prefixes (e.g. + or -) to them,
            // and store them as being a DIFF_ token.
            // we have to do it like this, rather than line by line, otherwise
            // multiline tokens aren't going to work properly. There's still a risk
            // that the diff will be fragmented such the child scanner gets it
            // wrong but that can't be helped.

            // TODO restructure this so the complicated bits aren't done if there's
            // no child scanner to pass it down to
            $block = $this->match();
            if (!strlen($block)) {
                assert(0);
            }
            $lines = explode("\n", $block);
            // This must start out as an array: appending with [] to a string
            // is a fatal error on PHP >= 7.1.
            $verbatim_ = array();
            $types = array();
            $prefixes = array();
            foreach ($lines as $l) {
                if (!strlen($l) || $l[0] === ' ') {
                    $types[] = 'DIFF_UNCHANGED';
                } elseif ($l[0] === '+' || $l[0] === '>') {
                    $types[] = 'DIFF_NEW';
                } elseif ($l[0] === '!' || $l[0] === '<' || $l[0] === '-') {
                    $types[] = 'DIFF_OLD';
                } else {
                    // NOTE(review): with assertions disabled this branch adds
                    // no type, leaving $types shorter than $prefixes - confirm
                    // the codeblock pattern can never produce such a line.
                    assert(0);
                }
                $prefixes[] = isset($l[0]) ? $l[0] : '';
                $verbatim_[] = substr($l, 1);
            }
            $verbatim = implode("\n", $verbatim_);

            $escaped = false;
            if ($child !== null) {
                $c = new $child();
                $c->init();
                $c->string($verbatim);
                $c->main();
                $tagged = $c->tagged();
                $escaped = true;
            } else {
                $tagged = $verbatim;
            }

            $exp = explode("\n", $tagged);
            assert(count($exp) === count($prefixes));
            $last = count($exp) - 1;
            foreach ($exp as $i => $v) {
                $t = $types[$i];
                // if the sub-scanner escaped the line, we also need to escape the
                // prefix for consistency
                $prefix = $prefixes[$i];
                if ($escaped) {
                    $prefix = LuminousUtils::escape_string($prefix);
                }
                $text = $prefix . $v;
                $this->record($text, $t, $escaped);
                // put back the newline that explode() removed between lines
                if ($i < $last) {
                    $this->record("\n", null);
                }
            }
            if ($this->eol()) {
                $this->record($this->get(), null);
            }
            continue;
        } else {
            $this->scan('/.*/');
        }

        // previous else clause can capture empty strings
        if ($this->match() !== '') {
            $this->record($this->match(), $tok);
        }
        assert($this->eol());
        // consume newline
        if (!$this->eos()) {
            $this->record($this->get(), null);
        }
    }
}
/**
 * Recursive function to collapse the token tree into XML
 *
 * String children are escaped and child nodes are collapsed recursively; the
 * concatenated result becomes the text of a token named after the node. That
 * token is passed through rule_mapper_filter() and then through every filter
 * registered under the node's original token name. A token whose final name
 * is null is returned as bare text, anything else is wrapped with
 * LuminousUtils::tag_block().
 *
 * @param $node a tree node with 'children' and 'token_name' entries
 * @return the XML text for the node and everything below it
 * @internal
 */
protected function collapse_token_tree($node)
{
    $xml = '';
    foreach ($node['children'] as $child) {
        $xml .= is_string($child)
            ? LuminousUtils::escape_string($child)
            : $this->collapse_token_tree($child);
    }

    $node_name = $node['token_name'];
    $mapped = $this->rule_mapper_filter(array(array($node['token_name'], $xml, true)));
    $token = $mapped[0];

    // filters are looked up by the node's own name, not the mapped one
    if (isset($this->filters[$node_name])) {
        foreach ($this->filters[$node_name] as $filter) {
            $token = call_user_func($filter[1], $token);
        }
    }

    $final_name = $token[0];
    $final_text = $token[1];
    if ($final_name === null) {
        return $final_text;
    }
    return LuminousUtils::tag_block($final_name, $final_text);
}
/**
 * @brief Tries to highlight PCRE style regular expression syntax
 *
 * The token is first run through self::string() and escaped. When
 * $delimited is true the leading and trailing delimiters are tagged as
 * DELIMITER and any trailing letters as KEYWORD (flags). Operators, group
 * openers, parentheses, square brackets and {n,m} repeat counts are then
 * tagged in a fixed order, and '#' comments are tagged if the 'x' flag was
 * seen.
 *
 * @param $token the token array; index 1 holds the regex text
 * @param $delimited whether the text includes its delimiters and flags
 * @return the token with regex markup applied
 */
static function pcre($token, $delimited = true)
{
    $token = self::string($token);
    $token = LuminousUtils::escape_token($token);
    $text = $token[1];
    $flags = array();

    if ($delimited) {
        // opening delimiter
        $text = preg_replace('/^[^[:alnum:]<>\\s]/', '<DELIMITER>$0</DELIMITER>', $text);
        if (preg_match("/[[:alpha:]]+\$/", $text, $trailing)) {
            // closing delimiter followed by flag letters
            $flags = str_split($trailing[0]);
            $text = preg_replace("/((?<!\\A)[^[:alnum:]\\s<>])([[:alpha:]]+)\$/", "<DELIMITER>\$1</DELIMITER><KEYWORD>\$2</KEYWORD>", $text);
        } else {
            // closing delimiter with no flags
            $text = preg_replace('/[^[:alnum:]<>]$/', '<DELIMITER>$0</DELIMITER>', $text);
        }
    }

    // applied strictly in this order; each rule sees the previous one's output
    $rules = array(
        // unescaped * + . | and a ? that does not follow ( or a backslash
        array("/((?<!\\\\)[\\*\\+\\.|])|((?<![\\(\\\\])\\?)/", "<REGEX_OPERATOR>\$0</REGEX_OPERATOR>"),
        // group type markers directly after an opening parenthesis
        array("/(?<=\\()\\?(?:(?:[a-zA-Z:!|=])|(?:(?:<)[=!]))/", "<REGEX_SUBPATTERN>\$0</REGEX_SUBPATTERN>"),
        // unescaped parentheses
        array("/(?<!\\\\)[\\(\\)]/", "<REGEX_SUBPATTERN_MARKER>\$0</REGEX_SUBPATTERN_MARKER>"),
        // unescaped square brackets
        array("/(?<!\\\\)[\\[\\]]/", "<REGEX_CLASS_MARKER>\$0</REGEX_CLASS_MARKER>"),
        // unescaped {n}, {n,}, {n,m} and {,m}
        array("/(?<!\\\\)\n \\{\n (\n ((?>\\d+)(,(?>\\d+)?)?)\n |\n (,(?>\\d+))\n )\n \\}/x", "<REGEX_REPEAT_MARKER>\$0</REGEX_REPEAT_MARKER>"),
    );
    foreach ($rules as $rule) {
        $text = preg_replace($rule[0], $rule[1], $text);
    }

    // extended regex: # signifies a comment
    if (in_array('x', $flags)) {
        $text = preg_replace('/(?<!\\\\)#.*$/m', '<COMMENT>$0</COMMENT>', $text);
    }

    $token[1] = $text;
    return $token;
}
/**
 * @brief Escapes a preprocessor token and post-processes parts of it.
 *
 * Every match of the pattern below is handed to
 * LuminousCppScanner::preprocessor_filter_cb(). The pattern matches either a
 * double-quoted string (terminated by a quote or end of text) or the text
 * between '<' and '>' (both as the named group STR), a '//' comment, or a
 * block comment that may run to the end of the text.
 *
 * @param $token the token array; index 1 holds the text
 * @return the escaped token with the callback's replacements applied
 */
static function preprocessor_filter($token)
{
    $escaped = LuminousUtils::escape_token($token);
    // NOTE(review): matching happens after escaping. If escape_token()
    // rewrites '<' and '>' as entities, the angle-bracket alternative can
    // never match - confirm against LuminousUtils.
    $pattern = "@\n (?P<STR> \" (?> [^\\\\\n\"]+ | \\\\. )* (?: \"|\$) | (?<=<) .*? (?=>))\n | // .*\n | /\\* (?s:.*?) (\\*/ | \$)\n @x";
    $callback = array('LuminousCppScanner', 'preprocessor_filter_cb');
    $escaped[1] = preg_replace_callback($pattern, $callback, $escaped[1]);
    return $escaped;
}
/**
 * @brief Main scanning loop: tokenises the source until the end of the
 * string or until an enclosing context should take over again.
 *
 * The loop stops early when scanning interpolated code and the brace count
 * drops to zero or below, when $this->rails is set and a '%>' (optionally
 * preceded by '-') is next, or after an '__END__' identifier. The branches of
 * the elseif chain are tried in order and each one consumes input, so their
 * order is significant.
 */
public function main()
{
    while (!$this->eos()) {
        // pending heredocs are handled once the scanner is at a line start
        if ($this->bol() && !empty($this->heredocs)) {
            $this->do_heredoc();
        }
        // inside interpolated code: keep a running brace count and leave the
        // loop at the '}' that brings it to zero or below
        if ($this->interpolation) {
            $c = $this->peek();
            if ($c === '{') {
                $this->curley_braces++;
            } elseif ($c === '}') {
                $this->curley_braces--;
                if ($this->curley_braces <= 0) {
                    break;
                }
            }
        }
        // in rails mode, stop (without consuming) at '%>' or '-%>'
        if ($this->rails && $this->check('/-?%>/')) {
            break;
        }
        // $c is the next character; it is used as a cheap pre-check before
        // the more expensive regex scans below
        $c = $this->peek();
        if ($c === '=' && $this->scan('/^=begin .*? (^=end|\\z)/msx')) {
            // =begin ... =end block; may run to the end of the input
            $this->record($this->match(), 'DOCCOMMENT');
        } elseif ($c === '#' && $this->scan($this->comment_regex)) {
            $this->record($this->match(), 'COMMENT');
        } elseif ($this->scan($this->numeric) !== null) {
            $this->record($this->match(), 'NUMERIC');
        } elseif ($c === '$' && $this->scan('/\\$ (?: (?:[!@`\'\\+1~=\\/\\\\,;\\._0\\*\\$\\?:"&<>]) | (?: -[0adFiIlpvw]) | (?:DEBUG|FILENAME|LOAD_PATH|stderr|stdin|stdout|VERBOSE) )/x') || $this->scan('/(\\$|@@?)\\w+/')) {
            // note the precedence: ($c === '$' && special-variable scan) OR
            // the generic $name / @name / @@name scan
            $this->record($this->match(), 'VARIABLE');
        } elseif ($this->scan('/:\\w+/')) {
            // a colon followed by word characters
            $this->record($this->match(), 'VALUE');
        } elseif ($c === '<' && $this->scan('/(<<(-?))([\'"`]?)([A-Z_]\\w*)(\\3)/i')) {
            // heredoc opener: only the opener is recorded here; the body is
            // queued and dealt with by do_heredoc() later
            $m = $this->match_groups();
            $this->record($m[0], 'DELIMITER');
            // queued entry: identifier, whether '<<-' was used, and whether the
            // identifier was not single-quoted (presumably the interpolation
            // flag - confirm in do_heredoc())
            $hdoc = array($m[4], $m[2] === '-', $m[3] !== "'");
            $this->heredocs[] = $hdoc;
        } elseif (($c === '"' || $c === "'" || $c === '`' || $c === '%') && $this->scan('/[\'"`]|%( [qQrswWx](?![[:alnum:]]|$) | (?![[:alnum:]\\s]|$))/xm') || $c === '/' && $this->is_regex()) {
            // a string-like literal: a quote, a %-literal, or a '/' that
            // is_regex() accepts. For '/' nothing has been consumed yet.
            $interpolation = false;
            $type = 'STRING';
            $delimiter;
            $pos;
            $fancy_delim = false;
            $split = false;
            if ($c === '/') {
                $interpolation = true;
                $type = 'REGEX';
                $delimiter = $c;
                // remember where the literal starts, then consume the '/'
                $pos = $this->pos();
                $this->get();
            } else {
                $pos = $this->match_pos();
                $delimiter = $this->match();
                if ($delimiter === '"') {
                    $interpolation = true;
                } elseif ($delimiter === "'") {
                    // single-quoted: the defaults set above already apply
                } elseif ($delimiter === '`') {
                    $type = 'FUNCTION';
                } else {
                    // %-literal: the scan matched '%' plus an optional type
                    // letter (group 1); the next character is the delimiter
                    $delimiter = $this->get();
                    $m1 = $this->match_group(1);
                    if ($m1 === 'Q' || $m1 === 'r' || $m1 === 'W' || $m1 === 'x') {
                        $interpolation = true;
                    }
                    if ($m1 === 'w' || $m1 === 'W') {
                        $split = true;
                    }
                    if ($m1 === 'x') {
                        $type = 'FUNCTION';
                    } elseif ($m1 === 'r') {
                        $type = 'REGEX';
                    }
                    $fancy_delim = true;
                    // the opener is recorded right away, so the literal's
                    // start position is taken after it
                    $this->record($this->match() . $delimiter, 'DELIMITER');
                    $pos = $this->pos();
                }
            }
            // layout: type, opening delimiter, result of balance_delimiter()
            // for it, start position, interpolation flag, fancy-delimiter
            // flag, split flag
            $data = array($type, $delimiter, LuminousUtils::balance_delimiter($delimiter), $pos, $interpolation, $fancy_delim, $split);
            $this->do_string($data);
        } elseif ((ctype_alpha($c) || $c === '_') && ($m = $this->scan('/[_a-zA-Z]\\w*[!?]?/')) !== null) {
            // identifiers starting with an upper-case letter are constants
            $this->record($m, ctype_upper($m[0]) ? 'CONSTANT' : 'IDENT');
            if ($m === '__END__') {
                // outside interpolation the remainder of the input is
                // recorded untagged and scanning is terminated; either way
                // the loop ends here
                if (!$this->interpolation) {
                    $this->record($this->rest(), null);
                    $this->terminate();
                }
                break;
            }
        } elseif ($this->scan($this->operator_regex)) {
            $this->record($this->match(), 'OPERATOR');
        } elseif ($this->scan("/[ \t]+/")) {
            $this->record($this->match(), null);
        } else {
            // nothing matched: record a single character untagged
            $this->record($this->get(), null);
        }
    }
    // In case not everything was popped
    // NOTE(review): state_ entries appear to share the $data layout built
    // above (index 0 = type, index 3 = start position) - confirm in do_string()
    if (isset($this->state_[0])) {
        $this->record(substr($this->string(), $this->state_[0][3], $this->pos() - $this->state_[0][3]), $this->state_[0][0]);
        $this->terminate();
    }
}