<?php /** * Runs tests against the PHP parser. */ require_once getenv('MW_INSTALL_PATH') !== false ? getenv('MW_INSTALL_PATH') . "/maintenance/commandLine.inc" : __DIR__ . '/../../../maintenance/commandLine.inc'; $tester = new AbuseFilterParser(); $test_path = __DIR__ . "/parserTests"; $tests = glob($test_path . "/*.t"); $check = 0; $pass = 0; foreach ($tests as $test) { $result = substr($test, 0, -2) . ".r"; $rule = trim(file_get_contents($test)); $output = trim(file_get_contents($result)) == 'MATCH'; $testname = basename($test); print "Trying test {$testname}...\n"; try { $check++; $actual = intval($tester->parse($rule)); if ($actual == $output) { print "-PASSED.\n"; $pass++; } else { print "-FAILED - expected output {$output}, actual output {$actual}.\n"; print "-Expression: {$rule}\n"; // export $vars = var_export($tester->mTokens, true); file_put_contents($test . '.parsed', $vars); } } catch (AFPException $excep) {
/**
 * Read the next token from $code, starting at byte offset $offset.
 *
 * Leading whitespace and block comments are skipped. The scanner then tries,
 * in order: commas, parentheses, square brackets, semicolons, quoted strings,
 * operators (self::$mOps), bare numbers (optionally suffixed with a base
 * character: b = binary, o = octal, x = hex), and finally identifiers or
 * keywords (self::$mKeywords).
 *
 * @param string $code The complete source text being tokenized
 * @param int $offset Byte offset in $code at which to start scanning
 * @return array array( token value, AFPToken::T* type, $code, byte offset just past the token ).
 *  At end of input the value is '' and the type is AFPToken::TNone.
 * @throws AFPException If called with the same ($code, $offset) pair as the
 *  immediately preceding call (guard against a caller that makes no progress)
 * @throws AFPUserVisibleException 'unclosedstring' for an unterminated string literal;
 *  'unrecognisedtoken' for an unterminated comment or any input no rule matches
 */
static function nextToken($code, $offset) {
    $tok = '';

    // Check for infinite loops: two consecutive calls with identical
    // arguments mean the caller is not advancing through the input.
    if (self::$lastHandledToken == array($code, $offset)) {
        // Should never happen
        throw new AFPException("Entered infinite loop. Offset {$offset} of {$code}");
    }
    self::$lastHandledToken = array($code, $offset);

    // Spaces
    $matches = array();
    if (preg_match('/\\s+/uA', $code, $matches, 0, $offset)) {
        $offset += strlen($matches[0]);
    }

    $codeLen = strlen($code);
    if ($offset >= $codeLen) {
        return array('', AFPToken::TNone, $code, $offset);
    }

    // Comments
    if (substr($code, $offset, 2) == '/*') {
        $end = strpos($code, '*/', $offset);
        if ($end === false) {
            // strpos() returns false when there is no terminator. Without this
            // check, false + 2 evaluates to 2 and scanning would silently
            // restart from byte 2 of the input.
            throw new AFPUserVisibleException(
                'unrecognisedtoken',
                $offset,
                array(substr($code, $offset))
            );
        }
        // Continue with whatever follows the comment.
        return self::nextToken($code, $end + 2);
    }

    // Commas
    if ($code[$offset] == ',') {
        return array(',', AFPToken::TComma, $code, $offset + 1);
    }

    // Braces
    if ($code[$offset] == '(' || $code[$offset] == ')') {
        return array($code[$offset], AFPToken::TBrace, $code, $offset + 1);
    }

    // Square brackets
    if ($code[$offset] == '[' || $code[$offset] == ']') {
        return array($code[$offset], AFPToken::TSquareBracket, $code, $offset + 1);
    }

    // Semicolons
    if ($code[$offset] == ';') {
        return array(';', AFPToken::TStatementSeparator, $code, $offset + 1);
    }

    // Strings
    if ($code[$offset] == '"' || $code[$offset] == "'") {
        // The opening quote character; the same character closes the string.
        $type = $code[$offset];
        $offset++;
        while ($offset < $codeLen) {
            if ($code[$offset] == $type) {
                $offset++;
                return array($tok, AFPToken::TString, $code, $offset);
            }

            // Performance: Use a PHP function (implemented in C)
            // to scan ahead to the next quote or backslash.
            $addLength = strcspn($code, $type . "\\", $offset);
            if ($addLength) {
                $tok .= substr($code, $offset, $addLength);
                $offset += $addLength;
                continue;
            }

            // strcspn() returned 0 and the current byte is not the closing
            // quote (checked above), so it must be a backslash.
            if ($offset + 1 >= $codeLen) {
                // Trailing backslash with nothing after it: the string cannot
                // be terminated. Avoid reading past the end of $code.
                $offset = $codeLen;
                break;
            }

            switch ($code[$offset + 1]) {
                case '\\':
                    $tok .= '\\';
                    break;
                case $type:
                    $tok .= $type;
                    break;
                case 'n':
                    $tok .= "\n";
                    break;
                case 'r':
                    $tok .= "\r";
                    break;
                case 't':
                    $tok .= "\t";
                    break;
                case 'x':
                    $chr = substr($code, $offset + 2, 2);
                    if (preg_match('/^[0-9A-Fa-f]{2}$/', $chr)) {
                        $tok .= chr(hexdec($chr));
                        $offset += 2; # \xXX -- the remaining 2 bytes are skipped below
                    } else {
                        // Not a valid \xXX escape: keep a literal 'x'.
                        $tok .= 'x';
                    }
                    break;
                default:
                    // Unknown escape: keep the backslash and the character.
                    $tok .= "\\" . $code[$offset + 1];
            }
            // Skip the backslash and the escape character.
            $offset += 2;
        }
        throw new AFPUserVisibleException('unclosedstring', $offset, array());
    }

    // Find operators
    // The regex is built once from self::$mOps and cached for later calls.
    static $operator_regex = null;
    // Match using a regex. Regexes are faster than PHP
    if (!$operator_regex) {
        $quoted_operators = array();
        foreach (self::$mOps as $op) {
            $quoted_operators[] = preg_quote($op, '/');
        }
        $operator_regex = '/(' . implode('|', $quoted_operators) . ')/A';
    }

    $matches = array();
    preg_match($operator_regex, $code, $matches, 0, $offset);
    if (count($matches)) {
        $tok = $matches[0];
        $offset += strlen($tok);
        return array($tok, AFPToken::TOp, $code, $offset);
    }

    // Find bare numbers
    $bases = array('b' => 2, 'x' => 16, 'o' => 8);
    // Digits permitted in each base; octal digits are 0-7.
    $baseChars = array(2 => '[01]', 16 => '[0-9A-Fa-f]', 8 => '[0-7]', 10 => '[0-9.]');
    $baseClass = '[' . implode('', array_keys($bases)) . ']';
    $radixRegex = "/([0-9A-Fa-f]+(?:\\.\\d*)?|\\.\\d+)({$baseClass})?/Au";
    $matches = array();
    if (preg_match($radixRegex, $code, $matches, 0, $offset)) {
        $input = $matches[1];
        // The optional base-suffix group is absent from $matches when unmatched.
        $baseChar = isset($matches[2]) ? $matches[2] : '';

        // Sometimes the base char gets mixed in with the rest of it because
        // the regex targets hex, too.
        // This mostly happens with binary
        if (!$baseChar && !empty($bases[substr($input, -1)])) {
            $baseChar = substr($input, -1, 1);
            $input = substr($input, 0, -1);
        }

        $base = $baseChar ? $bases[$baseChar] : 10;

        // Check against the appropriate character class for input validation
        $baseRegex = "/^" . $baseChars[$base] . "+\$/";
        if (preg_match($baseRegex, $input)) {
            $num = ($base != 10) ? base_convert($input, $base, 10) : $input;
            $offset += strlen($matches[0]);

            // A decimal point makes the literal a float.
            $float = strpos($input, '.') !== false;
            return array(
                $float ? doubleval($num) : intval($num),
                $float ? AFPToken::TFloat : AFPToken::TInt,
                $code,
                $offset
            );
        }
    }

    // The rest are considered IDs
    // Regex match > PHP
    $idSymbolRegex = '/[0-9A-Za-z_]+/A';
    $matches = array();
    if (preg_match($idSymbolRegex, $code, $matches, 0, $offset)) {
        $tok = $matches[0];
        $type = in_array($tok, self::$mKeywords, true)
            ? AFPToken::TKeyword
            : AFPToken::TID;
        return array($tok, $type, $code, $offset + strlen($tok));
    }

    throw new AFPUserVisibleException(
        'unrecognisedtoken',
        $offset,
        array(substr($code, $offset))
    );
}