# Subtraction as a named two-argument function, so that it can be handed
# around by name (as the string "minus") wherever a callback is expected.
function minus($minuend, $subtrahend) {
    $difference = $minuend - $subtrahend;
    return $difference;
}

# N -> number
# The pattern accepts either a lone "0" or a run of digits starting with 1-9;
# the callback converts the matched text to an int.
$N = new RegexParser(
    "#^(0|[1-9][0-9]*)#",
    function ($digits) {
        return intval($digits);
    }
);

# P -> "-" N
# The callback ignores the "-" and keeps only the number that follows it.
$P = new ConcParser(
    array(new StringParser("-"), $N),
    function ($sign, $number) {
        return $number;
    }
);

# Naive left-recursive grammar looks like this and raises an exception
# when instantiated.
try {
    # S -> N
    # S -> S P
    $grammar = new Grammar(
        "S",
        array(
            "S" => new LazyAltParser(
                array(
                    "N",
                    new ConcParser(array("S", "P"), "minus")
                )
            ),
            "P" => $P,
            "N" => $N
        )
    );

    # Reaching this line means no exception was raised.
    var_dump(false);
} catch (GrammarException $e) {
    # Left-recursive in S
    var_dump(true);
}

# Fix the grammar like so:
# S -> N P*
$grammar = new Grammar(
    "S",
    array(
        "S" => new ConcParser(
            array($N, new GreedyStarParser("P")),
            function ($first, $rest) {
                # clever bit: fold every following number onto the first one
                # with minus(), working left to right
                return array_reduce($rest, "minus", $first);
            }
        ),
        "P" => $P,
        "N" => $N
    )
);

var_dump($grammar->parse("5-4-3") === -2); # true
foreach ($this->mAction as $id => $row) { $s .= "\t{$id} => array(\n"; foreach ($row as $t => $action) { if ($action[0] == 'shift') { $s .= "\t\t'{$t}' => array( 0, {$action[1]} ),\n"; } if ($action[0] == 'reduce') { $s .= "\t\t'{$t}' => array( 1, {$action[1]} ),\n"; } if ($action[0] == 'accept') { $s .= "\t\t'{$t}' => array( 2, null ),\n"; } } $s .= "\t),\n"; } $s .= ");\n\n"; $s .= "static \$goto = array(\n"; foreach ($this->mGoto as $id => $row) { $body = $this->formatAssocArray($row); $s .= "\t{$id} => {$body},\n"; } $s .= ");\n\n"; $s .= "}\n"; return $s; } } $definition = file_get_contents(dirname(__FILE__) . '/syntax.txt'); $grammar = Grammar::parse($definition); $grammar->buildLRTable(); file_put_contents('LRTableBuildReport.html', $grammar->buildHTMLDump()); file_put_contents('LRTable.php', $grammar->buildPHPFile());
return "\f"; }), "ESCAPED_N" => new StringParser("\\n", function ($string) { return "\n"; }), "ESCAPED_R" => new StringParser("\\r", function ($string) { return "\r"; }), "ESCAPED_T" => new StringParser("\\t", function ($string) { return "\t"; }), "ESCAPED_UTF8" => new RegexParser("#^\\\\u[0-9a-fA-F]{4}#", function ($match) { return Utf8Parser::getBytes(hexdec(substr($match, 2, 4))); }))); // if executing this file directly, run unit tests if (__FILE__ !== $_SERVER["SCRIPT_FILENAME"]) { return; } $start = microtime(true); $parseTree = $jsonGrammar->parse(" { \"string\" : true, \"\\\"\" : false, \"\\u9874asdh\" : [ null, { }, -9488.44E+093 ] } "); print "Parsing completed in " . (microtime(true) - $start) . " seconds\n"; var_dump(true); // for successful parsing // print_r($parseTree); var_dump(count($parseTree) === 3); var_dump($parseTree["string"] === true); var_dump($parseTree["\""] === false); var_dump($parseTree["顴asdh"] === array(null, array(), -9.488439999999999E+96)); print "2\n"; // failure modes foreach (array("{ \"string ", "{ \"\\UAAAA\" ", "{ \"\\u000i\" ", "{ \"a\" : tru ", "{ \"a\" : +9 ", "{ \"a\" : 9. ", "{ \"a\" : 0a8.52 ", "{ \"a\" : 8E ", "{ \"a\" : 08 ", "[ \"a\" , 8 ]", " \"a\" ", "{\"\" :7}", "{\"\":7}", "{\"\n\" :7}", "{\"\r\" :7}", "{\"\t\" :7}") as $string) { try { $jsonGrammar->parse($string); var_dump(false); } catch (Exception $e) {
var_dump($parser->match("fff", 0) === array("j" => 2, "value" => array("f", "f"))); $parser = new GreedyMultiParser(new StringParser("f"), 1, null); try { $parser->match("", 0); var_dump(false); } catch (ParseFailureException $e) { var_dump(true); } var_dump($parser->match("f", 0) === array("j" => 1, "value" => array("f"))); var_dump($parser->match("ff", 0) === array("j" => 2, "value" => array("f", "f"))); var_dump($parser->match("fff", 0) === array("j" => 3, "value" => array("f", "f", "f"))); var_dump($parser->match("ffg", 0) === array("j" => 2, "value" => array("f", "f"))); print "11\n"; $grammar = new Grammar("<A>", array("<A>" => new EmptyParser())); try { $grammar->parse("a"); var_dump(false); } catch (ParseFailureException $e) { var_dump(true); } var_dump($grammar->parse("") === null); print "12A\n"; try { $grammar = new Grammar("<S>", array("<S>" => new GreedyMultiParser("<A>", 7, null), "<A>" => new EmptyParser())); var_dump(false); } catch (GrammarException $e) { var_dump(true); } try { $grammar = new Grammar("<S>", array("<S>" => new GreedyStarParser("<A>"), "<A>" => new GreedyStarParser("<B>"), "<B>" => new EmptyParser())); var_dump(false);
}), new StringParser("\""), "whitespace"), function ($quote1, $chars, $quote2, $whitespace) { return new StringParser($chars); }), "digit" => new RegexParser("#^[0-9]#"), "letter" => new RegexParser("#^[a-zA-Z]#"), "character" => new RegexParser("#^([^\"]|\"\")#", function ($match0) { if ($match0 === "\"\"") { return "\""; } return $match0; }), "whitespace" => new RegexParser("#^[ \n\r\t]*#")), function ($syntax) { $parsers = array(); foreach ($syntax as $production) { if (count($parsers) === 0) { $top = $production["identifier"]; } $parsers[$production["identifier"]] = $production["expression"]; } if (count($parsers) === 0) { throw new Exception("No rules."); } return new Grammar($top, $parsers); }); // if executing this file directly, run unit tests if (__FILE__ !== $_SERVER["SCRIPT_FILENAME"]) { return; } // This is the syntax for Wirth syntax notation except it lacks whitespace $string = "\n\t\tSYNTAX = { PRODUCTION } .\n\t\tPRODUCTION = IDENTIFIER \"=\" EXPRESSION \".\" .\n\t\tEXPRESSION = TERM { \"|\" TERM } .\n\t\tTERM = FACTOR { FACTOR } .\n\t\tFACTOR = IDENTIFIER\n\t\t\t\t\t\t\t | LITERAL\n\t\t\t\t\t\t\t | \"[\" EXPRESSION \"]\"\n\t\t\t\t\t\t\t | \"(\" EXPRESSION \")\"\n\t\t\t\t\t\t\t | \"{\" EXPRESSION \"}\" .\n\t\tIDENTIFIER = letter { letter } .\n\t\tLITERAL = \"\"\"\" character { character } \"\"\"\" .\n\t\tdigit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n\t\tupper = \"A\" | \"B\" | \"C\" | \"D\" | \"E\" | \"F\" | \"G\" | \"H\" | \"I\" | \"J\" \n\t\t | \"K\" | \"L\" | \"M\" | \"N\" | \"O\" | \"P\" | \"Q\" | \"R\" | \"S\" | \"T\" \n\t\t | \"U\" | \"V\" | \"W\" | \"X\" | \"Y\" | \"Z\" .\n\t\tlower = \"a\" | \"b\" | \"c\" | \"d\" | \"e\" | \"f\" | \"g\" | \"h\" | \"i\" | \"j\" \n\t\t | \"k\" | \"l\" | \"m\" | \"n\" | \"o\" | \"p\" | \"q\" | \"r\" | \"s\" | \"t\" \n\t\t | \"u\" | \"v\" | \"w\" | \"x\" | \"y\" | \"z\" .\n\t\tletter = upper | lower .\n\t\tcharacter = letter | digit | \"=\" | \".\" | \"\"\"\"\"\" 
.\n\t"; $wirthGrammar->parse($string)->parse("SYNTAX={PRODUCTION}."); var_dump(true); # for a successful parse ?>
} $parsers[$rule["rule-name"]] = $rule["expression"]; } if (count($parsers) === 0) { throw new Exception("No rules."); } return new Grammar($top, $parsers); }); // if executing this file directly, run unit tests if (__FILE__ !== $_SERVER["SCRIPT_FILENAME"]) { return; } // Full rule set $string = "\n\t<postal-address> ::= <name-part> <street-address> <zip-part>\n\t<name-part> ::= <personal-part> <name-part> | <personal-part> <last-name> <opt-jr-part> <EOL>\n\t<personal-part> ::= <initial> \".\" | <first-name>\n\t<street-address> ::= <house-num> <street-name> <opt-apt-num> <EOL>\n\t<zip-part> ::= <town-name> \",\" <state-code> <ZIP-code> <EOL>\n\t<opt-jr-part> ::= \"Sr.\" | \"Jr.\" | <roman-numeral> | \"\"\n\n\t<last-name> ::= 'MacLaurin '\n\t<EOL> ::= '\n'\n\t<initial> ::= 'b'\n\t<first-name> ::= 'Steve '\n\t<house-num> ::= '173 '\n\t<street-name> ::= 'Acacia Avenue '\n\t<opt-apt-num> ::= '7A'\n\t<town-name> ::= 'Stevenage'\n\t<state-code> ::= ' KY '\n\t<ZIP-code> ::= '33445'\n\t<roman-numeral> ::= 'g'\n"; $start = microtime(true); $grammar2 = $bnfGrammar->parse($string); print "Parsing completed in " . (microtime(true) - $start) . " seconds\n"; $start = microtime(true); $grammar2->parse("Steve MacLaurin \n173 Acacia Avenue 7A\nStevenage, KY 33445\n"); print "Parsing completed in " . (microtime(true) - $start) . 
" seconds\n"; $string = "\n\t<syntax> ::= <rule> | <rule> <syntax>\n\t<rule> ::= <opt-whitespace> \"<\" <rule-name> \">\" <opt-whitespace> \"::=\" <opt-whitespace> <expression> <line-end>\n\t<opt-whitespace> ::= \" \" <opt-whitespace> | \"\"\n\t<expression> ::= <list> | <list> \"|\" <expression>\n\t<line-end> ::= <opt-whitespace> <EOL> <line-end> | <opt-whitespace> <EOL>\n\t<list> ::= <term> | <term> <opt-whitespace> <list>\n\t<term> ::= <literal> | \"<\" <rule-name> \">\"\n\t<literal> ::= '\"' <text> '\"' | \"'\" <text> \"'\"\n\t\n\t<rule-name> ::= 'a'\n\t<EOL> ::= '\n'\n\t<text> ::= 'b'\n"; $start = microtime(true); $grammar3 = $bnfGrammar->parse($string); print "Parsing completed in " . (microtime(true) - $start) . " seconds\n"; $start = microtime(true); $grammar3->parse(" <a> ::= 'b' \n"); print "Parsing completed in " . (microtime(true) - $start) . " seconds\n"; // Should raise a ParseFailureException before trying to instantiate a Grammar $string = " <incomplete ::="; try { $bnfGrammar->parse($string);
} $parsers[$rule["name"]] = $rule["lazyaltparser"]; } return new Grammar($top, $parsers); }); // if executing this file directly, run unit tests if (__FILE__ !== $_SERVER["SCRIPT_FILENAME"]) { return; } // parentheses inside your BNF *always* force an array to exist in the output // *, +, ? and {m,n} are not disguised parentheses; they expand into the main expression // in the absence of a function to call, an array is is built instead print "0A\n"; // basic // array("a") or new S("a") $grammar2 = $locoGrammar->parse(" S ::= 'a' "); var_dump($grammar2->parse("a") === array("a")); // concatenation // array("a", "b") or new S("a", "b") $grammar2 = $locoGrammar->parse(" S ::= 'a' 'b' "); var_dump($grammar2->parse("ab") === array("a", "b")); // alternation // array("a") or array("b") or new S("a") or new S("b") $grammar2 = $locoGrammar->parse(" S ::= 'a' | 'b' "); var_dump($grammar2->parse("a") === array("a")); var_dump($grammar2->parse("b") === array("b")); // alternation 2 // array("a") or array("b", "c") or new S("a") or new S("b", "c") $grammar2 = $locoGrammar->parse(" S ::= 'a' | 'b' 'c' "); var_dump($grammar2->parse("a") === array("a")); var_dump($grammar2->parse("bc") === array("b", "c"));
if (count($parsers) === 0) { $top = $rule["rule-name"]; } $parsers[$rule["rule-name"]] = $rule["expression"]; } if (count($parsers) === 0) { throw new Exception("No rules."); } return new Grammar($top, $parsers); }); // if executing this file directly, run unit tests if (__FILE__ !== $_SERVER["SCRIPT_FILENAME"]) { return; } $string = "a = 'PROGRAM' ;"; $ebnfGrammar->parse($string)->parse("PROGRAM"); var_dump(true); // Should raise a ParseFailureException before trying to instantiate a Grammar // with no rules and raising a GrammarException $string = "a = 'PROGRAM ;"; try { $ebnfGrammar->parse($string); var_dump(false); } catch (ParseFailureException $e) { var_dump(true); } // Full rule set $string = "\n\t\t(* a simple program syntax in EBNF - Wikipedia *)\n\t\tprogram = 'PROGRAM' , white space , identifier , white space ,\n\t\t\t\t\t\t\t 'BEGIN' , white space ,\n\t\t\t\t\t\t\t { assignment , \";\" , white space } ,\n\t\t\t\t\t\t\t 'END.' ;\n\t\tidentifier = alphabetic character , { alphabetic character | digit } ;\n\t\tnumber = [ \"-\" ] , digit , { digit } ;\n\t\tstring = '\"' , { all characters } , '\"' ;\n\t\tassignment = identifier , \":=\" , ( number | identifier | string ) ;\n\t\talphabetic character = \"A\" | \"B\" | \"C\" | \"D\" | \"E\" | \"F\" | \"G\"\n\t\t\t\t\t\t\t\t\t\t\t\t | \"H\" | \"I\" | \"J\" | \"K\" | \"L\" | \"M\" | \"N\"\n\t\t\t\t\t\t\t\t\t\t\t\t | \"O\" | \"P\" | \"Q\" | \"R\" | \"S\" | \"T\" | \"U\"\n\t\t\t\t\t\t\t\t\t\t\t\t | \"V\" | \"W\" | \"X\" | \"Y\" | \"Z\" ;\n\t\tdigit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" ;\n\t\twhite space = ( \" \" | \"\n\" ) , { \" \" | \"\n\" } ;\n\t\tall characters = \"H\" | \"e\" | \"l\" | \"o\" | \" \" | \"w\" | \"r\" | \"d\" | \"!\" ;\n\t"; $pascalGrammar = $ebnfGrammar->parse($string); var_dump(true); $string = "PROGRAM DEMO1\n" . "BEGIN\n" . " A0:=3;\n" . " B:=45;\n" . " H:=-100023;\n" . " C:=A;\n" . " D123:=B34A;\n" . " BABOON:=GIRAFFE;\n" . 
" TEXT:=\"Hello world!\";\n" . "END.";
# This code is in the public domain.
# http://qntm.org/loco

# Grammar for a small HTML-like comment format. The start rule is <comment>;
# every callback below glues the pieces it receives back into one string.
$simpleCommentGrammar = new Grammar(
    "<comment>",
    array(
        # Any number of blocks and whitespace runs, joined back together.
        "<comment>" => new GreedyStarParser(
            "<blockorwhitespace>",
            function () {
                $pieces = func_get_args();
                return implode("", $pieces);
            }
        ),

        "<blockorwhitespace>" => new LazyAltParser(
            array("<h5>", "<p>", "WHITESPACE")
        ),

        # Paired tags: opening tag, inner text, closing tag.
        "<p>" => new ConcParser(
            array("OPEN_P", "<text>", "CLOSE_P"),
            function ($open, $body, $close) {
                return "{$open}{$body}{$close}";
            }
        ),

        "<h5>" => new ConcParser(
            array("OPEN_H5", "<text>", "CLOSE_H5"),
            function ($open, $body, $close) {
                return "{$open}{$body}{$close}";
            }
        ),

        "<strong>" => new ConcParser(
            array("OPEN_STRONG", "<text>", "CLOSE_STRONG"),
            function ($open, $body, $close) {
                return "{$open}{$body}{$close}";
            }
        ),

        "<em>" => new ConcParser(
            array("OPEN_EM", "<text>", "CLOSE_EM"),
            function ($open, $body, $close) {
                return "{$open}{$body}{$close}";
            }
        ),

        # Inline content: any number of atoms, joined back together.
        "<text>" => new GreedyStarParser(
            "<atom>",
            function () {
                $pieces = func_get_args();
                return implode("", $pieces);
            }
        ),

        "<atom>" => new LazyAltParser(
            array("<char>", "<strong>", "<em>", "FULL_BR")
        ),

        "<char>" => new LazyAltParser(
            array("UTF8_EXCEPT", "GREATER_THAN", "LESS_THAN", "AMPERSAND")
        ),

        # Terminals. Tag patterns allow whitespace before the closing ">".
        "WHITESPACE"   => new RegexParser("#^[ \n\r\t]+#"),
        "OPEN_P"       => new RegexParser("#^<p[ \n\r\t]*>#"),
        "CLOSE_P"      => new RegexParser("#^</p[ \n\r\t]*>#"),
        "OPEN_H5"      => new RegexParser("#^<h5[ \n\r\t]*>#"),
        "CLOSE_H5"     => new RegexParser("#^</h5[ \n\r\t]*>#"),
        "OPEN_EM"      => new RegexParser("#^<em[ \n\r\t]*>#"),
        "CLOSE_EM"     => new RegexParser("#^</em[ \n\r\t]*>#"),
        "OPEN_STRONG"  => new RegexParser("#^<strong[ \n\r\t]*>#"),
        "CLOSE_STRONG" => new RegexParser("#^</strong[ \n\r\t]*>#"),
        "FULL_BR"      => new RegexParser("#^<br[ \n\r\t]*/>#"),
        "UTF8_EXCEPT"  => new Utf8Parser(array("<", ">", "&")),
        "GREATER_THAN" => new StringParser(">"),
        "LESS_THAN"    => new StringParser("<"),
        "AMPERSAND"    => new StringParser("&")
    )
);

// if executing this file directly, run unit tests
if ($_SERVER["SCRIPT_FILENAME"] !== __FILE__) {
    return;
}
// Round trip: the parsed result of a valid comment must equal the input text.
$start = microtime(true);
$string = $simpleCommentGrammar->parse(
    "<h5> Title<br /><em\n><strong\n></strong>&</em></h5> \r\n\t <p ><</p >"
);
echo "Parsing completed in ", (microtime(true) - $start), " seconds\n";

var_dump(
    $string === "<h5> Title<br /><em\n><strong\n></strong>&</em></h5> \r\n\t <p ><</p >"
);

// Failure modes: each of these inputs must make parse() throw.
// Prints true when an exception is caught and false when parsing succeeds.
foreach (
    array(
        "<h5 style=\"\">",
        "&",
        "<",
        "salkhsfg>",
        "</p",
        "<br"
    ) as $string
) {
    try {
        $simpleCommentGrammar->parse($string);
    } catch (Exception $e) {
        var_dump(true);
        continue;
    }

    var_dump(false);
}
} public function __toString() { return implode("", $this->mults); } }

// A Pattern is an alternation between several "Concs".
// This is the top-level Pattern object returned by the lexer.
class Pattern {
    // The alternatives, in the order they were supplied.
    public $concs;

    // Accepts a collection of Conc objects; throws an Exception naming the
    // first element that is not a Conc.
    public function __construct($concs) {
        foreach ($concs as $candidate) {
            if ($candidate instanceof Conc) {
                continue;
            }

            throw new Exception("Not a Conc: " . var_export($candidate, true));
        }

        $this->concs = $concs;
    }

    // Renders the alternatives separated by "|".
    public function __toString() {
        return join("|", $this->concs);
    }
}

// apologies for the relative lack of exhaustive unit tests
// Smoke test: parse each expression, print the resulting pattern, then dump
// true to mark that parsing did not throw.
foreach (
    array(
        "a{2}",
        "a{2,}",
        "a{2,8}",
        "[\$%\\^]{2,8}",
        "[ab]*",
        "([ab]*a)",
        "([ab]*a|[bc]*c)",
        "([ab]*a|[bc]*c)?",
        "([ab]*a|[bc]*c)?b*",
        "[a-zA-Z]",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
        "[a]",
        "[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]",
        "[|(){},?*+\\[\\]\\^.\\\\]",
        "[\\f\\n\\r\\t\\v\\-]",
        "\\|",
        "\\(\\)\\{\\},\\?\\*\\+\\[\\]^.-\\f\\n\\r\\t\\v\\w\\d\\s\\W\\D\\S\\\\",
        "abcdef",
        "19\\d\\d-\\d\\d-\\d\\d",
        "[\$%\\^]{2,}",
        "[\$%\\^]{2}",
        ""
    ) as $string
) {
    $pattern = $regexGrammar->parse($string);
    echo $pattern, "\n";
    var_dump(true);
}