/**
 * Receives a token and uses it to build the pattern structure.
 *
 * Depending on the token type a new pattern object is created and appended
 * to the container on top of the containers stack, a container is pushed on
 * or popped from that stack, or a property of the current container is set.
 * After the switch the token is always pushed onto the tokens stack, the
 * pending end anchor flag is cleared (unless the token is an end anchor) and
 * a character class range container is closed once the token that follows
 * the range identifier has been handled.
 *
 * @param Token $token Token
 *
 * @return void
 */
public function receiveToken(Token $token)
{
    switch ($token->getType()) {
        //Regex start delimiter
        case Token::TYPE_REGEX_START_DELIMITER:
            //Create the regex container if it does not exist
            if (!$this->_regexContainer) {
                $this->_regexContainer = new Pattern\Regex();
                $this->_containersStack->push($this->_regexContainer);
            }
            //Set the delimiter
            $this->_regexContainer->setDelimiter($token->getIdentifier());
            $this->_currentItem = null;
            break;

        //Regex end delimiter
        case Token::TYPE_REGEX_END_DELIMITER:
            //Anchor the regex if required
            if ($this->_pendingEndAnchor) {
                $this->_containersStack->top()->setEndAnchored(true);
            }
            break;

        //Regex modifiers
        case Token::TYPE_REGEX_MODIFIERS:
            //Set the modifiers
            $this->_regexContainer->setModifiers($token->getIdentifier());
            break;

        //Simple character
        case Token::TYPE_CHAR:
            //If the current item is already a char and the previous token
            //was a char too, append data to it
            if ($this->_currentItem
                && $this->_currentItem instanceof Pattern\Char
                && $this->_tokensStack->top()->getType() === Token::TYPE_CHAR
            ) {
                //The character is carried by the token identifier, exactly
                //as in the branch below that creates the first character
                $this->_currentItem->setChar(
                    $this->_currentItem->getChar() . $token->getIdentifier()
                );
            } else {
                //Otherwise create a simple character and add it to the
                //current container
                $this->_currentItem = new Pattern\Char($token->getIdentifier());
                $this->_containersStack->top()->addChild($this->_currentItem);
            }
            break;

        //Non-printing character identifier
        case Token::TYPE_NON_PRINTING_CHAR:
            //Create a non-printing character identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\NonPrintingChar($token->getIdentifier());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Generic character type identifier
        case Token::TYPE_GENERIC_CHAR_TYPE:
            //Create a generic character type identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\GenericCharType($token->getIdentifier());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Simple assertion identifier
        case Token::TYPE_SIMPLE_ASSERTION:
            //Create a simple assertion identifier and add it to the current
            //container
            $this->_currentItem = new Pattern\SimpleAssertion($token->getIdentifier());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Dot
        case Token::TYPE_DOT:
            //Create a dot and add it to the current container
            $this->_currentItem = new Pattern\Dot();
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Single byte identifier
        case Token::TYPE_BYTE:
            //Create a single byte identifier and add it to the current
            //container
            $this->_currentItem = new Pattern\Byte();
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Control character identifier
        case Token::TYPE_CONTROL_CHAR:
            //Create a control character identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\ControlChar($token->getSubject());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Extended unicode sequence identifier
        case Token::TYPE_EXT_UNICODE_SEQUENCE:
            //Create an extended unicode sequence identifier and add it to
            //the current container
            //NOTE(review): this builds a UnicodeCharClass from the token
            //identifier; a dedicated pattern class may have been intended
            //here - confirm against the Pattern namespace
            $this->_currentItem = new Pattern\UnicodeCharClass($token->getIdentifier());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Unicode character class identifier
        case Token::TYPE_UNICODE_CHAR_CLASS:
            //Create a unicode character class identifier and add it to
            //the current container. Braces around the class name are
            //stripped and an upper case "P" identifier marks a negation
            $this->_currentItem = new Pattern\UnicodeCharClass(
                rtrim(ltrim($token->getSubject(), "{"), "}"),
                $token->getIdentifier() === "P"
            );
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Hexadecimal character identifier
        case Token::TYPE_HEX_CHAR:
            //Create a hexadecimal character identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\HexChar($token->getSubject());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Octal character identifier
        case Token::TYPE_OCTAL_CHAR:
            //Create an octal character identifier and add it to the
            //current container. Octal tokens are emitted without a subject:
            //their digits are carried by the identifier
            $this->_currentItem = new Pattern\OctalChar($token->getIdentifier());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Back reference identifier
        case Token::TYPE_BACK_REFERENCE:
            //Create a back reference identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\BackReference($token->getSubject());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Recursive pattern identifier
        case Token::TYPE_RECURSIVE_PATTERN:
            //Create a recursive pattern identifier and add it to the
            //current container
            $this->_currentItem = new Pattern\RecursivePattern($token->getSubject());
            $this->_containersStack->top()->addChild($this->_currentItem);
            break;

        //Alternation identifier
        case Token::TYPE_ALTERNATION:
            if ($this->_containersStack->top() instanceof Pattern\Alternation) {
                //If already inside an alternation group, create a new
                //alternation
                $currentAlternation = $this->_containersStack->pop();
                $newAlternation = new Pattern\Alternation();
                $currentAlternation->getParent()->addChild($newAlternation);
                //Anchor the current alternation if required
                if ($this->_pendingEndAnchor) {
                    $currentAlternation->setEndAnchored(true);
                }
            } elseif ($this->_containersStack->top() instanceof Pattern\ConditionalThen) {
                //Remove the "then" part and add the "else" part
                $this->_containersStack->pop();
                $newAlternation = new Pattern\ConditionalElse();
                $this->_containersStack->top()->addChild($newAlternation);
            } else {
                //Create a new alternation and move all the children from
                //the current container to the new alternation
                $currentContainer = $this->_containersStack->top();
                $children = $currentContainer->getChildren();
                //Create the alternation group structure
                $alternationGroup = new Pattern\AlternationGroup();
                $alternation = new Pattern\Alternation();
                $alternation->addChildren($children);
                $alternationGroup->addChild($alternation);
                $currentContainer->addChild($alternationGroup);
                $newAlternation = new Pattern\Alternation();
                $alternationGroup->addChild($newAlternation);
                //Move the start anchor from the container to the
                //alternation that contains its children
                if ($currentContainer->getStartAnchored()) {
                    $currentContainer->setStartAnchored(false);
                    $alternation->setStartAnchored(true);
                }
            }
            //The new branch becomes the current container
            $this->_containersStack->push($newAlternation);
            $this->_currentItem = null;
            break;

        //Subpattern start character
        case Token::TYPE_SUBPATTERN_START:
            //Create a new subpattern and add it to the container stack
            $subPattern = new Pattern\SubPattern();
            $this->_containersStack->top()->addChild($subPattern);
            $this->_containersStack->push($subPattern);
            $this->_currentItem = null;
            break;

        //Subpattern end character
        case Token::TYPE_SUBPATTERN_END:
            //Anchor the container if required
            if ($this->_pendingEndAnchor) {
                $this->_containersStack->top()->setEndAnchored(true);
            }
            //If the current container is an alternation remove it first
            if ($this->_containersStack->top() instanceof Pattern\Alternation) {
                $this->_containersStack->pop();
            }
            //Remove the subpattern from the container stack and make it
            //the current item
            $this->_currentItem = $this->_containersStack->pop();
            //If inside a conditional subpattern
            if ($this->_containersStack->top() instanceof Pattern\ConditionalSubPattern) {
                //If the pattern was an assertion
                if ($this->_currentItem instanceof Pattern\Assertion) {
                    //Add the "then" part
                    $then = new Pattern\ConditionalThen();
                    $this->_containersStack->top()->addChild($then);
                    $this->_containersStack->push($then);
                    $this->_currentItem = null;
                } else {
                    //Otherwise close the conditional subpattern itself and
                    //make it the current item
                    $this->_currentItem = $this->_containersStack->pop();
                }
            }
            break;

        //Subpattern non capturing flag and modifiers
        case Token::TYPE_SUBPATTERN_NON_CAPTURING:
            //Set the subpattern as non capturing and set its modifiers if
            //present
            $this->_containersStack->top()->setCapture(false);
            if ($token->getSubject()) {
                $this->_containersStack->top()->setModifiers($token->getSubject());
            }
            break;

        //Subpattern group matches identifier
        case Token::TYPE_SUBPATTERN_GROUP_MATCHES:
            //Enable subpattern group matches mode and make the subpattern
            //non capturing
            $this->_containersStack->top()->setGroupMatches(true)->setCapture(false);
            break;

        //Subpattern once only identifier
        case Token::TYPE_SUBPATTERN_ONCE_ONLY:
            //Enable once only mode and make the subpattern non capturing
            $this->_containersStack->top()->setOnceOnly(true)->setCapture(false);
            break;

        //Subpattern name
        case Token::TYPE_SUBPATTERN_NAME:
            //Set the subpattern name
            $this->_containersStack->top()->setName($token->getSubject());
            break;

        //Internal option identifier
        case Token::TYPE_INTERNAL_OPTION:
            //Create an internal option and add it to the current container
            $this->_containersStack->top()->addChild(
                new Pattern\InternalOption($token->getSubject())
            );
            $this->_currentItem = null;
            break;

        //Assertion identifier
        case Token::TYPE_LOOKAHEAD_ASSERTION:
        case Token::TYPE_LOOKBEHIND_ASSERTION:
            //The first argument is true when the identifier contains no "<"
            //and the second one is true when it contains a "!"
            $assertion = new Pattern\Assertion(
                strpos($token->getIdentifier(), "<") === false,
                strpos($token->getIdentifier(), "!") !== false
            );
            $this->_containersStack->top()->addChild($assertion);
            $this->_containersStack->push($assertion);
            $this->_currentItem = null;
            break;

        //Comment
        case Token::TYPE_COMMENT:
            //Create comment and add it to the current container
            $this->_containersStack->top()->addChild(
                new Pattern\Comment($token->getSubject())
            );
            break;

        //Repetition identifier
        case Token::TYPE_REPETITION:
            $this->_handleRepetition($token);
            break;

        //Start anchor identifier
        case Token::TYPE_START_ANCHOR:
            //Ignore if the container already contains children
            if (!$this->_containersStack->top()->hasChildren()) {
                $this->_containersStack->top()->setStartAnchored(true);
            }
            $this->_currentItem = null;
            break;

        //End anchor identifier
        case Token::TYPE_END_ANCHOR:
            //Set only the pending end anchor flag. It will be unset when
            //another token is emitted and it will be evaluated only when
            //a container is closed
            $this->_pendingEndAnchor = true;
            break;

        //Start char class identifier
        case Token::TYPE_CHAR_CLASS_START:
            //Create a new character class and add it to the container stack
            $charClass = new Pattern\CharClass();
            $this->_containersStack->top()->addChild($charClass);
            $this->_containersStack->push($charClass);
            $this->_currentItem = null;
            break;

        //Char class negation identifier
        case Token::TYPE_CHAR_CLASS_NEGATE:
            //Negate the current char class
            $this->_containersStack->top()->setNegate(true);
            break;

        //End char class identifier
        case Token::TYPE_CHAR_CLASS_END:
            //Remove the char class from the container stack and make it
            //the current item
            $this->_currentItem = $this->_containersStack->pop();
            break;

        //Posix char class identifier
        case Token::TYPE_POSIX_CHAR_CLASS:
            //A leading "^" in the class name marks a negated posix class:
            //strip it and remember the negation
            $negate = false;
            $class = $token->getSubject();
            if (strpos($class, "^") === 0) {
                $negate = true;
                $class = ltrim($class, "^");
            }
            //Create a posix char class and add it to the current container
            $this->_containersStack->top()->addChild(
                new Pattern\PosixCharClass($class, $negate)
            );
            break;

        //Char class range identifier
        case Token::TYPE_CHAR_CLASS_RANGE:
            //Create a new character class range and add it to the
            //container stack, move the last inserted item to the range
            $currentChildren = $this->_containersStack->top()->getChildren();
            $range = new Pattern\CharClassRange();
            $range->addChild($currentChildren[count($currentChildren) - 1]);
            $this->_containersStack->top()->addChild($range);
            $this->_containersStack->push($range);
            $this->_currentItem = null;
            break;

        //Conditional subpattern identifier
        case Token::TYPE_CONDITIONAL_SUBPATTERN:
            //Create a new conditional subpattern and add it to the
            //container stack
            $subPattern = new Pattern\ConditionalSubPattern();
            $this->_containersStack->top()->addChild($subPattern);
            $this->_containersStack->push($subPattern);
            $this->_currentItem = null;
            break;
    }

    //Push the token in the tokens stack
    $this->_tokensStack->push($token);

    //Unset the pending end anchor flag if the token is not an end anchor
    if ($token->getType() !== Token::TYPE_END_ANCHOR) {
        $this->_pendingEndAnchor = false;
    }

    //If the current container is a character class range and the current
    //token is not the char class range identifier, remove the char class
    //range from the container stack
    if ($this->_containersStack->top() instanceof Pattern\CharClassRange
        && $token->getType() !== Token::TYPE_CHAR_CLASS_RANGE
    ) {
        $this->_containersStack->pop();
    }
}
/**
 * Starts the tokenization process
 *
 * Strips delimiters and modifiers from the regex, then consumes it one
 * character at a time, emitting a token for every recognized construct.
 * Once the whole regex has been consumed the end delimiter token and, if
 * modifiers were specified, the modifiers token are emitted.
 *
 * @return void
 *
 * @throws Exception\Generic          On malformed escapes, invalid or
 *                                    non-existent references, unmatched
 *                                    parentheses, unclosed subpatterns or
 *                                    unclosed character classes
 * @throws Exception\InvalidDelimiter If the end delimiter appears unescaped
 *                                    inside the regex or the regex ends
 *                                    with a pending escape
 */
public function tokenize()
{
    //Since delimiters are the only exception to the normal regex syntax and
    //the tokenizer needs to know regex modifiers to handle some situations,
    //parse them immediately and strip them from the regex
    list($delimiter, $endDelimiter, $rModifiers) = $this->_stripDelimitersAndModifiers();
    //The unescaped end delimiter check is done only when start and end
    //delimiters are the same character
    $checkEndDelimiter = $delimiter === $endDelimiter;

    //Store regex length
    $this->_length = strlen($this->_regex);

    //Loop regex characters
    while (($char = $this->_consume()) !== null) {
        //If character is backslash and it's not escaped
        if ($char === "\\" && !$this->_escaped) {
            //Set escaped flag to true
            $this->_escaped = true;
            continue;
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === ".") {
            //Emit a dot token
            $this->_emitToken(Token::TYPE_DOT, $char);
        } elseif ($this->_escaped && Rules::validateGenericCharType($char)) {
            //Emit a generic character type token
            $this->_emitToken(Token::TYPE_GENERIC_CHAR_TYPE, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && Rules::validateSimpleAssertion($char)) {
            //Emit a simple assertion token
            $this->_emitToken(Token::TYPE_SIMPLE_ASSERTION, $char);
        } elseif ($this->_escaped && Rules::validateNonPrintingChar($char)) {
            //Emit a non-printing character token
            $this->_emitToken(Token::TYPE_NON_PRINTING_CHAR, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && $char === "X") {
            //Emit a extended unicode sequence token
            $this->_emitToken(Token::TYPE_EXT_UNICODE_SEQUENCE, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && $char === "C") {
            //Emit a single byte identifier token
            $this->_emitToken(Token::TYPE_BYTE, $char);
        } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "^" || $char === "\$")) {
            //Emit an anchor token
            $this->_emitToken(
                $char === "^" ? Token::TYPE_START_ANCHOR : Token::TYPE_END_ANCHOR,
                $char
            );
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "|") {
            //Emit an alternation identifier token
            $this->_emitToken(Token::TYPE_ALTERNATION, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && ($char === "p" || $char === "P")) {
            //Take the next character
            $nextChar = $this->_consume();
            //If there are no characters left throw an exception
            if ($nextChar === null) {
                throw new Exception\Generic("Unspecified character class form \\" . $char);
            } elseif ($nextChar !== "{") {
                //Single character form
                $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar);
            } else {
                //Find everything until the closing bracket
                $nextChars = $this->_consumeUntil("}", true);
                //If the closing bracket has not been found throw an
                //exception
                if ($nextChars === null) {
                    throw new Exception\Generic("Unclosed \\" . $char . " character class");
                } else {
                    $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar . $nextChars);
                }
            }
        } elseif ($this->_escaped && $char === "x") {
            $nextChar = $this->_consume();
            $tokenSubject = "";
            if ($nextChar === "{") {
                //Braced form: take everything until the closing brace
                $nextChars = $this->_consumeUntil("}", true);
                if ($nextChars === null) {
                    throw new Exception\Generic("Unclosed brace in hex char");
                }
                $tokenSubject = trim($nextChars, "}");
            } elseif ($nextChar !== null) {
                //Put the character back and scan it again in the loop below
                $this->_unconsume();
                //Find following hexadecimal digits
                for ($i = 0; $i < 2; $i++) {
                    $nextChar = $this->_consume();
                    if ($nextChar !== null && Rules::validateHexString($nextChar)) {
                        $tokenSubject .= $nextChar;
                    } else {
                        //Put back the non hexadecimal character, unless the
                        //end of the regex has been reached
                        $nextChar !== null && $this->_unconsume();
                        break;
                    }
                }
            }
            //Emit the hexadecimal character token
            $this->_emitToken(Token::TYPE_HEX_CHAR, $char, $tokenSubject);
        } elseif ($this->_escaped && $char === "c") {
            //Take the next character
            $nextChar = $this->_consumeIgnoreEscape();
            //If there are no characters left throw an exception
            if ($nextChar === null) {
                throw new Exception\Generic("Character not specified for control character");
            }
            //Otherwise emit the control character token
            $this->_emitToken(Token::TYPE_CONTROL_CHAR, $char, $nextChar);
        } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "*" || $char === "+" || $char === "?")) {
            //Emit a repetition token
            $this->_emitToken(Token::TYPE_REPETITION, $char);
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "{" && ($nextChars = $this->_consumeRegex("/^\\d+(?:,\\d*)?\\}/"))) {
            //Emit a repetition token
            $this->_emitToken(Token::TYPE_REPETITION, $char, rtrim($nextChars, "}"));
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "(") {
            $this->_handleSubpattern();
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === ")") {
            //Throw exception if there are no open subpatterns
            if (!$this->_openSubpatterns) {
                throw new Exception\Generic("Unmatched parenthesis");
            }
            //Emit a subpattern end token
            $this->_emitToken(Token::TYPE_SUBPATTERN_END, $char);
            $this->_openSubpatterns--;
            $this->_modifiersStack->pop();
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "[") {
            //Emit a char class start token
            $this->_emitToken(Token::TYPE_CHAR_CLASS_START, $char);
            $this->_inCharClass = true;
            //Consume next char
            $char = $this->_consume();
            //If the character is a char class negation
            if ($char === "^") {
                //Emit the char class negate token
                $this->_emitToken(Token::TYPE_CHAR_CLASS_NEGATE, $char);
                $char = $this->_consume();
            }
            //If the first char in a char class is a closed square bracket
            if ($char === "]") {
                //Emit the bracket as char token
                $this->_emitToken(Token::TYPE_CHAR, $char);
            } elseif ($char !== null) {
                //Put the character back so that the main loop handles it.
                //When the end of the regex has been reached there is
                //nothing to put back (same guard used for hexadecimal
                //digits above); the unclosed character class is reported
                //after the loop
                $this->_unconsume();
            }
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "]") {
            //Emit a char class end token
            $this->_emitToken(Token::TYPE_CHAR_CLASS_END, $char);
            $this->_inCharClass = false;
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "-" && in_array($this->_lastToken->getType(), $this->_allowedInCharClassRange)) {
            //Enable the after char class range mode
            $this->_afterCharClassRange = true;
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "[" && ($nextChars = $this->_consumeRegex("/^:\\^?[a-z]+:\\]/"))) {
            //Emit a posix char class token
            $subject = str_replace(array(":", "]"), "", $nextChars);
            $this->_emitToken(Token::TYPE_POSIX_CHAR_CLASS, $char . $nextChars, $subject);
        } elseif (!$this->_inCharClass && $this->_escaped && ($char === "g" || $char === "k")) {
            //It's a back reference. Check for the reference identifier
            if ($char === "g") {
                $testPattern = "(?|(\\d+)|\\{(-?\\d+|\\w+)\\})";
            } else {
                $testPattern = "(<\\w+>|'\\w+'|\\{\\w+\\})";
            }
            $nextChars = $this->_consumeRegex("/^{$testPattern}/", 1);
            if ($nextChars === null) {
                throw new Exception\Generic("Invalid backreference");
            }
            //For \k strip the surrounding delimiters from the name
            if ($char === "k") {
                $nextChars = substr($nextChars, 1, -1);
            }
            //Check reference validity
            if (!$this->_checkValidReference($nextChars)) {
                throw new Exception\Generic("Reference to non-existent subpattern '{$nextChars}'");
            }
            //Emit a backreference token
            $this->_emitToken(Token::TYPE_BACK_REFERENCE, $char, $nextChars);
        } elseif ($this->_escaped && is_numeric($char)) {
            //Char class does not handle back references so if the character
            //is not octal process the character again without the escape
            if ($this->_inCharClass && $char > 7) {
                $this->_unconsume();
                $this->_escaped = false;
                continue;
            }
            //If the character is a 0 consume up to 2 octal digits,
            //otherwise consume all the following digits
            if ($char === "0" || $this->_inCharClass) {
                $testPattern = "^[0-7]{1,2}";
            } else {
                $testPattern = "^\\d+";
            }
            //Consume following numbers
            $nextChars = $this->_consumeRegex("/^{$testPattern}/");
            if ($nextChars !== null) {
                $char .= $nextChars;
            }
            //If the first digit is 0 or its a valid octal number and there
            //are not enough back references
            $hasReference = $this->_checkValidReference($char);
            if ($char[0] === "0" || $this->_inCharClass || preg_match("/^[0-7]{2,3}\$/", $char) && !$hasReference) {
                $this->_emitToken(Token::TYPE_OCTAL_CHAR, $char);
            } elseif ($hasReference) {
                //Emit a backreference token
                $this->_emitToken(Token::TYPE_BACK_REFERENCE, "\\", $char);
            } else {
                throw new Exception\Generic("Reference to non-existent subpattern '{$char}'");
            }
        } elseif (!$this->_escaped && $checkEndDelimiter && $char === $endDelimiter) {
            //Throw an exception
            throw new Exception\InvalidDelimiter("Unescaped end delimiter '{$char}' inside regex");
        } else {
            //If the character is not escaped and the "x" modifier is active
            if (!$this->_escaped && strpos($this->_modifiersStack->top(), "x") !== false) {
                //If it is a "#"
                if ($char === "#") {
                    //Emit a comment token with everything up to the end of
                    //the line, or up to the end of the regex if there are
                    //no more lines
                    $nextChars = $this->_consumeUntil("\n");
                    if ($nextChars === null) {
                        $nextChars = $this->_consumeRemaining();
                    }
                    $this->_emitToken(Token::TYPE_COMMENT, $char, $nextChars);
                    continue;
                } elseif (preg_match("/\\s/", $char)) {
                    //Skip whitespace
                    continue;
                }
            }
            //Emit the character as a simple pattern token
            $this->_emitToken(Token::TYPE_CHAR, $char);
        }
        //Reset the escaped state
        $this->_escaped = false;
    }

    //If the escaped state is already active it means that no end delimiter
    //has been found, so an exception must be thrown
    if ($this->_escaped) {
        throw new Exception\InvalidDelimiter("End delimiter '{$endDelimiter}' not found");
    }

    //Throw exception if there are unclosed subpatterns
    if ($this->_openSubpatterns) {
        throw new Exception\Generic("The regex contains unclosed subpatterns");
    }

    //Throw exception if there are unclosed char classes
    if ($this->_inCharClass) {
        throw new Exception\Generic("The regex contains unclosed character classes");
    }

    //Emit the end delimiter token
    $this->_emitToken(Token::TYPE_REGEX_END_DELIMITER, $endDelimiter);

    //If regex modifiers were specified emit the token
    if ($rModifiers) {
        $this->_emitToken(Token::TYPE_REGEX_MODIFIERS, $rModifiers);
    }
}