/**
 * Handles a repetition token
 *
 * Validates that a repetition is allowed after the last emitted token,
 * handles the lazy flag ("?" right after another repetition) and attaches
 * the matching repetition object to the current item.
 *
 * @param Token $token Token
 *
 * @return void
 *
 * @throws Exception\InvalidRepetition If there is nothing to repeat, if the
 *                                     repetition is not allowed after the
 *                                     last emitted token or if the
 *                                     repetition identifier is unknown
 */
protected function _handleRepetition(Token $token)
{
    //If there is no current item, throw exception
    if ($this->_currentItem === null) {
        throw new Exception\InvalidRepetition("Nothing to repeat");
    }

    //Repetitions are allowed only after certain tokens, so check the last
    //emitted token
    $lastToken = $this->_tokensStack->top();
    switch ($lastToken->getType()) {
        //Handle lazy repetition
        case Token::TYPE_REPETITION:
            //Token stored at index 1 of the tokens stack, used as the
            //token that precedes the last repetition
            $prevLastToken = $this->_tokensStack->offsetGet(1);
            //Only a "?" that follows a repetition token, which in turn
            //does not come after another repetition token, is accepted:
            //in that case it marks the last repetition as lazy
            if ($token->getIdentifier() !== "?"
                || $prevLastToken->getType() === Token::TYPE_REPETITION
            ) {
                throw new Exception\InvalidRepetition("Nothing to repeat");
            }
            //Set the lazy flag only if the last repetition supports it,
            //otherwise the token is silently ignored
            $lastRepetition = $this->_currentItem->getRepetition();
            if ($lastRepetition->supportsLazy()) {
                $lastRepetition->setLazy(true);
            }
            return;

        //Tokens that can handle the repetition
        case Token::TYPE_NON_PRINTING_CHAR:
        case Token::TYPE_GENERIC_CHAR_TYPE:
        case Token::TYPE_CONTROL_CHAR:
        case Token::TYPE_EXT_UNICODE_SEQUENCE:
        case Token::TYPE_UNICODE_CHAR_CLASS:
        case Token::TYPE_HEX_CHAR:
        case Token::TYPE_DOT:
        case Token::TYPE_BYTE:
        case Token::TYPE_SUBPATTERN_END:
        case Token::TYPE_COMMENT:
        case Token::TYPE_OCTAL_CHAR:
        case Token::TYPE_BACK_REFERENCE:
        case Token::TYPE_CHAR_CLASS_END:
        case Token::TYPE_RECURSIVE_PATTERN:
            break;

        //When simple characters are grouped, repetition is valid only
        //for the last one, so the group needs to be split so that the
        //last character belongs to a different object
        case Token::TYPE_CHAR:
            $chars = $this->_currentItem->getChar();
            if (strlen($chars) > 1) {
                //Leave every character but the last one on the current item
                $this->_currentItem->setChar(substr($chars, 0, -1));
                //Move the last character to a new item that becomes the
                //current one and add it to the container on top of the stack
                $this->_currentItem = new Pattern\Char(substr($chars, -1));
                $this->_containersStack->top()->addChild($this->_currentItem);
            }
            break;

        default:
            throw new Exception\InvalidRepetition("Repetition cannot be inserted at this point");
    }

    //Get the right repetition class
    switch ($token->getIdentifier()) {
        case "*":
            $repetition = new Pattern\Repetition\ZeroOrMore();
            break;
        case "+":
            $repetition = new Pattern\Repetition\OneOrMore();
            break;
        case "?":
            $repetition = new Pattern\Repetition\Optional();
            break;
        case "{":
            //A subject without comma is a fixed number of repetitions,
            //otherwise it is a range whose upper limit can be omitted
            if (strpos($token->getSubject(), ",") === false) {
                $repetition = new Pattern\Repetition\Number($token->getSubject());
            } else {
                $limits = explode(",", $token->getSubject());
                $repetition = new Pattern\Repetition\Range(
                    $limits[0],
                    $limits[1] === "" ? null : $limits[1]
                );
            }
            break;
        default:
            //Fail explicitly instead of passing an undefined variable to
            //setRepetition when the identifier is not a known quantifier
            throw new Exception\InvalidRepetition("Repetition cannot be inserted at this point");
    }

    //Set the repetition on the current item
    $this->_currentItem->setRepetition($repetition);
}
/**
 * Starts the tokenization process
 *
 * Strips delimiters and modifiers from the regex, then consumes it one
 * character at a time emitting a token for every recognized construct.
 * The order of the checks in the loop matters: the first matching branch
 * wins. When the loop ends, the end delimiter token and, if present, the
 * regex modifiers token are emitted.
 *
 * @return void
 *
 * @throws Exception\Generic          On malformed constructs (unclosed
 *                                    braces, subpatterns or character
 *                                    classes, invalid references, ...)
 * @throws Exception\InvalidDelimiter If an unescaped end delimiter is found
 *                                    inside the regex or the regex ends
 *                                    with a pending escape
 */
public function tokenize()
{
    //Since delimiters are the only exception to the normal regex syntax and
    //the tokenizer needs to know regex modifiers to handle some situations,
    //parse them immediately and strip them from the regex
    list($delimiter, $endDelimiter, $rModifiers) = $this->_stripDelimitersAndModifiers();
    //The end delimiter is searched inside the regex only when it is equal
    //to the start delimiter
    $checkEndDelimiter = $delimiter === $endDelimiter;

    //Store regex length
    $this->_length = strlen($this->_regex);

    //Loop regex characters
    while (($char = $this->_consume()) !== null) {
        //If character is backslash and it's not escaped
        if ($char === "\\" && !$this->_escaped) {
            //Set escaped flag to true
            $this->_escaped = true;
            continue;
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === ".") {
            //Emit a dot token
            $this->_emitToken(Token::TYPE_DOT, $char);
        } elseif ($this->_escaped && Rules::validateGenericCharType($char)) {
            //Emit a generic character type token
            $this->_emitToken(Token::TYPE_GENERIC_CHAR_TYPE, $char);
        } elseif (!$this->_inCharClass && $this->_escaped
            && Rules::validateSimpleAssertion($char)
        ) {
            //Emit a simple assertion token
            $this->_emitToken(Token::TYPE_SIMPLE_ASSERTION, $char);
        } elseif ($this->_escaped && Rules::validateNonPrintingChar($char)) {
            //Emit a non-printing character token
            $this->_emitToken(Token::TYPE_NON_PRINTING_CHAR, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && $char === "X") {
            //Emit an extended unicode sequence token
            $this->_emitToken(Token::TYPE_EXT_UNICODE_SEQUENCE, $char);
        } elseif (!$this->_inCharClass && $this->_escaped && $char === "C") {
            //Emit a single byte identifier token
            $this->_emitToken(Token::TYPE_BYTE, $char);
        } elseif (!$this->_inCharClass && !$this->_escaped
            && ($char === "^" || $char === "\$")
        ) {
            //Emit an anchor token
            $this->_emitToken(
                $char === "^" ? Token::TYPE_START_ANCHOR : Token::TYPE_END_ANCHOR,
                $char
            );
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "|") {
            //Emit an alternation identifier token
            $this->_emitToken(Token::TYPE_ALTERNATION, $char);
        } elseif (!$this->_inCharClass && $this->_escaped
            && ($char === "p" || $char === "P")
        ) {
            //Take the next character
            $nextChar = $this->_consume();
            //If there are no characters left throw an exception
            if ($nextChar === null) {
                throw new Exception\Generic("Unspecified character class form \\" . $char);
            } elseif ($nextChar !== "{") {
                //Single character form
                $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar);
            } else {
                //Find everything until the closing bracket
                $nextChars = $this->_consumeUntil("}", true);
                //If the closing bracket has not been found throw an
                //exception
                if ($nextChars === null) {
                    throw new Exception\Generic("Unclosed \\" . $char . " character class");
                } else {
                    $this->_emitToken(
                        Token::TYPE_UNICODE_CHAR_CLASS,
                        $char,
                        $nextChar . $nextChars
                    );
                }
            }
        } elseif ($this->_escaped && $char === "x") {
            $nextChar = $this->_consume();
            $tokenSubject = "";
            if ($nextChar === "{") {
                //Braced form: take everything until the closing brace
                $nextChars = $this->_consumeUntil("}", true);
                if ($nextChars === null) {
                    throw new Exception\Generic("Unclosed brace in hex char");
                }
                //Strip the brace from the consumed string (presumably
                //_consumeUntil includes the delimiter when its second
                //argument is true)
                $tokenSubject = trim($nextChars, "}");
            } elseif ($nextChar !== null) {
                //Put the character back, it is checked again in the loop
                $this->_unconsume();
                //Find following hexadecimal digits (2 at most)
                for ($i = 0; $i < 2; $i++) {
                    $nextChar = $this->_consume();
                    if ($nextChar !== null && Rules::validateHexString($nextChar)) {
                        $tokenSubject .= $nextChar;
                    } else {
                        //Put back the character only if one has been
                        //consumed
                        $nextChar !== null && $this->_unconsume();
                        break;
                    }
                }
            }
            //Emit the hexadecimal character token
            $this->_emitToken(Token::TYPE_HEX_CHAR, $char, $tokenSubject);
        } elseif ($this->_escaped && $char === "c") {
            //Take the next character
            $nextChar = $this->_consumeIgnoreEscape();
            //If there are no characters left throw an exception
            if ($nextChar === null) {
                throw new Exception\Generic("Character not specified for control character");
            }
            //Otherwise emit the control character token
            $this->_emitToken(Token::TYPE_CONTROL_CHAR, $char, $nextChar);
        } elseif (!$this->_inCharClass && !$this->_escaped
            && ($char === "*" || $char === "+" || $char === "?")
        ) {
            //Emit a repetition token
            $this->_emitToken(Token::TYPE_REPETITION, $char);
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "{"
            && ($nextChars = $this->_consumeRegex("/^\\d+(?:,\\d*)?\\}/"))
        ) {
            //Emit a repetition token whose subject is the content of the
            //braces
            $this->_emitToken(Token::TYPE_REPETITION, $char, rtrim($nextChars, "}"));
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "(") {
            $this->_handleSubpattern();
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === ")") {
            //Throw exception if there are no open subpatterns
            if (!$this->_openSubpatterns) {
                throw new Exception\Generic("Unmatched parenthesis");
            }
            //Emit a subpattern end token
            $this->_emitToken(Token::TYPE_SUBPATTERN_END, $char);
            $this->_openSubpatterns--;
            $this->_modifiersStack->pop();
        } elseif (!$this->_inCharClass && !$this->_escaped && $char === "[") {
            //Emit a char class start token
            $this->_emitToken(Token::TYPE_CHAR_CLASS_START, $char);
            $this->_inCharClass = true;
            //Consume next char
            $char = $this->_consume();
            //If the character is a char class negation
            if ($char === "^") {
                //Emit the char class negate token
                $this->_emitToken(Token::TYPE_CHAR_CLASS_NEGATE, $char);
                $char = $this->_consume();
            }
            //If the first char in a char class is a closed square bracket
            if ($char === "]") {
                //Emit the bracket as char token
                $this->_emitToken(Token::TYPE_CHAR, $char);
            } elseif ($char !== null) {
                //Put the character back so that it is processed by the
                //loop. Nothing must be put back if the regex ended and no
                //character has been consumed
                $this->_unconsume();
            }
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "]") {
            //Emit a char class end token
            $this->_emitToken(Token::TYPE_CHAR_CLASS_END, $char);
            $this->_inCharClass = false;
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "-"
            && in_array($this->_lastToken->getType(), $this->_allowedInCharClassRange)
        ) {
            //Enable the after char class range mode
            $this->_afterCharClassRange = true;
        } elseif ($this->_inCharClass && !$this->_escaped && $char === "["
            && ($nextChars = $this->_consumeRegex("/^:\\^?[a-z]+:\\]/"))
        ) {
            //Emit a posix char class token, the subject is the class name
            //without colons and closing bracket
            $subject = str_replace(array(":", "]"), "", $nextChars);
            $this->_emitToken(Token::TYPE_POSIX_CHAR_CLASS, $char . $nextChars, $subject);
        } elseif (!$this->_inCharClass && $this->_escaped
            && ($char === "g" || $char === "k")
        ) {
            //It's a back reference. Check for the reference identifier
            if ($char === "g") {
                $testPattern = "(?|(\\d+)|\\{(-?\\d+|\\w+)\\})";
            } else {
                $testPattern = "(<\\w+>|'\\w+'|\\{\\w+\\})";
            }
            $nextChars = $this->_consumeRegex("/^{$testPattern}/", 1);
            if ($nextChars === null) {
                throw new Exception\Generic("Invalid backreference");
            }
            //For \k strip the delimiters that surround the name
            if ($char === "k") {
                $nextChars = substr($nextChars, 1, -1);
            }
            //Check reference validity
            if (!$this->_checkValidReference($nextChars)) {
                throw new Exception\Generic("Reference to non-existent subpattern '{$nextChars}'");
            }
            //Emit a backreference token
            $this->_emitToken(Token::TYPE_BACK_REFERENCE, $char, $nextChars);
        } elseif ($this->_escaped && is_numeric($char)) {
            //Char class does not handle back references so if the character
            //is not octal process the character again without the escape
            if ($this->_inCharClass && $char > 7) {
                $this->_unconsume();
                $this->_escaped = false;
                continue;
            }
            //If the character is a 0 consume up to 2 octal digits,
            //otherwise consume all the following digits
            if ($char === "0" || $this->_inCharClass) {
                $testPattern = "^[0-7]{1,2}";
            } else {
                $testPattern = "^\\d+";
            }
            //Consume following numbers
            $nextChars = $this->_consumeRegex("/^{$testPattern}/");
            if ($nextChars !== null) {
                $char .= $nextChars;
            }
            //It is an octal character if the first digit is 0, if it is
            //inside a char class, or if it is a valid octal number and
            //there are not enough back references
            $hasReference = $this->_checkValidReference($char);
            if ($char[0] === "0"
                || $this->_inCharClass
                || (preg_match("/^[0-7]{2,3}\$/", $char) && !$hasReference)
            ) {
                $this->_emitToken(Token::TYPE_OCTAL_CHAR, $char);
            } elseif ($hasReference) {
                //Emit a backreference token
                $this->_emitToken(Token::TYPE_BACK_REFERENCE, "\\", $char);
            } else {
                throw new Exception\Generic("Reference to non-existent subpattern '{$char}'");
            }
        } elseif (!$this->_escaped && $checkEndDelimiter && $char === $endDelimiter) {
            //Throw an exception
            throw new Exception\InvalidDelimiter("Unescaped end delimiter '{$char}' inside regex");
        } else {
            //If the character is not escaped, it is outside a character
            //class and the "x" modifier is active. Inside character classes
            //whitespaces and "#" are literal characters even in extended
            //mode
            if (!$this->_inCharClass && !$this->_escaped
                && strpos($this->_modifiersStack->top(), "x") !== false
            ) {
                //If it is a "#"
                if ($char === "#") {
                    //Emit a comment token that contains everything until
                    //the next newline or, if there is none, until the end
                    //of the regex
                    $nextChars = $this->_consumeUntil("\n");
                    if ($nextChars === null) {
                        $nextChars = $this->_consumeRemaining();
                    }
                    $this->_emitToken(Token::TYPE_COMMENT, $char, $nextChars);
                    continue;
                } elseif (preg_match("/\\s/", $char)) {
                    //Whitespaces are ignored in extended mode
                    continue;
                }
            }
            //Emit the character as a simple pattern token
            $this->_emitToken(Token::TYPE_CHAR, $char);
        }
        //Reset the escaped state
        $this->_escaped = false;
    }

    //If the escaped state is already active it means that no end delimiter
    //has been found, so an exception must be thrown
    if ($this->_escaped) {
        throw new Exception\InvalidDelimiter("End delimiter '{$endDelimiter}' not found");
    }

    //Throw exception if there are unclosed subpatterns
    if ($this->_openSubpatterns) {
        throw new Exception\Generic("The regex contains unclosed subpatterns");
    }

    //Throw exception if there are unclosed char classes
    if ($this->_inCharClass) {
        throw new Exception\Generic("The regex contains unclosed character classes");
    }

    //Emit the end delimiter token
    $this->_emitToken(Token::TYPE_REGEX_END_DELIMITER, $endDelimiter);

    //If regex modifiers were specified emit the token
    if ($rModifiers) {
        $this->_emitToken(Token::TYPE_REGEX_MODIFIERS, $rModifiers);
    }
}