Example #1
0
 /**
  * Handles a repetition token
  * 
  * @param Token $token Token
  * 
  * @return void
  */
 protected function _handleRepetition(Token $token)
 {
     //A repetition always applies to the current item, so without one
     //there is nothing to repeat
     if ($this->_currentItem === null) {
         throw new Exception\InvalidRepetition("Nothing to repeat");
     }
     //Repetitions are allowed only after certain tokens, so check the last
     //emitted token
     $lastToken = $this->_tokensStack->top();
     switch ($lastToken->getType()) {
         //Handle lazy repetition
         case Token::TYPE_REPETITION:
             $prevLastToken = $this->_tokensStack->offsetGet(1);
             //Only a "?" that follows a repetition token which does not
             //itself come after another repetition token is accepted here:
             //every other combination is rejected
             if ($token->getIdentifier() !== "?" || $prevLastToken->getType() === Token::TYPE_REPETITION) {
                 throw new Exception\InvalidRepetition("Nothing to repeat");
             }
             //Set the lazy flag, but only if the repetition already set on
             //the current item supports it; otherwise the "?" is ignored
             $lastRepetition = $this->_currentItem->getRepetition();
             if ($lastRepetition->supportsLazy()) {
                 $lastRepetition->setLazy(true);
             }
             //No new repetition object must be created for the lazy flag
             return;
         //Tokens that can handle the repetition
         case Token::TYPE_NON_PRINTING_CHAR:
         case Token::TYPE_GENERIC_CHAR_TYPE:
         case Token::TYPE_CONTROL_CHAR:
         case Token::TYPE_EXT_UNICODE_SEQUENCE:
         case Token::TYPE_UNICODE_CHAR_CLASS:
         case Token::TYPE_HEX_CHAR:
         case Token::TYPE_DOT:
         case Token::TYPE_BYTE:
         case Token::TYPE_SUBPATTERN_END:
         case Token::TYPE_COMMENT:
         case Token::TYPE_OCTAL_CHAR:
         case Token::TYPE_BACK_REFERENCE:
         case Token::TYPE_CHAR_CLASS_END:
         case Token::TYPE_RECURSIVE_PATTERN:
             break;
         //When simple characters are grouped, repetition is valid only
         //for the last one, so it needs to be split so that the last
         //character belongs to a different object
         case Token::TYPE_CHAR:
             $chars = $this->_currentItem->getChar();
             //NOTE(review): strlen/substr work on bytes; confirm that the
             //grouped characters can never end with a multi-byte sequence
             if (strlen($chars) > 1) {
                 //Leave everything but the last character on the old item
                 $this->_currentItem->setChar(substr($chars, 0, -1));
                 //The last character becomes the new current item, which is
                 //the one that receives the repetition below
                 $this->_currentItem = new Pattern\Char($chars[strlen($chars) - 1]);
                 $this->_containersStack->top()->addChild($this->_currentItem);
             }
             break;
         default:
             throw new Exception\InvalidRepetition("Repetition cannot be inserted at this point");
     }
     //Get the right repetition class
     switch ($token->getIdentifier()) {
         case "*":
             $repetition = new Pattern\Repetition\ZeroOrMore();
             break;
         case "+":
             $repetition = new Pattern\Repetition\OneOrMore();
             break;
         case "?":
             $repetition = new Pattern\Repetition\Optional();
             break;
         case "{":
             //The subject holds the content of the braces: a single number
             //or two limits separated by a comma
             if (strpos($token->getSubject(), ",") === false) {
                 $repetition = new Pattern\Repetition\Number($token->getSubject());
             } else {
                 $limits = explode(",", $token->getSubject());
                 //An empty upper limit is passed as null
                 $repetition = new Pattern\Repetition\Range($limits[0], $limits[1] === "" ? null : $limits[1]);
             }
             break;
         default:
             //Fail explicitly instead of reaching setRepetition() with an
             //undefined variable when the identifier is not a known one
             throw new Exception\InvalidRepetition("Unknown repetition identifier '" . $token->getIdentifier() . "'");
     }
     //Set the repetition on the current item
     $this->_currentItem->setRepetition($repetition);
 }
Example #2
0
 /**
  * Starts the tokenization process
  * 
  * @return void
  */
 public function tokenize()
 {
     //Since delimiters are the only exception to the normal regex syntax and
     //the tokenizer needs to know regex modifiers to handle some situations,
     //parse them immediately and strip them from the regex
     list($delimiter, $endDelimiter, $rModifiers) = $this->_stripDelimitersAndModifiers();
     //An unescaped end delimiter inside the regex is rejected only when the
     //start and the end delimiters are the same character
     $checkEndDelimiter = $delimiter === $endDelimiter;
     //Store regex length
     $this->_length = strlen($this->_regex);
     //Loop regex characters. The order of the branches matters: the first
     //one that matches decides which token is emitted for the character
     while (($char = $this->_consume()) !== null) {
         //If character is backslash and it's not escaped
         if ($char === "\\" && !$this->_escaped) {
             //Set escaped flag to true and let the next iteration handle
             //the escaped character
             $this->_escaped = true;
             continue;
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === ".") {
             //Emit a dot token
             $this->_emitToken(Token::TYPE_DOT, $char);
         } elseif ($this->_escaped && Rules::validateGenericCharType($char)) {
             //Emit a generic character type token
             $this->_emitToken(Token::TYPE_GENERIC_CHAR_TYPE, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && Rules::validateSimpleAssertion($char)) {
             //Emit a simple assertion token
             $this->_emitToken(Token::TYPE_SIMPLE_ASSERTION, $char);
         } elseif ($this->_escaped && Rules::validateNonPrintingChar($char)) {
             //Emit a non-printing character token
             $this->_emitToken(Token::TYPE_NON_PRINTING_CHAR, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && $char === "X") {
             //Emit a extended unicode sequence token
             $this->_emitToken(Token::TYPE_EXT_UNICODE_SEQUENCE, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && $char === "C") {
             //Emit a single byte identifier token
             $this->_emitToken(Token::TYPE_BYTE, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "^" || $char === "\$")) {
             //Emit an anchor token
             $this->_emitToken($char === "^" ? Token::TYPE_START_ANCHOR : Token::TYPE_END_ANCHOR, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "|") {
             //Emit an alternation identifier token
             $this->_emitToken(Token::TYPE_ALTERNATION, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && ($char === "p" || $char === "P")) {
             //Take the next character
             $nextChar = $this->_consume();
             //If there are no characters left throw an exception
             if ($nextChar === null) {
                 throw new Exception\Generic("Unspecified character class form \\" . $char);
             } elseif ($nextChar !== "{") {
                 //Single character form
                 $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar);
             } else {
                 //Find everything until the closing bracket
                 $nextChars = $this->_consumeUntil("}", true);
                 //If the closing bracket has not been found throw an
                 //exception
                 if ($nextChars === null) {
                     throw new Exception\Generic("Unclosed \\" . $char . " character class");
                 } else {
                     $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar . $nextChars);
                 }
             }
         } elseif ($this->_escaped && $char === "x") {
             $nextChar = $this->_consume();
             $tokenSubject = "";
             if ($nextChar === "{") {
                 //Braces form: take everything until the closing brace
                 $nextChars = $this->_consumeUntil("}", true);
                 if ($nextChars === null) {
                     throw new Exception\Generic("Unclosed brace in hex char");
                 }
                 $tokenSubject = trim($nextChars, "}");
             } elseif ($nextChar !== null) {
                 //Give the character back and scan again from it
                 $this->_unconsume();
                 //Find following hexadecimal digits (at most two)
                 for ($i = 0; $i < 2; $i++) {
                     $nextChar = $this->_consume();
                     if ($nextChar !== null && Rules::validateHexString($nextChar)) {
                         $tokenSubject .= $nextChar;
                     } else {
                         //Give back the character that is not a hexadecimal
                         //digit, unless the regex is finished
                         $nextChar !== null && $this->_unconsume();
                         break;
                     }
                 }
             }
             //Emit the hexadecimal character token
             $this->_emitToken(Token::TYPE_HEX_CHAR, $char, $tokenSubject);
         } elseif ($this->_escaped && $char === "c") {
             //Take the next character
             $nextChar = $this->_consumeIgnoreEscape();
             //If there are no characters left throw an exception
             if ($nextChar === null) {
                 throw new Exception\Generic("Character not specified for control character");
             }
             //Otherwise emit the control character token
             $this->_emitToken(Token::TYPE_CONTROL_CHAR, $char, $nextChar);
         } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "*" || $char === "+" || $char === "?")) {
             //Emit a repetition token
             $this->_emitToken(Token::TYPE_REPETITION, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "{" && ($nextChars = $this->_consumeRegex("/^\\d+(?:,\\d*)?\\}/"))) {
             //Emit a repetition token, the subject is the content of the
             //braces without the closing one
             $this->_emitToken(Token::TYPE_REPETITION, $char, rtrim($nextChars, "}"));
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "(") {
             $this->_handleSubpattern();
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === ")") {
             //Throw exception if there are no open subpatterns
             if (!$this->_openSubpatterns) {
                 throw new Exception\Generic("Unmatched parenthesis");
             }
             //Emit a subpattern end token
             $this->_emitToken(Token::TYPE_SUBPATTERN_END, $char);
             $this->_openSubpatterns--;
             $this->_modifiersStack->pop();
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "[") {
             //Emit a char class start token
             $this->_emitToken(Token::TYPE_CHAR_CLASS_START, $char);
             $this->_inCharClass = true;
             //Consume next char
             $char = $this->_consume();
             //If the character is a char class negation
             if ($char === "^") {
                 //Emit the char class negate token
                 $this->_emitToken(Token::TYPE_CHAR_CLASS_NEGATE, $char);
                 $char = $this->_consume();
             }
             //If the first char in a char class is a closed square bracket
             if ($char === "]") {
                 //Emit the bracket as char token
                 $this->_emitToken(Token::TYPE_CHAR, $char);
             } elseif ($char !== null) {
                 //Give the character back so that the main loop handles it.
                 //When the regex is finished there is nothing to give back,
                 //same null check used by the hexadecimal escape handling
                 $this->_unconsume();
             }
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "]") {
             //Emit a char class end token
             $this->_emitToken(Token::TYPE_CHAR_CLASS_END, $char);
             $this->_inCharClass = false;
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "-" && in_array($this->_lastToken->getType(), $this->_allowedInCharClassRange)) {
             //Enable the after char class range mode
             $this->_afterCharClassRange = true;
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "[" && ($nextChars = $this->_consumeRegex("/^:\\^?[a-z]+:\\]/"))) {
             //Emit a posix char class token, the subject is the class name
             //without colons and closing bracket
             $subject = str_replace(array(":", "]"), "", $nextChars);
             $this->_emitToken(Token::TYPE_POSIX_CHAR_CLASS, $char . $nextChars, $subject);
         } elseif (!$this->_inCharClass && $this->_escaped && ($char === "g" || $char === "k")) {
             //It's a back reference. Check for the reference identifier
             if ($char === "g") {
                 $testPattern = "(?|(\\d+)|\\{(-?\\d+|\\w+)\\})";
             } else {
                 $testPattern = "(<\\w+>|'\\w+'|\\{\\w+\\})";
             }
             $nextChars = $this->_consumeRegex("/^{$testPattern}/", 1);
             if ($nextChars === null) {
                 throw new Exception\Generic("Invalid backreference");
             }
             //For \k strip the characters that wrap the name
             if ($char === "k") {
                 $nextChars = substr($nextChars, 1, -1);
             }
             //Check reference validity
             if (!$this->_checkValidReference($nextChars)) {
                 throw new Exception\Generic("Reference to non-existent subpattern '{$nextChars}'");
             }
             //Emit a backreference token
             $this->_emitToken(Token::TYPE_BACK_REFERENCE, $char, $nextChars);
         } elseif ($this->_escaped && is_numeric($char)) {
             //Char class does not handle back references so if the character
             //is not octal process the character again without the escape
             if ($this->_inCharClass && $char > 7) {
                 $this->_unconsume();
                 $this->_escaped = false;
                 continue;
             }
             //If the character is a 0 consume up to 2 octal digits,
             //otherwise consume all the following digits
             if ($char === "0" || $this->_inCharClass) {
                 $testPattern = "^[0-7]{1,2}";
             } else {
                 $testPattern = "^\\d+";
             }
             //Consume following numbers
             $nextChars = $this->_consumeRegex("/^{$testPattern}/");
             if ($nextChars !== null) {
                 $char .= $nextChars;
             }
             //If the first digit is 0 or its a valid octal number and there
             //are not enough back references
             $hasReference = $this->_checkValidReference($char);
             if ($char[0] === "0" || $this->_inCharClass || (preg_match("/^[0-7]{2,3}\$/", $char) && !$hasReference)) {
                 $this->_emitToken(Token::TYPE_OCTAL_CHAR, $char);
             } elseif ($hasReference) {
                 //Emit a backreference token
                 $this->_emitToken(Token::TYPE_BACK_REFERENCE, "\\", $char);
             } else {
                 throw new Exception\Generic("Reference to non-existent subpattern '{$char}'");
             }
         } elseif (!$this->_escaped && $checkEndDelimiter && $char === $endDelimiter) {
             //Throw an exception
             throw new Exception\InvalidDelimiter("Unescaped end delimiter '{$char}' inside regex");
         } else {
             //If the character is not escaped, it is outside a character
             //class and the "x" modifier is active. Inside a character class
             //whitespaces and "#" are literal characters even in extended
             //mode, so they must be emitted as simple characters
             if (!$this->_escaped && !$this->_inCharClass && strpos($this->_modifiersStack->top(), "x") !== false) {
                 //If it is a "#"
                 if ($char === "#") {
                     //Emit a comment token that contains everything until
                     //the end of the line or the end of the regex
                     $nextChars = $this->_consumeUntil("\n");
                     if ($nextChars === null) {
                         $nextChars = $this->_consumeRemaining();
                     }
                     $this->_emitToken(Token::TYPE_COMMENT, $char, $nextChars);
                     continue;
                 } elseif (preg_match("/\\s/", $char)) {
                     //Whitespaces are ignored in extended mode
                     continue;
                 }
             }
             //Emit the character as a simple pattern token
             $this->_emitToken(Token::TYPE_CHAR, $char);
         }
         //Reset the escaped state
         $this->_escaped = false;
     }
     //If the escaped state is already active it means that no end delimiter
     //has been found, so an exception must be thrown
     if ($this->_escaped) {
         throw new Exception\InvalidDelimiter("End delimiter '{$endDelimiter}' not found");
     }
     //Throw exception if there are unclosed subpatterns
     if ($this->_openSubpatterns) {
         throw new Exception\Generic("The regex contains unclosed subpatterns");
     }
     //Throw exception if there are unclosed char classes
     if ($this->_inCharClass) {
         throw new Exception\Generic("The regex contains unclosed character classes");
     }
     //Emit the end delimiter token
     $this->_emitToken(Token::TYPE_REGEX_END_DELIMITER, $endDelimiter);
     //If regex modifiers were specified emit the token
     if ($rModifiers) {
         $this->_emitToken(Token::TYPE_REGEX_MODIFIERS, $rModifiers);
     }
 }