Example #1
0
 /**
  * Validates and stores the simple assertion identifier.
  *
  * Accepted identifiers are:
  * "b", "B", "A", "Z", "z", "G", "Q", "E", "K"
  *
  * The value is checked with Rules::validateSimpleAssertion() and, when
  * accepted, handed over to the parent implementation, whose return value
  * is passed back to the caller.
  *
  * @param string $identifier Identifier to match
  *
  * @return SimpleAssertion
  *
  * @throws \REBuilder\Exception\Generic If the identifier is rejected by
  *                                      the validation rule
  *
  * @link http://php.net/manual/en/regexp.reference.escape.php
  */
 public function setIdentifier($identifier)
 {
     $isValid = \REBuilder\Parser\Rules::validateSimpleAssertion($identifier);
     if ($isValid) {
         //Delegate the actual assignment to the parent class
         return parent::setIdentifier($identifier);
     }
     throw new \REBuilder\Exception\Generic(
         "'{$identifier}' is not a valid simple assertion type identifier"
     );
 }
Example #2
0
 /**
  * Starts the tokenization process
  *
  * Strips delimiters and modifiers from the regex, then consumes it one
  * character at a time and emits a token for every recognized construct
  * (escape sequences, anchors, repetitions, subpatterns, character
  * classes, back references, comments and plain characters). When the
  * whole regex has been consumed the end delimiter token and, if present,
  * the modifiers token are emitted.
  *
  * @return void
  *
  * @throws Exception\Generic          If the regex contains an invalid or
  *                                    incomplete construct (unclosed
  *                                    subpattern or character class,
  *                                    invalid back reference, ...)
  * @throws Exception\InvalidDelimiter If an unescaped end delimiter is
  *                                    found inside the regex or the regex
  *                                    ends with a pending escape
  */
 public function tokenize()
 {
     //Since delimiters are the only exception to the normal regex syntax and
     //the tokenizer needs to know regex modifiers to handle some situations,
     //parse them immediately and strip them from the regex
     list($delimiter, $endDelimiter, $rModifiers) = $this->_stripDelimitersAndModifiers();
     //The end delimiter is searched inside the regex only when it is equal
     //to the start delimiter
     $checkEndDelimiter = $delimiter === $endDelimiter;
     //Store regex length
     $this->_length = strlen($this->_regex);
     //Loop regex characters
     while (($char = $this->_consume()) !== null) {
         //If character is backslash and it's not escaped
         if ($char === "\\" && !$this->_escaped) {
             //Set escaped flag to true and process the next character
             //without resetting the flag
             $this->_escaped = true;
             continue;
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === ".") {
             //Emit a dot token
             $this->_emitToken(Token::TYPE_DOT, $char);
         } elseif ($this->_escaped && Rules::validateGenericCharType($char)) {
             //Emit a generic character type token
             $this->_emitToken(Token::TYPE_GENERIC_CHAR_TYPE, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && Rules::validateSimpleAssertion($char)) {
             //Emit a simple assertion token
             $this->_emitToken(Token::TYPE_SIMPLE_ASSERTION, $char);
         } elseif ($this->_escaped && Rules::validateNonPrintingChar($char)) {
             //Emit a non-printing character token
             $this->_emitToken(Token::TYPE_NON_PRINTING_CHAR, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && $char === "X") {
             //Emit a extended unicode sequence token
             $this->_emitToken(Token::TYPE_EXT_UNICODE_SEQUENCE, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && $char === "C") {
             //Emit a single byte identifier token
             $this->_emitToken(Token::TYPE_BYTE, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "^" || $char === "\$")) {
             //Emit an anchor token
             $this->_emitToken($char === "^" ? Token::TYPE_START_ANCHOR : Token::TYPE_END_ANCHOR, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "|") {
             //Emit an alternation identifier token
             $this->_emitToken(Token::TYPE_ALTERNATION, $char);
         } elseif (!$this->_inCharClass && $this->_escaped && ($char === "p" || $char === "P")) {
             //Unicode character class: \pX, \PX, \p{...} or \P{...}
             //Take the next character
             $nextChar = $this->_consume();
             //If there are no characters left throw an exception
             if ($nextChar === null) {
                 throw new Exception\Generic("Unspecified character class form \\" . $char);
             } elseif ($nextChar !== "{") {
                 //Single letter form
                 $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar);
             } else {
                 //Find everything until the closing bracket
                 $nextChars = $this->_consumeUntil("}", true);
                 //If the closing bracket has not been found throw an
                 //exception
                 if ($nextChars === null) {
                     throw new Exception\Generic("Unclosed \\" . $char . " character class");
                 } else {
                     $this->_emitToken(Token::TYPE_UNICODE_CHAR_CLASS, $char, $nextChar . $nextChars);
                 }
             }
         } elseif ($this->_escaped && $char === "x") {
             //Hexadecimal character: \x{...} or \x followed by up to 2
             //hexadecimal digits
             $nextChar = $this->_consume();
             $tokenSubject = "";
             if ($nextChar === "{") {
                 $nextChars = $this->_consumeUntil("}", true);
                 if ($nextChars === null) {
                     throw new Exception\Generic("Unclosed brace in hex char");
                 }
                 //Strip the closing brace from the subject
                 $tokenSubject = trim($nextChars, "}");
             } elseif ($nextChar !== null) {
                 //Step back so that the character is tested by the loop
                 //below
                 $this->_unconsume();
                 //Find following hexadecimal digits
                 for ($i = 0; $i < 2; $i++) {
                     $nextChar = $this->_consume();
                     if ($nextChar !== null && Rules::validateHexString($nextChar)) {
                         $tokenSubject .= $nextChar;
                     } else {
                         //Give back the character only if one has been
                         //actually consumed
                         $nextChar !== null && $this->_unconsume();
                         break;
                     }
                 }
             }
             //Emit the hexadecimal character token
             $this->_emitToken(Token::TYPE_HEX_CHAR, $char, $tokenSubject);
         } elseif ($this->_escaped && $char === "c") {
             //Take the next character
             $nextChar = $this->_consumeIgnoreEscape();
             //If there are no characters left throw an exception
             if ($nextChar === null) {
                 throw new Exception\Generic("Character not specified for control character");
             }
             //Otherwise emit the control character token
             $this->_emitToken(Token::TYPE_CONTROL_CHAR, $char, $nextChar);
         } elseif (!$this->_inCharClass && !$this->_escaped && ($char === "*" || $char === "+" || $char === "?")) {
             //Emit a repetition token
             $this->_emitToken(Token::TYPE_REPETITION, $char);
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "{" && ($nextChars = $this->_consumeRegex("/^\\d+(?:,\\d*)?\\}/"))) {
             //Emit a repetition token. The subject is the content of the
             //braces without the closing one
             $this->_emitToken(Token::TYPE_REPETITION, $char, rtrim($nextChars, "}"));
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "(") {
             $this->_handleSubpattern();
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === ")") {
             //Throw exception if there are no open subpatterns
             if (!$this->_openSubpatterns) {
                 throw new Exception\Generic("Unmatched parenthesis");
             }
             //Emit a subpattern end token
             $this->_emitToken(Token::TYPE_SUBPATTERN_END, $char);
             $this->_openSubpatterns--;
             //Drop the modifiers entry on top of the stack
             $this->_modifiersStack->pop();
         } elseif (!$this->_inCharClass && !$this->_escaped && $char === "[") {
             //Emit a char class start token
             $this->_emitToken(Token::TYPE_CHAR_CLASS_START, $char);
             $this->_inCharClass = true;
             //Consume next char
             $char = $this->_consume();
             //If the character is a char class negation
             if ($char === "^") {
                 //Emit the char class negate token
                 $this->_emitToken(Token::TYPE_CHAR_CLASS_NEGATE, $char);
                 $char = $this->_consume();
             }
             //If the first char in a char class is a closed square bracket
             if ($char === "]") {
                 //Emit the bracket as char token
                 $this->_emitToken(Token::TYPE_CHAR, $char);
             } elseif ($char !== null) {
                 //Give the character back to the main loop. Nothing must be
                 //given back if the end of the regex has been reached, as
                 //already done when parsing hexadecimal characters
                 $this->_unconsume();
             }
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "]") {
             //Emit a char class end token
             $this->_emitToken(Token::TYPE_CHAR_CLASS_END, $char);
             $this->_inCharClass = false;
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "-" && in_array($this->_lastToken->getType(), $this->_allowedInCharClassRange)) {
             //Enable the after char class range mode
             $this->_afterCharClassRange = true;
         } elseif ($this->_inCharClass && !$this->_escaped && $char === "[" && ($nextChars = $this->_consumeRegex("/^:\\^?[a-z]+:\\]/"))) {
             //Emit a posix char class token. The subject is the class name
             //(with the optional negation) without colons and bracket
             $subject = str_replace(array(":", "]"), "", $nextChars);
             $this->_emitToken(Token::TYPE_POSIX_CHAR_CLASS, $char . $nextChars, $subject);
         } elseif (!$this->_inCharClass && $this->_escaped && ($char === "g" || $char === "k")) {
             //It's a back reference. Check for the reference identifier
             if ($char === "g") {
                 $testPattern = "(?|(\\d+)|\\{(-?\\d+|\\w+)\\})";
             } else {
                 $testPattern = "(<\\w+>|'\\w+'|\\{\\w+\\})";
             }
             $nextChars = $this->_consumeRegex("/^{$testPattern}/", 1);
             if ($nextChars === null) {
                 throw new Exception\Generic("Invalid backreference");
             }
             //For \k the match still contains the enclosing characters, so
             //they must be stripped
             if ($char === "k") {
                 $nextChars = substr($nextChars, 1, -1);
             }
             //Check reference validity
             if (!$this->_checkValidReference($nextChars)) {
                 throw new Exception\Generic("Reference to non-existent subpattern '{$nextChars}'");
             }
             //Emit a backreference token
             $this->_emitToken(Token::TYPE_BACK_REFERENCE, $char, $nextChars);
         } elseif ($this->_escaped && is_numeric($char)) {
             //Char class does not handle back references so if the character
             //is not octal process the character again without the escape
             if ($this->_inCharClass && $char > 7) {
                 $this->_unconsume();
                 $this->_escaped = false;
                 continue;
             }
             //If the character is a 0 consume up to 2 octal digits,
             //otherwise consume all the following digits
             if ($char === "0" || $this->_inCharClass) {
                 $testPattern = "^[0-7]{1,2}";
             } else {
                 $testPattern = "^\\d+";
             }
             //Consume following numbers
             $nextChars = $this->_consumeRegex("/^{$testPattern}/");
             if ($nextChars !== null) {
                 $char .= $nextChars;
             }
             //If the first digit is 0 or its a valid octal number and there
             //are not enough back references
             $hasReference = $this->_checkValidReference($char);
             if ($char[0] === "0" || $this->_inCharClass || preg_match("/^[0-7]{2,3}\$/", $char) && !$hasReference) {
                 $this->_emitToken(Token::TYPE_OCTAL_CHAR, $char);
             } elseif ($hasReference) {
                 //Emit a backreference token
                 $this->_emitToken(Token::TYPE_BACK_REFERENCE, "\\", $char);
             } else {
                 throw new Exception\Generic("Reference to non-existent subpattern '{$char}'");
             }
         } elseif (!$this->_escaped && $checkEndDelimiter && $char === $endDelimiter) {
             //Throw an exception
             throw new Exception\InvalidDelimiter("Unescaped end delimiter '{$char}' inside regex");
         } else {
             //If the character is not escaped, it is outside a character
             //class and the "x" modifier is active. Inside a character class
             //whitespaces and "#" are literal characters even when the "x"
             //modifier is active, so they must be emitted as simple chars
             if (!$this->_escaped && !$this->_inCharClass && strpos($this->_modifiersStack->top(), "x") !== false) {
                 //If it is a "#"
                 if ($char === "#") {
                     //Emit a comment token that contains everything up to
                     //the next newline or, if there are no newlines, up to
                     //the end of the regex
                     $nextChars = $this->_consumeUntil("\n");
                     if ($nextChars === null) {
                         $nextChars = $this->_consumeRemaining();
                     }
                     $this->_emitToken(Token::TYPE_COMMENT, $char, $nextChars);
                     continue;
                 } elseif (preg_match("/\\s/", $char)) {
                     //Whitespaces are ignored
                     continue;
                 }
             }
             //Emit the character as a simple pattern token
             $this->_emitToken(Token::TYPE_CHAR, $char);
         }
         //Reset the escaped state
         $this->_escaped = false;
     }
     //If the escaped state is still active the regex ends with a backslash
     //that has not been followed by any character, which is reported as a
     //missing end delimiter
     if ($this->_escaped) {
         throw new Exception\InvalidDelimiter("End delimiter '{$endDelimiter}' not found");
     }
     //Throw exception if there are unclosed subpatterns
     if ($this->_openSubpatterns) {
         throw new Exception\Generic("The regex contains unclosed subpatterns");
     }
     //Throw exception if there are unclosed char classes
     if ($this->_inCharClass) {
         throw new Exception\Generic("The regex contains unclosed character classes");
     }
     //Emit the end delimiter token
     $this->_emitToken(Token::TYPE_REGEX_END_DELIMITER, $endDelimiter);
     //If regex modifiers were specified emit the token
     if ($rModifiers) {
         $this->_emitToken(Token::TYPE_REGEX_MODIFIERS, $rModifiers);
     }
 }