/**
 * Builds a normalized character class string from the tokens of a set.
 *
 * Tokens are consumed from the lexer until the set-close token becomes the
 * lookahead or the lexer has no more tokens. Unicode / hex short codes are
 * resolved through Unicode::evaluate(); literals, the range dash and the
 * escape character are copied through as plain characters.
 *
 * Running out of tokens before a set-close token is seen is not reported
 * here: whatever was collected so far is still returned.
 *
 * @param ReverseRegex\Generator\Scope $head not read by this method
 * @param ReverseRegex\Generator\Scope $set  not read by this method
 * @param Lexer $lexer the lexer to read the character class tokens from
 *
 * @return string the collected characters wrapped in square brackets
 *
 * @throws ParserException when a token that is not handled inside a character class is met
 */
public function normalize(Scope $head, Scope $set, Lexer $lexer)
{
    $shortCodeTokens = array(Lexer::T_SHORT_UNICODE_X, Lexer::T_SHORT_P, Lexer::T_SHORT_X);
    $literalTokens   = array(Lexer::T_LITERAL_CHAR, Lexer::T_LITERAL_NUMERIC);

    $evaluator = new Unicode();
    $body      = '';

    while ($lexer->moveNext()) {
        // the closing bracket ends the class and is not part of its body
        if ($lexer->isNextToken(Lexer::T_SET_CLOSE)) {
            break;
        }

        if ($lexer->isNextTokenAny($shortCodeTokens)) {
            // short codes are replaced by whatever the Unicode helper evaluates them to
            $body .= $evaluator->evaluate($lexer);
        } elseif ($lexer->isNextTokenAny($literalTokens)) {
            $body .= $lexer->lookahead['value'];
        } elseif ($lexer->isNextToken(Lexer::T_SET_RANGE)) {
            $body .= '-';
        } elseif ($lexer->isNextToken(Lexer::T_ESCAPE_CHAR)) {
            $body .= '\\';
        } else {
            throw new ParserException('Illegal meta character detected in character class');
        }
    }

    return '[' . $body . ']';
}
/**
 * Parses the regex tokens supplied by the lexer into generator scopes.
 *
 * The lexer is advanced one token at a time and the lookahead token decides
 * what is built:
 *  - group open:   a nested Parser (sharing this lexer) parses the group and
 *                  its result is attached to the current head scope;
 *  - group close:  parsing of the current (sub) expression stops;
 *  - literals, character classes, short codes and unicode codes: a new
 *                  LiteralScope is created, filled and attached to the head;
 *  - quantifiers:  applied to the most recently built scope ($this->left);
 *  - choice bar:   a fresh head scope is started and attached to the result,
 *                  which is switched to the alternating strategy;
 *  - anything else is ignored.
 *
 * @access public
 *
 * @param boolean $sub passed as true by the nested group parser; not read by this method
 *
 * @return $this the current parser instance
 *
 * @throws ParserException when a sub parser raises a ParserException; it is rethrown
 *                         with the lexer position and the compressed expression added
 */
public function parse($sub = false)
{
    try {
        while ($this->lexer->moveNext()) {
            switch (true) {
                case $this->lexer->isNextToken(Lexer::T_GROUP_OPEN):
                    // a group is a new expression: parse it with a new parser instance
                    $groupParser = new Parser($this->lexer, new Scope(), new Scope());
                    $this->left  = $groupParser->parse(true)->getResult();
                    $this->head->attach($this->left);
                    break;

                case $this->lexer->isNextToken(Lexer::T_GROUP_CLOSE):
                    // group is finished: leave both the switch and the while loop
                    break 2;

                case $this->lexer->isNextTokenAny(array(Lexer::T_LITERAL_CHAR, Lexer::T_LITERAL_NUMERIC)):
                    // literal characters (abcd)
                    $this->left = new LiteralScope();
                    $this->left->addLiteral($this->lexer->lookahead['value']);
                    $this->head->attach($this->left);
                    break;

                case $this->lexer->isNextToken(Lexer::T_SET_OPEN):
                    // character classes [a-z]
                    $this->left = new LiteralScope();
                    self::createSubParser('character')->parse($this->left, $this->head, $this->lexer);
                    $this->head->attach($this->left);
                    break;

                case $this->lexer->isNextTokenAny(array(
                    Lexer::T_DOT,
                    Lexer::T_SHORT_D,
                    Lexer::T_SHORT_NOT_D,
                    Lexer::T_SHORT_W,
                    Lexer::T_SHORT_NOT_W,
                    Lexer::T_SHORT_S,
                    Lexer::T_SHORT_NOT_S,
                )):
                    // short codes (. \d \D \w \W \s \S)
                    $this->left = new LiteralScope();
                    self::createSubParser('short')->parse($this->left, $this->head, $this->lexer);
                    $this->head->attach($this->left);
                    break;

                case $this->lexer->isNextTokenAny(array(Lexer::T_SHORT_P, Lexer::T_SHORT_UNICODE_X, Lexer::T_SHORT_X)):
                    // unicode / hex short codes (\p{L} \x \X)
                    $this->left = new LiteralScope();
                    self::createSubParser('unicode')->parse($this->left, $this->head, $this->lexer);
                    $this->head->attach($this->left);
                    break;

                case $this->lexer->isNextTokenAny(array(
                    Lexer::T_QUANTIFIER_OPEN,
                    Lexer::T_QUANTIFIER_PLUS,
                    Lexer::T_QUANTIFIER_QUESTION,
                    Lexer::T_QUANTIFIER_STAR,
                )):
                    // quantifiers are handed the previously built scope ($this->left).
                    // NOTE(review): 'quantifer' is the key exactly as the original passes it;
                    // confirm the spelling createSubParser() expects before "fixing" it.
                    self::createSubParser('quantifer')->parse($this->left, $this->head, $this->lexer);
                    break;

                case $this->lexer->isNextToken(Lexer::T_CHOICE_BAR):
                    // alternation: the finished head becomes the left scope and a new
                    // head is started for the next alternative
                    $this->left = $this->head;
                    $this->head = new Scope();
                    $this->result->useAlternatingStrategy();
                    $this->result->attach($this->head);
                    break;

                default:
                    // ignore character
                    break;
            }
        }
    } catch (ParserException $e) {
        $pos        = $this->lexer->lookahead['position'];
        $compressed = $this->compress();

        // NOTE(review): the caught exception is not chained as "previous"; confirm the
        // ParserException constructor signature before passing $e along.
        throw new ParserException(sprintf('Error found STARTING at position %s after `%s` with msg %s ', $pos, $compressed, $e->getMessage()));
    }

    return $this;
}