/**
 * Tokenizes the whole segmented string and returns every token collected.
 *
 * Each pass calls `nextToken()` once, appends the resulting token to
 * `$this->_tokens`, chooses the state for the next pass and resets the
 * per-token scratch fields. The loop stops once the segmented string
 * reports end-of-stream.
 *
 * @throws \InvalidArgumentException When `nextToken()` returns false and the
 *                                   read position did not move.
 * @return HTMLToken[] An empty array if the input is already at end-of-stream,
 *                     otherwise the accumulated `$this->_tokens`.
 */
public function tokenizer()
{
    // Nothing left to read: return an empty list immediately.
    if ($this->_SegmentedString->eos()) {
        return array();
    }

    do {
        $startPos = $this->_SegmentedString->tell();
        $this->_startPos = $startPos;

        $result = $this->nextToken($this->_SegmentedString);
        $this->_state = static::DataState;
        $endPos = $this->_SegmentedString->tell();

        // A failed step that consumed nothing can never make progress.
        $consumed = $endPos - $startPos;
        if ($result === false && $consumed === 0) {
            throw new \InvalidArgumentException('Given invalid string or invalid statement.');
        }

        $startState = $this->_startState;
        $type = $this->_Token->getType();

        // When the pass started in RAWTEXT, RCDATA or script-data state and an
        // end tag name was buffered, the Character token handed back by
        // `nextToken()` also spans that end tag. The token is then cut at the
        // length of its character data and the stream is rewound to that point
        // so the end tag is tokenized by the next pass.
        $needsRewind = $type === HTMLToken::Character
            && $this->_bufferedEndTagName !== ''
            && in_array(
                $startState,
                array(static::RAWTEXTState, static::RCDATAState, static::ScriptDataState),
                true
            );

        if ($needsRewind) {
            $length = strlen($this->_Token->getData());
            // Keep only the part of the buffer belonging to the Character token.
            $this->_buffer = array_slice($this->_buffer, 0, $length);
            $tokenEnd = $startPos + $length;
        } else {
            $tokenEnd = $endPos;
        }

        $this->_compactBuffer($startPos, $tokenEnd, $type);
        $emitted = $this->_Token;
        $this->_tokens[] = $emitted;

        if ($needsRewind) {
            // Process again from the end of the character data for the EndTag.
            $this->_SegmentedString->seek($tokenEnd);
            $this->_state = $startState;
        } elseif ($type === HTMLToken::StartTag) {
            // FIXME: The tokenizer should do this work for us.
            $this->_updateStateFor($emitted->getTagName());
        } else {
            $this->_state = static::DataState;
        }

        // Remember the state this pass ended in and clear per-token scratch data.
        $this->_startState = $this->_state;
        $this->_buffer = array();
        $this->_bufferedEndTagName = '';
        $this->_temporaryBuffer = '';
        $this->_Token = new HTMLToken();
    } while (!$this->_SegmentedString->eos());

    return $this->_tokens;
}