/**
 * Reads one token from the source, starting the state machine in the given state.
 *
 * The loop inspects the current character ($this->_ch), switches state and, unless a
 * branch cleared $readNext, fetches the following character through $this->_read()
 * until the FINISHED state is reached.
 *
 * @param mixed $initialState One of the Morphes_Core_Model_Html_State constants, or a string.
 *                            A string is taken as the name of a raw text element: reading then
 *                            starts in INITIAL_RAWTEXT and stops at that element's end tag.
 *                            NOTE(review): this relies on the state constants not being
 *                            strings - confirm against Morphes_Core_Model_Html_State.
 * @param bool $allowWhitespace When false, whitespace met in the INITIAL or INITIAL_VALUE
 *                              state raises an exception instead of being skipped.
 * @return array Token with keys 'pos', 'line', 'column', 'end_pos', 'text', 'full_text' and,
 *               when a branch assigned it, 'type' (a Morphes_Core_Model_Html_Token constant).
 *               'text' is the source between 'pos' and 'end_pos'; 'full_text' is everything
 *               consumed by this call.
 * @throws Exception On unexpected whitespace, an unexpected character, end of input inside
 *                   CDATA, a comment or a quoted value, or an unsupported state.
 */
public function read($initialState, $allowWhitespace = true)
{
    // Initialised up front so the RAWTEXT branches never see an undefined variable.
    $rawElement = '';
    if (is_string($initialState)) {
        $state = Morphes_Core_Model_Html_State::INITIAL_RAWTEXT;
        $rawElement = strtolower($initialState);
    } else {
        $state = $initialState;
    }

    $startPos = $this->_pos;
    $token = array('pos' => $this->_pos, 'line' => $this->_line, 'column' => $this->_column);
    $readNext = true;

    while ($state != Morphes_Core_Model_Html_State::FINISHED) {
        // Work with the character code; false marks the end of input.
        $ch = $this->_ch !== false ? ord($this->_ch) : false;

        switch ($state) {
            case Morphes_Core_Model_Html_State::INITIAL_TEXT:
                if ($ch === false) {
                    $token['type'] = Morphes_Core_Model_Html_Token::EOF;
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                } elseif ($ch == ord('<')) {
                    // here we assume we have enough characters in read buffer. It is always the case for now,
                    // later it may break if we work with underlying stream, not memory buffer
                    if (mb_substr($this->_source, $this->_pos, 2) == '</') {
                        $this->_move(1);
                        $token['type'] = Morphes_Core_Model_Html_Token::TAG_END;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif (mb_substr($this->_source, $this->_pos, 8) == '<![CDATA') {
                        $this->_move(7);
                        $token['type'] = Morphes_Core_Model_Html_Token::CDATA;
                        $state = Morphes_Core_Model_Html_State::CDATA;
                    } elseif (mb_substr($this->_source, $this->_pos, 4) == '<!--') {
                        $this->_move(3);
                        $token['type'] = Morphes_Core_Model_Html_Token::COMMENT;
                        $state = Morphes_Core_Model_Html_State::COMMENT;
                    } else {
                        $token['type'] = Morphes_Core_Model_Html_Token::TAG_START;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    }
                } else {
                    $token['type'] = Morphes_Core_Model_Html_Token::TEXT;
                    $state = Morphes_Core_Model_Html_State::TEXT;
                }
                break;

            case Morphes_Core_Model_Html_State::INITIAL:
                if ($this->_isHtmlWhitespaceCode($ch)) {
                    // Leading whitespace is skipped unless the caller forbids it.
                    if (!$allowWhitespace) {
                        $token['end_pos'] = $this->_pos;
                        throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: whitespace not expected%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                    }
                } else {
                    // The token really starts here, after any skipped whitespace.
                    $token['pos'] = $this->_pos;
                    $token['line'] = $this->_line;
                    $token['column'] = $this->_column;
                    if ($ch === false) {
                        $token['type'] = Morphes_Core_Model_Html_Token::EOF;
                        $readNext = false;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif ($ch == ord('>')) {
                        $token['type'] = Morphes_Core_Model_Html_Token::TAG_CLOSE;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif ($ch == ord('/') && mb_substr($this->_source, $this->_pos, 2) == '/>') {
                        $this->_move(1);
                        $token['type'] = Morphes_Core_Model_Html_Token::TAG_SELF_CLOSE;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif ($ch == ord('=')) {
                        $token['type'] = Morphes_Core_Model_Html_Token::EQ;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif ($ch == ord('!') || ord('a') <= $ch && $ch <= ord('z') || ord('A') <= $ch && $ch <= ord('Z')) {
                        $state = Morphes_Core_Model_Html_State::NAME;
                        $token['type'] = Morphes_Core_Model_Html_Token::NAME;
                    } else {
                        $token['end_pos'] = $this->_pos;
                        throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected character%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                    }
                }
                break;

            case Morphes_Core_Model_Html_State::INITIAL_VALUE:
                if ($this->_isHtmlWhitespaceCode($ch)) {
                    if (!$allowWhitespace) {
                        $token['end_pos'] = $this->_pos;
                        throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: whitespace not expected%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                    }
                } else {
                    $token['pos'] = $this->_pos;
                    $token['line'] = $this->_line;
                    $token['column'] = $this->_column;
                    $token['type'] = Morphes_Core_Model_Html_Token::VALUE;
                    if ($ch === false) {
                        $token['type'] = Morphes_Core_Model_Html_Token::EOF;
                        $readNext = false;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    } elseif ($ch == ord("'")) {
                        $state = Morphes_Core_Model_Html_State::SINGLE_QUOTED_VALUE;
                        // Exclude the opening quote from the token text.
                        $token['pos']++;
                        $token['column']++;
                    } elseif ($ch == ord('"')) {
                        $state = Morphes_Core_Model_Html_State::DOUBLE_QUOTED_VALUE;
                        // Exclude the opening quote from the token text.
                        $token['pos']++;
                        $token['column']++;
                    } elseif (!($ch == ord('=') || $ch == ord('<') || $ch == ord('>') || $ch == ord('`'))) {
                        $state = Morphes_Core_Model_Html_State::UNQUOTED_VALUE;
                    } else {
                        $token['end_pos'] = $this->_pos;
                        throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected character%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                    }
                }
                break;

            case Morphes_Core_Model_Html_State::INITIAL_RAWTEXT:
                if ($ch === false) {
                    $token['type'] = Morphes_Core_Model_Html_Token::EOF;
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                } else {
                    $token['type'] = Morphes_Core_Model_Html_Token::TEXT;
                    $state = Morphes_Core_Model_Html_State::RAWTEXT;
                    // The raw text may be empty: the end tag can follow immediately.
                    if ($ch == ord('<') && $this->_isRawTextEndTag($rawElement)) {
                        $readNext = false;
                        $state = Morphes_Core_Model_Html_State::FINISHED;
                    }
                }
                break;

            case Morphes_Core_Model_Html_State::CDATA:
                if ($ch === false) {
                    $token['end_pos'] = $this->_pos;
                    throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected end of text%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                } elseif ($ch == ord(']') && mb_substr($this->_source, $this->_pos, 3) == ']]>') {
                    $this->_move(2);
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::COMMENT:
                if ($ch === false) {
                    $token['end_pos'] = $this->_pos;
                    throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected end of text%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                } elseif ($ch == ord('-') && mb_substr($this->_source, $this->_pos, 3) == '-->') {
                    $this->_move(2);
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::TEXT:
                if ($ch === false || $ch == ord('<')) {
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::RAWTEXT:
                if ($ch === false) {
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                } elseif ($ch == ord('<') && $this->_isRawTextEndTag($rawElement)) {
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::NAME:
                if ($ch === false || !($ch == ord('!') || ord('a') <= $ch && $ch <= ord('z') || ord('A') <= $ch && $ch <= ord('Z') || $ch == ord('_') || $ch == ord('-') || $ch == ord(':') || ord('0') <= $ch && $ch <= ord('9'))) {
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::UNQUOTED_VALUE:
                if ($ch === false || $ch == ord('"') || $ch == ord("'") || $this->_isHtmlWhitespaceCode($ch) || $ch == ord('=') || $ch == ord('<') || $ch == ord('>') || $ch == ord('`')) {
                    $readNext = false;
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                }
                break;

            case Morphes_Core_Model_Html_State::SINGLE_QUOTED_VALUE:
                if ($ch === false) {
                    $token['end_pos'] = $this->_pos;
                    throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected end of text%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                } elseif ($ch == ord("'")) {
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                    // Record the end before the closing quote is consumed, so it stays out of 'text'.
                    $token['end_pos'] = $this->_pos;
                }
                break;

            case Morphes_Core_Model_Html_State::DOUBLE_QUOTED_VALUE:
                if ($ch === false) {
                    $token['end_pos'] = $this->_pos;
                    throw new Exception(Mage::helper('morphes_core')->__('HTML read error %s: unexpected end of text%s', Morphes_Core_Model_Html_Token::getPosition($token), $this->getSourceAt($token)));
                } elseif ($ch == ord('"')) {
                    $state = Morphes_Core_Model_Html_State::FINISHED;
                    // Record the end before the closing quote is consumed, so it stays out of 'text'.
                    $token['end_pos'] = $this->_pos;
                }
                break;

            default:
                throw new Exception('Not implemented');
        }

        if ($readNext) {
            $this->_ch = $this->_read();
        }
    }

    if (!isset($token['end_pos'])) {
        $token['end_pos'] = $this->_pos;
    }
    $token['text'] = $token['end_pos'] == $token['pos']
        ? ''
        : mb_substr($this->_source, $token['pos'], $token['end_pos'] - $token['pos']);
    $token['full_text'] = $this->_pos == $startPos
        ? ''
        : mb_substr($this->_source, $startPos, $this->_pos - $startPos);

    return $token;
}

/**
 * Tells whether a character code is one of the whitespace characters this reader skips:
 * space, carriage return, tab, line feed or form feed.
 *
 * @param int|bool $code Character code as returned by ord(), or false at end of input.
 * @return bool
 */
protected function _isHtmlWhitespaceCode($code)
{
    return in_array($code, array(ord(' '), ord("\r"), ord("\t"), ord("\n"), ord("\f")), true);
}

/**
 * Tells whether the source at the current position holds the end tag of the given raw text
 * element: "</" followed by the element name (compared in lower case) and then by ">", "/"
 * or whitespace. An end tag name that runs up to the very end of the source is not accepted.
 *
 * @param string $rawElement Lower-cased element name.
 * @return bool
 */
protected function _isRawTextEndTag($rawElement)
{
    $tagLength = 2 + mb_strlen($rawElement);
    if (strtolower(mb_substr($this->_source, $this->_pos, $tagLength)) !== '</' . $rawElement) {
        return false;
    }
    if (mb_strlen($this->_source) <= $this->_pos + $tagLength) {
        return false;
    }

    // The character right after the name decides; the current character is always '<' here,
    // so it must not be the one tested for whitespace.
    $nextCh = ord(mb_substr($this->_source, $this->_pos + $tagLength, 1));

    return $nextCh == ord('>') || $nextCh == ord('/') || $this->_isHtmlWhitespaceCode($nextCh);
}
/**
 * Parses a single element: its name, its attributes, and - unless it is self-closed or
 * void - its child content and end tag. Each parsed piece is reported through the
 * _process*() / _before*() / _after*() hooks of this class.
 *
 * @param mixed $parentContent Handed unchanged to every hook.
 * @return mixed Whatever _afterParsingElement() returns.
 * @throws Exception When the tag is not terminated by a close or self-close token, when
 *                   the child content is not followed by an end tag, or when the end tag
 *                   names an element that is not currently open.
 */
public function parseElement($parentContent = null)
{
    $element = $this->_beforeParsingElement($parentContent);

    // Element name; whitespace before it is not allowed.
    $this->_read(Morphes_Core_Model_Html_State::INITIAL, Morphes_Core_Model_Html_Token::NAME, false);
    $tagName = $this->_token['text'];
    array_push($this->_openedElements, $tagName);
    $isVoid = Morphes_Core_Model_Html_Token::isVoid($tagName);
    $isRawText = Morphes_Core_Model_Html_Token::isRawText($tagName);
    $this->_processElementName($parentContent, $element, $this->_token, $tagName, $isVoid, $isRawText);

    // Attributes: keep consuming NAME and EQ tokens; any other token ends the list.
    for (;;) {
        $readType = $this->_read(Morphes_Core_Model_Html_State::INITIAL);
        if ($readType != Morphes_Core_Model_Html_Token::NAME
            && $this->_token['type'] != Morphes_Core_Model_Html_Token::EQ
        ) {
            break;
        }

        if ($this->_token['type'] == Morphes_Core_Model_Html_Token::EQ) {
            $this->_processAttributeEq($parentContent, $element, $this->_token);
            $this->_read(
                Morphes_Core_Model_Html_State::INITIAL_VALUE,
                Morphes_Core_Model_Html_Token::VALUE,
                true
            );
            $value = $this->_token['text'];
            $this->_processAttributeValue($parentContent, $element, $this->_token, $value);
        } else {
            $name = $this->_token['text'];
            $this->_processAttributeName($parentContent, $element, $this->_token, $name);
        }
    }

    $closingType = $this->_token['type'];
    if ($closingType == Morphes_Core_Model_Html_Token::TAG_SELF_CLOSE) {
        $this->_processElementClose($parentContent, $element, $this->_token);
        array_pop($this->_openedElements);
    } elseif ($closingType == Morphes_Core_Model_Html_Token::TAG_CLOSE) {
        $this->_processElementClose($parentContent, $element, $this->_token);
        if ($isVoid) {
            // Void elements have neither content nor an end tag.
            array_pop($this->_openedElements);
        } else {
            // Raw text elements are read up to their own end tag; others as regular text.
            $this->_read($isRawText ? $tagName : Morphes_Core_Model_Html_State::INITIAL_TEXT);
            $this->_afterParsingChildContent(
                $parentContent,
                $element,
                $this->parseContent($this->_beforeParsingChildContent($parentContent, $element))
            );

            if ($this->_token['type'] != Morphes_Core_Model_Html_Token::TAG_END) {
                throw new Exception(Mage::helper('morphes_core')->__('HTML parser error %s: %s expected%s', Morphes_Core_Model_Html_Token::getPosition($this->_token), Morphes_Core_Model_Html_Token::getName(Morphes_Core_Model_Html_Token::TAG_END), $this->getReader()->getSourceAt($this->_token)));
            }

            $this->_read(Morphes_Core_Model_Html_State::INITIAL, Morphes_Core_Model_Html_Token::NAME, false);
            array_pop($this->_openedElements);

            if ($this->_token['text'] == $tagName) {
                $this->_read(
                    Morphes_Core_Model_Html_State::INITIAL,
                    Morphes_Core_Model_Html_Token::TAG_CLOSE,
                    false
                );
                $this->_processElementEnd($parentContent, $element, $this->_token, $tagName);
            } else {
                if (!in_array($this->_token['text'], $this->_openedElements)) {
                    throw new Exception(Mage::helper('morphes_core')->__('HTML parser error %s: closing tag for %s expected%s', Morphes_Core_Model_Html_Token::getPosition($this->_token), $tagName, $this->getReader()->getSourceAt($this->_token)));
                }
                // The end tag names an element that is still open further up: step the
                // reader back and report this element as ended.
                // NOTE(review): presumably the step back lets an enclosing parseElement()
                // call re-read that end tag - confirm against the reader's move().
                $this->getReader()->move(-3 - mb_strlen($this->_token['text']));
                $this->_processElementEnd($parentContent, $element, $this->_token, $tagName);
            }
        }
    } else {
        throw new Exception(Mage::helper('morphes_core')->__('HTML parser error %s: %s or %s expected%s', Morphes_Core_Model_Html_Token::getPosition($this->_token), Morphes_Core_Model_Html_Token::getName(Morphes_Core_Model_Html_Token::TAG_SELF_CLOSE), Morphes_Core_Model_Html_Token::getName(Morphes_Core_Model_Html_Token::TAG_CLOSE), $this->getReader()->getSourceAt($this->_token)));
    }

    $this->_read(Morphes_Core_Model_Html_State::INITIAL_TEXT);

    return $this->_afterParsingElement($parentContent, $element);
}