/**
 * Build an AST from raw HTML, taking the markup as-is (no repair of
 * malformed input is attempted).
 *
 * Resets the node stack, installs a fresh PHPricot_Document as both the
 * document root and the current parent, then feeds the markup through an
 * html_parser instance whose callbacks are this object's text(), startTag(),
 * endTag() and comment() methods.
 *
 * @param string $html
 * @return PHPricot_Document
 */
public function parse($html)
{
    // Start from a clean slate: empty stack, new root that is also the
    // node new children get attached to.
    $this->stack = array();
    $root = new PHPricot_Document();
    $this->currentParent = $root;
    $this->document = $root;

    // Keep the handle on the instance as well as in a local.
    $parser = html_parser_create();
    $this->parser = $parser;

    // Wire every parser event to the matching method on this object.
    $onText = array($this, "text");
    $onStartTag = array($this, "startTag");
    $onEndTag = array($this, "endTag");
    $onComment = array($this, "comment");

    html_parser_data_handler($parser, $onText);
    html_parser_starttag_handler($parser, $onStartTag);
    html_parser_endtag_handler($parser, $onEndTag);
    html_parser_comment_handler($parser, $onComment);

    html_parser_parse($parser, $html);
    html_parser_free($parser);

    return $this->document;
}
/**
 * Parses HTML into a flat list of events. Each entry has one of these shapes:
 *   start tag: array(H_START, tagName, tagAttrs)
 *   end tag:   array(H_END, tagName)
 *   text:      array(H_TEXT, text)
 *   comment:   array(H_COMMENT, comment)
 * Tag names are UPPERCASED.
 *
 * The H_* constants are defined globally on first use if they do not
 * already exist.
 *
 * @param string $html Source html
 * @return array List of event arrays in document order
 */
public function parseHtml($html)
{
    // Define the event-type constants once; the defined() guard avoids
    // "constant already defined" notices if they exist already.
    $eventTypes = array(
        'H_START' => 0,
        'H_END' => 1,
        'H_TEXT' => 2,
        'H_COMMENT' => 3,
    );
    foreach ($eventTypes as $constName => $constValue) {
        if (!defined($constName)) {
            define($constName, $constValue);
        }
    }

    // Events accumulate in a local array captured by reference, so nested or
    // repeated calls cannot clobber each other through shared global state.
    $events = array();

    // Closures replace create_function(), which was eval-based and has been
    // removed from PHP 8.
    $onStart = function ($tag, $attrs) use (&$events) {
        $events[] = array(H_START, $tag, $attrs);
    };
    $onEnd = function ($tag) use (&$events) {
        $events[] = array(H_END, $tag);
    };
    $onText = function ($text) use (&$events) {
        $events[] = array(H_TEXT, $text);
    };
    $onComment = function ($text) use (&$events) {
        $events[] = array(H_COMMENT, $text);
    };

    $p = html_parser_create();
    html_parser_starttag_handler($p, $onStart);
    html_parser_endtag_handler($p, $onEnd);
    html_parser_data_handler($p, $onText);
    html_parser_comment_handler($p, $onComment);
    // Third argument kept from the original call; presumably marks this
    // chunk as the final one — TODO confirm against the extension docs.
    html_parser_parse($p, $html, 1);
    html_parser_free($p);

    return $events;
}