/**
 * Parses a full HTML document.
 *
 * The markup is first round-tripped through DOMDocument (load, then
 * serialize) to clean up invalid HTML, and the serialized result is what
 * gets handed to the HTML5 tokenizer.
 *
 * @param $text    HTML text to parse
 * @param $builder Custom builder implementation
 * @return Parsed HTML as DOMDocument
 */
public static function parse($text, $builder = null)
{
    // Input that validates as UTF-8 gets an XML encoding declaration
    // prepended as an encoding hint for loadHTML(); anything else is
    // loaded untouched.
    $isUtf8 = mb_detect_encoding($text, "UTF-8", true) === "UTF-8";
    $markup = $isUtf8 ? '<?xml encoding="UTF-8" ?>' . $text : $text;

    // Cleanup invalid HTML (libxml warnings are deliberately silenced).
    $cleaner = new DOMDocument();
    @$cleaner->loadHTML($markup);
    $cleaned = $cleaner->saveHTML();

    $parser = new HTML5_Tokenizer($cleaned, $builder);
    $parser->parse();

    return $parser->save();
}
/**
 * Primes the tokenizer state and then runs the inherited parse().
 *
 * The content model is taken from $_contentModelFlag. When $_lastStartFlag
 * holds a truthy value, the current token is replaced by a start-tag token
 * carrying that value as its name.
 *
 * @return mixed Whatever the parent implementation of parse() returns
 */
public function parse()
{
    $this->content_model = $this->_contentModelFlag;

    $lastStart = $this->_lastStartFlag;
    if ($lastStart) {
        $this->token = array(
            'type' => self::STARTTAG,
            'name' => $lastStart,
        );
    }

    return parent::parse();
}
/**
 * Runs one tree-construction test case.
 *
 * Tokenizes $test['data'] (as a fragment when $test['document-fragment'] is
 * set, as a full document otherwise), adds the elapsed parse time to
 * $GLOBALS['TIME'], and asserts that the serialized DOM is identical to
 * $test['document'].
 *
 * @param array $test Test case; reads the keys 'data', 'document' and the
 *                    optional 'document-fragment'
 */
public function invoke($test)
{
    // this is totally the wrong interface to use, but
    // for now we need testing
    $tokenizer = new HTML5_Tokenizer($test['data']);

    // Subtract the start time now and add the end time afterwards, so the
    // global counter grows by the duration of the parse.
    $GLOBALS['TIME'] -= get_microtime();
    if (!isset($test['document-fragment'])) {
        $tokenizer->parse();
    } else {
        $tokenizer->parseFragment($test['document-fragment']);
    }
    $GLOBALS['TIME'] += get_microtime();

    $expected = $test['document'];
    $actual = HTML5_TestData::strDom($tokenizer->save());
    $this->assertIdentical($expected, $actual, $test);
}
/**
 * Loads an HTML string
 *
 * The string is normalised to UTF-8 (using mbstring detection and any
 * charset declared in a meta tag), optionally run through the embedded-PHP
 * pass, parsed into a DOM, and stored on the instance together with the
 * quirks-mode flag and a new Frame_Tree.
 *
 * Parse errors are stored in the global array _dompdf_warnings.
 *
 * @todo use the $encoding variable
 *
 * @param string $str      HTML text to load
 * @param string $encoding Not used yet: any value passed in is overwritten
 *                         by the encoding determined below
 */
function load_html($str, $encoding = null)
{
    $this->save_locale();

    // FIXME: Determine character encoding, switch to UTF8, update meta tag.
    // Need better http/file stream encoding detection, currently relies on
    // text or meta tag.
    mb_detect_order('auto');

    // Detect once; the result is needed both for the UTF-8 test and for
    // choosing the fallback source encoding.
    $detected = mb_detect_encoding($str);

    if ($detected !== 'UTF-8') {
        $metatags = array(
            '@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i',
            '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i',
            '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i',
        );

        // Stop at the first pattern that matches; group 1 (when present)
        // is the declared charset.
        $matches = array();
        foreach ($metatags as $metatag) {
            if (preg_match($metatag, $str, $matches)) {
                break;
            }
        }

        $has_charset = isset($matches[1]);
        if ($has_charset) {
            $encoding = strtoupper($matches[1]);
        } elseif ($detected == '') {
            // Detection failed (false/empty) and nothing is declared:
            // treat the input as UTF-8 already.
            $encoding = 'UTF-8';
        } else {
            $encoding = 'auto';
        }

        if ($encoding !== 'UTF-8') {
            $str = mb_convert_encoding($str, 'UTF-8', $encoding);
        }

        // Make the document's own declaration agree with the converted text.
        if ($has_charset) {
            $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
        } else {
            $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
        }
    } else {
        $encoding = 'UTF-8';
    }

    // remove BOM mark from UTF-8, it's treated as document text by DOMDocument
    // FIXME: roll this into the encoding detection using UTF-8/16/32 BOM
    // (http://us2.php.net/manual/en/function.mb-detect-encoding.php#91051)?
    if (substr($str, 0, 3) === "\xEF\xBB\xBF") {
        $str = substr($str, 3);
    }

    // Parse embedded php, first-pass
    if ($this->get_option("enable_php")) {
        // SECURITY NOTE(review): this executes any PHP contained in the
        // document. It must only be enabled for fully trusted input.
        ob_start();
        eval("?" . ">{$str}");
        $str = ob_get_clean();
    }

    // if the document contains non utf-8 with a utf-8 meta tag chars and was
    // detected as utf-8 by mbstring, problems could happen.
    // http://devzone.zend.com/article/8855
    if ($encoding !== 'UTF-8') {
        $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
        $str = preg_replace($re, '<meta $1$3>', $str);
    }

    // Store parsing warnings as messages
    set_error_handler("record_warnings");

    // @todo Take the quirksmode into account
    // http://hsivonen.iki.fi/doctype/
    // https://developer.mozilla.org/en/mozilla's_quirks_mode
    $quirksmode = false;

    if ($this->get_option("enable_html5_parser")) {
        $tokenizer = new HTML5_Tokenizer($str);
        $tokenizer->parse();
        $doc = $tokenizer->save();

        // Remove #text children nodes in nodes that shouldn't have
        $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
        foreach ($tag_names as $tag_name) {
            $nodes = $doc->getElementsByTagName($tag_name);
            foreach ($nodes as $node) {
                self::remove_text_nodes($node);
            }
        }

        $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
    } else {
        // loadHTML assumes ISO-8859-1 unless otherwise specified, but there are
        // bugs in how DOMDocument determines the actual encoding. Converting to
        // HTML-ENTITIES prior to import appears to resolve the issue.
        // http://devzone.zend.com/1538/php-dom-xml-extension-encoding-processing/ (see #4)
        // http://stackoverflow.com/a/11310258/264628
        $doc = new DOMDocument();
        $doc->preserveWhiteSpace = true;
        $doc->loadHTML(mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8'));

        $trimmed = ltrim($str);
        if (preg_match("/^(.+)<!doctype/i", $trimmed) || !preg_match("/^<!doctype/i", $trimmed)) {
            // Text before the doctype, or no leading doctype at all.
            $quirksmode = true;
        } else {
            // The doctype node can be missing if libxml could not parse the
            // declaration; treat that like an empty public identifier.
            $doctype = $doc->doctype;
            $public_id = $doctype ? $doctype->publicId : '';

            // Anything whose public identifier is not XHTML is quirks mode.
            // NOTE(review): an HTML5 "<!DOCTYPE html>" has no public
            // identifier and therefore lands in quirks mode too. The
            // previous code reset the flag to false for that case but the
            // XHTML test that followed always overrode it. The effective
            // result is kept here because flipping it changes layout;
            // confirm the intended behaviour before altering it.
            $quirksmode = !preg_match("/xhtml/i", $public_id);
        }
    }

    $this->_xml = $doc;
    $this->_quirksmode = $quirksmode;

    $this->_tree = new Frame_Tree($this->_xml);

    restore_error_handler();

    $this->restore_locale();
}
/**
 * Parses a full HTML document.
 *
 * Thin convenience wrapper: builds an HTML5_Tokenizer for the text, runs it,
 * and hands back the saved result.
 *
 * @param $text    HTML text to parse
 * @param $builder Custom builder implementation
 * @return Parsed HTML as DOMDocument
 */
public static function parse($text, $builder = null)
{
    $parser = new HTML5_Tokenizer($text, $builder);
    $parser->parse();

    return $parser->save();
}
/**
 * Loads an HTML string.
 *
 * The string is normalised to UTF-8 (using mbstring detection and any
 * charset declared in a meta tag), optionally run through the embedded-PHP
 * pass, parsed into a DOM, and stored on the instance together with the
 * quirks-mode flag and a new Frame_Tree.
 *
 * PHP errors raised while parsing are routed to the "record_warnings"
 * error handler for the duration of the parse.
 *
 * @param string $str      HTML text to load
 * @param string $encoding Currently ignored: any value passed in is
 *                         overwritten by the encoding determined below
 */
function load_html($str, $encoding = null)
{
    $this->save_locale();

    mb_detect_order('auto');

    // Detect once; the result is needed both for the UTF-8 test and for
    // choosing the fallback source encoding.
    $detected = mb_detect_encoding($str);

    if ($detected !== 'UTF-8') {
        $metatags = array(
            '@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i',
            '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i',
            '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i',
        );

        // Stop at the first pattern that matches; group 1 (when present)
        // is the declared charset.
        $matches = array();
        foreach ($metatags as $metatag) {
            if (preg_match($metatag, $str, $matches)) {
                break;
            }
        }

        $has_charset = isset($matches[1]);
        if ($has_charset) {
            $encoding = strtoupper($matches[1]);
        } elseif ($detected == '') {
            // Detection failed (false/empty) and nothing is declared:
            // treat the input as UTF-8 already.
            $encoding = 'UTF-8';
        } else {
            $encoding = 'auto';
        }

        if ($encoding !== 'UTF-8') {
            $str = mb_convert_encoding($str, 'UTF-8', $encoding);
        }

        // Make the document's own declaration agree with the converted text.
        if ($has_charset) {
            $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
        } else {
            $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
        }
    } else {
        $encoding = 'UTF-8';
    }

    // Strip a leading UTF-8 byte order mark.
    if (substr($str, 0, 3) === "\xEF\xBB\xBF") {
        $str = substr($str, 3);
    }

    // First pass: evaluate PHP embedded in the document and keep its output.
    if (DOMPDF_ENABLE_PHP) {
        // SECURITY NOTE(review): this executes any PHP contained in the
        // document. It must only be enabled for fully trusted input.
        ob_start();
        eval("?" . ">{$str}");
        $str = ob_get_clean();
    }

    // When the source was not UTF-8, drop the charset part of meta tags.
    if ($encoding !== 'UTF-8') {
        $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
        $str = preg_replace($re, '<meta $1$3>', $str);
    }

    set_error_handler("record_warnings");

    $quirksmode = false;

    if (DOMPDF_ENABLE_HTML5PARSER) {
        $tokenizer = new HTML5_Tokenizer($str);
        $tokenizer->parse();
        $doc = $tokenizer->save();

        // Strip text nodes from these structural elements.
        $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
        foreach ($tag_names as $tag_name) {
            $nodes = $doc->getElementsByTagName($tag_name);
            foreach ($nodes as $node) {
                self::remove_text_nodes($node);
            }
        }

        $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
    } else {
        $doc = new DOMDocument();
        $doc->preserveWhiteSpace = true;
        $doc->loadHTML($str);

        if (preg_match("/^(.+)<(!doctype|html)/i", ltrim($str))) {
            // Something precedes the doctype / html tag on the first line.
            $quirksmode = true;
        } else {
            // The doctype node is absent when loadHTML() could not produce
            // one (for example on input it failed to parse); treat that
            // like an empty public identifier and avoid dereferencing null.
            $doctype = $doc->doctype;
            $public_id = $doctype ? $doctype->publicId : '';

            // Anything whose public identifier is not XHTML is quirks mode.
            // NOTE(review): an HTML5 "<!DOCTYPE html>" has no public
            // identifier and therefore lands in quirks mode too. The
            // previous code reset the flag to false for that case but the
            // XHTML test that followed always overrode it. The effective
            // result is kept here because flipping it changes layout;
            // confirm the intended behaviour before altering it.
            $quirksmode = !preg_match("/xhtml/i", $public_id);
        }
    }

    $this->_xml = $doc;
    $this->_quirksmode = $quirksmode;

    $this->_tree = new Frame_Tree($this->_xml);

    restore_error_handler();

    $this->restore_locale();
}