/**
 * Loads an HTML string.
 *
 * The markup is normalised to UTF-8 (using mbstring detection and any
 * charset declared in a <meta> tag), optionally run through the embedded
 * PHP first pass, and then parsed into a DOM document from which the frame
 * tree is built.
 *
 * Parse errors are stored in the global array _dompdf_warnings.
 *
 * The previous error handler and the saved locale are restored even when
 * parsing throws.
 *
 * @todo use the $encoding variable
 *
 * @param string $str      HTML text to load
 * @param string $encoding Not used yet: any value passed in is overwritten
 *                         by the encoding determined from the content
 */
public function loadHtml($str, $encoding = null)
{
    $this->saveLocale();

    try {
        // FIXME: Determine character encoding, switch to UTF8, update meta tag.
        // Need better http/file stream encoding detection, currently relies on
        // text or meta tag.
        mb_detect_order('auto');
        $detected = mb_detect_encoding($str);

        if ($detected !== 'UTF-8') {
            $metatags = array(
                '@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i',
                '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i',
                '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i',
            );

            // The first pattern that matches wins, even when it captured no charset.
            foreach ($metatags as $metatag) {
                if (preg_match($metatag, $str, $matches)) {
                    break;
                }
            }

            $hasDeclaredCharset = isset($matches[1]);

            if ($hasDeclaredCharset) {
                $encoding = strtoupper($matches[1]);
            } elseif ($detected === false || $detected === '') {
                // Nothing detected and nothing declared: assume UTF-8.
                $encoding = 'UTF-8';
            } else {
                $encoding = 'auto';
            }

            if ($encoding !== 'UTF-8') {
                $str = mb_convert_encoding($str, 'UTF-8', $encoding);
            }

            // Make the document advertise the encoding it now really uses.
            if ($hasDeclaredCharset) {
                $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
            } else {
                $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
            }
        } else {
            $encoding = 'UTF-8';
        }

        // remove BOM mark from UTF-8, it's treated as document text by DOMDocument
        // FIXME: roll this into the encoding detection using UTF-8/16/32 BOM
        // (http://us2.php.net/manual/en/function.mb-detect-encoding.php#91051)?
        if (substr($str, 0, 3) === chr(0xef) . chr(0xbb) . chr(0xbf)) {
            $str = substr($str, 3);
        }

        // Parse embedded php, first-pass
        if ($this->options->isPhpEnabled()) {
            // SECURITY: this executes any PHP contained in the document. Only
            // enable the option for fully trusted HTML.
            ob_start();
            try {
                eval("?" . ">{$str}");
                $str = ob_get_contents();
            } finally {
                // Always close the buffer opened above, even if the evaluated
                // code throws, so output buffering is not left unbalanced.
                ob_end_clean();
            }
        }

        // if the document contains non utf-8 with a utf-8 meta tag chars and was
        // detected as utf-8 by mbstring, problems could happen.
        // http://devzone.zend.com/article/8855
        if ($encoding !== 'UTF-8') {
            $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
            $str = preg_replace($re, '<meta $1$3>', $str);
        }

        // Store parsing warnings as messages
        set_error_handler(array("\\Dompdf\\Helpers", "record_warnings"));

        try {
            // @todo Take the quirksmode into account
            // http://hsivonen.iki.fi/doctype/
            // https://developer.mozilla.org/en/mozilla's_quirks_mode
            $quirksmode = false;

            if ($this->options->isHtml5ParserEnabled()) {
                $tokenizer = new HTML5_Tokenizer($str);
                $tokenizer->parse();
                $doc = $tokenizer->save();

                // Remove #text children nodes in nodes that shouldn't have
                $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
                foreach ($tag_names as $tag_name) {
                    $nodes = $doc->getElementsByTagName($tag_name);

                    foreach ($nodes as $node) {
                        self::remove_text_nodes($node);
                    }
                }

                $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
            } else {
                // loadHTML assumes ISO-8859-1 unless otherwise specified, but there are
                // bugs in how DOMDocument determines the actual encoding. Converting to
                // HTML-ENTITIES prior to import appears to resolve the issue.
                // http://devzone.zend.com/1538/php-dom-xml-extension-encoding-processing/ (see #4)
                // http://stackoverflow.com/a/11310258/264628
                $doc = new DOMDocument();
                $doc->preserveWhiteSpace = true;
                $doc->loadHTML(mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8'));

                $trimmed = ltrim($str);

                if (preg_match("/^(.+)<!doctype/i", $trimmed)) {
                    // If some text is before the doctype, we are in quirksmode
                    $quirksmode = true;
                } elseif (!preg_match("/^<!doctype/i", $trimmed)) {
                    // If no doctype is provided, we are in quirksmode
                    $quirksmode = true;
                } else {
                    $doctype = $doc->doctype;

                    if ($doctype === null) {
                        // The parser produced no doctype node: treat as quirks.
                        $quirksmode = true;
                    } elseif (!$doctype->publicId && !$doctype->systemId) {
                        // HTML5 <!DOCTYPE html>. This must not fall through to the
                        // XHTML test below: its empty public id never matches
                        // "xhtml" and would wrongly switch quirks mode back on.
                        $quirksmode = false;
                    } else {
                        // not XHTML => quirksmode
                        $quirksmode = !preg_match("/xhtml/i", $doctype->publicId);
                    }
                }
            }

            $this->dom = $doc;
            $this->quirksmode = $quirksmode;

            $this->tree = new FrameTree($this->dom);
        } finally {
            // Undo set_error_handler() above no matter how parsing ended.
            restore_error_handler();
        }
    } finally {
        $this->restoreLocale();
    }
}