Esempio n. 1
1
 /**
  * Parses a full HTML document.
  *
  * The markup is first round-tripped through DOMDocument (loadHTML followed
  * by saveHTML) to clean up invalid HTML, and the cleaned-up markup is then
  * handed to HTML5_Tokenizer.
  *
  * @param string $text    HTML text to parse
  * @param mixed  $builder Custom builder implementation, passed through to HTML5_Tokenizer
  * @return mixed Parsed HTML as DOMDocument (the value returned by HTML5_Tokenizer::save())
  */
 public static function parse($text, $builder = null)
 {
     // Cleanup invalid HTML
     $doc = new DOMDocument();

     // When the text is valid UTF-8, prefix an explicit encoding declaration
     // before giving it to loadHTML.
     $markup = $text;
     if (mb_detect_encoding($text, "UTF-8", true) === "UTF-8") {
         $markup = '<?xml encoding="UTF-8" ?>' . $text;
     }

     // Route libxml's parse complaints into its internal buffer instead of
     // silencing every kind of error with the @ operator.
     $previous = libxml_use_internal_errors(true);
     // loadHTML rejects an empty source (it throws on PHP 8, which @ could
     // never suppress), so an empty input simply leaves the document empty.
     if ((string) $markup !== '') {
         $doc->loadHTML($markup);
     }
     if (!$previous) {
         // Only discard the buffer when the caller was not collecting
         // libxml errors itself.
         libxml_clear_errors();
     }
     libxml_use_internal_errors($previous);

     $text = $doc->saveHTML();
     $tokenizer = new HTML5_Tokenizer($text, $builder);
     $tokenizer->parse();
     return $tokenizer->save();
 }
Esempio n. 2
0
 /**
  * Presets the content model (and, when a last start tag is configured, a
  * synthetic start-tag token) before delegating to the parent parser.
  *
  * @return mixed Whatever the parent implementation of parse() returns
  */
 public function parse()
 {
     $this->content_model = $this->_contentModelFlag;

     $lastStartTag = $this->_lastStartFlag;
     if (!$lastStartTag) {
         // Nothing to seed: go straight to the parent parser.
         return parent::parse();
     }

     // Seed the current token with a start tag named after the configured flag.
     $this->token = array(
         'type' => self::STARTTAG,
         'name' => $lastStartTag,
     );

     return parent::parse();
 }
Esempio n. 3
0
 /**
  * Runs a single test case through the tokenizer and compares the
  * serialized result with the expected document.
  *
  * The time spent parsing is accumulated into $GLOBALS['TIME'].
  *
  * @param array $test Test case; reads the 'data' and 'document' keys and the
  *                    optional 'document-fragment' key
  */
 public function invoke($test)
 {
     // Driving the tokenizer directly is not the right interface for this,
     // but it is what the tests need for the time being.
     $tokenizer = new HTML5_Tokenizer($test['data']);
     $isFragment = isset($test['document-fragment']);

     // Subtract the start time and add the end time so that only the
     // parsing itself contributes to the running total.
     $GLOBALS['TIME'] -= get_microtime();
     if (!$isFragment) {
         $tokenizer->parse();
     } else {
         $tokenizer->parseFragment($test['document-fragment']);
     }
     $GLOBALS['TIME'] += get_microtime();

     $actual = HTML5_TestData::strDom($tokenizer->save());
     $this->assertIdentical($test['document'], $actual, $test);
 }
Esempio n. 4
0
 /**
  * Loads an HTML string and builds the frame tree from it.
  *
  * The string is converted to UTF-8 (based on mbstring detection and any
  * charset declared in a <meta> tag), a leading UTF-8 BOM is stripped,
  * embedded PHP is evaluated when the "enable_php" option is set, and the
  * result is parsed with HTML5_Tokenizer or DOMDocument depending on the
  * "enable_html5_parser" option. The parsed document, the quirks-mode flag
  * and a new Frame_Tree are stored on the instance.
  *
  * Parse errors are stored in the global array _dompdf_warnings.
  * @todo use the $encoding variable
  *
  * @param string $str      HTML text to load
  * @param string $encoding Not used yet (always overwritten by the detected encoding)
  */
 function load_html($str, $encoding = null)
 {
     // NOTE(review): nothing below restores the locale or the error handler
     // if an exception is thrown part-way through -- confirm whether callers
     // rely on that.
     $this->save_locale();
     // FIXME: Determine character encoding, switch to UTF8, update meta tag. Need better http/file stream encoding detection, currently relies on text or meta tag.
     mb_detect_order('auto');
     if (mb_detect_encoding($str) !== 'UTF-8') {
         // Look for a charset declared in a <meta> tag; the first pattern
         // that matches wins and leaves the charset in $matches[1].
         $metatags = array('@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i', '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i', '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i');
         foreach ($metatags as $metatag) {
             if (preg_match($metatag, $str, $matches)) {
                 break;
             }
         }
         if (mb_detect_encoding($str) == '') {
             // Detection failed: trust the meta tag, otherwise assume UTF-8
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = 'UTF-8';
             }
         } else {
             // Detected a non-UTF-8 encoding: trust the meta tag, otherwise
             // let mbstring pick the source encoding during conversion
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = 'auto';
             }
         }
         if ($encoding !== 'UTF-8') {
             $str = mb_convert_encoding($str, 'UTF-8', $encoding);
         }
         // Make the document's declared charset agree with the converted text
         if (isset($matches[1])) {
             $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
         } else {
             $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
         }
     } else {
         $encoding = 'UTF-8';
     }
     // remove BOM mark from UTF-8, it's treated as document text by DOMDocument
     // FIXME: roll this into the encoding detection using UTF-8/16/32 BOM (http://us2.php.net/manual/en/function.mb-detect-encoding.php#91051)?
     if (substr($str, 0, 3) == chr(0xef) . chr(0xbb) . chr(0xbf)) {
         $str = substr($str, 3);
     }
     // Parse embedded php, first-pass
     // SECURITY: this executes any PHP contained in the document. It must
     // only ever be enabled for fully trusted input.
     if ($this->get_option("enable_php")) {
         ob_start();
         eval("?" . ">{$str}");
         $str = ob_get_clean();
     }
     // if the document contains non utf-8 with a utf-8 meta tag chars and was
     // detected as utf-8 by mbstring, problems could happen.
     // http://devzone.zend.com/article/8855
     if ($encoding !== 'UTF-8') {
         $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
         $str = preg_replace($re, '<meta $1$3>', $str);
     }
     // Store parsing warnings as messages
     set_error_handler("record_warnings");
     // @todo Take the quirksmode into account
     // http://hsivonen.iki.fi/doctype/
     // https://developer.mozilla.org/en/mozilla's_quirks_mode
     $quirksmode = false;
     if ($this->get_option("enable_html5_parser")) {
         $tokenizer = new HTML5_Tokenizer($str);
         $tokenizer->parse();
         $doc = $tokenizer->save();
         // Remove #text children nodes in nodes that shouldn't have
         $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
         foreach ($tag_names as $tag_name) {
             $nodes = $doc->getElementsByTagName($tag_name);
             foreach ($nodes as $node) {
                 self::remove_text_nodes($node);
             }
         }
         $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
     } else {
         // loadHTML assumes ISO-8859-1 unless otherwise specified, but there are
         // bugs in how DOMDocument determines the actual encoding. Converting to
         // HTML-ENTITIES prior to import appears to resolve the issue.
         // http://devzone.zend.com/1538/php-dom-xml-extension-encoding-processing/ (see #4)
         // http://stackoverflow.com/a/11310258/264628
         $doc = new DOMDocument();
         $doc->preserveWhiteSpace = true;
         $doc->loadHTML(mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8'));
         $doctype = $doc->doctype;
         if (preg_match("/^(.+)<!doctype/i", ltrim($str), $matches)) {
             // If some text is before the doctype, we are in quirksmode
             $quirksmode = true;
         } elseif (!preg_match("/^<!doctype/i", ltrim($str), $matches)) {
             // If no doctype is provided, we are in quirksmode
             $quirksmode = true;
         } elseif ($doctype === null) {
             // A doctype was written but the parser produced none
             $quirksmode = true;
         } elseif (!$doctype->publicId && !$doctype->systemId) {
             // HTML5 <!DOCTYPE html>: no quirks. This has to be exclusive
             // with the XHTML test below, which would otherwise always
             // override it because the public id is empty.
             $quirksmode = false;
         } else {
             // Any remaining doctype that is not XHTML is treated as quirksmode
             $quirksmode = !preg_match("/xhtml/i", $doctype->publicId);
         }
     }
     $this->_xml = $doc;
     $this->_quirksmode = $quirksmode;
     $this->_tree = new Frame_Tree($this->_xml);
     restore_error_handler();
     $this->restore_locale();
 }
Esempio n. 5
0
 /**
  * Parses a full HTML document.
  *
  * Thin convenience wrapper: builds an HTML5_Tokenizer for the text, runs it
  * and returns what the tokenizer saved.
  *
  * @param string $text    HTML text to parse
  * @param mixed  $builder Custom builder implementation, passed through to HTML5_Tokenizer
  * @return mixed Parsed HTML as DOMDocument (the value returned by HTML5_Tokenizer::save())
  */
 public static function parse($text, $builder = null)
 {
     $parser = new HTML5_Tokenizer($text, $builder);
     $parser->parse();

     $document = $parser->save();

     return $document;
 }
 /**
  * Loads an HTML string and builds the frame tree from it.
  *
  * The string is converted to UTF-8 (based on mbstring detection and any
  * charset declared in a <meta> tag), a leading UTF-8 BOM is stripped,
  * embedded PHP is evaluated when DOMPDF_ENABLE_PHP is set, and the result
  * is parsed with HTML5_Tokenizer or DOMDocument depending on
  * DOMPDF_ENABLE_HTML5PARSER. The parsed document, the quirks-mode flag and
  * a new Frame_Tree are stored on the instance.
  *
  * @param string $str      HTML text to load
  * @param string $encoding Not used as an input (always overwritten by the detected encoding)
  */
 function load_html($str, $encoding = null)
 {
     // NOTE(review): nothing below restores the locale or the error handler
     // if an exception is thrown part-way through -- confirm whether callers
     // rely on that.
     $this->save_locale();
     mb_detect_order('auto');
     if (mb_detect_encoding($str) !== 'UTF-8') {
         // Look for a charset declared in a <meta> tag; the first pattern
         // that matches wins and leaves the charset in $matches[1].
         $metatags = array('@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i', '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i', '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i');
         foreach ($metatags as $metatag) {
             if (preg_match($metatag, $str, $matches)) {
                 break;
             }
         }
         if (mb_detect_encoding($str) == '') {
             // Detection failed: trust the meta tag, otherwise assume UTF-8
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = 'UTF-8';
             }
         } else {
             // Detected a non-UTF-8 encoding: trust the meta tag, otherwise
             // let mbstring pick the source encoding during conversion
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = 'auto';
             }
         }
         if ($encoding !== 'UTF-8') {
             $str = mb_convert_encoding($str, 'UTF-8', $encoding);
         }
         // Make the document's declared charset agree with the converted text
         if (isset($matches[1])) {
             $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
         } else {
             $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
         }
     } else {
         $encoding = 'UTF-8';
     }
     // Strip a leading UTF-8 byte order mark (EF BB BF)
     if (substr($str, 0, 3) == chr(0xef) . chr(0xbb) . chr(0xbf)) {
         $str = substr($str, 3);
     }
     // SECURITY: this executes any PHP contained in the document. It must
     // only ever be enabled for fully trusted input.
     if (DOMPDF_ENABLE_PHP) {
         ob_start();
         eval("?" . ">{$str}");
         $str = ob_get_clean();
     }
     // When the source was not UTF-8, drop the charset part of <meta> tags
     if ($encoding !== 'UTF-8') {
         $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
         $str = preg_replace($re, '<meta $1$3>', $str);
     }
     // Route PHP warnings raised while parsing to record_warnings
     set_error_handler("record_warnings");
     $quirksmode = false;
     if (DOMPDF_ENABLE_HTML5PARSER) {
         $tokenizer = new HTML5_Tokenizer($str);
         $tokenizer->parse();
         $doc = $tokenizer->save();
         // Strip text children from structural elements
         $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
         foreach ($tag_names as $tag_name) {
             $nodes = $doc->getElementsByTagName($tag_name);
             foreach ($nodes as $node) {
                 self::remove_text_nodes($node);
             }
         }
         $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
     } else {
         $doc = new DOMDocument();
         $doc->preserveWhiteSpace = true;
         $doc->loadHTML($str);
         $doctype = $doc->doctype;
         if (preg_match("/^(.+)<(!doctype|html)/i", ltrim($str), $matches)) {
             // Text before the doctype / root element: quirksmode
             $quirksmode = true;
         } elseif ($doctype === null) {
             // The parser produced no doctype node at all
             $quirksmode = true;
         } elseif (!$doctype->publicId && !$doctype->systemId) {
             // HTML5 <!DOCTYPE html>: no quirks. This has to be exclusive
             // with the XHTML test below, which would otherwise always
             // override it because the public id is empty.
             $quirksmode = false;
         } else {
             // Any remaining doctype that is not XHTML is treated as quirksmode
             $quirksmode = !preg_match("/xhtml/i", $doctype->publicId);
         }
     }
     $this->_xml = $doc;
     $this->_quirksmode = $quirksmode;
     $this->_tree = new Frame_Tree($this->_xml);
     restore_error_handler();
     $this->restore_locale();
 }