Exemple #1
0
 /**
  * Parses an HTML fragment.
  *
  * Builds an HTML5_Tokenizer for $text, copies the caller-supplied options
  * onto it, includes the plugin loader, then parses the fragment and returns
  * whatever the tokenizer's save() produces.
  *
  * @param $text HTML text to parse
  * @param $context String name of context element to pretend parsing is in.
  * @param $builder Custom builder implementation
  * @param $allowedTags Value stored in the tokenizer's allowed_tags property
  *                     (presumably a tag whitelist -- confirm against HTML5_Tokenizer)
  * @param $bbcode Value stored in the bbcode property of both the tokenizer and its tree
  * @param $user_name Value stored in the tokenizer's userName property
  * @return Parsed HTML as DOMDocument
  */
 public static function parseFragment($text, $context = null, $builder = null, $allowedTags = null, $bbcode = array(), $user_name = "")
 {
     $tokenizer = new HTML5_Tokenizer($text, $builder);
     $tokenizer->allowed_tags = $allowedTags;
     // The same $bbcode value is assigned to the tokenizer and to its tree.
     $tokenizer->tree->bbcode = $tokenizer->bbcode = $bbcode;
     $tokenizer->userName = $user_name;
     // NOTE(review): the include below executes in this method's local scope,
     // so it can see $bucket and $tokenizer by name; presumably $bucket selects
     // which plugins run -- confirm in lib/pluginloader.php before renaming
     // or reordering anything here.
     $bucket = "parserModify";
     include "lib/pluginloader.php";
     $tokenizer->parseFragment($context);
     return $tokenizer->save();
 }
 /**
  * Runs a single test case through the tokenizer and asserts that the
  * serialized result is identical to the expected document.
  *
  * @param $test Test case array; reads 'data' and 'document', and uses
  *              'document-fragment' as the fragment context when it is set.
  */
 public function invoke($test)
 {
     // Not the right interface for this, but it gives us test coverage for now.
     $parser = new HTML5_Tokenizer($test['data']);
     $asFragment = isset($test['document-fragment']);
     // The global TIME counter is decremented by get_microtime() before the
     // parse and incremented by it afterwards.
     $GLOBALS['TIME'] -= get_microtime();
     if ($asFragment) {
         $parser->parseFragment($test['document-fragment']);
     } else {
         $parser->parse();
     }
     $GLOBALS['TIME'] += get_microtime();
     $actual = HTML5_TestData::strDom($parser->save());
     $this->assertIdentical($test['document'], $actual, $test);
 }
 /**
  * Seeds the content model (and, when a last-start flag is present, the
  * current token) from this object's flags, then delegates to the parent
  * parse() and returns its result.
  */
 public function parse()
 {
     $this->content_model = $this->_contentModelFlag;
     $lastStart = $this->_lastStartFlag;
     if (!$lastStart) {
         return parent::parse();
     }
     // Pretend a start tag with the flagged name was the last token seen.
     $this->token = array('type' => self::STARTTAG, 'name' => $lastStart);
     return parent::parse();
 }
 /**
  * Forwards the token to the parent emitter and appends a simplified record
  * of it to $this->outputTokens.
  *
  * The parent is always called with true as its third argument, regardless
  * of the $dry value passed in.
  *
  * @param $token Token array; 'type' selects the record format.
  * @param $checkStream Passed through to the parent unchanged.
  * @param $dry Accepted for signature compatibility; not read here.
  */
 protected function emitToken($token, $checkStream = true, $dry = false)
 {
     parent::emitToken($token, $checkStream, true);
     // tree handling code omitted
     switch ($token['type']) {
         case self::DOCTYPE:
             // Missing doctype fields are reported as null.
             foreach (array('name', 'public', 'system') as $field) {
                 if (!isset($token[$field])) {
                     $token[$field] = null;
                 }
             }
             $this->outputTokens[] = array(
                 'DOCTYPE',
                 $token['name'],
                 $token['public'],
                 $token['system'],
                 empty($token['force-quirks']),
             );
             break;
         case self::STARTTAG:
             $attributes = new stdclass();
             foreach ($token['attr'] as $pair) {
                 // XXX this is IMPORTANT behavior, check if it's
                 // in TreeBuilder: the first occurrence of an attribute wins.
                 $key = $pair['name'];
                 if (!isset($attributes->{$key})) {
                     $attributes->{$key} = $pair['value'];
                 }
             }
             $record = array('StartTag', $token['name'], $attributes);
             if (isset($token['self-closing'])) {
                 $record[] = true;
             }
             $this->outputTokens[] = $record;
             break;
         case self::ENDTAG:
             $this->outputTokens[] = array('EndTag', $token['name']);
             break;
         case self::COMMENT:
             $this->outputTokens[] = array('Comment', $token['data']);
             break;
         case self::CHARACTER:
         case self::SPACECHARACTER:
             // Adjacent character data is merged into the previous
             // 'Character' record instead of producing a new one.
             $merged = false;
             if (count($this->outputTokens)) {
                 $previous = array_pop($this->outputTokens);
                 if ($previous[0] === 'Character') {
                     $previous[1] .= $token['data'];
                     $merged = true;
                 }
                 $this->outputTokens[] = $previous;
             }
             if (!$merged) {
                 $this->outputTokens[] = array('Character', $token['data']);
             }
             break;
         case self::PARSEERROR:
             $this->outputTokens[] = 'ParseError';
             break;
     }
 }
Exemple #5
0
 /**
  * Loads an HTML string: converts it to UTF-8, optionally evaluates embedded
  * PHP, parses it into a DOM document and builds the frame tree from it.
  *
  * Parse errors are stored in the global array _dompdf_warnings.
  *
  * The saved locale and the previous error handler are restored both on
  * success and when an exception is thrown during loading; the exception is
  * re-thrown unchanged.
  *
  * @todo use the $encoding variable
  *
  * @param string $str      HTML text to load
  * @param string $encoding Not used yet (overwritten by the detected encoding)
  */
 function load_html($str, $encoding = null)
 {
     $this->save_locale();
     // Tracks whether record_warnings has been installed, so the failure path
     // only pops a handler that this method actually pushed.
     $handler_installed = false;
     try {
         // FIXME: Determine character encoding, switch to UTF8, update meta tag. Need better http/file stream encoding detection, currently relies on text or meta tag.
         mb_detect_order('auto');
         // $str is not modified between the two places the detected encoding
         // is consulted, so detect it once.
         $detected = mb_detect_encoding($str);
         if ($detected !== 'UTF-8') {
             $metatags = array(
                 '@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i',
                 '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i',
                 '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i',
             );
             foreach ($metatags as $metatag) {
                 if (preg_match($metatag, $str, $matches)) {
                     break;
                 }
             }
             // A charset declared in a meta tag wins; otherwise fall back to
             // UTF-8 when detection failed, or let mbstring pick ('auto').
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = ($detected == '') ? 'UTF-8' : 'auto';
             }
             if ($encoding !== 'UTF-8') {
                 $str = mb_convert_encoding($str, 'UTF-8', $encoding);
             }
             if (isset($matches[1])) {
                 $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
             } else {
                 $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
             }
         } else {
             $encoding = 'UTF-8';
         }
         // remove BOM mark from UTF-8, it's treated as document text by DOMDocument
         // FIXME: roll this into the encoding detection using UTF-8/16/32 BOM (http://us2.php.net/manual/en/function.mb-detect-encoding.php#91051)?
         if (substr($str, 0, 3) == chr(0xef) . chr(0xbb) . chr(0xbf)) {
             $str = substr($str, 3);
         }
         // Parse embedded php, first-pass
         // SECURITY: this evaluates the document itself as PHP. Only enable
         // the "enable_php" option for fully trusted input.
         if ($this->get_option("enable_php")) {
             ob_start();
             eval("?" . ">{$str}");
             $str = ob_get_clean();
         }
         // if the document contains non utf-8 with a utf-8 meta tag chars and was
         // detected as utf-8 by mbstring, problems could happen.
         // http://devzone.zend.com/article/8855
         if ($encoding !== 'UTF-8') {
             $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
             $str = preg_replace($re, '<meta $1$3>', $str);
         }
         // Store parsing warnings as messages
         set_error_handler("record_warnings");
         $handler_installed = true;
         // @todo Take the quirksmode into account
         // http://hsivonen.iki.fi/doctype/
         // https://developer.mozilla.org/en/mozilla's_quirks_mode
         $quirksmode = false;
         if ($this->get_option("enable_html5_parser")) {
             $tokenizer = new HTML5_Tokenizer($str);
             $tokenizer->parse();
             $doc = $tokenizer->save();
             // Remove #text children nodes in nodes that shouldn't have
             $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
             foreach ($tag_names as $tag_name) {
                 $nodes = $doc->getElementsByTagName($tag_name);
                 foreach ($nodes as $node) {
                     self::remove_text_nodes($node);
                 }
             }
             $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
         } else {
             // loadHTML assumes ISO-8859-1 unless otherwise specified, but there are
             // bugs in how DOMDocument determines the actual encoding. Converting to
             // HTML-ENTITIES prior to import appears to resolve the issue.
             // http://devzone.zend.com/1538/php-dom-xml-extension-encoding-processing/ (see #4)
             // http://stackoverflow.com/a/11310258/264628
             $doc = new DOMDocument();
             $doc->preserveWhiteSpace = true;
             $doc->loadHTML(mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8'));
             $trimmed = ltrim($str);
             if (preg_match("/^(.+)<!doctype/i", $trimmed) || !preg_match("/^<!doctype/i", $trimmed)) {
                 // Text before the doctype, or no leading doctype at all:
                 // we are in quirksmode
                 $quirksmode = true;
             } else {
                 // Guard against a missing doctype node so a failed parse
                 // does not dereference null; an absent public id is
                 // treated like an empty one.
                 $public_id = $doc->doctype ? (string) $doc->doctype->publicId : '';
                 // Anything that is not XHTML is treated as quirksmode.
                 // NOTE(review): the previous code also had a branch for the
                 // HTML5 doctype (empty public and system ids) that set
                 // quirksmode to false, but this XHTML check immediately
                 // overrode it, so <!DOCTYPE html> ends up in quirksmode.
                 // That result is preserved here -- confirm it is intended.
                 $quirksmode = !preg_match("/xhtml/i", $public_id);
             }
         }
         $this->_xml = $doc;
         $this->_quirksmode = $quirksmode;
         $this->_tree = new Frame_Tree($this->_xml);
     } catch (Exception $e) {
         // Undo the global state changes before propagating the failure.
         // (catch + rethrow rather than finally, to stay valid on PHP < 5.5.)
         if ($handler_installed) {
             restore_error_handler();
         }
         $this->restore_locale();
         throw $e;
     }
     restore_error_handler();
     $this->restore_locale();
 }
 /**
  * Parses an HTML fragment.
  *
  * Constructs a tokenizer for the given text, runs fragment parsing in the
  * requested context and returns what the tokenizer's save() produces.
  *
  * @param $text HTML text to parse
  * @param $context String name of context element to pretend parsing is in.
  * @param $builder Custom builder implementation
  * @return Parsed HTML as DOMDocument
  */
 public static function parseFragment($text, $context = null, $builder = null)
 {
     $fragmentParser = new HTML5_Tokenizer($text, $builder);
     $fragmentParser->parseFragment($context);
     $result = $fragmentParser->save();
     return $result;
 }
 /**
  * Loads an HTML string: converts it to UTF-8, optionally evaluates embedded
  * PHP, parses it into a DOM document and builds the frame tree from it.
  *
  * The saved locale and the previous error handler are restored both on
  * success and when an exception is thrown during loading; the exception is
  * re-thrown unchanged.
  *
  * @param string $str      HTML text to load
  * @param string $encoding Ignored on input; overwritten by the detected encoding
  */
 function load_html($str, $encoding = null)
 {
     $this->save_locale();
     // Tracks whether record_warnings has been installed, so the failure path
     // only pops a handler that this method actually pushed.
     $handler_installed = false;
     try {
         mb_detect_order('auto');
         // $str is not modified between the two places the detected encoding
         // is consulted, so detect it once.
         $detected = mb_detect_encoding($str);
         if ($detected !== 'UTF-8') {
             $metatags = array(
                 '@<meta\\s+http-equiv="Content-Type"\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))?@i',
                 '@<meta\\s+content="(?:[\\w/]+)(?:;\\s*?charset=([^\\s"]+))"?\\s+http-equiv="Content-Type"@i',
                 '@<meta [^>]*charset\\s*=\\s*["\']?\\s*([^"\' ]+)@i',
             );
             foreach ($metatags as $metatag) {
                 if (preg_match($metatag, $str, $matches)) {
                     break;
                 }
             }
             // A charset declared in a meta tag wins; otherwise fall back to
             // UTF-8 when detection failed, or let mbstring pick ('auto').
             if (isset($matches[1])) {
                 $encoding = strtoupper($matches[1]);
             } else {
                 $encoding = ($detected == '') ? 'UTF-8' : 'auto';
             }
             if ($encoding !== 'UTF-8') {
                 $str = mb_convert_encoding($str, 'UTF-8', $encoding);
             }
             if (isset($matches[1])) {
                 $str = preg_replace('/charset=([^\\s"]+)/i', 'charset=UTF-8', $str);
             } else {
                 $str = str_replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8">', $str);
             }
         } else {
             $encoding = 'UTF-8';
         }
         // Strip a leading UTF-8 byte order mark.
         if (substr($str, 0, 3) == chr(0xef) . chr(0xbb) . chr(0xbf)) {
             $str = substr($str, 3);
         }
         // SECURITY: this evaluates the document itself as PHP. Only enable
         // DOMPDF_ENABLE_PHP for fully trusted input.
         if (DOMPDF_ENABLE_PHP) {
             ob_start();
             eval("?" . ">{$str}");
             $str = ob_get_clean();
         }
         // When the source was not UTF-8, drop the charset part of meta tags.
         if ($encoding !== 'UTF-8') {
             $re = '/<meta ([^>]*)((?:charset=[^"\' ]+)([^>]*)|(?:charset=["\'][^"\' ]+["\']))([^>]*)>/i';
             $str = preg_replace($re, '<meta $1$3>', $str);
         }
         // Route PHP errors raised while parsing to record_warnings.
         set_error_handler("record_warnings");
         $handler_installed = true;
         $quirksmode = false;
         if (DOMPDF_ENABLE_HTML5PARSER) {
             $tokenizer = new HTML5_Tokenizer($str);
             $tokenizer->parse();
             $doc = $tokenizer->save();
             // Strip text nodes from these elements.
             $tag_names = array("html", "table", "tbody", "thead", "tfoot", "tr");
             foreach ($tag_names as $tag_name) {
                 $nodes = $doc->getElementsByTagName($tag_name);
                 foreach ($nodes as $node) {
                     self::remove_text_nodes($node);
                 }
             }
             $quirksmode = $tokenizer->getTree()->getQuirksMode() > HTML5_TreeBuilder::NO_QUIRKS;
         } else {
             $doc = new DOMDocument();
             $doc->preserveWhiteSpace = true;
             $doc->loadHTML($str);
             if (preg_match("/^(.+)<(!doctype|html)/i", ltrim($str))) {
                 // Something precedes the doctype / html tag: quirksmode.
                 $quirksmode = true;
             } else {
                 // A document without a doctype (e.g. one starting directly
                 // with <html>) has no doctype node; treat its public id as
                 // empty instead of dereferencing null. The outcome is the
                 // same as before (quirksmode), minus the PHP warnings.
                 $public_id = $doc->doctype ? (string) $doc->doctype->publicId : '';
                 // Anything that is not XHTML is treated as quirksmode.
                 // NOTE(review): the previous code also had a branch for a
                 // doctype with empty public and system ids that set
                 // quirksmode to false, but this XHTML check immediately
                 // overrode it. That result is preserved here -- confirm it
                 // is intended.
                 $quirksmode = !preg_match("/xhtml/i", $public_id);
             }
         }
         $this->_xml = $doc;
         $this->_quirksmode = $quirksmode;
         $this->_tree = new Frame_Tree($this->_xml);
     } catch (Exception $e) {
         // Undo the global state changes before propagating the failure.
         // (catch + rethrow rather than finally, to stay valid on PHP < 5.5.)
         if ($handler_installed) {
             restore_error_handler();
         }
         $this->restore_locale();
         throw $e;
     }
     restore_error_handler();
     $this->restore_locale();
 }