/**
 * Turn a prepared XHTML string into a box tree.
 *
 * Steps, in order:
 *  1. parse the markup with TreeBuilder::build();
 *  2. look for a BASE element via $this->_scan_base();
 *  3. collect STYLE / LINK stylesheets via scan_styles();
 *  4. rebuild the global CSS object ($g_css_obj) from the global rule list ($g_css);
 *  5. hand the result of traverse_dom_tree_pdf() to create_pdf_box().
 *
 * If the parser yields null, the 'cannot_parse.html' template is sent to the
 * output, the base URL is written to the error log and the script terminates.
 *
 * @param mixed $html     XHTML source handed to TreeBuilder::build().
 * @param mixed $pipeline Pipeline object, passed by reference to every helper.
 *
 * @return mixed Reference to the value produced by create_pdf_box().
 */
function &process($html, &$pipeline)
{
    // Feed the prepared XHTML to the XML parser.
    $document = TreeBuilder::build($html);

    // A null result means the parser rejected the document: show the
    // canned error page, log which URL failed and stop the script.
    if ($document === null) {
        readfile(HTML2PS_DIR . '/templates/cannot_parse.html');
        error_log(sprintf("Cannot parse document: %s", $pipeline->get_base_url()));
        exit;
    }

    /**
     * Base URI detection.
     *
     * HTML 4.01, section 12.4.1, orders the sources of the base URI from
     * highest to lowest priority:
     *
     * 1. the BASE element;
     * 2. metadata discovered during protocol interaction, e.g. an HTTP header (see [RFC2616]);
     * 3. the URI of the current document. Not every HTML document has one
     *    (a valid document may arrive in an e-mail, for instance); such
     *    documents are erroneous if they contain relative URIs and rely on
     *    a default base URI.
     *
     * Only the first source is handled here: the first BASE element wins.
     *
     * @todo fall back to the protocol metadata
     */
    $this->_scan_base($document, $pipeline);

    // STYLE / LINK nodes are collected over the whole tree, because STYLE
    // often shows up inside the document body on real pages even though
    // the HTML standard forbids it.
    scan_styles($document, $pipeline);

    // Temporary hack: convert the CSS rule array into a CSS object.
    global $g_css, $g_css_obj;
    $g_css_obj = new CSSObject();
    foreach ($g_css as $css_rule) {
        $g_css_obj->add_rule($css_rule, $pipeline);
    }

    // Build the box tree; the function returns by reference, so the result
    // has to live in a variable before it is returned.
    $content_root = traverse_dom_tree_pdf($document);
    $box_tree =& create_pdf_box($content_root, $pipeline);

    return $box_tree;
}
/**
 * Recursively collect stylesheets from a DOM subtree.
 *
 * For element nodes:
 *  - 'style' elements are passed to parse_style_node();
 *  - 'link' elements with rel="stylesheet", a type of "text/css" (or no
 *    type) and either no media attribute or a media list accepted by
 *    is_allowed_media() have their href passed to css_import().
 *
 * After that, the children of element and document nodes are scanned in
 * sibling order. Nodes of any other type are ignored.
 *
 * @param mixed $root     Node to start from; must offer node_type(), tagname(),
 *                        has_attribute(), get_attribute(), first_child()
 *                        and next_sibling().
 * @param mixed $pipeline Pipeline object forwarded to the CSS helpers.
 */
function scan_styles($root, &$pipeline)
{
    $node_type = $root->node_type();
    $is_element = ($node_type == XML_ELEMENT_NODE);

    // Only elements and the document node are of interest.
    if (!$is_element && $node_type != XML_DOCUMENT_NODE) {
        return;
    }

    if ($is_element) {
        $tag = $root->tagname();

        if ($tag === 'style') {
            // <style ...> ... </style>
            parse_style_node($root, $pipeline);
        } elseif ($tag === 'link') {
            // <link rel="stylesheet" ...>
            $rel = strtolower($root->get_attribute("rel"));
            $type = strtolower($root->get_attribute("type"));
            $media = $root->has_attribute("media")
                ? explode(",", $root->get_attribute("media"))
                : array();

            $is_css_link = ($rel == "stylesheet") && ($type == "text/css" || $type == "");

            // The media check runs only for CSS links that actually name media.
            if ($is_css_link && (count($media) == 0 || is_allowed_media($media))) {
                $src = $root->get_attribute("href");
                if ($src) {
                    css_import($src, $pipeline);
                }
            }
        }
        // An element is not a leaf for this scan: its children are
        // visited below, exactly like those of the document node.
    }

    // Descend into every child node.
    for ($child = $root->first_child(); $child; $child = $child->next_sibling()) {
        scan_styles($child, $pipeline);
    }
}
Пример #3
0
 /**
  * Build the box for a FRAME element and render the framed document inside it.
  *
  * After initialising the generic container part, the document named by the
  * 'src' attribute is fetched through the pipeline, converted to UTF-8 XHTML,
  * parsed, and laid out as a child box. The global stylesheet state
  * ($g_css / $g_css_obj) is saved before the frame's own styles are scanned
  * and restored afterwards.
  *
  * The frame is left empty (no child box is added) when:
  *  - the element has no 'src' attribute;
  *  - the pipeline could not fetch the document (fetch() returned null);
  *  - the Content-Type media type is not text/html;
  *  - the fetched markup could not be parsed (TreeBuilder::build() returned null).
  *
  * @param mixed $root     DOM node of the FRAME element.
  * @param mixed $pipeline Pipeline object used to resolve and fetch the frame
  *                        URL and to build boxes.
  */
 function FrameBox(&$root, &$pipeline)
 {
     // Inherit 'border' CSS value from parent (FRAMESET tag), if current FRAME
     // has no FRAMEBORDER attribute, and FRAMESET has one
     $parent = $root->parent();
     if (!$root->has_attribute('frameborder') && $parent->has_attribute('frameborder')) {
         // Replace the top border entry with whatever get_border() reports
         // once it has been popped — presumably the FRAMESET's border; verify
         // against the border stack helpers.
         pop_border();
         push_border(get_border());
     }
     $this->GenericContainerBox($root);
     // If NO src attribute specified, just return.
     if (!$root->has_attribute('src')) {
         return;
     }
     // Determine the fully qualified URL of the frame content
     $src = $root->get_attribute('src');
     $url = $pipeline->guess_url($src);
     $data = $pipeline->fetch($url);
     /**
      * If framed page could not be fetched return immediately
      */
     if (is_null($data)) {
         return;
     }
     /**
      * Render only frames containing HTML
      *
      * The Content-Type header may carry parameters after the ';' sign, and
      * media type names are case-insensitive and may be surrounded by
      * whitespace, so only the normalised first token is compared.
      */
     $content_type = $data->get_additional_data('Content-Type');
     $content_type_array = explode(';', (string) $content_type);
     $media_type = strtolower(trim($content_type_array[0]));
     if ($media_type != "text/html") {
         // NOTE(review): this and the previous early return do not call
         // pop_base_url(); if fetch() is what pushes the frame's base URL,
         // these paths leave it on the stack — confirm against the pipeline.
         return;
     }
     $html = $data->get_content();
     // Remove control symbols if any
     $html = preg_replace('/[\\x00-\\x07]/', "", $html);
     $converter = Converter::create();
     $html = $converter->to_utf8($html, $data->detect_encoding());
     $html = html2xhtml($html);
     $tree = TreeBuilder::build($html);
     // The parser signals failure with null; scanning a null tree would be a
     // fatal error, so log the problem and leave the frame empty. The base
     // URL is popped exactly as on the normal completion path below.
     if (is_null($tree)) {
         error_log(sprintf("Cannot parse frame document: %s", $src));
         $pipeline->pop_base_url();
         return;
     }
     // Save current stylesheet, as each frame may load its own stylesheets
     //
     global $g_css;
     $old_css = $g_css;
     global $g_css_obj;
     $old_obj = $g_css_obj;
     scan_styles($tree, $pipeline);
     // Temporary hack: convert CSS rule array to CSS object
     $g_css_obj = new CSSObject();
     foreach ($g_css as $rule) {
         $g_css_obj->add_rule($rule, $pipeline);
     }
     // TODO: stinks. Rewrite
     //
     $frame_root = traverse_dom_tree_pdf($tree);
     $box_child =& create_pdf_box($frame_root, $pipeline);
     $this->add_child($box_child);
     // Restore old stylesheet
     //
     $g_css = $old_css;
     $g_css_obj = $old_obj;
     $pipeline->pop_base_url();
 }