/**
 * Turn prepared XHTML into the box tree of the document.
 *
 * Steps, in order:
 *  1. parse the XHTML with TreeBuilder::build();
 *  2. let _scan_base() look for a BASE element;
 *  3. collect STYLE / LINK style sheets with scan_styles();
 *  4. rebuild the global $g_css_obj from the rules held in the global $g_css;
 *  5. traverse the DOM tree and create the box via create_pdf_box().
 *
 * If the parser returns null, the "cannot parse" template is sent to the
 * output, the failure is logged together with the pipeline's base URL, and the
 * script terminates.
 *
 * @param mixed $html      XHTML prepared for the XML parser.
 * @param mixed $pipeline  Pipeline object (passed by reference).
 * @return mixed Reference to the box returned by create_pdf_box().
 */
function &process($html, &$pipeline) {
    // Feed the prepared XHTML to the XML parser.
    $document = TreeBuilder::build($html);

    // No tree means the parser rejected the input: show the canned error
    // page, record which document failed, and stop.
    if ($document === null) {
        readfile(HTML2PS_DIR . '/templates/cannot_parse.html');
        error_log(sprintf("Cannot parse document: %s", $pipeline->get_base_url()));
        exit;
    }

    /**
     * Base URI detection.
     *
     * HTML 4.01, section 12.4.1, orders the sources of the base URI from
     * highest to lowest priority:
     *
     * 1. the BASE element;
     * 2. metadata discovered during protocol interaction, such as an HTTP
     *    header (see [RFC2616]);
     * 3. the URI of the current document. Not every HTML document has one
     *    (e.g. a document embedded in an email); such documents are erroneous
     *    if they contain relative URIs and rely on a default base URI.
     *
     * Only the first source is handled here: the first BASE element found is
     * used.
     *
     * @todo fall back to the protocol metadata
     */
    $this->_scan_base($document, $pipeline);

    // Style sheets are collected from the whole tree rather than from HEAD
    // only: STYLE nodes inside the document body violate the HTML standard but
    // appear on the Web quite often.
    scan_styles($document, $pipeline);

    // Temporary hack: convert the CSS rule array into a CSS object.
    global $g_css, $g_css_obj;
    $g_css_obj = new CSSObject();
    foreach ($g_css as $css_rule) {
        $g_css_obj->add_rule($css_rule, $pipeline);
    }

    $body_node = traverse_dom_tree_pdf($document);
    $root_box =& create_pdf_box($body_node, $pipeline);

    return $root_box;
}
/**
 * Recursively walk a DOM subtree and load every style sheet it references.
 *
 * Element nodes are inspected first:
 *  - <style> elements are handed to parse_style_node();
 *  - <link> elements are imported through css_import() when rel is
 *    "stylesheet", type is "text/css" or empty, the media list is absent or
 *    accepted by is_allowed_media(), and href is non-empty.
 *
 * Afterwards the children of element and document nodes are scanned the same
 * way. Nodes of any other type are ignored together with their children.
 *
 * @param mixed $root      DOM node to start from.
 * @param mixed $pipeline  Pipeline object (passed by reference), forwarded to
 *                         the style loaders.
 */
function scan_styles($root, &$pipeline) {
    $node_type = $root->node_type();
    $is_element = ($node_type == XML_ELEMENT_NODE);

    // Only element and document nodes are of interest.
    if (!$is_element && $node_type != XML_DOCUMENT_NODE) {
        return;
    }

    if ($is_element) {
        $tag = $root->tagname();

        if ($tag === 'style') {
            // <style ...> ... </style>
            parse_style_node($root, $pipeline);
        } elseif ($tag === 'link') {
            // <link rel="stylesheet" ...>
            $rel = strtolower($root->get_attribute("rel"));
            $type = strtolower($root->get_attribute("type"));
            $media = $root->has_attribute("media")
                ? explode(",", $root->get_attribute("media"))
                : array();

            // The checks are nested so that each one runs only when the
            // previous one has passed.
            if ($rel == "stylesheet") {
                if ($type == "text/css" || $type == "") {
                    if (count($media) == 0 || is_allowed_media($media)) {
                        $href = $root->get_attribute("href");
                        if ($href) {
                            css_import($href, $pipeline);
                        }
                    }
                }
            }
        }
        // No return here: the children of an element are scanned as well.
    }

    // Descend into every child node.
    for ($node = $root->first_child(); $node; $node = $node->next_sibling()) {
        scan_styles($node, $pipeline);
    }
}
/**
 * Build the box for a FRAME element and, when the frame points to an HTML
 * document, render that document as a child box.
 *
 * The constructor returns early, leaving the box without children, when:
 *  - the element has no "src" attribute;
 *  - the pipeline cannot fetch the referenced URL (fetch() returns null);
 *  - the fetched content is not served as text/html.
 *
 * If the fetched HTML cannot be parsed, the frame is skipped and the failure
 * is logged; the box is still left without children.
 *
 * While the framed document is processed, the global style sheet state
 * ($g_css, $g_css_obj) is saved and then restored, as each frame may load its
 * own style sheets.
 *
 * @param mixed $root      DOM node of the FRAME element (passed by reference).
 * @param mixed $pipeline  Pipeline object (passed by reference) used to
 *                         resolve and fetch the frame content.
 */
function FrameBox(&$root, &$pipeline) {
    // Inherit 'border' CSS value from parent (FRAMESET tag), if current FRAME
    // has no FRAMEBORDER attribute, and FRAMESET has one
    $parent = $root->parent();
    if (!$root->has_attribute('frameborder') && $parent->has_attribute('frameborder')) {
        pop_border();
        push_border(get_border());
    }

    // Presumably the parent-class initialiser -- confirm against the class
    // declaration.
    $this->GenericContainerBox($root);

    // If NO src attribute specified, just return.
    if (!$root->has_attribute('src')) {
        return;
    }

    // Determine the fully qualified URL of the frame content
    $src = $root->get_attribute('src');
    $url = $pipeline->guess_url($src);
    $data = $pipeline->fetch($url);

    /**
     * If framed page could not be fetched return immediately
     */
    if (is_null($data)) {
        return;
    }

    /**
     * Render only frames containing HTML.
     *
     * The Content-Type header may carry parameters after the ';' sign, may
     * have whitespace around the media type, and media types are
     * case-insensitive, so the value is normalised before comparison.
     */
    $content_type = $data->get_additional_data('Content-Type');
    $content_type_array = explode(';', (string) $content_type);
    if (strtolower(trim($content_type_array[0])) != "text/html") {
        // NOTE(review): the normal path ends with pop_base_url(); confirm
        // whether this early return needs a matching call as well.
        return;
    }

    $html = $data->get_content();

    // Remove control symbols if any
    $html = preg_replace('/[\\x00-\\x07]/', "", $html);
    $converter = Converter::create();
    $html = $converter->to_utf8($html, $data->detect_encoding());
    $html = html2xhtml($html);
    $tree = TreeBuilder::build($html);

    if (is_null($tree)) {
        // The parser may return null for a document it cannot handle (the
        // main document entry point checks for this too). Skip the frame
        // instead of calling methods on a null tree.
        error_log(sprintf("Cannot parse frame document: %s", $url));
    } else {
        // Save current stylesheet, as each frame may load its own stylesheets
        global $g_css;
        $old_css = $g_css;
        global $g_css_obj;
        $old_obj = $g_css_obj;

        scan_styles($tree, $pipeline);

        // Temporary hack: convert CSS rule array to CSS object
        $g_css_obj = new CSSObject();
        foreach ($g_css as $rule) {
            $g_css_obj->add_rule($rule, $pipeline);
        }

        // TODO: stinks. Rewrite
        $frame_root = traverse_dom_tree_pdf($tree);
        $box_child =& create_pdf_box($frame_root, $pipeline);
        $this->add_child($box_child);

        // Restore old stylesheet
        $g_css = $old_css;
        $g_css_obj = $old_obj;
    }

    // Executed on the parse-failure path too, so the base URL stack ends up
    // in the same state as after a successfully rendered frame.
    $pipeline->pop_base_url();
}