/**
 * Flatten the recorded request held in a <url> node into an array.
 *
 * Reads the `src`, `referer` and `method` attributes and the first `get`,
 * `post`, `xpath` and `data` child elements of $url. The recorded page is
 * returned untouched under `data`; when the tidy extension is loaded and
 * $use_tidy is true, tidy's error count and error buffer for that page are
 * added as `ref_error_count` / `ref_error_msg`. Without tidy, `ref_error_msg`
 * carries a translated notice instead.
 *
 * @param object $url      node exposing getElementsByTagName() and getAttribute()
 *                         (presumably a DOMElement - confirm against callers)
 * @param bool   $use_tidy whether tidy diagnostics are copied into the result
 * @return array keys in order: data, [ref_error_count], [ref_error_msg], url,
 *               xpath, method, post, get, referer
 */
function get_url($url, $use_tidy = TRUE)
{
	// Declared by the original implementation; not read or written here.
	global $cookies;

	// Looked up by the original implementation; the instance itself is not used here.
	$smarty = TikiLib::lib('smarty');

	$queryParams = get_from_dom($url->getElementsByTagName('get')->item(0));
	$postParams = get_from_dom($url->getElementsByTagName('post')->item(0));
	$xpathExpr = $url->getElementsByTagName('xpath')->item(0)->textContent;
	$recordedPage = $url->getElementsByTagName('data')->item(0)->textContent;

	$result = array('data' => $recordedPage);

	if (!extension_loaded('tidy')) {
		$result['ref_error_msg'] = tra("Tidy Extension not present");
	} else {
		$tidyDoc = tidy_parse_string($recordedPage, array(), 'utf8');
		tidy_diagnose($tidyDoc);
		if ($use_tidy) {
			$result['ref_error_count'] = tidy_error_count($tidyDoc);
			$result['ref_error_msg'] = tidy_get_error_buffer($tidyDoc);
		}
	}

	// Array union appends the remaining keys after the ones already set,
	// which keeps the key order of the result stable.
	return $result + array(
		'url' => $url->getAttribute('src'),
		'xpath' => $xpathExpr,
		'method' => $url->getAttribute('method'),
		'post' => $postParams,
		'get' => $queryParams,
		'referer' => $url->getAttribute('referer'),
	);
}
/**
 * Render the recorded page of a <url> node with every element matched by an
 * XPath expression highlighted with a red background.
 *
 * The recorded markup (first `data` child of $url) is normalised through tidy
 * when the extension is loaded, otherwise through HTMLPurifier wrapped in a
 * minimal UTF-8 HTML skeleton. A `<base href>` pointing at the global
 * $base_url is inserted as the first child of `<head>`.
 *
 * @param object $url   node exposing getElementsByTagName()
 *                      (presumably a DOMElement - confirm against callers)
 * @param string $xpath XPath expression selecting the nodes to highlight
 * @return string the highlighted HTML, or a translated message when the
 *                recorded page is empty
 */
function enlight_xpath($url, $xpath)
{
	global $base_url;
	static $purifier;
	static $loaded = false;

	// A missing <data> element is treated like an empty page.
	$data_node = $url->getElementsByTagName('data')->item(0);
	$data = $data_node ? $data_node->textContent : '';
	if (trim($data) == '') {
		return tra('The page is empty');
	}

	if (extension_loaded('tidy')) {
		$tidy = tidy_parse_string($data, array(), 'utf8');
		if ($tidy !== false) {
			tidy_diagnose($tidy);
			// Hand DOMDocument an explicit string rather than the tidy object.
			$html = tidy_get_output($tidy);
		} else {
			$html = $data;
		}
	} else {
		if (!$loaded) {
			require_once('lib/htmlpurifier_tiki/HTMLPurifier.tiki.php');
			$config = getHTMLPurifierTikiConfig();
			$config->set('Attr.EnableID', true);
			$purifier = new HTMLPurifier($config);
			$loaded = true;
		}
		if ($purifier) {
			$html = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'
				. $purifier->purify($data)
				. '</body></html>';
		} else {
			$html = $data;
		}
	}

	// loadHTML() is an instance method; calling it statically is an Error on PHP 8.
	$dom_ref = new DOMDocument();
	$dom_ref->loadHTML($html);
	$xp_ref = new DOMXPath($dom_ref);

	// Make sure there is a <head> to carry the <base> element.
	$head = $xp_ref->query('//head')->item(0);
	if ($head === null && $dom_ref->documentElement !== null) {
		$head = $dom_ref->createElement('head');
		$dom_ref->documentElement->insertBefore($head, $dom_ref->documentElement->firstChild);
	}
	if ($head !== null) {
		$base = $dom_ref->createElement('base');
		$base->setAttribute('href', (string) $base_url);
		$head->insertBefore($base, $head->firstChild);
	}

	// query() returns false for an invalid expression, and may return
	// non-element nodes (text, attributes, comments) that cannot be styled.
	$matches = $xp_ref->query($xpath);
	if ($matches !== false) {
		foreach ($matches as $node) {
			if ($node instanceof DOMElement) {
				$node->setAttribute('style', 'background-color: red;');
			}
		}
	}

	return $dom_ref->saveHTML();
}
/**
 * Replay the request recorded in a <url> node and diff the live response
 * against the recorded page.
 *
 * The request is sent through the legacy pecl_http functions when the `http`
 * extension is loaded, otherwise through cURL; cookies are exchanged through
 * the global $cookies. Both pages are normalised (tidy, or HTMLPurifier when
 * tidy is absent), optionally reduced to the nodes selected by the recorded
 * XPath expression, and compared with diff2() in "htmldiff" mode.
 *
 * @param object $url      node exposing getElementsByTagName() and getAttribute()
 *                         (presumably a DOMElement - confirm against callers)
 * @param bool   $use_tidy whether tidy diagnostics are copied into the result
 * @return array keys: [ref_error_count], [ref_error_msg], [replay_error_count],
 *               [replay_error_msg], html, url, method and, for POST requests, post
 */
function verif_url($url, $use_tidy = TRUE)
{
	static $purifier;
	static $loaded = false;

	$result = array();

	$post = get_from_dom($url->getElementsByTagName('post')->item(0));

	// Missing <xpath>/<data> children are treated as empty strings.
	$xpath_node = $url->getElementsByTagName('xpath')->item(0);
	$xpath = $xpath_node ? $xpath_node->textContent : '';
	$data_node = $url->getElementsByTagName('data')->item(0);
	$data = $data_node ? $data_node->textContent : '';

	$urlstr = $url->getAttribute('src');
	$referer = $url->getAttribute('referer');
	$method = strtolower($url->getAttribute('method'));

	// Stays empty when no transport extension is available.
	$buffer = '';
	if (extension_loaded('http')) {
		$buffer = _verif_url_fetch_http($urlstr, $method, $referer, $post);
	} elseif (extension_loaded('curl')) {
		$buffer = _verif_url_fetch_curl($urlstr, $method, $referer, $post);
	}

	if (extension_loaded('tidy')) {
		$tidy_data = tidy_parse_string($data, array(), 'utf8');
		$tidy_buffer = tidy_parse_string($buffer, array(), 'utf8');

		if ($tidy_data !== false) {
			if ($use_tidy) {
				tidy_diagnose($tidy_data);
				$result['ref_error_count'] = tidy_error_count($tidy_data);
				$result['ref_error_msg'] = tidy_get_error_buffer($tidy_data);
			}
			// Continue with plain strings rather than tidy objects.
			$data = tidy_get_output($tidy_data);
		}
		if ($tidy_buffer !== false) {
			if ($use_tidy) {
				tidy_diagnose($tidy_buffer);
				$result['replay_error_count'] = tidy_error_count($tidy_buffer);
				$result['replay_error_msg'] = tidy_get_error_buffer($tidy_buffer);
			}
			$buffer = tidy_get_output($tidy_buffer);
		}
	} else {
		if (!$loaded) {
			require_once 'lib/htmlpurifier_tiki/HTMLPurifier.tiki.php';
			$config = getHTMLPurifierTikiConfig();
			$purifier = new HTMLPurifier($config);
			$loaded = true;
		}
		if ($purifier) {
			$data = '<html><body>' . $purifier->purify($data) . '</body></html>';
			$buffer = '<html><body>' . $purifier->purify($buffer) . '</body></html>';
		}
		$result['ref_error_msg'] = tra('The Tidy extension is not present');
		$result['replay_error_msg'] = tra('The Tidy extension is not present');
	}

	// If we have a XPath then we extract the new DOM and print it in HTML
	$has_xpath = trim($xpath) != '';
	if ($has_xpath) {
		$data = _verif_url_extract_xpath($data, $xpath);
		$buffer = _verif_url_extract_xpath($buffer, $xpath);
	}

	$tmp = diff2($data, $buffer, "htmldiff");
	if ($has_xpath) {
		$result['html'] = preg_replace(
			array("/<html>/", "/<\\/html>/"),
			array("<div style='overflow: auto; width:500px; text-align: center'> ", "</div>"),
			$tmp
		);
	} else {
		$result['html'] = preg_replace(
			array("/<html.*<body/U", "/<\\/body><\\/html>/U"),
			array("<div style='overflow: auto; width:500px; text-align: center' ", "</div>"),
			$tmp
		);
	}

	$result['url'] = $urlstr;
	$result['method'] = $url->getAttribute('method');
	if (strtolower($result['method']) == 'post') {
		$result['post'] = $post;
	}

	return $result;
}

/**
 * Replay a request with the legacy pecl_http functions and return the body.
 *
 * Cookies announced by the response are parsed into the global $cookies.
 * Only 'get' and 'post' are sent; any other method yields an empty body.
 *
 * @param string $urlstr  target URL
 * @param string $method  lower-cased HTTP method
 * @param string $referer value for the Referer header
 * @param mixed  $post    POST fields as returned by get_from_dom()
 * @return string response body, '' when nothing was fetched
 */
function _verif_url_fetch_http($urlstr, $method, $referer, $post)
{
	global $cookies;

	$options = array(
		'timeout' => 2,
		'connecttimeout' => 2,
		'url' => $urlstr,
		'referer' => $referer,
		'redirect' => 0,
		'cookies' => $cookies,
		'cookiestore' => tempnam('/tmp/', 'tiki-tests'),
	);

	// Close the session to avoid timeout
	session_write_close();

	$info = null;
	$buffer = '';
	switch ($method) {
		case 'get':
			$buffer = http_get($urlstr, $options, $info);
			break;
		case 'post':
			$buffer = http_post_fields($urlstr, $post, NULL, $options, $info);
			break;
	}
	if (!is_string($buffer) || $buffer === '') {
		return '';
	}

	$headers = http_parse_headers($buffer);
	if (is_array($headers) && isset($headers['Set-Cookie'])) {
		// A single Set-Cookie header is reported as a string, several as an array.
		foreach ((array) $headers['Set-Cookie'] as $c) {
			TikiLib::parse_str($c, $cookies);
		}
	}

	$message = http_parse_message($buffer);
	return $message ? (string) $message->body : '';
}

/**
 * Replay a request with cURL and return the response body.
 *
 * The global $cookies are sent with the request and merged with the cookies
 * announced by the response. Redirects are not followed.
 *
 * @param string $urlstr  target URL
 * @param string $method  lower-cased HTTP method
 * @param string $referer value for the Referer header
 * @param mixed  $post    POST fields as returned by get_from_dom()
 * @return string response body, '' when the transfer failed
 */
function _verif_url_fetch_curl($urlstr, $method, $referer, $post)
{
	global $cookies;

	// Normalise before iterating; the global may not have been initialised yet.
	if (!is_array($cookies)) {
		$cookies = array();
	}

	$curl = curl_init();
	curl_setopt($curl, CURLOPT_URL, $urlstr);
	curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 2);
	curl_setopt($curl, CURLOPT_TIMEOUT, 2);
	// NOTE(review): peer verification is disabled; only acceptable if replay
	// targets are trusted hosts - confirm.
	curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
	curl_setopt($curl, CURLOPT_HEADER, true);
	curl_setopt($curl, CURLOPT_REFERER, $referer);
	curl_setopt($curl, CURLOPT_FOLLOWLOCATION, false);
	curl_setopt($curl, CURLOPT_USERAGENT, 'TikiTest');

	// We deal with the cookies
	$cookies_string = '';
	foreach ($cookies as $c => $v) {
		$cookies_string .= "{$c}={$v}; path=/;";
	}
	curl_setopt($curl, CURLOPT_COOKIE, $cookies_string);

	switch ($method) {
		case 'get':
			curl_setopt($curl, CURLOPT_HTTPGET, true);
			break;
		case 'post':
			curl_setopt($curl, CURLOPT_POST, true);
			// URL-encode the fields, as the pecl_http path does for the same array.
			$post_string = is_array($post) ? http_build_query($post, '', '&') : '';
			curl_setopt($curl, CURLOPT_POSTFIELDS, $post_string);
			break;
	}

	// Close the session to avoid timeout
	session_write_close();

	$http_response = curl_exec($curl);
	$header_size = curl_getinfo($curl, CURLINFO_HEADER_SIZE);
	curl_close($curl);

	if ($http_response === false) {
		return '';
	}

	$header = (string) substr($http_response, 0, $header_size);
	$body = (string) substr($http_response, $header_size);

	// Header names are case-insensitive and a cookie may have no attributes,
	// so capture "name=value" up to the first ';' or the end of the line.
	preg_match_all('/Set-Cookie:[ \t]*([^;\r\n]*)/i', $header, $cookies_array);
	$cookies_tmp = '';
	foreach ($cookies_array[1] as $c) {
		$cookies_tmp .= "&{$c}";
	}
	$cookies_titi = array();
	TikiLib::parse_str($cookies_tmp, $cookies_titi);
	if (is_array($cookies_titi)) {
		$cookies = array_merge($cookies, $cookies_titi);
	}

	return $body;
}

/**
 * Build an HTML fragment holding only the nodes of $html matched by $xpath.
 *
 * @param string $html  markup to search
 * @param string $xpath XPath expression selecting the nodes to keep
 * @return string serialized fragment; only the empty wrapper when nothing
 *                matches or the markup cannot be loaded
 */
function _verif_url_extract_xpath($html, $xpath)
{
	$fragment = new DOMDocument('1.0');
	$root = $fragment->appendChild($fragment->createElement('html'));
	// NOTE(review): the inner wrapper is also named 'html' (not 'body'); it is
	// kept as is because verif_url() rewrites every <html> tag into a <div>.
	$wrapper = $root->appendChild($fragment->createElement('html'));

	$html = (string) $html;
	if (trim($html) != '') {
		// loadHTML() is an instance method; calling it statically is an Error on PHP 8.
		$source = new DOMDocument();
		if ($source->loadHTML($html)) {
			$finder = new DOMXPath($source);
			$matches = $finder->query($xpath);
			// query() returns false for an invalid expression.
			if ($matches !== false) {
				foreach ($matches as $node) {
					$wrapper->appendChild($fragment->importNode($node, TRUE));
				}
			}
		}
	}

	return $fragment->saveHTML();
}
<?php
// Diagnose a bare document without a doctype and print tidy's error buffer
// with carriage returns removed.
$bareDoc = tidy_parse_string('<HTML></HTML>');
$bareDiagnosis = tidy_diagnose($bareDoc);
var_dump($bareDiagnosis);
$bareReport = tidy_get_error_buffer($bareDoc);
echo str_replace("\r", "", $bareReport);

// Repeat the diagnosis for a document that declares an HTML 3.2 doctype;
// this time the error buffer is printed unmodified.
$markup = <<<'HTML'
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
<head><title>foo</title></head>
<body><p>hello</p></body>
</html>
HTML;
$typedDoc = tidy_parse_string($markup);
$typedDiagnosis = tidy_diagnose($typedDoc);
var_dump($typedDiagnosis);
echo tidy_get_error_buffer($typedDoc);