/**
 * Fetch the given URL and return the response with its entity body
 * converted to a formatted XHTML string.
 *
 * The returned array contains:
 *   array['code']    => HTTP status code
 *   array['headers'] => HTTP headers
 *   array['body']    => formatted XHTML string made from the entity body
 *
 * When a cache directory is configured and $cache_lifetime is positive,
 * the processed result is stored in (and served from) a Cache_Lite cache
 * keyed by the URL, the non-empty request headers and the POST fields.
 *
 * @param string  $url                 Fully qualified http:// or https:// URL.
 * @param integer $cache_lifetime      Cache lifetime passed to Cache_Lite; 0 disables caching.
 * @param boolean $conditional_request If true and the cached entry carries a Last-Modified
 *                                     or ETag header, the URL is requested again with those
 *                                     validators and the cache is reused on a 304 response.
 * @param array   $headers             Additional request headers (name => value).
 * @param array   $post                POST fields (name => value).
 * @return array
 * @throws Exception If the URL is not acceptable, the request fails and no cached
 *                   copy exists, the response is not a usable (X)HTML document,
 *                   or character encoding detection/conversion fails.
 */
public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
{
    /*
     * Only the scheme and the beginning of the host name are validated.
     * Character ranges noted by the original author (currently unused):
     * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
     */
    if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
        throw new Exception("Not a valid or fully qualified HTTP URL.");
    }

    $data = false;
    $cache_lifetime = (int) $cache_lifetime;

    /*
     * "&&" is required here: with the low-precedence "and" the assignment
     * would be evaluated first and $cache_lifetime would be ignored.
     */
    $use_cache = !empty($this->cacheDir) && $cache_lifetime > 0;

    if ($use_cache) {
        $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));

        /*
         * The cache ID is built from the URL, the non-empty request headers
         * and all POST fields.
         */
        $params = array();
        foreach ($headers as $key => $value) {
            if (!empty($value)) {
                $params[] = urlencode($key) . '=' . urlencode($value);
            }
        }
        foreach ($post as $key => $value) {
            $params[] = urlencode($key) . '=' . urlencode($value);
        }
        $cache_id = "{$url}?" . implode('&', $params);

        /*
         * NOTE(review): unserialize() is applied to the cache content, so the
         * cache directory must not be writable by untrusted parties.
         */
        if (false !== ($data = $cache->get($cache_id))) {
            $data = unserialize($data);
        }
    }

    /*
     * Access the URL if not cached, or if the cache has either a
     * Last-Modified or an ETag header and a conditional request is specified.
     * Without any validator a conditional request is pointless.
     */
    if ($conditional_request
        && !isset($data['headers']['last-modified'])
        && !isset($data['headers']['etag'])
    ) {
        $conditional_request = false;
    }

    if (!$data || $conditional_request) {
        /*
         * Hand the cached validators over to the request unless the caller
         * supplied own values.
         * NOTE(review): they are passed under the names "last-modified" and
         * "etag"; presumably getHttpResponse() maps them to
         * If-Modified-Since / If-None-Match - confirm.
         */
        if (isset($data['headers']['last-modified']) && empty($headers['last-modified'])) {
            $headers['last-modified'] = $data['headers']['last-modified'];
        }
        if (isset($data['headers']['etag']) && empty($headers['etag'])) {
            $headers['etag'] = $data['headers']['etag'];
        }

        /*
         * A failed request is tolerated only when a cached copy can be
         * returned instead.
         */
        $response = null;
        try {
            $response = $this->getHttpResponse($url, $headers, $post);
        } catch (Exception $e) {
            if (!$data) {
                throw $e;
            }
        }

        /*
         * Use the cache if the responded HTTP status code is 304.
         * Otherwise format the responded HTML of the given URL to XHTML.
         */
        if (!$data || (isset($response['code']) && $response['code'] != 304)) {
            $data = $response;

            /*
             * If status code was 200 and Content-Type was not (X)HTML,
             * the status code was forcibly altered to 204.
             * @see HTTP_Request_Listener_Extended->update().
             */
            if ($data['code'] != 200 && $data['code'] != 204) {
                throw new Exception("Responded HTTP Status Code is {$data['code']}.");
            } elseif (isset($data['headers']['content-type'])
                && !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])
            ) {
                throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
            } elseif (empty($data['body'])) {
                throw new Exception("Responded entity body is empty.");
            } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                throw new Exception("Responded entity body does not contain a markup symbol.");
            } elseif (false !== strpos($matches[0], "\0")) {
                /* A NUL byte inside the first tag means the body is not usable as is. */
                throw new Exception("Responded entity body contains NULL.");
            }

            /*
             * Remove BOM and NULLs.
             */
            $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
            $data['body'] = str_replace("\0", '', $data['body']);

            /*
             * Initialize the backups.
             */
            $this->backup = array();
            $this->backup_count = 0;

            /*
             * SCRIPT and STYLE elements are removed entirely.
             * Wrapping their content in CDATA sections would be an alternative,
             * but some JavaScript methods such as document.write are not
             * compliant with XHTML/XML.
             */
            $tags = array('script', 'style');
            foreach ($tags as $tag) {
                $data['body'] = preg_replace(
                    "/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si",
                    '',
                    $data['body']
                );
            }

            /*
             * Backup CDATA sections for later process.
             */
            $data['body'] = preg_replace_callback(
                '/<!\\[CDATA\\[.*?\\]\\]>/s',
                array($this, 'backup'),
                $data['body']
            );

            /*
             * Comment section must not contain two or more adjacent hyphens.
             */
            $data['body'] = preg_replace_callback(
                '/<!--(.*?)-->/si',
                function ($matches) {
                    return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
                },
                $data['body']
            );

            /*
             * Backup comment sections for later process.
             */
            $data['body'] = preg_replace_callback(
                '/<!--.*?-->/s',
                array($this, 'backup'),
                $data['body']
            );

            /*
             * Process tags that are potentially dangerous for XML parsers:
             * their raw text content may contain "<", which is escaped here.
             * XMP and PLAINTEXT are replaced by PRE at the same time.
             */
            $data['body'] = preg_replace_callback(
                '/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si',
                function ($matches) {
                    return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
                },
                $data['body']
            );
            $escape_as_pre = function ($matches) {
                return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
            };
            $data['body'] = preg_replace_callback(
                '/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si',
                $escape_as_pre,
                $data['body']
            );
            $data['body'] = preg_replace_callback(
                '/<plaintext\\b[^>]*?>(.*)$/si',
                $escape_as_pre,
                $data['body']
            );

            /*
             * Remove DTD declarations, wrongly placed comments etc.
             * This must be done before removing DOCTYPE.
             */
            $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);

            /*
             * XML and DOCTYPE declaration will be replaced.
             */
            $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
            $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);

            if (preg_match('/^\\s*$/s', $data['body'])) {
                throw new Exception('The entity body became empty after preprocessing.');
            }

            /*
             * Detect character encoding and convert to UTF-8.
             * The Content-Type response header takes precedence over
             * a META http-equiv declaration in the document.
             */
            $encoding = false;
            if (isset($data['headers']['content-type'])) {
                $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
            }
            if (!$encoding && preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                foreach ($matches[0] as $value) {
                    if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
                        && false !== ($encoding = $this->getAttribute('content', $value))
                    ) {
                        $encoding = $this->getCharsetFromCType($encoding);
                        break;
                    }
                }
            }

            /*
             * Use mbstring to convert character encoding if available.
             * Otherwise use iconv (iconv may try to detect character encoding automatically).
             * Do not trust the declared encoding and do conversion even if UTF-8.
             */
            if (extension_loaded('mbstring')) {
                if (!$encoding) {
                    @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                    /* Check the detection result before asking for its MIME name. */
                    $detected = @mb_detect_encoding($data['body']);
                    if (false === $detected
                        || false === ($encoding = @mb_preferred_mime_name($detected))
                    ) {
                        throw new Exception('Failed detecting character encoding.');
                    }
                }
                @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
            } else {
                if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                    throw new Exception('Failed converting character encoding.');
                }
                foreach ($this->backup as $key => $value) {
                    if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                        throw new Exception('Failed converting character encoding.');
                    }
                }
            }

            /*
             * Restore CDATAs and comments.
             */
            for ($i = 0; $i < $this->backup_count; $i++) {
                $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
            }

            /*
             * Use Tidy to format HTML if available.
             * Otherwise, use HTMLParser class (is slower and consumes much memory).
             */
            if (extension_loaded('tidy')) {
                $tidy = new tidy();
                $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                $tidy->cleanRepair();
                $data['body'] = $tidy->html();
            } else {
                require_once 'HTMLParser.class.php';
                $parser = new HTMLParser();
                $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                $parser->setRule($format_rule);
                $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                $parser->setGenericParent('body');
                $parser->parse($data['body']);
                $data['body'] = $parser->dump();
            }

            /*
             * Valid XHTML DOCTYPE declaration (with DTD URI) is required
             * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
             */
            $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
            $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
            $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
            $data['body'] = "{$declarations}{$data['body']}";

            if ($use_cache) {
                $cache->save(serialize($data), $cache_id);
            }
        }
    }

    return $data;
}