/**
 * Repair an HTML fragment with Tidy and return the trimmed body markup.
 *
 * The input is parsed as UTF-8 with "indent" set to 2 and "clean" disabled.
 * After repair, the serialized text of the second child of the <html> node
 * is taken and its first 7 and last 7 bytes are cut off (presumably the
 * "<body>" wrapper emitted by Tidy -- TODO confirm against the Tidy version
 * in use).
 *
 * @param string $html Markup to repair.
 * @return string Repaired markup with 7 bytes removed from each end.
 */
function tidyHtml($html)
{
    $options = array(
        "indent" => 2,
        "clean" => false,
        "char-encoding" => "utf8",
    );

    $document = new tidy();
    $document->parseString($html, $options, 'utf8');
    $document->cleanRepair();

    // Second child of the <html> node.
    $root = $document->html();
    $serialized = $root->child[1]->value;

    return substr($serialized, 7, -7);
}
/**
 * Repair HTML with Tidy and load the result as a SimpleXMLElement.
 *
 * Tidy is configured for XML output with numeric entities and hidden
 * comments, and the input is parsed as UTF-8.
 *
 * @param string $html Markup to repair.
 * @return SimpleXMLElement Element built from the repaired <html> node.
 * @throws Exception When the repaired markup cannot be parsed as XML
 *                   (raised by the SimpleXMLElement constructor).
 */
function GetXML($html)
{
    // Specify configuration
    $config = array('output-xml' => true, 'numeric-entities' => true, 'hide-comments' => true);

    // Tidy
    $tidy = new tidy();
    $tidy->parseString($html, $config, 'utf8');
    $tidy->cleanRepair();

    // Hand SimpleXMLElement an actual string instead of relying on the
    // implicit tidyNode-to-string conversion. A missing root yields an empty
    // string, which the constructor rejects with an Exception as before.
    $root = $tidy->html();
    $xHTML = ($root !== null) ? $root->value : '';

    return new SimpleXMLElement($xHTML);
}
/**
 * Turn raw markup into a SimpleXML tree.
 *
 * The data is repaired by Tidy, the serialized <html> node is loaded into a
 * DOMDocument through its HTML loader (warnings suppressed), and the DOM is
 * then imported into SimpleXML.
 *
 * @param string $data Raw markup.
 * @return mixed Whatever simplexml_import_dom() returns for the loaded DOM.
 */
static private function rawToSimpleXML($data)
{
    // Tidy configuration
    $options = array(
        'input-encoding' => 'utf-8',
        'output-encoding' => 'utf8',
        'output-xml' => TRUE,
        'add-xml-decl' => TRUE,
        'hide-comments' => TRUE
    );

    // Load the data and repair its errors
    $repairer = new tidy();
    $repairer->parseString($data, $options, $options['output-encoding']);
    $repairer->cleanRepair();
    $markup = $repairer->html()->value;
    unset($repairer);

    // Build the XML DOM
    $document = new DOMDocument();
    $document->strictErrorChecking = FALSE;
    @$document->loadHTML($markup);

    // Build the SimpleXML view of the DOM
    $result = simplexml_import_dom($document);
    unset($document);

    return $result;
}
/**
 * Get URL info
 *
 * Looks up $this->url_key in the cache server first. On a miss the URL is
 * fetched with cURL (following up to 6 redirects), repaired with Tidy, and
 * the text of the first <title> element is extracted. The result is stored
 * in the cache as JSON and returned.
 *
 * @return array Info about the URL; the freshly built entry holds only the
 *               'title' key (null when the page has no title element).
 */
public function get()
{
    $res = $this->server->get($this->url_key);

    if ($res === false) {
        curl_setopt($this->curl, CURLOPT_URL, $this->url);
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->curl, CURLOPT_FILETIME, true);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($this->curl, CURLOPT_MAXREDIRS, 6);
        curl_setopt($this->curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10");

        $html = curl_exec($this->curl);
        curl_close($this->curl);

        // curl_exec() returns false on failure; make the empty-document
        // fallback explicit instead of relying on bool-to-string coercion.
        // NOTE(review): a failed fetch is still cached below -- confirm that
        // this is intended.
        if ($html === false) {
            $html = '';
        }

        $tidy_config = array('clean' => true, 'output-html' => true, 'wrap' => 78, 'quiet' => 1);
        $tidy = new tidy();
        $tidy->parseString($html, $tidy_config);
        $tidy->cleanRepair();
        $html = $tidy->html()->value;

        // Buffer DOM errors rather than emitting them as warnings
        $oldSetting = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->loadHTML($html);
        $xpath = new DOMXPath($dom);
        $titles = $xpath->evaluate('//*[name()="title"]');

        // item(0) is null when nothing matched; do not dereference it.
        $first_title = $titles->item(0);
        $title = ($first_title !== null) ? $first_title->nodeValue : null;

        // Clear any errors buffered while parsing
        libxml_clear_errors();

        // Revert error buffering to its previous setting
        libxml_use_internal_errors($oldSetting);

        $res = array('title' => $title);
        $res = json_encode($res);
        $this->server->add($this->url_key, $res, MEMCACHE_COMPRESSED, self::CACHE_LIMIT);
    }

    return json_decode($res, TRUE);
}
/**
 * Add an HTML document as one or more spine items.
 *
 * Optionally repairs the markup with Tidy, derives a title from the
 * document when none is given, rewrites <img> tags through img_callback(),
 * and then adds the markup to the spine, either whole or split into parts.
 *
 * @param string $html   Markup to add.
 * @param string $title  Title to use; when empty, the first <title> inside
 *                       the first <head> is used, falling back to 'Untitled'.
 * @param array  $config Switches read here: 'tidy' (repair first), 'split'
 *                       (split via $this->split()), 'toc' (register TOC
 *                       entries). Any other entries are handed to Tidy as
 *                       its configuration when 'tidy' is enabled.
 * @return string The title that was used.
 */
public function add_html($html, $title, $config)
{
    if (!empty($config['tidy'])) {
        // The switches consumed by this method are not meant as Tidy
        // options, so strip them before handing the rest over as Tidy's
        // configuration.
        $tidy_config = array_diff_key($config, array('tidy' => TRUE, 'split' => TRUE, 'toc' => TRUE));

        $tidy = new tidy();
        $tidy->parseString($html, $tidy_config, 'utf8');
        $tidy->cleanRepair();
        $html = $tidy->html()->value;
    }

    $doc = new DOMDocument();
    @$doc->loadHTML($html);

    if (!$title) {
        $title = 'Untitled';

        // A DOMNodeList object is always truthy, so test its length to find
        // out whether anything matched.
        $heads = $doc->getElementsByTagName('head');
        if ($heads->length > 0) {
            $titles = $heads->item(0)->getElementsByTagName('title');
            if ($titles->length > 0) {
                $title = $titles->item(0)->nodeValue;
            }
        }
    }

    // Check images
    // Handle <img> tags
    $html = preg_replace_callback('~(<img [^>]*?)src=([\'"])(.+?)[\'"]~', array($this, 'img_callback'), $html);

    $with_toc = !empty($config['toc']);

    if (!empty($config['split'])) {
        $splits = $this->split($html);
        $first = TRUE;
        foreach ($splits as $split) {
            $this->add_spine_item($split[0], $split[1]);
            if ($with_toc) {
                if ($first) {
                    // Only the first part carries the document title.
                    $this->set_item_toc($title, TRUE, FALSE);
                } else {
                    $this->set_item_toc(NULL, TRUE, TRUE);
                }
                $first = FALSE;
            }
        }
    } else {
        $this->add_spine_item($html);
        if ($with_toc) {
            $this->set_item_toc($title, TRUE);
        }
    }

    return $title;
}
/**
 * Repair syntax errors in HTML.
 *
 * Parses the markup with Tidy ("clean", "output-xml" and "output-xhtml"
 * enabled, "wrap" set to 200), runs the repair step and returns the node
 * obtained from tidy::html().
 *
 * @param string $html     Markup to repair.
 * @param string $encoding Encoding name handed to tidy::parseString().
 * @return tidyNode|null The <html> node of the repaired document.
 */
function cy_html_repair($html, $encoding = 'UTF8')
{
    $repairer = new tidy();
    $repairer->parseString(
        $html,
        array('clean' => true, 'output-xml' => true, 'output-xhtml' => true, 'wrap' => 200),
        $encoding
    );

    // fix html
    $repairer->cleanRepair();

    $root = $repairer->html();

    return $root;
}
/**
 * Return an array containing a formatted XHTML string
 * created from the HTML returned for the given URL.
 *   array[code]    => HTTP status code
 *   array[headers] => HTTP headers
 *   array[body]    => formatted XHTML string made from the entity body
 * Throws an exception on error.
 *
 * When $this->cacheDir is set and $cache_lifetime is positive, results are
 * cached with Cache_Lite under an id built from the URL, the non-empty
 * request headers and the POST fields.
 *
 * @param string $url                  Fully qualified http(s) URL.
 * @param integer $cache_lifetime      Cache lifetime; 0 disables caching.
 * @param boolean $conditional_request Revalidate a cached entry using its
 *                                     Last-Modified / ETag headers.
 * @param array $headers               Request headers.
 * @param array $post                  POST fields.
 * @return array
 * @throws Exception
 */
public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
{
    /*
     * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
     */
    if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
        throw new Exception("Not a valid or fully qualified HTTP URL.");
    }

    $data = false;
    $cache_lifetime = (int) $cache_lifetime;

    // Parenthesised on purpose: "$x = a and b" assigns only "a", because "="
    // binds tighter than "and".
    $use_cache = (!empty($this->cacheDir) && $cache_lifetime > 0);

    if ($use_cache) {
        $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
        $params = array();
        foreach ($headers as $key => $value) {
            if (!empty($value)) {
                $params[] = urlencode($key) . '=' . urlencode($value);
            }
        }
        foreach ($post as $key => $value) {
            $params[] = urlencode($key) . '=' . urlencode($value);
        }
        $cache_id = "{$url}?" . implode('&', $params);
        if (false !== ($data = $cache->get($cache_id))) {
            $data = unserialize($data);
        }
    }

    /*
     * Access the URL if not cached,
     * or if the cache has either a Last-Modified or an ETag header
     * and a conditional request is specified.
     */
    if ($conditional_request
        && !isset($data['headers']['last-modified'])
        && !isset($data['headers']['etag'])
    ) {
        // Neither validator is cached, so there is nothing to revalidate with.
        $conditional_request = false;
    }

    if (!$data || $conditional_request) {
        // NOTE(review): the cached validators are forwarded under the keys
        // 'last-modified' and 'etag'; whether getHttpResponse() maps them to
        // If-Modified-Since / If-None-Match is not visible here -- confirm.
        if (isset($data['headers']['last-modified']) && empty($headers['last-modified'])) {
            $headers['last-modified'] = $data['headers']['last-modified'];
        }
        if (isset($data['headers']['etag']) && empty($headers['etag'])) {
            $headers['etag'] = $data['headers']['etag'];
        }

        $response = null;
        try {
            $response = $this->getHttpResponse($url, $headers, $post);
        } catch (Exception $e) {
            // With a cached copy available, a failed request falls back to it.
            if (!$data) {
                throw $e;
            }
        }

        /*
         * Use the cache if the HTTP status code is 304.
         * Otherwise format the returned HTML of the given URL as XHTML.
         */
        if (!$data || (isset($response['code']) && $response['code'] != 304)) {
            $data = $response;

            /*
             * If the status code was 200 and the Content-Type was not (X)HTML,
             * the status code was forcibly altered to 204.
             * @see HTTP_Request_Listener_Extended->update().
             */
            if ($data['code'] != 200 and $data['code'] != 204) {
                throw new Exception("Responded HTTP Status Code is {$data['code']}.");
            } elseif (isset($data['headers']['content-type']) and !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
            } elseif (empty($data['body'])) {
                throw new Exception("Responded entity body is empty.");
            } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                throw new Exception("Responded entity body does not contain a markup symbol.");
            } elseif (false !== strpos($matches[0], "\0")) {
                // A NUL byte inside the first tag is treated as binary content.
                throw new Exception("Responded entity body contains NULL.");
            }

            /*
             * Remove BOM and NULs.
             */
            $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
            $data['body'] = str_replace("\0", '', $data['body']);

            /*
             * Initialize the backups.
             */
            $this->backup = array();
            $this->backup_count = 0;

            /*
             * SCRIPT and STYLE elements are removed outright.
             * (Wrapping their content in CDATA would be an alternative, but
             * script such as document.write is not compliant with XHTML/XML.)
             */
            $tags = array('script', 'style');
            foreach ($tags as $tag) {
                $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
            }

            /*
             * Back up CDATA sections for later processing.
             */
            $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);

            /*
             * A comment must not contain two or more adjacent hyphens.
             */
            $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', function ($matches) {
                return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
            }, $data['body']);

            /*
             * Back up comments for later processing.
             */
            $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);

            /*
             * Process tags that are potentially dangerous for XML parsers:
             * their raw text content must not contain a literal "<".
             */
            $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', function ($matches) {
                return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
            }, $data['body']);
            $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', function ($matches) {
                return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
            }, $data['body']);
            $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', function ($matches) {
                return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
            }, $data['body']);

            /*
             * Remove DTD declarations, wrongly placed comments etc.
             * This must be done before removing DOCTYPE.
             */
            $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);

            /*
             * XML and DOCTYPE declarations will be replaced.
             */
            $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
            $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
            if (preg_match('/^\\s*$/s', $data['body'])) {
                throw new Exception('The entity body became empty after preprocessing.');
            }

            /*
             * Detect the character encoding and convert to UTF-8.
             * The Content-Type header wins; otherwise look for a
             * <meta http-equiv="content-type"> element.
             */
            $encoding = false;
            if (isset($data['headers']['content-type'])) {
                $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
            }
            if (!$encoding and preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                foreach ($matches[0] as $value) {
                    if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' and false !== ($encoding = $this->getAttribute('content', $value))) {
                        $encoding = $this->getCharsetFromCType($encoding);
                        break;
                    }
                }
            }

            /*
             * Use mbstring to convert the character encoding if available.
             * Otherwise use iconv (iconv may try to detect the encoding automatically).
             * Do not trust the declared encoding; convert even if it is UTF-8.
             */
            if (extension_loaded('mbstring')) {
                if (!$encoding) {
                    @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                    // Check the detection result before asking for its MIME
                    // name, so a failed detection raises the Exception below.
                    $detected = @mb_detect_encoding($data['body']);
                    if (false === $detected || false === ($encoding = @mb_preferred_mime_name($detected))) {
                        throw new Exception('Failed detecting character encoding.');
                    }
                }
                @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
            } else {
                if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                    throw new Exception('Failed converting character encoding.');
                }
                foreach ($this->backup as $key => $value) {
                    if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                        throw new Exception('Failed converting character encoding.');
                    }
                }
            }

            /*
             * Restore CDATA sections and comments.
             */
            for ($i = 0; $i < $this->backup_count; $i++) {
                $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
            }

            /*
             * Use Tidy to format the HTML if available.
             * Otherwise use the HTMLParser class (slower, uses much more memory).
             */
            if (extension_loaded('tidy')) {
                $tidy = new tidy();
                $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                $tidy->cleanRepair();
                // Serialize the node here so the body is a plain string.
                $data['body'] = (string) $tidy->html();
            } else {
                require_once 'HTMLParser.class.php';
                $parser = new HTMLParser();
                $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                $parser->setRule($format_rule);
                $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                $parser->setGenericParent('body');
                $parser->parse($data['body']);
                $data['body'] = $parser->dump();
            }

            /*
             * A valid XHTML DOCTYPE declaration (with DTD URI) is required
             * for SimpleXMLElement->asXML() to produce proper XHTML tags.
             */
            $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
            $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
            $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
            $data['body'] = "{$declarations}{$data['body']}";

            if ($use_cache) {
                $cache->save(serialize($data), $cache_id);
            }
        }
    }

    return $data;
}