Example #1
0
/**
 * Run an HTML fragment through Tidy and return the repaired inner markup.
 *
 * The document is parsed as UTF-8, repaired, and the serialised text of the
 * second child of the root <html> node is taken (presumably <body> - confirm).
 * The first and last seven bytes of that text are then dropped, which
 * presumably strips the wrapping element's own tags - verify against the
 * Tidy output format in use.
 *
 * @param string $html Markup to repair.
 * @return string The trimmed, repaired markup.
 */
function tidyHtml($html)
{
    $options = [
        "indent" => 2,
        "clean" => false,
        "char-encoding" => "utf8",
    ];

    $document = new tidy();
    $document->parseString($html, $options, 'utf8');
    $document->cleanRepair();

    // Serialised text of the second child of <html>.
    $wrapped = $document->html()->child[1]->value;

    // Drop 7 bytes from each end of the serialised node.
    return substr($wrapped, 7, -7);
}
 /**
  * Repair HTML with Tidy and wrap the result in a SimpleXMLElement.
  *
  * Tidy is asked for XML output with numeric entities and without comments;
  * the root <html> node it produces is handed to SimpleXMLElement, which
  * converts it to its string form.
  *
  * @param string $html Markup to convert (parsed as UTF-8).
  * @return SimpleXMLElement Parsed XML tree of the repaired document.
  */
 function GetXML($html)
 {
     $tidyOptions = array(
         'output-xml' => true,
         'numeric-entities' => true,
         'hide-comments' => true,
     );

     $parser = new tidy();
     $parser->parseString($html, $tidyOptions, 'utf8');
     $parser->cleanRepair();

     // The tidy node is converted to a string by the SimpleXMLElement constructor.
     $rootNode = $parser->html();

     return new SimpleXMLElement($rootNode);
 }
Example #3
0
    /**
     * Repair raw markup with Tidy and expose it as a SimpleXML tree.
     *
     * The data is parsed by Tidy as UTF-8, repaired, loaded into a DOMDocument
     * with the HTML parser and finally imported into SimpleXML.
     *
     * @param string $data Raw markup (treated as UTF-8).
     * @return mixed Whatever simplexml_import_dom() returns for the loaded document.
     */
    static private function rawToSimpleXML($data)
    {
        // Tidy configuration
        $tidy_config = array(
            'input-encoding' => 'utf-8',
            'output-encoding' => 'utf8',
            'output-xml' => TRUE,
            'add-xml-decl' => TRUE,
            'hide-comments' => TRUE
        );

        // Load the data and repair markup errors
        $tidy = new tidy();
        $tidy->parseString($data, $tidy_config, $tidy_config['output-encoding']);
        $tidy->cleanRepair();
        $tidy_out = $tidy->html()->value;
        unset($tidy);

        // Initialise the DOM. Parser diagnostics are buffered (and discarded)
        // instead of being silenced with "@", and the previous libxml error
        // mode is restored afterwards.
        $previous_errors = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $dom->strictErrorChecking = FALSE;
        // Tidy emits UTF-8, but loadHTML() does not assume UTF-8 when the
        // markup carries no charset declaration. The XML encoding hint makes
        // the HTML parser decode the bytes as UTF-8 so non-ASCII text survives.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $tidy_out);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_errors);

        // Initialise SimpleXML from the DOM document
        $simplexml = simplexml_import_dom($dom);
        unset($dom);

        return $simplexml;
    }
Example #4
0
 /**
  * Get URL info
  *
  * Returns the cached info for this URL when the cache holds an entry.
  * Otherwise the page is downloaded with cURL, repaired with Tidy, its first
  * <title> element is read, and the JSON-encoded result is stored in the cache.
  *
  * @return array  Info about the URL (only the 'title' key is produced here)
  */
 public function get()
 {
     $cached = $this->server->get($this->url_key);
     if ($cached !== false) {
         return json_decode($cached, TRUE);
     }

     // Transfer options, applied one by one in this order.
     $transferOptions = array(
         CURLOPT_URL => $this->url,
         CURLOPT_RETURNTRANSFER => true,
         CURLOPT_FILETIME => true,
         CURLOPT_AUTOREFERER => true,
         CURLOPT_FOLLOWLOCATION => true,
         CURLOPT_MAXREDIRS => 6,
         CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
     );
     foreach ($transferOptions as $option => $value) {
         curl_setopt($this->curl, $option, $value);
     }
     $markup = curl_exec($this->curl);
     curl_close($this->curl);

     // Repair the downloaded markup before handing it to the DOM parser.
     $cleaner = new tidy();
     $cleaner->parseString($markup, array('clean' => true, 'output-html' => true, 'wrap' => 78, 'quiet' => 1));
     $cleaner->cleanRepair();
     $markup = $cleaner->html()->value;

     // Buffer libxml errors instead of emitting warnings while parsing.
     $previousMode = libxml_use_internal_errors(true);
     $document = new DOMDocument();
     $document->loadHTML($markup);
     $finder = new DOMXPath($document);
     $title = $finder->evaluate('//*[name()="title"]')->item(0)->nodeValue;
     // Discard the buffered errors and restore the previous error mode.
     libxml_clear_errors();
     libxml_use_internal_errors($previousMode);

     $payload = json_encode(array('title' => $title));
     $this->server->add($this->url_key, $payload, MEMCACHE_COMPRESSED, self::CACHE_LIMIT);

     return json_decode($payload, TRUE);
 }
 /**
  * Add an HTML document as one or more spine items.
  *
  * Optionally repairs the markup with Tidy, derives a title from the
  * document's <head><title> when none is given, rewrites <img> sources via
  * img_callback(), and then adds the markup either whole or split into parts.
  *
  * @param string $html   Markup to add.
  * @param string $title  Title to use; when empty it is taken from the document,
  *                       falling back to 'Untitled'.
  * @param array  $config Control flags 'tidy', 'split' and 'toc'; any other
  *                       entries are passed to Tidy as its configuration.
  * @return string The title that was used.
  */
 public function add_html($html, $title, $config)
 {
     if (!empty($config['tidy'])) {
         // The control flags consumed by this method are not Tidy options;
         // passing them through makes Tidy reject them as unknown.
         $tidy_options = array_diff_key($config, array('tidy' => true, 'split' => true, 'toc' => true));
         $tidy = new tidy();
         $tidy->parseString($html, $tidy_options, 'utf8');
         $tidy->cleanRepair();
         $html = $tidy->html()->value;
     }
     $doc = new DOMDocument();
     @$doc->loadHTML($html);
     if (!$title) {
         $title = 'Untitled';
         // A DOMNodeList is always truthy, so test the first item itself:
         // item(0) is null when the document has no <head> or no <title>.
         $head = $doc->getElementsByTagName('head')->item(0);
         if ($head !== null) {
             $title_node = $head->getElementsByTagName('title')->item(0);
             if ($title_node !== null) {
                 $title = $title_node->nodeValue;
             }
         }
     }
     // Handle <img> tags
     $html = preg_replace_callback('~(<img [^>]*?)src=([\'"])(.+?)[\'"]~', array($this, 'img_callback'), $html);
     if (!empty($config['split'])) {
         $splits = $this->split($html);
         $first = TRUE;
         foreach ($splits as $split) {
             $this->add_spine_item($split[0], $split[1]);
             if (!empty($config['toc'])) {
                 // Only the first part carries the title in the TOC.
                 if ($first) {
                     $this->set_item_toc($title, TRUE, FALSE);
                 } else {
                     $this->set_item_toc(NULL, TRUE, TRUE);
                 }
                 $first = FALSE;
             }
         }
     } else {
         $this->add_spine_item($html);
         if (!empty($config['toc'])) {
             $this->set_item_toc($title, TRUE);
         }
     }
     return $title;
 }
Example #6
0
/**
 * Repair syntax errors in HTML.
 *
 * Parses the markup with Tidy using the given encoding, runs the repair
 * pass, and returns the root <html> node of the repaired document.
 *
 * @param string $html     Markup to repair.
 * @param string $encoding Tidy encoding name used for parsing.
 * @return tidyNode Root <html> node of the repaired document.
 */
function cy_html_repair($html, $encoding = 'UTF8')
{
    $options = array(
        'clean' => true,
        'output-xml' => true,
        'output-xhtml' => true,
        'wrap' => 200,
    );

    $repairer = new tidy();
    $repairer->parseString($html, $options, $encoding);
    // Fix the HTML.
    $repairer->cleanRepair();

    return $repairer->html();
}
 /**
  * Return an array containing a formatted XHTML string
  * created from the HTML response of the given URL.
  * array[code] => HTTP status code
  * array[headers] => HTTP headers
  * array[body] => formatted XHTML string made from the entity body
  * Throws an exception on error.
  *
  * @param  string  $url
  * @param  integer $cache_lifetime
  * @param  boolean $conditional_request
  * @param  array   $headers
  * @param  array   $post
  * @return array
  * @throws Exception
  */
 public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
 {
     /*
      * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
      */
     if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
         throw new Exception("Not a valid or fully qualified HTTP URL.");
     }
     $data = false;
     $cache_lifetime = (int) $cache_lifetime;
     /*
      * "&&" is required here: "=" binds tighter than "and", so with "and"
      * the cache lifetime would not take part in the assigned value.
      */
     $use_cache = !empty($this->cacheDir) && $cache_lifetime > 0;
     if ($use_cache) {
         $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
         $params = array();
         foreach ($headers as $key => $value) {
             if (!empty($value)) {
                 $params[] = urlencode($key) . '=' . urlencode($value);
             }
         }
         foreach ($post as $key => $value) {
             $params[] = urlencode($key) . '=' . urlencode($value);
         }
         $cache_id = "{$url}?" . implode('&', $params);
         if (false !== ($data = $cache->get($cache_id))) {
             $data = unserialize($data);
         }
     }
     /*
      * Access to the URL if not cached
      * or if the cache has either Last-Modified or Etag header
      * and conditional request is specified.
      *
      * NOTE(review): the condition below disables the conditional request
      * unless BOTH headers are cached, which is stricter than "either" as
      * stated above - confirm which one is intended.
      */
     if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
         $conditional_request = false;
     }
     if (!$data or $conditional_request) {
         if (isset($data['headers']['last-modified']) and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
             $headers['last-modified'] = $data['headers']['last-modified'];
         }
         if (isset($data['headers']['etag']) and (!isset($headers['etag']) or empty($headers['etag']))) {
             $headers['etag'] = $data['headers']['etag'];
         }
         try {
             $response = $this->getHttpResponse($url, $headers, $post);
         } catch (Exception $e) {
             /*
              * A failed request is only fatal when there is no cached
              * copy to fall back on.
              */
             if (!$data) {
                 throw $e;
             }
         }
         /*
          * Use cache if the responded HTTP status code is 304.
          * If 200, format the responded HTML of the given URL to XHTML.
          */
         if (!$data or isset($response['code']) and $response['code'] != 304) {
             $data =& $response;
             /*
              * If status code was 200 and Content-Type was not (X)HTML,
              * the status code was forcibly altered to 204.
              * @see HTTP_Request_Listener_Extended->update().
              */
             if ($data['code'] != 200 and $data['code'] != 204) {
                 throw new Exception("Responded HTTP Status Code is {$data['code']}.");
             } elseif (isset($data['headers']['content-type']) and !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                 throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
             } elseif (empty($data['body'])) {
                 throw new Exception("Responded entity body is empty.");
             } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                 throw new Exception("Responded entity body does not contain a markup symbol.");
             } elseif (false !== strpos($matches[0], "\0")) {
                 /*
                  * The needle must be a NUL byte: an empty needle matches at
                  * offset 0 on PHP 8 and would reject every response.
                  */
                 throw new Exception("Responded entity body contains NULL.");
             }
             /*
              * Remove BOM and NULLs.
              */
             $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
             $data['body'] = str_replace("\0", '', $data['body']);
             /*
              * Initialize the backups.
              */
             $this->backup = array();
             $this->backup_count = 0;
             /*
              * SCRIPT and STYLE elements are removed entirely.
              * (Wrapping their content in CDATA instead would be an
              * alternative, but script methods such as document.write
              * are not compliant with XHTML/XML.)
              */
             $tags = array('script', 'style');
             foreach ($tags as $tag) {
                 $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
             }
             /*
              * Backup CDATA sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);
             /*
              * Comment section must not contain two or more adjacent hyphens.
              * (Closures replace create_function(), which no longer exists in PHP 8.)
              */
             $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', function ($matches) {
                 return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
             }, $data['body']);
             /*
              * Backup comment sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);
             /*
              * Process tags that is potentially dangerous for XML parsers.
              */
             $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', function ($matches) {
                 return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
             }, $data['body']);
             $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', function ($matches) {
                 return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
             }, $data['body']);
             $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', function ($matches) {
                 return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
             }, $data['body']);
             /*
              * Remove DTD declarations, wrongly placed comments etc.
              * This must be done before removing DOCTYPE.
              */
             $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
             /*
              * XML and DOCTYPE declaration will be replaced.
              */
             $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
             $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
             if (preg_match('/^\\s*$/s', $data['body'])) {
                 throw new Exception('The entity body became empty after preprocessing.');
             }
             /*
              * Detect character encoding and convert to UTF-8.
              * The Content-Type header is consulted first, then any
              * <meta http-equiv="content-type"> element in the body.
              */
             $encoding = false;
             if (isset($data['headers']['content-type'])) {
                 $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
             }
             if (!$encoding and preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                 foreach ($matches[0] as $value) {
                     if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' and false !== ($encoding = $this->getAttribute('content', $value))) {
                         $encoding = $this->getCharsetFromCType($encoding);
                         break;
                     }
                 }
             }
             /*
              * Use mbstring to convert character encoding if available.
              * Otherwise use iconv (iconv may try to detect character encoding automatically).
              * Do not trust the declared encoding and do conversion even if UTF-8.
              */
             if (extension_loaded('mbstring')) {
                 if (!$encoding) {
                     @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                     if (false === ($encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body'])))) {
                         throw new Exception('Failed detecting character encoding.');
                     }
                 }
                 @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
             } else {
                 if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                     throw new Exception('Failed converting character encoding.');
                 }
                 foreach ($this->backup as $key => $value) {
                     if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                         throw new Exception('Failed converting character encoding.');
                     }
                 }
             }
             /*
              * Restore CDATAs and comments.
              */
             for ($i = 0; $i < $this->backup_count; $i++) {
                 $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
             }
             /*
              * Use Tidy to format HTML if available.
              * Otherwise, use HTMLParser class (is slower and consumes much memory).
              */
             if (extension_loaded('tidy')) {
                 $tidy = new tidy();
                 $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $data['body'] = $tidy->html();
             } else {
                 require_once 'HTMLParser.class.php';
                 $parser = new HTMLParser();
                 $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                 $parser->setRule($format_rule);
                 $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                 $parser->setGenericParent('body');
                 $parser->parse($data['body']);
                 $data['body'] = $parser->dump();
             }
             /*
              * Valid XHTML DOCTYPE declaration (with DTD URI) is required
              * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
              */
             $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
             $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
             $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
             $data['body'] = "{$declarations}{$data['body']}";
             if ($use_cache) {
                 $cache->save(serialize($data), $cache_id);
             }
         }
     }
     return $data;
 }