Exemplo n.º 1
0
 /**
  * Runs the text through HTML Tidy and returns the resulting body node.
  *
  * @param string $text Markup to clean up; parsed as UTF-8.
  *
  * @return string The serialized body node of the repaired document, or the
  * original text when the repaired document has no body node (for example a
  * frameset document).
  */
 protected function _tidy($text)
 {
     // parse and repair the markup with default Tidy options
     $tidy = new tidy();
     $tidy->parseString($text, array(), 'utf8');
     $tidy->cleanRepair();
     // tidy_get_body() yields null when the document has no <body>;
     // fall back to the input rather than reading a property of null
     $body = tidy_get_body($tidy);
     if (!$body) {
         return $text;
     }
     return $body->value;
 }
Exemplo n.º 2
0
/**
 * Normalizes an HTML fragment to XHTML using the Tidy extension.
 *
 * The markup is repaired, parsed again, and the serialized children of the
 * resulting body node are concatenated. The input is returned unchanged
 * whenever the Tidy functions are unavailable, the repair or parse step
 * fails, or the parsed document exposes no body children.
 *
 * @param string $html Markup to normalize; treated as UTF-8.
 *
 * @return string The normalized markup, or $html unchanged on any fallback.
 */
function html_standardization($html)
{
    if (!function_exists('tidy_repair_string')) {
        return $html;
    }
    $options = array('output-xhtml' => true);
    $repaired = tidy_repair_string($html, $options, 'utf8');
    if (!$repaired) {
        return $html;
    }
    $document = tidy_parse_string($repaired, $options, 'utf8');
    if (!$document) {
        return $html;
    }
    // A document without a <body> yields null here, and a body without
    // children exposes a non-array child property; both mean "nothing usable".
    $body = tidy_get_body($document);
    $nodes = $body ? $body->child : null;
    if (!is_array($nodes)) {
        return $html;
    }
    $standard_html = '';
    foreach ($nodes as $node) {
        $standard_html .= $node->value;
    }
    return $standard_html;
}
Exemplo n.º 3
0
	/**
	 * Clean up HTML code with the HTML Tidy library and return it together with its CSS.
	 * The style content found in the tidied head is collapsed into a single lower-cased block, prefixed with the default CSS, and prepended to the tidied body markup.
	 * @param $html (string) HTML code to fix
	 * @param $default_css (string) CSS commands placed before the extracted CSS
	 * @param $tagvs (array) parameters forwarded to setHtmlVSpace(); nothing is forwarded when left as an empty string
	 * @param $tidy_options (array) options for tidy_parse_string(); a built-in set is used when left as an empty string
	 * @return string a style block followed by the cleaned XHTML body code
	 * @author Nicola Asuni
	 * @public
	 * @since 5.9.017 (2010-11-16)
	 * @see setHtmlVSpace()
	 */
	public function fixHTMLCode($html, $default_css='', $tagvs='', $tidy_options='') {
		if ($tidy_options === '') {
			// no options supplied: use the built-in HTML Tidy configuration
			// (no character-encoding options are set here)
			$tidy_options = array (
				'clean' => 1,
				'drop-empty-paras' => 0,
				'drop-proprietary-attributes' => 1,
				'fix-backslash' => 1,
				'hide-comments' => 1,
				'join-styles' => 1,
				'lower-literals' => 1,
				'merge-divs' => 1,
				'merge-spans' => 1,
				'output-xhtml' => 1,
				'word-2000' => 1,
				'wrap' => 0,
				'output-bom' => 0
			);
		}
		// parse and repair the document
		$doc = tidy_parse_string($html, $tidy_options);
		$doc->cleanRepair();
		// normalise the style tags of the head and join adjacent style blocks
		$head = tidy_get_head($doc);
		$styles = preg_replace('/<style([^>]+)>/ims', '<style>', $head->value);
		$styles = preg_replace('/<\/style>(.*)<style>/ims', "\n", $styles);
		// drop the CDATA guards (applied one after the other, in this order)
		$styles = str_replace(array('/*<![CDATA[*/', '/*]]>*/'), '', $styles);
		// keep only what sits between the style tags, lower-cased
		preg_match('/<style>(.*)<\/style>/ims', $styles, $found);
		$extracted = isset($found[1]) ? strtolower($found[1]) : '';
		$stylesheet = '<style>'.$default_css.$extracted.'</style>';
		// serialise the body and make <br> self-closing
		$body = tidy_get_body($doc);
		$markup = str_replace('<br>', '<br />', $body->value);
		// remove empty div blocks first, then empty p blocks
		$markup = preg_replace(array('/<div([^\>]*)><\/div>/', '/<p([^\>]*)><\/p>/'), '', $markup);
		if ($tagvs !== '') {
			// set vertical space for some XHTML tags
			$this->setHtmlVSpace($tagvs);
		}
		return $stylesheet.$markup;
	}
Exemplo n.º 4
0
<?php

// bug report taken from http://news.php.net/php.notes/130628
// Each input is parsed and repaired, then the result of tidy_get_body()
// is dumped. The inputs are kept exactly as written.
$documents = array(
    '<frameset > </frameset>',
    '<html><frameset> </frameset> </html',
);
foreach ($documents as $markup) {
    $doc = tidy_parse_string($markup);
    tidy_clean_repair($doc);
    $body = tidy_get_body($doc);
    var_dump($body);
}
print "Done\n";
Exemplo n.º 5
0
 /**
  * Turns a content string into a DOM element imported into $this->dom.
  *
  * The content first goes through sanitizeEntities(). Multiline content is
  * additionally passed through sanitizeShortCodes(), the 'the_content'
  * filters and wpautop(), optionally repaired with Tidy, and wrapped in a
  * div; any other content is wrapped in a span.
  *
  * @param string $content     The content to sanitize.
  * @param bool   $isMultiline Whether to apply the multiline processing and
  *                            wrap the result in a div instead of a span.
  *
  * @return mixed The wrapper element as returned by $this->dom->importNode()
  *               (presumably a DOMNode; $this->dom is not visible here).
  */
 public function sanitizeString($content, $isMultiline = false)
 {
     $content = $this->sanitizeEntities($content);
     if ($isMultiline) {
         //TODO: check if this is redundant now that I'm using apply_filters()
         $content = $this->sanitizeShortCodes($content);
         $content = apply_filters('the_content', $content);
         $content = wpautop($content);
         if ($this->tidy) {
             $this->tidy->parseString($content, array('anchor-as-name' => false, 'clean' => true), 'utf8');
             $this->tidy->cleanRepair();
             //Tidy makes a full html document, with head section, so get just the body
             $body = tidy_get_body($this->tidy);
             if ($body) {
                 //Strip the body tags with an anchored pattern: ltrim()/rtrim() take a
                 //list of characters, not a literal string, so they cannot remove
                 //"<body ...>" or "</body>" reliably.
                 $content = preg_replace('~^\s*<body\b[^>]*>|</body>\s*$~i', '', $body->value);
             }
             //When Tidy yields no body node the un-tidied content is kept as is.
         }
         $element = "div";
     } else {
         $element = "span";
     }
     $content = trim($content);
     //using loadHTML because it is more forgiving than loadXML;
     //its parse warnings are deliberately suppressed
     $tmpHTML = new DOMDocument('1.0', 'UTF-8');
     @$tmpHTML->loadHTML("<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/></head><body><{$element} xmlns='http://www.w3.org/1999/xhtml'>{$content}</{$element}></body></html>");
     if ($this->checkImgSrcs) {
         $this->checkImageSources($tmpHTML);
     }
     //the first element with the wrapper's tag name is the wrapper itself
     $contentDiv = $tmpHTML->getElementsByTagName($element)->item(0);
     return $this->dom->importNode($contentDiv, true);
 }
Exemplo n.º 6
0
 /**
  * 
  * Returns a final rendering of the processed text.
  * 
  * This replaces any remaining HTML tokens, un-encodes special
  * Markdown characters, and optionally runs the text through
  * [Tidy][].
  * 
  * When Tidy is enabled, only the contents of the repaired document's
  * body element are returned. If the repaired document has no body
  * element, the un-tidied text is returned instead.
  * 
  * [Tidy]: http://php.net/tidy
  * 
  * @param string $text The processed and cleaned text.
  * 
  * @return string The final rendering of the text.
  * 
  */
 public function render($text)
 {
     // replace any remaining HTML tokens
     $text = $this->unHtmlToken($text);
     // replace all special chars in the text.
     $text = $this->unEncode($text);
     if (!$this->_config['tidy']) {
         // tidy explicitly disabled
         return $text;
     }
     // tidy up the text; the config value doubles as the Tidy options
     $tidy = new tidy();
     $opts = (array) $this->_config['tidy'];
     $tidy->parseString($text, $opts, 'utf8');
     $tidy->cleanRepair();
     // get only the body portion; there is none for e.g. frameset documents
     $node = tidy_get_body($tidy);
     if (!$node) {
         return $text;
     }
     $body = trim($node->value);
     // remove the opening and closing body tags by matching them, so a body
     // tag that carries attributes is stripped correctly as well
     return preg_replace('/^<body\b[^>]*>|<\/body>$/i', '', $body);
 }