/**
 * Runs markup through HTML Tidy and returns the body node's text.
 *
 * The text is parsed as UTF-8 with an empty option set, repaired with
 * cleanRepair(), and the value of the resulting body node is returned.
 *
 * @param string $text Markup to repair.
 *
 * @return string The value of the repaired document's body node, or an
 *                empty string when Tidy reports no body node.
 */
protected function _tidy($text) {
    // tidy up the text
    $tidy = new tidy();
    $tidy->parseString($text, array(), 'utf8');
    $tidy->cleanRepair();

    // get only the body portion; tidy_get_body() has a nullable return,
    // so guard before reading a property from it
    $body = tidy_get_body($tidy);
    if ($body === null) {
        return '';
    }

    return $body->value;
}
/**
 * Normalises an HTML fragment to XHTML using the Tidy extension.
 *
 * The fragment is repaired with tidy_repair_string(), re-parsed, and the
 * values of the body's direct child nodes are concatenated. Every failure
 * path is best-effort: the input is handed back untouched.
 *
 * @param string $html HTML fragment, processed as UTF-8.
 *
 * @return string The concatenated child-node markup of the repaired body,
 *                or $html unchanged when the Tidy functions are missing,
 *                repairing or parsing fails, or the body has no child array.
 */
function html_standardization($html) {
    if (!function_exists('tidy_repair_string')) {
        return $html;
    }

    $options = array('output-xhtml' => true);

    $repaired = tidy_repair_string($html, $options, 'utf8');
    if (!$repaired) {
        return $html;
    }

    // tidy_parse_string() can return false; passing that on to
    // tidy_get_body() would raise an error that "@" cannot silence.
    $document = tidy_parse_string($repaired, $options, 'utf8');
    if (!$document) {
        return $html;
    }

    // The body node may be missing, and "child" is not an array when the
    // body is empty; both cases fall back to the original input.
    $body = tidy_get_body($document);
    $nodes = ($body === null) ? null : $body->child;
    if (!is_array($nodes)) {
        return $html;
    }

    $standard_html = '';
    foreach ($nodes as $n) {
        $standard_html .= $n->value;
    }

    return $standard_html;
}
/**
 * Cleanup HTML code (requires HTML Tidy library).
 *
 * The document is parsed and repaired by HTML Tidy. The content of the
 * style block(s) found in the head is extracted, lower-cased and emitted
 * (after $default_css) in a single style element, followed by the repaired
 * body markup with empty div/p blocks removed.
 *
 * @param $html (string) htmlcode to fix
 * @param $default_css (string) CSS commands to add
 * @param $tagvs (array) parameters for setHtmlVSpace method; the method is only called when this is not ''
 * @param $tidy_options (array) options for tidy_parse_string function; '' selects the built-in defaults below
 * @return string XHTML code cleaned up
 * @author Nicola Asuni
 * @public
 * @since 5.9.017 (2010-11-16)
 * @see setHtmlVSpace()
 */
public function fixHTMLCode($html, $default_css='', $tagvs='', $tidy_options='') {
    // configure parameters for HTML Tidy
    if ($tidy_options === '') {
        $tidy_options = array (
            'clean' => 1,
            'drop-empty-paras' => 0,
            'drop-proprietary-attributes' => 1,
            'fix-backslash' => 1,
            'hide-comments' => 1,
            'join-styles' => 1,
            'lower-literals' => 1,
            'merge-divs' => 1,
            'merge-spans' => 1,
            'output-xhtml' => 1,
            'word-2000' => 1,
            'wrap' => 0,
            'output-bom' => 0,
            //'char-encoding' => 'utf8',
            //'input-encoding' => 'utf8',
            //'output-encoding' => 'utf8'
        );
    }
    // clean up the HTML code
    $tidy = tidy_parse_string($html, $tidy_options);
    // fix the HTML
    $tidy->cleanRepair();
    // get the CSS part (the head node may be absent, so guard the property read)
    $tidy_head = tidy_get_head($tidy);
    $css = ($tidy_head === null) ? '' : $tidy_head->value;
    // drop attributes from opening style tags
    $css = preg_replace('/<style([^>]+)>/ims', '<style>', $css);
    // join consecutive style blocks; the match is non-greedy so that only the
    // text between one closing tag and the NEXT opening tag is replaced,
    // otherwise the CSS of every block between the first and last is lost
    $css = preg_replace('/<\/style>(.*?)<style>/ims', "\n", $css);
    // strip the CDATA guards
    $css = str_replace('/*<![CDATA[*/', '', $css);
    $css = str_replace('/*]]>*/', '', $css);
    preg_match('/<style>(.*)<\/style>/ims', $css, $matches);
    if (isset($matches[1])) {
        $css = strtolower($matches[1]);
    } else {
        $css = '';
    }
    // include default css
    $css = '<style>'.$default_css.$css.'</style>';
    // get the body part (the body node may be absent, so guard the property read)
    $tidy_body = tidy_get_body($tidy);
    $html = ($tidy_body === null) ? '' : $tidy_body->value;
    // fix some self-closing tags
    $html = str_replace('<br>', '<br />', $html);
    // remove some empty tag blocks
    $html = preg_replace('/<div([^\>]*)><\/div>/', '', $html);
    $html = preg_replace('/<p([^\>]*)><\/p>/', '', $html);
    if ($tagvs !== '') {
        // set vertical space for some XHTML tags
        $this->setHtmlVSpace($tagvs);
    }
    // return the cleaned XHTML code + CSS
    return $css.$html;
}
<?php
// Reproduction script for a bug report taken from
// http://news.php.net/php.notes/130628
//
// Each document below is parsed and repaired by Tidy, then whatever
// tidy_get_body() returns for it is dumped.
$inputs = array(
    '<frameset > </frameset>',
    '<html><frameset> </frameset> </html',
);

for ($i = 0, $total = count($inputs); $i < $total; $i++) {
    $t = tidy_parse_string($inputs[$i]);
    $t->cleanRepair();
    var_dump(tidy_get_body($t));
}

echo "Done\n";
/**
 * Sanitizes a piece of content and imports it into $this->dom as a node.
 *
 * The content always passes through sanitizeEntities(). Multi-line content
 * additionally passes through sanitizeShortCodes(), the 'the_content'
 * filter and wpautop(), and, when $this->tidy is set, through Tidy, of
 * which only the inside of the body element is kept. The result is wrapped
 * in a div (multi-line) or span (single-line) element, loaded with
 * DOMDocument::loadHTML() and the first such element is imported into
 * $this->dom.
 *
 * @param string $content     The raw content.
 * @param bool   $isMultiline Whether to treat the content as multi-line
 *                            (div wrapper) or inline (span wrapper).
 *
 * @return DOMNode The wrapper element as returned by $this->dom->importNode().
 */
public function sanitizeString($content, $isMultiline = false) {
    $content = $this->sanitizeEntities($content);
    if ($isMultiline) {
        //TODO: check if this is redundant now that I'm using apply_filters()
        $content = $this->sanitizeShortCodes($content);
        $content = apply_filters('the_content', $content);
        $content = wpautop($content);
        if ($this->tidy) {
            $this->tidy->parseString($content, array('anchor-as-name' => false, 'clean' => true), 'utf8');
            $this->tidy->cleanRepair();
            //Tidy makes a full html document, with head section, so get just the body
            $body = tidy_get_body($this->tidy);
            $content = ($body === null) ? '' : $body->value;
            //then strip out the body tag. rtrim()/ltrim() take a character
            //mask rather than a literal string, so they could also eat
            //neighbouring markup characters (or strip nothing when whitespace
            //surrounds the tags); anchored patterns remove exactly the wrapper.
            $content = preg_replace('/^\s*<body\b[^>]*>/i', '', $content);
            $content = preg_replace('/<\/body>\s*$/i', '', $content);
        }
        $element = "div";
    } else {
        $element = "span";
    }
    $content = trim($content);
    //using loadHTML because it is more forgiving than loadXML
    $tmpHTML = new DOMDocument('1.0', 'UTF-8');
    @$tmpHTML->loadHTML("<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/></head><body><{$element} xmlns='http://www.w3.org/1999/xhtml'>{$content}</{$element}></body></html>");
    if ($this->checkImgSrcs) {
        $this->checkImageSources($tmpHTML);
    }
    $contentDiv = $tmpHTML->getElementsByTagName($element)->item(0);
    $imported = $this->dom->importNode($contentDiv, true);
    return $imported;
}
/**
 *
 * Returns a final rendering of the processed text.
 *
 * This replaces any remaining HTML tokens, un-encodes special
 * Markdown characters, and optionally runs the text through
 * [Tidy][].
 *
 * Tidy is used only when the 'tidy' config value is truthy; that value,
 * cast to an array, is passed to Tidy as its options. If Tidy reports no
 * body node, the text is returned without the Tidy pass.
 *
 * [Tidy]: http://php.net/tidy
 *
 * @param string $text The processed and cleaned text.
 *
 * @return string The final rendering of the text.
 *
 */
public function render($text) {
    // replace any remaining HTML tokens
    $text = $this->unHtmlToken($text);

    // replace all special chars in the text.
    $text = $this->unEncode($text);

    if (!$this->_config['tidy']) {
        // tidy explicitly disabled
        return $text;
    }

    // tidy up the text
    $tidy = new tidy();
    $opts = (array) $this->_config['tidy'];
    $tidy->parseString($text, $opts, 'utf8');
    $tidy->cleanRepair();

    // get only the body portion; tidy_get_body() has a nullable return,
    // so fall back to the un-tidied text rather than reading from null
    $bodyNode = tidy_get_body($tidy);
    if ($bodyNode === null) {
        return $text;
    }
    $body = trim($bodyNode->value);

    // remove <body> and </body>; the tags are matched instead of cut at
    // fixed offsets so an opening tag that carries attributes is removed
    // whole instead of leaving part of it in the output
    $body = preg_replace('/^<body\b[^>]*>/i', '', $body);
    $body = preg_replace('/<\/body>$/i', '', $body);

    return $body;
}