/**
 * Repair an HTML string with Tidy and load the result as a fresh phpQuery document.
 *
 * Every phpQuery document loaded earlier is unloaded before the new one is created.
 *
 * @param string $html Raw markup to repair.
 * @return mixed Whatever phpQuery::newDocumentHTML() returns for the repaired markup.
 */
function load_html($html)
{
    $document = tidy_parse_string($html);
    tidy_clean_repair($document);

    // Take the repaired tree starting at the <html> element.
    $repaired = tidy_get_html($document);

    phpQuery::unloadDocuments();

    return phpQuery::newDocumentHTML($repaired);
}
/**
 * Pretty-print the rendered view content with Tidy once rendering has finished.
 *
 * Does nothing when the tidy extension is not loaded. After the Tidy pass the
 * XHTML <html ...> opening tags that Tidy emits are replaced by an HTML5
 * doctype, and the line break Tidy puts before </script> is removed.
 *
 * @param mixed $event Event object (not used here).
 * @param mixed $view  View exposing getContent() and setContent().
 * @return void
 */
public function afterRender($event, $view)
{
    if (extension_loaded('tidy')) {
        $tidyConfig = array(
            'hide-comments' => true,
            'tidy-mark' => false,
            'indent' => true,
            'indent-spaces' => 4,
            'new-blocklevel-tags' => 'article,header,footer,section,nav',
            'new-inline-tags' => 'video,audio,canvas,ruby,rt,rp',
            'doctype' => '<!DOCTYPE HTML>',
            'sort-attributes' => 'alpha',
            'vertical-space' => false,
            'output-xhtml' => true,
            'wrap' => 150,
            'wrap-attributes' => false,
            'break-before-br' => false,
        );

        $document = tidy_parse_string($view->getContent(), $tidyConfig, 'utf8');
        tidy_clean_repair($document);

        // Swap the XHTML root tags for an HTML5 doctype.
        $xhtmlRootTags = array(
            '<html lang="en" xmlns="http://www.w3.org/1999/xhtml">',
            '<html xmlns="http://www.w3.org/1999/xhtml">',
        );
        $markup = str_replace($xhtmlRootTags, '<!DOCTYPE html>', $document);

        // Keep empty script elements on a single line.
        $markup = str_replace(">\n</script>", "></script>", $markup);

        $view->setContent((string) $markup);
    }
}
/**
 * Assemble the complete page from the theme parts and terminate the request by emitting it.
 *
 * When theme('output_error') yields something, that replaces the body, the
 * title becomes "Error" and the side panels are suppressed. Entries of the
 * global $error list are rendered as a bullet list above the body. If the tidy
 * extension is loaded the final markup is re-indented before being sent.
 *
 * @param string $title Page title; " - Page N" is appended when ?page=N with N > 1.
 * @param string $body  Page body markup.
 * @param string $head  Extra <head> markup (see the review note on the global below).
 * @return void This function never returns; it ends the script with die().
 */
function output($title = '', $body = '', $head = '')
{
    // NOTE(review): importing $head as a global rebinds the $head parameter, so the
    // argument passed by the caller is ignored in favour of the global. Left as-is
    // because callers may rely on it - confirm which one is intended.
    global $settings, $authid, $checkleft, $checkright, $head, $error, $error_die;

    if (theme('output_error') != false) {
        $body = theme('output_error');
        $title = 'Error';
        $panels = false;
        $lowerpanel = false;
        $panel = '';
        unset($error_die);
    } else {
        $panels = true;
    }

    //display panels
    if ($panels != false) {
        $panel = theme('displaypanels');
        $lowerpanel = theme('displaylowerpanel');
    }

    if (isset($error) && !empty($error)) {
        $errors = '<br />' . theme('title', 'Error') . theme('start_content') . '<div class="errors"><ul>';
        foreach ($error as $error1) {
            $errors .= '<li>' . $error1 . '</li>';
        }
        $errors .= '</ul></div>' . theme('end_content');
        unset($error);
    } else {
        $errors = '';
    }

    // The page number comes straight from the query string and ends up in the
    // <title>; force it to an integer so no markup can be injected through it.
    $page = isset($_GET['page']) ? (int) $_GET['page'] : 0;
    if ($page > 1) {
        $title = $title . ' - Page ' . $page;
    }

    $output = theme('head', stripslashes($title), $head) . '<body>';
    if ($settings['maintenance_mode'] == 'on') {
        $output .= '<div class="titlebg">WARNING: Maintenance Mode is on</div>';
    }
    $output .= '<div id="container"> ' . theme('top') . theme('links');
    $output .= $panel;

    //display the data
    $output .= $errors . '<br />' . stripslashes($body);
    $output .= $lowerpanel . theme('footer');

    //SEO Friendly Links (the included file runs in this scope, so local names such as $output must stay)
    include IN_PATH . '/functions/seofriendlyurls.php';

    //Check if the tidy library is installed
    if (extension_loaded('tidy')) {
        //yay it is, lets clean up all the HTML, so it looks all nice in View Source in your browser :)
        $options = array("indent" => true, 'wrap' => 0);
        $tidied = tidy_parse_string($output, $options);
        // Keep the untidied markup if Tidy could not parse it.
        if ($tidied !== false) {
            tidy_clean_repair($tidied);
            $output = (string) $tidied;
        }
    }

    die($output);
}
/**
 * Turn a string or array into valid, standards-compliant (x)HTML
 *
 * Uses configuration options in tidy.conf - which should minimally have show-body-only set to yes
 *
 * Three back ends are tried in order: the current tidy extension (object API),
 * the tidy 1.0 extension (global-state functions such as tidy_setopt), and the
 * tidy command-line executable named by TIDY_EXE. When none is available an
 * error is triggered and the wrapped, untidied text is returned.
 *
 * @param mixed $text The data to be tidied up (arrays are tidied element by element, keys preserved)
 * @return mixed $result Tidied data
 */
function tidy($text)
{
    static $tidy_funcs;
    static $tidy_conf;

    if (!isset($tidy_conf)) {
        $tidy_conf = SETTINGS_INC . 'tidy.conf';
    }

    if (is_array($text)) {
        $result = array();
        foreach ($text as $key => $value) {
            $result[$key] = tidy($value);
        }
        return $result;
    }

    // determine what tidy libraries are available
    if (empty($tidy_funcs)) {
        $tidy_funcs = get_extension_funcs('tidy');
    }
    $extension_available = !empty($tidy_funcs);
    // tidy_setopt only exists in the 1.0 extension, so its presence tells the two APIs apart
    $has_setopt = $extension_available && in_array('tidy_setopt', $tidy_funcs, true);
    $tidy_1_lib_available = $has_setopt;
    $tidy_2_lib_available = $extension_available && !$has_setopt;
    // an undefined TIDY_EXE simply means "no command line tidy" instead of an error
    $tidy_command_line_available = defined('TIDY_EXE') && TIDY_EXE && file_exists(TIDY_EXE);

    $text = protect_string_from_tidy($text);
    $text = '<html><body>' . $text . '</body></html>';

    if ($tidy_2_lib_available) {
        $tidy = new tidy();
        $tidy->parseString($text, $tidy_conf, 'utf8');
        $tidy->cleanRepair();
        $result = (string) $tidy;
    } elseif ($tidy_1_lib_available) {
        tidy_load_config($tidy_conf);
        tidy_set_encoding('utf8');
        tidy_parse_string($text);
        tidy_clean_repair();
        $result = tidy_get_output();
    } elseif ($tidy_command_line_available) {
        // Every dynamic part of the command line is shell-escaped. TIDY_EXE is known
        // to be a plain file path here because file_exists() accepted it.
        $cmd = 'echo ' . escapeshellarg($text)
            . ' | ' . escapeshellarg(TIDY_EXE)
            . ' -q -config ' . escapeshellarg($tidy_conf)
            . ' 2> /dev/null'; // pipes the input to tidy and discards tidy's stderr
        // shell_exec() yields null on failure or empty output; normalise to ''
        $result = (string) shell_exec($cmd);
    } else {
        trigger_error('tidy does not appear to be available within php or at the command line - no tidying is taking place.');
        $result = $text;
    }

    return trim($result);
}
/**
 * Run the text through the in-process tidy PECL extension, which avoids the
 * cost of spawning an external tidy process.
 *
 * Written against the PHP 4.3.x flavour of the extension (global-state
 * functions); it may not work on PHP 5.
 *
 * 'pear install tidy' should be able to compile the extension module.
 *
 * @param string $text Markup to clean.
 * @return string|null Cleaned markup, or null when tidy reports status 2.
 */
private static function internal($text)
{
    global $wgTidyConf;

    $profileSection = 'Parser::internalTidy';
    wfProfileIn($profileSection);

    tidy_load_config($wgTidyConf);
    tidy_set_encoding('utf8');
    tidy_parse_string($text);
    tidy_clean_repair();

    // 2 is magic number for fatal error
    // http://www.php.net/manual/en/function.tidy-get-status.php
    $cleansource = (tidy_get_status() == 2) ? null : tidy_get_output();

    wfProfileOut($profileSection);

    return $cleansource;
}
/**
 * Run the data through tidy using the options held in $this->_params.
 *
 * The data is handed back untouched when no tidy extension is present.
 *
 * @access public
 * @param string $data markup to tidy
 * @return string tidied markup
 */
function apply($data)
{
    if (function_exists('tidy_parse_string')) {
        $useLegacyApi = function_exists('tidy_setopt') && is_array($this->_params);

        if ($useLegacyApi) {
            // tidy 1.0: options and document live in global extension state
            foreach ($this->_params as $name => $setting) {
                tidy_setopt($name, $setting);
            }
            tidy_parse_string($data);
            tidy_clean_repair();
            $data = tidy_get_output();
        } else {
            // newer API: options travel with the parsed document
            $document = tidy_parse_string($data, $this->_params);
            tidy_clean_repair($document);
            $data = tidy_get_output($document);
        }
    }

    return $data;
}
/**
 * Convert the supported BBCode tags in a message to HTML and, when the tidy
 * extension is available, clean the result up as an XHTML body fragment.
 *
 * Supported tags: [b] [i] [u] [center] [left] [right] [ol] [ul] [li] [br],
 * plus [img]src[/img] and [url]http(s)://...[/url] (both case-insensitive).
 *
 * The removed eregi_replace() calls are replaced by preg_replace(), and the
 * tidy 1.0 functions are only used when they actually exist; otherwise the
 * current tidy API is used.
 *
 * @param string $message Text containing BBCode.
 * @param bool   $nowrap  When false, tidy wraps its output at 80 columns.
 * @return string HTML markup.
 */
function return_parsed_bbcode($message, $nowrap = false)
{
    // never strip_tags here, see Page.Talks for details
    $simpleTags = array(
        "[b]" => "<b>",
        "[/b]" => "</b>",
        "[i]" => "<i>",
        "[/i]" => "</i>",
        "[u]" => "<u>",
        "[/u]" => "</u>",
        "[center]" => "<div align=\"center\">",
        "[/center]" => "</div>",
        "[left]" => "<div align=\"left\">",
        "[/left]" => "</div>",
        "[right]" => "<div align=\"right\">",
        "[/right]" => "</div>",
        "[ol]" => "<ol>",
        "[ul]" => "<ul>",
        "[li]" => "<li>",
        "[/ol]" => "</ol>",
        "[/ul]" => "</ul>",
        "[br]" => "<br>",
    );
    $message = str_replace(array_keys($simpleTags), array_values($simpleTags), $message);

    // The tag content is everything up to the next '['.
    $patterns = array(
        '~\[img\]([^\[]*)\[/img\]~i' => '<img src="$1" border="0">',
        '~\[url\](https?://[^\[]*)\[/url\]~i' => '<a href="$1">$1</a>',
    );
    foreach ($patterns as $pattern => $replacement) {
        $converted = preg_replace($pattern, $replacement, $message);
        // preg_replace() returns null on error; keep the message in that case
        if ($converted !== null) {
            $message = $converted;
        }
    }

    if (function_exists('tidy_parse_string')) {
        $config = array('indent' => false, 'output-xhtml' => true, 'show-body-only' => true);
        if (!$nowrap) {
            $config['wrap'] = 80;
        }

        if (function_exists('tidy_setopt')) {
            // tidy 1.0 API (PHP 4): options and document are global extension state
            tidy_set_encoding('UTF8');
            foreach ($config as $key => $value) {
                tidy_setopt($key, $value);
            }
            tidy_parse_string($message);
            tidy_clean_repair();
            $message = tidy_get_output();
        } else {
            $document = tidy_parse_string($message, $config, 'utf8');
            if ($document !== false) {
                tidy_clean_repair($document);
                $message = tidy_get_output($document);
            }
        }
    }

    return $message;
}
/**
 * Clean $this->html with tidy, using $this->TidyConfig and $this->Encoding.
 *
 * Prefers the object API when the tidy class exists, falls back to the
 * procedural PHP 4.3 API, and prints a notice when no tidy support is found.
 *
 * @return void
 */
function TidyClean()
{
    if (class_exists('tidy')) {
        //PHP 5 only !!!
        $document = new tidy();
        $document->parseString($this->html, $this->TidyConfig, $this->Encoding);
        $document->cleanRepair();
        $this->html = $document;
        return;
    }

    if (!function_exists('tidy_parse_string')) {
        print "<b>No tidy support. Please enable it in your php.ini.\r\nOnly basic cleaning is beeing applied\r\n</b>";
        return;
    }

    //use procedural style for compatibility with PHP 4.3
    tidy_set_encoding($this->Encoding);
    foreach ($this->TidyConfig as $option => $setting) {
        tidy_setopt($option, $setting);
    }
    tidy_parse_string($this->html);
    tidy_clean_repair();
    $this->html = tidy_get_output();
}
/**
 * Run the application, tidy what it produced, and send it with a Content-Length header.
 *
 * Two nested output buffers are opened: the inner one is flushed into the
 * outer one so that the outer buffer's length can be reported before the
 * final flush.
 *
 * @param mixed $init Object exposing process() and run(); run() supplies the page markup.
 * @return void
 */
function tidy_output($init)
{
    $init->process();

    ob_start();
    ob_start();

    $output = $init->run();

    // Output flavour; fixed to XHTML in this function.
    $tidy = 'xhtml';

    $profiles = array(
        'xhtml' => array(
            'output-xhtml' => true,
            'indent' => true,
            'input-encoding' => 'utf8',
            'output-encoding' => 'utf8',
        ),
        'html' => array(
            'output-html' => true,
            'indent' => true,
            'input-encoding' => 'utf8',
            'output-encoding' => 'utf8',
            'clean' => true,
        ),
    );

    if (isset($profiles[$tidy])) {
        $output = tidy_parse_string($output, $profiles[$tidy]);
        tidy_clean_repair($output);
    }

    echo $output;

    ob_end_flush();
    header('Content-Length: ' . ob_get_length());
    ob_end_flush();
}
/**
 * Tidy a markup string using the back end selected by $this->tidy_mode.
 *
 *  - 'exec': writes the source to a temp file under $this->tmp_dir and runs
 *            the tidy command-line tool on it.
 *  - 'php' : uses the tidy extension and returns the repaired markup.
 *            (tidy_clean_repair() only returns a boolean, so the markup is
 *            read from the document afterwards.)
 *  - other : returns the source unchanged.
 *
 * The source is also returned unchanged when the temp file cannot be written
 * or the extension cannot parse the input.
 *
 * @param string $source Markup to tidy.
 * @return string Tidied markup.
 */
private function tidyThis($source)
{
    switch ($this->tidy_mode) {
        case 'exec':
            $tmp_file = $this->tmp_dir . md5($source) . '.txt';
            if (file_put_contents($tmp_file, $source) === false) {
                return $source;
            }
            $lines = array();
            // The path is escaped so a tmp_dir containing spaces or shell
            // metacharacters cannot break (or alter) the command.
            exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $lines);
            unlink($tmp_file);
            return implode("\n", $lines);

        case 'php':
            $tidy = tidy_parse_string($source);
            if ($tidy === false) {
                return $source;
            }
            tidy_clean_repair($tidy);
            return tidy_get_output($tidy);

        default:
            return $source;
    }
}
/**
 * Use HTML Tidy to validate and repair $text in place.
 *
 * Unless $ignore_config is true, nothing is done (and true is returned) when
 * $config['HTML_Tidy'] is empty or 'off'.
 *
 * @param string $text          The html content to be checked. Passed by reference; replaced by the tidied markup on success
 * @param bool   $ignore_config When true, run tidy regardless of $config['HTML_Tidy']
 * @return bool true when tidy was skipped by configuration or succeeded;
 *              false when tidy is unavailable, could not parse the text, or reported status 2
 */
static function tidyFix(&$text, $ignore_config = false)
{
    global $config;

    if (!$ignore_config) {
        if (empty($config['HTML_Tidy']) || $config['HTML_Tidy'] == 'off') {
            return true;
        }
    }

    if (!function_exists('tidy_parse_string')) {
        return false;
    }

    $options = array();
    $options['wrap'] = 0; //keeps tidy from wrapping... want the least amount of space changing as possible
    $options['doctype'] = 'omit'; //omit, auto, strict, transitional, user
    $options['drop-empty-paras'] = true; //drop empty paragraphs
    $options['output-xhtml'] = true; //need this so that <br> will be <br/> .. etc
    $options['show-body-only'] = true;
    $options['hide-comments'] = false;

    //
    // php4
    //
    if (function_exists('tidy_setopt')) {
        $options['char-encoding'] = 'utf8';
        gp_edit::tidyOptions($options);
        tidy_parse_string($text);
        tidy_clean_repair();

        // 2 is magic number for fatal error
        // http://www.php.net/manual/en/function.tidy-get-status.php
        if (tidy_get_status() === 2) {
            return false;
        }
        $text = tidy_get_output();
        return true;
    }

    //
    // php5
    //
    $tidy = tidy_parse_string($text, $options, 'utf8');
    if ($tidy === false) {
        return false;
    }
    tidy_clean_repair($tidy);

    // 2 is magic number for fatal error
    // http://www.php.net/manual/en/function.tidy-get-status.php
    if (tidy_get_status($tidy) === 2) {
        return false;
    }
    $text = tidy_get_output($tidy);

    return true;
}
/**
 * Clean a markup fragment into XHTML body content using tidy.
 *
 * When tidy is not loaded, loading it at runtime is attempted only if dl()
 * exists. The tidy 1.0 functions are used only when they exist; otherwise the
 * current tidy API is used. Whenever tidy is unavailable or yields nothing,
 * the input is returned with every tag except <br> and <a> stripped.
 *
 * @param string $string Markup to clean.
 * @return string Cleaned markup, or the tag-stripped input as a fallback.
 */
function cleanXHTML($string)
{
    if (!extension_loaded('tidy')) {
        // dl() is not available in every SAPI; without it there is no way to load tidy here
        if (!function_exists('dl') || !dl('tidy.so')) {
            return strip_tags($string, '<br><a>');
        }
    }

    $options = array(
        'output-xhtml' => true,
        'doctype' => 'omit',
        'show-body-only' => true,
    );

    $clean = '';
    if (function_exists('tidy_setopt')) {
        // tidy 1.0 API: options and document are global extension state
        foreach ($options as $name => $value) {
            tidy_setopt($name, $value);
        }
        tidy_parse_string($string);
        tidy_clean_repair();
        $clean = tidy_get_output();
    } elseif (function_exists('tidy_parse_string')) {
        $document = tidy_parse_string($string, $options);
        if ($document !== false) {
            tidy_clean_repair($document);
            $clean = tidy_get_output($document);
        }
    }

    if (!$clean) {
        return strip_tags($string, '<br><a>');
    }

    return $clean;
}
/**
 * Extract article content and metadata (title, author, language, date, body,
 * next-page link, native-ad flag) from an HTML document.
 *
 * Steps, in the order the code performs them:
 *  1. Calls $this->reset(), then selects the site config: the user-submitted
 *     config when one is set (with buildSiteConfig() results appended if its
 *     autodetect_on_failure() is true), otherwise buildSiteConfig($url, $html).
 *  2. Applies the config's find_string/replace_string pairs to the raw HTML,
 *     but only when both lists have the same number of entries.
 *  3. Optionally repairs the HTML with Tidy (config->tidy() is true, the tidy
 *     functions exist and $smart_tidy is true), keeping the untouched HTML in
 *     $original_html for the retry in step 10.
 *  4. Picks the parser: parserOverride wins over config->parser(); 'html5lib'
 *     is mapped to 'html5php'; a name not in allowedParsers falls back to
 *     defaultParser. Builds a Readability instance and a DOMXPath over its DOM.
 *  5. Evaluates XPath patterns for next_page_link, native_ad_clue, title (the
 *     matched node is removed from the document), author, language (two fixed
 *     patterns, not config-driven) and date (parsed with strtotime()).
 *  6. Removes unwanted nodes: config strip, strip_id_or_class and
 *     strip_image_src entries, .entry-unrelated / .instapaper_ignore elements,
 *     and elements whose style attribute contains "display:none".
 *  7. Resolves the body from the config's body patterns; when a pattern
 *     matches several nodes they are collected into a new <div>, skipping
 *     nodes that are descendants of ones already added.
 *  8. For each of title/body/author/date that is still missing - and only
 *     when the config has no pattern for it or autodetect_on_failure() is
 *     true - falls back to hNews/hentry markup, .instapaper_title and
 *     .instapaper_body, itemprop="articleBody", a single rel="author" link,
 *     a single time[pubdate] element, and finally Readability itself.
 *  9. When a body was found: removes scripts, drops a leading h1-h6 whose
 *     text equals the title, gives empty iframes placeholder text, resolves
 *     lazy-loaded images (data-lazy-src), and sets $this->success to true.
 * 10. If there was no success and Tidy had been applied, runs process() once
 *     more on the original HTML with $smart_tidy = false.
 *
 * NOTE(review): step 9 reads innerHTML on a DOM node, which plain DOMElement
 * does not provide - presumably supplied by the node class registered by
 * Readability; confirm.
 *
 * @param string $html       HTML to process
 * @param string $url        URL the HTML came from (used for site config lookup and passed to Readability)
 * @param bool   $smart_tidy Allow the Tidy pass and the no-Tidy retry; the retry itself passes false
 * @return mixed The value of $this->success (set to true here once a body is found;
 *               presumably reset to a falsy value by reset() - confirm)
 */
public function process($html, $url, $smart_tidy = true) { $this->reset(); // use user submitted config and merge it with regular one if (isset($this->userSubmittedConfig)) { $this->debug('Using user-submitted site config'); $this->config = $this->userSubmittedConfig; if ($this->config->autodetect_on_failure()) { $this->debug('Merging user-submitted site config with site config files associated with this URL and/or content'); $this->config->append($this->buildSiteConfig($url, $html)); } } else { $this->config = $this->buildSiteConfig($url, $html); } // do string replacements if (!empty($this->config->find_string)) { if (count($this->config->find_string) == count($this->config->replace_string)) { $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); $this->debug("Strings replaced: {$_count} (find_string and/or replace_string)"); } else { $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); } unset($_count); } // use tidy (if it exists)? // This fixes problems with some sites which would otherwise // trouble DOMDocument's HTML parsing. (Although sometimes it // makes matters worse, which is why you can override it in site config files.) $tidied = false; if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { $this->debug('Using Tidy'); $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { $original_html = $html; $tidied = true; $html = $tidy->value; } unset($tidy); } // load and parse html if ($this->parserOverride) { // from querystring: &parser=xxx $_parser = $this->parserOverride; } else { // from site config file: parser: xxx $_parser = $this->config->parser(); } // for backword compatibility... if ($_parser == 'html5lib') { $_parser = 'html5php'; } if (!in_array($_parser, $this->allowedParsers)) { $this->debug("HTML parser {$_parser} not listed, using " . $this->defaultParser . 
" instead"); $_parser = $this->defaultParser; } $this->debug("Attempting to parse HTML with {$_parser}"); $this->readability = new Readability($html, $url, $_parser); // we use xpath to find elements in the given HTML document // see http://en.wikipedia.org/wiki/XPath_1.0 $xpath = new DOMXPath($this->readability->dom); // try to get next page link foreach ($this->config->next_page_link as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); if (is_string($elems)) { $this->nextPageUrl = trim($elems); break; } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { foreach ($elems as $item) { if ($item instanceof DOMElement && $item->hasAttribute('href')) { $this->nextPageUrl = $item->getAttribute('href'); break 2; } elseif ($item instanceof DOMAttr && $item->value) { $this->nextPageUrl = $item->value; break 2; } } } } // check if this is a native ad foreach ($this->config->native_ad_clue as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); if ($elems instanceof DOMNodeList && $elems->length > 0) { $this->nativeAd = true; break; } } // try to get title foreach ($this->config->title as $pattern) { // $this->debug("Trying $pattern"); $elems = @$xpath->evaluate($pattern, $this->readability->dom); if (is_string($elems)) { $this->title = trim($elems); $this->debug('Title expression evaluated as string: ' . $this->title); $this->debug("...XPath match: {$pattern}"); break; } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { $this->title = $elems->item(0)->textContent; $this->debug('Title matched: ' . 
$this->title); $this->debug("...XPath match: {$pattern}"); // remove title from document try { @$elems->item(0)->parentNode->removeChild($elems->item(0)); } catch (DOMException $e) { // do nothing } break; } } // try to get author (if it hasn't already been set) if (empty($this->author)) { foreach ($this->config->author as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); if (is_string($elems)) { if (trim($elems) != '') { $this->author[] = trim($elems); $this->debug('Author expression evaluated as string: ' . trim($elems)); $this->debug("...XPath match: {$pattern}"); break; } } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { foreach ($elems as $elem) { if (!isset($elem->parentNode)) { continue; } $this->author[] = trim($elem->textContent); $this->debug('Author matched: ' . trim($elem->textContent)); } if (!empty($this->author)) { $this->debug("...XPath match: {$pattern}"); break; } } } } // try to get language $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); foreach ($_lang_xpath as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); if (is_string($elems)) { if (trim($elems) != '') { $this->language = trim($elems); $this->debug('Language matched: ' . $this->language); break; } } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { foreach ($elems as $elem) { if (!isset($elem->parentNode)) { continue; } $this->language = trim($elem->textContent); $this->debug('Language matched: ' . 
$this->language); } if ($this->language) { break; } } } // try to get date foreach ($this->config->date as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); if (is_string($elems)) { $this->date = strtotime(trim($elems, "; \t\n\r\v")); } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { $this->date = $elems->item(0)->textContent; $this->date = strtotime(trim($this->date, "; \t\n\r\v")); // remove date from document // $elems->item(0)->parentNode->removeChild($elems->item(0)); } if (!$this->date) { $this->date = null; } else { $this->debug('Date matched: ' . date('Y-m-d H:i:s', $this->date)); $this->debug("...XPath match: {$pattern}"); break; } } // strip elements (using xpath expressions) foreach ($this->config->strip as $pattern) { $elems = @$xpath->query($pattern, $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Stripping ' . $elems->length . ' elements (strip)'); for ($i = $elems->length - 1; $i >= 0; $i--) { if ($elems->item($i)->parentNode) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } } } // strip elements (using id and class attribute values) foreach ($this->config->strip_id_or_class as $string) { $string = strtr($string, array("'" => '', '"' => '')); $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)'); for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } } // strip images (using src attribute values) foreach ($this->config->strip_image_src as $string) { $string = strtr($string, array("'" => '', '"' => '')); $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Stripping ' . $elems->length . 
' image elements'); for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } } // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines // and http://blog.instapaper.com/post/730281947 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements'); for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } // strip elements that contain style="display: none;" $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style'); for ($i = $elems->length - 1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } // try to get body foreach ($this->config->body as $pattern) { $elems = @$xpath->query($pattern, $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { $this->debug('Body matched'); $this->debug("...XPath match: {$pattern}"); if ($elems->length == 1) { $this->body = $elems->item(0); // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('...pruning content'); $this->readability->prepArticle($this->body); } break; } else { $this->body = $this->readability->dom->createElement('div'); $this->debug($elems->length . 
' body elems found'); foreach ($elems as $elem) { if (!isset($elem->parentNode)) { continue; } $isDescendant = false; foreach ($this->body->childNodes as $parent) { if ($this->isDescendant($parent, $elem)) { $isDescendant = true; break; } } if ($isDescendant) { $this->debug('...element is child of another body element, skipping.'); } else { // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($elem); } $this->debug('...element added to body'); $this->body->appendChild($elem); } } if ($this->body->hasChildNodes()) { break; } } } } // auto detect? $detect_title = $detect_body = $detect_author = $detect_date = false; // detect title? if (!isset($this->title)) { if (empty($this->config->title) || $this->config->autodetect_on_failure()) { $detect_title = true; } } // detect body? if (!isset($this->body)) { if (empty($this->config->body) || $this->config->autodetect_on_failure()) { $detect_body = true; } } // detect author? if (empty($this->author)) { if (empty($this->config->author) || $this->config->autodetect_on_failure()) { $detect_author = true; } } // detect date? if (!isset($this->date)) { if (empty($this->config->date) || $this->config->autodetect_on_failure()) { $detect_date = true; } } // check for hNews if ($detect_title || $detect_body) { // check for hentry $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); if ($elems && $elems->length > 0) { $this->debug('hNews: found hentry'); $hentry = $elems->item(0); if ($detect_title) { // check for entry-title $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); if ($elems && $elems->length > 0) { $this->title = $elems->item(0)->textContent; $this->debug('hNews: found entry-title: ' . 
$this->title); // remove title from document $elems->item(0)->parentNode->removeChild($elems->item(0)); $detect_title = false; } } if ($detect_date) { // check for time element with pubdate attribute $elems = @$xpath->query(".//time[@pubdate or @pubDate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); if ($elems && $elems->length > 0) { $this->date = strtotime(trim($elems->item(0)->textContent)); // remove date from document //$elems->item(0)->parentNode->removeChild($elems->item(0)); if ($this->date) { $this->debug('hNews: found publication date: ' . date('Y-m-d H:i:s', $this->date)); $detect_date = false; } else { $this->date = null; } } } if ($detect_author) { // check for time element with pubdate attribute $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); if ($elems && $elems->length > 0) { $author = $elems->item(0); $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); if ($fn && $fn->length > 0) { foreach ($fn as $_fn) { if (trim($_fn->textContent) != '') { $this->author[] = trim($_fn->textContent); $this->debug('hNews: found author: ' . trim($_fn->textContent)); } } } else { if (trim($author->textContent) != '') { $this->author[] = trim($author->textContent); $this->debug('hNews: found author: ' . trim($author->textContent)); } } $detect_author = empty($this->author); } } // check for entry-content. 
// according to hAtom spec, if there are multiple elements marked entry-content, // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content if ($detect_body) { $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); if ($elems && $elems->length > 0) { $this->debug('hNews: found entry-content'); if ($elems->length == 1) { // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) $e = $elems->item(0); if ($e->tagName == 'img' || trim($e->textContent) != '') { $this->body = $elems->item(0); // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($this->body); } $detect_body = false; } else { $this->debug('hNews: skipping entry-content - appears not to contain content'); } unset($e); } else { $this->body = $this->readability->dom->createElement('div'); $this->debug($elems->length . 
' entry-content elems found'); foreach ($elems as $elem) { if (!isset($elem->parentNode)) { continue; } $isDescendant = false; foreach ($this->body->childNodes as $parent) { if ($this->isDescendant($parent, $elem)) { $isDescendant = true; break; } } if ($isDescendant) { $this->debug('Element is child of another body element, skipping.'); } else { // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($elem); } $this->debug('Element added to body'); $this->body->appendChild($elem); } } $detect_body = false; } } } } } // check for elements marked with instapaper_title if ($detect_title) { // check for instapaper_title $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); if ($elems && $elems->length > 0) { $this->title = $elems->item(0)->textContent; $this->debug('Title found (.instapaper_title): ' . $this->title); // remove title from document $elems->item(0)->parentNode->removeChild($elems->item(0)); $detect_title = false; } } // check for elements marked with instapaper_body if ($detect_body) { $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); if ($elems && $elems->length > 0) { $this->debug('body found (.instapaper_body)'); $this->body = $elems->item(0); // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($this->body); } $detect_body = false; } } // check for elements marked with itemprop="articleBody" (from Schema.org) if ($detect_body) { $elems = @$xpath->query("//*[@itemprop='articleBody']", $this->readability->dom); if ($elems && $elems->length > 0) { $this->debug('body found (Schema.org itemprop="articleBody")'); if ($elems->length == 1) { // what if it's empty? 
(content placed outside an empty itemprop='articleBody' element) $e = $elems->item(0); if ($e->tagName == 'img' || trim($e->textContent) != '') { $this->body = $elems->item(0); // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($this->body); } $detect_body = false; } else { $this->debug('Schema.org: skipping itemprop="articleBody" - appears not to contain content'); } unset($e); } else { $this->body = $this->readability->dom->createElement('div'); $this->debug($elems->length . ' itemprop="articleBody" elems found'); foreach ($elems as $elem) { if (!isset($elem->parentNode)) { continue; } $isDescendant = false; foreach ($this->body->childNodes as $parent) { if ($this->isDescendant($parent, $elem)) { $isDescendant = true; break; } } if ($isDescendant) { $this->debug('Element is child of another body element, skipping.'); } else { // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($elem); } $this->debug('Element added to body'); $this->body->appendChild($elem); } } $detect_body = false; } } } // Find author in rel="author" marked element // We only use this if there's exactly one. // If there's more than one, it could indicate more than // one author, but it could also indicate that we're processing // a page listing different articles with different authors. if ($detect_author) { $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); if ($elems && $elems->length == 1) { $author = trim($elems->item(0)->textContent); if ($author != '') { $this->debug("Author found (rel=\"author\"): {$author}"); $this->author[] = $author; $detect_author = false; } } } // Find date in pubdate marked time element // For the same reason given above, we only use this // if there's exactly one element. 
if ($detect_date) { $elems = @$xpath->query("//time[@pubdate or @pubDate]", $this->readability->dom); if ($elems && $elems->length == 1) { $this->date = strtotime(trim($elems->item(0)->textContent)); // remove date from document //$elems->item(0)->parentNode->removeChild($elems->item(0)); if ($this->date) { $this->debug('Date found (pubdate marked time element): ' . date('Y-m-d H:i:s', $this->date)); $detect_date = false; } else { $this->date = null; } } } // still missing title or body, so we detect using Readability if ($detect_title || $detect_body) { $this->debug('Using Readability'); // clone body if we're only using Readability for title (otherwise it may interfere with body element) if (isset($this->body)) { $this->body = $this->body->cloneNode(true); } $success = $this->readability->init(); } if ($detect_title) { $this->debug('Detecting title'); $this->title = $this->readability->getTitle()->textContent; } if ($detect_body && $success) { $this->debug('Detecting body'); $this->body = $this->readability->getContent(); if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { $this->body = $this->body->firstChild; } // prune (clean up elements that may not be content) if ($this->config->prune()) { $this->debug('Pruning content'); $this->readability->prepArticle($this->body); } } if (isset($this->body)) { // remove scripts $this->readability->removeScripts($this->body); // remove any h1-h6 elements that appear as first thing in the body // and which match our title if (isset($this->title) && $this->title != '') { $firstChild = $this->body->firstChild; while ($firstChild->nodeType && $firstChild->nodeType !== XML_ELEMENT_NODE) { $firstChild = $firstChild->nextSibling; } if ($firstChild->nodeType === XML_ELEMENT_NODE && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))) { 
$this->body->removeChild($firstChild); } } // prevent self-closing iframes $elems = $this->body->getElementsByTagName('iframe'); for ($i = $elems->length - 1; $i >= 0; $i--) { $e = $elems->item($i); if (!$e->hasChildNodes()) { $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); } } // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src // inside the data-lazy-src attribute. It also places the original image inside a noscript element // next to the amended one. $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); for ($i = $elems->length - 1; $i >= 0; $i--) { $e = $elems->item($i); // let's see if we can grab image from noscript if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { $_new_elem = $e->ownerDocument->createDocumentFragment(); @$_new_elem->appendXML($e->nextSibling->innerHTML); $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); $e->parentNode->removeChild($e); } else { // Use data-lazy-src as src value $e->setAttribute('src', $e->getAttribute('data-lazy-src')); $e->removeAttribute('data-lazy-src'); } } $this->success = true; } // if we've had no success and we've used tidy, there's a chance // that tidy has messed up. So let's try again without tidy... if (!$this->success && $tidied && $smart_tidy) { $this->debug('Trying again without tidy'); $this->process($original_html, $url, false); } return $this->success; }
/**
 * Create instance of Readability
 *
 * Pre-filters the raw HTML with the regular expressions in $this->pre_filters
 * (unless FLAG_DISABLE_PREFILTER is active), optionally repairs it with Tidy,
 * converts it to HTML entities and parses it into $this->dom.
 *
 * @param string UTF-8 encoded string
 * @param string (optional) URL associated with HTML (for footnotes)
 * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
 * @param boolean (optional) Use tidy
 */
function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
{
    $this->url = $url;
    $this->debugText = 'Parsing URL: ' . $url . "\n";

    if ($url) {
        // parse_url() yields null/false for a missing host or malformed URL; the
        // cast turns that into '' so the expression below stays '//' as before,
        // without the notices an array access on the raw result would raise.
        $host = (string) parse_url($url, PHP_URL_HOST);
        $this->domainRegExp = '/' . strtr(preg_replace('/www\\d*\\./', '', $host), array('.' => '\\.')) . '/';
    }

    mb_internal_encoding("UTF-8");
    mb_http_output("UTF-8");
    mb_regex_encoding("UTF-8");

    $this->imageCache = new ImageCaching();

    // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
    if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
        try {
            foreach ($this->pre_filters as $search => $replace) {
                $filtered = preg_replace($search, $replace, $html);
                if ($filtered === null) {
                    // preg_replace() signals failure with null rather than an exception;
                    // skip this filter instead of throwing the whole document away
                    $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $search . "\n";
                    continue;
                }
                $html = $filtered;
            }
            unset($search, $replace, $filtered);
        } catch (Exception $e) {
            $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $e->getMessage();
        }
    }

    if (trim($html) === '') {
        $html = '<html></html>';
    }

    /**
     * Use tidy (if it exists).
     * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
     * Although sometimes it makes matters worse, which is why there is an option to disable it.
     *
     **/
    if ($use_tidy && function_exists('tidy_parse_string')) {
        $this->debugText .= 'Tidying document' . "\n";
        $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
        if (tidy_clean_repair($tidy)) {
            $this->tidied = true;
            $html = $tidy->value;
            // drop attributes from the <html> tag and collapse runs of line breaks
            $html = preg_replace('/<html[^>]+>/i', '<html>', $html);
            $html = preg_replace('/[\\r\\n]+/is', "\n", $html);
        }
        unset($tidy);
    }

    // NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated
    // as of PHP 8.2; kept because a replacement would change the produced entities.
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");

    if ($parser == 'html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
        // all good
    } else {
        libxml_use_internal_errors(true);
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
/**
 * Development output handler: runs the application, appends the development
 * console / flow view / final notices before the closing body tag and sends
 * the buffered page together with a Content-Length header.
 *
 * @param object $init Object providing process() and run(); run() returns the page markup.
 */
function nexista_devBuffer($init)
{
    $init->process();

    // Two nested buffers: the inner one collects the page, the outer one lets
    // us measure the final length before anything is sent to the client.
    ob_start();
    ob_start();

    header('Cache-Control: no-cache, must-revalidate');
    header('Last-Modified: ' . gmdate("D, d M Y H:i:s") . ' GMT');

    nexista_development_console();
    $output = $init->run();

    if (isset($_GET['view_flow']) && $_GET['view_flow'] == 'true') {
        nexista_view_flow();
    }

    // Persist the client's flow-view preference in the session. The request
    // parameter is optional, so it must not be read unguarded.
    $client_view_flow = isset($_GET['client_view_flow']) ? $_GET['client_view_flow'] : null;
    if ($client_view_flow == 'true') {
        $_SESSION['client_view_flow'] = 'true';
    } elseif ($client_view_flow == 'false') {
        $_SESSION['client_view_flow'] = 'false';
    }

    // Defaults to an empty string so that nothing undefined is appended below.
    $flow_viewport = '';
    if (isset($_SESSION['client_view_flow']) && $_SESSION['client_view_flow'] == 'true') {
        $flow_viewport = nexista_view_flow();
    }

    // NOTE(review): $cache_type was never assigned in this function, so null
    // has always been passed here; confirm whether a real value is expected.
    $cache_type = null;

    // Re-open the document so the extra markup lands inside <body>.
    $output = str_replace('</body>', '', $output);
    $output = str_replace('</html>', '', $output);
    $output .= $flow_viewport;
    $output .= nexista_final_notices($cache_type, 'dev');
    $output .= '</body></html>';

    // Optional Tidy pass; disabled by default. Set to 'xhtml' or 'html' to enable.
    $tidy = false;
    if (($tidy === 'xhtml' || $tidy === 'html') && function_exists('tidy_parse_string')) {
        $options = array(
            'output-' . $tidy => true,
            'indent' => true,
            'input-encoding' => 'utf8',
            'output-encoding' => 'utf8',
            'clean' => true,
        );
        $output = tidy_parse_string($output, $options);
        tidy_clean_repair($output);
    }

    echo $output;

    // Flush the inner buffer into the outer one, then report the outer length.
    ob_end_flush();
    header('Content-Length: ' . ob_get_length());
    ob_end_flush();
}
/**
 * nv_valid_html()
 *
 * Repairs an HTML string with HTML Tidy when the global $sys_info reports
 * Tidy support ("class" for the tidy class, "func" for the tidy_* functions).
 *
 * @param string $html     Markup to repair
 * @param mixed  $config   Tidy configuration (array of options or config file name)
 * @param string $encoding Tidy encoding name
 * @return mixed The repaired tidy document (convertible to string), or the
 *               untouched $html when Tidy support is not reported
 */
function nv_valid_html($html, $config, $encoding = 'utf8')
{
    global $sys_info;

    $tidy_support = isset($sys_info['supports_tidy']) ? $sys_info['supports_tidy'] : '';

    if ($tidy_support == "class") {
        $tidy = new tidy();
        $tidy->parseString($html, $config, $encoding);
        $tidy->cleanRepair();
        return $tidy;
    }

    if ($tidy_support == "func") {
        $tidy = tidy_parse_string($html, $config, $encoding);
        // The document has to be passed explicitly; calling
        // tidy_clean_repair() without it repairs nothing.
        tidy_clean_repair($tidy);
        return $tidy;
    }

    return $html;
}
/**
 * Show a problem statement.
 *
 * Loads the problem's index.html into the view and, unless the request says
 * tidy=false (or Tidy is unavailable), normalises it with two Tidy passes
 * with fixImages() run in between. With plain=true the content is sent as
 * the raw response body without layout or view rendering.
 */
function viewAction()
{
    if (!$this->validateProblemAccess()) {
        return;
    }

    $prob = $this->view->prob;

    $statement_path = get_file_name("data/problems/" . $this->_request->get("probid") . "/index.html");
    $this->view->content_html = file_get_contents($statement_path);

    if (function_exists("tidy_parse_string") && $this->_request->get("tidy") != "false") {
        $tidy_passes = array(
            /* tidy to XHTML strict */
            array(
                "output-xhtml" => true,
                "add-xml-decl" => true,
                "bare" => true,
                "clean" => true,
                "quote-ampersand" => true,
                "doctype" => "strict",
            ),
            /* redo the tidy after the image fix-up; slow, but the easy way out */
            array(
                "output-xhtml" => true,
                "doctype" => "strict",
                "show-body-only" => true,
            ),
        );

        foreach ($tidy_passes as $pass_index => $opt) {
            $tidy = tidy_parse_string($this->view->content_html, $opt);
            tidy_clean_repair($tidy);
            $this->view->content_html = tidy_get_output($tidy);

            // Images are fixed on the full document, between the two passes.
            if ($pass_index === 0) {
                $this->fixImages();
            }
        }
    }

    if ($this->_request->get("plain") != "true") {
        return;
    }

    $this->_helper->layout->disableLayout();
    $this->_helper->viewRenderer->setNoRender();
    $this->getResponse()->setBody($this->view->content_html);
}
/**
 * Extract title, author(s), language, date and body from an HTML page.
 *
 * Steps, in order: pick the site config for the URL's host (falling back to
 * fingerprints, then defaults), apply string replacements, optionally tidy
 * the markup, evaluate the config's XPath patterns, strip unwanted elements,
 * fall back to hNews / Instapaper markers and finally Readability
 * auto-detection. When a body was found, the first link matching the
 * configured next-page pattern is fetched, extracted with a second
 * extractor and appended to the body. If nothing was found and Tidy was
 * used, the whole process is retried once on the untidied markup.
 *
 * @param string  $html       Raw HTML of the page
 * @param string  $url        URL the HTML was fetched from
 * @param boolean $smart_tidy Allow the Tidy pass (and the retry without it)
 * @return boolean true when a body element was extracted
 */
public function process($html, $url, $smart_tidy = true)
{
    $this->reset();

    // Detaches every node of a node list from its document, last match first.
    $removeNodes = function ($nodes) {
        for ($i = $nodes->length - 1; $i >= 0; $i--) {
            $node = $nodes->item($i);
            if ($node && $node->parentNode) {
                $node->parentNode->removeChild($node);
            }
        }
    };

    // extract host name
    $host = @parse_url($url, PHP_URL_HOST);
    if (!($this->config = SiteConfig::build($host))) {
        // no match, check HTML for fingerprints
        if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
            $this->config = SiteConfig::build($_fphost);
        }
        unset($_fphost);
        if (!$this->config) {
            // no match, so use defaults
            $this->config = new SiteConfig();
        }
    }

    // store copy of config in our static cache array in case we need to process another URL
    SiteConfig::add_to_cache($host, $this->config);

    // do string replacements
    foreach ($this->config->replace_string as $_repl) {
        $html = str_replace($_repl[0], $_repl[1], $html);
    }
    unset($_repl);

    // use tidy (if it exists)?
    // This fixes problems with some sites which would otherwise
    // trouble DOMDocument's HTML parsing. (Although sometimes it
    // makes matters worse, which is why you can override it in site config files.)
    $tidied = false;
    $original_html = $html;
    if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
        $this->debug('Using Tidy');
        $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
        if (tidy_clean_repair($tidy)) {
            $tidied = true;
            $html = $tidy->value;
        }
        unset($tidy);
    }

    // load and parse html
    $this->readability = new Readability($html, $url);

    // we use xpath to find elements in the given HTML document
    // see http://en.wikipedia.org/wiki/XPath_1.0
    $xpath = new DOMXPath($this->readability->dom);

    // try to get title
    foreach ($this->config->title as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Title expression evaluated as string');
            $this->title = trim($elems);
            break;
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Title matched');
            $this->title = $elems->item(0)->textContent;
            // remove title from document
            try {
                $elems->item(0)->parentNode->removeChild($elems->item(0));
            } catch (DOMException $e) {
                // do nothing
            }
            break;
        }
    }

    // try to get author (if it hasn't already been set)
    if (empty($this->author)) {
        foreach ($this->config->author as $pattern) {
            $elems = @$xpath->evaluate($pattern, $this->readability->dom);
            if (is_string($elems)) {
                $this->debug('Author expression evaluated as string');
                if (trim($elems) != '') {
                    $this->author[] = trim($elems);
                    break;
                }
            } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                foreach ($elems as $elem) {
                    if (!isset($elem->parentNode)) {
                        continue;
                    }
                    $this->author[] = trim($elem->textContent);
                }
                if (!empty($this->author)) {
                    break;
                }
            }
        }
    }

    // try to get language
    $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
    foreach ($_lang_xpath as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            if (trim($elems) != '') {
                $this->language = trim($elems);
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $elem) {
                if (!isset($elem->parentNode)) {
                    continue;
                }
                $this->language = trim($elem->textContent);
            }
            if ($this->language) {
                break;
            }
        }
    }

    // try to get date
    foreach ($this->config->date as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Date expression evaluated as string');
            $this->date = strtotime(trim($elems, "; \t\n\r\v"));
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Date matched');
            $this->date = $elems->item(0)->textContent;
            $this->date = strtotime(trim($this->date, "; \t\n\r\v"));
            // the date element is deliberately left in the document
        }
        if (!$this->date) {
            $this->date = null;
        } else {
            break;
        }
    }

    // strip elements (using xpath expressions)
    foreach ($this->config->strip as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip)');
            $removeNodes($elems);
        }
    }

    // strip elements (using id and class attribute values)
    foreach ($this->config->strip_id_or_class as $string) {
        // quotes are removed so the value cannot break out of the XPath literal
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
            $removeNodes($elems);
        }
    }

    // strip images (using src attribute values)
    foreach ($this->config->strip_image_src as $string) {
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' image elements');
            $removeNodes($elems);
        }
    }

    // strip elements using Readability.com and Instapaper.com ignore class names
    // .entry-unrelated and .instapaper_ignore
    // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
    // and http://blog.instapaper.com/post/730281947
    $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
        $removeNodes($elems);
    }

    // strip elements that contain style="display: none;"
    $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
        $removeNodes($elems);
    }

    // try to get body
    foreach ($this->config->body as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->body = $this->getMatchedBody($elems);
            // NOTE(review): the return value was never used by this method; the
            // call is kept in case retrieveNextPage() has side effects - confirm.
            $this->retrieveNextPage($xpath, $url);
            if ($elems->length === 1) {
                break;
            }
        }
    }

    // auto detect?
    $detect_title = $detect_body = $detect_author = $detect_date = false;
    // detect title?
    if (!isset($this->title)) {
        if (empty($this->config->title) || $this->config->autodetect_on_failure) {
            $detect_title = true;
        }
    }
    // detect body?
    if (!isset($this->body)) {
        if (empty($this->config->body) || $this->config->autodetect_on_failure) {
            $detect_body = true;
        }
    }
    // detect author?
    if (empty($this->author)) {
        if (empty($this->config->author) || $this->config->autodetect_on_failure) {
            $detect_author = true;
        }
    }
    // detect date?
    if (!isset($this->date)) {
        if (empty($this->config->date) || $this->config->autodetect_on_failure) {
            $detect_date = true;
        }
    }

    // check for hNews
    if ($detect_title || $detect_body) {
        // check for hentry
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('hNews: found hentry');
            $hentry = $elems->item(0);

            if ($detect_title) {
                // check for entry-title
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-title');
                    $this->title = $elems->item(0)->textContent;
                    // remove title from document
                    $elems->item(0)->parentNode->removeChild($elems->item(0));
                    $detect_title = false;
                }
            }

            if ($detect_date) {
                // check for time element with pubdate attribute
                $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found publication date');
                    $this->date = strtotime(trim($elems->item(0)->textContent));
                    if ($this->date) {
                        $detect_date = false;
                    } else {
                        $this->date = null;
                    }
                }
            }

            if ($detect_author) {
                // check for a vcard marked as author or byline
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found author');
                    $author = $elems->item(0);
                    $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
                    if ($fn && $fn->length > 0) {
                        foreach ($fn as $_fn) {
                            if (trim($_fn->textContent) != '') {
                                $this->author[] = trim($_fn->textContent);
                            }
                        }
                    } else {
                        if (trim($author->textContent) != '') {
                            $this->author[] = trim($author->textContent);
                        }
                    }
                    $detect_author = empty($this->author);
                }
            }

            // check for entry-content.
            // according to hAtom spec, if there are multiple elements marked entry-content,
            // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
            if ($detect_body) {
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-content');
                    if ($elems->length == 1) {
                        // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                        $e = $elems->item(0);
                        if ($e->tagName == 'img' || trim($e->textContent) != '') {
                            $this->body = $elems->item(0);
                            // prune (clean up elements that may not be content)
                            if ($this->config->prune) {
                                $this->debug('Pruning content');
                                $this->readability->prepArticle($this->body);
                            }
                            $detect_body = false;
                        } else {
                            $this->debug('hNews: skipping entry-content - appears not to contain content');
                        }
                        unset($e);
                    } else {
                        $this->body = $this->readability->dom->createElement('div');
                        $this->debug($elems->length . ' entry-content elems found');
                        foreach ($elems as $elem) {
                            if (!isset($elem->parentNode)) {
                                continue;
                            }
                            $isDescendant = false;
                            foreach ($this->body->childNodes as $parent) {
                                if ($this->isDescendant($parent, $elem)) {
                                    $isDescendant = true;
                                    break;
                                }
                            }
                            if ($isDescendant) {
                                $this->debug('Element is child of another body element, skipping.');
                            } else {
                                // prune (clean up elements that may not be content)
                                if ($this->config->prune) {
                                    $this->debug('Pruning content');
                                    $this->readability->prepArticle($elem);
                                }
                                $this->debug('Element added to body');
                                $this->body->appendChild($elem);
                            }
                        }
                        // (a leftover debugging echo of the body HTML used to be
                        // here; it wrote straight into the response output)
                        $detect_body = false;
                    }
                }
            }
        }
    }

    // check for elements marked with instapaper_title
    if ($detect_title) {
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('title found (.instapaper_title)');
            $this->title = $elems->item(0)->textContent;
            // remove title from document
            $elems->item(0)->parentNode->removeChild($elems->item(0));
            $detect_title = false;
        }
    }

    // check for elements marked with instapaper_body
    if ($detect_body) {
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('body found (.instapaper_body)');
            $this->body = $elems->item(0);
            // prune (clean up elements that may not be content)
            if ($this->config->prune) {
                $this->debug('Pruning content');
                $this->readability->prepArticle($this->body);
            }
            $detect_body = false;
        }
    }

    // Find author in rel="author" marked element
    // We only use this if there's exactly one.
    // If there's more than one, it could indicate more than
    // one author, but it could also indicate that we're processing
    // a page listing different articles with different authors.
    if ($detect_author) {
        $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
        if ($elems && $elems->length == 1) {
            $this->debug('Author found (rel="author")');
            $author = trim($elems->item(0)->textContent);
            if ($author != '') {
                $this->author[] = $author;
                $detect_author = false;
            }
        }
    }

    // Find date in pubdate marked time element
    // For the same reason given above, we only use this
    // if there's exactly one element.
    if ($detect_date) {
        $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
        if ($elems && $elems->length == 1) {
            $this->debug('Date found (pubdate marked time element)');
            $this->date = strtotime(trim($elems->item(0)->textContent));
            if ($this->date) {
                $detect_date = false;
            } else {
                $this->date = null;
            }
        }
    }

    // still missing title or body, so we detect using Readability
    $success = false;
    if ($detect_title || $detect_body) {
        $this->debug('Using Readability');
        // clone body if we're only using Readability for title (otherwise it may interfere with body element)
        if (isset($this->body)) {
            $this->body = $this->body->cloneNode(true);
        }
        $success = $this->readability->init();
    }
    if ($detect_title) {
        $this->debug('Detecting title');
        $this->title = $this->readability->getTitle()->textContent;
    }
    if ($detect_body && $success) {
        $this->debug('Detecting body');
        $this->body = $this->readability->getContent();
        // unwrap a body that consists of a single element
        if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
            $this->body = $this->body->firstChild;
        }
        // prune (clean up elements that may not be content)
        if ($this->config->prune) {
            $this->debug('Pruning content');
            $this->readability->prepArticle($this->body);
        }
    }

    if (isset($this->body)) {
        // remove scripts
        $this->readability->removeScripts($this->body);

        // remove any h1-h6 elements that appear as first thing in the body
        // and which match our title
        if (isset($this->title) && $this->title != '') {
            // skip leading non-element nodes; the body may be empty or may
            // contain no element at all, so every step is null-checked
            $firstChild = $this->body->firstChild;
            while ($firstChild !== null && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                $firstChild = $firstChild->nextSibling;
            }
            if ($firstChild !== null
                && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
                && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))
            ) {
                $this->body->removeChild($firstChild);
            }
        }
        $this->success = true;
    }

    // next-page detection: fetch the first link matching the configured
    // pattern, extract its content and append it to the body
    if (isset($this->body)) {
        $next_page_pattern = $this->options->next_page_pattern;
        $elems = false;
        if (is_string($next_page_pattern) && $next_page_pattern !== '') {
            $elems = @$xpath->query($next_page_pattern, $this->readability->dom);
        }

        if ($elems && $elems->length > 0) {
            // The first match is used. (The previous "$elems->legnth" typo
            // evaluated to null and therefore also selected index 0.)
            $link = $elems->item(0);
            $href = ($link instanceof DOMElement) ? $link->getAttribute('href') : '';

            $permalink = '';
            if ($href !== '') {
                if (substr($href, 0, 4) !== "http") {
                    // relative link: prefix scheme and host of the current page
                    $url_component = parse_url($url);
                    if (is_array($url_component) && isset($url_component["scheme"], $url_component["host"])) {
                        $permalink = $url_component["scheme"] . "://" . $url_component["host"] . $href;
                    }
                } else {
                    $permalink = $href;
                }
            }

            if ($permalink) {
                $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
                // NOTE(review): this second instance was created (and never used)
                // by the original code as well; it is kept because the constructor
                // may have side effects on shared configuration - confirm and drop.
                new ContentExtractor($this->path, $this->fallback);
                $extractor->fingerprints = $this->fingerprints;

                $content_block = null;
                $http = new HumbleHttpAgent();
                // the page is requested once (it used to be fetched twice)
                if (($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
                    $next_html = $response['body'];
                    // remove strange things
                    $next_html = str_replace('</[>', '', $next_html);
                    $next_html = convert_to_utf8($next_html, $response['headers']);
                    $extract_result = $extractor->process($next_html, $permalink);
                    $content_block = $extract_result ? $extractor->getContent() : null;
                }

                // only append content that was actually extracted and that parses as HTML
                if ($content_block) {
                    $inner_html = $content_block->innerHTML;
                    $doc = new DOMDocument();
                    if (is_string($inner_html) && $inner_html !== '' && @$doc->loadHTML($inner_html)) {
                        // the node belongs to the other extractor's document and
                        // has to be imported before it can be appended here
                        $content = $this->readability->dom->importNode($content_block, true);
                        $this->body->appendChild($content);
                    }
                }
            }
        }
    }

    // if we've had no success and we've used tidy, there's a chance
    // that tidy has messed up. So let's try again without tidy...
    if (!$this->success && $tidied && $smart_tidy) {
        $this->debug('Trying again without tidy');
        $this->process($original_html, $url, false);
    }

    return $this->success;
}
/**
 * Clean up $text with HTML Tidy.
 *
 * Tidy is skipped (and true returned) when $config['HTML_Tidy'] is empty
 * or 'off', unless $ignore_config is set.
 *
 * @param string $text The html content to be checked. Passed by reference;
 *                     replaced with Tidy's output on success
 * @param bool $ignore_config Run Tidy regardless of $config['HTML_Tidy']
 * @return bool false when Tidy is unavailable or reports a fatal error,
 *              true otherwise
 */
public static function tidyFix(&$text, $ignore_config = false)
{
    global $config;

    $tidy_switched_off = empty($config['HTML_Tidy']) || $config['HTML_Tidy'] == 'off';
    if (!$ignore_config && $tidy_switched_off) {
        return true;
    }

    if (!function_exists('tidy_parse_string')) {
        return false;
    }

    $tidy_settings = array(
        // no wrapping: change as little whitespace as possible
        'wrap' => 0,
        // omit, auto, strict, transitional, user
        'doctype' => 'omit',
        // drop empty paragraphs
        'drop-empty-paras' => true,
        // needed so that <br> becomes <br/> etc.
        'output-xhtml' => true,
        'show-body-only' => true,
        'hide-comments' => false,
    );

    $document = tidy_parse_string($text, $tidy_settings, 'utf8');
    tidy_clean_repair($document);

    // 2 is the status tidy reports for errors
    // http://www.php.net/manual/en/function.tidy-get-status.php
    $tidy_failed = tidy_get_status($document) === 2;
    if ($tidy_failed) {
        return false;
    }

    $text = tidy_get_output($document);
    return true;
}
<?php
/* Parse an HTML string into a tidy document */
$a = tidy_parse_string("<HTML></HTML>");

/* Repair the parsed markup in place */
$a->cleanRepair();

/* Fetch the repaired markup and print it escaped, one source line per <br /> */
$out = tidy_get_output($a);
$escaped = htmlspecialchars($out);
echo nl2br($escaped);
/**
 * Fetch the posts of a WordPress blog, following the "older entries" link
 * for as long as the oldest item collected so far is not older than
 * $last_fetched_item_date.
 *
 * Each item is an array of: title, raw content, parsed description, date
 * ('Y-m-d H:i:s', Europe/Berlin), link and the result of
 * fetch_and_parse_image().
 *
 * @param mixed  $fansub_id              Passed through to fetch_and_parse_image()
 * @param string $url                    URL of the first page
 * @param string $last_fetched_item_date Date of the newest item already known
 * @return array array('ok', $items) or array('error_connect', array())
 */
function fetch_via_wordpress_ynf($fansub_id, $url, $last_fetched_item_date)
{
    $elements = array();
    $tidy_config = "tidy.conf";

    $page_url = $url;
    while ($page_url !== NULL) {
        // An empty address or an empty/failed download counts as a connection error.
        $html_text = ($page_url != '') ? file_get_contents($page_url) : FALSE;
        if (!$html_text) {
            return array('error_connect', array());
        }

        $tidy = tidy_parse_string($html_text, $tidy_config, 'UTF8');
        tidy_clean_repair($tidy);
        $html = str_get_html(tidy_get_output($tidy));

        //parse through the HTML and build up the elements feed as we go along
        foreach ($html->find('article') as $article) {
            $title = $article->find('h1.entry-title a', 0);
            if ($title === NULL) {
                continue;
            }

            // An article without content or without a parsable date used to end in
            // a fatal error; missing content becomes '' and undated articles are skipped.
            $content_node = $article->find('div.entry-content', 0);
            $content = ($content_node !== NULL) ? $content_node->innertext : '';

            //The format is: 2013-09-02T14:43:43+00:00
            $time_node = $article->find('time', 0);
            $datetext = ($time_node !== NULL) ? $time_node->datetime : FALSE;
            $date = is_string($datetext) ? date_create_from_format('Y-m-d\\TH:i:sP', $datetext) : FALSE;
            if ($date === FALSE) {
                continue;
            }
            $date->setTimeZone(new DateTimeZone('Europe/Berlin'));

            $description = str_replace("text-align:center;", "", $content);

            $item = array();
            $item[0] = $title->innertext;
            $item[1] = $content;
            $item[2] = parse_description($description);
            $item[3] = $date->format('Y-m-d H:i:s');
            $item[4] = $title->href;
            $item[5] = fetch_and_parse_image($fansub_id, $url, $description);
            $elements[] = $item;
        }

        // Continue with the next page only while the oldest collected item
        // is not older than the last one we already know about.
        $page_url = NULL;
        if (count($elements) > 0 && $elements[count($elements) - 1][3] >= $last_fetched_item_date) {
            foreach ($html->find('text') as $text) {
                if ($text->plaintext == ' Entrades més antigues') {
                    //Not sleeping, Wordpress.com does not appear to be rate-limited
                    $page_url = $text->parent->href;
                    break;
                }
            }
        }
    }

    return array('ok', $elements);
}
/**
 * Turn $this->html into a DOMDocument stored in $this->dom.
 *
 * Keeps a copy of the untouched markup, applies the pre filters (unless
 * disabled by flag), optionally cleans the markup with Tidy, converts it to
 * HTML entities and parses it with html5lib or, failing that, libxml.
 *
 * @todo This should be called in init() instead of from __construct
 */
private function loadHtml()
{
    $this->original_html = $this->html;

    $this->logger->debug('Parsing URL: ' . $this->url);

    if ($this->url) {
        // host name without a leading "www." / "www2." etc., with dots escaped
        $bareHost = preg_replace('/www\\d*\\./', '', parse_url($this->url, PHP_URL_HOST));
        $this->domainRegExp = '/' . strtr($bareHost, array('.' => '\\.')) . '/';
    }

    mb_internal_encoding('UTF-8');
    mb_http_output('UTF-8');
    mb_regex_encoding('UTF-8');

    // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
    $prefilterEnabled = !$this->flagIsActive(self::FLAG_DISABLE_PREFILTER);
    if ($prefilterEnabled) {
        foreach ($this->pre_filters as $search => $replace) {
            $this->html = preg_replace($search, $replace, $this->html);
        }
        unset($search, $replace);
    }

    if (trim($this->html) === '') {
        $this->html = '<html></html>';
    }

    /*
     * Tidy repairs markup that would otherwise trouble DOMDocument's HTML
     * parsing. It occasionally makes matters worse, hence the switch.
     */
    if ($this->useTidy) {
        $this->logger->debug('Tidying document');

        $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
        if (tidy_clean_repair($tidy)) {
            $this->tidied = true;
            $this->html = $tidy->value;
            // collapse runs of line breaks into a single newline
            $this->html = preg_replace('/[\\r\\n]+/is', "\n", $this->html);
        }
        unset($tidy);
    }

    $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');

    // html5lib is used only when selected and when it returns a document
    $parsedWithHtml5lib = $this->parser === 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html));
    if (!$parsedWithHtml5lib) {
        libxml_use_internal_errors(true);

        $this->dom = new \DOMDocument();
        $this->dom->preserveWhiteSpace = false;

        // the libxml option argument is only passed on PHP 5.4.0 and later
        if (PHP_VERSION_ID < 50400) {
            $this->dom->loadHTML($this->html);
        } else {
            $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
        }

        libxml_use_internal_errors(false);
    }

    $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
}
// Feed untidy source into the stdin fwrite($pipes[0], $source); fclose($pipes[0]); // Read clean source out to the browser while (!feof($pipes[1])) { //echo fgets($pipes[1], 1024); $newsrc .= fgets($pipes[1], 1024); } fclose($pipes[1]); // Clean up after ourselves proc_close($process); } else { /* Use tidy if it's available from PECL */ if (function_exists('tidy_parse_string')) { $tempsrc = tidy_parse_string($source); tidy_clean_repair(); $newsrc = tidy_get_output(); } else { // Better give them back what they came with, so they don't lose it all... $newsrc = "<body>\n" . $source . "\n</body>"; } } // Split our source into an array by lines $srcLines = preg_split("/\n/", $newsrc, -1, PREG_SPLIT_NO_EMPTY); // Get only the lines between the body tags $startLn = 0; while (strpos($srcLines[$startLn++], '<body') === false && $startLn < sizeof($srcLines)) { } $endLn = $startLn; while (strpos($srcLines[$endLn++], '</body') === false && $endLn < sizeof($srcLines)) { }
/**
 * Fetch hCard for the specified URL.
 *
 * The page is repaired with the local Tidy extension when available,
 * otherwise it is fetched through the W3C Tidy web service. Of the hCards
 * found, the one hKit marks as preferred wins; otherwise the first card
 * whose uid or url matches $url, otherwise the first card.
 *
 * @param string $url URL to get hCard from
 * @return array array containing the hCard object (key: 'hcard', null when
 *               none was found) as well as the raw XML (key: 'xml')
 * @access private
 */
function ext_profile_hcard_from_url($url)
{
    if (function_exists('tidy_repair_string')) {
        $page = wp_remote_fopen($url);
        // tidy_clean_repair() only accepts a tidy object, so handing it the
        // page string never repaired anything; tidy_repair_string() is the
        // string-in/string-out equivalent.
        // NOTE(review): XHTML output is requested to mirror forceXML=on of
        // the remote fallback below - confirm this is what hKit expects.
        if (is_string($page) && $page !== '') {
            $repaired = tidy_repair_string($page, array('output-xhtml' => true));
            if (is_string($repaired)) {
                $page = $repaired;
            }
        }
    } else {
        $page = wp_remote_fopen('http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr=' . urlencode($url));
    }

    // NOTE(review): as it appears here, search and replacement both look like
    // a plain space; presumably one of them was a non-breaking space or an
    // entity originally - confirm against the upstream file.
    $page = str_replace(' ', ' ', $page);

    // parse hCard
    $hkit = extended_profile_hkit();
    @($hcard = $hkit->getByString('hcard', $page));

    $preferred_hcard = null;
    if (!empty($hcard['preferred'])) {
        // use preferred card if available, as specified by hKit
        $preferred_hcard = $hcard['preferred'][0];
    } elseif (!empty($hcard['all'])) {
        foreach ($hcard['all'] as $card) {
            $card_uid = isset($card['uid']) ? $card['uid'] : null;
            $card_url = isset($card['url']) ? $card['url'] : null;

            $url_matches = is_array($card_url) ? in_array($url, $card_url) : $card_url == $url;
            if ($card_uid == $url || $url_matches) {
                $preferred_hcard = $card;
                break;
            }
        }
        // nothing matched the URL: fall back to the first card on the page
        if (!$preferred_hcard) {
            $preferred_hcard = $hcard['all'][0];
        }
    }

    return array(
        'hcard' => $preferred_hcard,
        'xml' => isset($hcard['xml']) ? $hcard['xml'] : null,
    );
}
//$html = convert_to_utf8($html, $response['headers']); //$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); } else { die('Failed to fetch URL'); } if (trim($html) == '') { die('Empty response :('); } // use Tidy? if (isset($_GET['tidy']) && $_GET['tidy'] === '1') { if (!function_exists('tidy_parse_string')) { die('Tidy requested but not available on server.'); } $tidy_config = array('clean' => true, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 'new-inline-tags' => 'mark, time, meter, progress, data', 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, 'enclose-text' => true, 'enclose-block-text' => true, 'merge-divs' => true, 'merge-spans' => true, 'char-encoding' => 'utf8', 'hide-comments' => true); $tidy = tidy_parse_string($html, $tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { $original_html = $html; $html = $tidy->value; } } //TODO: use HTML5 parser? //TODO: escape $url for insering in JS variable $js_inject = ' <!--ff-script--> <script src="' . $base . '/js/jquery-latest.min.js"></script> <!--script src="' . $base . '/js/toolbox.expose.js"></script--> <script src="' . $base . '/js/css2xpath.js"></script> <script src="' . $base . '/js/jquery.dom-outline-1.0.js"></script> <script src="' . $base . '/init.js.php?url=' . urlencode($url) . '"></script> <!--/ff-script--> ';
/**
 * Generates HTML from an array of tokens.
 *
 * Script blocks get special treatment when Output.CommentScriptContents is
 * enabled, the result is optionally pretty-printed with Tidy
 * (Output.TidyFormat) and newlines are normalized to Output.Newline
 * (PHP_EOL when that is not configured).
 *
 * @param $tokens Array of HTMLPurifier_Token
 * @param $config HTMLPurifier_Config object; a default one is created when empty
 * @param $context Passed by reference; not used by this method itself
 * @return Generated HTML
 */
function generateFromTokens($tokens, $config, &$context)
{
    $html = '';

    if (!$config) {
        $config = HTMLPurifier_Config::createDefault();
    }

    $this->_scriptFix = $config->get('Output', 'CommentScriptContents');
    $this->_def = $config->getHTMLDefinition();
    $this->_xhtml = $this->_def->doctype->xml;

    if (!$tokens) {
        return '';
    }

    $size = count($tokens);
    $i = 0;
    while ($i < $size) {
        $is_script_block = $this->_scriptFix
            && $tokens[$i]->name === 'script'
            && $i + 2 < $size
            && $tokens[$i + 2]->type == 'end';

        if ($is_script_block) {
            // script special case: the contents of the script block must be
            // ONE token for this to work. Emit the start tag and the script
            // body here; the end tag is emitted by the generic call below.
            $html .= $this->generateFromToken($tokens[$i]);
            $html .= $this->generateScriptFromToken($tokens[$i + 1]);
            $i += 2;
        }

        $html .= $this->generateFromToken($tokens[$i]);
        $i++;
    }

    if ($config->get('Output', 'TidyFormat') && extension_loaded('tidy')) {
        $tidy_options = array(
            'indent' => true,
            'output-xhtml' => $this->_xhtml,
            'show-body-only' => true,
            'indent-spaces' => 2,
            'wrap' => 68,
        );

        if (!version_compare(PHP_VERSION, '5', '<')) {
            // PHP 5 and later: object interface
            $tidy = new Tidy();
            $tidy->parseString($html, $tidy_options, 'utf8');
            $tidy->cleanRepair();
            $html = (string) $tidy;
        } else {
            // PHP 4: global tidy state
            tidy_set_encoding('utf8');
            foreach ($tidy_options as $key => $value) {
                tidy_setopt($key, $value);
            }
            tidy_parse_string($html);
            tidy_clean_repair();
            $html = tidy_get_output();
        }
    }

    // normalize newlines to system
    $nl = $config->get('Output', 'Newline');
    if ($nl === null) {
        $nl = PHP_EOL;
    }

    return str_replace("\n", $nl, $html);
}
/**
 * Renders the view and returns the markup pretty-printed by Tidy.
 *
 * If the tidy extension is not installed, or Tidy cannot parse the
 * rendered markup, the output of render() is returned unchanged rather
 * than triggering a fatal error.
 *
 * @param mixed $data Not read by this method body.
 * @return string Tidy's output, or the value returned by $this->render()
 *                when Tidy cannot be used.
 */
public function index($data) {
    $html = $this->render();

    // Tidy is an optional extension; without it, serve the markup as rendered.
    if (!function_exists('tidy_parse_string')) {
        return $html;
    }

    $options = array(
        "indent" => true,
        "indent-spaces" => "2",
        "wrap" => "90",
        "output-html" => true,
        "hide-comments" => true
    );

    $tidy = tidy_parse_string($html, $options, 'utf8');

    // tidy_parse_string() returns false on failure; do not pass that on.
    if (!($tidy instanceof tidy)) {
        return $html;
    }

    tidy_clean_repair($tidy);

    return tidy_get_output($tidy);
}
/**
 * Extracts the title and body of an article from an HTML document.
 *
 * Steps, in order:
 *  1. Look up the site config for the URL's host (falling back to a default
 *     SiteConfig) and optionally run the HTML through Tidy.
 *  2. Parse the HTML with Readability and strip unwanted elements (config
 *     XPath patterns, id/class substrings, image src substrings, the
 *     .entry-unrelated / .instapaper_ignore classes, inline display:none).
 *  3. Try the configured title and body XPath patterns; for each, the first
 *     pattern that matches wins.
 *  4. If title/body are still missing and auto-detection is permitted, try
 *     hNews markup, then Instapaper class names, then Readability itself.
 *  5. If nothing was extracted and Tidy was used, retry once without Tidy.
 *
 * Results are stored on $this->title, $this->body and $this->success.
 *
 * @param string $html       HTML source of the page
 * @param string $url        URL of the page; its host selects the site config
 * @param bool   $smart_tidy When true, Tidy may be applied and a failed
 *                           extraction is retried without Tidy
 * @return bool The value of $this->success after extraction
 */
public function process($html, $url, $smart_tidy = true) {
    // NOTE(review): presumably clears state left over from a previous run - confirm in reset()
    $this->reset();

    // extract host name
    $host = @parse_url($url, PHP_URL_HOST);
    if (!($this->config = SiteConfig::build($host))) {
        // no match, so use defaults
        $this->config = new SiteConfig();
    }
    // store copy of config in our static cache array in case we need to process another URL
    SiteConfig::add_to_cache($host, $this->config);

    // use tidy (if it exists)?
    // This fixes problems with some sites which would otherwise
    // trouble DOMDocument's HTML parsing. (Although sometimes it
    // makes matters worse, which is why you can override it in site config files.)
    $tidied = false;
    if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
        $this->debug('Using Tidy');
        $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
        if (tidy_clean_repair($tidy)) {
            // keep the untouched input so we can retry without tidy later on
            $original_html = $html;
            $tidied = true;
            $html = $tidy->value;
        }
        unset($tidy);
    }

    // load and parse html
    $this->readability = new Readability($html, $url);

    // we use xpath to find elements in the given HTML document
    // see http://en.wikipedia.org/wiki/XPath_1.0
    $xpath = new DOMXPath($this->readability->dom);

    // strip elements (using xpath expressions)
    foreach ($this->config->strip as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip)');
            // walk the list backwards while removing nodes
            for ($i = $elems->length - 1; $i >= 0; $i--) {
                $elems->item($i)->parentNode->removeChild($elems->item($i));
            }
        }
    }

    // strip elements (using id and class attribute values)
    foreach ($this->config->strip_id_or_class as $string) {
        // drop quote characters so the value cannot break out of the XPath string literal
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
            for ($i = $elems->length - 1; $i >= 0; $i--) {
                $elems->item($i)->parentNode->removeChild($elems->item($i));
            }
        }
    }

    // strip images (using src attribute values)
    foreach ($this->config->strip_image_src as $string) {
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' image elements');
            for ($i = $elems->length - 1; $i >= 0; $i--) {
                $elems->item($i)->parentNode->removeChild($elems->item($i));
            }
        }
    }

    // strip elements using Readability.com and Instapaper.com ignore class names
    // .entry-unrelated and .instapaper_ignore
    // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
    // and http://blog.instapaper.com/post/730281947
    $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
        for ($i = $elems->length - 1; $i >= 0; $i--) {
            $elems->item($i)->parentNode->removeChild($elems->item($i));
        }
    }

    // strip elements whose inline style contains the exact substring "display:none"
    $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
        for ($i = $elems->length - 1; $i >= 0; $i--) {
            $elems->item($i)->parentNode->removeChild($elems->item($i));
        }
    }

    // try to get title (first pattern that yields a string or a node wins)
    foreach ($this->config->title as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Title expression evaluated as string');
            $this->title = trim($elems);
            break;
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Title matched');
            $this->title = $elems->item(0)->textContent;
            break;
        }
    }

    // try to get body (first pattern that matches wins)
    foreach ($this->config->body as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Body matched');
            if ($elems->length == 1) {
                $this->body = $elems->item(0);
                // prune (clean up elements that may not be content)
                if ($this->config->prune) {
                    $this->debug('Pruning content');
                    $this->readability->prepArticle($this->body);
                }
                break;
            } else {
                // several matches: collect them inside a new wrapper element
                $this->body = $this->readability->dom->createElement('div');
                $this->debug($elems->length . ' body elems found');
                foreach ($elems as $elem) {
                    // skip elements already contained in one we have added
                    $isDescendant = false;
                    foreach ($this->body->childNodes as $parent) {
                        if ($this->isDescendant($parent, $elem)) {
                            $isDescendant = true;
                            break;
                        }
                    }
                    if ($isDescendant) {
                        $this->debug('Element is child of another body element, skipping.');
                    } else {
                        // prune (clean up elements that may not be content)
                        if ($this->config->prune) {
                            $this->debug('Pruning content');
                            $this->readability->prepArticle($elem);
                        }
                        $this->debug('Element added to body');
                        $this->body->appendChild($elem);
                    }
                }
                // Stop here, just like the single-match case. Without this,
                // a later pattern could replace the body we just assembled,
                // after its elements were already moved out of the document.
                break;
            }
        }
    }

    // auto detect?
    $detect_title = $detect_body = false;
    // detect title? (no patterns configured, or patterns failed and fallback is allowed)
    if (!isset($this->title)) {
        if (empty($this->config->title) || !empty($this->config->title) && $this->config->autodetect_on_failure) {
            $detect_title = true;
        }
    }
    // detect body?
    if (!isset($this->body)) {
        if (empty($this->config->body) || !empty($this->config->body) && $this->config->autodetect_on_failure) {
            $detect_body = true;
        }
    }

    // check for hNews
    if ($detect_title || $detect_body) {
        // check for hentry
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('hNews: found hentry');
            $hentry = $elems->item(0);

            if ($detect_title) {
                // check for entry-title
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-title');
                    $this->title = $elems->item(0)->textContent;
                    $detect_title = false;
                }
            }

            // check for entry-content.
            // according to hAtom spec, if there are multiple elements marked entry-content,
            // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
            if ($detect_body) {
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-content');
                    if ($elems->length == 1) {
                        // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                        $e = $elems->item(0);
                        if ($e->tagName == 'img' || trim($e->textContent) != '') {
                            $this->body = $elems->item(0);
                            // prune (clean up elements that may not be content)
                            if ($this->config->prune) {
                                $this->debug('Pruning content');
                                $this->readability->prepArticle($this->body);
                            }
                            $detect_body = false;
                        } else {
                            $this->debug('hNews: skipping entry-content - appears not to contain content');
                        }
                        unset($e);
                    } else {
                        $this->body = $this->readability->dom->createElement('div');
                        $this->debug($elems->length . ' entry-content elems found');
                        foreach ($elems as $elem) {
                            $isDescendant = false;
                            foreach ($this->body->childNodes as $parent) {
                                if ($this->isDescendant($parent, $elem)) {
                                    $isDescendant = true;
                                    break;
                                }
                            }
                            if ($isDescendant) {
                                $this->debug('Element is child of another body element, skipping.');
                            } else {
                                // prune (clean up elements that may not be content)
                                if ($this->config->prune) {
                                    $this->debug('Pruning content');
                                    $this->readability->prepArticle($elem);
                                }
                                $this->debug('Element added to body');
                                $this->body->appendChild($elem);
                            }
                        }
                        $detect_body = false;
                    }
                }
            }
        }
    }

    // check for elements marked with instapaper_title
    if ($detect_title) {
        // check for instapaper_title
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('title found (.instapaper_title)');
            $this->title = $elems->item(0)->textContent;
            $detect_title = false;
        }
    }

    // check for elements marked with instapaper_body
    if ($detect_body) {
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('body found (.instapaper_body)');
            $this->body = $elems->item(0);
            // prune (clean up elements that may not be content)
            if ($this->config->prune) {
                $this->debug('Pruning content');
                $this->readability->prepArticle($this->body);
            }
            $detect_body = false;
        }
    }

    // still missing title or body, so we detect using Readability
    if ($detect_title || $detect_body) {
        $this->debug('Using Readability');
        // clone body if we're only using Readability for title (otherwise it may interfere with body element)
        if (isset($this->body)) {
            $this->body = $this->body->cloneNode(true);
        }
        $success = $this->readability->init();
    }
    if ($detect_title) {
        $this->debug('Detecting title');
        $this->title = $this->readability->getTitle()->textContent;
    }
    // $success is always set here when $detect_body is true (see the block above)
    if ($detect_body && $success) {
        $this->debug('Detecting body');
        $this->body = $this->readability->getContent();
        // unwrap the result when it holds exactly one element child
        if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
            $this->body = $this->body->firstChild;
        }
        // prune (clean up elements that may not be content)
        if ($this->config->prune) {
            $this->debug('Pruning content');
            $this->readability->prepArticle($this->body);
        }
    }

    if (isset($this->body)) {
        // remove scripts
        $this->readability->removeScripts($this->body);
        $this->success = true;
    }

    // if we've had no success and we've used tidy, there's a chance
    // that tidy has messed up. So let's try again without tidy...
    if (!$this->success && $tidied && $smart_tidy) {
        $this->debug('Trying again without tidy');
        $this->process($original_html, $url, false);
    }

    return $this->success;
}
<?php
// Load clean_ex1.html through Tidy with the "clean" option enabled.
$tidy = tidy_parse_file("clean_ex1.html", array("clean" => true));

// tidy_parse_file() returns false when the file cannot be loaded; stop here
// instead of handing false to tidy_clean_repair().
if ($tidy === false) {
    die('Failed to parse clean_ex1.html');
}

// Run Tidy's clean-and-repair pass, then print the resulting document.
tidy_clean_repair($tidy);
echo $tidy;
/**
 * Optionally cleans up markup with Tidy and returns it trimmed.
 *
 * Tidy is applied only when the TIDY_CLEAN constant is truthy and the tidy
 * extension is available. If Tidy cannot parse the input, the original
 * markup is kept instead of being replaced by the failed result.
 *
 * @param string $source Markup to clean
 * @return string Trimmed markup, repaired by Tidy when it could be applied
 */
private function _tidyClean($source) {
    if (TIDY_CLEAN && function_exists('tidy_parse_string')) {
        // Parse into a separate variable so $source survives a parse failure.
        $tidy = tidy_parse_string($source);
        if ($tidy instanceof tidy) {
            tidy_clean_repair($tidy);
            // Convert the document to a string explicitly before trimming.
            $source = tidy_get_output($tidy);
        }
    }

    return trim($source);
}