Exemplo n.º 1
0
 /**
  * Repair an HTML string with Tidy and hand the result to phpQuery.
  *
  * phpQuery is asked to unload its documents before the repaired markup is
  * registered as a new document.
  *
  * @param string $html Raw markup to repair.
  * @return mixed Whatever phpQuery::newDocumentHTML() returns for the repaired markup.
  */
 function load_html($html)
 {
     $document = tidy_parse_string($html);
     tidy_clean_repair($document);
     $repaired = tidy_get_html($document);

     phpQuery::unloadDocuments();

     return phpQuery::newDocumentHTML($repaired);
 }
Exemplo n.º 2
0
 /**
  * Post-render hook: pretty-print the rendered view content with Tidy.
  *
  * Does nothing when the tidy extension is not loaded. After Tidy has run,
  * the XHTML <html ...> opening tags it emits are replaced by an HTML5
  * doctype and the line break Tidy puts before </script> is removed.
  *
  * @param mixed $event Event that triggered the hook (not used here).
  * @param mixed $view  Object exposing getContent() and setContent().
  * @return void
  */
 public function afterRender($event, $view)
 {
     if (!extension_loaded('tidy')) {
         return;
     }

     $tidyOptions = array(
         'hide-comments' => true,
         'tidy-mark' => false,
         'indent' => true,
         'indent-spaces' => 4,
         'new-blocklevel-tags' => 'article,header,footer,section,nav',
         'new-inline-tags' => 'video,audio,canvas,ruby,rt,rp',
         'doctype' => '<!DOCTYPE HTML>',
         'sort-attributes' => 'alpha',
         'vertical-space' => false,
         'output-xhtml' => true,
         'wrap' => 150,
         'wrap-attributes' => false,
         'break-before-br' => false,
     );

     $document = tidy_parse_string($view->getContent(), $tidyOptions, 'utf8');
     tidy_clean_repair($document);

     // The tidy document is converted to a string by str_replace().
     $xhtmlRootTags = array(
         '<html lang="en" xmlns="http://www.w3.org/1999/xhtml">',
         '<html xmlns="http://www.w3.org/1999/xhtml">',
     );
     $markup = str_replace($xhtmlRootTags, '<!DOCTYPE html>', $document);
     $markup = str_replace(">\n</script>", "></script>", $markup);

     $view->setContent((string) $markup);
 }
Exemplo n.º 3
0
/**
 * Build the complete HTML page, print it and end the script.
 *
 * The page is assembled from theme() callbacks (head, top, links, panels,
 * footer). When theme('output_error') yields a value, that value replaces the
 * body, the title becomes "Error" and the panels are left out. Messages held
 * in the global $error list are rendered in an error box above the body.
 * When the tidy extension is loaded, the markup is re-indented before output.
 *
 * @param string $title Page title; " - Page N" is appended when ?page=N is greater than 1.
 * @param string $body  Main page content.
 * @param string $head  Extra markup for the document head (see the note on the global declaration).
 * @return void Never returns normally: the function ends with die().
 */
function output($title = '', $body = '', $head = '')
{
    // NOTE(review): $head is also declared global here, which rebinds the local
    // name to the global variable, so the $head argument does not appear to be
    // what reaches theme('head') - confirm this is intended before changing it.
    global $settings, $authid, $checkleft, $checkright, $head, $error, $error_die;
    if (theme('output_error') != false) {
        $body = theme('output_error');
        $title = 'Error';
        $panels = false;
        $lowerpanel = false;
        $panel = '';
        unset($error_die);
    } else {
        $panels = true;
    }
    //display panels
    if ($panels != false) {
        $panel = theme('displaypanels');
        $lowerpanel = theme('displaylowerpanel');
    }
    if (isset($error) && !empty($error)) {
        $errors = '<br />' . theme('title', 'Error') . theme('start_content') . '<div class="errors"><ul>';
        foreach ($error as $error1) {
            $errors .= '<li>' . $error1 . '</li>';
        }
        $errors .= '</ul></div>' . theme('end_content');
        unset($error);
    } else {
        $errors = '';
    }
    // The page number comes straight from the query string and ends up in the
    // document title, so it is reduced to an integer before use. The previous
    // loose "> 1" comparison let values such as "2<script>" through verbatim.
    if (isset($_GET['page']) && is_scalar($_GET['page'])) {
        $page = (int) $_GET['page'];
        if ($page > 1) {
            $title = $title . ' - Page ' . $page;
        }
    }
    $output = theme('head', stripslashes($title), $head) . '<body>';
    if ($settings['maintenance_mode'] == 'on') {
        $output .= '<div class="titlebg">WARNING: Maintenance Mode is on</div>';
    }
    $output .= '<div id="container">
	' . theme('top') . theme('links');
    $output .= $panel;
    //display the data
    $output .= $errors . '<br />' . stripslashes($body);
    $output .= $lowerpanel . theme('footer');
    //SEO Friendly Links
    // NOTE(review): the included script presumably rewrites the links in $output - verify.
    include IN_PATH . '/functions/seofriendlyurls.php';
    //Check if the tidy library is installed
    if (extension_loaded('tidy')) {
        // Re-indent the markup so the page source is readable; wrap 0 disables line wrapping.
        $options = array("indent" => true, 'wrap' => 0);
        $output = tidy_parse_string($output, $options);
        tidy_clean_repair($output);
    }
    die($output);
}
Exemplo n.º 4
0
/**
 * Tidy a string, or every element of an array, into well-formed (x)HTML.
 *
 * Options are read from the tidy.conf file under SETTINGS_INC, which should at
 * least set show-body-only to yes. Three back ends are tried in this order:
 * the object-style tidy extension, the older tidy_setopt()-era extension, and
 * the command-line binary named by TIDY_EXE. When none is available a notice
 * is triggered and the text comes back wrapped in <html><body> but untidied.
 *
 * @param mixed $text String to tidy, or an array whose elements are tidied recursively
 * @return mixed $result Tidied string, or an array of tidied values under the original keys
 */
function tidy($text)
{
    static $tidy_funcs;
    static $tidy_conf;

    if (!isset($tidy_conf)) {
        $tidy_conf = SETTINGS_INC . 'tidy.conf';
    }

    // Arrays are handled element by element, keeping the original keys.
    if (is_array($text)) {
        $tidied = array();
        foreach ($text as $key => $value) {
            $tidied[$key] = tidy($value);
        }
        return $tidied;
    }

    // Work out which back ends exist; the function list is cached between calls.
    if (empty($tidy_funcs)) {
        $tidy_funcs = get_extension_funcs('tidy');
    }
    $extension_loaded = !empty($tidy_funcs);
    $has_setopt = $extension_loaded && in_array('tidy_setopt', $tidy_funcs);
    $legacy_api = $has_setopt;
    $object_api = $extension_loaded && !$has_setopt;
    $binary_found = TIDY_EXE && file_exists(TIDY_EXE);

    $text = protect_string_from_tidy($text);
    $text = '<html><body>' . $text . '</body></html>';

    if ($object_api) {
        $document = new tidy();
        $document->parseString($text, $tidy_conf, 'utf8');
        $document->cleanRepair();
        $result = $document;
    } elseif ($legacy_api) {
        tidy_load_config($tidy_conf);
        tidy_set_encoding('utf8');
        tidy_parse_string($text);
        tidy_clean_repair();
        $result = tidy_get_output();
    } elseif ($binary_found) {
        // Pipe the shell-escaped text into the tidy binary and capture what it prints.
        $command = sprintf('echo %s | %s -q -config %s 2> /dev/null', escapeshellarg($text), TIDY_EXE, $tidy_conf);
        $result = shell_exec($command);
    } else {
        trigger_error('tidy does not appear to be available within php or at the command line - no tidying is taking place.');
        $result = $text;
    }

    return trim($result);
}
Exemplo n.º 5
0
 /**
  * Tidy the given text in-process through the HTML tidy PECL extension,
  * which avoids the cost of spawning an external process.
  *
  * Written against the PHP 4.3.x flavour of the extension (global tidy
  * state, no document handle), so it may not work on PHP 5.
  *
  * 'pear install tidy' should be able to compile the extension module.
  *
  * @param string $text Markup to tidy.
  * @return string|null Tidied markup, or null when Tidy reports a fatal error.
  */
 private static function internal($text)
 {
     global $wgTidyConf;

     $profileSection = 'Parser::internalTidy';
     wfProfileIn($profileSection);

     tidy_load_config($wgTidyConf);
     tidy_set_encoding('utf8');
     tidy_parse_string($text);
     tidy_clean_repair();

     // 2 is the magic number for a fatal error
     // http://www.php.net/manual/en/function.tidy-get-status.php
     $cleansource = tidy_get_status() == 2 ? null : tidy_get_output();

     wfProfileOut($profileSection);

     return $cleansource;
 }
Exemplo n.º 6
0
 /**
  * Run the data through HTML Tidy using the options in $this->_params.
  *
  * The data is returned untouched when the tidy extension is missing.
  * The tidy 1.0 calling convention (tidy_setopt() plus global state) is used
  * when tidy_setopt() exists and the params are an array; otherwise the
  * handle-based convention is used.
  *
  * @access	public
  * @param	string		data to tidy
  * @return	string		tidied data
  */
 function apply($data)
 {
     if (!function_exists('tidy_parse_string')) {
         return $data;
     }

     $useLegacyApi = function_exists('tidy_setopt') && is_array($this->_params);

     if (!$useLegacyApi) {
         $document = tidy_parse_string($data, $this->_params);
         tidy_clean_repair($document);

         return tidy_get_output($document);
     }

     /**
      * tidy 1.0
      */
     foreach ($this->_params as $option => $setting) {
         tidy_setopt($option, $setting);
     }
     tidy_parse_string($data);
     tidy_clean_repair();

     return tidy_get_output();
 }
Exemplo n.º 7
0
/**
 * Convert a small BBCode dialect to HTML and, when HTML Tidy is available,
 * clean the result up as an XHTML body fragment.
 *
 * Supported tags: [b] [i] [u] [center] [left] [right] [ol] [ul] [li] [br]
 * (matched case-sensitively) plus [img]...[/img] and [url]http(s)://...[/url]
 * (matched case-insensitively). Existing HTML in the message is kept on purpose.
 *
 * @param string $message Text containing BBCode.
 * @param bool   $nowrap  When true Tidy does not wrap lines; otherwise it wraps at 80 columns.
 * @return string The converted (and, if possible, tidied) markup.
 */
function return_parsed_bbcode($message, $nowrap = false)
{
    // never strip_tags here, see Page.Talks for details
    $simple_tags = array(
        '[b]' => '<b>',
        '[/b]' => '</b>',
        '[i]' => '<i>',
        '[/i]' => '</i>',
        '[u]' => '<u>',
        '[/u]' => '</u>',
        '[center]' => '<div align="center">',
        '[/center]' => '</div>',
        '[left]' => '<div align="left">',
        '[/left]' => '</div>',
        '[right]' => '<div align="right">',
        '[/right]' => '</div>',
        '[ol]' => '<ol>',
        '[ul]' => '<ul>',
        '[li]' => '<li>',
        // [/li] used to be left in the output as literal text.
        '[/li]' => '</li>',
        '[/ol]' => '</ol>',
        '[/ul]' => '</ul>',
        '[br]' => '<br>',
    );
    $message = str_replace(array_keys($simple_tags), array_values($simple_tags), $message);

    // preg_replace() stands in for eregi_replace(), which no longer exists in
    // PHP 7+; the "i" flag keeps the matching case-insensitive.
    $pattern_tags = array(
        '#\[img\]([^\[]*)\[/img\]#i' => '<img src="$1" border="0">',
        '#\[url\](https?://[^\[]*)\[/url\]#i' => '<a href="$1">$1</a>',
    );
    foreach ($pattern_tags as $pattern => $replacement) {
        $converted = preg_replace($pattern, $replacement, $message);
        // preg_replace() returns null on a PCRE error; keep the message in that case.
        if ($converted !== null) {
            $message = $converted;
        }
    }

    // A wrap value of 0 turns wrapping off. Leaving the option out, as before,
    // made Tidy fall back to its own default width instead of not wrapping.
    $config = array(
        'indent' => FALSE,
        'output-xhtml' => TRUE,
        'show-body-only' => TRUE,
        'wrap' => $nowrap ? 0 : 80,
    );

    if (function_exists('tidy_setopt')) {
        // tidy 1.0 (PHP 4): options and the document live in global state.
        tidy_set_encoding('UTF8');
        foreach ($config as $key => $value) {
            tidy_setopt($key, $value);
        }
        tidy_parse_string($message);
        tidy_clean_repair();
        $message = tidy_get_output();
    } elseif (function_exists('tidy_parse_string')) {
        // tidy 2.0 (PHP 5+): tidy_setopt()/tidy_set_encoding() do not exist here,
        // so the options are passed to the parser and a document handle is used.
        $document = tidy_parse_string($message, $config, 'utf8');
        if ($document !== false) {
            tidy_clean_repair($document);
            $message = tidy_get_output($document);
        }
    }

    return $message;
}
Exemplo n.º 8
0
 /**
  * Run $this->html through HTML Tidy and store the cleaned markup back in
  * $this->html as a string.
  *
  * Uses the tidy class when it exists, otherwise the procedural tidy 1.0
  * functions. When neither is present a warning is printed and $this->html
  * is left unchanged. Options come from $this->TidyConfig and the character
  * encoding from $this->Encoding.
  *
  * @return void
  */
 function TidyClean()
 {
     if (!class_exists('tidy')) {
         if (function_exists('tidy_parse_string')) {
             //use procedural style for compatibility with PHP 4.3
             tidy_set_encoding($this->Encoding);
             foreach ($this->TidyConfig as $key => $value) {
                 tidy_setopt($key, $value);
             }
             tidy_parse_string($this->html);
             tidy_clean_repair();
             $this->html = tidy_get_output();
         } else {
             print "<b>No tidy support. Please enable it in your php.ini.\r\nOnly basic cleaning is beeing applied\r\n</b>";
         }
     } else {
         //PHP 5 only !!!
         $tidy = new tidy();
         $tidy->parseString($this->html, $this->TidyConfig, $this->Encoding);
         $tidy->cleanRepair();
         // Store the markup itself rather than the tidy object, so the property
         // holds a string here just as it does in the procedural branch.
         $this->html = tidy_get_output($tidy);
     }
 }
Exemplo n.º 9
0
/**
 * Run the application object, tidy what it returns and send it to the client
 * together with a Content-Length header.
 *
 * Two output buffers are stacked: flushing the inner one moves the page into
 * the outer one, whose length is then known before anything has been sent,
 * so the header can still be emitted ahead of the final flush.
 *
 * @param mixed $init Object exposing process() and run(); the return value of run() is the page markup.
 * @return void
 */
function tidy_output($init)
{
    $init->process();

    ob_start();
    ob_start();

    $output = $init->run();

    // The output flavour is fixed to XHTML; the HTML profile is kept alongside it.
    $tidy = 'xhtml';
    $profiles = array(
        'xhtml' => array('output-xhtml' => true, 'indent' => true, 'input-encoding' => 'utf8', 'output-encoding' => 'utf8'),
        'html' => array('output-html' => true, 'indent' => true, 'input-encoding' => 'utf8', 'output-encoding' => 'utf8', 'clean' => true),
    );
    if (isset($profiles[$tidy])) {
        $output = tidy_parse_string($output, $profiles[$tidy]);
        tidy_clean_repair($output);
    }

    echo $output;

    ob_end_flush();
    header('Content-Length: ' . ob_get_length());
    ob_end_flush();
}
Exemplo n.º 10
0
 /**
  * Tidy a chunk of markup using the back end selected by $this->tidy_mode.
  *
  * - 'exec': writes the markup to a temporary file under $this->tmp_dir and
  *   runs the tidy command-line program on it.
  * - 'php': uses the tidy extension.
  * - anything else: returns the markup unchanged.
  *
  * @param string $source Markup to tidy.
  * @return string Tidied markup, or $source itself when no tidying could be done.
  */
 private function tidyThis($source)
 {
     switch ($this->tidy_mode) {
         case 'exec':
             $tmp_file = $this->tmp_dir . md5($source) . '.txt';
             if (file_put_contents($tmp_file, $source) === false) {
                 // Nothing for tidy to read; hand the markup back untouched.
                 return $source;
             }
             $lines = array();
             // The path is escaped so that spaces or shell metacharacters in
             // tmp_dir cannot break or alter the command.
             exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $lines);
             unlink($tmp_file);
             return implode("\n", $lines);
         case 'php':
             $tidy = tidy_parse_string($source);
             if ($tidy === false) {
                 return $source;
             }
             // tidy_clean_repair() only reports success as a boolean; the
             // repaired markup has to be read from the document afterwards.
             tidy_clean_repair($tidy);
             return tidy_get_output($tidy);
         default:
             return $source;
     }
 }
Exemplo n.º 11
0
 /**
  * Clean up $text with HTML Tidy, replacing it in place with an XHTML body fragment.
  *
  * Nothing is done (and true is returned) when $config['HTML_Tidy'] is empty
  * or 'off', unless $ignore_config is set.
  *
  * @param string $text The html content to be fixed. Passed by reference
  * @param bool $ignore_config Run Tidy regardless of the $config['HTML_Tidy'] setting
  * @return bool False when the tidy extension is missing or Tidy reports a fatal error, true otherwise
  */
 static function tidyFix(&$text, $ignore_config = false)
 {
     global $config;

     if (!$ignore_config && (empty($config['HTML_Tidy']) || $config['HTML_Tidy'] == 'off')) {
         return true;
     }
     if (!function_exists('tidy_parse_string')) {
         return false;
     }

     $options = array(
         // no wrapping, to change as little whitespace as possible
         'wrap' => 0,
         // one of: omit, auto, strict, transitional, user
         'doctype' => 'omit',
         'drop-empty-paras' => true,
         // needed so that <br> becomes <br/> and so on
         'output-xhtml' => true,
         'show-body-only' => true,
         'hide-comments' => false,
         // 'anchor-as-name' is left at its default (true); it is not always available
     );

     //
     //	php4: options and the document are held in global tidy state
     //
     if (function_exists('tidy_setopt')) {
         $options['char-encoding'] = 'utf8';
         gp_edit::tidyOptions($options);
         $tidy = tidy_parse_string($text);
         tidy_clean_repair();
         // 2 is the magic number for a fatal error
         // http://www.php.net/manual/en/function.tidy-get-status.php
         if (tidy_get_status() === 2) {
             // Note: $tidyErrors is a local variable here and is not returned.
             $tidyErrors[] = 'Tidy found serious XHTML errors: <br/>' . nl2br(htmlspecialchars(tidy_get_error_buffer($tidy)));
             return false;
         }
         $text = tidy_get_output();

         return true;
     }

     //
     //	php5: every call works on an explicit document handle
     //
     $tidy = tidy_parse_string($text, $options, 'utf8');
     tidy_clean_repair($tidy);
     // 2 is the magic number for a fatal error
     // http://www.php.net/manual/en/function.tidy-get-status.php
     if (tidy_get_status($tidy) === 2) {
         // Note: $tidyErrors is a local variable here and is not returned.
         $tidyErrors[] = 'Tidy found serious XHTML errors: <br/>' . nl2br(htmlspecialchars(tidy_get_error_buffer($tidy)));
         return false;
     }
     $text = tidy_get_output($tidy);

     return true;
 }
Exemplo n.º 12
0
 /**
  * Turn a markup fragment into clean XHTML (body content only, no doctype).
  *
  * Falls back to strip_tags(), keeping only <br> and <a>, when the tidy
  * extension is neither loaded nor loadable, or when Tidy produces no output.
  *
  * @param string $string Markup to clean.
  * @return string Cleaned markup, or the tag-stripped input as a fallback.
  */
 function cleanXHTML($string)
 {
     // dl() is not available in every SAPI, so check for it before calling it.
     if (!extension_loaded('tidy') && !(function_exists('dl') && dl('tidy.so'))) {
         return strip_tags($string, '<br><a>');
     }

     $options = array(
         'output-xhtml' => true,
         'doctype' => 'omit',
         'show-body-only' => true,
     );

     if (function_exists('tidy_setopt')) {
         // tidy 1.0 (PHP 4): options and the document live in global state.
         foreach ($options as $option => $value) {
             tidy_setopt($option, $value);
         }
         tidy_parse_string($string);
         tidy_clean_repair();
         $clean = tidy_get_output();
     } else {
         // tidy 2.0 (PHP 5+): tidy_setopt() does not exist, so the options go to
         // the parser and every call takes the document handle.
         $clean = '';
         $document = tidy_parse_string($string, $options);
         if ($document !== false) {
             tidy_clean_repair($document);
             $clean = tidy_get_output($document);
         }
     }

     if (!$clean) {
         return strip_tags($string, '<br><a>');
     }
     return $clean;
 }
Exemplo n.º 13
0
 public function process($html, $url, $smart_tidy = true)
 {
     $this->reset();
     // use user submitted config and merge it with regular one
     if (isset($this->userSubmittedConfig)) {
         $this->debug('Using user-submitted site config');
         $this->config = $this->userSubmittedConfig;
         if ($this->config->autodetect_on_failure()) {
             $this->debug('Merging user-submitted site config with site config files associated with this URL and/or content');
             $this->config->append($this->buildSiteConfig($url, $html));
         }
     } else {
         $this->config = $this->buildSiteConfig($url, $html);
     }
     // do string replacements
     if (!empty($this->config->find_string)) {
         if (count($this->config->find_string) == count($this->config->replace_string)) {
             $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
             $this->debug("Strings replaced: {$_count} (find_string and/or replace_string)");
         } else {
             $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
         }
         unset($_count);
     }
     // use tidy (if it exists)?
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it
     // makes matters worse, which is why you can override it in site config files.)
     $tidied = false;
     if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
         $this->debug('Using Tidy');
         $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
         if (tidy_clean_repair($tidy)) {
             $original_html = $html;
             $tidied = true;
             $html = $tidy->value;
         }
         unset($tidy);
     }
     // load and parse html
     if ($this->parserOverride) {
         // from querystring: &parser=xxx
         $_parser = $this->parserOverride;
     } else {
         // from site config file: parser: xxx
         $_parser = $this->config->parser();
     }
     // for backword compatibility...
     if ($_parser == 'html5lib') {
         $_parser = 'html5php';
     }
     if (!in_array($_parser, $this->allowedParsers)) {
         $this->debug("HTML parser {$_parser} not listed, using " . $this->defaultParser . " instead");
         $_parser = $this->defaultParser;
     }
     $this->debug("Attempting to parse HTML with {$_parser}");
     $this->readability = new Readability($html, $url, $_parser);
     // we use xpath to find elements in the given HTML document
     // see http://en.wikipedia.org/wiki/XPath_1.0
     $xpath = new DOMXPath($this->readability->dom);
     // try to get next page link
     foreach ($this->config->next_page_link as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->nextPageUrl = trim($elems);
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             foreach ($elems as $item) {
                 if ($item instanceof DOMElement && $item->hasAttribute('href')) {
                     $this->nextPageUrl = $item->getAttribute('href');
                     break 2;
                 } elseif ($item instanceof DOMAttr && $item->value) {
                     $this->nextPageUrl = $item->value;
                     break 2;
                 }
             }
         }
     }
     // check if this is a native ad
     foreach ($this->config->native_ad_clue as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->nativeAd = true;
             break;
         }
     }
     // try to get title
     foreach ($this->config->title as $pattern) {
         // $this->debug("Trying $pattern");
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->title = trim($elems);
             $this->debug('Title expression evaluated as string: ' . $this->title);
             $this->debug("...XPath match: {$pattern}");
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->title = $elems->item(0)->textContent;
             $this->debug('Title matched: ' . $this->title);
             $this->debug("...XPath match: {$pattern}");
             // remove title from document
             try {
                 @$elems->item(0)->parentNode->removeChild($elems->item(0));
             } catch (DOMException $e) {
                 // do nothing
             }
             break;
         }
     }
     // try to get author (if it hasn't already been set)
     if (empty($this->author)) {
         foreach ($this->config->author as $pattern) {
             $elems = @$xpath->evaluate($pattern, $this->readability->dom);
             if (is_string($elems)) {
                 if (trim($elems) != '') {
                     $this->author[] = trim($elems);
                     $this->debug('Author expression evaluated as string: ' . trim($elems));
                     $this->debug("...XPath match: {$pattern}");
                     break;
                 }
             } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                 foreach ($elems as $elem) {
                     if (!isset($elem->parentNode)) {
                         continue;
                     }
                     $this->author[] = trim($elem->textContent);
                     $this->debug('Author matched: ' . trim($elem->textContent));
                 }
                 if (!empty($this->author)) {
                     $this->debug("...XPath match: {$pattern}");
                     break;
                 }
             }
         }
     }
     // try to get language
     $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
     foreach ($_lang_xpath as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             if (trim($elems) != '') {
                 $this->language = trim($elems);
                 $this->debug('Language matched: ' . $this->language);
                 break;
             }
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             foreach ($elems as $elem) {
                 if (!isset($elem->parentNode)) {
                     continue;
                 }
                 $this->language = trim($elem->textContent);
                 $this->debug('Language matched: ' . $this->language);
             }
             if ($this->language) {
                 break;
             }
         }
     }
     // try to get date
     foreach ($this->config->date as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->date = strtotime(trim($elems, "; \t\n\r\v"));
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->date = $elems->item(0)->textContent;
             $this->date = strtotime(trim($this->date, "; \t\n\r\v"));
             // remove date from document
             // $elems->item(0)->parentNode->removeChild($elems->item(0));
         }
         if (!$this->date) {
             $this->date = null;
         } else {
             $this->debug('Date matched: ' . date('Y-m-d H:i:s', $this->date));
             $this->debug("...XPath match: {$pattern}");
             break;
         }
     }
     // strip elements (using xpath expressions)
     foreach ($this->config->strip as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 if ($elems->item($i)->parentNode) {
                     $elems->item($i)->parentNode->removeChild($elems->item($i));
                 }
             }
         }
     }
     // strip elements (using id and class attribute values)
     foreach ($this->config->strip_id_or_class as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip images (using src attribute values)
     foreach ($this->config->strip_image_src as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' image elements');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements using Readability.com and Instapaper.com ignore class names
     // .entry-unrelated and .instapaper_ignore
     // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
     // and http://blog.instapaper.com/post/730281947
     $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // strip elements that contain style="display: none;"
     $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // try to get body
     foreach ($this->config->body as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Body matched');
             $this->debug("...XPath match: {$pattern}");
             if ($elems->length == 1) {
                 $this->body = $elems->item(0);
                 // prune (clean up elements that may not be content)
                 if ($this->config->prune()) {
                     $this->debug('...pruning content');
                     $this->readability->prepArticle($this->body);
                 }
                 break;
             } else {
                 $this->body = $this->readability->dom->createElement('div');
                 $this->debug($elems->length . ' body elems found');
                 foreach ($elems as $elem) {
                     if (!isset($elem->parentNode)) {
                         continue;
                     }
                     $isDescendant = false;
                     foreach ($this->body->childNodes as $parent) {
                         if ($this->isDescendant($parent, $elem)) {
                             $isDescendant = true;
                             break;
                         }
                     }
                     if ($isDescendant) {
                         $this->debug('...element is child of another body element, skipping.');
                     } else {
                         // prune (clean up elements that may not be content)
                         if ($this->config->prune()) {
                             $this->debug('Pruning content');
                             $this->readability->prepArticle($elem);
                         }
                         $this->debug('...element added to body');
                         $this->body->appendChild($elem);
                     }
                 }
                 if ($this->body->hasChildNodes()) {
                     break;
                 }
             }
         }
     }
     // auto detect?
     $detect_title = $detect_body = $detect_author = $detect_date = false;
     // detect title?
     if (!isset($this->title)) {
         if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
             $detect_title = true;
         }
     }
     // detect body?
     if (!isset($this->body)) {
         if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
             $detect_body = true;
         }
     }
     // detect author?
     if (empty($this->author)) {
         if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
             $detect_author = true;
         }
     }
     // detect date?
     if (!isset($this->date)) {
         if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
             $detect_date = true;
         }
     }
     // check for hNews
     if ($detect_title || $detect_body) {
         // check for hentry
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('hNews: found hentry');
             $hentry = $elems->item(0);
             if ($detect_title) {
                 // check for entry-title
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->title = $elems->item(0)->textContent;
                     $this->debug('hNews: found entry-title: ' . $this->title);
                     // remove title from document
                     $elems->item(0)->parentNode->removeChild($elems->item(0));
                     $detect_title = false;
                 }
             }
             if ($detect_date) {
                 // check for time element with pubdate attribute
                 $elems = @$xpath->query(".//time[@pubdate or @pubDate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->date = strtotime(trim($elems->item(0)->textContent));
                     // remove date from document
                     //$elems->item(0)->parentNode->removeChild($elems->item(0));
                     if ($this->date) {
                         $this->debug('hNews: found publication date: ' . date('Y-m-d H:i:s', $this->date));
                         $detect_date = false;
                     } else {
                         $this->date = null;
                     }
                 }
             }
             if ($detect_author) {
                 // check for time element with pubdate attribute
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $author = $elems->item(0);
                     $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
                     if ($fn && $fn->length > 0) {
                         foreach ($fn as $_fn) {
                             if (trim($_fn->textContent) != '') {
                                 $this->author[] = trim($_fn->textContent);
                                 $this->debug('hNews: found author: ' . trim($_fn->textContent));
                             }
                         }
                     } else {
                         if (trim($author->textContent) != '') {
                             $this->author[] = trim($author->textContent);
                             $this->debug('hNews: found author: ' . trim($author->textContent));
                         }
                     }
                     $detect_author = empty($this->author);
                 }
             }
             // check for entry-content.
             // according to hAtom spec, if there are multiple elements marked entry-content,
             // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
             if ($detect_body) {
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-content');
                     if ($elems->length == 1) {
                         // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                         $e = $elems->item(0);
                         if ($e->tagName == 'img' || trim($e->textContent) != '') {
                             $this->body = $elems->item(0);
                             // prune (clean up elements that may not be content)
                             if ($this->config->prune()) {
                                 $this->debug('Pruning content');
                                 $this->readability->prepArticle($this->body);
                             }
                             $detect_body = false;
                         } else {
                             $this->debug('hNews: skipping entry-content - appears not to contain content');
                         }
                         unset($e);
                     } else {
                         $this->body = $this->readability->dom->createElement('div');
                         $this->debug($elems->length . ' entry-content elems found');
                         foreach ($elems as $elem) {
                             if (!isset($elem->parentNode)) {
                                 continue;
                             }
                             $isDescendant = false;
                             foreach ($this->body->childNodes as $parent) {
                                 if ($this->isDescendant($parent, $elem)) {
                                     $isDescendant = true;
                                     break;
                                 }
                             }
                             if ($isDescendant) {
                                 $this->debug('Element is child of another body element, skipping.');
                             } else {
                                 // prune (clean up elements that may not be content)
                                 if ($this->config->prune()) {
                                     $this->debug('Pruning content');
                                     $this->readability->prepArticle($elem);
                                 }
                                 $this->debug('Element added to body');
                                 $this->body->appendChild($elem);
                             }
                         }
                         $detect_body = false;
                     }
                 }
             }
         }
     }
     // check for elements marked with instapaper_title
     if ($detect_title) {
         // check for instapaper_title
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->title = $elems->item(0)->textContent;
             $this->debug('Title found (.instapaper_title): ' . $this->title);
             // remove title from document
             $elems->item(0)->parentNode->removeChild($elems->item(0));
             $detect_title = false;
         }
     }
     // check for elements marked with instapaper_body
     if ($detect_body) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (.instapaper_body)');
             $this->body = $elems->item(0);
             // prune (clean up elements that may not be content)
             if ($this->config->prune()) {
                 $this->debug('Pruning content');
                 $this->readability->prepArticle($this->body);
             }
             $detect_body = false;
         }
     }
     // check for elements marked with itemprop="articleBody" (from Schema.org)
     if ($detect_body) {
         $elems = @$xpath->query("//*[@itemprop='articleBody']", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (Schema.org itemprop="articleBody")');
             if ($elems->length == 1) {
                 // what if it's empty? (content placed outside an empty itemprop='articleBody' element)
                 $e = $elems->item(0);
                 if ($e->tagName == 'img' || trim($e->textContent) != '') {
                     $this->body = $elems->item(0);
                     // prune (clean up elements that may not be content)
                     if ($this->config->prune()) {
                         $this->debug('Pruning content');
                         $this->readability->prepArticle($this->body);
                     }
                     $detect_body = false;
                 } else {
                     $this->debug('Schema.org: skipping itemprop="articleBody" - appears not to contain content');
                 }
                 unset($e);
             } else {
                 $this->body = $this->readability->dom->createElement('div');
                 $this->debug($elems->length . ' itemprop="articleBody" elems found');
                 foreach ($elems as $elem) {
                     if (!isset($elem->parentNode)) {
                         continue;
                     }
                     $isDescendant = false;
                     foreach ($this->body->childNodes as $parent) {
                         if ($this->isDescendant($parent, $elem)) {
                             $isDescendant = true;
                             break;
                         }
                     }
                     if ($isDescendant) {
                         $this->debug('Element is child of another body element, skipping.');
                     } else {
                         // prune (clean up elements that may not be content)
                         if ($this->config->prune()) {
                             $this->debug('Pruning content');
                             $this->readability->prepArticle($elem);
                         }
                         $this->debug('Element added to body');
                         $this->body->appendChild($elem);
                     }
                 }
                 $detect_body = false;
             }
         }
     }
     // Find author in rel="author" marked element
     // We only use this if there's exactly one.
     // If there's more than one, it could indicate more than
     // one author, but it could also indicate that we're processing
     // a page listing different articles with different authors.
     if ($detect_author) {
         $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $author = trim($elems->item(0)->textContent);
             if ($author != '') {
                 $this->debug("Author found (rel=\"author\"): {$author}");
                 $this->author[] = $author;
                 $detect_author = false;
             }
         }
     }
     // Find date in pubdate marked time element
     // For the same reason given above, we only use this
     // if there's exactly one element.
     if ($detect_date) {
         $elems = @$xpath->query("//time[@pubdate or @pubDate]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $this->date = strtotime(trim($elems->item(0)->textContent));
             // remove date from document
             //$elems->item(0)->parentNode->removeChild($elems->item(0));
             if ($this->date) {
                 $this->debug('Date found (pubdate marked time element): ' . date('Y-m-d H:i:s', $this->date));
                 $detect_date = false;
             } else {
                 $this->date = null;
             }
         }
     }
     // still missing title or body, so we detect using Readability
     if ($detect_title || $detect_body) {
         $this->debug('Using Readability');
         // clone body if we're only using Readability for title (otherwise it may interfere with body element)
         if (isset($this->body)) {
             $this->body = $this->body->cloneNode(true);
         }
         $success = $this->readability->init();
     }
     if ($detect_title) {
         $this->debug('Detecting title');
         $this->title = $this->readability->getTitle()->textContent;
     }
     if ($detect_body && $success) {
         $this->debug('Detecting body');
         $this->body = $this->readability->getContent();
         if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
             $this->body = $this->body->firstChild;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune()) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($this->body);
         }
     }
     if (isset($this->body)) {
         // remove scripts
         $this->readability->removeScripts($this->body);
         // remove any h1-h6 elements that appear as first thing in the body
         // and which match our title
         if (isset($this->title) && $this->title != '') {
             $firstChild = $this->body->firstChild;
             while ($firstChild->nodeType && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                 $firstChild = $firstChild->nextSibling;
             }
             if ($firstChild->nodeType === XML_ELEMENT_NODE && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))) {
                 $this->body->removeChild($firstChild);
             }
         }
         // prevent self-closing iframes
         $elems = $this->body->getElementsByTagName('iframe');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $e = $elems->item($i);
             if (!$e->hasChildNodes()) {
                 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
             }
         }
         // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
         // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
         // inside the data-lazy-src attribute. It also places the original image inside a noscript element
         // next to the amended one.
         $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $e = $elems->item($i);
             // let's see if we can grab image from noscript
             if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
                 $_new_elem = $e->ownerDocument->createDocumentFragment();
                 @$_new_elem->appendXML($e->nextSibling->innerHTML);
                 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
                 $e->parentNode->removeChild($e);
             } else {
                 // Use data-lazy-src as src value
                 $e->setAttribute('src', $e->getAttribute('data-lazy-src'));
                 $e->removeAttribute('data-lazy-src');
             }
         }
         $this->success = true;
     }
     // if we've had no success and we've used tidy, there's a chance
     // that tidy has messed up. So let's try again without tidy...
     if (!$this->success && $tidied && $smart_tidy) {
         $this->debug('Trying again without tidy');
         $this->process($original_html, $url, false);
     }
     return $this->success;
 }
Exemplo n.º 14
0
 /**
  * Create instance of Readability.
  *
  * Normalises the raw HTML (pre-filter regexps, optional tidy pass,
  * conversion to HTML entities) and parses it into $this->dom.
  *
  * @param string $html UTF-8 encoded string
  * @param string $url (optional) URL associated with HTML (for footnotes)
  * @param string $parser (optional) Which parser to use for turning raw HTML into a DOMDocument
  * @param boolean $use_tidy (optional) Use tidy
  */
 function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
 {
     $this->url = $url;
     $this->debugText = 'Parsing URL: ' . $url . "\n";
     if ($url) {
         // parse_url() gives false/null for malformed or host-less URLs; fall back
         // to an empty host rather than indexing into a non-array result.
         $host = parse_url($url, PHP_URL_HOST);
         if (!is_string($host)) {
             $host = '';
         }
         // strip a leading "www."/"www2." and escape dots so the host can be used as a pattern
         $this->domainRegExp = '/' . strtr(preg_replace('/www\\d*\\./', '', $host), array('.' => '\\.')) . '/';
     }
     mb_internal_encoding("UTF-8");
     mb_http_output("UTF-8");
     mb_regex_encoding("UTF-8");
     $this->imageCache = new ImageCaching();
     // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
     if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
         try {
             foreach ($this->pre_filters as $search => $replace) {
                 // preg_replace() signals failure by returning null (it does not throw),
                 // so keep the previous HTML instead of wiping the whole document.
                 $filtered = preg_replace($search, $replace, $html);
                 if ($filtered === null) {
                     $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $search . ' (PCRE error ' . preg_last_error() . ")\n";
                     continue;
                 }
                 $html = $filtered;
             }
             unset($search, $replace, $filtered);
         } catch (Exception $e) {
             // only reachable when an error handler turns PCRE warnings into exceptions
             $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $e->getMessage();
         }
     }
     if (trim($html) === '') {
         $html = '<html></html>';
     }
     /**
      * Use tidy (if it exists).
      * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
      * Although sometimes it makes matters worse, which is why there is an option to disable it.
      *
      **/
     if ($use_tidy && function_exists('tidy_parse_string')) {
         $this->debugText .= 'Tidying document' . "\n";
         $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
         // tidy_parse_string() may return false; only repair a real tidy document
         if ($tidy && tidy_clean_repair($tidy)) {
             $this->tidied = true;
             $html = $tidy->value;
             // drop attributes from the <html> tag and collapse runs of line breaks
             $html = preg_replace('/<html[^>]+>/i', '<html>', $html);
             $html = preg_replace('/[\\r\\n]+/is', "\n", $html);
         }
         unset($tidy);
     }
     // NOTE(review): the 'HTML-ENTITIES' mbstring target is deprecated as of PHP 8.2 - confirm the supported PHP range
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     if ($parser == 'html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
         // all good
     } else {
         // libxml fallback: also used when html5lib was requested but returned a falsy result
         libxml_use_internal_errors(true);
         $this->dom = new DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         @$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
     }
     $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 }
Exemplo n.º 15
0
/**
 * Development output handler: runs $init, appends the flow viewport and the
 * final notices before the closing body/html tags, and sends the page with
 * no-cache headers and a Content-Length header.
 *
 * @param object $init object exposing process() and run(); run() must return the page markup as a string
 * @return void
 */
function nexista_devBuffer($init)
{
    $init->process();
    // Two nested buffers: the inner one is flushed into the outer one so that
    // ob_get_length() below reports the size of the complete page.
    ob_start();
    ob_start();
    header('Cache-Control: no-cache, must-revalidate');
    header('Last-Modified: ' . gmdate("D, d M Y H:i:s") . ' GMT');
    nexista_development_console();
    $output = $init->run();
    if (isset($_GET['view_flow']) && $_GET['view_flow'] == 'true') {
        nexista_view_flow();
    }
    // ?client_view_flow=true|false toggles the flag stored in the session;
    // any other value (or none) leaves the stored flag untouched.
    $requested = isset($_GET['client_view_flow']) ? $_GET['client_view_flow'] : null;
    if ($requested === 'true' || $requested === 'false') {
        $_SESSION['client_view_flow'] = $requested;
    }
    // default to an empty viewport so the append below never reads an undefined variable
    $flow_viewport = '';
    if (isset($_SESSION['client_view_flow']) && $_SESSION['client_view_flow'] == 'true') {
        $flow_viewport = nexista_view_flow();
    }
    // NOTE(review): $cache_type was never defined in this function, so null has always
    // been passed here - confirm what nexista_final_notices() expects.
    $cache_type = null;
    // re-open the document so the extra markup ends up inside <body>
    $output = str_replace('</body>', '', $output);
    $output = str_replace('</html>', '', $output);
    $output .= $flow_viewport;
    $output .= nexista_final_notices($cache_type, 'dev');
    $output .= '</body></html>';
    // developer switch: set to 'xhtml' or 'html' to run the page through tidy
    $tidy = false;
    if ($tidy == 'xhtml') {
        $options = array('output-xhtml' => true, 'indent' => true, 'input-encoding' => 'utf8', 'output-encoding' => 'utf8', 'clean' => true);
        $output = tidy_parse_string($output, $options);
        tidy_clean_repair($output);
    }
    if ($tidy == 'html') {
        $options = array('output-html' => true, 'indent' => true, 'input-encoding' => 'utf8', 'output-encoding' => 'utf8', 'clean' => true);
        $output = tidy_parse_string($output, $options);
        tidy_clean_repair($output);
    }
    echo $output;
    ob_end_flush();
    header('Content-Length: ' . ob_get_length());
    ob_end_flush();
}
Exemplo n.º 16
0
/**
 * nv_valid_html()
 *
 * Parses and repairs $html with tidy when the global
 * $sys_info['supports_tidy'] is "class" (tidy class API) or "func"
 * (procedural API). For any other value, or when the key is missing,
 * $html is returned unchanged.
 *
 * @param string $html
 * @param mixed $config handed to tidy as its configuration
 * @param string $encoding
 * @return tidy|string repaired tidy document, or the original $html when tidy is not used
 */
function nv_valid_html($html, $config, $encoding = 'utf8')
{
    global $sys_info;
    // a missing key behaves like "tidy not supported" instead of raising a notice
    $mode = isset($sys_info['supports_tidy']) ? $sys_info['supports_tidy'] : null;
    if ($mode == "class") {
        $tidy = new tidy();
        $tidy->parseString($html, $config, $encoding);
        $tidy->cleanRepair();
        return $tidy;
    }
    if ($mode == "func") {
        $tidy = tidy_parse_string($html, $config, $encoding);
        // the parsed document must be passed in; the call was previously made without it
        tidy_clean_repair($tidy);
        return $tidy;
    }
    return $html;
}
Exemplo n.º 17
0
 /**
  * Show a problem statement.
  *
  * Loads the problem's index.html into the view. When tidy is available and
  * the request does not carry tidy=false, the markup is run through tidy
  * twice, with fixImages() in between. With plain=true the content is sent
  * as the response body with layout and view rendering switched off.
  */
 function viewAction()
 {
     if (!$this->validateProblemAccess()) {
         return;
     }
     // value is not used below; the read is kept as in the original
     $prob = $this->view->prob;
     $statementFile = get_file_name("data/problems/" . $this->_request->get("probid") . "/index.html");
     $this->view->content_html = file_get_contents($statementFile);
     $tidyAvailable = function_exists("tidy_parse_string");
     if ($tidyAvailable && $this->_request->get("tidy") != "false") {
         // pass 1: whole document, tidied to XHTML strict
         $fullDocumentOptions = array("output-xhtml" => true, "add-xml-decl" => true, "bare" => true, "clean" => true, "quote-ampersand" => true, "doctype" => "strict");
         $firstPass = tidy_parse_string($this->view->content_html, $fullDocumentOptions);
         tidy_clean_repair($firstPass);
         $this->view->content_html = tidy_get_output($firstPass);
         $this->fixImages();
         // pass 2: tidy again after fixImages(), this time keeping only the body (slow but simple)
         $bodyOnlyOptions = array("output-xhtml" => true, "doctype" => "strict", "show-body-only" => true);
         $secondPass = tidy_parse_string($this->view->content_html, $bodyOnlyOptions);
         tidy_clean_repair($secondPass);
         $this->view->content_html = tidy_get_output($secondPass);
     }
     $wantsPlain = $this->_request->get("plain") == "true";
     if (!$wantsPlain) {
         return;
     }
     // plain mode: bypass layout and view script, emit the content directly
     $this->_helper->layout->disableLayout();
     $this->_helper->viewRenderer->setNoRender();
     $this->getResponse()->setBody($this->view->content_html);
 }
Exemplo n.º 18
0
 public function process($html, $url, $smart_tidy = true)
 {
     $this->reset();
     // extract host name
     $host = @parse_url($url, PHP_URL_HOST);
     if (!($this->config = SiteConfig::build($host))) {
         // no match, check HTML for fingerprints
         if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
             $this->config = SiteConfig::build($_fphost);
         }
         unset($_fphost);
         if (!$this->config) {
             // no match, so use defaults
             $this->config = new SiteConfig();
         }
     }
     //echo count($this->config->body);
     // store copy of config in our static cache array in case we need to process another URL
     SiteConfig::add_to_cache($host, $this->config);
     // do string replacements
     foreach ($this->config->replace_string as $_repl) {
         $html = str_replace($_repl[0], $_repl[1], $html);
     }
     unset($_repl);
     // use tidy (if it exists)?
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it
     // makes matters worse, which is why you can override it in site config files.)
     $tidied = false;
     if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
         $this->debug('Using Tidy');
         $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
         if (tidy_clean_repair($tidy)) {
             $original_html = $html;
             $tidied = true;
             $html = $tidy->value;
         }
         unset($tidy);
     }
     // load and parse html
     $this->readability = new Readability($html, $url);
     // we use xpath to find elements in the given HTML document
     // see http://en.wikipedia.org/wiki/XPath_1.0
     $xpath = new DOMXPath($this->readability->dom);
     // try to get title
     foreach ($this->config->title as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Title expression evaluated as string');
             $this->title = trim($elems);
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Title matched');
             $this->title = $elems->item(0)->textContent;
             // remove title from document
             try {
                 $elems->item(0)->parentNode->removeChild($elems->item(0));
             } catch (DOMException $e) {
                 // do nothing
             }
             break;
         }
     }
     // try to get author (if it hasn't already been set)
     if (empty($this->author)) {
         foreach ($this->config->author as $pattern) {
             $elems = @$xpath->evaluate($pattern, $this->readability->dom);
             if (is_string($elems)) {
                 $this->debug('Author expression evaluated as string');
                 if (trim($elems) != '') {
                     $this->author[] = trim($elems);
                     break;
                 }
             } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                 foreach ($elems as $elem) {
                     if (!isset($elem->parentNode)) {
                         continue;
                     }
                     $this->author[] = trim($elem->textContent);
                 }
                 if (!empty($this->author)) {
                     break;
                 }
             }
         }
     }
     // try to get language
     $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
     foreach ($_lang_xpath as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             if (trim($elems) != '') {
                 $this->language = trim($elems);
                 break;
             }
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             foreach ($elems as $elem) {
                 if (!isset($elem->parentNode)) {
                     continue;
                 }
                 $this->language = trim($elem->textContent);
             }
             if ($this->language) {
                 break;
             }
         }
     }
     // try to get date
     foreach ($this->config->date as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Date expression evaluated as string');
             $this->date = strtotime(trim($elems, "; \t\n\r\v"));
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Date matched');
             $this->date = $elems->item(0)->textContent;
             $this->date = strtotime(trim($this->date, "; \t\n\r\v"));
             // remove date from document
             // $elems->item(0)->parentNode->removeChild($elems->item(0));
         }
         if (!$this->date) {
             $this->date = null;
         } else {
             break;
         }
     }
     // strip elements (using xpath expressions)
     foreach ($this->config->strip as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements (using id and class attribute values)
     foreach ($this->config->strip_id_or_class as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip images (using src attribute values)
     foreach ($this->config->strip_image_src as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' image elements');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements using Readability.com and Instapaper.com ignore class names
     // .entry-unrelated and .instapaper_ignore
     // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
     // and http://blog.instapaper.com/post/730281947
     $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // strip elements that contain style="display: none;"
     $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // try to get body
     foreach ($this->config->body as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         //echo "elems->length: [" . $pattern. "]\n\n";
         //echo "elems->length: [" . $this->readability->dom->innerHTML. "]\n\n";
         if ($elems && $elems->length > 0) {
             //echo "elems->length matched: [" . $pattern. "]\n\n";
             //print_r($next_page_pattern);
             $this->body = $this->getMatchedBody($elems);
             $next_page_content = $this->retrieveNextPage($xpath, $url);
             //if ($next_page_content !== FALSE) {
             //    $body->appendChild($next_page_content);
             //$next_page_content = $this->retrieveNextPage($xpath, $body, $url);
             //}
             //$this->body = $body;
             if ($elems->length === 1) {
                 break;
             }
         }
     }
     //echo "auto detect之前: [" . $this->body->innerHTML . "]\n\n";
     // auto detect?
     $detect_title = $detect_body = $detect_author = $detect_date = false;
     // detect title?
     if (!isset($this->title)) {
         if (empty($this->config->title) || $this->config->autodetect_on_failure) {
             $detect_title = true;
         }
     }
     // detect body?
     if (!isset($this->body)) {
         if (empty($this->config->body) || $this->config->autodetect_on_failure) {
             $detect_body = true;
         }
     }
     // detect author?
     if (empty($this->author)) {
         if (empty($this->config->author) || $this->config->autodetect_on_failure) {
             $detect_author = true;
         }
     }
     // detect date?
     if (!isset($this->date)) {
         if (empty($this->config->date) || $this->config->autodetect_on_failure) {
             $detect_date = true;
         }
     }
     // check for hNews
     if ($detect_title || $detect_body) {
         // check for hentry
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('hNews: found hentry');
             $hentry = $elems->item(0);
             if ($detect_title) {
                 // check for entry-title
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-title');
                     $this->title = $elems->item(0)->textContent;
                     // remove title from document
                     $elems->item(0)->parentNode->removeChild($elems->item(0));
                     $detect_title = false;
                 }
             }
             if ($detect_date) {
                 // check for time element with pubdate attribute
                 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found publication date');
                     $this->date = strtotime(trim($elems->item(0)->textContent));
                     // remove date from document
                     //$elems->item(0)->parentNode->removeChild($elems->item(0));
                     if ($this->date) {
                         $detect_date = false;
                     } else {
                         $this->date = null;
                     }
                 }
             }
             if ($detect_author) {
                 // check for time element with pubdate attribute
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found author');
                     $author = $elems->item(0);
                     $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
                     if ($fn && $fn->length > 0) {
                         foreach ($fn as $_fn) {
                             if (trim($_fn->textContent) != '') {
                                 $this->author[] = trim($_fn->textContent);
                             }
                         }
                     } else {
                         if (trim($author->textContent) != '') {
                             $this->author[] = trim($author->textContent);
                         }
                     }
                     $detect_author = empty($this->author);
                 }
             }
             // check for entry-content.
             // according to hAtom spec, if there are multiple elements marked entry-content,
             // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
             if ($detect_body) {
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-content');
                     if ($elems->length == 1) {
                         // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                         $e = $elems->item(0);
                         if ($e->tagName == 'img' || trim($e->textContent) != '') {
                             $this->body = $elems->item(0);
                             //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                             // prune (clean up elements that may not be content)
                             if ($this->config->prune) {
                                 $this->debug('Pruning content');
                                 $this->readability->prepArticle($this->body);
                             }
                             $detect_body = false;
                         } else {
                             $this->debug('hNews: skipping entry-content - appears not to contain content');
                         }
                         unset($e);
                     } else {
                         $this->body = $this->readability->dom->createElement('div');
                         //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                         $this->debug($elems->length . ' entry-content elems found');
                         foreach ($elems as $elem) {
                             if (!isset($elem->parentNode)) {
                                 continue;
                             }
                             $isDescendant = false;
                             foreach ($this->body->childNodes as $parent) {
                                 if ($this->isDescendant($parent, $elem)) {
                                     $isDescendant = true;
                                     break;
                                 }
                             }
                             if ($isDescendant) {
                                 $this->debug('Element is child of another body element, skipping.');
                             } else {
                                 // prune (clean up elements that may not be content)
                                 if ($this->config->prune) {
                                     $this->debug('Pruning content');
                                     $this->readability->prepArticle($elem);
                                 }
                                 $this->debug('Element added to body');
                                 $this->body->appendChild($elem);
                             }
                         }
                         echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                         $detect_body = false;
                     }
                 }
             }
         }
     }
     //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
     // check for elements marked with instapaper_title
     if ($detect_title) {
         // check for instapaper_title
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('title found (.instapaper_title)');
             $this->title = $elems->item(0)->textContent;
             // remove title from document
             $elems->item(0)->parentNode->removeChild($elems->item(0));
             $detect_title = false;
         }
     }
     // check for elements marked with instapaper_body
     if ($detect_body) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (.instapaper_body)');
             $this->body = $elems->item(0);
             // prune (clean up elements that may not be content)
             if ($this->config->prune) {
                 $this->debug('Pruning content');
                 $this->readability->prepArticle($this->body);
             }
             $detect_body = false;
         }
     }
     //echo "after detect_body: [" . $this->body->innerHTML . "]\n\n";
     // Find author in rel="author" marked element
     // We only use this if there's exactly one.
     // If there's more than one, it could indicate more than
     // one author, but it could also indicate that we're processing
     // a page listing different articles with different authors.
     if ($detect_author) {
         $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $this->debug('Author found (rel="author")');
             $author = trim($elems->item(0)->textContent);
             if ($author != '') {
                 $this->author[] = $author;
                 $detect_author = false;
             }
         }
     }
     // Find date in pubdate marked time element
     // For the same reason given above, we only use this
     // if there's exactly one element.
     if ($detect_date) {
         $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $this->debug('Date found (pubdate marked time element)');
             $this->date = strtotime(trim($elems->item(0)->textContent));
             // remove date from document
             //$elems->item(0)->parentNode->removeChild($elems->item(0));
             if ($this->date) {
                 $detect_date = false;
             } else {
                 $this->date = null;
             }
         }
     }
     // still missing title or body, so we detect using Readability
     if ($detect_title || $detect_body) {
         $this->debug('Using Readability');
         // clone body if we're only using Readability for title (otherwise it may interfere with body element)
         if (isset($this->body)) {
             $this->body = $this->body->cloneNode(true);
         }
         $success = $this->readability->init();
     }
     if ($detect_title) {
         $this->debug('Detecting title');
         $this->title = $this->readability->getTitle()->textContent;
     }
     //echo "before detect body success [" . $this->body->innerHTML . "]\n\n";
     if ($detect_body && $success) {
         $this->debug('Detecting body');
         $this->body = $this->readability->getContent();
         //echo "getContent() : [" . $this->body->innerHTML . "] \n\n" ;
         if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
             $this->body = $this->body->firstChild;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($this->body);
         }
     }
     //echo "如果沒有Body [" . $this->body->innerHTML . "]\n\n";
     if (isset($this->body)) {
         // remove scripts
         $this->readability->removeScripts($this->body);
         // remove any h1-h6 elements that appear as first thing in the body
         // and which match our title
         if (isset($this->title) && $this->title != '') {
             $firstChild = $this->body->firstChild;
             while ($firstChild->nodeType && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                 $firstChild = $firstChild->nextSibling;
             }
             if ($firstChild->nodeType === XML_ELEMENT_NODE && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))) {
                 $this->body->removeChild($firstChild);
             }
         }
         $this->success = true;
     }
     //echo "下一頁之前: [" . $this->body->innerHTML . "]\n\n";
     // 20131011 要實作下一頁的偵測!!!
     if (isset($this->body)) {
         //$elems = @$xpath->query("//a[starts-with(@href, /?p=) and &page=2']", $this->readability->dom);
         $next_page_pattern = $this->options->next_page_pattern;
         //echo $next_page_pattern;
         //$next_page_pattern = "//a[contains(@href, '&page=')]";
         //$next_page_pattern = "//a";
         $elems = @$xpath->query($next_page_pattern, $this->readability->dom);
         ////echo $elems->length;
         //$link = @$xpath->query("//a[contains(@href, '&page=')]/@href", $this->readability->dom);
         //if ($link, $) {
         if ($elems && $elems->length > 0) {
             try {
                 @($elem = $this->readability->dom->createElement('div', $elems->item(0)->getAttribute("href")));
             } catch (Exception $e) {
             }
             $elem = $this->readability->dom->createElement('div', "aaa");
             $attributes = $elems->item($elems->legnth)->attributes;
             $href = $attributes->getNamedItem("href")->value;
             if (substr($href, 0, 4) !== "http") {
                 //echo $href;
                 $url_component = parse_url($url);
                 //$href = urlencode($href);
                 //$elem = $this->readability->dom->createElement('div', $href);
                 //$this->body = $elem;
                 $permalink = $url_component["scheme"] . "://" . $url_component["host"] . $href;
             } else {
                 $permalink = $href;
             }
             //$permalink = $this->getNextPagePermalink($elems);
             //echo $permalink;
             //echo "[[[[".$permalink."]]]]";
             //$permalink = "http://blog.soft.idv.tw/?p=1606&page=2";
             $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
             $extractor->fingerprints = $options->fingerprints;
             $elem = new ContentExtractor($this->path, $this->fallback);
             $extractor->fingerprints = $this->fingerprints;
             $http = new HumbleHttpAgent();
             $response = $http->get($permalink, true);
             //echo 'status_code: '. $response['status_code'] . "\n\n";
             if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
                 $html = $response['body'];
                 //echo "html: " .$html;
                 // remove strange things
                 $html = str_replace('</[>', '', $html);
                 $html = convert_to_utf8($html, $response['headers']);
                 $extract_result = $extractor->process($html, $permalink);
                 //$readability = $extractor->readability;
                 $content_block = $extract_result ? $extractor->getContent() : null;
                 //echo "content_block->innerHTML: ". $content_block->innerHTML . "\n\n";
                 //$this->body->appendChild($elem);
             }
             //$doc = $this->readability->dom->("<node>".$content_block->C14N()."</node>");
             //$content = $content_block->
             //$content = $this->readability->dom->createElement('div', $content_block->innerHTML);
             $doc = new DOMDocument();
             if (@$doc->loadHTML($content_block->innerHTML)) {
                 $doc->saveHTML();
                 //$content = $this->readability->dom->loadHTML($content_block->innerHTML);
                 $content = $this->readability->dom->createElement('div', $content_block->innerHTML);
                 $content = $this->readability->dom->importNode($content_block, true);
                 $this->body->appendChild($content);
                 //$this->body->appendChild($doc);
                 //$xpath = new DOMXPath($this->readability->dom);
                 //$elems = @$xpath->query($extract_pattern, $content_block);
                 //$this->body->appendChild($content_block);
                 //$this->body = $content_block;
             }
         }
     }
     // if we've had no success and we've used tidy, there's a chance
     // that tidy has messed up. So let's try again without tidy...
     if (!$this->success && $tidied && $smart_tidy) {
         $this->debug('Trying again without tidy');
         $this->process($original_html, $url, false);
     }
     return $this->success;
 }
Exemplo n.º 19
0
 /**
  * Repair $text in place with HTML Tidy.
  *
  * Unless $ignore_config is true, nothing is done (and true is returned)
  * when $config['HTML_Tidy'] is empty or set to 'off'.
  *
  * @param string $text The html content to be checked. Passed by reference
  *                     and overwritten with the repaired markup on success.
  * @param bool $ignore_config When true, skip the $config['HTML_Tidy'] check.
  * @return bool False if the tidy functions are unavailable or tidy reports
  *              status 2; true in every other case.
  */
 public static function tidyFix(&$text, $ignore_config = false)
 {
     global $config;
     if (!$ignore_config && (empty($config['HTML_Tidy']) || $config['HTML_Tidy'] == 'off')) {
         return true;
     }
     if (!function_exists('tidy_parse_string')) {
         return false;
     }
     $tidy_options = array(
         // no line wrapping, so whitespace changes as little as possible
         'wrap' => 0,
         // one of: omit, auto, strict, transitional, user
         'doctype' => 'omit',
         // drop empty paragraphs
         'drop-empty-paras' => true,
         // needed so that <br> becomes <br/> etc.
         'output-xhtml' => true,
         'show-body-only' => true,
         'hide-comments' => false,
     );
     $document = tidy_parse_string($text, $tidy_options, 'utf8');
     tidy_clean_repair($document);
     // tidy_get_status() returns 2 when errors were raised
     // http://www.php.net/manual/en/function.tidy-get-status.php
     if (tidy_get_status($document) === 2) {
         return false;
     }
     $text = tidy_get_output($document);
     return true;
 }
Exemplo n.º 20
0
<?php

/* Parse an HTML string with tidy, repair it and print the escaped result. */
$document = tidy_parse_string("<HTML></HTML>");
// repair the parsed markup in place
tidy_clean_repair($document);
// escape the repaired output and turn its newlines into <br /> for display
echo nl2br(htmlspecialchars(tidy_get_output($document)));
Exemplo n.º 21
0
/**
 * Scrape the articles of a WordPress.com blog, following the
 * "older entries" link while the oldest item collected so far is not
 * older than $last_fetched_item_date.
 *
 * Each collected item is a list:
 *   [0] title HTML, [1] raw entry-content HTML, [2] parsed description,
 *   [3] date as 'Y-m-d H:i:s' (Europe/Berlin), [4] link, [5] image.
 *
 * Articles without a title link, or whose <time datetime="..."> is missing
 * or not in the expected format, are skipped instead of aborting the run.
 *
 * @param mixed  $fansub_id              Passed through to fetch_and_parse_image().
 * @param string $url                    URL of the first page to fetch.
 * @param string $last_fetched_item_date Compared against item dates ('Y-m-d H:i:s').
 * @return array array('ok', $items) on success, array('error_connect', array())
 *               when a page cannot be downloaded or parsed.
 */
function fetch_via_wordpress_ynf($fansub_id, $url, $last_fetched_item_date)
{
    $elements = array();
    $tidy_config = "tidy.conf";
    $page_url = $url;
    do {
        // Not sleeping between pages, Wordpress.com does not appear to be rate-limited
        $html_text = file_get_contents($page_url);
        if (!$html_text) {
            return array('error_connect', array());
        }
        $tidy = tidy_parse_string($html_text, $tidy_config, 'UTF8');
        if (!$tidy) {
            return array('error_connect', array());
        }
        tidy_clean_repair($tidy);
        $html = str_get_html(tidy_get_output($tidy));
        if (!$html) {
            // str_get_html() gave us nothing to walk; treat it like a failed fetch
            return array('error_connect', array());
        }
        //parse through the HTML and build up the elements feed as we go along
        foreach ($html->find('article') as $article) {
            $title = $article->find('h1.entry-title a', 0);
            if ($title === NULL) {
                continue;
            }
            //The format is: 2013-09-02T14:43:43+00:00
            $time = $article->find('time', 0);
            $date = $time !== NULL ? date_create_from_format('Y-m-d\\TH:i:sP', (string) $time->datetime) : FALSE;
            if ($date === FALSE) {
                // no usable publication date: skip rather than crash on $date->...
                continue;
            }
            $date->setTimeZone(new DateTimeZone('Europe/Berlin'));
            $content = $article->find('div.entry-content', 0);
            $content_html = $content !== NULL ? $content->innertext : NULL;
            $description = str_replace("text-align:center;", "", (string) $content_html);
            $item = array();
            $item[0] = $title->innertext;
            $item[1] = $content_html;
            $item[2] = parse_description($description);
            $item[3] = $date->format('Y-m-d H:i:s');
            $item[4] = $title->href;
            $item[5] = fetch_and_parse_image($fansub_id, $url, $description);
            $elements[] = $item;
        }
        $go_on = FALSE;
        if (count($elements) > 0 && $elements[count($elements) - 1][3] >= $last_fetched_item_date) {
            // the last item is still recent enough: look for the "older entries" link
            foreach ($html->find('text') as $text) {
                if ($text->plaintext == ' Entrades més antigues') {
                    $page_url = $text->parent->href;
                    $go_on = TRUE;
                    break;
                }
            }
        }
    } while ($go_on);
    return array('ok', $elements);
}
Exemplo n.º 22
0
 /**
  * Turn $this->html into a DOM tree stored in $this->dom.
  *
  * In order: keeps a copy of the incoming markup, builds the domain regexp
  * from the URL host, switches mbstring to UTF-8, applies the pre filters,
  * optionally runs Tidy, converts the markup to HTML entities and parses it
  * (html5lib when selected and successful, DOMDocument otherwise).
  *
  * @todo This should be called in init() instead of from __construct
  */
 private function loadHtml()
 {
     $this->original_html = $this->html;
     $this->logger->debug('Parsing URL: ' . $this->url);
     if ($this->url) {
         // host without a leading "www." / "www2." etc., with dots escaped for use in a regexp
         $host = parse_url($this->url, PHP_URL_HOST);
         $bareHost = preg_replace('/www\\d*\\./', '', $host);
         $this->domainRegExp = '/' . strtr($bareHost, array('.' => '\\.')) . '/';
     }
     mb_internal_encoding('UTF-8');
     mb_http_output('UTF-8');
     mb_regex_encoding('UTF-8');
     // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
     $prefilterEnabled = !$this->flagIsActive(self::FLAG_DISABLE_PREFILTER);
     if ($prefilterEnabled) {
         foreach ($this->pre_filters as $pattern => $replacement) {
             $this->html = preg_replace($pattern, $replacement, $this->html);
         }
         unset($pattern, $replacement);
     }
     // never hand an empty document to the parsers
     if (trim($this->html) === '') {
         $this->html = '<html></html>';
     }
     /*
      * Use tidy (if it exists).
      * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
      * Although sometimes it makes matters worse, which is why there is an option to disable it.
      */
     if ($this->useTidy) {
         $this->logger->debug('Tidying document');
         $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
         if (tidy_clean_repair($tidy)) {
             $this->tidied = true;
             // collapse runs of CR/LF in the tidied markup into a single newline
             $this->html = preg_replace('/[\\r\\n]+/is', "\n", $tidy->value);
         }
         unset($tidy);
     }
     $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
     $parsedByHtml5lib = false;
     if ($this->parser === 'html5lib') {
         $this->dom = \HTML5_Parser::parse($this->html);
         $parsedByHtml5lib = (bool) $this->dom;
     }
     if (!$parsedByHtml5lib) {
         // fall back to DOMDocument, keeping libxml errors internal while loading
         libxml_use_internal_errors(true);
         $this->dom = new \DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         if (PHP_VERSION_ID >= 50400) {
             $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
         } else {
             $this->dom->loadHTML($this->html);
         }
         libxml_use_internal_errors(false);
     }
     $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
 }
Exemplo n.º 23
0
    // Feed untidy source into the stdin
    fwrite($pipes[0], $source);
    fclose($pipes[0]);
    // Read clean source out to the browser
    while (!feof($pipes[1])) {
        //echo fgets($pipes[1], 1024);
        $newsrc .= fgets($pipes[1], 1024);
    }
    fclose($pipes[1]);
    // Clean up after ourselves
    proc_close($process);
} else {
    /* Use tidy if it's available from PECL */
    if (function_exists('tidy_parse_string')) {
        $tempsrc = tidy_parse_string($source);
        tidy_clean_repair();
        $newsrc = tidy_get_output();
    } else {
        // Better give them back what they came with, so they don't lose it all...
        $newsrc = "<body>\n" . $source . "\n</body>";
    }
}
// Split the cleaned source into an array of lines, dropping empty ones
$srcLines = preg_split("/\n/", $newsrc, -1, PREG_SPLIT_NO_EMPTY);
// Locate the lines between the body tags.
// Each loop has an empty body: the work happens in the condition. Because of
// the post-increment, $startLn ends up one past the first line containing
// '<body' (or stops once it reaches the number of lines).
$startLn = 0;
while (strpos($srcLines[$startLn++], '<body') === false && $startLn < sizeof($srcLines)) {
}
// Same scan for '</body', starting where the previous one stopped, so $endLn
// ends up one past the closing line (or stops at the number of lines).
$endLn = $startLn;
while (strpos($srcLines[$endLn++], '</body') === false && $endLn < sizeof($srcLines)) {
}
/**
 * Fetch hCard for the specified URL.
 *
 * The page is cleaned up with the local tidy extension when it is available,
 * otherwise through the W3C online tidy service, and then handed to hKit.
 * The preferred card reported by hKit wins; failing that, the first card
 * whose uid or url matches $url; failing that, the first card found.
 *
 * @param string $url URL to get hCard from
 * @return array array containing the hCard object (key: 'hcard', null when no
 *               card was found) as well as the raw XML (key: 'xml')
 * @access private
 */
function ext_profile_hcard_from_url($url)
{
    if (function_exists('tidy_parse_string')) {
        $page = wp_remote_fopen($url);
        if (is_string($page) && $page !== '') {
            // tidy_clean_repair() works on a tidy document, not on a string,
            // so parse first and read the repaired markup back afterwards.
            // NOTE(review): 'output-xhtml' mirrors forceXML=on of the fallback
            // service and 'utf8' is assumed for the fetched page - confirm.
            $tidy = tidy_parse_string($page, array('output-xhtml' => true), 'utf8');
            if ($tidy) {
                tidy_clean_repair($tidy);
                $page = tidy_get_output($tidy);
            }
        }
    } else {
        $page = wp_remote_fopen('http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr=' . urlencode($url));
    }
    $page = str_replace('&nbsp;', '&#160;', $page);
    // parse hCard
    $hkit = extended_profile_hkit();
    @($hcard = $hkit->getByString('hcard', $page));
    $preferred_hcard = null;
    if (!empty($hcard['preferred'])) {
        // use preferred card if available, as specified by hKit
        $preferred_hcard = $hcard['preferred'][0];
    } elseif (!empty($hcard['all'])) {
        foreach ($hcard['all'] as $card) {
            if (isset($card['uid']) && $card['uid'] == $url) {
                $preferred_hcard = $card;
                break;
            }
            if (!isset($card['url'])) {
                continue;
            }
            // 'url' may hold a single address or a list of them
            $matches_url = is_array($card['url']) ? in_array($url, $card['url']) : $card['url'] == $url;
            if ($matches_url) {
                $preferred_hcard = $card;
                break;
            }
        }
        if (!$preferred_hcard) {
            $preferred_hcard = $hcard['all'][0];
        }
    }
    return array('hcard' => $preferred_hcard, 'xml' => isset($hcard['xml']) ? $hcard['xml'] : null);
}
Exemplo n.º 25
0
    //$html = convert_to_utf8($html, $response['headers']);
    //$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
} else {
    die('Failed to fetch URL');
}
// Nothing to work with if the fetched document is blank
if (trim($html) == '') {
    die('Empty response :(');
}
// Run the document through Tidy only when the request asks for it with ?tidy=1
if (isset($_GET['tidy']) && $_GET['tidy'] === '1') {
    if (!function_exists('tidy_parse_string')) {
        die('Tidy requested but not available on server.');
    }
    $tidy_config = array('clean' => true, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 'new-inline-tags' => 'mark, time, meter, progress, data', 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, 'enclose-text' => true, 'enclose-block-text' => true, 'merge-divs' => true, 'merge-spans' => true, 'char-encoding' => 'utf8', 'hide-comments' => true);
    $tidy = tidy_parse_string($html, $tidy_config, 'UTF8');
    // Only swap in the tidied markup when the repair succeeded;
    // the untouched markup is kept in $original_html.
    if (tidy_clean_repair($tidy)) {
        $original_html = $html;
        $html = $tidy->value;
    }
}
//TODO: use HTML5 parser?
//TODO: escape $url for inserting in JS variable
// Script tags built from $base (set outside this fragment); $url is URL-encoded
// into the init.js.php query string.
$js_inject = '
<!--ff-script-->
<script src="' . $base . '/js/jquery-latest.min.js"></script>
<!--script src="' . $base . '/js/toolbox.expose.js"></script-->
<script src="' . $base . '/js/css2xpath.js"></script>
<script src="' . $base . '/js/jquery.dom-outline-1.0.js"></script>
<script src="' . $base . '/init.js.php?url=' . urlencode($url) . '"></script>
<!--/ff-script-->
';
Exemplo n.º 26
0
 /**
  * Generates HTML from an array of tokens.
  *
  * Caches the script-fix setting, the HTML definition and the XHTML flag on
  * the instance, serialises every token, optionally pretty-prints the result
  * with Tidy and finally converts "\n" to the configured newline.
  *
  * @param $tokens Array of HTMLPurifier_Token
  * @param $config HTMLPurifier_Config object; a default one is created when falsy
  * @param $context Passed by reference; not used inside this method
  * @return Generated HTML ('' when $tokens is empty)
  */
 function generateFromTokens($tokens, $config, &$context)
 {
     if (!$config) {
         $config = HTMLPurifier_Config::createDefault();
     }
     $this->_scriptFix = $config->get('Output', 'CommentScriptContents');
     $this->_def = $config->getHTMLDefinition();
     $this->_xhtml = $this->_def->doctype->xml;
     if (!$tokens) {
         return '';
     }
     $html = '';
     $total = count($tokens);
     $pos = 0;
     while ($pos < $total) {
         $isScriptBlock = $this->_scriptFix && $tokens[$pos]->name === 'script' && $pos + 2 < $total && $tokens[$pos + 2]->type == 'end';
         if ($isScriptBlock) {
             // script special case: the contents of the script block must be
             // ONE token for this to work. Emit the start tag and the script
             // body here; the end tag falls through to the generic call below.
             $html .= $this->generateFromToken($tokens[$pos]);
             $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
             $pos += 2;
         }
         $html .= $this->generateFromToken($tokens[$pos]);
         $pos++;
     }
     $useTidy = $config->get('Output', 'TidyFormat') && extension_loaded('tidy');
     if ($useTidy) {
         $tidy_options = array(
             'indent' => true,
             'output-xhtml' => $this->_xhtml,
             'show-body-only' => true,
             'indent-spaces' => 2,
             'wrap' => 68,
         );
         if (version_compare(PHP_VERSION, '5', '<')) {
             // pre-PHP 5 tidy API: options and document are global state
             tidy_set_encoding('utf8');
             foreach ($tidy_options as $option => $setting) {
                 tidy_setopt($option, $setting);
             }
             tidy_parse_string($html);
             tidy_clean_repair();
             $html = tidy_get_output();
         } else {
             $tidy = new Tidy();
             $tidy->parseString($html, $tidy_options, 'utf8');
             $tidy->cleanRepair();
             $html = (string) $tidy;
         }
     }
     // normalize newlines to system
     $configured = $config->get('Output', 'Newline');
     $nl = $configured === null ? PHP_EOL : $configured;
     return str_replace("\n", $nl, $html);
 }
 /**
  * Render the view and return it re-indented by HTML Tidy.
  *
  * @param mixed $data Not used inside this method.
  * @return string Tidy output for the rendered markup.
  */
 public function index($data)
 {
     $markup = $this->render();
     $tidy_options = array(
         "indent" => true,
         "indent-spaces" => "2",
         "wrap" => "90",
         "output-html" => true,
         "hide-comments" => true,
     );
     $document = tidy_parse_string($markup, $tidy_options, 'utf8');
     tidy_clean_repair($document);
     return tidy_get_output($document);
 }
Exemplo n.º 28
0
 public function process($html, $url, $smart_tidy = true)
 {
     $this->reset();
     // extract host name
     $host = @parse_url($url, PHP_URL_HOST);
     if (!($this->config = SiteConfig::build($host))) {
         // no match, so use defaults
         $this->config = new SiteConfig();
     }
     // store copy of config in our static cache array in case we need to process another URL
     SiteConfig::add_to_cache($host, $this->config);
     // use tidy (if it exists)?
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it
     // makes matters worse, which is why you can override it in site config files.)
     $tidied = false;
     if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
         $this->debug('Using Tidy');
         $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
         if (tidy_clean_repair($tidy)) {
             $original_html = $html;
             $tidied = true;
             $html = $tidy->value;
         }
         unset($tidy);
     }
     // load and parse html
     $this->readability = new Readability($html, $url);
     // we use xpath to find elements in the given HTML document
     // see http://en.wikipedia.org/wiki/XPath_1.0
     $xpath = new DOMXPath($this->readability->dom);
     // strip elements (using xpath expressions)
     foreach ($this->config->strip as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements (using id and class attribute values)
     foreach ($this->config->strip_id_or_class as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip images (using src attribute values)
     foreach ($this->config->strip_image_src as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' image elements');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements using Readability.com and Instapaper.com ignore class names
     // .entry-unrelated and .instapaper_ignore
     // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
     // and http://blog.instapaper.com/post/730281947
     $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // strip elements that contain style="display: none;"
     $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // try to get title
     foreach ($this->config->title as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Title expression evaluated as string');
             $this->title = trim($elems);
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Title matched');
             $this->title = $elems->item(0)->textContent;
             break;
         }
     }
     // try to get body
     foreach ($this->config->body as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Body matched');
             if ($elems->length == 1) {
                 $this->body = $elems->item(0);
                 // prune (clean up elements that may not be content)
                 if ($this->config->prune) {
                     $this->debug('Pruning content');
                     $this->readability->prepArticle($this->body);
                 }
                 break;
             } else {
                 $this->body = $this->readability->dom->createElement('div');
                 $this->debug($elems->length . ' body elems found');
                 foreach ($elems as $elem) {
                     $isDescendant = false;
                     foreach ($this->body->childNodes as $parent) {
                         if ($this->isDescendant($parent, $elem)) {
                             $isDescendant = true;
                             break;
                         }
                     }
                     if ($isDescendant) {
                         $this->debug('Element is child of another body element, skipping.');
                     } else {
                         // prune (clean up elements that may not be content)
                         if ($this->config->prune) {
                             $this->debug('Pruning content');
                             $this->readability->prepArticle($elem);
                         }
                         $this->debug('Element added to body');
                         $this->body->appendChild($elem);
                     }
                 }
             }
         }
     }
     // auto detect?
     $detect_title = $detect_body = false;
     // detect title?
     if (!isset($this->title)) {
         if (empty($this->config->title) || !empty($this->config->title) && $this->config->autodetect_on_failure) {
             $detect_title = true;
         }
     }
     // detect body?
     if (!isset($this->body)) {
         if (empty($this->config->body) || !empty($this->config->body) && $this->config->autodetect_on_failure) {
             $detect_body = true;
         }
     }
     // check for hNews
     if ($detect_title || $detect_body) {
         // check for hentry
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('hNews: found hentry');
             $hentry = $elems->item(0);
             if ($detect_title) {
                 // check for entry-title
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-title');
                     $this->title = $elems->item(0)->textContent;
                     $detect_title = false;
                 }
             }
             // check for entry-content.
             // according to hAtom spec, if there are multiple elements marked entry-content,
             // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
             if ($detect_body) {
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-content');
                     if ($elems->length == 1) {
                         // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                         $e = $elems->item(0);
                         if ($e->tagName == 'img' || trim($e->textContent) != '') {
                             $this->body = $elems->item(0);
                             // prune (clean up elements that may not be content)
                             if ($this->config->prune) {
                                 $this->debug('Pruning content');
                                 $this->readability->prepArticle($this->body);
                             }
                             $detect_body = false;
                         } else {
                             $this->debug('hNews: skipping entry-content - appears not to contain content');
                         }
                         unset($e);
                     } else {
                         $this->body = $this->readability->dom->createElement('div');
                         $this->debug($elems->length . ' entry-content elems found');
                         foreach ($elems as $elem) {
                             $isDescendant = false;
                             foreach ($this->body->childNodes as $parent) {
                                 if ($this->isDescendant($parent, $elem)) {
                                     $isDescendant = true;
                                     break;
                                 }
                             }
                             if ($isDescendant) {
                                 $this->debug('Element is child of another body element, skipping.');
                             } else {
                                 // prune (clean up elements that may not be content)
                                 if ($this->config->prune) {
                                     $this->debug('Pruning content');
                                     $this->readability->prepArticle($elem);
                                 }
                                 $this->debug('Element added to body');
                                 $this->body->appendChild($elem);
                             }
                         }
                         $detect_body = false;
                     }
                 }
             }
         }
     }
     // check for elements marked with instapaper_title
     if ($detect_title) {
         // check for instapaper_title
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('title found (.instapaper_title)');
             $this->title = $elems->item(0)->textContent;
             $detect_title = false;
         }
     }
     // check for elements marked with instapaper_body
     if ($detect_body) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (.instapaper_body)');
             $this->body = $elems->item(0);
             // prune (clean up elements that may not be content)
             if ($this->config->prune) {
                 $this->debug('Pruning content');
                 $this->readability->prepArticle($this->body);
             }
             $detect_body = false;
         }
     }
     // still missing title or body, so we detect using Readability
     if ($detect_title || $detect_body) {
         $this->debug('Using Readability');
         // clone body if we're only using Readability for title (otherwise it may interfere with body element)
         if (isset($this->body)) {
             $this->body = $this->body->cloneNode(true);
         }
         $success = $this->readability->init();
     }
     if ($detect_title) {
         $this->debug('Detecting title');
         $this->title = $this->readability->getTitle()->textContent;
     }
     if ($detect_body && $success) {
         $this->debug('Detecting body');
         $this->body = $this->readability->getContent();
         if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
             $this->body = $this->body->firstChild;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($this->body);
         }
     }
     if (isset($this->body)) {
         // remove scripts
         $this->readability->removeScripts($this->body);
         $this->success = true;
     }
     // if we've had no success and we've used tidy, there's a chance
     // that tidy has messed up. So let's try again without tidy...
     if (!$this->success && $tidied && $smart_tidy) {
         $this->debug('Trying again without tidy');
         $this->process($original_html, $url, false);
     }
     return $this->success;
 }
Exemplo n.º 29
0
<?php

// Parse the example document with Tidy's "clean" option enabled, repair it,
// and print the resulting markup.
$file = "clean_ex1.html";
$tidy = tidy_parse_file($file, array("clean" => true));

// tidy_parse_file() returns false when the file cannot be read or parsed.
// Passing that on to tidy_clean_repair() would fail with an unrelated type
// error, so stop here with a message that names the actual problem.
if ($tidy === false) {
    throw new RuntimeException("Unable to load or parse '" . $file . "' with tidy");
}

tidy_clean_repair($tidy);

// The tidy object converts to its repaired markup when used as a string.
echo $tidy;
Exemplo n.º 30
0
 /**
  * Optionally run the given markup through Tidy and return it trimmed.
  *
  * Tidy is applied only when the TIDY_CLEAN constant is truthy and the tidy
  * extension is available (tidy_parse_string() exists). If either condition
  * is not met, or Tidy fails to parse the input, the markup is returned
  * unmodified apart from trimming.
  *
  * @param string $source Markup to clean.
  *
  * @return string The (possibly repaired) markup with surrounding whitespace removed.
  */
 private function _tidyClean($source)
 {
     if (TIDY_CLEAN && function_exists('tidy_parse_string')) {
         $tidy = tidy_parse_string($source);
         // tidy_parse_string() returns false on failure; in that case keep
         // the caller's markup instead of discarding it.
         if ($tidy !== false) {
             tidy_clean_repair($tidy);
             // Convert explicitly rather than relying on trim() to coerce
             // the tidy object, which is rejected under strict_types.
             $source = (string) $tidy;
         }
     }
     return trim($source);
 }