/**
 * Try to fetch the "single page" version of an article.
 *
 * The site config for the URL's host may define XPath expressions
 * (single_page_link, or single_page_link_in_feed which is evaluated against
 * the feed item description instead of the page HTML) that locate a link to
 * a single-page view. The first expression that yields a URL wins; the URL is
 * resolved against $url and fetched with the global $http client.
 *
 * When no site config exists for the host, a fallback config (fingerprint
 * match or an empty SiteConfig) is cached under the host and false is
 * returned without looking for a single-page link.
 *
 * @param object $item Feed item; get_description() is called on it when the
 *                     config uses single_page_link_in_feed.
 * @param string $html HTML of the page at $url.
 * @param string $url  URL of the page currently being processed.
 *
 * @return mixed The response returned by $http->get() when its status_code is
 *               below 300, otherwise false.
 */
function getSinglePage($item, $html, $url)
{
    global $http, $extractor;
    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        // No config for this host: look for a fingerprint match in the HTML.
        if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
            $site_config = SiteConfig::build($_fphost);
        }
        if ($site_config === false) {
            $site_config = new SiteConfig();
        }
        // Cache whatever we ended up with, then give up on a single-page link.
        SiteConfig::add_to_cache($host, $site_config);
        return false;
    }
    SiteConfig::add_to_cache($host, $site_config);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; stop at the first
    // expression that produces a URL, whether it evaluates to a string or
    // to a node list.
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the feed item parameter is not overwritten.
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    // If we've got URL, resolve against $url
    if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
        // check it's not what we have already!
        if ($single_page_url != $url) {
            // it's not, so let's try to fetch it, using the target URL as
            // referer for this one request only.
            $_prev_ref = $http->referer;
            $http->referer = $single_page_url;
            $response = $http->get($single_page_url, true);
            $http->referer = $_prev_ref;
            if ($response && $response['status_code'] < 300) {
                return $response;
            }
        }
    }
    return false;
}
 /**
  * Build the merged site config for a URL.
  *
  * The host-specific config (or an empty SiteConfig when none exists) is
  * extended with a fingerprint-matched config and with the 'global' config,
  * each only while the config's autodetect_on_failure() returns a truthy
  * value. The merged result is cached under "<host>.merged".
  *
  * @param string $url          URL whose host selects the site config.
  * @param string $html         HTML passed to findHostUsingFingerprints().
  * @param bool   $add_to_cache Whether the individual and merged configs are added to the SiteConfig cache.
  *
  * @return SiteConfig The merged config (or the cached merged copy).
  */
 public function buildSiteConfig($url, $html = '', $add_to_cache = true)
 {
     // normalise the host name: lower-case and without a leading "www."
     $host = strtolower(@parse_url($url, PHP_URL_HOST));
     if (strncmp($host, 'www.', 4) === 0) {
         $host = substr($host, 4);
     }

     // a previously merged config short-circuits everything below
     $merged_key = "{$host}.merged";
     if (SiteConfig::is_cached($merged_key)) {
         $this->debug("Returning cached and merged site config for {$host}");
         return SiteConfig::build($merged_key);
     }

     // host-specific config from site_config/custom/ and standard/,
     // falling back to defaults when nothing matches
     $config = SiteConfig::build($host);
     if (!$config) {
         $config = new SiteConfig();
     } elseif ($add_to_cache && !SiteConfig::is_cached("{$host}")) {
         SiteConfig::add_to_cache($host, $config);
     }

     // fingerprint config: only consulted when autodetection is allowed
     if ($config->autodetect_on_failure() && !empty($this->fingerprints)) {
         $fingerprint_host = $this->findHostUsingFingerprints($html);
         if ($fingerprint_host) {
             $fingerprint_config = SiteConfig::build($fingerprint_host);
             if ($fingerprint_config) {
                 $this->debug("Appending site config settings from {$fingerprint_host} (fingerprint match)");
                 $config->append($fingerprint_config);
                 if ($add_to_cache && !SiteConfig::is_cached($fingerprint_host)) {
                     SiteConfig::add_to_cache($fingerprint_host, $fingerprint_config);
                 }
             }
         }
     }

     // global config: autodetect_on_failure() is asked again because the
     // append above may have changed the merged config
     if ($config->autodetect_on_failure()) {
         $global_config = SiteConfig::build('global', true);
         if ($global_config) {
             $this->debug('Appending site config settings from global.txt');
             $config->append($global_config);
             if ($add_to_cache && !SiteConfig::is_cached('global')) {
                 SiteConfig::add_to_cache('global', $global_config);
             }
         }
     }

     // keep a copy of the merged config for the next request
     if ($add_to_cache) {
         // APC storage is only requested when the config's cache key is the
         // host itself (per the original note: not for wildcard matches)
         $store_in_apc = $host == $config->cache_key;
         $config->cache_key = null;
         SiteConfig::add_to_cache($merged_key, $config, $store_in_apc);
     }

     return $config;
 }
Esempio n. 3
0
 // Confirm where the site config was written. The path is escaped because it
 // is echoed into an HTML page.
 echo '<p>Saved to <strong>' . htmlspecialchars($savepath) . '</strong></p>';
 // check caching
 if ($options->caching) {
     echo '<p>Note: caching is enabled &mdash; you may have to disable caching or delete cache files to see changes.</p>';
 }
 // Purge cached site configs from APC: every user-cache entry whose key
 // starts with "sc." is deleted.
 if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) {
     $_apc_data = apc_cache_info('user');
     // apc_cache_info() returns false on failure, so only iterate a real list.
     if (is_array($_apc_data) && !empty($_apc_data['cache_list'])) {
         foreach ($_apc_data['cache_list'] as $_apc_item) {
             if (isset($_apc_item['info']) && substr($_apc_item['info'], 0, 3) == 'sc.') {
                 apc_delete($_apc_item['info']);
             }
         }
     }
     echo '<p>Cleared site config cache in APC.</p>';
 }
 // Point SiteConfig at the directory that holds the file just saved, then
 // load the config for $save.
 SiteConfig::set_config_path(dirname($savepath));
 // "$exact_host_match = true" assigns a throwaway variable purely to label
 // the boolean passed as the second argument.
 $sconfig = SiteConfig::build($save, $exact_host_match = true);
 if ($sconfig) {
     // List each test URL from the config with links to the original page,
     // the Full-Text RSS result and the debug view.
     if (!empty($sconfig->test_url)) {
         echo '<h4>Test URLs</h4>';
         echo '<ul>';
         foreach ($sconfig->test_url as $test_url) {
             $ftr_test_url = $test_url;
             // Drop a leading "http://" (case-insensitive) before the URL is
             // passed on as a query parameter.
             if (strtolower(substr($ftr_test_url, 0, 7)) == 'http://') {
                 $ftr_test_url = substr($ftr_test_url, 7);
             }
             // urlencode() keeps the value safe inside the href attributes below.
             $ftr_test_url = '../makefulltextfeed.php?url=' . urlencode($ftr_test_url);
             echo '<li>';
             echo '<a href="' . htmlspecialchars($test_url) . '" target="_blank">' . htmlspecialchars($test_url) . '</a>';
             echo ' | <a href="' . $ftr_test_url . '" target="_blank">Full-Text RSS result</a>';
             echo ' | <a href="' . $ftr_test_url . '&debug" target="_blank">Debug</a>';
             echo '</li>';
 /**
  * Extract the article title and body from an HTML document.
  *
  * Steps, in order: load the site config for the URL's host (defaults when
  * none exists), optionally clean the HTML with Tidy, strip unwanted
  * elements, apply the configured title/body XPath expressions, then fall
  * back to hNews markup, Instapaper class names and finally Readability.
  * If nothing was extracted from Tidy-cleaned HTML, the whole process is
  * retried once on the untouched HTML.
  *
  * @param string $html       Raw HTML of the page.
  * @param string $url        URL the HTML came from; its host selects the site config.
  * @param bool   $smart_tidy Whether Tidy may be used; false on the internal retry.
  *
  * @return bool Value of $this->success, which is set to true once a body element has been found.
  */
 public function process($html, $url, $smart_tidy = true)
 {
     $this->reset();
     // extract host name
     $host = @parse_url($url, PHP_URL_HOST);
     if (!($this->config = SiteConfig::build($host))) {
         // no match, so use defaults
         $this->config = new SiteConfig();
     }
     // store copy of config in our static cache array in case we need to process another URL
     SiteConfig::add_to_cache($host, $this->config);
     // use tidy (if it exists)?
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it
     // makes matters worse, which is why you can override it in site config files.)
     $tidied = false;
     $original_html = null;
     if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
         $this->debug('Using Tidy');
         $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
         // tidy_parse_string() returns false on failure; only repair a real document
         if ($tidy !== false && tidy_clean_repair($tidy)) {
             $original_html = $html;
             $tidied = true;
             $html = $tidy->value;
         }
         unset($tidy);
     }
     // load and parse html
     $this->readability = new Readability($html, $url);
     // we use xpath to find elements in the given HTML document
     // see http://en.wikipedia.org/wiki/XPath_1.0
     $xpath = new DOMXPath($this->readability->dom);
     // strip elements (using xpath expressions)
     foreach ($this->config->strip as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip)');
             $this->removeNodesFromDocument($elems);
         }
     }
     // strip elements (using id and class attribute values)
     foreach ($this->config->strip_id_or_class as $string) {
         // quotes are dropped so the value cannot break out of the XPath string literal
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
             $this->removeNodesFromDocument($elems);
         }
     }
     // strip images (using src attribute values)
     foreach ($this->config->strip_image_src as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' image elements');
             $this->removeNodesFromDocument($elems);
         }
     }
     // strip elements using Readability.com and Instapaper.com ignore class names
     // .entry-unrelated and .instapaper_ignore
     // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
     // and http://blog.instapaper.com/post/730281947
     $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
         $this->removeNodesFromDocument($elems);
     }
     // strip elements whose inline style contains "display:none" (no space)
     $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
         $this->removeNodesFromDocument($elems);
     }
     // try to get title
     foreach ($this->config->title as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Title expression evaluated as string');
             $this->title = trim($elems);
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Title matched');
             $this->title = $elems->item(0)->textContent;
             break;
         }
     }
     // try to get body
     foreach ($this->config->body as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Body matched');
             if ($elems->length == 1) {
                 $this->body = $elems->item(0);
                 // prune (clean up elements that may not be content)
                 if ($this->config->prune) {
                     $this->debug('Pruning content');
                     $this->readability->prepArticle($this->body);
                 }
                 break;
             }
             $this->debug($elems->length . ' body elems found');
             $this->body = $this->mergeBodyElements($elems);
             // the first pattern that yields content wins; without this a
             // later pattern would silently replace the body built here
             if ($this->body->hasChildNodes()) {
                 break;
             }
         }
     }
     // auto detect? Allowed when the config has no expression for the field,
     // or when its expressions failed and autodetect_on_failure is set.
     $detect_title = $detect_body = false;
     // detect title?
     if (!isset($this->title)) {
         if (empty($this->config->title) || $this->config->autodetect_on_failure) {
             $detect_title = true;
         }
     }
     // detect body?
     if (!isset($this->body)) {
         if (empty($this->config->body) || $this->config->autodetect_on_failure) {
             $detect_body = true;
         }
     }
     // check for hNews
     if ($detect_title || $detect_body) {
         // check for hentry
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('hNews: found hentry');
             $hentry = $elems->item(0);
             if ($detect_title) {
                 // check for entry-title
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-title');
                     $this->title = $elems->item(0)->textContent;
                     $detect_title = false;
                 }
             }
             // check for entry-content.
             // according to hAtom spec, if there are multiple elements marked entry-content,
             // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
             if ($detect_body) {
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-content');
                     if ($elems->length == 1) {
                         // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                         $e = $elems->item(0);
                         if ($e->tagName == 'img' || trim($e->textContent) != '') {
                             $this->body = $e;
                             // prune (clean up elements that may not be content)
                             if ($this->config->prune) {
                                 $this->debug('Pruning content');
                                 $this->readability->prepArticle($this->body);
                             }
                             $detect_body = false;
                         } else {
                             $this->debug('hNews: skipping entry-content - appears not to contain content');
                         }
                         unset($e);
                     } else {
                         $this->debug($elems->length . ' entry-content elems found');
                         $this->body = $this->mergeBodyElements($elems);
                         $detect_body = false;
                     }
                 }
             }
         }
     }
     // check for elements marked with instapaper_title
     if ($detect_title) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('title found (.instapaper_title)');
             $this->title = $elems->item(0)->textContent;
             $detect_title = false;
         }
     }
     // check for elements marked with instapaper_body
     if ($detect_body) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (.instapaper_body)');
             $this->body = $elems->item(0);
             // prune (clean up elements that may not be content)
             if ($this->config->prune) {
                 $this->debug('Pruning content');
                 $this->readability->prepArticle($this->body);
             }
             $detect_body = false;
         }
     }
     // still missing title or body, so we detect using Readability
     $success = false;
     if ($detect_title || $detect_body) {
         $this->debug('Using Readability');
         // clone body if we're only using Readability for title (otherwise it may interfere with body element)
         if (isset($this->body)) {
             $this->body = $this->body->cloneNode(true);
         }
         $success = $this->readability->init();
     }
     if ($detect_title) {
         $this->debug('Detecting title');
         $this->title = $this->readability->getTitle()->textContent;
     }
     if ($detect_body && $success) {
         $this->debug('Detecting body');
         $this->body = $this->readability->getContent();
         // unwrap a container that holds exactly one element
         if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
             $this->body = $this->body->firstChild;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($this->body);
         }
     }
     if (isset($this->body)) {
         // remove scripts
         $this->readability->removeScripts($this->body);
         $this->success = true;
     }
     // if we've had no success and we've used tidy, there's a chance
     // that tidy has messed up. So let's try again without tidy...
     // (the nested call sets $this->success itself, so its return value is not needed)
     if (!$this->success && $tidied && $smart_tidy) {
         $this->debug('Trying again without tidy');
         $this->process($original_html, $url, false);
     }
     return $this->success;
 }

 /**
  * Detach every node of an XPath result from its parent.
  *
  * Nodes are visited last-to-first. A node with no parent (for example one
  * that was inside an element removed by an earlier strip) is skipped
  * instead of causing a call on null.
  *
  * @param DOMNodeList $elems Nodes to remove.
  *
  * @return void
  */
 private function removeNodesFromDocument($elems)
 {
     for ($i = $elems->length - 1; $i >= 0; $i--) {
         $node = $elems->item($i);
         if ($node !== null && $node->parentNode !== null) {
             $node->parentNode->removeChild($node);
         }
     }
 }

 /**
  * Move several matched elements into a new wrapper <div>.
  *
  * Elements are appended in match order. An element reported by
  * isDescendant() as sitting under an element already in the wrapper is
  * skipped, as is an element that no longer has a parent. Each appended
  * element is pruned first when the config's prune setting is on.
  *
  * @param DOMNodeList $elems Matched body elements.
  *
  * @return DOMElement The wrapper element (not attached to the document).
  */
 private function mergeBodyElements($elems)
 {
     $wrapper = $this->readability->dom->createElement('div');
     foreach ($elems as $elem) {
         // already detached, e.g. removed while an earlier element was pruned
         if (!isset($elem->parentNode)) {
             continue;
         }
         $isDescendant = false;
         foreach ($wrapper->childNodes as $parent) {
             if ($this->isDescendant($parent, $elem)) {
                 $isDescendant = true;
                 break;
             }
         }
         if ($isDescendant) {
             $this->debug('Element is child of another body element, skipping.');
             continue;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($elem);
         }
         $this->debug('Element added to body');
         $wrapper->appendChild($elem);
     }
     return $wrapper;
 }
/**
 * Try to fetch the "single page" version of an article.
 *
 * The site config for the URL's host may define XPath expressions
 * (single_page_link, or single_page_link_in_feed which is evaluated against
 * the feed item description instead of the page HTML) that locate a link to
 * a single-page view. The first expression that yields a URL wins; the URL is
 * resolved against $url and fetched with the global $http client.
 *
 * @param object $item Feed item; get_description() is called on it when the
 *                     config uses single_page_link_in_feed.
 * @param string $html HTML of the page at $url.
 * @param string $url  URL of the page currently being processed.
 *
 * @return mixed The response returned by $http->get() when its status_code is
 *               below 300, otherwise false.
 */
function getSinglePage($item, $html, $url)
{
    global $http;
    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        return false;
    }

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; stop at the first
    // expression that produces a URL, whether it evaluates to a string or
    // to a node list.
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the feed item parameter is not overwritten.
            foreach ($elems as $node) {
                // Only elements have hasAttribute(); an expression ending in
                // /@href yields DOMAttr nodes, whose value is the URL itself.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    // If we've got URL, resolve against $url
    if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
        // check it's not what we have already!
        if ($single_page_url != $url) {
            // it's not, so let's try to fetch it...
            if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
                return $response;
            }
        }
    }
    return false;
}
 public function process($html, $url, $smart_tidy = true)
 {
     $this->reset();
     // extract host name
     $host = @parse_url($url, PHP_URL_HOST);
     if (!($this->config = SiteConfig::build($host))) {
         // no match, check HTML for fingerprints
         if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
             $this->config = SiteConfig::build($_fphost);
         }
         unset($_fphost);
         if (!$this->config) {
             // no match, so use defaults
             $this->config = new SiteConfig();
         }
     }
     //echo count($this->config->body);
     // store copy of config in our static cache array in case we need to process another URL
     SiteConfig::add_to_cache($host, $this->config);
     // do string replacements
     foreach ($this->config->replace_string as $_repl) {
         $html = str_replace($_repl[0], $_repl[1], $html);
     }
     unset($_repl);
     // use tidy (if it exists)?
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it
     // makes matters worse, which is why you can override it in site config files.)
     $tidied = false;
     if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
         $this->debug('Using Tidy');
         $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
         if (tidy_clean_repair($tidy)) {
             $original_html = $html;
             $tidied = true;
             $html = $tidy->value;
         }
         unset($tidy);
     }
     // load and parse html
     $this->readability = new Readability($html, $url);
     // we use xpath to find elements in the given HTML document
     // see http://en.wikipedia.org/wiki/XPath_1.0
     $xpath = new DOMXPath($this->readability->dom);
     // try to get title
     foreach ($this->config->title as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Title expression evaluated as string');
             $this->title = trim($elems);
             break;
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Title matched');
             $this->title = $elems->item(0)->textContent;
             // remove title from document
             try {
                 $elems->item(0)->parentNode->removeChild($elems->item(0));
             } catch (DOMException $e) {
                 // do nothing
             }
             break;
         }
     }
     // try to get author (if it hasn't already been set)
     if (empty($this->author)) {
         foreach ($this->config->author as $pattern) {
             $elems = @$xpath->evaluate($pattern, $this->readability->dom);
             if (is_string($elems)) {
                 $this->debug('Author expression evaluated as string');
                 if (trim($elems) != '') {
                     $this->author[] = trim($elems);
                     break;
                 }
             } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                 foreach ($elems as $elem) {
                     if (!isset($elem->parentNode)) {
                         continue;
                     }
                     $this->author[] = trim($elem->textContent);
                 }
                 if (!empty($this->author)) {
                     break;
                 }
             }
         }
     }
     // try to get language
     $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
     foreach ($_lang_xpath as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             if (trim($elems) != '') {
                 $this->language = trim($elems);
                 break;
             }
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             foreach ($elems as $elem) {
                 if (!isset($elem->parentNode)) {
                     continue;
                 }
                 $this->language = trim($elem->textContent);
             }
             if ($this->language) {
                 break;
             }
         }
     }
     // try to get date
     foreach ($this->config->date as $pattern) {
         $elems = @$xpath->evaluate($pattern, $this->readability->dom);
         if (is_string($elems)) {
             $this->debug('Date expression evaluated as string');
             $this->date = strtotime(trim($elems, "; \t\n\r\v"));
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             $this->debug('Date matched');
             $this->date = $elems->item(0)->textContent;
             $this->date = strtotime(trim($this->date, "; \t\n\r\v"));
             // remove date from document
             // $elems->item(0)->parentNode->removeChild($elems->item(0));
         }
         if (!$this->date) {
             $this->date = null;
         } else {
             break;
         }
     }
     // strip elements (using xpath expressions)
     foreach ($this->config->strip as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements (using id and class attribute values)
     foreach ($this->config->strip_id_or_class as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip images (using src attribute values)
     foreach ($this->config->strip_image_src as $string) {
         $string = strtr($string, array("'" => '', '"' => ''));
         $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
         // check for matches
         if ($elems && $elems->length > 0) {
             $this->debug('Stripping ' . $elems->length . ' image elements');
             for ($i = $elems->length - 1; $i >= 0; $i--) {
                 $elems->item($i)->parentNode->removeChild($elems->item($i));
             }
         }
     }
     // strip elements using Readability.com and Instapaper.com ignore class names
     // .entry-unrelated and .instapaper_ignore
     // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
     // and http://blog.instapaper.com/post/730281947
     $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // strip elements that contain style="display: none;"
     $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
     // check for matches
     if ($elems && $elems->length > 0) {
         $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
         for ($i = $elems->length - 1; $i >= 0; $i--) {
             $elems->item($i)->parentNode->removeChild($elems->item($i));
         }
     }
     // try to get body
     foreach ($this->config->body as $pattern) {
         $elems = @$xpath->query($pattern, $this->readability->dom);
         // check for matches
         //echo "elems->length: [" . $pattern. "]\n\n";
         //echo "elems->length: [" . $this->readability->dom->innerHTML. "]\n\n";
         if ($elems && $elems->length > 0) {
             //echo "elems->length matched: [" . $pattern. "]\n\n";
             //print_r($next_page_pattern);
             $this->body = $this->getMatchedBody($elems);
             $next_page_content = $this->retrieveNextPage($xpath, $url);
             //if ($next_page_content !== FALSE) {
             //    $body->appendChild($next_page_content);
             //$next_page_content = $this->retrieveNextPage($xpath, $body, $url);
             //}
             //$this->body = $body;
             if ($elems->length === 1) {
                 break;
             }
         }
     }
     //echo "auto detect之前: [" . $this->body->innerHTML . "]\n\n";
     // auto detect?
     $detect_title = $detect_body = $detect_author = $detect_date = false;
     // detect title?
     if (!isset($this->title)) {
         if (empty($this->config->title) || $this->config->autodetect_on_failure) {
             $detect_title = true;
         }
     }
     // detect body?
     if (!isset($this->body)) {
         if (empty($this->config->body) || $this->config->autodetect_on_failure) {
             $detect_body = true;
         }
     }
     // detect author?
     if (empty($this->author)) {
         if (empty($this->config->author) || $this->config->autodetect_on_failure) {
             $detect_author = true;
         }
     }
     // detect date?
     if (!isset($this->date)) {
         if (empty($this->config->date) || $this->config->autodetect_on_failure) {
             $detect_date = true;
         }
     }
     // check for hNews
     if ($detect_title || $detect_body) {
         // check for hentry
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('hNews: found hentry');
             $hentry = $elems->item(0);
             if ($detect_title) {
                 // check for entry-title
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-title');
                     $this->title = $elems->item(0)->textContent;
                     // remove title from document
                     $elems->item(0)->parentNode->removeChild($elems->item(0));
                     $detect_title = false;
                 }
             }
             if ($detect_date) {
                 // check for time element with pubdate attribute
                 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found publication date');
                     $this->date = strtotime(trim($elems->item(0)->textContent));
                     // remove date from document
                     //$elems->item(0)->parentNode->removeChild($elems->item(0));
                     if ($this->date) {
                         $detect_date = false;
                     } else {
                         $this->date = null;
                     }
                 }
             }
             if ($detect_author) {
                 // check for a vcard element that is also marked as author or byline
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found author');
                     $author = $elems->item(0);
                     $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
                     if ($fn && $fn->length > 0) {
                         foreach ($fn as $_fn) {
                             if (trim($_fn->textContent) != '') {
                                 $this->author[] = trim($_fn->textContent);
                             }
                         }
                     } else {
                         if (trim($author->textContent) != '') {
                             $this->author[] = trim($author->textContent);
                         }
                     }
                     $detect_author = empty($this->author);
                 }
             }
             // check for entry-content.
             // according to hAtom spec, if there are multiple elements marked entry-content,
             // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
             if ($detect_body) {
                 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                 if ($elems && $elems->length > 0) {
                     $this->debug('hNews: found entry-content');
                     if ($elems->length == 1) {
                         // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                         $e = $elems->item(0);
                         if ($e->tagName == 'img' || trim($e->textContent) != '') {
                             $this->body = $elems->item(0);
                             //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                             // prune (clean up elements that may not be content)
                             if ($this->config->prune) {
                                 $this->debug('Pruning content');
                                 $this->readability->prepArticle($this->body);
                             }
                             $detect_body = false;
                         } else {
                             $this->debug('hNews: skipping entry-content - appears not to contain content');
                         }
                         unset($e);
                     } else {
                         $this->body = $this->readability->dom->createElement('div');
                         //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                         $this->debug($elems->length . ' entry-content elems found');
                         foreach ($elems as $elem) {
                             if (!isset($elem->parentNode)) {
                                 continue;
                             }
                             $isDescendant = false;
                             foreach ($this->body->childNodes as $parent) {
                                 if ($this->isDescendant($parent, $elem)) {
                                     $isDescendant = true;
                                     break;
                                 }
                             }
                             if ($isDescendant) {
                                 $this->debug('Element is child of another body element, skipping.');
                             } else {
                                 // prune (clean up elements that may not be content)
                                 if ($this->config->prune) {
                                     $this->debug('Pruning content');
                                     $this->readability->prepArticle($elem);
                                 }
                                 $this->debug('Element added to body');
                                 $this->body->appendChild($elem);
                             }
                         }
                         echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
                         $detect_body = false;
                     }
                 }
             }
         }
     }
     //echo "elems->item: [" . $this->body->innerHTML . "]\n\n";
     // check for elements marked with instapaper_title
     if ($detect_title) {
         // check for instapaper_title
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('title found (.instapaper_title)');
             $this->title = $elems->item(0)->textContent;
             // remove title from document
             $elems->item(0)->parentNode->removeChild($elems->item(0));
             $detect_title = false;
         }
     }
     // check for elements marked with instapaper_body
     if ($detect_body) {
         $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
         if ($elems && $elems->length > 0) {
             $this->debug('body found (.instapaper_body)');
             $this->body = $elems->item(0);
             // prune (clean up elements that may not be content)
             if ($this->config->prune) {
                 $this->debug('Pruning content');
                 $this->readability->prepArticle($this->body);
             }
             $detect_body = false;
         }
     }
     //echo "after detect_body: [" . $this->body->innerHTML . "]\n\n";
     // Find author in rel="author" marked element
     // We only use this if there's exactly one.
     // If there's more than one, it could indicate more than
     // one author, but it could also indicate that we're processing
     // a page listing different articles with different authors.
     if ($detect_author) {
         $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $this->debug('Author found (rel="author")');
             $author = trim($elems->item(0)->textContent);
             if ($author != '') {
                 $this->author[] = $author;
                 $detect_author = false;
             }
         }
     }
     // Find date in pubdate marked time element
     // For the same reason given above, we only use this
     // if there's exactly one element.
     if ($detect_date) {
         $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
         if ($elems && $elems->length == 1) {
             $this->debug('Date found (pubdate marked time element)');
             $this->date = strtotime(trim($elems->item(0)->textContent));
             // remove date from document
             //$elems->item(0)->parentNode->removeChild($elems->item(0));
             if ($this->date) {
                 $detect_date = false;
             } else {
                 $this->date = null;
             }
         }
     }
     // still missing title or body, so we detect using Readability
     if ($detect_title || $detect_body) {
         $this->debug('Using Readability');
         // clone body if we're only using Readability for title (otherwise it may interfere with body element)
         if (isset($this->body)) {
             $this->body = $this->body->cloneNode(true);
         }
         $success = $this->readability->init();
     }
     if ($detect_title) {
         $this->debug('Detecting title');
         $this->title = $this->readability->getTitle()->textContent;
     }
     //echo "before detect body success [" . $this->body->innerHTML . "]\n\n";
     if ($detect_body && $success) {
         $this->debug('Detecting body');
         $this->body = $this->readability->getContent();
         //echo "getContent() : [" . $this->body->innerHTML . "] \n\n" ;
         if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
             $this->body = $this->body->firstChild;
         }
         // prune (clean up elements that may not be content)
         if ($this->config->prune) {
             $this->debug('Pruning content');
             $this->readability->prepArticle($this->body);
         }
     }
     //echo "如果沒有Body [" . $this->body->innerHTML . "]\n\n";
     if (isset($this->body)) {
         // remove scripts
         $this->readability->removeScripts($this->body);
         // remove any h1-h6 elements that appear as first thing in the body
         // and which match our title
         if (isset($this->title) && $this->title != '') {
             $firstChild = $this->body->firstChild;
             while ($firstChild->nodeType && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                 $firstChild = $firstChild->nextSibling;
             }
             if ($firstChild->nodeType === XML_ELEMENT_NODE && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))) {
                 $this->body->removeChild($firstChild);
             }
         }
         $this->success = true;
     }
     //echo "下一頁之前: [" . $this->body->innerHTML . "]\n\n";
     // 2013-10-11: next-page detection (work in progress) - fetch the linked next page and append its content to the body
     if (isset($this->body)) {
         //$elems = @$xpath->query("//a[starts-with(@href, /?p=) and &page=2']", $this->readability->dom);
         $next_page_pattern = $this->options->next_page_pattern;
         //echo $next_page_pattern;
         //$next_page_pattern = "//a[contains(@href, '&page=')]";
         //$next_page_pattern = "//a";
         $elems = @$xpath->query($next_page_pattern, $this->readability->dom);
         ////echo $elems->length;
         //$link = @$xpath->query("//a[contains(@href, '&page=')]/@href", $this->readability->dom);
         //if ($link, $) {
         if ($elems && $elems->length > 0) {
             try {
                 @($elem = $this->readability->dom->createElement('div', $elems->item(0)->getAttribute("href")));
             } catch (Exception $e) {
             }
             $elem = $this->readability->dom->createElement('div', "aaa");
             $attributes = $elems->item($elems->legnth)->attributes;
             $href = $attributes->getNamedItem("href")->value;
             if (substr($href, 0, 4) !== "http") {
                 //echo $href;
                 $url_component = parse_url($url);
                 //$href = urlencode($href);
                 //$elem = $this->readability->dom->createElement('div', $href);
                 //$this->body = $elem;
                 $permalink = $url_component["scheme"] . "://" . $url_component["host"] . $href;
             } else {
                 $permalink = $href;
             }
             //$permalink = $this->getNextPagePermalink($elems);
             //echo $permalink;
             //echo "[[[[".$permalink."]]]]";
             //$permalink = "http://blog.soft.idv.tw/?p=1606&page=2";
             $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
             $extractor->fingerprints = $options->fingerprints;
             $elem = new ContentExtractor($this->path, $this->fallback);
             $extractor->fingerprints = $this->fingerprints;
             $http = new HumbleHttpAgent();
             $response = $http->get($permalink, true);
             //echo 'status_code: '. $response['status_code'] . "\n\n";
             if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
                 $html = $response['body'];
                 //echo "html: " .$html;
                 // remove strange things
                 $html = str_replace('</[>', '', $html);
                 $html = convert_to_utf8($html, $response['headers']);
                 $extract_result = $extractor->process($html, $permalink);
                 //$readability = $extractor->readability;
                 $content_block = $extract_result ? $extractor->getContent() : null;
                 //echo "content_block->innerHTML: ". $content_block->innerHTML . "\n\n";
                 //$this->body->appendChild($elem);
             }
             //$doc = $this->readability->dom->("<node>".$content_block->C14N()."</node>");
             //$content = $content_block->
             //$content = $this->readability->dom->createElement('div', $content_block->innerHTML);
             $doc = new DOMDocument();
             if (@$doc->loadHTML($content_block->innerHTML)) {
                 $doc->saveHTML();
                 //$content = $this->readability->dom->loadHTML($content_block->innerHTML);
                 $content = $this->readability->dom->createElement('div', $content_block->innerHTML);
                 $content = $this->readability->dom->importNode($content_block, true);
                 $this->body->appendChild($content);
                 //$this->body->appendChild($doc);
                 //$xpath = new DOMXPath($this->readability->dom);
                 //$elems = @$xpath->query($extract_pattern, $content_block);
                 //$this->body->appendChild($content_block);
                 //$this->body = $content_block;
             }
         }
     }
     // if we've had no success and we've used tidy, there's a chance
     // that tidy has messed up. So let's try again without tidy...
     if (!$this->success && $tidied && $smart_tidy) {
         $this->debug('Trying again without tidy');
         $this->process($original_html, $url, false);
     }
     return $this->success;
 }