/**
 * Look for a "single page" version of an article and, if one is found, fetch it.
 *
 * The site config for the URL's host supplies XPath expressions
 * (single_page_link, or single_page_link_in_feed which is evaluated against
 * the feed item's description instead of the page HTML). The first expression
 * that yields a URL wins; that URL is resolved against $url and fetched with
 * the global HTTP agent.
 *
 * When no site config exists for the host, a config (fingerprint match or an
 * empty default) is stored in the SiteConfig cache under the host and the
 * function returns false without attempting a single-page lookup.
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML of the page at $url
 * @param string $url  URL of the page being processed
 *
 * @return array|false the HTTP response (status code below 300) for the
 *                     single-page URL, or false if none was found or fetched
 */
function getSinglePage($item, $html, $url)
{
    global $http, $extractor;

    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        // check for fingerprints
        if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
            $site_config = SiteConfig::build($_fphost);
        }
        if ($site_config === false) {
            $site_config = new SiteConfig();
        }
        // cache whatever we ended up with; no single-page lookup is done on this path
        SiteConfig::add_to_cache($host, $site_config);
        return false;
    }
    SiteConfig::add_to_cache($host, $site_config);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; the first one that yields a URL wins
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $node) {
                // "break 2" leaves the pattern loop as well, so a later pattern
                // cannot overwrite the URL we just found
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // If we've got URL, resolve against $url
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    // check it's not what we have already!
    if (!$single_page_url || $single_page_url == $url) {
        return false;
    }

    // it's not, so let's try to fetch it (temporarily using it as the referer)
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;

    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
/**
 * Extract the title and body of an article from an HTML document.
 *
 * Steps, in order: load the site config for the URL's host (or defaults),
 * optionally run the HTML through Tidy, strip unwanted elements, apply the
 * site config's title/body XPath expressions, then fall back to hNews markup,
 * Instapaper class names and finally Readability auto-detection. Results are
 * stored in $this->title and $this->body.
 *
 * @param string $html       HTML to process
 * @param string $url        URL the HTML was fetched from
 * @param bool   $smart_tidy when true, Tidy is used if the site config enables
 *                           it and the tidy extension is loaded; if extraction
 *                           then fails, the method calls itself once more on
 *                           the untidied HTML with this flag set to false
 *
 * @return mixed value of $this->success (set to true once a body was found)
 */
public function process($html, $url, $smart_tidy = true)
{
    $this->reset();

    // extract host name
    $host = @parse_url($url, PHP_URL_HOST);
    if (!($this->config = SiteConfig::build($host))) {
        // no match, so use defaults
        $this->config = new SiteConfig();
    }
    // store copy of config in our static cache array in case we need to process another URL
    SiteConfig::add_to_cache($host, $this->config);

    // use tidy (if it exists)?
    // This fixes problems with some sites which would otherwise
    // trouble DOMDocument's HTML parsing. (Although sometimes it
    // makes matters worse, which is why you can override it in site config files.)
    $tidied = false;
    $original_html = $html;
    if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
        $this->debug('Using Tidy');
        $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
        // tidy_parse_string() can fail; only use its output when parsing and repair both worked
        if ($tidy && tidy_clean_repair($tidy)) {
            $tidied = true;
            $html = $tidy->value;
        }
        unset($tidy);
    }

    // load and parse html
    $this->readability = new Readability($html, $url);

    // we use xpath to find elements in the given HTML document
    // see http://en.wikipedia.org/wiki/XPath_1.0
    $xpath = new DOMXPath($this->readability->dom);

    // Detaches every node of a node list from its parent, last to first.
    // Nodes without a parent are skipped instead of causing a fatal error.
    $removeNodes = function ($nodes) {
        for ($i = $nodes->length - 1; $i >= 0; $i--) {
            $node = $nodes->item($i);
            if ($node->parentNode) {
                $node->parentNode->removeChild($node);
            }
        }
    };

    // strip elements (using xpath expressions)
    foreach ($this->config->strip as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip)');
            $removeNodes($elems);
        }
    }

    // strip elements (using id and class attribute values)
    foreach ($this->config->strip_id_or_class as $string) {
        // quotes are dropped so the value cannot break out of the XPath string literal
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
            $removeNodes($elems);
        }
    }

    // strip images (using src attribute values)
    foreach ($this->config->strip_image_src as $string) {
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' image elements');
            $removeNodes($elems);
        }
    }

    // strip elements using Readability.com and Instapaper.com ignore class names
    // .entry-unrelated and .instapaper_ignore
    // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
    // and http://blog.instapaper.com/post/730281947
    $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
        $removeNodes($elems);
    }

    // strip elements whose inline style contains the exact text "display:none"
    // (a variant written with a space, "display: none", is not matched)
    $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
        $removeNodes($elems);
    }

    // try to get title
    foreach ($this->config->title as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Title expression evaluated as string');
            $this->title = trim($elems);
            break;
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Title matched');
            $this->title = $elems->item(0)->textContent;
            break;
        }
    }

    // try to get body
    foreach ($this->config->body as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if (!$elems || $elems->length == 0) {
            continue;
        }
        $this->debug('Body matched');
        if ($elems->length == 1) {
            $this->body = $elems->item(0);
            // prune (clean up elements that may not be content)
            if ($this->config->prune) {
                $this->debug('Pruning content');
                $this->readability->prepArticle($this->body);
            }
            break;
        }

        // several matches: collect them (in document order) inside a new wrapper div
        $this->body = $this->readability->dom->createElement('div');
        $this->debug($elems->length . ' body elems found');
        foreach ($elems as $elem) {
            $isDescendant = false;
            foreach ($this->body->childNodes as $parent) {
                if ($this->isDescendant($parent, $elem)) {
                    $isDescendant = true;
                    break;
                }
            }
            if ($isDescendant) {
                $this->debug('Element is child of another body element, skipping.');
            } else {
                // prune (clean up elements that may not be content)
                if ($this->config->prune) {
                    $this->debug('Pruning content');
                    $this->readability->prepArticle($elem);
                }
                $this->debug('Element added to body');
                $this->body->appendChild($elem);
            }
        }
        // The matched nodes have been moved out of the document into the wrapper,
        // so stop here: letting a later pattern replace the body would lose them.
        if ($this->body->hasChildNodes()) {
            break;
        }
    }

    // auto detect?
    $detect_title = $detect_body = false;
    // detect title? (no title patterns configured, or they failed and autodetect is allowed)
    if (!isset($this->title)) {
        if (empty($this->config->title) || $this->config->autodetect_on_failure) {
            $detect_title = true;
        }
    }
    // detect body?
    if (!isset($this->body)) {
        if (empty($this->config->body) || $this->config->autodetect_on_failure) {
            $detect_body = true;
        }
    }

    // check for hNews
    if ($detect_title || $detect_body) {
        // check for hentry
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('hNews: found hentry');
            $hentry = $elems->item(0);

            if ($detect_title) {
                // check for entry-title
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-title');
                    $this->title = $elems->item(0)->textContent;
                    $detect_title = false;
                }
            }

            // check for entry-content.
            // according to hAtom spec, if there are multiple elements marked entry-content,
            // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
            if ($detect_body) {
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-content');
                    if ($elems->length == 1) {
                        // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                        $e = $elems->item(0);
                        if ($e->tagName == 'img' || trim($e->textContent) != '') {
                            $this->body = $elems->item(0);
                            // prune (clean up elements that may not be content)
                            if ($this->config->prune) {
                                $this->debug('Pruning content');
                                $this->readability->prepArticle($this->body);
                            }
                            $detect_body = false;
                        } else {
                            $this->debug('hNews: skipping entry-content - appears not to contain content');
                        }
                        unset($e);
                    } else {
                        $this->body = $this->readability->dom->createElement('div');
                        $this->debug($elems->length . ' entry-content elems found');
                        foreach ($elems as $elem) {
                            $isDescendant = false;
                            foreach ($this->body->childNodes as $parent) {
                                if ($this->isDescendant($parent, $elem)) {
                                    $isDescendant = true;
                                    break;
                                }
                            }
                            if ($isDescendant) {
                                $this->debug('Element is child of another body element, skipping.');
                            } else {
                                // prune (clean up elements that may not be content)
                                if ($this->config->prune) {
                                    $this->debug('Pruning content');
                                    $this->readability->prepArticle($elem);
                                }
                                $this->debug('Element added to body');
                                $this->body->appendChild($elem);
                            }
                        }
                        $detect_body = false;
                    }
                }
            }
        }
    }

    // check for elements marked with instapaper_title
    if ($detect_title) {
        // check for instapaper_title
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('title found (.instapaper_title)');
            $this->title = $elems->item(0)->textContent;
            $detect_title = false;
        }
    }
    // check for elements marked with instapaper_body
    if ($detect_body) {
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('body found (.instapaper_body)');
            $this->body = $elems->item(0);
            // prune (clean up elements that may not be content)
            if ($this->config->prune) {
                $this->debug('Pruning content');
                $this->readability->prepArticle($this->body);
            }
            $detect_body = false;
        }
    }

    // still missing title or body, so we detect using Readability
    $success = false;
    if ($detect_title || $detect_body) {
        $this->debug('Using Readability');
        // clone body if we're only using Readability for title (otherwise it may interfere with body element)
        if (isset($this->body)) {
            $this->body = $this->body->cloneNode(true);
        }
        $success = $this->readability->init();
    }
    if ($detect_title) {
        $this->debug('Detecting title');
        $this->title = $this->readability->getTitle()->textContent;
    }
    if ($detect_body && $success) {
        $this->debug('Detecting body');
        $this->body = $this->readability->getContent();
        // unwrap when the detected content is a single element
        if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
            $this->body = $this->body->firstChild;
        }
        // prune (clean up elements that may not be content)
        if ($this->config->prune) {
            $this->debug('Pruning content');
            $this->readability->prepArticle($this->body);
        }
    }

    if (isset($this->body)) {
        // remove scripts
        $this->readability->removeScripts($this->body);
        $this->success = true;
    }

    // if we've had no success and we've used tidy, there's a chance
    // that tidy has messed up. So let's try again without tidy...
    if (!$this->success && $tidied && $smart_tidy) {
        $this->debug('Trying again without tidy');
        $this->process($original_html, $url, false);
    }

    return $this->success;
}
/**
 * Build (or fetch from cache) the merged site config for a URL.
 *
 * The host is lower-cased and a leading "www." is dropped. The host's own
 * config (or an empty default) is then extended with a fingerprint-matched
 * config and the global config, each only while autodetect_on_failure() on the
 * config being built returns a truthy value. The result is cached under
 * "<host>.merged" when $add_to_cache is true.
 *
 * @param string $url          URL whose host selects the site config
 * @param string $html         HTML searched for fingerprints
 * @param bool   $add_to_cache whether configs built here are added to the SiteConfig cache
 *
 * @return SiteConfig the merged config (or whatever SiteConfig::build() returns for the cached merged key)
 */
public function buildSiteConfig($url, $html = '', $add_to_cache = true)
{
    // extract host name, normalised to lower case and without a "www." prefix
    $host = strtolower(@parse_url($url, PHP_URL_HOST));
    if (substr($host, 0, 4) == 'www.') {
        $host = substr($host, 4);
    }
    $merged_key = "{$host}.merged";

    // is merged version already cached?
    if (SiteConfig::is_cached($merged_key)) {
        $this->debug("Returning cached and merged site config for {$host}");
        return SiteConfig::build($merged_key);
    }

    // let's build from site_config/custom/ and standard/
    $merged = SiteConfig::build($host);
    if ($add_to_cache && $merged && !SiteConfig::is_cached("{$host}")) {
        SiteConfig::add_to_cache($host, $merged);
    }
    // if no match, use defaults
    if (!$merged) {
        $merged = new SiteConfig();
    }

    // load fingerprint config? (check HTML for fingerprints)
    if ($merged->autodetect_on_failure()
        && !empty($this->fingerprints)
        && ($_fphost = $this->findHostUsingFingerprints($html))
    ) {
        $fingerprint_config = SiteConfig::build($_fphost);
        if ($fingerprint_config) {
            $this->debug("Appending site config settings from {$_fphost} (fingerprint match)");
            $merged->append($fingerprint_config);
            if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
                SiteConfig::add_to_cache($_fphost, $fingerprint_config);
            }
        }
    }

    // load global config? (asked again because the append above may have changed the setting)
    if ($merged->autodetect_on_failure()) {
        $global_config = SiteConfig::build('global', true);
        if ($global_config) {
            $this->debug('Appending site config settings from global.txt');
            $merged->append($global_config);
            if ($add_to_cache && !SiteConfig::is_cached('global')) {
                SiteConfig::add_to_cache('global', $global_config);
            }
        }
    }

    // store copy of merged config
    if ($add_to_cache) {
        // do not store in APC if wildcard match
        $use_apc = ($host == $merged->cache_key);
        $merged->cache_key = null;
        SiteConfig::add_to_cache($merged_key, $merged, $use_apc);
    }

    return $merged;
}
/**
 * Extract title, author(s), language, date and body of an article from HTML.
 *
 * Steps, in order: load the site config for the URL's host (falling back to a
 * fingerprint match, then defaults), apply the config's string replacements,
 * optionally run Tidy, evaluate the config's title/author/date XPath
 * expressions, strip unwanted elements, match the body, then fall back to
 * hNews markup, Instapaper class names, rel="author" / time[pubdate] and
 * finally Readability auto-detection. If a body was found and a next-page
 * XPath is configured in $this->options->next_page_pattern, the linked page is
 * fetched, extracted and its content appended to the body.
 *
 * @param string $html       HTML to process
 * @param string $url        URL the HTML was fetched from
 * @param bool   $smart_tidy when true, Tidy is used if the site config enables
 *                           it and the tidy extension is loaded; if extraction
 *                           then fails, the method calls itself once more on
 *                           the untidied HTML with this flag set to false
 *
 * @return mixed value of $this->success (set to true once a body was found)
 */
public function process($html, $url, $smart_tidy = true)
{
    $this->reset();

    // extract host name
    $host = @parse_url($url, PHP_URL_HOST);
    if (!($this->config = SiteConfig::build($host))) {
        // no match, check HTML for fingerprints
        if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
            $this->config = SiteConfig::build($_fphost);
        }
        unset($_fphost);
        if (!$this->config) {
            // no match, so use defaults
            $this->config = new SiteConfig();
        }
    }
    // store copy of config in our static cache array in case we need to process another URL
    SiteConfig::add_to_cache($host, $this->config);

    // do string replacements
    foreach ($this->config->replace_string as $_repl) {
        $html = str_replace($_repl[0], $_repl[1], $html);
    }
    unset($_repl);

    // use tidy (if it exists)?
    // This fixes problems with some sites which would otherwise
    // trouble DOMDocument's HTML parsing. (Although sometimes it
    // makes matters worse, which is why you can override it in site config files.)
    $tidied = false;
    $original_html = $html;
    if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
        $this->debug('Using Tidy');
        $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
        // tidy_parse_string() can fail; only use its output when parsing and repair both worked
        if ($tidy && tidy_clean_repair($tidy)) {
            $tidied = true;
            $html = $tidy->value;
        }
        unset($tidy);
    }

    // load and parse html
    $this->readability = new Readability($html, $url);

    // we use xpath to find elements in the given HTML document
    // see http://en.wikipedia.org/wiki/XPath_1.0
    $xpath = new DOMXPath($this->readability->dom);

    // Detaches every node of a node list from its parent, last to first.
    // Nodes without a parent are skipped instead of causing a fatal error.
    $removeNodes = function ($nodes) {
        for ($i = $nodes->length - 1; $i >= 0; $i--) {
            $node = $nodes->item($i);
            if ($node->parentNode) {
                $node->parentNode->removeChild($node);
            }
        }
    };

    // try to get title
    foreach ($this->config->title as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Title expression evaluated as string');
            $this->title = trim($elems);
            break;
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Title matched');
            $title_node = $elems->item(0);
            $this->title = $title_node->textContent;
            // remove title from document (best effort)
            try {
                if ($title_node->parentNode) {
                    $title_node->parentNode->removeChild($title_node);
                }
            } catch (DOMException $e) {
                // do nothing
            }
            break;
        }
    }

    // try to get author (if it hasn't already been set)
    if (empty($this->author)) {
        foreach ($this->config->author as $pattern) {
            $elems = @$xpath->evaluate($pattern, $this->readability->dom);
            if (is_string($elems)) {
                $this->debug('Author expression evaluated as string');
                if (trim($elems) != '') {
                    $this->author[] = trim($elems);
                    break;
                }
            } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                foreach ($elems as $elem) {
                    if (!isset($elem->parentNode)) {
                        continue;
                    }
                    $this->author[] = trim($elem->textContent);
                }
                if (!empty($this->author)) {
                    break;
                }
            }
        }
    }

    // try to get language
    $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
    foreach ($_lang_xpath as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            if (trim($elems) != '') {
                $this->language = trim($elems);
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $elem) {
                if (!isset($elem->parentNode)) {
                    continue;
                }
                $this->language = trim($elem->textContent);
            }
            if ($this->language) {
                break;
            }
        }
    }

    // try to get date
    foreach ($this->config->date as $pattern) {
        $elems = @$xpath->evaluate($pattern, $this->readability->dom);
        if (is_string($elems)) {
            $this->debug('Date expression evaluated as string');
            $this->date = strtotime(trim($elems, "; \t\n\r\v"));
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            $this->debug('Date matched');
            $this->date = $elems->item(0)->textContent;
            $this->date = strtotime(trim($this->date, "; \t\n\r\v"));
        }
        // an unparsable date is treated as "not found" and the next pattern is tried
        if (!$this->date) {
            $this->date = null;
        } else {
            break;
        }
    }

    // strip elements (using xpath expressions)
    foreach ($this->config->strip as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip)');
            $removeNodes($elems);
        }
    }

    // strip elements (using id and class attribute values)
    foreach ($this->config->strip_id_or_class as $string) {
        // quotes are dropped so the value cannot break out of the XPath string literal
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
            $removeNodes($elems);
        }
    }

    // strip images (using src attribute values)
    foreach ($this->config->strip_image_src as $string) {
        $string = strtr($string, array("'" => '', '"' => ''));
        $elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->debug('Stripping ' . $elems->length . ' image elements');
            $removeNodes($elems);
        }
    }

    // strip elements using Readability.com and Instapaper.com ignore class names
    // .entry-unrelated and .instapaper_ignore
    // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
    // and http://blog.instapaper.com/post/730281947
    $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
        $removeNodes($elems);
    }

    // strip elements whose inline style contains the exact text "display:none"
    // (a variant written with a space, "display: none", is not matched)
    $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
    // check for matches
    if ($elems && $elems->length > 0) {
        $this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
        $removeNodes($elems);
    }

    // try to get body
    foreach ($this->config->body as $pattern) {
        $elems = @$xpath->query($pattern, $this->readability->dom);
        // check for matches
        if ($elems && $elems->length > 0) {
            $this->body = $this->getMatchedBody($elems);
            // NOTE(review): the return value was never used by the original code; the call is
            // kept in case retrieveNextPage() has side effects - confirm against its definition.
            $this->retrieveNextPage($xpath, $url);
            if ($elems->length === 1) {
                break;
            }
        }
    }

    // auto detect?
    $detect_title = $detect_body = $detect_author = $detect_date = false;
    // detect title?
    if (!isset($this->title)) {
        if (empty($this->config->title) || $this->config->autodetect_on_failure) {
            $detect_title = true;
        }
    }
    // detect body?
    if (!isset($this->body)) {
        if (empty($this->config->body) || $this->config->autodetect_on_failure) {
            $detect_body = true;
        }
    }
    // detect author?
    if (empty($this->author)) {
        if (empty($this->config->author) || $this->config->autodetect_on_failure) {
            $detect_author = true;
        }
    }
    // detect date?
    if (!isset($this->date)) {
        if (empty($this->config->date) || $this->config->autodetect_on_failure) {
            $detect_date = true;
        }
    }

    // check for hNews
    if ($detect_title || $detect_body) {
        // check for hentry
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('hNews: found hentry');
            $hentry = $elems->item(0);

            if ($detect_title) {
                // check for entry-title
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-title');
                    $this->title = $elems->item(0)->textContent;
                    // remove title from document
                    $removeNodes($elems->item(0)->parentNode ? array_shift_nodes_unused() : $elems);
                    $detect_title = false;
                }
            }

            if ($detect_date) {
                // check for time element with pubdate attribute
                $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found publication date');
                    $this->date = strtotime(trim($elems->item(0)->textContent));
                    if ($this->date) {
                        $detect_date = false;
                    } else {
                        $this->date = null;
                    }
                }
            }

            if ($detect_author) {
                // check for a vcard element that is also marked author or byline
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found author');
                    $author = $elems->item(0);
                    // prefer the formatted-name (fn) elements inside the vcard; else use the vcard's text
                    $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
                    if ($fn && $fn->length > 0) {
                        foreach ($fn as $_fn) {
                            if (trim($_fn->textContent) != '') {
                                $this->author[] = trim($_fn->textContent);
                            }
                        }
                    } else {
                        if (trim($author->textContent) != '') {
                            $this->author[] = trim($author->textContent);
                        }
                    }
                    $detect_author = empty($this->author);
                }
            }

            // check for entry-content.
            // according to hAtom spec, if there are multiple elements marked entry-content,
            // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
            if ($detect_body) {
                $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
                if ($elems && $elems->length > 0) {
                    $this->debug('hNews: found entry-content');
                    if ($elems->length == 1) {
                        // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
                        $e = $elems->item(0);
                        if ($e->tagName == 'img' || trim($e->textContent) != '') {
                            $this->body = $elems->item(0);
                            // prune (clean up elements that may not be content)
                            if ($this->config->prune) {
                                $this->debug('Pruning content');
                                $this->readability->prepArticle($this->body);
                            }
                            $detect_body = false;
                        } else {
                            $this->debug('hNews: skipping entry-content - appears not to contain content');
                        }
                        unset($e);
                    } else {
                        $this->body = $this->readability->dom->createElement('div');
                        $this->debug($elems->length . ' entry-content elems found');
                        foreach ($elems as $elem) {
                            if (!isset($elem->parentNode)) {
                                continue;
                            }
                            $isDescendant = false;
                            foreach ($this->body->childNodes as $parent) {
                                if ($this->isDescendant($parent, $elem)) {
                                    $isDescendant = true;
                                    break;
                                }
                            }
                            if ($isDescendant) {
                                $this->debug('Element is child of another body element, skipping.');
                            } else {
                                // prune (clean up elements that may not be content)
                                if ($this->config->prune) {
                                    $this->debug('Pruning content');
                                    $this->readability->prepArticle($elem);
                                }
                                $this->debug('Element added to body');
                                $this->body->appendChild($elem);
                            }
                        }
                        $detect_body = false;
                    }
                }
            }
        }
    }

    // check for elements marked with instapaper_title
    if ($detect_title) {
        // check for instapaper_title
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('title found (.instapaper_title)');
            $title_node = $elems->item(0);
            $this->title = $title_node->textContent;
            // remove title from document
            if ($title_node->parentNode) {
                $title_node->parentNode->removeChild($title_node);
            }
            $detect_title = false;
        }
    }
    // check for elements marked with instapaper_body
    if ($detect_body) {
        $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
        if ($elems && $elems->length > 0) {
            $this->debug('body found (.instapaper_body)');
            $this->body = $elems->item(0);
            // prune (clean up elements that may not be content)
            if ($this->config->prune) {
                $this->debug('Pruning content');
                $this->readability->prepArticle($this->body);
            }
            $detect_body = false;
        }
    }

    // Find author in rel="author" marked element
    // We only use this if there's exactly one.
    // If there's more than one, it could indicate more than
    // one author, but it could also indicate that we're processing
    // a page listing different articles with different authors.
    if ($detect_author) {
        $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
        if ($elems && $elems->length == 1) {
            $this->debug('Author found (rel="author")');
            $author = trim($elems->item(0)->textContent);
            if ($author != '') {
                $this->author[] = $author;
                $detect_author = false;
            }
        }
    }

    // Find date in pubdate marked time element
    // For the same reason given above, we only use this
    // if there's exactly one element.
    if ($detect_date) {
        $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
        if ($elems && $elems->length == 1) {
            $this->debug('Date found (pubdate marked time element)');
            $this->date = strtotime(trim($elems->item(0)->textContent));
            if ($this->date) {
                $detect_date = false;
            } else {
                $this->date = null;
            }
        }
    }

    // still missing title or body, so we detect using Readability
    $success = false;
    if ($detect_title || $detect_body) {
        $this->debug('Using Readability');
        // clone body if we're only using Readability for title (otherwise it may interfere with body element)
        if (isset($this->body)) {
            $this->body = $this->body->cloneNode(true);
        }
        $success = $this->readability->init();
    }
    if ($detect_title) {
        $this->debug('Detecting title');
        $this->title = $this->readability->getTitle()->textContent;
    }
    if ($detect_body && $success) {
        $this->debug('Detecting body');
        $this->body = $this->readability->getContent();
        // unwrap when the detected content is a single element
        if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
            $this->body = $this->body->firstChild;
        }
        // prune (clean up elements that may not be content)
        if ($this->config->prune) {
            $this->debug('Pruning content');
            $this->readability->prepArticle($this->body);
        }
    }

    if (isset($this->body)) {
        // remove scripts
        $this->readability->removeScripts($this->body);
        // remove any h1-h6 elements that appear as first thing in the body
        // and which match our title
        if (isset($this->title) && $this->title != '') {
            // skip leading non-element nodes (text, comments); stop safely if the body runs out
            $firstChild = $this->body->firstChild;
            while ($firstChild !== null && $firstChild->nodeType !== XML_ELEMENT_NODE) {
                $firstChild = $firstChild->nextSibling;
            }
            if ($firstChild !== null
                && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
                && strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title))
            ) {
                $this->body->removeChild($firstChild);
            }
        }
        $this->success = true;
    }

    // Next-page detection (added 2013-10-11): if a next-page XPath is configured and matches,
    // fetch the linked page, extract its content and append it to the body.
    if (isset($this->body)) {
        $next_page_pattern = isset($this->options->next_page_pattern) ? $this->options->next_page_pattern : null;
        $elems = empty($next_page_pattern) ? false : @$xpath->query($next_page_pattern, $this->readability->dom);
        if ($elems && $elems->length > 0) {
            // the first matched element is expected to carry the link in its href attribute
            $link = $elems->item(0);
            $href = ($link instanceof DOMElement) ? $link->getAttribute('href') : '';

            $permalink = null;
            if ($href !== '') {
                if (substr($href, 0, 4) !== 'http') {
                    // NOTE(review): the href is appended to scheme://host as-is, so only
                    // root-relative ("/...") or query-only ("?...") links resolve correctly.
                    $url_component = parse_url($url);
                    if (is_array($url_component) && isset($url_component['scheme'], $url_component['host'])) {
                        $permalink = $url_component['scheme'] . '://' . $url_component['host'] . $href;
                    }
                } else {
                    $permalink = $href;
                }
            }

            // never re-fetch the page we are already processing
            if ($permalink && $permalink != $url) {
                $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
                $extractor->fingerprints = $this->fingerprints;
                $http = new HumbleHttpAgent();
                $response = $http->get($permalink, true);

                $content_block = null;
                // NOTE(review): statuses above 400 are accepted as in the original code - confirm
                // this is intended (getSinglePage-style callers usually require < 300 only).
                if ($response && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
                    $next_html = $response['body'];
                    // remove strange things
                    $next_html = str_replace('</[>', '', $next_html);
                    $next_html = convert_to_utf8($next_html, $response['headers']);
                    if ($extractor->process($next_html, $permalink)) {
                        $content_block = $extractor->getContent();
                    }
                }

                // the extracted node belongs to another document, so import it before appending
                if ($content_block instanceof DOMNode && $content_block->hasChildNodes()) {
                    $this->body->appendChild($this->readability->dom->importNode($content_block, true));
                }
            }
        }
    }

    // if we've had no success and we've used tidy, there's a chance
    // that tidy has messed up. So let's try again without tidy...
    if (!$this->success && $tidied && $smart_tidy) {
        $this->debug('Trying again without tidy');
        $this->process($original_html, $url, false);
    }

    return $this->success;
}