/**
 * Look for a "single page" link for the article at $url and, if one is found,
 * download the document it points to.
 *
 * The XPath expressions come from the site config built by the extractor. They
 * are evaluated either against the page HTML (single_page_link) or against the
 * feed item description (single_page_link_in_feed). Patterns are tried in
 * order and the first one producing a link wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config declares single_page_link_in_feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL of the current page; relative links are resolved against it
 * @return mixed the value returned by $http->get() when its status code is
 *               below 300, false in every other case
 */
function getSinglePage($item, $html, $url) {
    global $http, $extractor;

    debug('Looking for site config files to see if single page link exists');
    $site_config = $extractor->buildSiteConfig($url, $html);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // An invalid expression makes evaluate() warn and return false;
        // the warning is suppressed so a bad pattern is simply skipped.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string expression that matched nothing evaluates to '', so an
            // empty result must not stop the search: try the next pattern.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
            continue;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the $item parameter is not overwritten
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // If we've got URL, resolve against $url
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    // Loose comparison kept on purpose: the return type of makeAbsoluteStr()
    // is not visible here. Also check it's not what we have already!
    if (!$single_page_url || $single_page_url == $url) {
        return false;
    }

    // Fetch with the single page URL as referer, then always restore the
    // previous referer before looking at the outcome.
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;

    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
/**
 * Look for a "single page" link for the article at $url and, if one is found,
 * download the document it points to.
 *
 * The site config is looked up by the host of $url. When no config exists for
 * that host, one is resolved through the extractor's fingerprints (or an empty
 * SiteConfig is created), cached under the host, and the function returns
 * false without searching for a link.
 *
 * Otherwise the config's XPath expressions are evaluated either against the
 * page HTML (single_page_link) or against the feed item description
 * (single_page_link_in_feed). Patterns are tried in order and the first one
 * producing a link wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config declares single_page_link_in_feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL of the current page; relative links are resolved against it
 * @return mixed the value returned by $http->get() when its status code is
 *               below 300, false in every other case
 */
function getSinglePage($item, $html, $url) {
    global $http, $extractor;

    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        // check for fingerprints
        if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
            $site_config = SiteConfig::build($_fphost);
        }
        if ($site_config === false) {
            $site_config = new SiteConfig();
        }
        SiteConfig::add_to_cache($host, $site_config);
        return false;
    }
    SiteConfig::add_to_cache($host, $site_config);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // An invalid expression makes evaluate() warn and return false;
        // the warning is suppressed so a bad pattern is simply skipped.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string expression that matched nothing evaluates to '', so an
            // empty result must not stop the search: try the next pattern.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
            continue;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the $item parameter is not overwritten
            foreach ($elems as $node) {
                // "break 2" leaves the pattern loop as well: with a plain
                // "break" a later pattern could overwrite the URL found here.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // If we've got URL, resolve against $url
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    // Loose comparison kept on purpose: the return type of makeAbsoluteStr()
    // is not visible here. Also check it's not what we have already!
    if (!$single_page_url || $single_page_url == $url) {
        return false;
    }

    // Fetch with the single page URL as referer, then always restore the
    // previous referer before looking at the outcome.
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;

    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
/**
 * Replace $url and $html with the "single page" version of the article when
 * the site config declares single_page_link and a usable link is found.
 *
 * Patterns are tried in order and the first one producing a link wins. When
 * no link is found, the link cannot be resolved, it points at the current
 * page, or the download yields nothing, $url and $html are left untouched.
 *
 * @param object $extractor object providing buildSiteConfig($url, $html)
 * @param string $url       current page URL; overwritten with the single page URL on success
 * @param string $html      current page HTML; overwritten with the downloaded document on success
 * @return void
 */
protected function check_single_page($extractor, &$url, &$html) {
    $site_config = $extractor->buildSiteConfig($url, $html);
    // !empty() avoids an undefined-index notice when the request carries no
    // "xdebug" parameter; the resulting truthiness is unchanged.
    $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || !empty($_REQUEST['xdebug']);

    if (empty($site_config->single_page_link)) {
        _debug("SiteConfig doesn't declare single_page_link", $debug_enabled);
        return;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $single_page_url = null;
    foreach ($site_config->single_page_link as $pattern) {
        _debug("Trying pattern: {$pattern}", $debug_enabled);
        // An invalid expression makes evaluate() warn and return false;
        // the warning is suppressed so a bad pattern is simply skipped.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            _debug(". matched and returned a string", $debug_enabled);
            // A string expression that matched nothing evaluates to '', so an
            // empty result must not stop the search: try the next pattern.
            $candidate = trim($elems);
            if ($candidate === '') {
                _debug("... string is empty, trying next pattern", $debug_enabled);
                continue;
            }
            $single_page_url = $candidate;
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            _debug(". matched and returned a node list", $debug_enabled);
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    _debug("... got an element, using href attribute", $debug_enabled);
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    _debug("... got an attribute, using its value", $debug_enabled);
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    if (empty($single_page_url)) {
        _debug("no single_page_url found, continuing with main page", $debug_enabled);
        return;
    }
    _debug("extracted single_page_url: {$single_page_url}", $debug_enabled);

    // If we've got URL, resolve against $url
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    if (empty($single_page_url)) {
        // Never hand an empty/false path to file_get_contents()
        _debug("... could not convert single_page_url to an absolute URL", $debug_enabled);
        return;
    }
    _debug("... converted to absolute single_page_url: {$single_page_url}", $debug_enabled);

    if ($single_page_url == $url) {
        _debug("single_page_url equals current page", $debug_enabled);
        return;
    }

    // file_get_contents() returns false on failure, which empty() also catches
    $single_page_html = file_get_contents($single_page_url);
    if (empty($single_page_html)) {
        _debug("single_page_url document is empty", $debug_enabled);
        return;
    }

    $html = $single_page_html;
    $url = $single_page_url;
}
$readability = $extractor->readability; $content_block = $extract_result ? $extractor->getContent() : null; $extracted_title = $extract_result ? $extractor->getTitle() : ''; // Deal with multi-page articles //die('Next: '.$extractor->getNextPageUrl()); $is_multi_page = !$is_single_page && $extract_result && $extractor->getNextPageUrl(); if ($options->multipage && $is_multi_page && $options->content) { debug('--------'); debug('Attempting to process multi-page article'); $multi_page_urls = array(); $multi_page_content = array(); while ($next_page_url = $extractor->getNextPageUrl()) { debug('--------'); debug('Processing next page: ' . $next_page_url); // If we've got URL, resolve against $url if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) { // check it's not what we have already! if (!in_array($next_page_url, $multi_page_urls)) { // it's not, so let's attempt to fetch it $multi_page_urls[] = $next_page_url; $_prev_ref = $http->referer; if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { // make sure mime type is not something with a different action associated $page_mime_info = get_mime_action_info($response['headers']); if (!isset($page_mime_info['action'])) { $html = $response['body']; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $response['headers']); if ($extractor->process($html, $next_page_url)) { $multi_page_content[] = $extractor->getContent();
/**
 * Look for a "single page" link for the article at $url and, if one is found,
 * download the document it points to.
 *
 * The site config is looked up by the host of $url; without a config the
 * function returns false. The config's XPath expressions are evaluated either
 * against the page HTML (single_page_link) or against the feed item
 * description (single_page_link_in_feed). Patterns are tried in order and the
 * first one producing a link wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config declares single_page_link_in_feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL of the current page; relative links are resolved against it
 * @return mixed the value returned by $http->get() when its status code is
 *               below 300, false in every other case
 */
function getSinglePage($item, $html, $url) {
    global $http;

    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        return false;
    }

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // An invalid expression makes evaluate() warn and return false;
        // the warning is suppressed so a bad pattern is simply skipped.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string expression that matched nothing evaluates to '', so an
            // empty result must not stop the search: try the next pattern.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
            continue;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the $item parameter is not overwritten
            foreach ($elems as $node) {
                // The node list may hold attribute or text nodes, which have no
                // hasAttribute() method, so check the node type first.
                // "break 2" leaves the pattern loop as well, so a later
                // pattern cannot overwrite the URL found here.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                }
                // Pattern selected the attribute itself (e.g. ".../@href")
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // If we've got URL, resolve against $url
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    // Loose comparison kept on purpose: the return type of makeAbsoluteStr()
    // is not visible here. Also check it's not what we have already!
    if (!$single_page_url || $single_page_url == $url) {
        return false;
    }

    // it's not, so let's try to fetch it...
    $response = $http->get($single_page_url, true);
    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}