/**
 * Try to fetch the "single page" version of an article.
 *
 * The site config built for $url may carry XPath expressions that locate a
 * single-page link, either in the page HTML (single_page_link) or in the feed
 * item description (single_page_link_in_feed). Expressions are tried in order
 * and the first one that yields a non-empty URL wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config targets the feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL the HTML was fetched from; the link is resolved against it
 *
 * @return mixed the response returned by $http->get() when its status code is
 *               below 300, otherwise false
 */
function getSinglePage($item, $html, $url)
{
    global $http, $extractor;
    debug('Looking for site config files to see if single page link exists');
    $site_config = $extractor->buildSiteConfig($url, $html);
    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // The XPath is targeted at the feed, so evaluate it against the
        // feed item description instead of the page HTML.
        $splink = $site_config->single_page_link_in_feed;
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through the XPath expressions until one produces a URL
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // Errors are suppressed so a malformed expression in a site config
        // does not abort the lookup; the next pattern is simply tried.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string-typed expression always returns a string, '' when
            // nothing matched. Only a non-empty result counts as a match,
            // otherwise later patterns would never be tried.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                } elseif ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // Resolve against $url; a falsy result means no usable absolute URL
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    if (!$single_page_url || $single_page_url == $url) {
        // Nothing usable, or it is the page we already have
        return false;
    }

    // Fetch with the target URL as referer, then restore the previous
    // referer regardless of the outcome.
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;

    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
/**
 * Try to fetch the "single page" version of an article.
 *
 * Looks up the site config for the host of $url. When none exists, a config
 * found via fingerprints (or an empty one) is cached for the host and the
 * function returns false without looking for a single-page link. Otherwise the
 * config's XPath expressions (single_page_link, or single_page_link_in_feed
 * evaluated against the feed item description) are tried in order, and the
 * first one that yields a non-empty URL wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config targets the feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL the HTML was fetched from; the link is resolved against it
 *
 * @return mixed the response returned by $http->get() when its status code is
 *               below 300, otherwise false
 */
function getSinglePage($item, $html, $url)
{
    global $http, $extractor;
    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        // check for fingerprints
        if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
            $site_config = SiteConfig::build($_fphost);
        }
        if ($site_config === false) {
            $site_config = new SiteConfig();
        }
        SiteConfig::add_to_cache($host, $site_config);
        // NOTE(review): this returns false even when the fingerprint lookup
        // produced a config; kept as-is, confirm whether that is intended.
        return false;
    }
    SiteConfig::add_to_cache($host, $site_config);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // The XPath is targeted at the feed, so evaluate it against the
        // feed item description instead of the page HTML.
        $splink = $site_config->single_page_link_in_feed;
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through the XPath expressions until one produces a URL
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // Errors are suppressed so a malformed expression in a site config
        // does not abort the lookup; the next pattern is simply tried.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string-typed expression always returns a string, '' when
            // nothing matched. Only a non-empty result counts as a match,
            // otherwise later patterns would never be tried.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $node) {
                // "break 2" leaves the pattern loop as well, so a later
                // pattern cannot overwrite the URL found here.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                } elseif ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // Resolve against $url; a falsy result means no usable absolute URL
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    if (!$single_page_url || $single_page_url == $url) {
        // Nothing usable, or it is the page we already have
        return false;
    }

    // Fetch with the target URL as referer, then restore the previous
    // referer regardless of the outcome.
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;

    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
Example #3
0
 /**
  * Replace $url and $html with the single-page version of the article, when
  * the site config declares single_page_link XPath expressions and one of
  * them yields a fetchable http(s) URL different from the current page.
  *
  * Both reference parameters are left untouched when no single page is found
  * or it cannot be fetched.
  *
  * @param object $extractor object providing buildSiteConfig($url, $html)
  * @param string $url       current page URL; overwritten with the single-page URL on success
  * @param string $html      current page HTML; overwritten with the single-page HTML on success
  *
  * @return void
  */
 protected function check_single_page($extractor, &$url, &$html)
 {
     $site_config = $extractor->buildSiteConfig($url, $html);
     // !empty() avoids an undefined-index warning when "xdebug" is not part
     // of the request; truthiness is otherwise unchanged.
     $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || !empty($_REQUEST['xdebug']);
     if (empty($site_config->single_page_link)) {
         _debug("SiteConfig doesn't declare single_page_link", $debug_enabled);
         return;
     }
     // Build DOM tree from HTML
     $readability = new Readability($html, $url);
     $xpath = new DOMXPath($readability->dom);
     // Loop through single_page_link xpath expressions
     $single_page_url = null;
     foreach ($site_config->single_page_link as $pattern) {
         _debug("Trying pattern: {$pattern}", $debug_enabled);
         // Errors are suppressed so a malformed expression in a site config
         // does not abort the lookup; the next pattern is simply tried.
         $elems = @$xpath->evaluate($pattern, $readability->dom);
         if (is_string($elems)) {
             _debug(". matched and returned a string", $debug_enabled);
             // A string-typed expression returns '' when nothing matched;
             // only stop on a non-empty result so later patterns get a turn.
             $candidate = trim($elems);
             if ($candidate !== '') {
                 $single_page_url = $candidate;
                 break;
             }
             _debug("... but the string is empty, trying next pattern", $debug_enabled);
         } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
             _debug(". matched and returned a node list", $debug_enabled);
             foreach ($elems as $node) {
                 if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                     _debug("... got an element, using href attribute", $debug_enabled);
                     $single_page_url = $node->getAttribute('href');
                     break 2;
                 } elseif ($node instanceof DOMAttr && $node->value) {
                     _debug("... got an attribute, using its value", $debug_enabled);
                     $single_page_url = $node->value;
                     break 2;
                 }
             }
         }
     }
     if (empty($single_page_url)) {
         _debug("no single_page_url found, continuing with main page", $debug_enabled);
         return;
     }
     _debug("extracted single_page_url: {$single_page_url}", $debug_enabled);
     // If we've got URL, resolve against $url
     $single_page_url = makeAbsoluteStr($url, $single_page_url);
     if (empty($single_page_url)) {
         // Nothing to fetch; passing an empty path on would fail
         _debug("single_page_url could not be converted to an absolute URL", $debug_enabled);
         return;
     }
     _debug("... converted to absolute single_page_url: {$single_page_url}", $debug_enabled);
     if ($single_page_url == $url) {
         _debug("single_page_url equals current page", $debug_enabled);
         return;
     }
     // The URL comes from remote HTML. Restrict it to web schemes so a link
     // such as file:// or php:// cannot make file_get_contents() read local
     // resources.
     $scheme = strtolower((string) parse_url($single_page_url, PHP_URL_SCHEME));
     if (!in_array($scheme, array('http', 'https'), true)) {
         _debug("single_page_url has unsupported scheme, ignoring", $debug_enabled);
         return;
     }
     // file_get_contents() returns false on failure, which empty() covers
     $single_page_html = file_get_contents($single_page_url);
     if (empty($single_page_html)) {
         _debug("single_page_url document is empty", $debug_enabled);
         return;
     }
     $html = $single_page_html;
     $url = $single_page_url;
 }
Example #4
0
 $readability = $extractor->readability;
 $content_block = $extract_result ? $extractor->getContent() : null;
 $extracted_title = $extract_result ? $extractor->getTitle() : '';
 // Deal with multi-page articles
 //die('Next: '.$extractor->getNextPageUrl());
 $is_multi_page = !$is_single_page && $extract_result && $extractor->getNextPageUrl();
 if ($options->multipage && $is_multi_page && $options->content) {
     debug('--------');
     debug('Attempting to process multi-page article');
     $multi_page_urls = array();
     $multi_page_content = array();
     while ($next_page_url = $extractor->getNextPageUrl()) {
         debug('--------');
         debug('Processing next page: ' . $next_page_url);
         // If we've got URL, resolve against $url
         if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) {
             // check it's not what we have already!
             if (!in_array($next_page_url, $multi_page_urls)) {
                 // it's not, so let's attempt to fetch it
                 $multi_page_urls[] = $next_page_url;
                 $_prev_ref = $http->referer;
                 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {
                     // make sure mime type is not something with a different action associated
                     $page_mime_info = get_mime_action_info($response['headers']);
                     if (!isset($page_mime_info['action'])) {
                         $html = $response['body'];
                         // remove strange things
                         $html = str_replace('</[>', '', $html);
                         $html = convert_to_utf8($html, $response['headers']);
                         if ($extractor->process($html, $next_page_url)) {
                             $multi_page_content[] = $extractor->getContent();
/**
 * Try to fetch the "single page" version of an article.
 *
 * Looks up the site config for the host of $url and returns false when there
 * is none. Otherwise the config's XPath expressions (single_page_link, or
 * single_page_link_in_feed evaluated against the feed item description) are
 * tried in order, and the first one that yields a non-empty URL wins.
 *
 * @param object $item feed item; only get_description() is called on it, and
 *                     only when the config targets the feed
 * @param string $html HTML of the page at $url
 * @param string $url  URL the HTML was fetched from; the link is resolved against it
 *
 * @return mixed the response returned by $http->get() when its status code is
 *               below 300, otherwise false
 */
function getSinglePage($item, $html, $url)
{
    global $http;
    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        return false;
    }

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // The XPath is targeted at the feed, so evaluate it against the
        // feed item description instead of the page HTML.
        $splink = $site_config->single_page_link_in_feed;
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through the XPath expressions until one produces a URL
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // Errors are suppressed so a malformed expression in a site config
        // does not abort the lookup; the next pattern is simply tried.
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // A string-typed expression always returns a string, '' when
            // nothing matched. Only a non-empty result counts as a match,
            // otherwise later patterns would never be tried.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            foreach ($elems as $node) {
                // A node list may hold attribute or text nodes, which have no
                // hasAttribute(); check the node type before calling it.
                // "break 2" also leaves the pattern loop so a later pattern
                // cannot overwrite the URL found here.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                } elseif ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }
    if (!isset($single_page_url)) {
        return false;
    }

    // Resolve against $url; a falsy result means no usable absolute URL
    $single_page_url = makeAbsoluteStr($url, $single_page_url);
    if (!$single_page_url || $single_page_url == $url) {
        // Nothing usable, or it is the page we already have
        return false;
    }

    $response = $http->get($single_page_url, true);
    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}