Esempio n. 1
0
//TODO: cleanup URL
////////////////////////////////
// Base URL
////////////////////////////////
// Build the base URL of the directory this script lives in, using the
// request's Host header and the script path with trailing slashes and
// backslashes stripped.
$_host = $_SERVER['HTTP_HOST'];
$_path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
// NOTE(review): the scheme is hard-coded to http:// -- confirm this is
// intended when the script is served over HTTPS.
$base = 'http://' . htmlspecialchars($_host . $_path);
//TODO: use HumbleHttpAgent
require_once 'lib/simplepie/autoloader.php';
require_once 'lib/humble-http-agent/HumbleHttpAgent.php';
require_once 'lib/humble-http-agent/CookieJar.php';
// Fetch the page at $url and keep its body in $html. The script stops with
// a plain-text message if the request fails, returns a status code of 300
// or above, or yields a body that is empty after trimming.
// NOTE(review): $url is not assigned in the visible part of this script --
// presumably it is set earlier; verify.
$html = '';
$_req_options = null;
$http = new HumbleHttpAgent($_req_options);
//$http->debug = true;
if (($response = $http->get($url, true)) && $response['status_code'] < 300) {
    $html = $response['body'];
    //$html = convert_to_utf8($html, $response['headers']);
    //$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
} else {
    die('Failed to fetch URL');
}
// A body consisting only of whitespace is treated the same as no body.
if (trim($html) == '') {
    die('Empty response :(');
}
// use Tidy?
if (isset($_GET['tidy']) && $_GET['tidy'] === '1') {
    if (!function_exists('tidy_parse_string')) {
        die('Tidy requested but not available on server.');
    }
    $tidy_config = array('clean' => true, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 'new-inline-tags' => 'mark, time, meter, progress, data', 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, 'enclose-text' => true, 'enclose-block-text' => true, 'merge-divs' => true, 'merge-spans' => true, 'char-encoding' => 'utf8', 'hide-comments' => true);
Esempio n. 2
0
    echo '<p>If you have any trouble, please contact us via our <a href="http://help.fivefilters.org">support site</a>.</p>';
    exit;
}
//////////////////////////////////
// Check update key valid
//////////////////////////////////
// Reject the request unless the supplied key matches $admin_hash.
// - A missing or non-string key (e.g. key[]=x) is rejected without raising
//   an undefined-index notice or a type error.
// - hash_equals() compares in constant time, so response timing does not
//   reveal how much of the key matched.
// - A non-string $admin_hash is rejected as well, which keeps this check at
//   least as strict as the previous `!==` comparison.
$supplied_key = isset($_REQUEST['key']) ? $_REQUEST['key'] : null;
if (!is_string($admin_hash) || !is_string($supplied_key) || !hash_equals($admin_hash, $supplied_key)) {
    println("Sorry, invalid key supplied.");
    exit;
}
//////////////////////////////////
// Check for updates
//////////////////////////////////
//$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt');
// Ask GitHub for the repository metadata of the site config files and take
// its `pushed_at` value as the latest available version ($ff_version).
$http = new HumbleHttpAgent();
$latest_info_json = $http->get('https://api.github.com/repos/fivefilters/ftr-site-config');
//$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.5'), 'ssl'=>array('verify_peer'=>false)));
//$latest_info_json = file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context);
if (!$latest_info_json) {
    println("Sorry, couldn't get info on latest site config files. Please try again later or contact us.");
    exit;
}
// A response without a body decodes to null below and is reported as a
// parse failure, so no error suppression is needed on json_decode().
$latest_info_body = isset($latest_info_json['body']) ? (string) $latest_info_json['body'] : '';
$latest_info_json = json_decode($latest_info_body);
// An error payload (for example a rate-limit message) is still a JSON
// object but has no `pushed_at` field. Treat it as unusable rather than
// letting $ff_version silently become null.
if (!is_object($latest_info_json) || !isset($latest_info_json->pushed_at)) {
    println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us.");
    exit;
}
$ff_version = $latest_info_json->pushed_at;
if ($version == $ff_version) {
    die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org');
    $feed->set_url_replacements(array());
    // initialise the feed
    // the @ suppresses notices which on some servers causes a 500 internal server error
    $result = @$feed->init();
    //$feed->handle_content_type();
    //$feed->get_title();
    if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
        die('Sorry, no feed items found');
    }
}
////////////////////////////////////////////////////////////////////////////////
// Extract content from HTML (if URL is not feed or explicit HTML request has been made)
////////////////////////////////////////////////////////////////////////////////
if ($html_only || !$result) {
    unset($feed, $result);
    if ($response = $http->get($url)) {
        $effective_url = $response['effective_url'];
        $html = $response['body'];
        $html = convert_to_utf8($html, $response['headers']);
    } else {
        die('Error retrieving ' . $url);
    }
    if ($auto_extract) {
        // Run through Tidy (if it exists).
        // This fixes problems with some sites which would otherwise
        // trouble DOMDocument's HTML parsing.
        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($html, $tidy_config, 'UTF8');
            if (tidy_clean_repair($tidy)) {
                $html = $tidy->value;
            }
Esempio n. 4
0
 /**
  * Trigger a forced refresh of every given subscription.
  *
  * One force-refresh URL is built per subscription, all URLs are handed to
  * HumbleHttpAgent::fetchAll() in one batch, and each response is then read
  * back once. Responses are only inspected to log failures; nothing is
  * returned to the caller.
  *
  * @param array|Traversable $subscriptions Items exposing ->source->id.
  * @return void
  */
 protected function updateAllParallel($subscriptions)
 {
     zf_debugRuntime("before feeds parallel update");
     $urls = array();
     foreach ($subscriptions as $sub) {
         // Encode the id so an unexpected character cannot alter the query string.
         $urls[] = ZF_URL . '/pub/index.php?q=force-refresh&id=' . urlencode((string) $sub->source->id);
     }
     // Request all feed items in parallel (if supported)
     $http = new HumbleHttpAgent();
     $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
     zf_debug('fetching all ' . count($urls) . ' feeds', DBG_FEED);
     $http->fetchAll($urls);
     foreach ($urls as $url) {
         zf_debug('going after ' . $url, DBG_FEED);
         // NOTE(review): presumably this returns the response already fetched by
         // fetchAll() -- confirm against HumbleHttpAgent::get().
         $response = $http->get($url, true);
         // Same acceptance rule as before: status below 300 or above 400 counts
         // as success. Anything else is now logged instead of silently ignored.
         if (!$response || ($response['status_code'] >= 300 && $response['status_code'] <= 400)) {
             zf_debug('refresh request failed for ' . $url, DBG_FEED);
         }
     }
     zf_debugRuntime("End of parallel update");
 }
 /**
  * Fetch a follow-up page and return its extracted content block.
  *
  * A separate ContentExtractor is created for the page. It receives this
  * object's list of already-visited pages and a depth counter one higher
  * than this object's, so the same page is never processed twice and the
  * counter is never allowed to exceed 3.
  *
  * @param string $permalink URL of the page to fetch.
  * @return DOMNode|false The content block imported into
  *                       $this->readability->dom, or false when the page was
  *                       already visited, the depth limit is exceeded, the
  *                       fetch fails, or no usable content is extracted.
  */
 public function extractContentBlock($permalink)
 {
     $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
     $extractor->next_page_deep_count = $this->next_page_deep_count + 1;
     $extractor->next_pages = $this->next_pages;
     // Never process the same page twice.
     if (in_array($permalink, $extractor->next_pages, true)) {
         return false;
     }
     $extractor->next_pages[] = $permalink;
     if ($extractor->next_page_deep_count > 3) {
         return false;
     }
     // NOTE(review): $elem is never used. The construction is kept only in case
     // the ContentExtractor constructor has side effects -- confirm and remove.
     $elem = new ContentExtractor($this->path, $this->fallback);
     $extractor->fingerprints = $this->fingerprints;
     if (!$permalink) {
         return false;
     }
     // Fetch the page once; the previous code issued the same request twice.
     $http = new HumbleHttpAgent();
     $response = $http->get($permalink, true);
     // Only status codes below 300 or above 400 are accepted.
     // NOTE(review): codes above 400 pass this check too -- presumably because
     // error pages may still carry content; confirm.
     if (!$response || !($response['status_code'] < 300 || $response['status_code'] > 400)) {
         return false;
     }
     $html = $response['body'];
     // remove strange things
     $html = str_replace('</[>', '', $html);
     $html = convert_to_utf8($html, $response['headers']);
     if (function_exists('mb_convert_encoding')) {
         $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     }
     if (!$extractor->process($html, $permalink)) {
         return false;
     }
     $content_block = $extractor->getContent();
     if (!is_object($content_block)) {
         return false;
     }
     // An empty block is rejected up front: DOMDocument::loadHTML() refuses an
     // empty string, and on PHP 8 it throws instead of returning false.
     $inner_html = $content_block->innerHTML;
     if (!is_string($inner_html) || $inner_html === '') {
         return false;
     }
     // Parse check only; @ silences libxml warnings about malformed markup.
     $doc = new DOMDocument();
     if (!@$doc->loadHTML($inner_html)) {
         return false;
     }
     return $this->readability->dom->importNode($content_block, true);
 }