//TODO: cleanup URL //////////////////////////////// // Base URL //////////////////////////////// $_host = $_SERVER['HTTP_HOST']; $_path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'); $base = 'http://' . htmlspecialchars($_host . $_path); //TODO: use HubmleHTTPAgent require_once 'lib/simplepie/autoloader.php'; require_once 'lib/humble-http-agent/HumbleHttpAgent.php'; require_once 'lib/humble-http-agent/CookieJar.php'; $html = ''; $_req_options = null; $http = new HumbleHttpAgent($_req_options); //$http->debug = true; if (($response = $http->get($url, true)) && $response['status_code'] < 300) { $html = $response['body']; //$html = convert_to_utf8($html, $response['headers']); //$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); } else { die('Failed to fetch URL'); } if (trim($html) == '') { die('Empty response :('); } // use Tidy? if (isset($_GET['tidy']) && $_GET['tidy'] === '1') { if (!function_exists('tidy_parse_string')) { die('Tidy requested but not available on server.'); } $tidy_config = array('clean' => true, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 'new-inline-tags' => 'mark, time, meter, progress, data', 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, 'enclose-text' => true, 'enclose-block-text' => true, 'merge-divs' => true, 'merge-spans' => true, 'char-encoding' => 'utf8', 'hide-comments' => true);
echo '<p>If you have any trouble, please contact us via our <a href="http://help.fivefilters.org">support site</a>.</p>'; exit; } ////////////////////////////////// // Check update key valid ////////////////////////////////// if ($_REQUEST['key'] !== $admin_hash) { println("Sorry, invalid key supplied."); exit; } ////////////////////////////////// // Check for updates ////////////////////////////////// //$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); $http = new HumbleHttpAgent(); $latest_info_json = $http->get('https://api.github.com/repos/fivefilters/ftr-site-config'); //$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.5'), 'ssl'=>array('verify_peer'=>false))); //$latest_info_json = file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context); if (!$latest_info_json) { println("Sorry, couldn't get info on latest site config files. Please try again later or contact us."); exit; } $latest_info_json = $latest_info_json['body']; $latest_info_json = @json_decode($latest_info_json); if (!is_object($latest_info_json)) { println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us."); exit; } $ff_version = $latest_info_json->pushed_at; if ($version == $ff_version) { die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org');
$feed->set_url_replacements(array()); // initialise the feed // the @ suppresses notices which on some servers causes a 500 internal server error $result = @$feed->init(); //$feed->handle_content_type(); //$feed->get_title(); if ($result && (!is_array($feed->data) || count($feed->data) == 0)) { die('Sorry, no feed items found'); } } //////////////////////////////////////////////////////////////////////////////// // Extract content from HTML (if URL is not feed or explicit HTML request has been made) //////////////////////////////////////////////////////////////////////////////// if ($html_only || !$result) { unset($feed, $result); if ($response = $http->get($url)) { $effective_url = $response['effective_url']; $html = $response['body']; $html = convert_to_utf8($html, $response['headers']); } else { die('Error retrieving ' . $url); } if ($auto_extract) { // Run through Tidy (if it exists). // This fixes problems with some sites which would otherwise // trouble DOMDocument's HTML parsing. if (function_exists('tidy_parse_string')) { $tidy = tidy_parse_string($html, $tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { $html = $tidy->value; }
/**
 * Trigger a forced refresh of every subscription through the HTTP agent.
 *
 * One force-refresh URL (pub/index.php?q=force-refresh&id=...) is built per
 * subscription, the whole list is handed to HumbleHttpAgent::fetchAll(), and
 * each URL is then requested via get(). Response bodies are not used; a
 * response that is missing or carries a status in the 300-400 range is
 * reported on the DBG_FEED debug channel.
 *
 * @param array|Traversable $subscriptions Subscription objects exposing ->source->id.
 * @return void
 */
protected function updateAllParallel($subscriptions) {
    zf_debugRuntime("before feeds parallel update");

    $urls = array();
    foreach ($subscriptions as $sub) {
        // NOTE(review): the id is concatenated unescaped - presumably always URL-safe; confirm.
        $urls[] = ZF_URL . '/pub/index.php?q=force-refresh&id=' . $sub->source->id;
    }

    // Request all feed items in parallel (if supported)
    $http = new HumbleHttpAgent();
    $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
    zf_debug('fetching all ' . count($urls) . ' feeds', DBG_FEED);
    $http->fetchAll($urls);

    foreach ($urls as $url) {
        zf_debug('going after ' . $url, DBG_FEED);

        // get() is still called for every URL so each one is actually requested;
        // only the outcome is inspected, the payload is discarded.
        $response = $http->get($url, true);
        $usable = $response
            && ($response['status_code'] < 300 || $response['status_code'] > 400);
        if (!$usable) {
            zf_debug('no usable response from ' . $url, DBG_FEED);
        }
    }

    zf_debugRuntime("End of parallel update");
}
/**
 * Fetch a follow-up page and return its extracted content block.
 *
 * A separate ContentExtractor (using the site_config/custom and
 * site_config/standard directories next to this file) processes the page at
 * $permalink. The list of already-visited pages and the nesting depth are
 * carried over from this instance so that chains of pages cannot loop or grow
 * beyond three levels.
 *
 * @param string $permalink URL of the page to fetch and extract.
 * @return DOMNode|false The content block imported into $this->readability->dom,
 *                       or false when the page was already visited, the chain is
 *                       too deep, the fetch is rejected, or nothing could be
 *                       extracted.
 */
public function extractContentBlock($permalink) {
    // Nothing to fetch.
    if (!$permalink) {
        return false;
    }

    // Stop on pages already seen in this chain and on chains deeper than three pages.
    $depth = $this->next_page_deep_count + 1;
    if ($depth > 3 || in_array($permalink, $this->next_pages, true)) {
        return false;
    }

    $extractor = new ContentExtractor(
        dirname(__FILE__) . '/site_config/custom',
        dirname(__FILE__) . '/site_config/standard'
    );
    $extractor->next_page_deep_count = $depth;
    $extractor->next_pages = $this->next_pages;
    $extractor->next_pages[] = $permalink;
    $extractor->fingerprints = $this->fingerprints;

    // Fetch once and reuse the response below.
    $http = new HumbleHttpAgent();
    $response = $http->get($permalink, true);

    // Only a missing response or a status in the 300-400 range is rejected;
    // every other status is handed to the extractor.
    if (!$response || !($response['status_code'] < 300 || $response['status_code'] > 400)) {
        return false;
    }

    // remove strange things
    $html = str_replace('</[>', '', $response['body']);
    $html = convert_to_utf8($html, $response['headers']);
    if (function_exists('mb_convert_encoding')) {
        // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
        // kept unchanged here so the markup handed to the extractor stays the same.
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    }

    if (!$extractor->process($html, $permalink)) {
        return false;
    }

    $content_block = $extractor->getContent();
    if (!$content_block) {
        return false;
    }

    // An empty string makes DOMDocument::loadHTML() throw on PHP 8, which the
    // @ operator cannot suppress, so bail out before parsing.
    $inner_html = (string) $content_block->innerHTML;
    if ($inner_html === '') {
        return false;
    }

    // Parsing into a scratch document is only a sanity check of the markup.
    $doc = new DOMDocument();
    if (!@$doc->loadHTML($inner_html)) {
        return false;
    }

    return $this->readability->dom->importNode($content_block, true);
}