/**
 * Download a page and extract its main article content.
 *
 * The page is fetched with file_get_contents(), run through ContentExtractor,
 * cleaned, has its URLs made absolute, and is finally serialised back to an
 * HTML string.
 *
 * @param string $url Absolute URL of the page to fetch.
 *
 * @return array|false|null null when $url does not pass FILTER_VALIDATE_URL;
 *                          false when the scheme is not http/https/ftp, the
 *                          download is empty, or the extractor reports failure;
 *                          otherwise an array with the keys 'title', 'authors',
 *                          'date' and 'html'.
 */
protected function extract_content($url) {
    // !empty() keeps the original truthiness test but no longer raises an
    // "undefined index / array key" diagnostic when the parameter is absent.
    $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || !empty($_REQUEST['xdebug']);

    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return null;
    }

    // URL schemes are case-insensitive, so normalise before the strict
    // whitelist comparison ("HTTP://..." is as valid as "http://...").
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
    if (!in_array($scheme, array('http', 'https', 'ftp'), true)) {
        return false;
    }

    $html = file_get_contents($url);
    if (empty($html)) {
        return false;
    }

    // The extractor prints its debug trace directly; wrap it in <pre> so it
    // stays readable in a browser.
    if ($debug_enabled) {
        echo "<pre>";
    }

    $extractor = new ContentExtractor(__DIR__ . '/ftr-site-config', __DIR__ . '/site-config.local');
    SiteConfig::$debug = $debug_enabled;
    $extractor->debug = $debug_enabled;

    // NOTE(review): check_single_page() receives $html by value here unless its
    // signature takes it by reference -- confirm whether it is meant to swap in
    // the single-page version of the document.
    $this->check_single_page($extractor, $url, $html);
    $extract_result = $extractor->process($html, $url);

    if ($debug_enabled) {
        echo "</pre>";
    }

    if (!$extract_result) {
        return false;
    }

    $content_block = $extractor->getContent();
    $extractor->readability->clean($content_block, 'select');

    // Get base URL; fall back to the page URL when the document declares none.
    $base_url = get_base_url($extractor->readability->dom);
    if (!$base_url) {
        $base_url = $url;
    }

    // Rewrite URLs inside the content block against the base URL.
    makeAbsolute($base_url, $content_block);

    // Tag names treated as pure wrappers around the real content.
    $container_tags = array('div', 'article', 'section', 'header', 'footer');

    // Remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
    while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
        // Only descend through wrapper tags.
        if (!in_array(strtolower($content_block->tagName), $container_tags, true)) {
            break;
        }
        $content_block = $content_block->firstChild;
    }

    // Convert content block to an HTML string.
    // Need to preserve things like body: //img[@id='feature']
    if (in_array(strtolower($content_block->tagName), $container_tags, true)) {
        // Wrapper element: emit only its children.
        // NOTE(review): innerHTML is presumably provided by the DOM element
        // class the extractor registers -- confirm.
        $html = $content_block->innerHTML;
    } else {
        // Meaningful element: keep the element itself (essentially outerHTML).
        $html = $content_block->ownerDocument->saveXML($content_block);
    }

    // Post-processing cleanup: drop paragraphs that contain only whitespace.
    $html = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);

    return array(
        'title' => $extractor->getTitle(),
        'authors' => $extractor->getAuthors(),
        'date' => $extractor->getDate(),
        'html' => $html,
    );
}
} ////////////////////////////////// // Set up HTTP agent ////////////////////////////////// $http = new HumbleHttpAgent(); $http->debug = $debug_mode; $http->userAgentMap = $options->user_agents; $http->headerOnlyTypes = array_keys($options->content_type_exc); $http->rewriteUrls = $options->rewrite_url; //$http->initCache($options->cache_dir, $options->cache_directory_level, $options->cache_cleanup, isset($options->http_cache_ttl) ? $options->http_cache_ttl : 12*60*60); ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard'); $extractor->debug = $debug_mode; SiteConfig::$debug = $debug_mode; SiteConfig::use_apc($options->apc); $extractor->fingerprints = $options->fingerprints; $extractor->allowedParsers = $options->allowed_parsers; //////////////////////////////// // Get RSS/Atom feed //////////////////////////////// if (!$html_only) { debug('--------'); debug("Attempting to process URL as feed"); // Send user agent header showing PHP (prevents a HTML response from feedburner) $http->userAgentDefault = HumbleHttpAgent::UA_PHP; // configure SimplePie HTTP extension class to use our HumbleHttpAgent instance SimplePie_HumbleHttpAgent::set_agent($http); $feed = new SimplePie(); // some feeds use the text/html content type - force_feed tells SimplePie to process anyway
/**
 * Fetch the raw HTML of a web page for later processing by FullTextRss.
 *
 * A fresh ContentExtractor is created and stored on $this->extractor, then
 * the page is downloaded with a 5 second HTTP timeout.
 *
 * @author Jean Baptiste Favre
 *
 * @param string $url address of the page to download
 *
 * @return string|false page body, or false when the download fails
 */
private function fetchFromWebSite($url) {
    $data_dir = \F3::get('FTRSS_DATA_DIR');
    $this->extractor = new \ContentExtractor($data_dir . '/custom', $data_dir . '/standard');

    if (\F3::get('logger_level') === "DEBUG") {
        // NOTE(review): the output buffer opened here is not closed in this
        // method; presumably a caller collects the extractor's debug output
        // later -- confirm.
        ob_start();
        $this->extractor->debug = true;
        \SiteConfig::$debug = true;
    }

    \SiteConfig::use_apc(false);
    $this->extractor->fingerprints = $this->fingerprints;
    $this->extractor->allowedParsers = $this->allowed_parsers;

    // Every header line must end with CRLF. The original string had no
    // terminator after the User-Agent line, so "DNT: 1" was glued onto the
    // User-Agent value and the DNT header was never actually sent.
    $stream_opts = array(
        'http' => array(
            'timeout' => 5,
            'method' => "GET",
            'header' => "Accept-language: en-us,en-gb;q=0.8,en;q=0.6,fr;q=0.4,fr-fr;q=0.2\r\n"
                . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
                . "User-Agent: SimplePie/1.3.1 (Feed Parser; http://simplepie.org; Allow like Gecko) Build/20121030175911\r\n"
                . "DNT: 1\r\n",
        ),
    );
    $context = stream_context_create($stream_opts);

    $url = $this->removeTrackersFromUrl($url);

    // Load web page. Warnings are deliberately suppressed: failure is reported
    // through the false return value, which is passed straight to the caller.
    return @file_get_contents($url, false, $context);
}