// skip this and move to next item } //TODO: get text sample for language detection $html = $options->error_message; // keep the original item description $html .= $item->get_description(); } else { $readability->clean($content_block, 'select'); // get base URL $base_url = get_base_url($readability->dom); if (!$base_url) { $base_url = $effective_url; } // rewrite URLs if ($options->rewrite_relative_urls) { makeAbsolute($base_url, $content_block); } // footnotes if ($links == 'footnotes' && strpos($effective_url, 'wikipedia.org') === false) { $readability->addFootnotes($content_block); } // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p> while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) { // only follow these tag names if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) { break; } //$html = $content_block->firstChild->innerHTML; // FTR 2.9.5 $content_block = $content_block->firstChild; } // convert content block to HTML string
// if we failed to extract content... if (!$extraction_successful) { if ($exclude_on_fail) { debug('Failed to extract, so skipping (due to exclude on fail parameter)'); continue; // skip this and move to next item } //TODO: get text sample for language detection $html = $options->error_message; // keep the original item description $html .= $item->get_description(); } else { if (!$cached_page) { $readability->clean($content_block, 'select'); if ($options->rewrite_relative_urls) { makeAbsolute($effective_url, $content_block); } // footnotes if ($links == 'footnotes' && strpos($effective_url, 'wikipedia.org') === false) { $readability->addFootnotes($content_block); } // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p> while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) { // only follow these tag names if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) { break; } $content_block = $content_block->firstChild; } // convert content block to HTML string // Need to preserve things like body: //img[@id='feature']
/**
 * Fetch a URL and extract its main content block as an HTML string.
 *
 * The page is downloaded with file_get_contents(), run through
 * ContentExtractor (using the bundled and local site-config directories),
 * cleaned, has its URLs made absolute against the document's base URL
 * (falling back to $url), and is stripped of redundant single-child
 * wrapper elements before being serialised.
 *
 * When debugging is enabled (the DAEMON_EXTENDED_DEBUG constant is defined
 * or a truthy "xdebug" request parameter is present) the extractor's debug
 * output is wrapped in a <pre> element that is echoed directly.
 *
 * @param string $url Address of the page to extract.
 *
 * @return array|false|null Array with the keys 'title', 'authors', 'date'
 *                          and 'html' on success; null when $url is not a
 *                          valid URL; false when the scheme is not
 *                          http/https/ftp, the download is empty, or the
 *                          extractor reports failure.
 */
protected function extract_content($url) {
    // !empty() keeps the original truthiness test but does not raise an
    // "undefined index" notice when the request parameter is absent.
    $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || !empty($_REQUEST['xdebug']);

    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return null;
    }

    // URL schemes are case-insensitive, so normalise before the whitelist check.
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
    if (!in_array($scheme, array("http", "https", "ftp"), true)) {
        return false;
    }

    $html = file_get_contents($url);
    if (empty($html)) {
        return false;
    }

    if ($debug_enabled) {
        echo "<pre>";
    }

    $extractor = new ContentExtractor(__DIR__ . '/ftr-site-config', __DIR__ . '/site-config.local');
    SiteConfig::$debug = $debug_enabled;
    $extractor->debug = $debug_enabled;

    $this->check_single_page($extractor, $url, $html);
    $extract_result = $extractor->process($html, $url);

    if ($debug_enabled) {
        echo "</pre>";
    }

    if (!$extract_result) {
        return false;
    }

    $content_block = $extractor->getContent();
    $extractor->readability->clean($content_block, 'select');

    // get base URL, falling back to the requested URL when none is found
    $base_url = get_base_url($extractor->readability->dom);
    if (!$base_url) {
        $base_url = $url;
    }

    // rewrite URLs
    makeAbsolute($base_url, $content_block);

    // Generic container elements that carry no content of their own.
    $wrapper_tags = array('div', 'article', 'section', 'header', 'footer');

    // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
    while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
        // only descend through the generic wrapper tags
        if (!in_array(strtolower($content_block->tagName), $wrapper_tags, true)) {
            break;
        }
        $content_block = $content_block->firstChild;
    }

    // convert content block to HTML string
    // Need to preserve things like body: //img[@id='feature']
    if (in_array(strtolower($content_block->tagName), $wrapper_tags, true)) {
        // a generic wrapper adds nothing, so emit only its children
        $html = $content_block->innerHTML;
    } else {
        $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
    }

    // post-processing cleanup: drop empty paragraphs.
    // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 with the
    // "u" modifier); in that case keep the uncleaned markup instead of losing it.
    $cleaned = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);
    if ($cleaned !== null) {
        $html = $cleaned;
    }

    return array(
        'title' => $extractor->getTitle(),
        'authors' => $extractor->getAuthors(),
        'date' => $extractor->getDate(),
        'html' => $html
    );
}