Beispiel #1
0
         // skip this and move to next item
     }
     //TODO: get text sample for language detection
     $html = $options->error_message;
     // keep the original item description
     $html .= $item->get_description();
 } else {
     $readability->clean($content_block, 'select');
     // get base URL
     $base_url = get_base_url($readability->dom);
     if (!$base_url) {
         $base_url = $effective_url;
     }
     // rewrite URLs
     if ($options->rewrite_relative_urls) {
         makeAbsolute($base_url, $content_block);
     }
     // footnotes
     if ($links == 'footnotes' && strpos($effective_url, 'wikipedia.org') === false) {
         $readability->addFootnotes($content_block);
     }
     // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
     while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
         // only follow these tag names
         if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
             break;
         }
         //$html = $content_block->firstChild->innerHTML; // FTR 2.9.5
         $content_block = $content_block->firstChild;
     }
     // convert content block to HTML string
 // if we failed to extract content...
 if (!$extraction_successful) {
     if ($exclude_on_fail) {
         debug('Failed to extract, so skipping (due to exclude on fail parameter)');
         continue;
         // skip this and move to next item
     }
     //TODO: get text sample for language detection
     $html = $options->error_message;
     // keep the original item description
     $html .= $item->get_description();
 } else {
     if (!$cached_page) {
         $readability->clean($content_block, 'select');
         if ($options->rewrite_relative_urls) {
             makeAbsolute($effective_url, $content_block);
         }
         // footnotes
         if ($links == 'footnotes' && strpos($effective_url, 'wikipedia.org') === false) {
             $readability->addFootnotes($content_block);
         }
         // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
         while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
             // only follow these tag names
             if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
                 break;
             }
             $content_block = $content_block->firstChild;
         }
         // convert content block to HTML string
         // Need to preserve things like body: //img[@id='feature']
Beispiel #3
0
 /**
  * Download a URL and extract its main content as an HTML string.
  *
  * The page is fetched with file_get_contents(), run through ContentExtractor,
  * cleaned, has its URLs rewritten against the document base URL, and is
  * stripped of redundant single-child wrapper elements and empty paragraphs.
  *
  * @param string $url Absolute URL of the page to extract (http, https or ftp).
  *
  * @return array|false|null Array with the keys 'title', 'authors', 'date' and
  *                          'html' on success; null when $url fails URL
  *                          validation; false when the scheme is not allowed,
  *                          the download is empty, or extraction fails.
  */
 protected function extract_content($url)
 {
     // Use !empty() so a request without the 'xdebug' parameter does not raise
     // an undefined index notice; the resulting truthiness is unchanged.
     $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || !empty($_REQUEST['xdebug']);
     if (!filter_var($url, FILTER_VALIDATE_URL)) {
         return null;
     }
     // URI schemes are case-insensitive, so normalise before checking the whitelist.
     $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
     if (!in_array($scheme, array("http", "https", "ftp"), true)) {
         return false;
     }
     $html = file_get_contents($url);
     if (empty($html)) {
         return false;
     }
     // Debug output from the extractor is wrapped in <pre> so it stays readable.
     if ($debug_enabled) {
         echo "<pre>";
     }
     $extractor = new ContentExtractor(__DIR__ . '/ftr-site-config', __DIR__ . '/site-config.local');
     SiteConfig::$debug = $debug_enabled;
     $extractor->debug = $debug_enabled;
     // NOTE(review): check_single_page() is defined elsewhere; presumably it may
     // swap $html for a single-page version of the article - confirm.
     $this->check_single_page($extractor, $url, $html);
     $extract_result = $extractor->process($html, $url);
     if ($debug_enabled) {
         echo "</pre>";
     }
     if (!$extract_result) {
         return false;
     }
     $content_block = $extractor->getContent();
     $extractor->readability->clean($content_block, 'select');
     // get base URL, falling back to the requested URL when the document has none
     $base_url = get_base_url($extractor->readability->dom);
     if (!$base_url) {
         $base_url = $url;
     }
     // rewrite URLs
     makeAbsolute($base_url, $content_block);
     // Elements treated as pure wrappers around the real content.
     $wrapper_tags = array('div', 'article', 'section', 'header', 'footer');
     // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
     while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
         // only descend through wrapper elements
         if (!in_array(strtolower($content_block->tagName), $wrapper_tags, true)) {
             break;
         }
         $content_block = $content_block->firstChild;
     }
     // convert content block to HTML string
     // Need to preserve things like body: //img[@id='feature']
     if (in_array(strtolower($content_block->tagName), $wrapper_tags, true)) {
         // wrapper element: keep only its children
         $html = $content_block->innerHTML;
     } else {
         // essentially outerHTML
         $html = $content_block->ownerDocument->saveXML($content_block);
     }
     // post-processing cleanup: drop paragraphs containing only whitespace.
     // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8 with
     // the /u modifier); keep the uncleaned HTML in that case instead of losing it.
     $cleaned_html = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);
     if ($cleaned_html !== null) {
         $html = $cleaned_html;
     }
     return array('title' => $extractor->getTitle(), 'authors' => $extractor->getAuthors(), 'date' => $extractor->getDate(), 'html' => $html);
 }