$html = $single_page_response['body'];
             // remove strange things
             $html = str_replace('</[>', '', $html);
             $html = convert_to_utf8($html, $single_page_response['headers']);
             debug("Retrieved single-page view from {$effective_url}");
         }
         unset($single_page_response);
     }
 }
 if ($do_content_extraction) {
     debug('--------');
     debug('Attempting to extract content');
     $extraction_successful = $extractor->process($html, $effective_url);
     $readability = $extractor->readability;
     $content_block = $extraction_successful ? $extractor->getContent() : null;
     $extracted_title = $extraction_successful ? $extractor->getTitle() : '';
     // Deal with multi-page articles
     //die('Next: '.$extractor->getNextPageUrl());
     $is_multi_page = !$is_single_page && $extraction_successful && $extractor->getNextPageUrl();
     if ($options->multipage && $is_multi_page && $options->content) {
         debug('--------');
         debug('Attempting to process multi-page article');
         $multi_page_urls = array();
         $multi_page_content = array();
         while ($next_page_url = $extractor->getNextPageUrl()) {
             debug('--------');
             debug('Processing next page: ' . $next_page_url);
             // If we've got URL, resolve against $url
             if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) {
                 // check it's not what we have already!
                 if (!in_array($next_page_url, $multi_page_urls)) {
Exemplo n.º 2
0
     //echo "[" . $html .  "]";
     // Produce $readability, $content_block and $title for the current item,
     // either through the configured extractor or from the raw document.
     if (!$auto_extract) {
         // No extraction requested: the whole parsed document serves as the
         // content block (for now).
         $readability = new Readability($html, $effective_url);
         $content_block = $readability->dom;
         // TODO: get title
         $title = '';
     } else {
         // Ask for a single-page version of the article (looked up via the site
         // config, per the original note); a truthy response replaces the HTML
         // and the effective URL we work with from here on.
         $single_page_response = getSinglePage($item, $html, $effective_url);
         if ($single_page_response) {
             // Drop the stray '</[>' sequence before converting to UTF-8.
             $html = str_replace('</[>', '', $single_page_response['body']);
             $html = convert_to_utf8($html, $single_page_response['headers']);
             $effective_url = $single_page_response['effective_url'];
             unset($single_page_response);
         }

         $extract_result = $extractor->process($html, $effective_url);
         $readability = $extractor->readability;

         // Content and title are only taken from the extractor when process()
         // reported success; otherwise fall back to null / empty string.
         if ($extract_result) {
             $content_block = $extractor->getContent();
             $title = $extractor->getTitle();
         } else {
             $content_block = null;
             $title = '';
         }
     }
     //echo "[" . $content_block . "]" ;
 }
 // With a single-item dummy feed, the extracted title is applied to both the
 // feed ($output) and the item ($newitem), in that order.
 if ($isDummyFeed) {
     foreach (array($output, $newitem) as $title_target) {
         $title_target->setTitle($title);
     }
     // Do not leave the loop variable behind in script scope.
     unset($title_target);
 }