// NOTE(review): this excerpt opens inside three nested blocks whose headers lie above the
// visible source, and it ends inside five blocks that are still open. Only what is visible
// here is documented.
            // Use the body of the single-page response as the working HTML from here on.
            $html = $single_page_response['body'];
            // remove strange things: strip every literal '</[>' sequence from the markup
            $html = str_replace('</[>', '', $html);
            // The response headers are handed over together with the HTML
            // (presumably for charset detection -- confirm against convert_to_utf8()).
            $html = convert_to_utf8($html, $single_page_response['headers']);
            debug("Retrieved single-page view from {$effective_url}");
        }
        // Discard the response array once it has been handled.
        unset($single_page_response);
    }
}

// Content extraction stage: runs only when $do_content_extraction is truthy.
if ($do_content_extraction) {
    debug('--------');
    debug('Attempting to extract content');
    // The return value of process() is used as a success flag by the statements below.
    $extraction_successful = $extractor->process($html, $effective_url);
    // Expose the extractor's readability property under a local name.
    $readability = $extractor->readability;
    // When extraction failed, the content block falls back to null and the title to ''.
    $content_block = $extraction_successful ? $extractor->getContent() : null;
    $extracted_title = $extraction_successful ? $extractor->getTitle() : '';

    // Deal with multi-page articles
    // Debugging aid, left disabled:
    //die('Next: '.$extractor->getNextPageUrl());
    // Multi-page applies only when this is not already a single-page view, extraction
    // succeeded, and the extractor reports a next-page URL.
    $is_multi_page = !$is_single_page && $extraction_successful && $extractor->getNextPageUrl();
    // Following further pages additionally requires both $options->multipage and
    // $options->content to be truthy.
    if ($options->multipage && $is_multi_page && $options->content) {
        debug('--------');
        debug('Attempting to process multi-page article');
        // Both start out empty; $multi_page_urls is consulted below to avoid revisiting a URL.
        // How the two arrays get filled lies beyond this excerpt.
        $multi_page_urls = array();
        $multi_page_content = array();
        // Assignment inside the condition: loop for as long as the extractor returns a
        // truthy next-page URL.
        while ($next_page_url = $extractor->getNextPageUrl()) {
            debug('--------');
            debug('Processing next page: ' . $next_page_url);
            // If we've got URL, resolve against $effective_url. The block below runs only
            // when makeAbsoluteStr() returns a truthy value.
            if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) {
                // check it's not what we have already!
                // (in_array() is called without its strict flag, so the comparison is loose.)
                if (!in_array($next_page_url, $multi_page_urls)) {
// (Excerpt resumes inside a block that was opened above the visible source.)
    // Decide where the item's content block and title come from.
    if (!$auto_extract) {
        // No automatic extraction: the content block is the entire parsed document (for now...).
        $readability = new Readability($html, $effective_url);
        $content_block = $readability->dom;
        // TODO: get title
        $title = '';
    } else {
        // Look for a single-page version of the article (the site config is checked for a
        // single page URL, which is fetched if found) and switch over to it when one comes back.
        $single_page_response = getSinglePage($item, $html, $effective_url);
        if ($single_page_response) {
            // Strip the stray '</[>' sequences from the body, then convert to UTF-8
            // using the response headers.
            $html = convert_to_utf8(
                str_replace('</[>', '', $single_page_response['body']),
                $single_page_response['headers']
            );
            $effective_url = $single_page_response['effective_url'];
            unset($single_page_response);
        }

        // Run the extractor on whichever HTML we ended up with.
        $extract_result = $extractor->process($html, $effective_url);
        $readability = $extractor->readability;
        if ($extract_result) {
            $content_block = $extractor->getContent();
            $title = $extractor->getTitle();
        } else {
            // Extraction failed: no content block and an empty title.
            $content_block = null;
            $title = '';
        }
    }
}

// With a single-item dummy feed, the extracted title is used for both the feed and the item.
if ($isDummyFeed) {
    $output->setTitle($title);
    $newitem->setTitle($title);
}