} } if ($do_content_extraction) { $html = $single_page_response['body']; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $single_page_response['headers']); debug("Retrieved single-page view from {$effective_url}"); } unset($single_page_response); } } if ($do_content_extraction) { debug('--------'); debug('Attempting to extract content'); $extraction_successful = $extractor->process($html, $effective_url); $readability = $extractor->readability; $content_block = $extraction_successful ? $extractor->getContent() : null; $extracted_title = $extraction_successful ? $extractor->getTitle() : ''; // Deal with multi-page articles //die('Next: '.$extractor->getNextPageUrl()); $is_multi_page = !$is_single_page && $extraction_successful && $extractor->getNextPageUrl(); if ($options->multipage && $is_multi_page && $options->content) { debug('--------'); debug('Attempting to process multi-page article'); $multi_page_urls = array(); $multi_page_content = array(); while ($next_page_url = $extractor->getNextPageUrl()) { debug('--------'); debug('Processing next page: ' . $next_page_url); // If we've got URL, resolve against $url
} } if ($do_content_extraction) { $html = $single_page_response['body']; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $single_page_response['headers']); debug("Retrieved single-page view from {$effective_url}"); } unset($single_page_response); } } if ($do_content_extraction) { debug('--------'); debug('Attempting to extract content'); $extract_result = $extractor->process($html, $effective_url); $readability = $extractor->readability; $content_block = $extract_result ? $extractor->getContent() : null; $extracted_title = $extract_result ? $extractor->getTitle() : ''; // Deal with multi-page articles //die('Next: '.$extractor->getNextPageUrl()); $is_multi_page = !$is_single_page && $extract_result && $extractor->getNextPageUrl(); if ($options->multipage && $is_multi_page && $options->content) { debug('--------'); debug('Attempting to process multi-page article'); $multi_page_urls = array(); $multi_page_content = array(); while ($next_page_url = $extractor->getNextPageUrl()) { debug('--------'); debug('Processing next page: ' . $next_page_url); // If we've got URL, resolve against $url
//echo "[" . $html . "]"; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $response['headers']); //echo "[" . $html . "]"; if ($auto_extract) { // check site config for single page URL - fetch it if found if ($single_page_response = getSinglePage($item, $html, $effective_url)) { $html = $single_page_response['body']; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $single_page_response['headers']); $effective_url = $single_page_response['effective_url']; unset($single_page_response); } $extract_result = $extractor->process($html, $effective_url); $readability = $extractor->readability; $content_block = $extract_result ? $extractor->getContent() : null; //echo "content_block: [" . $content_block->innerHTML . "] \n\n"; $title = $extract_result ? $extractor->getTitle() : ''; } else { $readability = new Readability($html, $effective_url); // content block is entire document (for now...) $content_block = $readability->dom; //echo $content_block->innerHTML; //TODO: get title $title = ''; } //echo "[" . $content_block . "]" ; } // use extracted title for both feed and item title if we're using single-item dummy feed
/**
 * Fetch the page at $permalink, run content extraction on it and return the
 * extracted content block imported into this extractor's Readability DOM.
 *
 * The work is done by a separate ContentExtractor instance which inherits this
 * instance's list of visited pages and fingerprints, with its depth counter
 * set one higher than this instance's. The method gives up (returns false) when:
 *  - $permalink is already in the list of visited pages,
 *  - the depth counter exceeds 3,
 *  - the HTTP request fails or answers with a status in the 300-400 range,
 *  - extraction fails or yields no parseable HTML.
 *
 * @param string $permalink URL of the page to fetch and extract.
 *
 * @return DOMNode|false The content block imported into $this->readability->dom,
 *                       or false when no content could be obtained.
 */
public function extractContentBlock($permalink)
{
    $extractor = new ContentExtractor(
        dirname(__FILE__) . '/site_config/custom',
        dirname(__FILE__) . '/site_config/standard'
    );
    $extractor->next_page_deep_count = $this->next_page_deep_count + 1;
    $extractor->next_pages = $this->next_pages;

    // Never process the same URL twice along one chain of pages.
    if (in_array($permalink, $extractor->next_pages)) {
        return false;
    }
    $extractor->next_pages[] = $permalink;

    // Hard limit on how deep a chain of pages may be followed.
    if ($extractor->next_page_deep_count > 3) {
        return false;
    }

    // NOTE(review): $elem is never used below. It is kept only because the
    // ContentExtractor constructor is not visible from here and may have side
    // effects; confirm and remove if it has none.
    $elem = new ContentExtractor($this->path, $this->fallback);
    $extractor->fingerprints = $this->fingerprints;

    $content_block = null;
    $http = new HumbleHttpAgent();

    // Fetch the page exactly once. Responses with a status above 400 are still
    // accepted, as in the original condition; 300-400 responses are rejected.
    if ($permalink
        && ($response = $http->get($permalink, true))
        && ($response['status_code'] < 300 || $response['status_code'] > 400)
    ) {
        // Strip a malformed closing-tag sequence before further processing.
        $html = str_replace('</[>', '', $response['body']);
        $html = convert_to_utf8($html, $response['headers']);

        // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of
        // PHP 8.2; plan a replacement (e.g. mb_encode_numericentity()).
        if (function_exists('mb_convert_encoding')) {
            $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
        }

        if ($extractor->process($html, $permalink)) {
            $content_block = $extractor->getContent();
        }
    }

    // Nothing fetched or nothing extracted: bail out before touching the node.
    if (!($content_block instanceof DOMNode)) {
        return false;
    }

    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8),
    // so treat an empty content block as "no content".
    $inner_html = (string) $content_block->innerHTML;
    if ($inner_html === '') {
        return false;
    }

    // Sanity-check that the extracted markup parses, collecting libxml errors
    // internally instead of silencing warnings with the @ operator.
    $doc = new DOMDocument();
    $previous = libxml_use_internal_errors(true);
    $parsed = $doc->loadHTML($inner_html);
    if (!$previous) {
        // Only discard the errors this call produced.
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previous);

    if (!$parsed) {
        return false;
    }

    return $this->readability->dom->importNode($content_block, true);
}