/**
 * Collects up to five articles from the listing page returned by getURI().
 *
 * For each <article> element of the listing this reads the link, author, title,
 * timestamp and thumbnail, downloads the linked article page and stores the
 * cleaned inner HTML of its `div.articlebodyonly` element as the item content.
 * Every collected article is appended to $this->items.
 *
 * @param array $param Request parameters; not read by this implementation.
 */
public function collectData(array $param)
{
    // The helpers are closures instead of nested named functions: a nested
    // `function foo() {}` is registered globally the first time the enclosing
    // method runs, so a second call (or any other code declaring the same
    // name) aborts with "Cannot redeclare".

    /*
     * Removes every section that begins with $start and ends with the next
     * $end found AFTER $start, delimiters included. An unterminated section
     * is left untouched instead of having arbitrary characters cut out of it.
     */
    $stripWithDelimiters = function ($string, $start, $end) {
        $startLength = strlen($start);
        $endLength = strlen($end);

        $startPos = strpos($string, $start);
        while ($startPos !== false) {
            $endPos = strpos($string, $end, $startPos + $startLength);
            if ($endPos === false) {
                break;
            }
            $string = substr($string, 0, $startPos) . substr($string, $endPos + $endLength);
            $startPos = strpos($string, $start);
        }

        return $string;
    };

    /*
     * Removes every element that starts with $tagStart, including nested
     * elements of the same tag name. $tagStart must itself begin with
     * '<' . $tagName, otherwise the string is returned unchanged.
     */
    $stripRecursiveHtmlSection = function ($string, $tagName, $tagStart) {
        $openTag = '<' . $tagName;
        $closeTag = '</' . $tagName . '>';
        $closeTagLength = strlen($closeTag);

        if (strpos($tagStart, $openTag) !== 0) {
            return $string;
        }

        while (($sectionStart = strpos($string, $tagStart)) !== false) {
            $attemptsLeft = 100; // upper bound on the nesting depth that is followed
            $searchOffset = $sectionStart;
            $sectionLength = 0;

            do {
                $attemptsLeft--;
                $sectionEnd = strpos($string, $closeTag, $searchOffset);
                if ($sectionEnd === false) {
                    // Unbalanced markup: there is no closing tag left to pair
                    // with. Stop here rather than removing a bogus range.
                    return $string;
                }
                $searchOffset = $sectionEnd + $closeTagLength;
                $sectionLength = $searchOffset - $sectionStart;
                $section = substr($string, $sectionStart, $sectionLength);
                // Keep extending the section until every opening tag inside
                // it has a matching closing tag.
                $unbalanced = substr_count($section, $openTag) > substr_count($section, $closeTag);
            } while ($unbalanced && $attemptsLeft > 0);

            $string = substr($string, 0, $sectionStart) . substr($string, $sectionStart + $sectionLength);
        }

        return $string;
    };

    $html = $this->file_get_html($this->getURI())
        or $this->returnError('Could not request TheHackerNews: ' . $this->getURI(), 500);

    $limit = 0;
    foreach ($html->find('article') as $element) {
        // Stop as soon as five articles were collected instead of walking
        // the remaining elements without doing anything.
        if ($limit >= 5) {
            break;
        }

        $titleLink = $element->find('a.entry-title', 0);
        $article_url = $titleLink->href;
        $article_title = $titleLink->plaintext;
        $article_author = trim($element->find('span.vcard', 0)->plaintext);
        $article_timestamp = strtotime($element->find('span.updated', 0)->plaintext);
        $article_thumbnail = $element->find('img', 0)->src;

        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request TheHackerNews: ' . $article_url, 500);

        // Article body without the "clear" spacer divs and without scripts.
        $contents = $article->find('div.articlebodyonly', 0)->innertext;
        $contents = $stripRecursiveHtmlSection($contents, 'div', '<div class=\'clear\'');
        $contents = $stripWithDelimiters($contents, '<script', '</script>');

        $item = new \Item();
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->author = $article_author;
        $item->thumbnailUri = $article_thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = trim($contents);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to ten articles from the nakedsecurity FeedBurner feed.
 *
 * For each feed <item> the page referenced by its <guid> is downloaded. The
 * item content is built from the page's `img.wp-post-image`, the feed
 * description (tags stripped) and the inner HTML of `div.entry-content` with
 * the entry prefix, scripts and asides removed. Every collected article is
 * appended to $this->items.
 *
 * @param array $param Request parameters; not read by this implementation.
 */
public function collectData(array $param)
{
    /*
     * Removes every element that starts with $tagStart, including nested
     * elements of the same tag name. $tagStart must itself begin with
     * '<' . $tagName, otherwise the string is returned unchanged.
     *
     * Kept as a closure: a nested named function would be registered globally
     * on the first call and make any later declaration of the same name fail
     * with "Cannot redeclare".
     */
    $stripRecursiveHtmlSection = function ($string, $tagName, $tagStart) {
        $openTag = '<' . $tagName;
        $closeTag = '</' . $tagName . '>';
        $closeTagLength = strlen($closeTag);

        if (strpos($tagStart, $openTag) !== 0) {
            return $string;
        }

        while (($sectionStart = strpos($string, $tagStart)) !== false) {
            $attemptsLeft = 100; // upper bound on the nesting depth that is followed
            $searchOffset = $sectionStart;
            $sectionLength = 0;

            do {
                $attemptsLeft--;
                $sectionEnd = strpos($string, $closeTag, $searchOffset);
                if ($sectionEnd === false) {
                    // Unbalanced markup: there is no closing tag left to pair
                    // with. Stop here rather than removing a bogus range.
                    return $string;
                }
                $searchOffset = $sectionEnd + $closeTagLength;
                $sectionLength = $searchOffset - $sectionStart;
                $section = substr($string, $sectionStart, $sectionLength);
                // Keep extending the section until every opening tag inside
                // it has a matching closing tag.
                $unbalanced = substr_count($section, $openTag) > substr_count($section, $closeTag);
            } while ($unbalanced && $attemptsLeft > 0);

            $string = substr($string, 0, $sectionStart) . substr($string, $sectionStart + $sectionLength);
        }

        return $string;
    };

    $feedUrl = 'https://feeds.feedburner.com/nakedsecurity?format=xml';
    $html = $this->file_get_html($feedUrl)
        or $this->returnError('Could not request ' . $this->getName() . ': ' . $feedUrl, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Stop as soon as ten articles were collected.
        if ($limit >= 10) {
            break;
        }

        // Retrieve article Uri and get that page
        $article_uri = $element->find('guid', 0)->plaintext;
        $article_html = $this->file_get_html($article_uri)
            or $this->returnError('Could not request ' . $this->getName() . ': ' . $article_uri, 500);

        // Build article contents from corresponding elements
        $article_title = trim($element->find('title', 0)->plaintext);
        $article_image = $article_html->find('img.wp-post-image', 0)->src;
        $article_summary = strip_tags(html_entity_decode($element->find('description', 0)->plaintext));

        $article_content = $article_html->find('div.entry-content', 0)->innertext;
        $article_content = $stripRecursiveHtmlSection($article_content, 'div', '<div class="entry-prefix"');
        $article_content = $stripRecursiveHtmlSection($article_content, 'script', '<script');
        $article_content = $stripRecursiveHtmlSection($article_content, 'aside', '<aside');
        $article_content = '<p><img src="' . $article_image . '" /></p>'
            . '<p><b>' . $article_summary . '</b></p>'
            . $article_content;

        // Build and add final item
        $item = new \Item();
        $item->uri = $article_uri;
        $item->title = $article_title;
        $item->thumbnailUri = $article_image;
        $item->author = $article_html->find('a[rel=author]', 0)->plaintext;
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $article_content;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to ten articles from the Futura-Sciences RSS feed selected by
 * $param['feed'].
 *
 * The feed name is validated (letters, '-' and at most one '/', no longer than
 * 64 characters) and turned into getURI() . 'rss/' . feed . '.xml'. For each
 * feed <item> the article page is downloaded and the inner HTML of its
 * `div.content` element is stored as item content after a list of
 * site-specific blocks and attributes has been stripped. Every collected
 * article is appended to $this->items.
 *
 * @param array $param Request parameters; 'feed' selects the RSS feed.
 */
public function collectData(array $param)
{
    // The helpers are closures instead of nested named functions: a nested
    // `function foo() {}` is registered globally the first time the enclosing
    // method runs, so a second call (or any other code declaring the same
    // name) aborts with "Cannot redeclare".

    /* Removes the CDATA wrapper markers from $string. */
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    /*
     * Removes every section that begins with $start and ends with the next
     * $end found AFTER $start, delimiters included. Searching behind $start
     * matters for markers such as 'fs:definition="' whose end delimiter '"'
     * also occurs inside the start delimiter: the attribute value and its
     * closing quote are removed too. An unterminated section is left as is.
     */
    $stripWithDelimiters = function ($string, $start, $end) {
        $startLength = strlen($start);
        $endLength = strlen($end);

        $startPos = strpos($string, $start);
        while ($startPos !== false) {
            $endPos = strpos($string, $end, $startPos + $startLength);
            if ($endPos === false) {
                break;
            }
            $string = substr($string, 0, $startPos) . substr($string, $endPos + $endLength);
            $startPos = strpos($string, $start);
        }

        return $string;
    };

    /*
     * Removes every element that starts with $tagStart, including nested
     * elements of the same tag name. $tagStart must itself begin with
     * '<' . $tagName, otherwise the string is returned unchanged.
     */
    $stripRecursiveHtmlSection = function ($string, $tagName, $tagStart) {
        $openTag = '<' . $tagName;
        $closeTag = '</' . $tagName . '>';
        $closeTagLength = strlen($closeTag);

        if (strpos($tagStart, $openTag) !== 0) {
            return $string;
        }

        while (($sectionStart = strpos($string, $tagStart)) !== false) {
            $attemptsLeft = 100; // upper bound on the nesting depth that is followed
            $searchOffset = $sectionStart;
            $sectionLength = 0;

            do {
                $attemptsLeft--;
                $sectionEnd = strpos($string, $closeTag, $searchOffset);
                if ($sectionEnd === false) {
                    // Unbalanced markup: there is no closing tag left to pair
                    // with. Stop here rather than removing a bogus range.
                    return $string;
                }
                $searchOffset = $sectionEnd + $closeTagLength;
                $sectionLength = $searchOffset - $sectionStart;
                $section = substr($string, $sectionStart, $sectionLength);
                // Keep extending the section until every opening tag inside
                // it has a matching closing tag.
                $unbalanced = substr_count($section, $openTag) > substr_count($section, $closeTag);
            } while ($unbalanced && $attemptsLeft > 0);

            $string = substr($string, 0, $sectionStart) . substr($string, $sectionStart + $sectionLength);
        }

        return $string;
    };

    // The feed URL is not known yet at this point, so the two validation
    // messages below no longer append the (undefined) $url variable.
    if (empty($param['feed'])) {
        $this->returnError('Please select a feed to display.', 400);
    }

    $feed = $param['feed'];
    // strlen() has to wrap the value, not the comparison: the former
    // `strlen($feed > 64)` measured a boolean and never enforced the limit.
    if ($feed !== preg_replace('/[^a-zA-Z-\\/]+/', '', $feed)
        || substr_count($feed, '/') > 1
        || strlen($feed) > 64
    ) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $this->getURI() . 'rss/' . $feed . '.xml';
    $html = $this->file_get_html($url)
        or $this->returnError('Could not request Futura-Sciences: ' . $url, 500);

    // Opening markers of the <div> blocks that are removed from the article body.
    $unwantedDivs = array(
        '<div class="clear',
        '<div class="sharebar2',
        '<div class="diaporamafullscreen"',
        '<div style="margin-bottom:10px;" class="noprint"',
        '<div class="ficheprevnext',
        '<div class="bar noprint',
        '<div class="toolbar noprint',
        '<div class="addthis_toolbox',
        '<div class="noprint',
        '<div class="bg bglight border border-full noprint',
        '<div class="httplogbar-wrapper noprint',
        '<div id="forumcomments',
    );

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Stop as soon as ten articles were collected.
        if ($limit >= 10) {
            break;
        }

        $article_url = str_replace('#xtor=RSS-8', '', $stripCdata($element->find('guid', 0)->plaintext));
        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request Futura-Sciences: ' . $article_url, 500);

        $contents = $article->find('div.content', 0)->innertext;

        // Prefer the author shown on the article page, fall back to the feed.
        $author = trim(str_replace(', Futura-Sciences', '', $article->find('span.author', 0)->plaintext));
        if (empty($author)) {
            $author = $stripCdata($element->find('author', 0)->plaintext);
        }

        foreach ($unwantedDivs as $div_start) {
            $contents = $stripRecursiveHtmlSection($contents, 'div', $div_start);
        }
        $contents = $stripWithDelimiters($contents, '<hr ', '/>');
        $contents = $stripWithDelimiters($contents, '<p class="content-date', '</p>');
        $contents = $stripWithDelimiters($contents, '<h1 class="content-title', '</h1>');
        $contents = $stripWithDelimiters($contents, 'fs:definition="', '"');
        $contents = $stripWithDelimiters($contents, 'fs:xt:clicktype="', '"');
        $contents = $stripWithDelimiters($contents, 'fs:xt:clickname="', '"');

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->thumbnailUri = $stripCdata($element->find('enclosure', 0)->url);
        $item->timestamp = strtotime($stripCdata($element->find('pubDate', 0)->plaintext));
        $item->content = trim($contents);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to ten articles from the ZDNet RSS feed selected by
 * $param['feed'].
 *
 * A feed name containing 'downloads!' has that marker removed and is fetched
 * from the 'downloads.' host instead of 'www.'. The name is validated
 * (letters, digits, '-' and at most one '/', no longer than 64 characters) and
 * turned into base URI . feed . '/rss.xml'. For each feed <item> the article
 * page is downloaded; the inner HTML of its <article> element is cleaned,
 * prefixed with the feed description and, where suitable, the thumbnail, and
 * stored as item content. Every collected article is appended to $this->items.
 *
 * @param array $param Request parameters; 'feed' selects the RSS feed.
 */
public function collectData(array $param)
{
    // The helpers are closures instead of nested named functions: a nested
    // `function foo() {}` is registered globally the first time the enclosing
    // method runs, so a second call (or any other code declaring the same
    // name) aborts with "Cannot redeclare".

    /* Removes the CDATA wrapper markers from $string and trims the result. */
    $stripCdata = function ($string) {
        return trim(str_replace(array('<![CDATA[', ']]>'), '', $string));
    };

    /*
     * Returns the text between the first $start and the next $end behind it,
     * or false when either delimiter is missing.
     */
    $extractFromDelimiters = function ($string, $start, $end) {
        $startPos = strpos($string, $start);
        if ($startPos === false) {
            return false;
        }
        $contentPos = $startPos + strlen($start);
        $endPos = strpos($string, $end, $contentPos);
        if ($endPos === false) {
            return false;
        }

        return substr($string, $contentPos, $endPos - $contentPos);
    };

    /*
     * Removes every section that begins with $start and ends with the next
     * $end found AFTER $start, delimiters included. An unterminated section
     * is left untouched instead of having arbitrary characters cut out of it.
     */
    $stripWithDelimiters = function ($string, $start, $end) {
        $startLength = strlen($start);
        $endLength = strlen($end);

        $startPos = strpos($string, $start);
        while ($startPos !== false) {
            $endPos = strpos($string, $end, $startPos + $startLength);
            if ($endPos === false) {
                break;
            }
            $string = substr($string, 0, $startPos) . substr($string, $endPos + $endLength);
            $startPos = strpos($string, $start);
        }

        return $string;
    };

    /*
     * Removes every element that starts with $tagStart, including nested
     * elements of the same tag name. $tagStart must itself begin with
     * '<' . $tagName, otherwise the string is returned unchanged.
     */
    $stripRecursiveHtmlSection = function ($string, $tagName, $tagStart) {
        $openTag = '<' . $tagName;
        $closeTag = '</' . $tagName . '>';
        $closeTagLength = strlen($closeTag);

        if (strpos($tagStart, $openTag) !== 0) {
            return $string;
        }

        while (($sectionStart = strpos($string, $tagStart)) !== false) {
            $attemptsLeft = 100; // upper bound on the nesting depth that is followed
            $searchOffset = $sectionStart;
            $sectionLength = 0;

            do {
                $attemptsLeft--;
                $sectionEnd = strpos($string, $closeTag, $searchOffset);
                if ($sectionEnd === false) {
                    // Unbalanced markup: there is no closing tag left to pair
                    // with. Stop here rather than removing a bogus range.
                    return $string;
                }
                $searchOffset = $sectionEnd + $closeTagLength;
                $sectionLength = $searchOffset - $sectionStart;
                $section = substr($string, $sectionStart, $sectionLength);
                // Keep extending the section until every opening tag inside
                // it has a matching closing tag.
                $unbalanced = substr_count($section, $openTag) > substr_count($section, $closeTag);
            } while ($unbalanced && $attemptsLeft > 0);

            $string = substr($string, 0, $sectionStart) . substr($string, $sectionStart + $sectionLength);
        }

        return $string;
    };

    $baseUri = $this->getURI();
    // Read the parameter defensively so a missing 'feed' key reaches the
    // emptiness check below without an undefined-index notice.
    $feed = isset($param['feed']) ? $param['feed'] : '';
    if (empty($feed)) {
        $this->returnError('Please select a feed to display.', 400);
    }

    if (strpos($feed, 'downloads!') !== false) {
        $feed = str_replace('downloads!', '', $feed);
        $baseUri = str_replace('www.', 'downloads.', $baseUri);
    }

    // strlen() has to wrap the value, not the comparison: the former
    // `strlen($feed > 64)` measured a boolean and never enforced the limit.
    if ($feed !== preg_replace('/[^a-zA-Z0-9-\\/]+/', '', $feed)
        || substr_count($feed, '/') > 1
        || strlen($feed) > 64
    ) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $baseUri . trim($feed, '/') . '/rss.xml';
    $html = $this->file_get_html($url)
        or $this->returnError('Could not request ZDNet: ' . $url, 500);

    // Opening markers of the <div> blocks that are removed from the article body.
    $unwantedDivs = array(
        '<div class="shareBar"',
        '<div class="shortcodeGalleryWrapper"',
        '<div class="relatedContent',
        '<div class="downloadNow',
        '<div data-shortcode',
        '<div id="sharethrough',
        '<div id="inpage-video',
    );

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Stop as soon as ten articles were collected.
        if ($limit >= 10) {
            break;
        }

        $itemMarkup = $element->innertext;
        // The link and author are cut out of the raw item markup; the
        // '#ftag=...' suffix is dropped from the link.
        $article_url = preg_replace(
            '/([^#]+)#ftag=.*/',
            '$1',
            $stripCdata((string) $extractFromDelimiters($itemMarkup, '<link>', '</link>'))
        );
        $article_author = $stripCdata((string) $extractFromDelimiters($itemMarkup, 'role="author">', '<'));
        $article_title = $stripCdata($element->find('title', 0)->plaintext);
        $article_subtitle = $stripCdata($element->find('description', 0)->plaintext);
        $article_timestamp = strtotime($stripCdata($element->find('pubDate', 0)->plaintext));

        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request ZDNet: ' . $article_url, 500);

        // Author: feed value first, then the page's author meta tag, then a
        // fixed default.
        if (!empty($article_author)) {
            $author = $article_author;
        } else {
            $authorMeta = $article->find('meta[name=author]', 0);
            $author = is_object($authorMeta) ? $authorMeta->content : 'ZDNet';
        }

        $thumbnailMeta = $article->find('meta[itemprop=image]', 0);
        $thumbnail = is_object($thumbnailMeta) ? $thumbnailMeta->content : '';

        $contents = $article->find('article', 0)->innertext;
        foreach ($unwantedDivs as $div_start) {
            $contents = $stripRecursiveHtmlSection($contents, 'div', $div_start);
        }
        $contents = $stripWithDelimiters($contents, '<script', '</script>');
        $contents = $stripWithDelimiters($contents, '<meta itemprop="image"', '>');
        $contents = trim($stripWithDelimiters($contents, '<section class="sharethrough-top', '</section>'));

        // Look for first image: the thumbnail is only prepended when there is
        // one and the body does not already contain an image within its first
        // 512 bytes.
        $firstImagePos = strpos($contents, '<img');
        if (($firstImagePos !== false && $firstImagePos < 512) || $thumbnail == '') {
            $content_img = '';
        } else {
            $content_img = '<p><img src="' . $thumbnail . '" /></p>';
        }

        // Include thumbnail
        $contents = $content_img . '<p><b>' . $article_subtitle . '</b></p>' . $contents;

        // Fall back to a fixed logo image for the item thumbnail.
        if ($thumbnail == '') {
            $thumbnail = 'http://zdnet1.cbsistatic.com/fly/bundles/zdnetcss/images/logos/logo-192x192.png';
        }

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->thumbnailUri = $thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = $contents;
        $this->items[] = $item;
        $limit++;
    }
}