/**
 * Collects up to 10 entries from the MemoLinux RSS feed and fills each item
 * with the body of the linked article page.
 *
 * The helpers are closures rather than named functions declared inside the
 * method: a named function declared here would be registered globally on the
 * first call and trigger a fatal "Cannot redeclare" error on any later call.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Downloads an article page and returns its cleaned body, or an empty
    // string when the page or its content container cannot be obtained.
    $extractContent = function ($url) {
        $page = file_get_html($url);
        if (!is_object($page)) {
            return '';
        }

        $container = $page->find('div.entry-content', 0);
        if (!is_object($container)) {
            return '';
        }

        $text = $container->innertext;
        // Drop scripts, nested div blocks and everything from the first <h1 onwards on its line.
        $text = preg_replace('@<script[^>]*?>.*?</script>@si', '', $text);
        $text = preg_replace('@<div[^>]*?>.*?</div>@si', '', $text);
        $text = preg_replace("/<h1.*/", '', $text);

        return $text;
    };

    $html = file_get_html('http://memo-linux.com/feed/') or $this->returnError('Could not request MemoLinux.', 404);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Each item costs one extra HTTP request, so stop as soon as the cap is reached.
        if ($limit >= 10) {
            break;
        }

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $extractContent($item->uri);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 10 entries from the LeMotDuJour feed and fills each item
 * with the content block of the linked article page.
 *
 * Helpers are closures so that calling this method more than once in the same
 * request does not redeclare global functions (a fatal error in PHP).
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Returns the inner HTML of the article container, or '' when the page
    // could not be loaded or does not contain the expected container.
    $extractContent = function ($url) {
        $page = file_get_html($url);
        $container = is_object($page) ? $page->find('div.single-contenu', 0) : null;

        return is_object($container) ? $container->innertext : '';
    };

    $html = file_get_html('http://feeds2.feedburner.com/lemotdujour/lemotdujour') or $this->returnError('Could not request LeMotDuJour.', 404);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop iterating once 10 items are built.
        if ($limit >= 10) {
            break;
        }

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $extractContent($item->uri);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects the 3 most recent entries of the NextInpact news feed and builds
 * the item content from the linked article page (subtitle, main image,
 * article body and, when present, the "reserved article" notice).
 *
 * The article extraction is done inline: the previous nested function used
 * $this, which is not available inside a plain function and raised a fatal
 * error on the first article. The CDATA helper is a closure so that repeated
 * calls do not redeclare a global function.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    $html = $this->file_get_html('http://www.nextinpact.com/rss/news.xml') or $this->returnError('Could not request NextInpact.', 404);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 3 items are built.
        if ($limit >= 3) {
            break;
        }

        $enclosure = $element->find('enclosure', 0);

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item->thumbnailUri = is_object($enclosure) ? $stripCdata($enclosure->url) : '';
        $item->author = $stripCdata($element->find('author', 0)->innertext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);

        // Assemble the content from whichever parts the article page provides;
        // a missing part is skipped instead of aborting the whole feed.
        $text = '';
        $page = $this->file_get_html($item->uri);
        if (is_object($page)) {
            $subtitle = $page->find('span.sub_title', 0);
            if (is_object($subtitle)) {
                $text .= '<p><em>' . $subtitle->innertext . '</em></p>';
            }

            $imageBox = $page->find('div.container_main_image_article', 0);
            $image = is_object($imageBox) ? $imageBox->find('img.dedicated', 0) : null;
            if (is_object($image)) {
                $text .= '<p><img src="' . $image->src . '" alt="-" /></p>';
            }

            $body = $page->find('div[itemprop=articleBody]', 0);
            if (is_object($body)) {
                $text .= '<div>' . $body->innertext . '</div>';
            }

            $premiumNotice = $page->find('h2.title_reserve_article', 0);
            if (is_object($premiumNotice)) {
                $text .= '<p><em>' . $premiumNotice->innertext . '</em></p>';
            }
        }
        $item->content = $text;

        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 10 entries from the Acrimed feed and fills each item with
 * the text block of the linked article page.
 *
 * Helpers are closures so that a second call in the same request does not
 * attempt to redeclare global functions.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Returns the inner HTML of the article text container, or '' when the
    // page cannot be loaded or the container is absent.
    $extractContent = function ($url) {
        $page = file_get_html($url);
        if (!is_object($page)) {
            return '';
        }
        $container = $page->find('div.texte', 0);

        return is_object($container) ? $container->innertext : '';
    };

    $html = file_get_html('http://www.acrimed.org/spip.php?page=backend') or $this->returnError('Could not request Acrimed.', 404);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 10 items are built.
        if ($limit >= 10) {
            break;
        }

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $extractContent($item->uri);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 10 entries from the Tuxboard Atom feed and fills each item
 * with the article element of the linked page, scripts removed.
 *
 * Helpers are closures so that a second call in the same request does not
 * attempt to redeclare global functions.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Returns the script-free inner HTML of the article element, or '' when
    // the page cannot be loaded or the element is absent.
    $extractContent = function ($url) {
        $page = file_get_html($url);
        $article = is_object($page) ? $page->find('article#page', 0) : null;
        if (!is_object($article)) {
            return '';
        }

        return preg_replace('@<script[^>]*?>.*?</script>@si', '', $article->innertext);
    };

    $html = file_get_html('http://www.tuxboard.com/feed/atom/') or $this->returnError('Could not request Tuxboard.', 404);

    $limit = 0;
    foreach ($html->find('entry') as $element) {
        // One article download per entry: stop once 10 items are built.
        if ($limit >= 10) {
            break;
        }

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        // Atom entries carry the article address in the href attribute of <link>.
        $item->uri = $element->find('link', 0)->href;
        $item->timestamp = strtotime($element->find('published', 0)->plaintext);
        $item->content = $extractContent($item->uri);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects the 3 most recent entries of the Nextinpact news feed and builds
 * the item content from the heading and body of the linked article page.
 *
 * Helpers are closures so that a second call in the same request does not
 * attempt to redeclare global functions.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Builds "<h2>heading</h2><br><br>body" from the article page. Parts that
    // cannot be found are left out; '' is returned when the page fails to load.
    $extractContent = function ($url) {
        $page = file_get_html($url);
        if (!is_object($page)) {
            return '';
        }

        $text = '';
        $heading = $page->find('div#actu_entete > h2', 0);
        if (is_object($heading)) {
            $text .= '<h2>' . $heading->innertext . '</h2><br><br>';
        }
        $body = $page->find('div[itemprop=articleBody]', 0);
        if (is_object($body)) {
            $text .= $body->innertext;
        }

        return $text;
    };

    $html = file_get_html('http://www.nextinpact.com/rss/news.xml') or $this->returnError('Could not request Nextinpact.', 404);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 3 items are built.
        if ($limit >= 3) {
            break;
        }

        $item = new \Item();
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $extractContent($item->uri);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 5 entries from the LeMondeInformatique RSS feed. Title,
 * thumbnail and content are read from the linked article page; author and
 * date come from the feed entry.
 *
 * Helpers are closures so that a second call in the same request does not
 * attempt to redeclare global functions.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Removes every "$start ... $end" section from $string. Stops when a start
    // delimiter has no matching end, instead of deleting an arbitrary prefix.
    $stripWithDelimiters = function ($string, $start, $end) {
        while (($from = strpos($string, $start)) !== false) {
            $to = strpos($string, $end, $from + strlen($start));
            if ($to === false) {
                break;
            }
            $section = substr($string, $from, $to + strlen($end) - $from);
            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    $feedUrl = 'http://www.lemondeinformatique.fr/rss/rss.xml';
    $html = $this->file_get_html($feedUrl) or $this->returnError('Could not request LeMondeInformatique: ' . $feedUrl, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 5 items are built.
        if ($limit >= 5) {
            break;
        }

        // The link is cut out of the raw entry markup between <link> and </link>.
        $rawEntry = $element->innertext;
        $linkStart = strpos($rawEntry, '<link>');
        $linkEnd = ($linkStart === false) ? false : strpos($rawEntry, '</link>', $linkStart + 6);
        if ($linkEnd === false) {
            // No usable link in this entry: nothing to download, skip it.
            continue;
        }
        $article_uri = substr($rawEntry, $linkStart + 6, $linkEnd - $linkStart - 6);

        $article_html = $this->file_get_html($article_uri) or $this->returnError('Could not request LeMondeInformatique: ' . $article_uri, 500);

        // Read the article parts defensively: a missing element yields an empty value.
        $articleBox = $article_html->find('div#article', 0);
        $illustration = is_object($articleBox) ? $articleBox->find('img#illustration', 0) : null;
        $titleNode = $article_html->find('h1.cleanprint-title', 0);

        $thumbnailUri = is_object($illustration) ? $illustration->src : '';
        $article_title = is_object($titleNode) ? $titleNode->plaintext : '';
        $article_content = '';
        if (is_object($articleBox)) {
            // Scripts and the duplicated title heading are removed from the body.
            $article_content = $stripWithDelimiters($articleBox->innertext, '<script', '</script>');
            $article_content = $stripWithDelimiters($article_content, '<h1 class="cleanprint-title"', '</h1>');
        }

        // Build and add the final item.
        $item = new \Item();
        $item->uri = $article_uri;
        $item->thumbnailUri = $thumbnailUri;
        $item->title = $article_title;
        $item->author = $stripCdata($element->find('dc:creator', 0)->innertext);
        $item->timestamp = strtotime($element->find('dc:date', 0)->plaintext);
        $item->content = $article_content;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 5 entries from the Silicon RSS feed. The content of each item
 * is assembled from the feed enclosure image plus the excerpt and body of the
 * linked article page, with script sections removed.
 *
 * The CDATA helper is a closure so that a second call in the same request does
 * not attempt to redeclare a global function.
 *
 * @param array $param Request parameters; not read by this method.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    $feedUrl = 'http://www.silicon.fr/feed';
    $html = $this->file_get_html($feedUrl) or $this->returnError('Could not request Silicon: ' . $feedUrl, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 5 items are built.
        if ($limit >= 5) {
            break;
        }

        // Retrieve the article URI from the raw entry markup and fetch that page.
        $rawEntry = $element->innertext;
        $linkStart = strpos($rawEntry, '<link>');
        $linkEnd = ($linkStart === false) ? false : strpos($rawEntry, '</link>', $linkStart + 6);
        if ($linkEnd === false) {
            // No usable link in this entry: nothing to download, skip it.
            continue;
        }
        $article_uri = substr($rawEntry, $linkStart + 6, $linkEnd - $linkStart - 6);

        $article_html = $this->file_get_html($article_uri) or $this->returnError('Could not request Silicon: ' . $article_uri, 500);

        // Build the article contents from the parts that are actually present.
        $enclosure = $element->find('enclosure', 0);
        $thumbnailUri = is_object($enclosure) ? $enclosure->url : '';

        $article_content = '';
        if ($thumbnailUri != '') {
            $article_content .= '<p><img src="' . $thumbnailUri . '" /></p>';
        }
        $excerpt = $article_html->find('div.entry-excerpt', 0);
        if (is_object($excerpt)) {
            $article_content .= '<p><b>' . $excerpt->plaintext . '</b></p>';
        }
        $body = $article_html->find('div.entry-content', 0);
        if (is_object($body)) {
            $article_content .= $body->innertext;
        }

        // Remove script sections left in the page. An opening tag without a
        // closing tag ends the clean-up rather than deleting unrelated text.
        while (($scriptStart = strpos($article_content, '<script')) !== false) {
            $scriptEnd = strpos($article_content, '</script>', $scriptStart);
            if ($scriptEnd === false) {
                break;
            }
            $script_section = substr($article_content, $scriptStart, $scriptEnd + 9 - $scriptStart);
            $article_content = str_replace($script_section, '', $article_content);
        }

        // Build and add the final item.
        $item = new \Item();
        $item->uri = $article_uri;
        $item->thumbnailUri = $thumbnailUri;
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->author = $stripCdata($element->find('dc:creator', 0)->innertext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $article_content;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects every entry of a Zone Telechargement RSS feed. Item content is the
 * feed description; no article page is downloaded.
 *
 * The CDATA helper is a closure so that a second call in the same request does
 * not attempt to redeclare a global function. The stray counter increment on
 * an undefined variable was removed: it was never read and only raised a
 * notice on every item.
 *
 * @param array $param Request parameters; the optional 'category' entry
 *                     selects the feed path segment placed before rss.xml.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Without a category the feed sits directly under the site root.
    $category = empty($param['category']) ? '/' : '/' . $param['category'] . '/';
    $url = $this->getURI() . $category . 'rss.xml';

    $html = $this->file_get_html($url) or $this->returnError('Could not request Zone Telechargement: ' . $url, 500);

    foreach ($html->find('item') as $element) {
        $item = new \Item();
        $item->title = $element->find('title', 0)->plaintext;
        // Item links are rewritten to their https form.
        $item->uri = str_replace('http://', 'https://', $element->find('guid', 0)->plaintext);
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $stripCdata($element->find('description', 0)->innertext);
        $this->items[] = $item;
    }
}
/**
 * Collects up to 5 torrents matching a t411 search. The RSS listing only
 * carries truncated data, so each torrent page is downloaded to obtain the
 * description, uploader name and a thumbnail.
 *
 * Helpers are closures so that a second call in the same request does not
 * attempt to redeclare global functions.
 *
 * @param array $param Request parameters; 'search' is required and is
 *                     inserted as-is into the RSS query string.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Utility closure for extracting CDATA fields.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Utility closure removing every "$start ... $end" section. Stops when a
    // start delimiter has no matching end, instead of deleting an arbitrary
    // prefix (which, for ' (S:' / ')', used to strip every space of the title).
    $stripWithDelimiters = function ($string, $start, $end) {
        while (($from = strpos($string, $start)) !== false) {
            $to = strpos($string, $end, $from + strlen($start));
            if ($to === false) {
                break;
            }
            $section = substr($string, $from, $to + strlen($end) - $from);
            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    // Ensure proper parameters have been provided.
    if (empty($param['search'])) {
        $this->returnError('You must specify a search criteria', 400);
    }

    // Retrieve the torrent listing as truncated RSS, which does not contain the torrent description.
    $url = 'http://www.t411.in/torrents/rss/?' . $param['search'] . '&order=added&type=desc';
    $html = file_get_html($url) or $this->returnError('Could not request t411: ' . $url, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Limit the total amount of successful page requests.
        if ($limit >= 5) {
            break;
        }

        // Requests are rate-limited, so wait before each torrent page download.
        sleep(1);

        // Retrieve data from the RSS entry.
        $item_uri = $stripCdata($element->find('guid', 0)->plaintext);
        $item_title = $stripWithDelimiters($stripCdata($element->find('title', 0)->innertext), ' (S:', ')');
        $item_date = strtotime($element->find('pubDate', 0)->plaintext);

        // Retrieve the full description from the torrent page; entries whose
        // page cannot be loaded or has no description block are skipped.
        $item_html = file_get_html($item_uri);
        if (!is_object($item_html)) {
            continue;
        }
        $item_desc = $item_html->find('div.description', 0);
        if (!is_object($item_desc)) {
            continue;
        }
        $profile = $item_html->find('a.profile', 0);
        $item_author = is_object($profile) ? $profile->innertext : '';

        // Use the first image that is not hosted on 'dreamprez' as thumbnail,
        // falling back to the generic site logo.
        $item_image = 'http://www.t411.in/themes/blue/images/logo.png';
        foreach ($item_desc->find('img') as $img) {
            if (strpos($img->src, 'dreamprez') === false) {
                $item_image = $img->src;
                break;
            }
        }

        // Build and add the final item.
        $item = new \Item();
        $item->uri = $item_uri;
        $item->title = $item_title;
        $item->author = $item_author;
        $item->timestamp = $item_date;
        $item->thumbnailUri = $item_image;
        // NOTE(review): utf8_encode assumes the page is ISO-8859-1 — confirm against the site.
        $item->content = utf8_encode($item_desc->innertext);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 10 entries from a Futura-Sciences RSS feed and replaces the
 * feed summary with the cleaned-up content of the linked article page.
 *
 * Fixes compared to the previous version:
 *  - the feed length check compared the string to 64 before calling strlen(),
 *    so over-long values were never rejected;
 *  - error messages concatenated $url before it was defined;
 *  - the section-stripping helpers could loop forever or delete unrelated text
 *    on unbalanced markup, and left the value of removed attributes behind;
 *  - helpers are closures, so repeated calls do not redeclare global functions.
 *
 * @param array $param Request parameters; 'feed' is required and may contain
 *                     only letters, dashes and at most one slash.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field.
    $stripCdata = function ($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    };

    // Removes every "$start ... $end" section from $string. The end delimiter
    // is searched after the start delimiter, so an end that also occurs inside
    // the start (e.g. the quote of 'attr="') no longer ends the section early.
    $stripWithDelimiters = function ($string, $start, $end) {
        while (($from = strpos($string, $start)) !== false) {
            $to = strpos($string, $end, $from + strlen($start));
            if ($to === false) {
                break;
            }
            $section = substr($string, $from, $to + strlen($end) - $from);
            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    // Removes every element starting with $tag_start, including nested
    // elements of the same tag name. $tag_start must begin with '<' . $tag_name.
    $stripRecursiveHtmlSection = function ($string, $tag_name, $tag_start) {
        $open_tag = '<' . $tag_name;
        $close_tag = '</' . $tag_name . '>';
        $close_tag_length = strlen($close_tag);

        if (strpos($tag_start, $open_tag) !== 0) {
            return $string;
        }

        while (($section_start = strpos($string, $tag_start)) !== false) {
            $remaining_passes = 100;
            $search_offset = $section_start;
            do {
                $remaining_passes--;
                $section_end = strpos($string, $close_tag, $search_offset);
                if ($section_end === false) {
                    // Unbalanced markup: nothing can be removed safely, and
                    // retrying would never terminate.
                    return $string;
                }
                $search_offset = $section_end + $close_tag_length;
                $section = substr($string, $section_start, $search_offset - $section_start);
                // Extend the section until it holds as many closing as opening tags.
                $open_tag_count = substr_count($section, $open_tag);
                $close_tag_count = substr_count($section, $close_tag);
            } while ($open_tag_count > $close_tag_count && $remaining_passes > 0);

            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    if (empty($param['feed'])) {
        $this->returnError('Please select a feed to display.', 400);
    }
    $feed = $param['feed'];
    if ($feed !== preg_replace('/[^a-zA-Z-\\/]+/', '', $feed) || substr_count($feed, '/') > 1 || strlen($feed) > 64) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $this->getURI() . 'rss/' . $feed . '.xml';
    $html = $this->file_get_html($url) or $this->returnError('Could not request Futura-Sciences: ' . $url, 500);

    // Page blocks that are not part of the article itself.
    $unwanted_divs = array('<div class="clear', '<div class="sharebar2', '<div class="diaporamafullscreen"', '<div style="margin-bottom:10px;" class="noprint"', '<div class="ficheprevnext', '<div class="bar noprint', '<div class="toolbar noprint', '<div class="addthis_toolbox', '<div class="noprint', '<div class="bg bglight border border-full noprint', '<div class="httplogbar-wrapper noprint', '<div id="forumcomments');

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 10 items are built.
        if ($limit >= 10) {
            break;
        }

        // The tracking fragment is dropped from the article address.
        $article_url = str_replace('#xtor=RSS-8', '', $stripCdata($element->find('guid', 0)->plaintext));
        $article = $this->file_get_html($article_url) or $this->returnError('Could not request Futura-Sciences: ' . $article_url, 500);

        $content_node = $article->find('div.content', 0);
        $contents = is_object($content_node) ? $content_node->innertext : '';

        // Prefer the author printed on the page, then the one from the feed entry.
        $author_node = $article->find('span.author', 0);
        $author = is_object($author_node) ? trim(str_replace(', Futura-Sciences', '', $author_node->plaintext)) : '';
        if (empty($author)) {
            $feed_author = $element->find('author', 0);
            $author = is_object($feed_author) ? $stripCdata($feed_author->plaintext) : '';
        }

        foreach ($unwanted_divs as $div_start) {
            $contents = $stripRecursiveHtmlSection($contents, 'div', $div_start);
        }
        $contents = $stripWithDelimiters($contents, '<hr ', '/>');
        $contents = $stripWithDelimiters($contents, '<p class="content-date', '</p>');
        $contents = $stripWithDelimiters($contents, '<h1 class="content-title', '</h1>');
        $contents = $stripWithDelimiters($contents, 'fs:definition="', '"');
        $contents = $stripWithDelimiters($contents, 'fs:xt:clicktype="', '"');
        $contents = $stripWithDelimiters($contents, 'fs:xt:clickname="', '"');

        $enclosure = $element->find('enclosure', 0);

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = $stripCdata($element->find('title', 0)->innertext);
        $item->thumbnailUri = is_object($enclosure) ? $stripCdata($enclosure->url) : '';
        $item->timestamp = strtotime($stripCdata($element->find('pubDate', 0)->plaintext));
        $item->content = trim($contents);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collects up to 10 entries from a ZDNet RSS feed and replaces the feed
 * summary with the cleaned-up content of the linked article page, prefixed
 * with the thumbnail (when the article does not start with an image) and the
 * feed description.
 *
 * Fixes compared to the previous version:
 *  - the feed length check compared the string to 64 before calling strlen(),
 *    so over-long values were never rejected;
 *  - $param['feed'] was read before checking that it exists;
 *  - the section-stripping helpers could loop forever or delete unrelated text
 *    on unbalanced markup;
 *  - helpers are closures, so repeated calls do not redeclare global functions.
 *
 * @param array $param Request parameters; 'feed' is required. A 'downloads!'
 *                     marker inside it switches the host from www. to downloads.
 *
 * @return void Built items are appended to $this->items.
 */
public function collectData(array $param)
{
    // Removes the CDATA wrapper markers from a feed field and trims the result.
    $stripCdata = function ($string) {
        return trim(str_replace(array('<![CDATA[', ']]>'), '', $string));
    };

    // Returns the text between the first $start and the following $end,
    // or false when either delimiter is missing.
    $extractFromDelimiters = function ($string, $start, $end) {
        $from = strpos($string, $start);
        if ($from === false) {
            return false;
        }
        $from += strlen($start);
        $to = strpos($string, $end, $from);
        if ($to === false) {
            return false;
        }

        return substr($string, $from, $to - $from);
    };

    // Removes every "$start ... $end" section from $string. Stops when a start
    // delimiter has no matching end, instead of deleting an arbitrary prefix.
    $stripWithDelimiters = function ($string, $start, $end) {
        while (($from = strpos($string, $start)) !== false) {
            $to = strpos($string, $end, $from + strlen($start));
            if ($to === false) {
                break;
            }
            $section = substr($string, $from, $to + strlen($end) - $from);
            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    // Removes every element starting with $tag_start, including nested
    // elements of the same tag name. $tag_start must begin with '<' . $tag_name.
    $stripRecursiveHtmlSection = function ($string, $tag_name, $tag_start) {
        $open_tag = '<' . $tag_name;
        $close_tag = '</' . $tag_name . '>';
        $close_tag_length = strlen($close_tag);

        if (strpos($tag_start, $open_tag) !== 0) {
            return $string;
        }

        while (($section_start = strpos($string, $tag_start)) !== false) {
            $remaining_passes = 100;
            $search_offset = $section_start;
            do {
                $remaining_passes--;
                $section_end = strpos($string, $close_tag, $search_offset);
                if ($section_end === false) {
                    // Unbalanced markup: nothing can be removed safely, and
                    // retrying would never terminate.
                    return $string;
                }
                $search_offset = $section_end + $close_tag_length;
                $section = substr($string, $section_start, $search_offset - $section_start);
                // Extend the section until it holds as many closing as opening tags.
                $open_tag_count = substr_count($section, $open_tag);
                $close_tag_count = substr_count($section, $close_tag);
            } while ($open_tag_count > $close_tag_count && $remaining_passes > 0);

            $string = str_replace($section, '', $string);
        }

        return $string;
    };

    $baseUri = $this->getURI();
    $feed = isset($param['feed']) ? $param['feed'] : '';
    if (empty($feed)) {
        $this->returnError('Please select a feed to display.', 400);
    }
    if (strpos($feed, 'downloads!') !== false) {
        $feed = str_replace('downloads!', '', $feed);
        $baseUri = str_replace('www.', 'downloads.', $baseUri);
    }
    if ($feed !== preg_replace('/[^a-zA-Z0-9-\\/]+/', '', $feed) || substr_count($feed, '/') > 1 || strlen($feed) > 64) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $baseUri . trim($feed, '/') . '/rss.xml';
    $html = $this->file_get_html($url) or $this->returnError('Could not request ZDNet: ' . $url, 500);

    // Page blocks that are not part of the article itself.
    $unwanted_divs = array('<div class="shareBar"', '<div class="shortcodeGalleryWrapper"', '<div class="relatedContent', '<div class="downloadNow', '<div data-shortcode', '<div id="sharethrough', '<div id="inpage-video');

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // One article download per item: stop once 10 items are built.
        if ($limit >= 10) {
            break;
        }

        // Link and author are cut out of the raw entry markup; the #ftag
        // fragment is dropped from the article address.
        $article_url = preg_replace('/([^#]+)#ftag=.*/', '$1', $stripCdata($extractFromDelimiters($element->innertext, '<link>', '</link>')));
        $article_author = $stripCdata($extractFromDelimiters($element->innertext, 'role="author">', '<'));
        $article_title = $stripCdata($element->find('title', 0)->plaintext);
        $article_subtitle = $stripCdata($element->find('description', 0)->plaintext);
        $article_timestamp = strtotime($stripCdata($element->find('pubDate', 0)->plaintext));

        $article = $this->file_get_html($article_url) or $this->returnError('Could not request ZDNet: ' . $article_url, 500);

        // Author: feed entry first, then the page meta tag, then a generic name.
        if (!empty($article_author)) {
            $author = $article_author;
        } else {
            $author_meta = $article->find('meta[name=author]', 0);
            $author = is_object($author_meta) ? $author_meta->content : 'ZDNet';
        }

        $thumbnail_meta = $article->find('meta[itemprop=image]', 0);
        $thumbnail = is_object($thumbnail_meta) ? $thumbnail_meta->content : '';

        $article_node = $article->find('article', 0);
        $contents = is_object($article_node) ? $article_node->innertext : '';
        foreach ($unwanted_divs as $div_start) {
            $contents = $stripRecursiveHtmlSection($contents, 'div', $div_start);
        }
        $contents = $stripWithDelimiters($contents, '<script', '</script>');
        $contents = $stripWithDelimiters($contents, '<meta itemprop="image"', '>');
        $contents = trim($stripWithDelimiters($contents, '<section class="sharethrough-top', '</section>'));

        // Prepend the thumbnail only when one is known and the article does
        // not already show an image within its first 512 bytes.
        $content_img = strpos($contents, '<img');
        if ($content_img !== false && $content_img < 512 || $thumbnail == '') {
            $content_img = '';
        } else {
            $content_img = '<p><img src="' . $thumbnail . '" /></p>';
        }
        $contents = $content_img . '<p><b>' . $article_subtitle . '</b></p>' . $contents;

        // Items without an image of their own get the site logo as thumbnail.
        if ($thumbnail == '') {
            $thumbnail = 'http://zdnet1.cbsistatic.com/fly/bundles/zdnetcss/images/logos/logo-192x192.png';
        }

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->thumbnailUri = $thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = $contents;
        $this->items[] = $item;
        $limit++;
    }
}