/**
 * Collect up to five articles from the page returned by getURI().
 *
 * For each <article> element on the listing page the linked article is
 * downloaded, its 'div.articlebodyonly' markup is cleaned (clearing divs and
 * scripts removed) and an Item is appended to $this->items.
 *
 * @param array $param Request parameters; not read by this method.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    if (!function_exists('StripRecursiveHTMLSection')) {
        /**
         * Remove every element opened by $tag_start, including nested
         * elements of the same $tag_name, by balancing open and close tags.
         *
         * Nothing is removed when $tag_start does not begin with
         * '<' . $tag_name, and stripping stops as soon as a section has no
         * closing tag (previously that case could loop forever).
         */
        function StripRecursiveHTMLSection($string, $tag_name, $tag_start)
        {
            $open_tag = '<' . $tag_name;
            $close_tag = '</' . $tag_name . '>';
            $close_tag_length = strlen($close_tag);
            if (strpos($tag_start, $open_tag) !== 0) {
                return $string;
            }
            while (($section_start = strpos($string, $tag_start)) !== false) {
                $max_recursion = 100;
                $search_offset = $section_start;
                do {
                    $max_recursion--;
                    $section_end = strpos($string, $close_tag, $search_offset);
                    if ($section_end === false) {
                        // Unbalanced markup: give up instead of removing garbage.
                        return $string;
                    }
                    $search_offset = $section_end + $close_tag_length;
                    $section_to_remove = substr($string, $section_start, $search_offset - $section_start);
                    $open_tag_count = substr_count($section_to_remove, $open_tag);
                    $close_tag_count = substr_count($section_to_remove, $close_tag);
                } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
                $string = str_replace($section_to_remove, '', $string);
            }
            return $string;
        }
    }

    $html = $this->file_get_html($this->getURI())
        or $this->returnError('Could not request TheHackerNews: ' . $this->getURI(), 500);

    $limit = 0;
    foreach ($html->find('article') as $element) {
        // Only the first five articles are fetched.
        if ($limit >= 5) {
            break;
        }

        $title_link = $element->find('a.entry-title', 0);
        $article_url = $title_link->href;
        $article_author = trim($element->find('span.vcard', 0)->plaintext);
        $article_title = $title_link->plaintext;
        $article_timestamp = strtotime($element->find('span.updated', 0)->plaintext);
        $article_thumbnail = $element->find('img', 0)->src;

        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request TheHackerNews: ' . $article_url, 500);

        $contents = $article->find('div.articlebodyonly', 0)->innertext;
        $contents = StripRecursiveHTMLSection($contents, 'div', '<div class=\'clear\'');
        $contents = StripWithDelimiters($contents, '<script', '</script>');

        $item = new \Item();
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->author = $article_author;
        $item->thumbnailUri = $article_thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = trim($contents);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collect up to ten articles from the RSS feed of the requested category.
 *
 * The feed URL is getURI() . 'rss/' . $category . '/'. Each feed item's
 * article page is downloaded, its 'div.wysiwyg' markup is cleaned and an Item
 * is appended to $this->items.
 *
 * @param array $param Request parameters; 'category' selects the feed and
 *                     falls back to 'all' when empty or missing. It may only
 *                     contain lowercase letters and dashes, 32 characters max.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('ExtractFromDelimiters')) {
        /**
         * Return the text found between the first $start and the following
         * $end, false when $start is absent, '' when $end is absent.
         */
        function ExtractFromDelimiters($string, $start, $end)
        {
            $start_pos = strpos($string, $start);
            if ($start_pos === false) {
                return false;
            }
            $section_retrieved = (string) substr($string, $start_pos + strlen($start));
            $end_pos = strpos($section_retrieved, $end);
            return $end_pos === false ? '' : substr($section_retrieved, 0, $end_pos);
        }
    }

    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    $category = isset($param['category']) ? $param['category'] : '';
    if (empty($category)) {
        $category = 'all';
    }
    // The length check used to read strlen($category > 32), which measured a
    // boolean instead of the string.
    if ($category !== preg_replace('/[^a-z-]+/', '', $category) || strlen($category) > 32) {
        $this->returnError('Invalid "category" parameter.', 400);
    }

    $url = $this->getURI() . 'rss/' . $category . '/';
    $html = $this->file_get_html($url) or $this->returnError('Could not request Nextgov: ' . $url, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Only the first ten feed items are fetched.
        if ($limit >= 10) {
            break;
        }

        $item_markup = $element->innertext;
        $article_url = ExtractFromDelimiters($item_markup, '<link>', '</link>');
        $article_author = ExtractFromDelimiters($item_markup, 'dc/elements/1.1/">', '</dc:creator>');
        $article_title = $element->find('title', 0)->plaintext;
        $article_subtitle = $element->find('description', 0)->plaintext;
        $article_timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $article_thumbnail = ExtractFromDelimiters($item_markup, '<media:content url="', '"');

        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request Nextgov: ' . $article_url, 500);

        $contents = $article->find('div.wysiwyg', 0)->innertext;
        $contents = StripWithDelimiters($contents, '<div class="ad-container">', '</div>');
        $contents = StripWithDelimiters($contents, '<div', '</div>'); //ad outer div
        $contents = StripWithDelimiters($contents, '<script', '</script>');

        // Loose comparison on purpose: the thumbnail is false when the feed
        // item carries no <media:content> tag.
        $contents = ($article_thumbnail == '' ? '' : '<p><img src="' . $article_thumbnail . '" /></p>')
            . '<p><b>' . $article_subtitle . '</b></p>'
            . trim($contents);
        if ($article_thumbnail == '') {
            $article_thumbnail = 'http://cdn.nextgov.com/nextgov/images/logo.png';
        }

        $item = new \Item();
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->author = $article_author;
        $item->thumbnailUri = $article_thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = $contents;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Tidy a post's HTML before it is published as item content.
 *
 * Replaces the ':arrow:' placeholder with an arrow character, prefixes
 * relative 'attachments/' links with $site_url and removes <script> sections.
 *
 * @param string $content  Post markup to clean.
 * @param string $site_url Prefix inserted in front of 'attachments/' links.
 * @return string The cleaned markup.
 */
function cleanup_post_content($content, $site_url)
{
    // str_replace() applies array replacements one after the other, in order.
    $needles = array(':arrow:', 'href="attachments/');
    $substitutes = array('➤', 'href="' . $site_url . 'attachments/');
    $rewritten = str_replace($needles, $substitutes, $content);

    return StripWithDelimiters($rewritten, '<script', '</script>');
}
/**
 * Clean raw article markup before it is used as item content.
 *
 * Everything up to and including the '</script></div><p>' marker is dropped
 * (the paragraph opening is re-added), then inline scripts, related-links
 * shortcodes and click-to-enlarge anchors are removed.
 *
 * @param string $article_html Raw article markup.
 * @return string The cleaned markup.
 */
function CleanArticle($article_html)
{
    $marker = '</script></div><p>';
    $marker_pos = strpos($article_html, $marker);
    // Only cut when the marker is really there: adding the marker length to a
    // false strpos() result used to chop the first 18 characters blindly.
    if ($marker_pos !== false) {
        $article_html = '<p>' . substr($article_html, $marker_pos + strlen($marker));
    }

    $article_html = StripWithDelimiters($article_html, '<script>', '</script>');
    $article_html = StripWithDelimiters($article_html, '<div class="shortcode related-links', '</div>');
    $article_html = StripWithDelimiters($article_html, '<a class="clickToEnlarge">', '</a>');

    return $article_html;
}
/**
 * Collect up to five articles from the feed at getURI() . 'feed/'.
 *
 * Each feed item's article page is downloaded (and gunzipped when needed),
 * its 'div.wlistingsingletext' markup is cleaned, prefixed with the item image
 * and summary, and an Item is appended to $this->items.
 *
 * @param array $param Request parameters; not read by this method.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('ExtractFromDelimiters')) {
        /**
         * Return the text found between the first $start and the following
         * $end, false when $start is absent, '' when $end is absent.
         */
        function ExtractFromDelimiters($string, $start, $end)
        {
            $start_pos = strpos($string, $start);
            if ($start_pos === false) {
                return false;
            }
            $section_retrieved = (string) substr($string, $start_pos + strlen($start));
            $end_pos = strpos($section_retrieved, $end);
            return $end_pos === false ? '' : substr($section_retrieved, 0, $end_pos);
        }
    }

    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    $feed = $this->getURI() . 'feed/';
    $html = $this->file_get_html($feed)
        or $this->returnError('Could not request ' . $this->getName() . ': ' . $feed, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Only the first five feed items are fetched.
        if ($limit >= 5) {
            break;
        }

        $article_image = $element->find('image', 0)->plaintext;
        $article_url = ExtractFromDelimiters($element->innertext, '<link>', '</link>');
        $article_summary = ExtractFromDelimiters($element->innertext, '<description><![CDATA[<p>', '</p>');

        $article_html = file_get_contents($article_url)
            or $this->returnError('Could not request ' . $this->getName() . ': ' . $article_url, 500);

        // Response is GZipped even if we didn't accept GZip!? Let's decompress...
        // The two magic bytes (ID1 = 0x1f, ID2 = 0x8b) are written as escapes so
        // the check does not depend on the encoding of this source file.
        // http://www.gzip.org/zlib/rfc-gzip.html#header-trailer
        if (substr($article_html, 0, 2) === "\x1f\x8b") {
            $article_html = gzdecode($article_html);
        }

        // Now we have our HTML data. But still, that's an important HTTP violation...
        $article_html = str_get_html($article_html);

        $article_content = $article_html->find('div.wlistingsingletext', 0)->innertext;
        $article_content = StripWithDelimiters($article_content, '<script', '</script>');
        $article_content = '<p><img src="' . $article_image . '" /></p>'
            . '<p><b>' . $article_summary . '</b></p>'
            . trim($article_content);

        $item = new \Item();
        $item->uri = $article_url;
        $item->thumbnailUri = $article_image;
        $item->title = $element->find('title', 0)->plaintext;
        $item->author = $article_html->find('a[rel=author]', 0)->plaintext;
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $article_content;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collect up to five articles from the 9eme Art RSS feed.
 *
 * Each feed item's page (taken from its <guid>) is downloaded; the content of
 * 'div.newsGenerique_con' is made absolute, stripped of scripts, styles and
 * <link> tags, prefixed with an image and appended to $this->items as an Item.
 *
 * @param array $param Request parameters; not read by this method.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guard keeps a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    $feedUrl = 'http://www.9emeart.fr/9emeart.rss';
    $html = $this->file_get_html($feedUrl) or $this->returnError('Could not request 9eme Art: ' . $feedUrl, 500);

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Only the first five feed items are fetched.
        if ($limit >= 5) {
            break;
        }

        //Retrieve article Uri and get that page
        $article_uri = $element->find('guid', 0)->plaintext;
        $article_html = $this->file_get_html($article_uri)
            or $this->returnError('Could not request 9eme Art: ' . $article_uri, 500);

        //Build article contents from corresponding elements
        $article_title = trim($element->find('title', 0)->plaintext);
        $enclosure_url = $element->find('enclosure', 0)->url;

        // Default to the feed enclosure; a full-size image whose alt text
        // equals the title takes precedence (the last match wins).
        $article_image = $enclosure_url;
        foreach ($article_html->find('img.img_full') as $img) {
            if ($img->alt == $article_title) {
                $article_image = 'http://www.9emeart.fr' . $img->src;
            }
        }

        $article_body = $article_html->find('div.newsGenerique_con', 0)->innertext;
        $article_content = '<p><img src="' . $article_image . '" /></p>'
            . str_replace('src="/', 'src="http://www.9emeart.fr/', $article_body);
        $article_content = StripWithDelimiters($article_content, '<script', '</script>');
        $article_content = StripWithDelimiters($article_content, '<style', '</style>');
        $article_content = StripWithDelimiters($article_content, '<link', '>');

        //Build and add final item
        $item = new \Item();
        $item->uri = $article_uri;
        $item->title = $article_title;
        $item->thumbnailUri = $enclosure_url;
        $item->author = $article_html->find('a[class=upp transition_fast upp]', 0)->plaintext;
        $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
        $item->content = $article_content;
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collect up to five torrents matching the given search from the t411 RSS feed.
 *
 * The RSS listing is truncated, so each torrent page is downloaded (one
 * request per second) to read its description, author and a thumbnail.
 * Torrents whose page cannot be downloaded are skipped and do not count
 * towards the limit.
 *
 * @param array $param Request parameters; 'search' is required and is
 *                     appended verbatim to the RSS query string.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.

    //Utility function for extracting CDATA fields
    if (!function_exists('StripCDATA')) {
        function StripCDATA($string)
        {
            $string = str_replace('<![CDATA[', '', $string);
            $string = str_replace(']]>', '', $string);
            return $string;
        }
    }

    //Utility function for removing text based on specified delimiters
    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    //Ensure proper parameters have been provided
    if (empty($param['search'])) {
        $this->returnError('You must specify a search criteria', 400);
    }

    //Retrieve torrent listing as truncated rss, which does not contain torrent description
    $url = 'http://www.t411.in/torrents/rss/?' . $param['search'] . '&order=added&type=desc';
    $html = file_get_html($url) or $this->returnError('Could not request t411: ' . $url, 500);

    $limit = 0;

    //Process each item individually
    foreach ($html->find('item') as $element) {
        //Limit total amount of requests
        if ($limit >= 5) {
            break;
        }

        //Requests are rate-limited
        sleep(1); //So we need to wait

        //Retrieve data from RSS entry
        $item_uri = StripCDATA($element->find('guid', 0)->plaintext);
        $item_title = StripWithDelimiters(StripCDATA($element->find('title', 0)->innertext), ' (S:', ')');
        $item_date = strtotime($element->find('pubDate', 0)->plaintext);

        //Retrieve full description from torrent page
        $item_html = file_get_html($item_uri);
        if (!$item_html) {
            continue;
        }

        //Retrieve data from page contents
        $item_desc = $item_html->find('div.description', 0);
        $item_author = $item_html->find('a.profile', 0)->innertext;

        //Retrieve image for thumbnail or generic logo fallback
        $item_image = 'http://www.t411.in/themes/blue/images/logo.png';
        foreach ($item_desc->find('img') as $img) {
            if (strpos($img->src, 'dreamprez') === false) {
                $item_image = $img->src;
                break;
            }
        }

        //Build and add final item
        $item = new \Item();
        $item->uri = $item_uri;
        $item->title = $item_title;
        $item->author = $item_author;
        $item->timestamp = $item_date;
        $item->thumbnailUri = $item_image;
        $item->content = utf8_encode($item_desc->innertext);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Strip unwanted sections from article markup.
 *
 * Removes every <script> section first, then every
 * <h1 class="cleanprint-title"> heading.
 *
 * @param string $article_html Raw article markup.
 * @return string The markup without those sections.
 */
function CleanArticle($article_html)
{
    // Opening delimiter => closing delimiter, applied in this order.
    $unwanted_sections = array(
        '<script' => '</script>',
        '<h1 class="cleanprint-title"' => '</h1>',
    );

    $cleaned = $article_html;
    foreach ($unwanted_sections as $opening => $closing) {
        $cleaned = StripWithDelimiters($cleaned, $opening, $closing);
    }

    return $cleaned;
}
/**
 * Collect up to ten articles from the requested Futura-Sciences RSS feed.
 *
 * The feed URL is getURI() . 'rss/' . $param['feed'] . '.xml'. Each feed
 * item's article page is downloaded, its 'div.content' markup is stripped of
 * toolbars, share bars and tracking attributes, and an Item is appended to
 * $this->items.
 *
 * @param array $param Request parameters; 'feed' is required, may contain
 *                     letters, dashes and at most one slash, 64 characters max.
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('StripCDATA')) {
        /** Remove CDATA opening and closing markers from $string. */
        function StripCDATA($string)
        {
            $string = str_replace('<![CDATA[', '', $string);
            $string = str_replace(']]>', '', $string);
            return $string;
        }
    }

    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter, so a
         * start such as 'fs:definition="' paired with the end '"' removes the
         * whole attribute instead of stopping at the quote inside the start
         * delimiter. An unterminated section is left untouched.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    if (!function_exists('StripRecursiveHTMLSection')) {
        /**
         * Remove every element opened by $tag_start, including nested
         * elements of the same $tag_name, by balancing open and close tags.
         *
         * Nothing is removed when $tag_start does not begin with
         * '<' . $tag_name, and stripping stops as soon as a section has no
         * closing tag (previously that case could loop forever).
         */
        function StripRecursiveHTMLSection($string, $tag_name, $tag_start)
        {
            $open_tag = '<' . $tag_name;
            $close_tag = '</' . $tag_name . '>';
            $close_tag_length = strlen($close_tag);
            if (strpos($tag_start, $open_tag) !== 0) {
                return $string;
            }
            while (($section_start = strpos($string, $tag_start)) !== false) {
                $max_recursion = 100;
                $search_offset = $section_start;
                do {
                    $max_recursion--;
                    $section_end = strpos($string, $close_tag, $search_offset);
                    if ($section_end === false) {
                        // Unbalanced markup: give up instead of removing garbage.
                        return $string;
                    }
                    $search_offset = $section_end + $close_tag_length;
                    $section_to_remove = substr($string, $section_start, $search_offset - $section_start);
                    $open_tag_count = substr_count($section_to_remove, $open_tag);
                    $close_tag_count = substr_count($section_to_remove, $close_tag);
                } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
                $string = str_replace($section_to_remove, '', $string);
            }
            return $string;
        }
    }

    // The two messages below used to append $url, which is not defined yet at
    // this point and only produced an "undefined variable" notice.
    if (empty($param['feed'])) {
        $this->returnError('Please select a feed to display.', 400);
    }
    // The length check used to read strlen($param['feed'] > 64), which
    // measured a boolean instead of the string.
    if ($param['feed'] !== preg_replace('/[^a-zA-Z-\\/]+/', '', $param['feed'])
        || substr_count($param['feed'], '/') > 1
        || strlen($param['feed']) > 64
    ) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $this->getURI() . 'rss/' . $param['feed'] . '.xml';
    $html = $this->file_get_html($url) or $this->returnError('Could not request Futura-Sciences: ' . $url, 500);

    // Opening tags of the <div> sections removed from every article body.
    $unwanted_divs = array(
        '<div class="clear',
        '<div class="sharebar2',
        '<div class="diaporamafullscreen"',
        '<div style="margin-bottom:10px;" class="noprint"',
        '<div class="ficheprevnext',
        '<div class="bar noprint',
        '<div class="toolbar noprint',
        '<div class="addthis_toolbox',
        '<div class="noprint',
        '<div class="bg bglight border border-full noprint',
        '<div class="httplogbar-wrapper noprint',
        '<div id="forumcomments',
    );

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Only the first ten feed items are fetched.
        if ($limit >= 10) {
            break;
        }

        $article_url = str_replace('#xtor=RSS-8', '', StripCDATA($element->find('guid', 0)->plaintext));
        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request Futura-Sciences: ' . $article_url, 500);

        $contents = $article->find('div.content', 0)->innertext;

        // Prefer the author shown on the page, fall back to the feed's author.
        $author = trim(str_replace(', Futura-Sciences', '', $article->find('span.author', 0)->plaintext));
        if (empty($author)) {
            $author = StripCDATA($element->find('author', 0)->plaintext);
        }

        foreach ($unwanted_divs as $div_start) {
            $contents = StripRecursiveHTMLSection($contents, 'div', $div_start);
        }
        $contents = StripWithDelimiters($contents, '<hr ', '/>');
        $contents = StripWithDelimiters($contents, '<p class="content-date', '</p>');
        $contents = StripWithDelimiters($contents, '<h1 class="content-title', '</h1>');
        $contents = StripWithDelimiters($contents, 'fs:definition="', '"');
        $contents = StripWithDelimiters($contents, 'fs:xt:clicktype="', '"');
        $contents = StripWithDelimiters($contents, 'fs:xt:clickname="', '"');

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = StripCDATA($element->find('title', 0)->innertext);
        $item->thumbnailUri = StripCDATA($element->find('enclosure', 0)->url);
        $item->timestamp = strtotime(StripCDATA($element->find('pubDate', 0)->plaintext));
        $item->content = trim($contents);
        $this->items[] = $item;
        $limit++;
    }
}
/**
 * Collect up to ten articles from the requested ZDNet RSS feed.
 *
 * The feed URL is $baseUri . trim($feed, '/') . '/rss.xml', where $baseUri is
 * getURI() (with 'www.' swapped for 'downloads.' when the feed name contains
 * 'downloads!'). Each feed item's article page is downloaded, its <article>
 * markup is cleaned, and an Item is appended to $this->items.
 *
 * @param array $param Request parameters; 'feed' is required, may contain
 *                     letters, digits, dashes and at most one slash,
 *                     64 characters max (after removing 'downloads!').
 */
public function collectData(array $param)
{
    // Named functions declared inside a method become global the first time
    // the method runs; the function_exists() guards keep a second call from
    // failing with a fatal "Cannot redeclare" error.
    if (!function_exists('StripCDATA')) {
        /** Remove CDATA markers from $string and trim the result. */
        function StripCDATA($string)
        {
            $string = str_replace('<![CDATA[', '', $string);
            $string = str_replace(']]>', '', $string);
            return trim($string);
        }
    }

    if (!function_exists('ExtractFromDelimiters')) {
        /**
         * Return the text found between the first $start and the following
         * $end, false when $start is absent, '' when $end is absent.
         */
        function ExtractFromDelimiters($string, $start, $end)
        {
            $start_pos = strpos($string, $start);
            if ($start_pos === false) {
                return false;
            }
            $section_retrieved = (string) substr($string, $start_pos + strlen($start));
            $end_pos = strpos($section_retrieved, $end);
            return $end_pos === false ? '' : substr($section_retrieved, 0, $end_pos);
        }
    }

    if (!function_exists('StripWithDelimiters')) {
        /**
         * Remove every section of $string that starts with $start and runs
         * through the next occurrence of $end.
         *
         * The end delimiter is searched for after the start delimiter; an
         * unterminated section is left untouched rather than being cut at an
         * arbitrary length.
         */
        function StripWithDelimiters($string, $start, $end)
        {
            if ($start === '' || $end === '') {
                return $string;
            }
            while (($start_pos = strpos($string, $start)) !== false) {
                $end_pos = strpos($string, $end, $start_pos + strlen($start));
                if ($end_pos === false) {
                    break;
                }
                $string = substr($string, 0, $start_pos) . substr($string, $end_pos + strlen($end));
            }
            return $string;
        }
    }

    if (!function_exists('StripRecursiveHTMLSection')) {
        /**
         * Remove every element opened by $tag_start, including nested
         * elements of the same $tag_name, by balancing open and close tags.
         *
         * Nothing is removed when $tag_start does not begin with
         * '<' . $tag_name, and stripping stops as soon as a section has no
         * closing tag (previously that case could loop forever).
         */
        function StripRecursiveHTMLSection($string, $tag_name, $tag_start)
        {
            $open_tag = '<' . $tag_name;
            $close_tag = '</' . $tag_name . '>';
            $close_tag_length = strlen($close_tag);
            if (strpos($tag_start, $open_tag) !== 0) {
                return $string;
            }
            while (($section_start = strpos($string, $tag_start)) !== false) {
                $max_recursion = 100;
                $search_offset = $section_start;
                do {
                    $max_recursion--;
                    $section_end = strpos($string, $close_tag, $search_offset);
                    if ($section_end === false) {
                        // Unbalanced markup: give up instead of removing garbage.
                        return $string;
                    }
                    $search_offset = $section_end + $close_tag_length;
                    $section_to_remove = substr($string, $section_start, $search_offset - $section_start);
                    $open_tag_count = substr_count($section_to_remove, $open_tag);
                    $close_tag_count = substr_count($section_to_remove, $close_tag);
                } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
                $string = str_replace($section_to_remove, '', $string);
            }
            return $string;
        }
    }

    $baseUri = $this->getURI();
    $feed = isset($param['feed']) ? $param['feed'] : '';
    if (empty($feed)) {
        $this->returnError('Please select a feed to display.', 400);
    }
    // Feeds flagged with 'downloads!' are served from the downloads subdomain.
    if (strpos($feed, 'downloads!') !== false) {
        $feed = str_replace('downloads!', '', $feed);
        $baseUri = str_replace('www.', 'downloads.', $baseUri);
    }
    // The length check used to read strlen($feed > 64), which measured a
    // boolean instead of the string.
    if ($feed !== preg_replace('/[^a-zA-Z0-9-\\/]+/', '', $feed)
        || substr_count($feed, '/') > 1
        || strlen($feed) > 64
    ) {
        $this->returnError('Invalid "feed" parameter.', 400);
    }

    $url = $baseUri . trim($feed, '/') . '/rss.xml';
    $html = $this->file_get_html($url) or $this->returnError('Could not request ZDNet: ' . $url, 500);

    // Opening tags of the <div> sections removed from every article body.
    $unwanted_divs = array(
        '<div class="shareBar"',
        '<div class="shortcodeGalleryWrapper"',
        '<div class="relatedContent',
        '<div class="downloadNow',
        '<div data-shortcode',
        '<div id="sharethrough',
        '<div id="inpage-video',
    );

    $limit = 0;
    foreach ($html->find('item') as $element) {
        // Only the first ten feed items are fetched.
        if ($limit >= 10) {
            break;
        }

        $item_markup = $element->innertext;
        $article_url = preg_replace(
            '/([^#]+)#ftag=.*/',
            '$1',
            StripCDATA(ExtractFromDelimiters($item_markup, '<link>', '</link>'))
        );
        $article_author = StripCDATA(ExtractFromDelimiters($item_markup, 'role="author">', '<'));
        $article_title = StripCDATA($element->find('title', 0)->plaintext);
        $article_subtitle = StripCDATA($element->find('description', 0)->plaintext);
        $article_timestamp = strtotime(StripCDATA($element->find('pubDate', 0)->plaintext));

        $article = $this->file_get_html($article_url)
            or $this->returnError('Could not request ZDNet: ' . $article_url, 500);

        // Author: feed value first, then the page's meta tag, then a default.
        if (!empty($article_author)) {
            $author = $article_author;
        } else {
            $author_meta = $article->find('meta[name=author]', 0);
            $author = is_object($author_meta) ? $author_meta->content : 'ZDNet';
        }

        $thumbnail_meta = $article->find('meta[itemprop=image]', 0);
        $thumbnail = is_object($thumbnail_meta) ? $thumbnail_meta->content : '';

        $contents = $article->find('article', 0)->innertext;
        foreach ($unwanted_divs as $div_start) {
            $contents = StripRecursiveHTMLSection($contents, 'div', $div_start);
        }
        $contents = StripWithDelimiters($contents, '<script', '</script>');
        $contents = StripWithDelimiters($contents, '<meta itemprop="image"', '>');
        $contents = trim(StripWithDelimiters($contents, '<section class="sharethrough-top', '</section>'));

        // Look for first image: the thumbnail is only prepended when one is
        // known and the article does not already show an image within its
        // first 512 bytes.
        $content_img = strpos($contents, '<img');
        if (($content_img !== false && $content_img < 512) || $thumbnail == '') {
            $content_img = '';
        } else {
            $content_img = '<p><img src="' . $thumbnail . '" /></p>';
        }

        //Include thumbnail
        $contents = $content_img . '<p><b>' . $article_subtitle . '</b></p>' . $contents;
        if ($thumbnail == '') {
            $thumbnail = 'http://zdnet1.cbsistatic.com/fly/bundles/zdnetcss/images/logos/logo-192x192.png';
        }

        $item = new \Item();
        $item->author = $author;
        $item->uri = $article_url;
        $item->title = $article_title;
        $item->thumbnailUri = $thumbnail;
        $item->timestamp = $article_timestamp;
        $item->content = $contents;
        $this->items[] = $item;
        $limit++;
    }
}