/**
 * Build the article markup for a Tumblr post from the post's own RSS feed.
 *
 * The feed is requested from "<post url>/rss". The first <item> of the feed
 * supplies the headline and body; the channel title becomes the text of the
 * source link in the <address> footer.
 *
 * @param string $url         URL of the Tumblr post.
 * @param string $page_source Not used by this parser; kept so the signature
 *                            matches the other parse_* functions.
 * @return string|false Article HTML, or false (after printing a message)
 *                      when no usable feed could be retrieved.
 */
function parse_tumblr($url, $page_source)
{
    $invalid_feed_message = 'Did not recieve a valid rss feed, check your post url by adding /rss after it';

    // get feed source; drop trailing slashes so we never request ".../post//rss"
    $feed = get_source_width_curl(rtrim($url, '/') . '/rss');
    $feed_source = isset($feed["content"]) ? (string) $feed["content"] : '';

    // parse feed
    if (!stristr($feed_source, '</rss>')) {
        print $invalid_feed_message;
        return false;
    }

    $xml_parsed = simplexml_load_string($feed_source, null, LIBXML_NOCDATA);

    // A closing </rss> tag does not guarantee well-formed XML or that the
    // feed actually contains a post, so check before reading from it.
    if ($xml_parsed === false || !isset($xml_parsed->channel->item)) {
        print $invalid_feed_message;
        return false;
    }

    $item = $xml_parsed->channel->item;
    $h1 = $item->title;
    $title = $xml_parsed->channel->title;
    $content_decoded = $item->description;

    // wrap in article structure; the URL is caller-supplied text going into
    // an attribute, so it is escaped
    return '<div class="article"><h1>' . $h1 . '</h1>' . $content_decoded
        . '<address><a href="' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . '">' . $title . '</a></address></div>';
}
/**
 * Build "micropost" markup for a single tweet by scraping the tweet page.
 *
 * Date, body, author names and avatar are read from $page_source with CSS
 * selectors. Links in the body that point at t.co are fetched and replaced
 * by the target found in the short link's meta-refresh.
 *
 * @param string $url         Tweet URL; must contain "/status/".
 * @param string $page_source HTML source of the tweet page.
 * @return string|null Tweet HTML, or null (after printing a message) when
 *                     the URL is not a status URL or the expected markup is
 *                     not present in the page.
 */
function parse_twitter($url, $page_source)
{
    // NOTE(review): nothing visible in this function reads a value from
    // settings.php; kept in place in case the include has side effects.
    include "../settings.php";

    if (!stristr($url, '/status/')) {
        print 'not a valid status URL, should contain "/status/"';
        return null;
    }

    $html = str_get_html($page_source);
    if (!$html) {
        print 'could not parse the page source of the tweet';
        return null;
    }

    // Follow a chain of selectors, taking the first match at every step.
    // Yields null as soon as one step has no match, instead of calling
    // find() on a missing node.
    $find_first = function ($node, $selectors) {
        foreach ($selectors as $selector) {
            if (!$node) {
                return null;
            }
            $node = $node->find($selector, 0);
        }
        return $node ? $node : null;
    };

    // get content
    $date_node = $find_first($html, array('div.client-and-actions', 'span.metadata', 'span'));
    $body_node = $find_first($html, array('p.tweet-text'));
    $account_node = $find_first($html, array('a.account-group'));
    $screen_name_node = $find_first($account_node, array('span.username', 'b'));
    $full_name_node = $find_first($account_node, array('strong.fullname'));
    $avatar_node = $find_first($account_node, array('img.avatar'));

    if (!$date_node || !$body_node || !$screen_name_node || !$full_name_node || !$avatar_node) {
        print 'could not find the tweet in the page source';
        return null;
    }

    $tweet_date = $date_node->innertext;
    $tweet_body = $body_node->innertext;
    $tweet_user_screen_name = $screen_name_node->innertext;
    $tweet_user_name = $full_name_node->innertext;
    $tweet_user_profile_image_url = $avatar_node->src;

    // translate t.co addresses
    $body_dom = str_get_html($tweet_body);
    if ($body_dom) {
        $refresh_marker = 'content="0;URL=';
        foreach ($body_dom->find("a") as $a) {
            if (!strstr($a->href, '://t.co/')) {
                continue;
            }

            $tco_src = get_source_width_curl($a->href);
            $tco_html = isset($tco_src['content']) ? (string) $tco_src['content'] : '';

            // Only rewrite the link when the short-link page really carries
            // a meta refresh; otherwise the t.co link is left untouched.
            $url_start = strpos($tco_html, $refresh_marker);
            if ($url_start === false) {
                continue;
            }
            $url_start += strlen($refresh_marker);
            $url_end = strpos($tco_html, '"', $url_start);
            if ($url_end === false) {
                continue;
            }

            $real_url = substr($tco_html, $url_start, $url_end - $url_start);
            $a->href = $real_url;
            $a->innertext = $real_url;
        }
        $tweet_body = $body_dom->save();
    }

    // construct and return tweet html
    $profile_url = 'http://twitter.com/' . $tweet_user_screen_name;

    $the_tweet = '<div class="micropost">';
    $the_tweet .= '<div class="micropost_body">' . $tweet_body . '</div>';
    // the URL is caller-supplied text going into an attribute, so escape it
    $the_tweet .= '<div class="micropost_date"><a href="' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . '">' . $tweet_date . '</a></div>';
    $the_tweet .= '<a href="' . $profile_url . '"><img class="micropost_img" src="' . $tweet_user_profile_image_url . '"></a>';
    $the_tweet .= '<div class="micropost_author"><span class="fullname">' . $tweet_user_name . '</span> <a href="' . $profile_url . '">@' . $tweet_user_screen_name . '</a></div>';
    $the_tweet .= '</div>';

    return $the_tweet;
}
/**
 * Element callback that stores remote images locally and rewrites the tag.
 *
 * Only acts on <img> elements. The image is downloaded, saved under
 * "../media/<md5 of content>.<extension>" (if not already there) and the
 * element's src is pointed at that file with a max-width style equal to the
 * image's real width. The element is removed from the output (outertext set
 * to '') when it has no src, when nothing could be downloaded, when the file
 * is not a readable image, or when either dimension is below 40 pixels.
 *
 * @param object $element DOM element exposing tag, src, style and outertext
 *                        properties; modified in place.
 * @return void
 */
function download_images($element)
{
    if ($element->tag !== 'img') {
        return;
    }

    if (!$element->src) {
        $element->outertext = '';
        return;
    }

    $image = get_source_width_curl($element->src);
    $image_data = isset($image["content"]) ? $image["content"] : '';

    // Nothing downloaded: drop the tag instead of caching an empty file.
    if (!is_string($image_data) || $image_data === '') {
        $element->outertext = '';
        return;
    }

    // Extension is the MIME subtype only. Parameters such as
    // "; charset=binary" are ignored, and a missing or malformed type
    // gives an empty extension rather than an undefined-offset notice.
    $ext = '';
    $mime_type = isset($image["type"]) ? (string) $image["type"] : '';
    if (preg_match('~^\s*[^/\s;]+/([A-Za-z0-9][A-Za-z0-9.+-]*)~', $mime_type, $matches)) {
        $ext = $matches[1];
    }

    $filename = md5($image_data) . '.' . $ext;
    $path = '../media/' . $filename;
    if (!file_exists($path)) {
        file_put_contents($path, $image_data);
    }

    // Absolute location of the same file: this file's path with the trailing
    // 'operations/save_html_from_url.php' replaced by 'media/<filename>'.
    $fullpath = substr(__FILE__, 0, -1 * strlen('operations/save_html_from_url.php')) . 'media/' . $filename;

    // getimagesize() is noisy on files that are not images; the false
    // return value is handled explicitly below.
    $imagesize = @getimagesize($fullpath);

    if ($imagesize === false || $imagesize[0] < 40 || $imagesize[1] < 40) {
        $element->outertext = '';
        return;
    }

    $element->src = $path;
    $element->style = 'max-width:' . $imagesize[0] . 'px;';
}
/**
 * Build the article markup for a WordPress post from its single-post feed.
 *
 * "feed=rss2&withoutcomments=1" is added to the post URL's query string and
 * the resulting feed is fetched. When the feed carries <content:encoded>,
 * that content is cleaned (tracking/share widgets removed, attributes
 * stripped, unknown tags turned into <p>) and wrapped in the article
 * structure. In every other case the work is handed to fallback_parser().
 *
 * @param string $url         URL of the WordPress post.
 * @param string $page_source HTML source of the post page; only passed on to
 *                            fallback_parser().
 * @return mixed Article HTML string, or whatever fallback_parser() returns.
 */
function parse_wordpress($url, $page_source)
{
    // feed url
    $url_parsed = parse_url($url);
    if (!is_array($url_parsed)) {
        return fallback_parser($url, $page_source);
    }

    // Always request the rss2 feed, whether or not the URL already has a
    // query string: it is the format in which <content:encoded> is looked for.
    $feed_args = 'feed=rss2&withoutcomments=1';
    if (isset($url_parsed['query']) && $url_parsed['query'] !== '') {
        $url_parsed['query'] .= '&' . $feed_args;
    } else {
        $url_parsed['query'] = $feed_args;
    }
    $single_post_feed_url = glue_url($url_parsed);

    // get feed source
    $feed = get_source_width_curl($single_post_feed_url);
    $single_post_feed_source = isset($feed["content"]) ? (string) $feed["content"] : '';

    // fallback parser if not full content in single post feed
    if (!stristr($single_post_feed_source, "<content:encoded>")) {
        return fallback_parser($url, $page_source);
    }

    // content
    $xml_parsed = simplexml_load_string($single_post_feed_source, null, LIBXML_NOCDATA);
    if ($xml_parsed === false || !isset($xml_parsed->channel->item)) {
        return fallback_parser($url, $page_source);
    }

    $item = $xml_parsed->channel->item;
    $link = $item->link;
    $h1 = $item->title;
    $title = $xml_parsed->channel->title;
    $content_decoded = $item->children('content', true)->encoded;

    // parse content
    $content = str_get_html((string) $content_decoded[0]);
    if (!$content) {
        return fallback_parser($url, $page_source);
    }

    // remove facebook iframes
    foreach ($content->find("iframe") as $iframe) {
        if (strstr($iframe->src, 'facebook.com')) {
            $iframe->outertext = '';
        }
    }

    // remove flattr-links and wordpress.com feed links
    foreach ($content->find("a") as $a) {
        if (strstr($a->href, 'flattrss_redirect') || strstr($a->href, 'feeds.wordpress.com')) {
            $a->outertext = '';
        }
    }

    // tags that survive as they are; every other tag is converted to p
    $kept_tags = array(
        'img', 'span', 'a', 'li', 'ul', 'ol', 'object', 'embed',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'i', 'em', 'b', 'strong', 'blockquote',
    );

    // Per-element clean-up, handed to set_callback() below. A closure is used
    // rather than a named function declared in here, because a named function
    // would be declared again (a fatal error) on the second call.
    $clean_element = function ($element) use ($kept_tags) {
        // remove all attributes, except href and src
        foreach ($element->attr as $name => $attr) {
            if ($name !== 'src' && $name !== 'href') {
                $element->removeAttribute($name);
            }
        }

        if (!in_array($element->tag, $kept_tags, true)) {
            $element->tag = 'p';
        }

        // remove p:s and a:s without innertext
        if (($element->tag == 'p' || $element->tag == 'a') && $element->innertext == null) {
            $element->outertext = '';
        }

        // remove wordpress.com stats
        if (strstr($element->src, 'stats.wordpress.com')) {
            $element->outertext = '';
        }
    };

    $content->set_callback($clean_element);
    $content = $content->save();

    // no indent before image, if first in p
    $content = str_replace(array('<p><a', '<p><img'), array('<p> <a', '<p> <img'), $content);

    // wrap in article structure; the link goes into an attribute, so escape it
    return '<div class="article"><h1>' . $h1 . '</h1>' . $content
        . '<address><a href="' . htmlspecialchars((string) $link, ENT_QUOTES, 'UTF-8') . '">' . $title . '</a></address></div>';
}