Example #1
0
/**
 * Build an article HTML fragment for a Tumblr post from the post's RSS feed.
 *
 * The feed is requested from "<post url>/rss" through get_source_width_curl();
 * the channel title and the first item's title and description are wrapped
 * in the article markup.
 *
 * @param string $url         URL of the Tumblr post.
 * @param string $page_source Not read by this parser.
 *                            NOTE(review): presumably kept so every parse_*
 *                            function shares one signature - confirm.
 *
 * @return string|false The article markup, or false (after printing a
 *                      message) when no parseable RSS feed was received.
 */
function parse_tumblr($url, $page_source)
{
    // get feed source; drop a trailing slash so we never request "...//rss"
    $feed_response = get_source_width_curl(rtrim($url, '/') . '/rss');
    $feed_source = '';
    if (isset($feed_response['content']) && is_string($feed_response['content'])) {
        $feed_source = $feed_response['content'];
    }

    // parse feed
    $xml_parsed = false;
    if (stripos($feed_source, '</rss>') !== false) {
        $xml_parsed = simplexml_load_string($feed_source, 'SimpleXMLElement', LIBXML_NOCDATA);
    }
    // simplexml_load_string() returns false on malformed XML; treat that
    // exactly like a response that is not a feed at all.
    if ($xml_parsed === false) {
        print 'Did not recieve a valid rss feed, check your post url by adding /rss after it';
        return false;
    }

    // Titles and the URL are plain text coming from outside, so they are
    // escaped before being placed into markup. The description is the post
    // body and is intentionally inserted as HTML.
    $h1 = htmlspecialchars((string) $xml_parsed->channel->item->title, ENT_QUOTES, 'UTF-8');
    $title = htmlspecialchars((string) $xml_parsed->channel->title, ENT_QUOTES, 'UTF-8');
    $href = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
    $content_decoded = (string) $xml_parsed->channel->item->description;

    // wrap in article structure
    return '<div class="article"><h1>' . $h1 . '</h1>' . $content_decoded
        . '<address><a href="' . $href . '">' . $title . '</a></address></div>';
}
Example #2
0
/**
 * Build a "micropost" HTML fragment from the HTML of a Twitter status page.
 *
 * Date, body, screen name, full name and avatar are read from
 * $page_source with simple_html_dom selectors. Links in the body that point
 * at t.co are fetched with get_source_width_curl() and replaced by the
 * target found in the page's `content="0;URL=..."` meta refresh.
 *
 * @param string $url         URL of the status; must contain "/status/".
 * @param string $page_source HTML of the status page.
 *
 * @return string|null The micropost markup, or null (after printing a
 *                     message) when the URL is not a status URL or the
 *                     expected elements are missing from the page.
 */
function parse_twitter($url, $page_source)
{
    // NOTE(review): nothing visible in this function reads a value from
    // settings.php - confirm whether the include is still needed.
    include "../settings.php";

    if (stripos($url, '/status/') === false) {
        print 'not a valid status URL, should contain "/status/"';
        return null;
    }

    // Walk a chain of selectors, taking the first match at every step.
    // Yields null as soon as one step finds nothing, instead of calling
    // find() on null (which is a fatal error).
    $pick = function ($node, array $selectors) {
        foreach ($selectors as $selector) {
            if (!$node) {
                return null;
            }
            $node = $node->find($selector, 0);
        }
        return $node ? $node : null;
    };

    // get content
    $html = str_get_html($page_source);
    $account = $html ? $html->find('a.account-group', 0) : null;
    $date_node = $pick($html, array('div.client-and-actions', 'span.metadata', 'span'));
    $body_node = $pick($html, array('p.tweet-text'));
    $screen_name_node = $pick($account, array('span.username', 'b'));
    $full_name_node = $pick($account, array('strong.fullname'));
    $avatar_node = $pick($account, array('img.avatar'));

    if (!$date_node || !$body_node || !$screen_name_node || !$full_name_node || !$avatar_node) {
        print 'could not find the tweet in the page source, the page markup may have changed';
        return null;
    }

    $tweet_date = $date_node->innertext;
    $tweet_body = $body_node->innertext;
    $tweet_user_screen_name = $screen_name_node->innertext;
    $tweet_user_name = $full_name_node->innertext;
    $tweet_user_profile_image_url = $avatar_node->src;

    // translate t.co addresses
    $body_dom = str_get_html($tweet_body);
    if ($body_dom) {
        $marker = 'content="0;URL=';
        foreach ($body_dom->find('a') as $a) {
            if (!is_string($a->href) || strpos($a->href, '://t.co/') === false) {
                continue;
            }
            $tco_src = get_source_width_curl($a->href);
            $tco_page = (isset($tco_src['content']) && is_string($tco_src['content'])) ? $tco_src['content'] : '';

            // Leave the link untouched when the redirect page does not
            // contain a complete meta refresh target.
            $url_start = strpos($tco_page, $marker);
            if ($url_start === false) {
                continue;
            }
            $url_start += strlen($marker);
            $url_end = strpos($tco_page, '"', $url_start);
            if ($url_end === false || $url_end === $url_start) {
                continue;
            }

            $real_url = substr($tco_page, $url_start, $url_end - $url_start);
            $a->href = $real_url;
            $a->innertext = $real_url;
        }
        $tweet_body = $body_dom->save();
    }

    // construct and return tweet html; $url comes from the caller and is
    // escaped for the attribute, the other values are markup taken from the page
    $status_href = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
    $profile_href = 'http://twitter.com/' . $tweet_user_screen_name;

    $the_tweet = '<div class="micropost">';
    $the_tweet .= '<div class="micropost_body">' . $tweet_body . '</div>';
    $the_tweet .= '<div class="micropost_date"><a href="' . $status_href . '">' . $tweet_date . '</a></div>';
    $the_tweet .= '<a href="' . $profile_href . '"><img class="micropost_img" src="' . $tweet_user_profile_image_url . '"></a>';
    $the_tweet .= '<div class="micropost_author"><span class="fullname">' . $tweet_user_name . '</span> <a href="' . $profile_href . '">@' . $tweet_user_screen_name . '</a></div>';
    $the_tweet .= '</div>';

    return $the_tweet;
}
 /**
  * Element callback that stores the image behind an <img> element locally.
  *
  * Elements whose tag is not "img" are left untouched. For an <img>:
  * the src is fetched with get_source_width_curl(), the bytes are written
  * to ../media/<md5 of content>.<ext> unless that file already exists, and
  * the element's src is pointed at the stored file with a max-width style.
  * The element is removed (outertext set to '') when it has no src, when
  * the download is unusable, or when the stored image is smaller than
  * 40px in either dimension or cannot be read as an image.
  *
  * @param object $element Node exposing tag, src, style and outertext
  *                        (presumably a simple_html_dom node - confirm).
  *
  * @return void
  */
 function download_images($element)
 {
     if ($element->tag != 'img') {
         return;
     }
     if (!$element->src) {
         $element->outertext = '';
         return;
     }

     $image = get_source_width_curl($element->src);
     $data = (isset($image["content"]) && is_string($image["content"])) ? $image["content"] : '';
     $type = (isset($image["type"]) && is_string($image["type"])) ? $image["type"] : '';

     // The extension is taken from the remote Content-Type, which the remote
     // server controls. Strip parameters ("; charset=...") and accept only
     // known image subtypes so a response can never be stored under an
     // executable extension such as ".php".
     $type_expl = explode("/", strtolower($type), 2);
     $main_type = trim($type_expl[0]);
     $ext = '';
     if (isset($type_expl[1])) {
         $subtype_parts = explode(';', $type_expl[1], 2);
         $ext = trim($subtype_parts[0]);
     }
     $allowed_ext = array(
         'jpeg', 'jpg', 'pjpeg', 'png', 'x-png', 'gif', 'webp',
         'bmp', 'x-ms-bmp', 'x-icon', 'vnd.microsoft.icon', 'tiff',
     );
     if ($data === '' || $main_type !== 'image' || !in_array($ext, $allowed_ext, true)) {
         $element->outertext = '';
         return;
     }

     $hash = md5($data);
     $path = '../media/' . $hash . '.' . $ext;
     if (!file_exists($path)) {
         file_put_contents($path, $data);
     }

     // Absolute location of the same file, derived from this script's own path.
     $fullpath = substr(__FILE__, 0, -1 * strlen('operations/save_html_from_url.php')) . 'media/' . $hash . '.' . $ext;
     // getimagesize() warns and returns false for unreadable or non-image
     // files; that case is handled right below by dropping the element.
     $imagesize = @getimagesize($fullpath);
     if ($imagesize === false || $imagesize[0] < 40 || $imagesize[1] < 40) {
         $element->outertext = '';
         return;
     }

     $element->src = $path;
     $element->style = 'max-width:' . $imagesize[0] . 'px;';
 }
Example #4
0
/**
 * Build an article HTML fragment for a WordPress post from its single-post
 * RSS 2 feed.
 *
 * The feed URL is the post URL with "feed=rss2&withoutcomments=1" added to
 * the query string. When the feed cannot be used (no <content:encoded>,
 * unparseable URL / XML / content) the work is handed to fallback_parser().
 * Otherwise the item's content is cleaned up: Facebook iframes, flattr
 * links, feeds.wordpress.com links and stats.wordpress.com elements are
 * removed, attributes other than src/href are stripped, and tags outside
 * the keep-list are turned into <p>.
 *
 * @param string $url         URL of the WordPress post.
 * @param string $page_source HTML of the post page; only passed on to
 *                            fallback_parser().
 *
 * @return mixed The article markup as a string, or whatever
 *               fallback_parser() returns.
 */
function parse_wordpress($url, $page_source)
{
    // feed url
    $url_parsed = parse_url($url);
    if (!is_array($url_parsed)) {
        return fallback_parser($url, $page_source);
    }
    // Both branches must ask for the RSS 2 feed: only that format carries
    // the <content:encoded> element this parser relies on.
    $feed_query = 'feed=rss2&withoutcomments=1';
    if (isset($url_parsed['query'])) {
        $url_parsed['query'] .= '&' . $feed_query;
    } else {
        $url_parsed['query'] = $feed_query;
    }
    $single_post_feed_url = glue_url($url_parsed);

    // get feed source
    $feed_response = get_source_width_curl($single_post_feed_url);
    $single_post_feed_source = '';
    if (isset($feed_response["content"]) && is_string($feed_response["content"])) {
        $single_post_feed_source = $feed_response["content"];
    }

    // fallback parser if not full content in single post feed
    if (stripos($single_post_feed_source, "<content:encoded>") === false) {
        return fallback_parser($url, $page_source);
    }

    // content
    $xml_parsed = simplexml_load_string($single_post_feed_source, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($xml_parsed === false || !isset($xml_parsed->channel->item)) {
        return fallback_parser($url, $page_source);
    }
    $item = $xml_parsed->channel->item;
    $link = (string) $item->link;
    $h1 = (string) $item->title;
    $title = (string) $xml_parsed->channel->title;
    $content_decoded = $item->children('content', true)->encoded;

    // parse content
    $content = str_get_html((string) $content_decoded[0]);
    if (!$content) {
        return fallback_parser($url, $page_source);
    }

    // remove facebook iframes
    foreach ($content->find("iframe") as $iframe) {
        if (is_string($iframe->src) && strpos($iframe->src, 'facebook.com') !== false) {
            $iframe->outertext = '';
        }
    }

    // remove flattr-links and wordpress.com feed links
    foreach ($content->find("a") as $a) {
        $href = $a->href;
        if (!is_string($href)) {
            continue;
        }
        if (strpos($href, 'flattrss_redirect') !== false || strpos($href, 'feeds.wordpress.com') !== false) {
            $a->outertext = '';
        }
    }

    // Per-element clean-up, applied through the DOM callback. This is an
    // anonymous function on purpose: a named function declared inside this
    // body would be redeclared (fatal error) on the second call.
    $keep_tags = array(
        'img', 'span', 'a', 'li', 'ul', 'ol', 'object', 'embed',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'i', 'em', 'b', 'strong', 'blockquote',
    );
    $clean_element = function ($element) use ($keep_tags) {
        // remove all attributes, except href and src
        foreach ($element->attr as $name => $attr) {
            if ($name != 'src' && $name != 'href') {
                $element->removeAttribute($name);
            }
        }
        // tags on the keep-list stay as they are, all others are converted to p
        if (!in_array($element->tag, $keep_tags, true)) {
            $element->tag = 'p';
        }
        // remove p:s and a:s without innertext
        if (($element->tag == 'p' || $element->tag == 'a') && $element->innertext == null) {
            $element->outertext = '';
        }
        // remove wordpress.com stats
        if (is_string($element->src) && strpos($element->src, 'stats.wordpress.com') !== false) {
            $element->outertext = '';
        }
    };
    $content->set_callback($clean_element);
    $content = $content->save();

    // no indent before image, if first in p
    $content = str_replace(
        array('<p><a', '<p><img'),
        array('<p>&nbsp;<a', '<p>&nbsp;<img'),
        $content
    );

    // wrap in article structure; title and link are plain text from the
    // feed and are escaped, the cleaned content is inserted as HTML
    return '<div class="article"><h1>' . htmlspecialchars($h1, ENT_QUOTES, 'UTF-8') . '</h1>' . $content
        . '<address><a href="' . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . '">'
        . htmlspecialchars($title, ENT_QUOTES, 'UTF-8') . '</a></address></div>';
}