foreach ($identifiers as $identifier) {
                if (strstr($url_source["content"], $identifier)) {
                    $function_name = 'parse_' . $parser_name;
                    break 2;
                }
            }
        }
        if (isset($function_name)) {
            $html = $function_name($url, $url_source["content"]);
            if ($html) {
                $html = download_inline_images($html, $url);
                save_html($obj_id, $html);
            }
        } else {
            // last resort, fallback parser
            $html = fallback_parser($url, $url_source["content"]);
            if ($html) {
                $html = download_inline_images($html, $url);
                save_html($obj_id, $html);
            }
        }
    }
} else {
    print 'unknown content type: ' . $url_source["type"];
}
// download inline images
function download_inline_images($html, $url)
{
    $html_parsed = str_get_html($html);
    // complete relative urls
    foreach ($html_parsed->find("img") as $img) {
Example #2
0
/**
 * Build a cleaned-up article from a WordPress post using its single-post feed.
 *
 * Requests the post URL with `feed=rss2&withoutcomments=1` added to the query
 * string, takes the first item's <content:encoded> body, strips Facebook
 * iframes, Flattr / feeds.wordpress.com links and wordpress.com stats images,
 * reduces the markup to a small set of kept tags and wraps the result in the
 * article structure.
 *
 * Whenever the feed cannot be used (unparsable URL, no <content:encoded>,
 * malformed XML, no item, or content that str_get_html() rejects) the work is
 * handed to fallback_parser($url, $page_source).
 *
 * @param string $url         URL of the post.
 * @param string $page_source Source of the page at $url; only passed on to fallback_parser().
 * @return mixed Article HTML string, or whatever fallback_parser() returns.
 */
function parse_wordpress($url, $page_source)
{
    // feed url
    $url_parsed = parse_url($url);
    if (!is_array($url_parsed)) {
        // seriously malformed url, no feed url can be built from it
        return fallback_parser($url, $page_source);
    }
    // always ask for the RSS 2.0 feed: it is the one carrying <content:encoded>
    $feed_query = 'feed=rss2&withoutcomments=1';
    if (isset($url_parsed['query']) && $url_parsed['query'] !== '') {
        $url_parsed['query'] .= '&' . $feed_query;
    } else {
        $url_parsed['query'] = $feed_query;
    }
    $single_post_feed_url = glue_url($url_parsed);

    // get feed source
    $feed_response = get_source_width_curl($single_post_feed_url);
    $single_post_feed_source = isset($feed_response["content"]) ? (string) $feed_response["content"] : '';

    // fallback parser if not full content in single post feed
    if (stripos($single_post_feed_source, "<content:encoded>") === false) {
        return fallback_parser($url, $page_source);
    }

    // content
    $xml_parsed = simplexml_load_string($single_post_feed_source, null, LIBXML_NOCDATA);
    if ($xml_parsed === false || !isset($xml_parsed->channel->item)) {
        // malformed xml or a feed without any item
        return fallback_parser($url, $page_source);
    }
    $item = $xml_parsed->channel->item;
    $link = (string) $item->link;
    $h1 = (string) $item->title;
    $title = (string) $xml_parsed->channel->title;
    $content_decoded = $item->children('content', true)->encoded;

    // parse content
    $content = str_get_html((string) $content_decoded[0]);
    if (!$content) {
        // str_get_html() returns false for content it cannot handle (e.g. empty)
        return fallback_parser($url, $page_source);
    }

    // remove facebook iframes
    foreach ($content->find("iframe") as $iframe) {
        if (strpos((string) $iframe->src, 'facebook.com') !== false) {
            $iframe->outertext = '';
        }
    }

    // remove flattr-links and wordpress.com feeds
    foreach ($content->find("a") as $a) {
        $href = (string) $a->href;
        if (strpos($href, 'flattrss_redirect') !== false || strpos($href, 'feeds.wordpress.com') !== false) {
            $a->outertext = '';
        }
    }

    // we keep these tags, all other converted to p
    $kept_tags = array(
        'img', 'span', 'a', 'li', 'ul', 'ol', 'object', 'embed',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'i', 'em', 'b', 'strong', 'blockquote',
    );

    // Per-element clean-up. A closure instead of a nested named function, so
    // that calling parse_wordpress() more than once does not redeclare it.
    $clean_element = function ($element) use ($kept_tags) {
        // remove all attributes, except href and src
        foreach ($element->attr as $name => $attr) {
            if ($name != 'src' && $name != 'href') {
                $element->removeAttribute($name);
            }
        }
        if (!in_array($element->tag, $kept_tags, true)) {
            $element->tag = 'p';
        }
        // remove p:s and a:s without innertext
        if (($element->tag == 'p' || $element->tag == 'a') && $element->innertext == null) {
            $element->outertext = '';
        }
        // remove wordpress.com stats
        if (strpos((string) $element->src, 'stats.wordpress.com') !== false) {
            $element->outertext = '';
        }
    };
    $content->set_callback($clean_element);
    $content = $content->save();

    // no indent before image, if first in p
    $content = str_replace('<p><a', '<p>&nbsp;<a', $content);
    $content = str_replace('<p><img', '<p>&nbsp;<img', $content);

    // Feed values come from a remote server: escape them before putting them
    // into markup. double_encode is off so entities that are already encoded
    // are left as they are.
    $h1_html = htmlspecialchars($h1, ENT_QUOTES, 'UTF-8', false);
    $title_html = htmlspecialchars($title, ENT_QUOTES, 'UTF-8', false);
    $link_html = htmlspecialchars($link, ENT_QUOTES, 'UTF-8', false);

    // wrap in article structure
    return '<div class="article"><h1>' . $h1_html . '</h1>' . $content . '<address><a href="' . $link_html . '">' . $title_html . '</a></address></div>';
}