foreach ($identifiers as $identifier) { if (strstr($url_source["content"], $identifier)) { $function_name = 'parse_' . $parser_name; break 2; } } } if (isset($function_name)) { $html = $function_name($url, $url_source["content"]); if ($html) { $html = download_inline_images($html, $url); save_html($obj_id, $html); } } else { // last resort, fallback parser $html = fallback_parser($url, $url_source["content"]); if ($html) { $html = download_inline_images($html, $url); save_html($obj_id, $html); } } } } else { print 'unknown content type: ' . $url_source["type"]; } // download inline images function download_inline_images($html, $url) { $html_parsed = str_get_html($html); // complete relative urls foreach ($html_parsed->find("img") as $img) {
/**
 * Output callback used by parse_wordpress() to normalise one DOM node.
 *
 * Registered through the DOM object's set_callback(); presumably the Simple
 * HTML DOM library invokes it for each node while the document is being
 * serialised (TODO confirm against the bundled library version).
 *
 * Declared at top level on purpose: a named function declared inside
 * parse_wordpress() would be redeclared (fatal error) on the second call.
 *
 * @param object $element DOM node exposing ->attr, ->tag, ->innertext,
 *                        ->outertext, ->src and removeAttribute()
 * @return void
 */
function parse_wordpress_clean_element($element)
{
    // remove all attributes, except href and src
    foreach ($element->attr as $name => $value) {
        if ($name !== 'src' && $name !== 'href') {
            $element->removeAttribute($name);
        }
    }

    // tags that are kept as they are; every other tag is converted to p
    $kept_tags = array(
        'img', 'span', 'a', 'li', 'ul', 'ol', 'object', 'embed',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'i', 'em', 'b', 'strong', 'blockquote',
    );
    if (!in_array($element->tag, $kept_tags, true)) {
        $element->tag = 'p';
    }

    // remove p:s and a:s without innertext
    if (($element->tag == 'p' || $element->tag == 'a') && $element->innertext == null) {
        $element->outertext = '';
    }

    // remove wordpress.com stats
    if (strpos((string) $element->src, 'stats.wordpress.com') !== false) {
        $element->outertext = '';
    }
}

/**
 * Parser for WordPress posts.
 *
 * Requests the post's own RSS 2.0 feed (the post URL with
 * "feed=rss2&withoutcomments=1" added to the query string) and, when that
 * feed carries a <content:encoded> element, builds a cleaned-up article from
 * it. Whenever the feed is unusable (no full content, malformed XML, no item,
 * empty content) the work is delegated to fallback_parser().
 *
 * @param string $url         URL of the post
 * @param string $page_source HTML source of the post page, handed to the
 *                            fallback parser
 * @return mixed article HTML wrapped in <div class="article">, or whatever
 *               fallback_parser() returns
 */
function parse_wordpress($url, $page_source)
{
    // feed url
    $url_parsed = parse_url($url);
    if ($url_parsed === false) {
        // seriously malformed url, no feed url can be derived from it
        return fallback_parser($url, $page_source);
    }
    // always ask for rss2: it is the format the <content:encoded> check below relies on
    $feed_query = 'feed=rss2&withoutcomments=1';
    if (isset($url_parsed['query']) && $url_parsed['query'] !== '') {
        $url_parsed['query'] .= '&' . $feed_query;
    } else {
        $url_parsed['query'] = $feed_query;
    }
    $single_post_feed_url = glue_url($url_parsed);

    // get feed source
    $single_post_feed_source = get_source_width_curl($single_post_feed_url);
    $single_post_feed_source = isset($single_post_feed_source["content"])
        ? (string) $single_post_feed_source["content"]
        : '';

    // fallback parser if not full content in single post feed
    if (stripos($single_post_feed_source, "<content:encoded>") === false) {
        return fallback_parser($url, $page_source);
    }

    // parse the feed; collect libxml errors internally so that a broken feed
    // does not print warnings, and restore the previous setting afterwards
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $xml_parsed = simplexml_load_string($single_post_feed_source, null, LIBXML_NOCDATA);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);

    if ($xml_parsed === false || !isset($xml_parsed->channel->item)) {
        return fallback_parser($url, $page_source);
    }

    // content
    $item = $xml_parsed->channel->item;
    $link = (string) $item->link;
    $h1 = (string) $item->title;
    $title = (string) $xml_parsed->channel->title;
    $content_decoded = $item->children('content', true)->encoded;

    // parse content
    $content = str_get_html((string) $content_decoded[0]);
    if (!$content) {
        // nothing parsable in the feed content
        return fallback_parser($url, $page_source);
    }

    // remove facebook iframes
    foreach ($content->find("iframe") as $iframe) {
        if (strpos((string) $iframe->src, 'facebook.com') !== false) {
            $iframe->outertext = '';
        }
    }

    // remove flattr-links and wordpress.com feeds
    foreach ($content->find("a") as $a) {
        $href = (string) $a->href;
        if (strpos($href, 'flattrss_redirect') !== false
            || strpos($href, 'feeds.wordpress.com') !== false) {
            $a->outertext = '';
        }
    }

    // loop through elements and clean them while the document is serialised
    $content->set_callback('parse_wordpress_clean_element');
    $content = $content->save();

    // no indent before image, if first in p
    $content = str_replace('<p><a', '<p> <a', $content);
    $content = str_replace('<p><img', '<p> <img', $content);

    // titles and link come from a remote feed: escape them for the HTML
    // context; double_encode is off so existing entities stay intact
    $h1_escaped = htmlspecialchars($h1, ENT_QUOTES, 'UTF-8', false);
    $title_escaped = htmlspecialchars($title, ENT_QUOTES, 'UTF-8', false);
    $link_escaped = htmlspecialchars($link, ENT_QUOTES, 'UTF-8', false);

    // wrap in article structure
    return '<div class="article"><h1>' . $h1_escaped . '</h1>' . $content
        . '<address><a href="' . $link_escaped . '">' . $title_escaped . '</a></address></div>';
}