/**
 * Runs a URL through Readability and hands back the stripped content.
 *
 * The page is fetched with wp_remote_get(), optionally cleaned with Tidy,
 * reduced to its main content by Readability, and then post-processed:
 * relative image sources are rewritten to absolute URLs on the page's host,
 * and images that cannot be reached are removed.
 *
 * @since 1.7
 * @see http://www.keyvan.net/2010/08/php-readability/
 *
 * @param string $url Address of the page to fetch.
 * @return string|false The extracted markup on success; the string
 *                      'error-secured' when the HTTP request returns a
 *                      WP_Error; false when the response body is empty,
 *                      Readability cannot extract content, or the result is
 *                      shorter than 120 bytes.
 */
public static function readability_object($url)
{
    // Fetching the page and probing every image can be slow; lift the limit.
    set_time_limit(0);

    $url = pf_de_https($url);
    // Links often arrive HTML-encoded; decode the ampersands before fetching.
    $url = str_replace('&amp;', '&', $url);

    // Change from Boone - use wp_remote_get() instead of file_get_contents().
    $request = wp_remote_get($url, array('timeout' => '30'));
    if (is_wp_error($request)) {
        return 'error-secured';
    }
    if (empty($request['body'])) {
        return false;
    }
    $html = $request['body'];

    // Use Tidy, when available, to clean up the input.
    $hasTidy = function_exists('tidy_parse_string');
    if ($hasTidy) {
        $tidy = tidy_parse_string($html, array(), 'UTF8');
        if ($tidy) {
            $tidy->cleanRepair();
            $html = $tidy->value;
        }
    }

    $readability = new Readability($html, $url);
    // No debug output and no link-to-footnote conversion.
    $readability->debug = false;
    $readability->convertLinksToFootnotes = false;

    if (!$readability->init()) {
        // Readability could not find the content; callers loop on false.
        return false;
    }

    $content = $readability->getContent()->innerHTML;
    if ($hasTidy) {
        $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
        if ($tidy) {
            $tidy->cleanRepair();
            $content = $tidy->value;
        }
    }

    $content = balanceTags($content, true);
    $content = ent2ncr($content);
    $content = convert_chars($content);

    $dom = new DOMDocument('1.0', 'utf-8');
    $dom->preserveWhiteSpace = true;
    $dom->substituteEntities = true;
    $dom->resolveExternals = true;

    // Extracted markup is not guaranteed to be well-formed XML; collect parser
    // errors internally instead of emitting warnings, then restore the setting.
    $previousLibxml = libxml_use_internal_errors(true);
    $loaded = $dom->loadXML('<fullContent>' . $content . '</fullContent>');
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxml);

    $domRotated = 0;
    if ($loaded) {
        $urlParts = parse_url($url);
        $host = (is_array($urlParts) && !empty($urlParts['host'])) ? $urlParts['host'] : '';

        // getElementsByTagName() returns a live list: removing a node while
        // iterating it would skip the next image, so work on a snapshot.
        $images = iterator_to_array($dom->getElementsByTagName('img'));

        foreach ($images as $image) {
            $img = $image->getAttribute('src');

            // Nothing to rebuild without a host or a source. Sources that
            // already carry a scheme (http:, https:, data:, ...) or are
            // protocol-relative (//host/path) are left untouched.
            if ('' === $host
                || '' === $img
                || 0 === strpos($img, '//')
                || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $img)
            ) {
                continue;
            }

            // Root-relative paths already start with a slash; other relative
            // paths need one inserted after the host.
            $urlBase = 'http://' . $host;
            if (0 !== strpos($img, '/')) {
                $urlBase .= '/';
            }

            if (!is_wp_error(wp_remote_head($urlBase . $img))) {
                $image->setAttribute('src', $urlBase . $img);
            } elseif (!is_wp_error(wp_remote_head($url . $img))) {
                // Fall back to appending the source to the full page URL.
                $image->setAttribute('src', $url . $img);
            } else {
                // Unreachable either way: drop the image.
                $image->parentNode->removeChild($image);
            }
            $domRotated++;
        }
    }

    if ($domRotated > 0) {
        // Serialise the modified document and strip the XML declaration.
        $content = $dom->saveXML();
        $content = preg_replace('/(<\\?xml version="1\\.0" encoding="utf-8"\\?>)/is', ' ', $content);
        $content = preg_replace('/(<\\?xml version="1\\.0"\\?>)/is', ' ', $content);
    }

    // Anything this short is treated as a failed extraction.
    if (120 > strlen($content)) {
        return false;
    }

    $contentObj = new pf_htmlchecker($content);

    return $contentObj->closetags($content);
}
/**
 * Converts an https URL into http, to account for servers without SSL access.
 *
 * When no function is passed, the URL is returned with its scheme set to
 * http. When a function is passed, it is called with the URL followed by any
 * additional arguments given to pf_de_https(); if that call yields an empty
 * value or a WP_Error it is retried with the http version of the URL, and as
 * a last resort the URL is read with file_get_contents().
 *
 * @since 1.7
 *
 * @param string         $url      The URL to convert or fetch.
 * @param callable|false $function Function to call first to try and get the URL.
 *                                 Arguments after this one are forwarded to it
 *                                 following the URL.
 * @return mixed The converted URL string when no function is passed; otherwise
 *               the result of the function, the result of the
 *               file_get_contents() fallback, or false when the function
 *               itself is 'file_get_contents' and every attempt failed.
 */
function pf_de_https($url, $function = false)
{
    $args = func_get_args();

    // Links often arrive HTML-encoded; decode the ampersands.
    $url = str_replace('&amp;', '&', $url);
    $url_first = $url;

    if (!$function) {
        return set_url_scheme($url, 'http');
    }

    // Forward only the URL and the extra arguments: drop $function itself so
    // the callback does not receive its own name as its second parameter.
    array_splice($args, 1, 1);
    $args[0] = $url;

    $r = call_user_func_array($function, $args);
    # "A variable is considered empty if it does not exist or if its value equals FALSE"
    if (!is_wp_error($r) && !empty($r)) {
        return $r;
    }

    // First retry: same call against the http version of the URL.
    $non_ssl_url = pf_de_https($url);
    if ($non_ssl_url != $url) {
        $args[0] = $non_ssl_url;
        $r = call_user_func_array($function, $args);
    }
    if ($r && !is_wp_error($r)) {
        return $r;
    }

    // Last chance: read the URL directly, unless that is what already failed.
    if ('file_get_contents' === $function) {
        return false;
    }

    return file_get_contents($url_first);
}
/**
 * Fetches a URI and parses it for Open Graph data.
 *
 * The URI is retrieved through pf_de_https() using wp_remote_get with a
 * 30 second timeout argument, and whatever that call returns is handed to
 * self::_parse() unchanged.
 *
 * @param $URI URI to page to parse for Open Graph data
 * @return OpenGraph The result of self::_parse(); per the original contract,
 *                   false on error.
 */
public static function fetch($URI)
{
    $request_args = array('timeout' => '30');
    $fetched = pf_de_https($URI, 'wp_remote_get', $request_args);

    return self::_parse($fetched);
}