Beispiel #1
0
 /**
  * Runs a URL through Readability and hands back the stripped content.
  *
  * The page is fetched with wp_remote_get(), optionally cleaned with tidy,
  * reduced to its main content by Readability, and normalised with
  * balanceTags(), ent2ncr() and convert_chars(). Image sources that are not
  * absolute URLs are then rewritten against the page's host; an image whose
  * rewritten location cannot be requested is dropped from the markup.
  *
  * @since 1.7
  * @see http://www.keyvan.net/2010/08/php-readability/
  *
  * @param string $url URL of the page to fetch and extract.
  * @return string|false The string 'error-secured' when the HTTP request
  *                      yields a WP_Error; false when the response body is
  *                      empty, Readability cannot process the page, or the
  *                      extracted content is shorter than 120 bytes;
  *                      otherwise the extracted markup after it has been
  *                      passed through pf_htmlchecker::closetags().
  */
 public static function readability_object($url)
 {
     set_time_limit(0);
     $url = pf_de_https($url);
     // Decode HTML-encoded ampersands. This line used to replace '&' with
     // '&', which did nothing.
     $url = str_replace('&amp;', '&', $url);
     // change from Boone - use wp_remote_get() instead of file_get_contents()
     $request = wp_remote_get($url, array('timeout' => '30'));
     if (is_wp_error($request)) {
         return 'error-secured';
     }
     if (empty($request['body'])) {
         return false;
     }
     $html = $request['body'];

     // Tidy is optional; use it to clean up the input when it is available.
     $hasTidy = function_exists('tidy_parse_string');
     if ($hasTidy) {
         $tidy = tidy_parse_string($html, array(), 'UTF8');
         $tidy->cleanRepair();
         $html = $tidy->value;
     }

     // Give it to Readability, with debug output and link-to-footnote
     // conversion switched off.
     $readability = new Readability($html, $url);
     $readability->debug = false;
     $readability->convertLinksToFootnotes = false;
     if (!$readability->init()) {
         // Readability could not get the content; send back false to loop with.
         return false;
     }

     $content = $readability->getContent()->innerHTML;
     if ($hasTidy) {
         $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
         $tidy->cleanRepair();
         $content = $tidy->value;
     }
     $content = balanceTags($content, true);
     $content = ent2ncr($content);
     $content = convert_chars($content);

     $dom = new DOMDocument('1.0', 'utf-8');
     $dom->preserveWhiteSpace = true;
     $dom->substituteEntities = true;
     // NOTE(review): resolveExternals is enabled while parsing markup that
     // came from a remote page. Kept as it was; confirm it is really needed.
     $dom->resolveExternals = true;

     // Counts the images that were rewritten or removed.
     $domRotated = 0;
     // The page URL does not change inside the loop, so split it only once.
     $urlParts = parse_url($url);
     $host = (is_array($urlParts) && !empty($urlParts['host'])) ? $urlParts['host'] : '';

     // Without a host there is nothing to resolve relative sources against,
     // and without a parsed document there are no images to inspect.
     if ('' !== $host && $dom->loadXML('<fullContent>' . $content . '</fullContent>')) {
         $urlBase = 'http://' . $host;
         // getElementsByTagName() returns a live list. Copy it into a plain
         // array first so that removing a node does not make the loop skip
         // the image that follows it.
         $images = iterator_to_array($dom->getElementsByTagName('img'), false);
         foreach ($images as $image) {
             $img = $image->getAttribute('src');
             // Leave empty sources and anything that already carries a
             // scheme (http:, https:, data:, ...) untouched.
             if ('' === $img || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $img)) {
                 continue;
             }
             if (0 === strpos($img, '//')) {
                 // Protocol-relative source: it only lacks the scheme.
                 $candidates = array('http:' . $img);
             } elseif (0 === strpos($img, '/')) {
                 // Root-relative source: the path already starts with a slash.
                 $candidates = array($urlBase . $img, $url . $img);
             } else {
                 // Plain relative source: needs a separating slash after the host.
                 $candidates = array($urlBase . '/' . $img, $url . $img);
             }

             $resolved = false;
             foreach ($candidates as $candidate) {
                 if (!is_wp_error(wp_remote_head($candidate))) {
                     $resolved = $candidate;
                     break;
                 }
             }

             if (false !== $resolved) {
                 $image->setAttribute('src', $resolved);
             } else {
                 // No candidate could be requested; drop the image.
                 $image->parentNode->removeChild($image);
             }
             $domRotated++;
         }
     }

     if ($domRotated > 0) {
         $content = $dom->saveXML();
         // Replace the XML declaration, with or without the encoding
         // attribute, by a single space.
         $content = preg_replace('/(<\\?xml version="1\\.0"( encoding="utf-8")?\\?>)/is', ' ', $content);
     }

     if (120 > strlen($content)) {
         // Too short to be treated as real content.
         return false;
     }

     $contentObj = new pf_htmlchecker($content);
     return $contentObj->closetags($content);
 }
/**
 * Converts an https URL into http, to account for servers without SSL access.
 *
 * Without a function, the URL is returned with its scheme set to http.
 *
 * With a function, that function is called with the URL followed by any
 * additional arguments given after $function. If its result is empty or a
 * WP_Error, the call is repeated with the http version of the URL (when that
 * differs). If that also fails, file_get_contents() on the URL is tried as
 * a last resort, unless the function was itself 'file_get_contents', in
 * which case false is returned.
 *
 * "&amp;" in the URL is converted to "&" before anything else happens.
 *
 * @since 1.7
 *
 * @param string                $url      The URL to convert or fetch.
 * @param string|array|callable $function Optional. Function to call first to
 *                                        try and get the URL. Any arguments
 *                                        after this one are forwarded to it.
 * @return mixed The converted URL string when no function is passed;
 *               otherwise the result of the function (or of the
 *               file_get_contents() fallback) after being checked for
 *               accessibility, or false when every attempt failed.
 */
function pf_de_https($url, $function = false)
{
    // Only the arguments after $function belong to the callback. Forwarding
    // $function itself pushed the real options into the wrong position.
    $extra = array_slice(func_get_args(), 2);
    $url = str_replace('&amp;', '&', $url);

    if (!$function) {
        return set_url_scheme($url, 'http');
    }

    $r = call_user_func_array($function, array_merge(array($url), $extra));
    # "A variable is considered empty if it does not exist or if its value equals FALSE"
    if (!is_wp_error($r) && !empty($r)) {
        return $r;
    }

    // First attempt failed: retry over plain http when that is a different URL.
    $non_ssl_url = pf_de_https($url);
    if ($non_ssl_url !== $url) {
        $r = call_user_func_array($function, array_merge(array($non_ssl_url), $extra));
    }

    if (!$r || is_wp_error($r)) {
        # Last Chance!
        if ('file_get_contents' === $function) {
            // file_get_contents() was already the function that failed; bail.
            return false;
        }
        $r = file_get_contents($url);
    }

    return $r;
}
 /**
  * Fetches a URI and parses it for Open Graph data, returns
  * false on error.
  *
  * The URI is requested through pf_de_https() using wp_remote_get() with a
  * 30 second timeout. An empty result or a WP_Error is treated as a failed
  * fetch; anything else is handed to self::_parse() unchanged.
  *
  * @param string $URI URI to page to parse for Open Graph data
  * @return OpenGraph|false Whatever self::_parse() returns for the fetched
  *                         data, or false when the fetch failed.
  */
 public static function fetch($URI)
 {
     $URI_data = pf_de_https($URI, 'wp_remote_get', array('timeout' => '30'));
     // Honour the documented contract: do not hand a failed fetch to the parser.
     if (empty($URI_data) || is_wp_error($URI_data)) {
         return false;
     }
     return self::_parse($URI_data);
 }