Beispiel #1
0
 /**
  * Send the HTTP status line for a given status code.
  *
  * The code is looked up through get_http_status(); when that yields a
  * non-falsy value and headers have not been sent yet, both an
  * "HTTP/1.1 <status>" header and a "Status: <status>" header are emitted.
  *
  * @param integer $code status code handed to get_http_status()
  * @return boolean true when both headers were issued, false otherwise
  */
 function set_http_status($code)
 {
     $status_line = get_http_status($code);

     // Nothing usable to send, or output has already started.
     if (!$status_line || headers_sent()) {
         return false;
     }

     foreach (array('HTTP/1.1 ', 'Status: ') as $prefix) {
         header($prefix . $status_line);
     }

     return true;
 }
/**
 * Fetch a URL over a raw socket and collect meta information about it.
 *
 * Sends a single HTTP/1.1 GET request, reads at most 8 KiB of the response
 * and fills the array obtained from get_empty_http_meta(). HTTP 301/302
 * responses and HTML meta redirects are followed recursively; the meta of the
 * deepest reachable URL replaces the current one and gets 'redirect' => 1.
 *
 * Keys written by this function:
 *  - 'state':
 *      0 = 301/302 without a usable Location,
 *      1 = HTTP 200,
 *      3 = connection error (including an unusable URL),
 *      4 = socket reported a timeout,
 *      5 = socket at EOF before anything was read,
 *      6 = HTTP 404,
 *      7 = other 4xx,
 *      8 = 5xx
 *  - 'http_status', 'location' (the URL that was requested)
 *  - 'content_length', 'content_type', 'html_title' (HTTP 200 only)
 *  - 'redirect' (set to 1 when a redirect was followed)
 *
 * Diagnostic messages are echoed directly to the output.
 *
 * NOTE(review): the URL scheme is never consulted; the connection is always a
 * plain TCP socket to the URL's port, or port 80 when none is given.
 *
 * @param string  $url          absolute URL to fetch
 * @param integer $depth        current redirect depth; recursion stops above 3
 * @param string  $status_chain unused, kept for interface compatibility
 * @param string  $url_chain    unused, kept for interface compatibility
 * @return array|null meta array, or null when called with $depth > 3
 */
function get_http_meta($url, $depth = 0, $status_chain = '', $url_chain = '')
{
    // how deep to follow http redirects
    if ($depth > 3) {
        return;
    }
    $depth++;

    $connect_timeout = 3;
    $read_timeout = 6;
    ini_set('default_socket_timeout', $connect_timeout);

    $http_meta = get_empty_http_meta();

    // parse_url() returns false for seriously malformed URLs and omits 'host'
    // for relative ones; neither can be connected to, so report it the same
    // way as a failed connection instead of tripping over a missing index.
    $purl = parse_url($url);
    if (!is_array($purl) || !isset($purl['host']) || $purl['host'] === '') {
        echo "(connection error - invalid URL) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }

    // A DNS pre-check via gethostbyname() (which would have reported state 2)
    // used to live here; it was disabled because it seemed to fail for all
    // addresses.

    // connect and get http return code
    $port = isset($purl['port']) ? $purl['port'] : 80;
    $sock = fsockopen($purl['host'], $port, $errno, $errstr, $connect_timeout);
    if (!$sock) {
        echo "(connection error - {$errstr} ({$errno})) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }

    // NOTE(review): the socket is switched to non-blocking mode before the
    // response is read, so the read below only sees data that has already
    // arrived, and the read timeout may have no effect - confirm intent.
    stream_set_blocking($sock, false);
    stream_set_timeout($sock, $read_timeout);

    // Request target: path (default '/') plus the query string, if any. The
    // query is appended even when the URL has no explicit path.
    $url_path = isset($purl['path']) ? $purl['path'] : '/';
    if (isset($purl['query'])) {
        $url_path .= '?' . $purl['query'];
    }

    // we set 'accept-language' as some sites will have different <title> depending on your geoip
    // e.g. flickr
    // some sites seem to ignore you if you don't send a user-agent
    // e.g. digg (but maybe that's for the better)
    $request = "GET {$url_path} HTTP/1.1\r\n"
        . "Host: " . $purl['host'] . "\r\n"
        . "Connection: close\r\n"
        . "Accept-Language: en-gb, en;q=0.8\r\n"
        . "User-Agent: URL catcher\r\n"
        . "\r\n";
    fwrite($sock, $request);

    // Every exit path from here on closes the socket so it is not leaked.
    $status = socket_get_status($sock);
    if ($status['timed_out']) {
        fclose($sock);
        echo "(status timed_out) ";
        $http_meta['state'] = 4;
        return $http_meta;
    }
    if (feof($sock)) {
        fclose($sock);
        echo "(socket early EOF) ";
        $http_meta['state'] = 5;
        return $http_meta;
    }

    // Only the first 8 KiB are needed: headers plus the start of the body.
    $contents = stream_get_contents($sock, 1024 * 8);
    fclose($sock);
    if ($contents === false) {
        echo "stream_get_contents returned FALSE ";
        return $http_meta;
    }

    // get_http_status() is expected to extract the numeric status code from
    // the raw response; anything non-numeric means no usable response.
    $http_status = get_http_status($contents);
    if (!is_numeric($http_status)) {
        return $http_meta;
    }
    $http_meta['http_status'] = $http_status;
    $http_meta['location'] = $url;

    // handle different status codes
    if ($http_status == "404") {
        echo "(http 404) ";
        $http_meta['state'] = 6;
    } elseif (substr($http_status, 0, 1) == 4) {
        echo "(http other 40x) ";
        $http_meta['state'] = 7;
    } elseif (substr($http_status, 0, 1) == 5) {
        echo "(http other 50x) ";
        $http_meta['state'] = 8;
    } elseif ($http_status == "301" or $http_status == "302") {
        // 301 - Moved Permanently
        // 302 - Found / Moved
        $new_url = get_http_location($contents);
        if (!$new_url) {
            $http_meta['state'] = 0;
            return $http_meta;
        }
        // Recurse and prefer the meta of the deepest URL. When the depth limit
        // is hit the recursive call returns null; keep what we have so far
        // instead of ending up with an array holding only 'redirect'.
        $deeper_meta = get_http_meta($new_url, $depth);
        if (is_array($deeper_meta)) {
            $http_meta = $deeper_meta;
        }
        // set any meta we want to persist here
        $http_meta['redirect'] = 1;
    } elseif ($http_status == "200") {
        $http_meta['state'] = 1;
        $http_meta['content_length'] = get_http_content_length($contents);
        $http_meta['content_type'] = get_http_content_type($contents);

        // Decide whether the body is worth scanning for a title: trust the
        // Content-Type when present, otherwise guess from the URL's extension.
        if ($http_meta['content_type']) {
            $check_html = preg_match("/^text\\/html/", $http_meta['content_type']);
        } else {
            $check_html = !preg_match("/(jpg|jpeg|gif|png|wav|mp3|avi|wmv|mpg)\$/", $url);
        }
        if ($check_html) {
            $http_meta['html_title'] = get_html_title($contents);
            if (!$http_meta['html_title']) {
                // No <title>: fall back to the first heading.
                $http_meta['html_title'] = get_html_h1($contents);
            }
        }

        // html meta redirect?
        // this could be placed better to avoid doing all the work above
        $html_meta_redirect = get_html_meta_redirect($contents);
        if ($html_meta_redirect) {
            // Same depth-limit handling as for 301/302 above.
            $deeper_meta = get_http_meta($html_meta_redirect, $depth);
            if (is_array($deeper_meta)) {
                $http_meta = $deeper_meta;
            }
            $http_meta['redirect'] = 1;
        }
    }

    return $http_meta;
}