<?php
/*
 * Pool Cleaner(tm) - ad-hoc driver for get_http_meta().
 *
 * Prints a banner inside a <pre> block, probes one hard-coded URL, dumps the
 * resulting meta array and stops. The script exits before reaching the end of
 * the file; get_http_meta() is declared further down.
 */
require_once 'functions_backend.php';

echo "<pre>Pool Cleaner(tm)\n\n";

// Sample URLs used for earlier manual runs (currently disabled):
//   http://news.bbc.co.uk/1/hi/technology/8349923.stm
//   http://picasaweb.google.com/lh/photo/xr-C_wqHG8gssUZnzuv1Ow?feat=directlink
//   http://bugs.kde.org/show_bug.cgi?id=162326
//   http://lala.com/zPSy
//   http://tinyurl.com/yjbub48
//   http://ur.ly/3kSj
echo "\n<hr>\n";

// The one probe that is currently live.
$probe_url = 'http://thejesustv.com';
$meta = get_http_meta($probe_url);

// get_http_meta() may already have echoed diagnostics; the dump follows them.
echo "\n" . print_r($meta, true);

exit;
/**
 * Fetch the first 8 KiB of a URL over HTTP(S) and summarise the response.
 *
 * Starts from the array returned by get_empty_http_meta() and fills in:
 *   - 'state'          outcome code, see below
 *   - 'http_status'    value returned by get_http_status()
 *   - 'location'       the URL that was actually requested
 *   - 'redirect'       1 when an HTTP or HTML meta redirect was followed
 *   - 'content_length', 'content_type', 'html_title' for 200 responses
 *
 * State codes set by this function:
 *   0  redirect status without a usable Location header
 *   1  200 OK
 *   2  (reserved: name resolution failure; that check is no longer performed)
 *   3  invalid URL or connection error
 *   4  read timed out before any data arrived
 *   5  connection closed before any data arrived
 *   6  404
 *   7  other 4xx
 *   8  5xx
 *
 * Diagnostics are echoed to the output as short "(...)" fragments.
 *
 * Side effect: sets the 'default_socket_timeout' ini value to the connect
 * timeout, as the original implementation did.
 *
 * @param string $url          Absolute URL to probe.
 * @param int    $depth        Current redirect depth; callers normally omit it.
 * @param string $status_chain Accepted for compatibility; not used here.
 * @param string $url_chain    Accepted for compatibility; not used here.
 *
 * @return array Meta array for the deepest URL reached (assumes
 *               get_empty_http_meta() returns an array - confirm in
 *               functions_backend.php).
 */
function get_http_meta($url, $depth = 0, $status_chain = '', $url_chain = '')
{
    // How deep to follow redirects. Always hand back a well-formed meta array
    // so a recursing caller can safely set keys on the result.
    if ($depth > 3) {
        return get_empty_http_meta();
    }
    $depth++;

    $connect_timeout = 3;
    $read_timeout = 6;
    ini_set('default_socket_timeout', $connect_timeout);

    $http_meta = get_empty_http_meta();

    // A relative or malformed URL has no host to connect to; report it the
    // same way as a failed connection instead of touching an undefined index.
    $purl = parse_url($url);
    if (!is_array($purl) || !isset($purl['host']) || $purl['host'] === '') {
        echo "(invalid url) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }

    // https needs the TLS transport and its own default port.
    $is_https = isset($purl['scheme']) && strtolower($purl['scheme']) === 'https';
    $default_port = $is_https ? 443 : 80;
    $port = isset($purl['port']) ? $purl['port'] : $default_port;
    $transport = $is_https ? 'ssl://' : '';

    $sock = fsockopen($transport . $purl['host'], $port, $errno, $errstr, $connect_timeout);
    if (!$sock) {
        echo "(connection error - {$errstr} ({$errno})) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }

    // Blocking reads bounded by a timeout: in non-blocking mode the read
    // below returns before the server has answered and the timeout is ignored.
    stream_set_blocking($sock, true);
    stream_set_timeout($sock, $read_timeout);

    // Request target: path (default "/") plus the query string, if any.
    $url_path = '/';
    if (isset($purl['path']) && $purl['path'] !== '') {
        $url_path = $purl['path'];
    }
    if (array_key_exists('query', $purl)) {
        $url_path .= '?' . $purl['query'];
    }

    // The Host header carries the port only when it is not the default one.
    $host_header = $purl['host'];
    if ($port != $default_port) {
        $host_header .= ':' . $port;
    }

    // we set 'accept-language' as some sites will have different <title>
    // depending on your geoip, e.g. flickr
    // some sites seem to ignore you if you don't send a user-agent,
    // e.g. digg (but maybe that's for the better)
    $request = "GET {$url_path} HTTP/1.1\r\n"
        . "Host: {$host_header}\r\n"
        . "Connection: close\r\n"
        . "Accept-Language: en-gb, en;q=0.8\r\n"
        . "User-Agent: URL catcher\r\n"
        . "\r\n";
    fwrite($sock, $request);

    // Read the head of the response, then release the socket on every path.
    $contents = stream_get_contents($sock, 1024 * 8);
    $status = stream_get_meta_data($sock);
    fclose($sock);

    if ($contents === false) {
        echo "stream_get_contents returned FALSE ";
        return $http_meta;
    }

    if ($contents === '') {
        if ($status['timed_out']) {
            echo "(status timed_out) ";
            $http_meta['state'] = 4;
        } else {
            echo "(socket early EOF) ";
            $http_meta['state'] = 5;
        }
        return $http_meta;
    }

    $http_status = get_http_status($contents);
    if (!is_numeric($http_status)) {
        return $http_meta;
    }

    $http_meta['http_status'] = $http_status;
    $http_meta['location'] = $url;

    // get_http_status() may return a string or an int, so the exact-code
    // checks stay loose and the class check works on the first character.
    $status_class = substr((string) $http_status, 0, 1);
    $redirect_codes = array(301, 302, 303, 307, 308);

    if ($http_status == "404") {
        echo "(http 404) ";
        $http_meta['state'] = 6;
    } elseif ($status_class === '4') {
        echo "(http other 40x) ";
        $http_meta['state'] = 7;
    } elseif ($status_class === '5') {
        echo "(http other 50x) ";
        $http_meta['state'] = 8;
    } elseif (in_array((int) $http_status, $redirect_codes, true)) {
        // 301 Moved Permanently, 302 Found, 303 See Other,
        // 307 Temporary Redirect, 308 Permanent Redirect
        $new_url = get_http_location($contents);
        if (!$new_url) {
            $http_meta['state'] = 0;
            return $http_meta;
        }

        // recurse: discard our meta in favour of the deepest url
        $http_meta = get_http_meta($new_url, $depth);

        // set any meta we want to persist here
        $http_meta['redirect'] = 1;
    } elseif ($http_status == "200") {
        $http_meta['state'] = 1;
        $http_meta['content_length'] = get_http_content_length($contents);
        $http_meta['content_type'] = get_http_content_type($contents);

        // Look for a title only in HTML: trust the Content-Type when there
        // is one, otherwise guess from the URL's file extension.
        $check_html = 0;
        if ($http_meta['content_type']) {
            $check_html = preg_match("/^text\\/html/", $http_meta['content_type']);
        } else {
            $check_html = !preg_match("/(jpg|jpeg|gif|png|wav|mp3|avi|wmv|mpg)\$/", $url);
        }

        if ($check_html) {
            $http_meta['html_title'] = get_html_title($contents);
            if (!$http_meta['html_title']) {
                $http_meta['html_title'] = get_html_h1($contents);
            }
        }

        // html meta redirect?
        // this could be placed better to avoid doing all the work above
        $html_meta_redirect = get_html_meta_redirect($contents);
        if ($html_meta_redirect) {
            $http_meta = get_http_meta($html_meta_redirect, $depth);
            $http_meta['redirect'] = 1;
        }
    }

    return $http_meta;
}