/**
 * Load a spider rule row by id and copy its request settings onto `spider`.
 *
 * Reads one row from the spider rule table, decodes the serialized `rule`
 * column, and assigns the user agent, cURL encoding/referer/cookie and charset
 * found in the decoded rule to the static properties of `spider`.
 *
 * When the decoded rule carries no user agent, the row-level `user_agent`
 * (which itself falls back to a built-in Baiduspider string) is used, so
 * `spider::$useragent` is never left empty.
 *
 * @param int|string $id Rule id. Cast to int before it is placed in the query
 *                       (assumes `id` is an integer column — TODO confirm).
 * @return array The row with `rule` decoded when present; `user_agent` is
 *               always populated. Only `user_agent` is set if no row matched.
 */
public static function rule($id) {
    // The id is interpolated into the SQL text, so force it to an integer first.
    $id = (int) $id;
    $rs = iDB::row("SELECT * FROM `#iCMS@__spider_rule` WHERE `id`='{$id}' LIMIT 1;", ARRAY_A);
    if (!is_array($rs)) {
        // No matching row: continue with an empty record instead of indexing a non-array.
        $rs = array();
    }

    if (!empty($rs['rule'])) {
        // NOTE(review): unserialize() is only safe while this column holds trusted data.
        $rs['rule'] = stripslashes_deep(unserialize($rs['rule']));
    }
    if (empty($rs['user_agent'])) {
        $rs['user_agent'] = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
    }

    // Work on guarded copies so a missing or undecodable rule does not raise notices.
    $rule = (isset($rs['rule']) && is_array($rs['rule'])) ? $rs['rule'] : array();
    $curl = (isset($rule['curl']) && is_array($rule['curl'])) ? $rule['curl'] : array();

    // Prefer the rule's own user agent; otherwise use the row-level value/default.
    // Previously the default was stored in $rs['user_agent'] but never applied here.
    spider::$useragent = !empty($rule['user_agent']) ? $rule['user_agent'] : $rs['user_agent'];
    spider::$encoding  = isset($curl['encoding']) ? $curl['encoding'] : null;
    spider::$referer   = isset($curl['referer']) ? $curl['referer'] : null;
    spider::$cookie    = isset($curl['cookie']) ? $curl['cookie'] : null;
    spider::$charset   = isset($rule['charset']) ? $rule['charset'] : null;

    return $rs;
}
/**
 * Fetch a URL with cURL using the request settings held on `spider`.
 *
 * Behaviour, in order:
 *  - 301/302 responses are followed manually (up to 5 hops/retries in total),
 *    using cURL's `redirect_url` or, failing that, the `Location` header.
 *  - 404/500 responses return false.
 *  - An empty body or any other non-200 status is retried while `$_count < 5`;
 *    once the budget is used up the last response is processed as-is.
 *  - The body is passed through `spiderTools::charsetTrans()` with the charset
 *    announced in the Content-Type header (null when none) and `spider::$charset`.
 *
 * Side effects: may set `spider::$referer` (when empty) and sets `spider::$url`
 * on success; prints diagnostics when `spider::$dataTest` or `spider::$ruleTest`
 * is truthy, and exits when `$_GET['breakinfo']` is also set.
 *
 * @param string $url    URL to fetch; HTML-escaped ampersands are decoded.
 * @param int    $_count Internal redirect/retry counter; callers leave it at 0.
 * @return mixed Converted response body, or false on 404/500 or when a
 *               redirect target cannot be determined.
 */
public static function remote($url, $_count = 0) {
    // URLs scraped from HTML carry "&amp;"; decode it back to a plain "&".
    // (The previous replacement of '&' with '&' was a no-op.)
    $url = str_replace('&amp;', '&', $url);

    // Always parse the URL: the origin is needed for the default referer AND
    // for resolving relative redirects, even when a referer is already set.
    $uri = parse_url($url);
    $origin = (is_array($uri) && isset($uri['scheme'], $uri['host']))
        ? $uri['scheme'] . '://' . $uri['host']
        : '';
    if (empty(spider::$referer) && $origin !== '') {
        spider::$referer = $origin;
    }

    $options = array(
        CURLOPT_URL                  => $url,
        CURLOPT_ENCODING             => spider::$encoding,
        CURLOPT_REFERER              => spider::$referer,
        CURLOPT_USERAGENT            => spider::$useragent,
        CURLOPT_TIMEOUT              => 10,
        CURLOPT_CONNECTTIMEOUT       => 10,
        CURLOPT_RETURNTRANSFER       => 1,
        CURLOPT_FAILONERROR          => 1,
        CURLOPT_HEADER               => 0,
        CURLOPT_NOSIGNAL             => true,
        CURLOPT_DNS_USE_GLOBAL_CACHE => true,
        CURLOPT_DNS_CACHE_TIMEOUT    => 86400,
        // NOTE(review): TLS certificate verification is disabled; left unchanged
        // because tightening it would alter which sites can be crawled.
        CURLOPT_SSL_VERIFYPEER       => false,
        CURLOPT_SSL_VERIFYHOST       => false,
    );
    if (spider::$cookie) {
        $options[CURLOPT_COOKIE] = spider::$cookie;
    }
    if (spider::$curl_proxy) {
        $proxy = spiderTools::proxy_test();
        if ($proxy) {
            $options = spiderTools::proxy($options, $proxy);
        }
    }

    $ch = curl_init();
    curl_setopt_array($ch, $options);
    $responses = curl_exec($ch);
    $info = curl_getinfo($ch);

    $debug = spider::$dataTest || spider::$ruleTest;
    if ($debug) {
        echo "<b>{$url} 头信息:</b><pre>";
        print_r($info);
        echo '</pre><hr />';
        if (!empty($_GET['breakinfo'])) {
            exit;
        }
    }

    // Manual redirect handling.
    if (in_array($info['http_code'], array(301, 302)) && $_count < 5) {
        $_count++;
        $newurl = empty($info['redirect_url']) ? '' : $info['redirect_url'];
        if ($newurl === '') {
            // cURL did not report a target: re-run the request with headers
            // included and read the Location header ourselves.
            curl_setopt($ch, CURLOPT_HEADER, 1);
            $header = curl_exec($ch);
            // Anchored so that "Content-Location:" is not mistaken for "Location:".
            if (is_string($header) && preg_match('|^Location:\s*(.*)$|im', $header, $matches)) {
                $newurl = trim($matches[1]);
            }
            if ($newurl === '') {
                curl_close($ch);
                return false;
            }
            // Relative target: resolve against the origin of the requested URL.
            // Both http:// and https:// count as absolute.
            if (!preg_match('#^https?://#i', $newurl)) {
                if ($origin === '') {
                    curl_close($ch);
                    return false;
                }
                $newurl = $origin . '/' . ltrim($newurl, '/');
            }
        }
        $newurl = trim($newurl);
        curl_close($ch);
        return spiderTools::remote($newurl, $_count);
    }

    // Hard failures are not retried.
    if (in_array($info['http_code'], array(404, 500))) {
        curl_close($ch);
        return false;
    }

    // Anything else that is empty or non-200 is retried until the budget runs out.
    if ((empty($responses) || $info['http_code'] != 200) && $_count < 5) {
        $_count++;
        if ($debug) {
            echo $url . '<br />';
            echo "获取内容失败,重试第{$_count}次...<br />";
        }
        curl_close($ch);
        return spiderTools::remote($url, $_count);
    }

    // Charset announced by the server, if any; null keeps the previous
    // "not provided" value without relying on an undefined variable.
    $content_charset = null;
    $content_type = isset($info['content_type']) ? (string) $info['content_type'] : '';
    $pos = stripos($content_type, 'charset=');
    if ($pos !== false) {
        $content_charset = trim(substr($content_type, $pos + 8));
    }
    $responses = spiderTools::charsetTrans($responses, $content_charset, spider::$charset);
    curl_close($ch);

    if ($debug) {
        echo '<pre>';
        print_r(htmlspecialchars(substr($responses, 0, 800)));
        echo '</pre><hr />';
    }

    spider::$url = $url;
    return $responses;
}