Beispiel #1
0
 /**
  * Load one spider rule by id and prime the spider's request settings from it.
  *
  * Reads a single row from the spider_rule table, decodes its serialized
  * `rule` column, and copies the user agent, cURL encoding/referer/cookie and
  * charset onto the static properties of `spider`.
  *
  * @param int|string $id Rule id; cast to int before it is placed in the SQL text.
  * @return array The row (as returned by iDB::row with ARRAY_A) with `rule`
  *               decoded and `user_agent` filled in with a default when empty.
  */
 public static function rule($id)
 {
     // The id is interpolated into the query string, so force it to an integer
     // to keep arbitrary input out of the SQL.
     $id = (int) $id;
     $rs = iDB::row("SELECT * FROM `#iCMS@__spider_rule` WHERE `id`='{$id}' LIMIT 1;", ARRAY_A);

     // A missing row must not break the array accesses below.
     if (!is_array($rs)) {
         $rs = array();
     }

     if (!empty($rs['rule'])) {
         // NOTE(review): unserialize() on stored data can instantiate objects;
         // this is only safe as long as the rule column is never attacker-controlled.
         $rs['rule'] = stripslashes_deep(unserialize($rs['rule']));
     }

     // Only index into the decoded rule when decoding actually produced an array.
     $rule = (isset($rs['rule']) && is_array($rs['rule'])) ? $rs['rule'] : array();
     $curl = (isset($rule['curl']) && is_array($rule['curl'])) ? $rule['curl'] : array();

     if (empty($rs['user_agent'])) {
         $rs['user_agent'] = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
     }

     // Prefer the user agent configured in the rule; fall back to the default
     // above so the spider never sends an empty User-Agent.
     spider::$useragent = !empty($rule['user_agent']) ? $rule['user_agent'] : $rs['user_agent'];
     spider::$encoding = isset($curl['encoding']) ? $curl['encoding'] : null;
     spider::$referer = isset($curl['referer']) ? $curl['referer'] : null;
     spider::$cookie = isset($curl['cookie']) ? $curl['cookie'] : null;
     spider::$charset = isset($rule['charset']) ? $rule['charset'] : null;

     return $rs;
 }
Beispiel #2
0
 /**
  * Fetch a URL with cURL using the settings stored on `spider`, following
  * redirects and retrying failed requests up to five times.
  *
  * Side effects: sets spider::$referer when it is empty, sets spider::$url on
  * success, and echoes diagnostic HTML when spider::$dataTest or
  * spider::$ruleTest is set (terminating the script if `breakinfo` is present
  * in the query string).
  *
  * @param string $url    Address to fetch; entity-encoded ampersands are decoded.
  * @param int    $_count Internal attempt counter shared by redirects and retries.
  * @return mixed The response body after spiderTools::charsetTrans(), or false
  *               on HTTP 404/500 or when a redirect has no usable target.
  */
 public static function remote($url, $_count = 0)
 {
     // URLs scraped out of HTML carry entity-encoded ampersands; decode them.
     $url = str_replace('&amp;', '&', $url);

     // Parse unconditionally: the origin is needed to resolve relative
     // redirects even when a referer has already been configured.
     $uri = parse_url($url);
     $scheme = (is_array($uri) && isset($uri['scheme'])) ? $uri['scheme'] : '';
     $host = (is_array($uri) && isset($uri['host'])) ? $uri['host'] : '';
     $origin = $scheme . '://' . $host;
     if (empty(spider::$referer)) {
         spider::$referer = $origin;
     }

     // NOTE(review): certificate verification is disabled below; kept as-is so
     // sites with broken certificates can still be crawled.
     $options = array(
         CURLOPT_URL => $url,
         CURLOPT_ENCODING => spider::$encoding,
         CURLOPT_REFERER => spider::$referer,
         CURLOPT_USERAGENT => spider::$useragent,
         CURLOPT_TIMEOUT => 10,
         CURLOPT_CONNECTTIMEOUT => 10,
         CURLOPT_RETURNTRANSFER => 1,
         CURLOPT_FAILONERROR => 1,
         CURLOPT_HEADER => 0,
         CURLOPT_NOSIGNAL => true,
         CURLOPT_DNS_USE_GLOBAL_CACHE => true,
         CURLOPT_DNS_CACHE_TIMEOUT => 86400,
         CURLOPT_SSL_VERIFYPEER => false,
         CURLOPT_SSL_VERIFYHOST => false,
     );
     spider::$cookie && ($options[CURLOPT_COOKIE] = spider::$cookie);
     if (spider::$curl_proxy) {
         $proxy = spiderTools::proxy_test();
         $proxy && ($options = spiderTools::proxy($options, $proxy));
     }

     $ch = curl_init();
     curl_setopt_array($ch, $options);
     $responses = curl_exec($ch);
     $info = curl_getinfo($ch);
     $code = isset($info['http_code']) ? (int) $info['http_code'] : 0;

     $debug = spider::$dataTest || spider::$ruleTest;
     // The URL may come from scraped content, so escape it before printing it as HTML.
     $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
     if ($debug) {
         echo "<b>{$safeUrl} 头信息:</b><pre>";
         print_r($info);
         echo '</pre><hr />';
         if (!empty($_GET['breakinfo'])) {
             exit;
         }
     }

     // Follow redirects by hand so the attempt counter bounds the chain.
     if (in_array($code, array(301, 302, 303, 307, 308), true) && $_count < 5) {
         $_count++;
         $newurl = isset($info['redirect_url']) ? trim($info['redirect_url']) : '';
         if ($newurl === '') {
             // cURL did not report a target: repeat the request with headers
             // included and read the Location header ourselves.
             curl_setopt($ch, CURLOPT_HEADER, 1);
             $header = curl_exec($ch);
             // Anchored so that "Content-Location:" is not mistaken for "Location:".
             if (is_string($header) && preg_match('/^Location:\s*(.*)$/mi', $header, $matches)) {
                 $newurl = trim($matches[1]);
             }
             if ($newurl === '') {
                 curl_close($ch);
                 return false;
             }
             // Anything without an http(s) scheme is resolved against the site root.
             if (!preg_match('#^https?://#i', $newurl)) {
                 $newurl = $origin . '/' . ltrim($newurl, '/');
             }
         }
         curl_close($ch);
         unset($responses, $info);
         return spiderTools::remote($newurl, $_count);
     }

     // These statuses are treated as final; retrying would not help.
     if (in_array($code, array(404, 500), true)) {
         curl_close($ch);
         unset($responses, $info);
         return false;
     }

     // Any other failure (empty body or non-200) is retried with the same URL.
     if ((empty($responses) || $code != 200) && $_count < 5) {
         $_count++;
         if ($debug) {
             echo $safeUrl . '<br />';
             echo "获取内容失败,重试第{$_count}次...<br />";
         }
         curl_close($ch);
         unset($responses, $info);
         return spiderTools::remote($url, $_count);
     }

     // Pull the charset out of the Content-Type header when the server sent one;
     // otherwise pass null and let charsetTrans() decide.
     $content_charset = null;
     $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
     $pos = stripos($contentType, 'charset=');
     if ($pos !== false) {
         // Strip surrounding whitespace and optional quotes around the value.
         $content_charset = trim(substr($contentType, $pos + 8), " \t\n\r\0\x0B\"'");
     }
     $responses = spiderTools::charsetTrans($responses, $content_charset, spider::$charset);
     curl_close($ch);
     unset($info);

     if ($debug) {
         echo '<pre>';
         print_r(htmlspecialchars(substr($responses, 0, 800)));
         echo '</pre><hr />';
     }
     spider::$url = $url;
     return $responses;
 }