Beispiel #1
0
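 /**
  * Download a web page and return its content.
  * 
  * @param mixed $url     URL of the page to fetch
  * @param mixed $options Optional per-request settings; recognised keys are url_type,
  *                       method, headers, params, context_data, proxy, try_num and max_try
  * @return mixed The page content on success, or false when the download fails
  *               or the on_status_code callback leaves no content
  * @author seatle <*****@*****.**> 
  * @created time :2016-09-18 10:17
  */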
 public function request_url($url, $options = array())
 {
     // Normalise the per-request options into a "link" record. This same record is
     // what gets pushed back onto the queue when the download has to be retried.
     $link = array(
         'url'          => $url,
         'url_type'     => isset($options['url_type']) ? $options['url_type'] : '',
         'method'       => isset($options['method']) ? $options['method'] : 'get',
         'headers'      => isset($options['headers']) ? $options['headers'] : array(),
         'params'       => isset($options['params']) ? $options['params'] : array(),
         'context_data' => isset($options['context_data']) ? $options['context_data'] : '',
         'proxy'        => isset($options['proxy']) ? $options['proxy'] : self::$configs['proxy'],
         'try_num'      => isset($options['try_num']) ? $options['try_num'] : 0,
         'max_try'      => isset($options['max_try']) ? $options['max_try'] : self::$configs['max_try'],
     );

     // If an input encoding is configured, pass it on so that requests does not
     // have to work out the encoding by itself.
     if (isset(self::$configs['input_encoding'])) {
         requests::$input_encoding = self::$configs['input_encoding'];
     }
     // Always ask for UTF-8 output (original author's note: XPath only supports UTF-8).
     requests::$output_encoding = 'utf-8';
     requests::set_timeout(self::$configs['timeout']);
     requests::set_useragent(self::$configs['user_agent']);

     // Use the proxy for both http and https when one is set.
     if (!empty($link['proxy'])) {
         requests::set_proxies(array('http' => $link['proxy'], 'https' => $link['proxy']));
         // Ask for automatic IP switching (per the original author's note).
         requests::add_header('Proxy-Switch-Ip', 'yes');
     }

     // Forward any caller-supplied HTTP headers.
     if (!empty($link['headers'])) {
         foreach ($link['headers'] as $name => $value) {
             requests::add_header($name, $value);
         }
     }

     // Extra payload data (e.g. JSON or XML) takes precedence over the regular
     // params; per the original note the receiving PHP side can read it from php://input.
     $params = empty($link['context_data']) ? $link['params'] : $link['context_data'];
     $method = strtolower($link['method']);
     $html = requests::$method($url, $params);
     $http_code = requests::$status_code;

     // Let the user callback inspect the response; a non-null return value replaces
     // the content, and empty content aborts the download without touching the counters.
     if ($this->on_status_code) {
         $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
         if (isset($return)) {
             $html = $return;
         }
         if (!$html) {
             return false;
         }
     }

     if ($http_code == 200) {
         self::$collect_succ++;
         return $html;
     }

     // 301/302: fetch the redirect target instead.
     if ($http_code == 301 || $http_code == 302) {
         // presumably requests::$info carries a 'redirect_url' entry for redirects — TODO confirm
         $info = requests::$info;
         $redirect_url = isset($info['redirect_url']) ? $info['redirect_url'] : '';
         if (empty($redirect_url)) {
             // Nothing to follow: treat it as a failed download rather than
             // recursing with an empty URL.
             log::error(date("H:i:s") . " Failed to download {$url}\n");
             log::error(date("H:i:s") . " HTTP CODE: {$http_code} redirect without a target URL\n");
             self::$collect_fail++;
             return false;
         }
         // The nested call already updates the success/failure counters, so its
         // result is returned as-is; counting again here would record a success
         // twice, or record a success for a failed redirect.
         // NOTE(review): there is no limit on the number of redirects followed.
         return $this->request_url($redirect_url, $options);
     }

     if ($http_code == 404) {
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code} Not Found\n");
     } elseif ($http_code == 407) {
         // Proxy authentication failed: put the link back on the queue so it is collected again.
         $this->queue_rpush($link);
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " Proxy server authentication failed, please check the proxy server settings\n");
     } elseif (in_array($http_code, array('0', '502', '503', '429'))) {
         // Loose comparison is deliberate: the list holds strings and the type of
         // the status code is not fixed here.
         // Count this attempt and re-queue the link while it is within the allowed number of tries.
         $link['try_num']++;
         if ($link['try_num'] <= $link['max_try']) {
             $this->queue_rpush($link);
         }
         log::error(date("H:i:s") . " Download page {$url} failed, Read timed out, will try again later, retry({$link['try_num']})\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code} service unavailable\n");
     } else {
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code}\n");
     }

     self::$collect_fail++;
     return false;
 }