Example #1
0
 /**
  * Download a web page and return its content.
  *
  * Builds a link descriptor from $url and $options, configures the static
  * `requests` client (encodings, timeout, user agent, proxy, extra headers),
  * performs the request and then handles the HTTP status code:
  *   - 200: the content is returned and the success counter is incremented;
  *   - 301/302: the redirect target reported by the client is fetched instead;
  *   - 407: the link is re-queued and the download is reported as failed;
  *   - 0/502/503/429: the link is re-queued while try_num <= max_try;
  *   - anything else: the failure is logged.
  *
  * @param mixed $url     URL to download
  * @param mixed $options Optional per-request settings; recognised keys are
  *                       url_type, method, headers, params, context_data,
  *                       proxy, try_num and max_try
  * @return mixed The page content (possibly replaced by the on_status_code
  *               callback), or false when the download failed or the
  *               callback produced an empty result
  * @author seatle <*****@*****.**> 
  * @created time :2016-09-18 10:17
  */
 public function request_url($url, $options = array())
 {
     $link = array(
         'url' => $url,
         'url_type' => isset($options['url_type']) ? $options['url_type'] : '',
         'method' => isset($options['method']) ? $options['method'] : 'get',
         'headers' => isset($options['headers']) ? $options['headers'] : array(),
         'params' => isset($options['params']) ? $options['params'] : array(),
         'context_data' => isset($options['context_data']) ? $options['context_data'] : '',
         'proxy' => isset($options['proxy']) ? $options['proxy'] : self::$configs['proxy'],
         'try_num' => isset($options['try_num']) ? $options['try_num'] : 0,
         'max_try' => isset($options['max_try']) ? $options['max_try'] : self::$configs['max_try'],
     );

     // When an input encoding is configured, do not let requests detect it
     if (isset(self::$configs['input_encoding'])) {
         requests::$input_encoding = self::$configs['input_encoding'];
     }
     // Content that is not utf-8 must be converted to utf-8, because xpath only supports utf-8
     requests::$output_encoding = 'utf-8';
     requests::set_timeout(self::$configs['timeout']);
     requests::set_useragent(self::$configs['user_agent']);

     // Use the proxy for both schemes when one is configured
     if (!empty($link['proxy'])) {
         requests::set_proxies(array('http' => $link['proxy'], 'https' => $link['proxy']));
         // Switch IP automatically
         requests::add_header('Proxy-Switch-Ip', 'yes');
     }

     // Extra HTTP headers supplied by the caller
     if (!empty($link['headers'])) {
         foreach ($link['headers'] as $k => $v) {
             requests::add_header($k, $v);
         }
     }

     // When context data (e.g. json or xml) is supplied it is sent instead of params;
     // the receiving PHP side can read it with file_get_contents("php://input")
     $params = empty($link['context_data']) ? $link['params'] : $link['context_data'];
     $method = strtolower($link['method']);
     $html = requests::$method($url, $params);
     $http_code = requests::$status_code;

     // Let the user callback inspect the response; a non-null return value replaces the content
     if ($this->on_status_code) {
         $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
         if (isset($return)) {
             $html = $return;
         }
         if (!$html) {
             return false;
         }
     }

     if ($http_code == 200) {
         self::$collect_succ++;
         return $html;
     }

     // For 301/302 redirects, fetch the content of the redirect target
     if ($http_code == 301 || $http_code == 302) {
         $info = requests::$info;
         $redirect_url = isset($info['redirect_url']) ? $info['redirect_url'] : '';
         if (empty($redirect_url)) {
             // No target to follow: report a failed download instead of requesting an empty URL
             log::error(date("H:i:s") . " Failed to download {$url}\n");
             log::error(date("H:i:s") . " HTTP CODE: {$http_code}\n");
             self::$collect_fail++;
             return false;
         }
         // The nested call updates the success/failure counters itself, so its
         // result is returned as-is to avoid counting the same page twice
         return $this->request_url($redirect_url, $options);
     }

     if ($http_code == 404) {
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code} Not Found\n");
     } elseif ($http_code == 407) {
         // Put the link back on the queue so it is collected again
         $this->queue_rpush($link);
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " Proxy server authentication failed, please check the proxy server settings\n");
     } elseif (in_array($http_code, array('0', '502', '503', '429'))) {
         // Count this attempt
         $link['try_num']++;
         // Re-queue only while the attempt count is within the allowed number of failures
         if ($link['try_num'] <= $link['max_try']) {
             $this->queue_rpush($link);
         }
         log::error(date("H:i:s") . " Download page {$url} failed, Read timed out, will try again later, retry({$link['try_num']})\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code} service unavailable\n");
     } else {
         log::error(date("H:i:s") . " Failed to download {$url}\n");
         log::error(date("H:i:s") . " HTTP CODE: {$http_code}\n");
     }

     self::$collect_fail++;
     return false;
 }
Example #2
0
 /**
  * Extract the response body from the raw HTTP stream in self::$raw.
  *
  * Leading HTTP header blocks are stripped from the stream, the input
  * encoding is detected when self::$input_encoding is not set, and the body
  * is converted to self::$output_encoding when the two encodings differ.
  *
  * @param mixed $domain Passed through to get_response_cookies()
  * @return mixed The response body; an empty string when self::$raw is empty
  */
 public static function get_response_body($domain)
 {
     $header = $body = '';

     // Parse the raw HTTP stream
     if (!empty(self::$raw)) {
         self::get_response_cookies($domain);

         // The body itself may contain \r\n\r\n, so only the chunks at the very
         // start of the stream are treated as HTTP headers; everything after the
         // first non-header chunk is body, even if it happens to start with "HTTP/"
         $chunks = explode("\r\n\r\n", self::$raw);
         $total = count($chunks);
         $offset = 0;
         while ($offset < $total) {
             $chunk = $chunks[$offset];
             // A POST can produce two header blocks: HTTP/1.1 100 Continue, then HTTP/1.1 200 OK
             if (preg_match("#^HTTP/.*? 100 Continue#", $chunk)) {
                 $offset++;
                 continue;
             }
             if (!preg_match("#^HTTP/.*? \\d+ #", $chunk)) {
                 break;
             }
             // Keep the last real header block for charset detection below
             $header = $chunk;
             // Return value is not used here; the call is kept in case it records the headers elsewhere
             self::get_response_headers($chunk);
             $offset++;
         }
         $body = implode("\r\n\r\n", array_slice($chunks, $offset));
     }

     // When the caller did not specify the page encoding (utf-8, gb2312, ...), detect it
     if (self::$input_encoding == null) {
         // Try the response header first; stop at whitespace, ';' or quotes so that
         // values such as `charset=utf-8; ...` or `charset="utf-8"` yield a clean name
         preg_match("/charset=[\"']?([^\\s;\"']+)/i", $header, $out);
         $encode = empty($out[1]) ? '' : strtolower(trim($out[1]));
         if (empty($encode)) {
             // Sometimes the response header carries no encoding,
             // so it has to be derived from the body text instead
             $encode = self::_get_encode($body);
             $encode = strtolower($encode);
             if ($encode == false || $encode == "ascii") {
                 $encode = 'gbk';
             }
         }
         self::$input_encoding = $encode;
     }

     // Transcode when an output encoding is set; note that xpath only supports utf-8
     if (self::$output_encoding && self::$input_encoding != self::$output_encoding) {
         // Convert the body from the input encoding to the output encoding
         $body = mb_convert_encoding($body, self::$output_encoding, self::$input_encoding);
         // Rewrite the charset declared in the page's meta tag to UTF-8
         $replaced = preg_replace("/<meta([^>]*)charset=([^>]*)>/is", '<meta charset="UTF-8">', $body);
         // preg_replace() returns null on a PCRE error; keep the converted body in that case
         if ($replaced !== null) {
             $body = $replaced;
         }
     }

     return $body;
 }