/**
 * Download a web page and return its content.
 *
 * Recognised keys in $options (all optional):
 *  - url_type, context_data: default to ''
 *  - method: HTTP method name, defaults to 'get'; lower-cased and used as
 *    the name of the static method invoked on `requests`
 *  - headers, params: default to empty arrays
 *  - proxy: defaults to self::$configs['proxy']
 *  - try_num: number of attempts already made, defaults to 0
 *  - max_try: defaults to self::$configs['max_try']
 *
 * Side effects: increments self::$collect_succ on an HTTP 200 response and
 * self::$collect_fail on a failed download; may push the link back onto the
 * queue for a later retry (407, or 0/502/503/429 while try_num <= max_try).
 *
 * NOTE(review): 301/302 redirects are followed recursively without a depth
 * limit, so a redirect loop on the remote side will recurse indefinitely.
 *
 * @param mixed $url      URL to download
 * @param mixed $options  per-request settings, see above
 * @return mixed the downloaded content (or the non-null value returned by the
 *               on_status_code callback), or false when the download failed
 *               or the callback left the content empty
 * @author seatle <*****@*****.**>
 * @created time :2016-09-18 10:17
 */
public function request_url($url, $options = array())
{
    $link = array(
        'url'          => $url,
        'url_type'     => isset($options['url_type']) ? $options['url_type'] : '',
        'method'       => isset($options['method']) ? $options['method'] : 'get',
        'headers'      => isset($options['headers']) ? $options['headers'] : array(),
        'params'       => isset($options['params']) ? $options['params'] : array(),
        'context_data' => isset($options['context_data']) ? $options['context_data'] : '',
        'proxy'        => isset($options['proxy']) ? $options['proxy'] : self::$configs['proxy'],
        'try_num'      => isset($options['try_num']) ? $options['try_num'] : 0,
        'max_try'      => isset($options['max_try']) ? $options['max_try'] : self::$configs['max_try'],
    );

    // When an input encoding is configured, do not let requests guess it.
    if (isset(self::$configs['input_encoding'])) {
        requests::$input_encoding = self::$configs['input_encoding'];
    }
    // Anything that is not utf-8 must be converted, because xpath only supports utf-8.
    requests::$output_encoding = 'utf-8';
    requests::set_timeout(self::$configs['timeout']);
    requests::set_useragent(self::$configs['user_agent']);

    // Route the request through the proxy when one is set.
    if (!empty($link['proxy'])) {
        requests::set_proxies(array('http' => $link['proxy'], 'https' => $link['proxy']));
        // Switch IP automatically.
        requests::add_header('Proxy-Switch-Ip', 'yes');
    }

    // Apply any caller-supplied HTTP headers.
    if (!empty($link['headers'])) {
        foreach ($link['headers'] as $k => $v) {
            requests::add_header($k, $v);
        }
    }

    // If extra raw data (e.g. JSON or XML) was supplied, send it as-is instead
    // of the params; a PHP receiver can read it with file_get_contents("php://input").
    $params = empty($link['context_data']) ? $link['params'] : $link['context_data'];
    $method = strtolower($link['method']);
    $html = requests::$method($url, $params);
    $http_code = requests::$status_code;

    // Let the user callback inspect the response; a non-null return value
    // replaces the content, and empty content aborts the request.
    if ($this->on_status_code) {
        $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
        if (isset($return)) {
            $html = $return;
        }
        if (!$html) {
            return false;
        }
    }

    if ($http_code == 200) {
        self::$collect_succ++;
        return $html;
    }

    // For 301/302 redirects, fetch the content of the redirect target.
    if ($http_code == 301 || $http_code == 302) {
        $info = requests::$info;
        $redirect_url = isset($info['redirect_url']) ? $info['redirect_url'] : '';
        if (empty($redirect_url)) {
            // Nothing to follow: count it as a failed download instead of
            // requesting an empty URL.
            log::error(date("H:i:s") . " Failed to download {$url}\n");
            log::error(date("H:i:s") . " HTTP CODE: {$http_code} redirect without a target URL\n");
            self::$collect_fail++;
            return false;
        }
        // The recursive call already updates the success/failure counters,
        // so return its result directly instead of counting it again here.
        return $this->request_url($redirect_url, $options);
    }

    if ($http_code == 404) {
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code} Not Found\n");
    } elseif ($http_code == 407) {
        // Push the link back onto the queue so it is collected again.
        $this->queue_rpush($link);
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " Proxy server authentication failed, please check the proxy server settings\n");
    } elseif (in_array($http_code, array('0', '502', '503', '429'))) {
        // One more attempt has been made.
        $link['try_num']++;
        // Requeue only while the attempt count is within the allowed number of failures.
        if ($link['try_num'] <= $link['max_try']) {
            $this->queue_rpush($link);
        }
        log::error(date("H:i:s") . " Download page {$url} failed, Read timed out, will try again later, retry({$link['try_num']})\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code} service unavailable\n");
    } else {
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code}\n");
    }

    self::$collect_fail++;
    return false;
}
/**
 * Extract the response body from the raw HTTP stream in self::$raw and
 * convert it to the configured output encoding.
 *
 * Steps:
 *  1. If self::$raw is non-empty, process cookies via get_response_cookies(),
 *     strip the leading HTTP header block(s) and hand the real header block to
 *     get_response_headers().
 *  2. If self::$input_encoding is not set, detect it from the header's
 *     charset parameter, falling back to _get_encode() on the body and
 *     finally to 'gbk'. The detected value is stored in self::$input_encoding.
 *  3. If self::$output_encoding is set and differs from the input encoding,
 *     convert the body and rewrite any <meta ... charset=...> tag to UTF-8.
 *
 * @param mixed $domain passed through to get_response_cookies()
 * @return string the (possibly re-encoded) response body; '' when self::$raw is empty
 */
public static function get_response_body($domain)
{
    $header = $body = '';

    // Parse the raw HTTP stream.
    if (!empty(self::$raw)) {
        self::get_response_cookies($domain);

        // The body itself may contain "\r\n\r\n", but header blocks always
        // come first, so only the leading segments are examined.
        $segments = explode("\r\n\r\n", self::$raw);
        foreach ($segments as $k => $segment) {
            // A POST can produce two header blocks:
            // "HTTP/1.1 100 Continue" followed by "HTTP/1.1 200 OK".
            if (preg_match("#^HTTP/.*? 100 Continue#", $segment)) {
                unset($segments[$k]);
                continue;
            }
            if (preg_match("#^HTTP/.*? \\d+ #", $segment)) {
                $header = $segment;
                unset($segments[$k]);
                self::get_response_headers($segment);
                continue;
            }
            // First non-header segment: the body starts here. Stop so that
            // body text which merely looks like a status line is not dropped.
            break;
        }
        $body = implode("\r\n\r\n", $segments);
    }

    // If the caller did not specify the page's input encoding
    // (utf-8, gb2312, ...), work it out here.
    if (self::$input_encoding == null) {
        // Try the response header first. ';' ends the charset value so that a
        // following Content-Type parameter is not captured as part of it.
        preg_match("/charset=([^\\s;]*)/i", $header, $out);
        $encode = empty($out[1]) ? '' : str_replace(array('"', '\''), '', strtolower(trim($out[1])));
        if (empty($encode)) {
            // The header does not always carry the encoding; in that case
            // detect it from the body text.
            $encode = self::_get_encode($body);
            $encode = strtolower($encode);
            if ($encode == false || $encode == "ascii") {
                $encode = 'gbk';
            }
        }
        self::$input_encoding = $encode;
    }

    // Transcode when an output encoding is set. Note: xpath only supports utf-8.
    if (self::$output_encoding && self::$input_encoding != self::$output_encoding) {
        // Convert the body to the output encoding first.
        $body = mb_convert_encoding($body, self::$output_encoding, self::$input_encoding);
        // Then rewrite the charset declared inside the page to UTF-8.
        $body = preg_replace("/<meta([^>]*)charset=([^>]*)>/is", '<meta charset="UTF-8">', $body);
    }

    return $body;
}