/**
 * Download a page and return its content.
 *
 * Builds a link descriptor from $url and $options, configures the shared
 * `requests` client (encodings, timeout, user agent, proxy, extra headers),
 * performs the request and then dispatches on the HTTP status code:
 *
 *  - 200            : counted in self::$collect_succ, the body is returned;
 *  - 301 / 302      : the redirect target is fetched recursively and that
 *                     call's result is returned as-is (the nested call does
 *                     its own success/failure accounting);
 *  - 404            : logged, counted in self::$collect_fail;
 *  - 407            : the link is pushed back on the queue, logged, counted
 *                     as a failure;
 *  - 0/502/503/429  : try_num is incremented and the link is pushed back on
 *                     the queue while try_num <= max_try; logged, counted as
 *                     a failure;
 *  - anything else  : logged, counted as a failure.
 *
 * If $this->on_status_code is set it is invoked with
 * ($http_code, $url, $html, $this) before the dispatch above; a non-null
 * return value replaces the body, and a falsy body makes this method return
 * false immediately without touching either counter.
 *
 * Recognised $options keys (all optional): url_type, method, headers, params,
 * context_data, proxy, try_num, max_try, max_redirect (maximum number of
 * 301/302 hops to follow, default 20) and redirect_num (internal hop counter
 * maintained by this method while following redirects).
 *
 * @param mixed $url     URL to download
 * @param mixed $options per-request overrides, see above
 * @return mixed the page content (or the callback's replacement) on success, false on failure
 * @author seatle <*****@*****.**>
 * @created time :2016-09-18 10:17
 */
public function request_url($url, $options = array())
{
    // Link descriptor; this is what gets pushed back on the queue for a retry.
    $link = array(
        'url'          => $url,
        'url_type'     => isset($options['url_type']) ? $options['url_type'] : '',
        'method'       => isset($options['method']) ? $options['method'] : 'get',
        'headers'      => isset($options['headers']) ? $options['headers'] : array(),
        'params'       => isset($options['params']) ? $options['params'] : array(),
        'context_data' => isset($options['context_data']) ? $options['context_data'] : '',
        'proxy'        => isset($options['proxy']) ? $options['proxy'] : self::$configs['proxy'],
        'try_num'      => isset($options['try_num']) ? $options['try_num'] : 0,
        'max_try'      => isset($options['max_try']) ? $options['max_try'] : self::$configs['max_try'],
    );

    // Redirect bookkeeping: bounds the recursion below so a redirect loop
    // cannot recurse forever.
    $redirect_num = isset($options['redirect_num']) ? (int) $options['redirect_num'] : 0;
    $max_redirect = isset($options['max_redirect']) ? (int) $options['max_redirect'] : 20;

    // When an input encoding is configured, use it instead of letting
    // `requests` detect it.
    if (isset(self::$configs['input_encoding'])) {
        requests::$input_encoding = self::$configs['input_encoding'];
    }
    // Always ask for UTF-8 output (original author's note: the XPath
    // processing downstream only supports UTF-8).
    requests::$output_encoding = 'utf-8';
    requests::set_timeout(self::$configs['timeout']);
    requests::set_useragent(self::$configs['user_agent']);

    // Route both http and https through the proxy when one is set.
    if (!empty($link['proxy'])) {
        requests::set_proxies(array('http' => $link['proxy'], 'https' => $link['proxy']));
        // Original author's note: asks the proxy to switch IP automatically.
        requests::add_header('Proxy-Switch-Ip', 'yes');
    }

    // Extra HTTP headers supplied by the caller.
    if (!empty($link['headers'])) {
        foreach ($link['headers'] as $k => $v) {
            requests::add_header($k, $v);
        }
    }

    // Raw context data (e.g. a JSON or XML body) takes precedence over the
    // regular params; the receiving side can read it from php://input.
    $params = empty($link['context_data']) ? $link['params'] : $link['context_data'];
    $method = strtolower($link['method']);
    $html = requests::$method($url, $params);
    $http_code = requests::$status_code;

    // Give the user callback a chance to inspect or replace the body.
    if ($this->on_status_code) {
        $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
        if (isset($return)) {
            $html = $return;
        }
        if (!$html) {
            return false;
        }
    }

    if ($http_code == 200) {
        self::$collect_succ++;
        return $html;
    }

    // 301 / 302: fetch the page the server redirected to.
    if ($http_code == 301 || $http_code == 302) {
        $info = requests::$info;
        $redirect_url = isset($info['redirect_url']) ? $info['redirect_url'] : '';

        if (empty($redirect_url) || $redirect_num >= $max_redirect) {
            log::error(date("H:i:s") . " Failed to download {$url}\n");
            log::error(date("H:i:s") . " HTTP CODE: {$http_code} redirect target missing or too many redirects\n");
            self::$collect_fail++;
            return false;
        }

        $options['redirect_num'] = $redirect_num + 1;
        // Return the nested result directly: the nested call has already
        // updated collect_succ / collect_fail, so counting again here would
        // record a second success (even when the redirect target failed).
        return $this->request_url($redirect_url, $options);
    }

    if ($http_code == 404) {
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code} Not Found\n");
    } elseif ($http_code == 407) {
        // Push the link back on the queue so it is collected again.
        $this->queue_rpush($link);
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " Proxy server authentication failed, please check the proxy server settings\n");
    } elseif (in_array($http_code, array('0', '502', '503', '429'))) {
        // Loose comparison is kept on purpose: the type of
        // requests::$status_code is not visible from here.
        $link['try_num']++;
        // Re-queue only while the retry budget is not exhausted.
        if ($link['try_num'] <= $link['max_try']) {
            $this->queue_rpush($link);
        }
        log::error(date("H:i:s") . " Download page {$url} failed, Read timed out, will try again later, retry({$link['try_num']})\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code} service unavailable\n");
    } else {
        log::error(date("H:i:s") . " Failed to download {$url}\n");
        log::error(date("H:i:s") . " HTTP CODE: {$http_code}\n");
    }

    self::$collect_fail++;
    return false;
}