/**
 * Crawl the search result pages for a keyword and collect the documents found.
 *
 * Starting at page $next, each page is fetched through httpCls, links are
 * extracted from the response with a regular expression, and self::next()
 * is asked for the following page value. Crawling stops as soon as
 * self::next() returns a falsy value.
 *
 * @param string $key  Search keyword; URL-encoded into the `word` query parameter.
 * @param int    $next Value sent as the `pn` query parameter of the first request.
 *
 * @return array Map of captured link target (the href text before `?ssid=`)
 *               to the word captured after the final dot of the link text.
 *               When the same target appears on several pages, the value
 *               from the later page wins.
 */
public static function search($key, $next = 1)
{
    // Capture 1: href up to "?ssid="; capture 3: word characters after the
    // last dot of the link text (used by callers as the file type).
    $preg = "/\\<p\\><a href=\"(.*)?\\?ssid\\=(.*)?\"\\>.*?\\.(\\w*?)\\<\\/a\\>/i";
    $data = array();

    // Iterate page by page instead of recursing once per page, so the call
    // stack does not grow with the number of result pages.
    do {
        echo '抓取页面' . PHP_EOL;
        self::$key = $key;

        httpCls::$response = '';
        httpCls::set('host', crawlConf::$host);
        httpCls::set('uri', crawlConf::$search . '?word=' . urlencode($key) . '&pn=' . $next . '&ssid=&lc=&from=&bd_page_type=&uid=&pu=&st=&wk=');
        httpCls::set('agent', crawlConf::$browser[0]['agent']);
        httpCls::set('accept', crawlConf::$browser[0]['accept']);
        httpCls::set('cookie', crawlConf::$browser[0]['cookie']);
        httpCls::send();

        // Response body of the request that was just sent.
        $content = httpCls::$response;

        // preg_match_all() returns 0 when nothing matched and false on a
        // PCRE error; in both cases this page contributes no entries.
        $pageData = array();
        if (preg_match_all($preg, $content, $matches)) {
            foreach ($matches[1] as $k => $id) {
                $pageData[$id] = $matches[3][$k];
            }
        }
        self::log('data', var_export($pageData, 1));

        // array_replace() keeps keys intact. array_merge() would renumber
        // any key PHP stores as an integer (i.e. purely numeric IDs),
        // silently replacing those IDs with 0, 1, 2, ...
        $data = array_replace($data, $pageData);

        $next = self::next($content);
    } while ($next);

    return $data;
}
// Keyword whose search results are crawled.
$key = '高考数学';

// Map of document id => file type, as returned by the search crawler.
$list = crawlCls::search($key);
echo count($list) . PHP_EOL;

foreach ($list as $id => $type) {
    // 'doc' is the only type with an active handler: parse it and persist the result.
    if ($type === 'doc') {
        $data = crawlCls::doc($id);
        crawlCls::save($data);
        continue;
    }

    // Handlers for 'pdf', 'ppt' and 'txt' are not enabled (their calls were
    // left commented out), so those types are reported exactly like any
    // other unsupported type.
    crawlCls::log('type', $type . '的文件不可解析');
}

echo '抓取完成';