Esempio n. 1
0
 /**
  * Crawl the search-result pages for a keyword and collect the listed documents.
  *
  * Starting at page $next, the search URI is requested through httpCls, the
  * result links are extracted with a regular expression, and the page value
  * reported by self::next() is followed until it returns a falsy value.
  *
  * Pages are walked in a loop rather than by recursion, so the call depth
  * does not grow with the number of result pages.
  *
  * @param string     $key  Search keyword; URL-encoded before being sent.
  * @param int|string $next Value placed in the "pn" query parameter (defaults to 1).
  *
  * @return array Map of captured link target (the href part before "?ssid=")
  *               to the captured file extension. When the same key appears on
  *               several pages, the value from the later page wins.
  */
 public static function search($key, $next = 1)
 {
     $data = array();
     do {
         echo '抓取页面' . PHP_EOL;
         self::$key = $key;
         httpCls::$response = '';
         httpCls::set('host', crawlConf::$host);
         httpCls::set('uri', crawlConf::$search . '?word=' . urlencode($key) . '&pn=' . $next . '&ssid=&lc=&from=&bd_page_type=&uid=&pu=&st=&wk=');
         httpCls::set('agent', crawlConf::$browser[0]['agent']);
         httpCls::set('accept', crawlConf::$browser[0]['accept']);
         httpCls::set('cookie', crawlConf::$browser[0]['cookie']);
         httpCls::send();
         // Response body of the request that was just sent.
         $content = httpCls::$response;
         // Capture groups: 1 = href before "?ssid=", 2 = ssid value, 3 = extension after the last dot of the link text.
         // NOTE(review): "(.*)?" is greedy, so two result links on one line would be captured as a single match — confirm against real markup.
         $preg = "/\\<p\\><a href=\"(.*)?\\?ssid\\=(.*)?\"\\>.*?\\.(\\w*?)\\<\\/a\\>/i";
         $page = array();
         // preg_match_all() returns false on error and 0 on no match; both leave $page empty.
         if (preg_match_all($preg, $content, $matches) > 0) {
             foreach ($matches[1] as $k => $id) {
                 $page[$id] = $matches[3][$k];
             }
         }
         self::log('data', var_export($page, true));
         // array_replace() instead of array_merge(): PHP casts numeric-string keys to integers
         // and array_merge() would renumber them, discarding the captured ids.
         $data = array_replace($data, $page);
         $next = self::next($content);
     } while ($next);
     return $data;
 }
Esempio n. 2
0
// Keyword to crawl.
$key = '高考数学';
// Map of document id => file type, as returned by the search crawl.
$list = crawlCls::search($key);
echo count($list) . PHP_EOL;
foreach ($list as $id => $type) {
    // Only "doc" has an active handler. The pdf/ppt/txt handlers are disabled
    // (their calls were commented out), so those types are reported as
    // unparseable like every other type. Loose comparison mirrors the
    // semantics of the switch statement this replaces.
    if ($type != 'doc') {
        crawlCls::log('type', $type . '的文件不可解析');
        continue;
    }
    // Parse the document according to its type, then persist the result.
    $data = crawlCls::doc($id);
    crawlCls::save($data);
}
echo '抓取完成';