静态方法,访问入口
public static Query ( string $page, array $rules, string $range = '', string $outputEncoding = null, string $inputEncoding = null, boolean $removeHead = false ) : mixed
$page | string | 要抓取的网页URL地址(支持https);或者是html源代码 |
$rules | array | 【选择器数组】说明:格式 array("名称"=>array("选择器","类型"[,"标签过滤列表"][,"回调函数"]),.......[,"callback"=>"全局回调函数"]); 【选择器】说明:可以为任意的jQuery选择器语法 【类型】说明:值为 "text"、"html" 或 "HTML标签属性" 【标签过滤列表】:可选,要过滤的选择器名,多个用空格隔开,当标签名前面添加减号(-)时(此时标签可以为任意的元素选择器),表示移除该标签以及标签内容,否则当【类型】值为text时表示需要保留的HTML标签,为html时表示要过滤掉的HTML标签 【回调函数】/【全局回调函数】:可选,字符串(函数名) 或 数组(array("类名","类的静态方法")),回调函数应有两个参数,第一个参数是选择到的内容,第二个参数是选择器数组下标,回调函数会覆盖全局回调函数 |
$range | string | 【块选择器】:指先按照规则选出几个大块,然后再分别在每个块里面进行相关的选择 |
$outputEncoding | string | 【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码 |
$inputEncoding | string | 【输入编码格式】明确指定输入的页面编码格式(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则自动识别 |
$removeHead | boolean | false | 【是否移除页面头部区域】 乱码终极解决方案 |
返回值 | mixed |
/**
 * Log in to the academic-affairs site and keep the resulting session cookie.
 *
 * Returns immediately when a login cookie is already stored. Otherwise posts
 * the student id and password, checks the response page for an `.errorTop`
 * element, and stores the JSESSIONID cookie on both this object and the
 * curl client.
 *
 * @author mohuishou<*****@*****.**>
 * @return $this
 * @throws \Exception code 5001 when the HTTP request reports an error,
 *                    code 4011 when the response page contains a login error
 */
protected function login()
{
    // Already logged in: nothing to do.
    if (!empty($this->_login_cookie)) {
        return $this;
    }

    // Send a random X-Forwarded-For address and the login page as referer.
    $octets = [];
    for ($i = 0; $i < 4; $i++) {
        $octets[] = rand(1, 233);
    }
    $this->_curl->setHeader("X-Forwarded-For", implode('.', $octets));
    $this->_curl->setHeader("Referer", 'http://202.115.47.141/login.jsp');

    $credentials = ["zjh" => $this->_uid, "mm" => $this->_password];
    $this->_curl->post('http://202.115.47.141/loginAction.do', $credentials);
    if ($this->_curl->error) {
        $message = 'Error: ' . $this->_curl->errorCode . ': ' . $this->_curl->errorMessage;
        throw new \Exception($message, 5001);
    }

    // The response is converted from GBK to UTF-8 before it is parsed.
    $html = iconv('GBK', 'UTF-8//IGNORE', $this->_curl->response);

    // Any `.errorTop` text on the page means the login was rejected.
    $errors = QueryList::Query($html, ['err' => ['.errorTop', 'text']])->data;
    if (!empty($errors)) {
        throw new \Exception('Error:' . $errors[0]['err'], 4011);
    }

    // Login succeeded: remember the session id and reuse it on later requests.
    $this->_login_cookie = $this->_curl->getResponseCookie("JSESSIONID");
    $this->_curl->setCookie('JSESSIONID', $this->_login_cookie);

    return $this;
}
/**
 * Scrape one page of the teaching-evaluation questionnaire list.
 *
 * @param int $page value sent as the `page` query parameter
 * @return array 'info' holds the scraped rows (each with 'param' and 'status');
 *               'page' holds the first row produced by the page-number rule,
 *               or null when that rule produced nothing
 */
protected function spider($page = 1)
{
    // Evaluation list page.
    $url = "http://202.115.47.141/jxpgXsAction.do?oper=listWj&page=" . $page;

    $rules = [
        'param' => ['#user img', 'name', '-td[align="right"]', function ($str) {
            // The image's name attribute is split on '#@'; an empty value yields null.
            return $str ? explode('#@', $str) : null;
        }],
        'status' => ['#user img', 'onclick', '', function ($str) {
            // 0 when the onclick handler is exactly "evaluation(this)", otherwise 1.
            // NOTE(review): presumably 0 = not yet evaluated, 1 = evaluated; confirm.
            return trim($str) == "evaluation(this)" ? 0 : 1;
        }],
    ];

    $rules_page = [
        'page' => ['script:eq(3)', 'html', '', function ($page) {
            // NOTE(review): `(.)` captures a single character, so a multi-digit
            // value would be truncated; confirm that is intended.
            $pattern = "/document\\.all\\.pageNo\\.value>(.)/";
            preg_match($pattern, $page, $no);

            // $no has no index 1 when the pattern does not match; empty()
            // covers that case without raising an undefined-offset warning.
            if (!empty($no[1])) {
                return $no[1];
            }

            return null;
        }],
    ];

    $html = $this->get($url);

    $data = [];
    $data['info'] = QueryList::Query($html, $rules)->data;

    // Guard the first-row lookup so an empty result yields null quietly.
    $data_page = QueryList::Query($html, $rules_page)->data;
    $data['page'] = isset($data_page[0]) ? $data_page[0] : null;

    return $data;
}
/**
 * Fetch the timetable for the current term.
 *
 * Rows with an empty courseId are dropped. A row whose courseId compares
 * below 10 is treated as an additional time slot of the previous row: its
 * columns are shifted, so the values are remapped onto a copy of the
 * previous row (plan -> allWeek, courseId -> week, name -> session,
 * credit -> building, courseType -> classroom).
 *
 * @param boolean $is_full when true, additional time-slot rows are expanded
 *                         into full course rows; when false they are removed
 * @author mohuishou<*****@*****.**>
 * @return mixed the scraped rows, keyed by their original row index
 */
protected function spider($is_full = true)
{
    $url = 'http://202.115.47.141/xkAction.do?actionType=6';

    // Scraping rules for the timetable columns.
    $rule = [
        'plan' => ['td:eq(0)', 'text'],
        'courseId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'lessonId' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'courseType' => ['td:eq(5)', 'text'],
        'examType' => ['td:eq(6)', 'text'],
        'teacher' => ['td:eq(7)', 'text', '', 'cbTeacher'],
        'studyWay' => ['td:eq(9)', 'text'],
        'chooseType' => ['td:eq(10)', 'text'],
        'allWeek' => ['td:eq(11)', 'text', '', 'cbWeek'],
        'day' => ['td:eq(12)', 'text'],
        'session' => ['td:eq(13)', 'text', '', 'cbWeek'],
        'campus' => ['td:eq(14)', 'text'],
        'building' => ['td:eq(15)', 'text'],
        'classroom' => ['td:eq(16)', 'text'],
        'callback' => "removeSpace",
    ];

    $page = $this->get($url);
    $data = QueryList::Query($page, $rule, '#user:eq(1) tr', '', '', true)->data;

    // Handle courses that have more than one class time.
    foreach ($data as $key => $value) {
        if (empty($value['courseId'])) {
            unset($data[$key]);
            continue;
        }

        // Regular course row: keep as scraped.
        if (!($value['courseId'] < 10)) {
            continue;
        }

        if (!$is_full) {
            unset($data[$key]);
            continue;
        }

        // Start from the previous row (already expanded if it was itself an
        // additional slot). Fall back to an empty row when there is none, so
        // no undefined-offset warning is raised.
        $a = isset($data[$key - 1]) ? $data[$key - 1] : [];
        $a['allWeek'] = cbWeek($value['plan']);
        // NOTE(review): regular rows carry this column as 'day', not 'week';
        // confirm whether callers expect the 'week' key here.
        $a['week'] = $value['courseId'];
        $a['session'] = cbWeek($value['name']);
        $a['building'] = $value['credit'];
        $a['classroom'] = $value['courseType'];
        $data[$key] = $a;
    }

    return $data;
}
/**
 * Append the hot-list entries of the stored page to $this->infolist.
 *
 * Runs the stored rules ($this->reginfo) against the stored page
 * ($this->pageinfo) for every `#hot-list li` block and keeps only the
 * 'hotimg' and 'hoturl' fields of each result.
 */
public function getInfo()
{
    $rows = QueryList::Query($this->pageinfo, $this->reginfo, '#hot-list li')->data;

    foreach ($rows as $row) {
        $entry = array();
        $entry['hotimg'] = $row['hotimg'];
        $entry['hoturl'] = $row['hoturl'];
        $this->infolist[] = $entry;
    }
}
/**
 * Scrape the student-record page.
 *
 * Each field is read from a fixed row/cell position inside the first
 * `#tblView` block.
 *
 * @return mixed the first scraped row, keyed by the field names below
 */
protected function spider()
{
    // Field name => [selector relative to the #tblView block, extraction type].
    $rules = [
        "name" => ['tr:eq(0) td:eq(3)', 'text'],
        "en_name" => ['tr:eq(1) td:eq(3)', 'text'],
        "id" => ['tr:eq(2) td:eq(3)', 'text'],
        "sex" => ['tr:eq(3) td:eq(1)', 'text'],
        "type" => ['tr:eq(3) td:eq(3)', 'text'],
        "status" => ['tr:eq(4) td:eq(3)', 'text'],
        "nation" => ['tr:eq(5) td:eq(3)', 'text'],
        "native" => ['tr:eq(6) td:eq(1)', 'text'],
        "birth" => ['tr:eq(6) td:eq(3)', 'text'],
        "political" => ['tr:eq(7) td:eq(1)', 'text'],
        "college" => ['tr:eq(12) td:eq(3)', 'text'],
        "major" => ['tr:eq(13) td:eq(1)', 'text'],
        "year" => ['tr:eq(14) td:eq(1)', 'text'],
        "class" => ['tr:eq(14) td:eq(3)', 'text'],
        "campus" => ['tr:eq(16) td:eq(1)', 'text'],
    ];

    $html = $this->get("http://202.115.47.141/xjInfoAction.do?oper=xjxx");
    $records = QueryList::Query($html, $rules, '#tblView:eq(0)', '', '', true)->data;

    return $records[0];
}
/**
 * Scrape the exam-arrangement table.
 *
 * @return mixed the scraped rows, one per table row, without a leading row
 *               whose exam_name is empty
 */
protected function spider()
{
    $url = "http://202.115.47.141/ksApCxAction.do?oper=getKsapXx";

    // Field name => [cell selector, extraction type].
    $rules = [
        'exam_name' => ['td:eq(0)', 'text'],
        'campus' => ['td:eq(1)', 'text'],
        'building' => ['td:eq(2)', 'text'],
        'classroom' => ['td:eq(3)', 'text'],
        'class_name' => ['td:eq(4)', 'text'],
        'week' => ['td:eq(5)', 'text'],
        'day' => ['td:eq(6)', 'text'],
        'date' => ['td:eq(7)', 'text'],
        'time' => ['td:eq(8)', 'text'],
        'seat' => ['td:eq(9)', 'text'],
    ];

    $html = $this->get($url);
    $rows = QueryList::Query($html, $rules, '#user:eq(1) tr')->data;

    // The first scraped row is usually empty; check before discarding it.
    if (!empty($rows[0]['exam_name'])) {
        return $rows;
    }

    array_shift($rows);

    return $rows;
}
/**
 * Scrape the grades of the current term.
 *
 * @author mohuishou<*****@*****.**>
 * @return mixed the scraped rows, one per table row, without a leading row
 *               whose courseId is empty
 * @throws \Exception
 */
protected function spiderThisTermGrade()
{
    $html = $this->get("http://202.115.47.141/bxqcjcxAction.do");

    // Field name => [cell selector, extraction type].
    $rules = [
        'courseId' => ['td:eq(0)', 'text'],
        'lessonId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'enName' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'courseType' => ['td:eq(5)', 'text'],
        'grade' => ['td:eq(6)', 'text'],
    ];

    $grades = QueryList::Query($html, $rules, '#user tr')->data;

    // The first scraped row is usually empty; check before discarding it.
    if (!empty($grades[0]['courseId'])) {
        return $grades;
    }

    array_shift($grades);

    return $grades;
}
/**
 * Search the course catalogue and scrape one page of results.
 *
 * @param array $params   search filters; keys present here override the
 *                        defaults below (kch, kcm, jsm, xsjc, skxq, skjc,
 *                        xaqh, jxlh, jash, pageSize, pageNumber, actionType)
 * @param int   $page     value sent as `pageNumber`
 * @param int   $pageSize value sent as `pageSize`
 * @return mixed the scraped rows, one per `.odd` element
 */
protected function spider($params = [], $page = 0, $pageSize = 50)
{
    // Build the URL to scrape: caller filters override the defaults.
    $params_all = [
        'kch' => '',
        'kcm' => '',
        'jsm' => '',
        'xsjc' => '',
        'skxq' => '',
        'skjc' => '',
        'xaqh' => '',
        'jxlh' => '',
        'jash' => '',
        'pageSize' => $pageSize,
        'pageNumber' => $page,
        'actionType' => 1,
    ];
    $params = array_merge($params_all, $params);

    // Columns to display (already percent-encoded).
    $showColumn = "&showColumn=kkxsjc%23%BF%AA%BF%CE%CF%B5&showColumn=kch%23%BF%CE%B3%CC%BA%C5&showColumn=kcm%23%BF%CE%B3%CC%C3%FB&showColumn=kxh%23%BF%CE%D0%F2%BA%C5&showColumn=xf%23%D1%A7%B7%D6&showColumn=kslxmc%23%BF%BC%CA%D4%C0%E0%D0%CD&showColumn=skjs%23%BD%CC%CA%A6&showColumn=zcsm%23%D6%DC%B4%CE&showColumn=skxq%23%D0%C7%C6%DA&showColumn=skjc%23%BD%DA%B4%CE&showColumn=xqm%23%D0%A3%C7%F8&showColumn=jxlm%23%BD%CC%D1%A7%C2%A5&showColumn=jasm%23%BD%CC%CA%D2&showColumn=bkskrl%23%BF%CE%C8%DD%C1%BF&showColumn=xss%23%D1%A7%C9%FA%CA%FD&showColumn=xkxzsm%23%D1%A1%BF%CE%CF%DE%D6%C6%CB%B5%C3%F7";

    // Convert every parameter value from UTF-8 to GBK before it is encoded
    // into the query string. array_map keeps the string keys and avoids the
    // dangling reference a by-reference foreach would leave behind.
    $encoded = array_map(function ($value) {
        return iconv('UTF-8', 'GBK//IGNORE', $value);
    }, $params);

    $query = http_build_query($encoded) . $showColumn;
    $url = 'http://202.115.47.141/courseSearchAction.do?' . $query;

    // Scraping rules: field name => [cell selector, type, filter, callback].
    $rule = [
        'college' => ['td:eq(0)', 'text'],
        'courseId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'lessonId' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'examType' => ['td:eq(5)', 'text'],
        'teacher' => ['td:eq(6)', 'text', '', 'cbTeacher'],
        'allWeek' => ['td:eq(7)', 'text', '', "cbWeek"],
        'day' => ['td:eq(8)', 'text'],
        'session' => ['td:eq(9)', 'text', '', "cbWeek"],
        'campus' => ['td:eq(10)', 'text'],
        'building' => ['td:eq(11)', 'text'],
        'classroom' => ['td:eq(12)', 'text'],
        'max' => ['td:eq(13)', 'text'],
        'studentNumber' => ['td:eq(14)', 'text'],
        'courseLimit' => ['td:eq(15)', 'text'],
        "callback" => "removeSpace",
    ];

    $html = $this->get($url);
    $data = QueryList::Query($html, $rule, '.odd', '', '', true)->data;

    return $data;
}
$pids[$i] = pcntl_fork(); // 创建子进程 switch ($pids[$i]) { case -1: alert('创建子进程失败:' . $i); exit; case 0: $key_start = $thread_quantity / $workers * $i; $key_end = $thread_quantity / $workers * ($i + 1); for ($j = $key_start; $j < $key_end; $j++) { $url = $threads[$j]['detail_url']; $sourceId = $threads[$j]['source_id']; $reg = array('title' => array('.entry-header .entry-name', 'text', '-ins'), 'content' => array('.entry-content', 'html', '-div -script')); $rang = '#content'; // sleep(1); $hj = QueryList::Query($url, $reg, $rang); $res = $hj->data; //var_dump($res[0]['title']); //exit('#50-1#'); // 链接数据库 $insertData = array('author' => '左岸读书', 'content' => isset($res[0]['content']) ? addslashes($res[0]['content']) : '', 'list_id' => $sourceId, 'add_time' => date('Y-m-d H:i:s')); // 插入数据到数据库 $insert = $db->query("INSERT INTO content(author,content,list_id,add_time) VALUES (:p1,:p2,:p3,:p4)", array("p1" => "左岸读书", "p2" => $insertData['content'], 'p3' => $insertData['list_id'], 'p4' => date('Y-m-d H:i:s'))); var_dump($insert); } // 子进程退出 $curPid = getmypid(); exit('#子进程退出:' . $curPid . "#\n"); break; default: echo 'This is parent Process![' . getmypid() . "]\n";