Query() 공개 정적인 메소드

静态方法,访问入口
public static Query ( string $page, array $rules, string $range = '', string $outputEncoding = null, string $inputEncoding = null, boolean $removeHead = false ) : mixed
$page string 要抓取的网页URL地址(支持https);或者是html源代码
$rules array 【选择器数组】说明:格式array("名称"=>array("选择器","类型"[,"标签过滤列表"][,"回调函数"]),.......[,"callback"=>"全局回调函数"]); 【选择器】说明:可以为任意的jQuery选择器语法 【类型】说明:值 "text" ,"html" ,"HTML标签属性" , 【标签过滤列表】:可选,要过滤的选择器名,多个用空格隔开,当标签名前面添加减号(-)时(此时标签可以为任意的元素选择器),表示移除该标签以及标签内容,否则当【类型】值为text时表示需要保留的HTML标签,为html时表示要过滤掉的HTML标签 【回调函数】/【全局回调函数】:可选,字符串(函数名) 或 数组(array("类名","类的静态方法")),回调函数应有两个参数,第一个参数是选择到的内容,第二个参数是选择器数组下标,回调函数会覆盖全局回调函数
$range string 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择
$outputEncoding string 【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码
$inputEncoding string 【输入编码格式】明确指定输入的页面编码格式(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则自动识别
$removeHead boolean | false 【是否移除页面头部区域】 乱码终极解决方案
리턴 mixed
예제 #1
0
 /**
  * Log in to the site at 202.115.47.141 and remember its session cookie.
  *
  * Returns immediately when a login cookie is already stored. Otherwise the
  * credentials are posted with a randomised X-Forwarded-For header and a
  * fixed Referer, the response is converted from GBK to UTF-8, and the page
  * is searched for an `.errorTop` element to detect a rejected login.
  *
  * @author mohuishou<*****@*****.**>
  * @return $this
  * @throws \Exception code 5001 when the HTTP request reports an error,
  *                    code 4011 when the response contains `.errorTop`
  */
 protected function login()
 {
     // Already logged in: nothing to do.
     if (!empty($this->_login_cookie)) {
         return $this;
     }

     // Spoof the originating IP and the referring page.
     $octets = [rand(1, 233), rand(1, 233), rand(1, 233), rand(1, 233)];
     $this->_curl->setHeader("X-Forwarded-For", implode('.', $octets));
     $this->_curl->setHeader("Referer", 'http://202.115.47.141/login.jsp');

     $credentials = ["zjh" => $this->_uid, "mm" => $this->_password];
     $this->_curl->post('http://202.115.47.141/loginAction.do', $credentials);
     if ($this->_curl->error) {
         $message = 'Error: ' . $this->_curl->errorCode . ': ' . $this->_curl->errorMessage;
         throw new \Exception($message, 5001);
     }

     // Check whether the login succeeded: an `.errorTop` element means it did not.
     $html = iconv('GBK', 'UTF-8//IGNORE', $this->_curl->response);
     $errors = QueryList::Query($html, ['err' => ['.errorTop', 'text']])->data;
     if (!empty($errors)) {
         throw new \Exception('Error:' . $errors[0]['err'], 4011);
     }

     // Login succeeded: store the session id and send it on later requests.
     $this->_login_cookie = $this->_curl->getResponseCookie("JSESSIONID");
     $this->_curl->setCookie('JSESSIONID', $this->_login_cookie);

     return $this;
 }
예제 #2
0
 /**
  * Scrape one page of the teaching-evaluation list.
  *
  * For every `#user img` element the `name` attribute is split on "#@"
  * (key `param`) and the `onclick` attribute is mapped to a flag (key
  * `status`): 0 when it equals "evaluation(this)", 1 otherwise. A second
  * query reads the fourth `<script>` element to extract the character that
  * follows `document.all.pageNo.value>`.
  *
  * @param int $page page number appended to the list URL
  * @return array ['info' => rows from the first query,
  *                'page' => first row of the page query, or null when absent]
  */
 protected function spider($page = 1)
 {
     // Evaluation list page.
     $url = "http://202.115.47.141/jxpgXsAction.do?oper=listWj&page=" . $page;

     $rules = [
         'param' => ['#user img', 'name', '-td[align="right"]', function ($str) {
             if ($str) {
                 return explode('#@', $str);
             }
             return null;
         }],
         'status' => ['#user img', 'onclick', '', function ($str) {
             // 0 = not yet evaluated, 1 = already evaluated.
             return trim($str) === "evaluation(this)" ? 0 : 1;
         }],
     ];

     $rules_page = ['page' => ['script:eq(3)', 'html', '', function ($page) {
         // NOTE(review): `(.)` captures a single character, so a value of 10
         // or more would be truncated to its first digit — confirm intent.
         $pattern = "/document\\.all\\.pageNo\\.value>(.)/";
         // Only read the capture group when the pattern actually matched;
         // otherwise $no is empty and $no[1] would raise an undefined-offset notice.
         if (preg_match($pattern, $page, $no) === 1 && $no[1]) {
             return $no[1];
         }
         return null;
     }]];

     $html = $this->get($url);
     $data_info = QueryList::Query($html, $rules)->data;
     $data_page = QueryList::Query($html, $rules_page)->data;

     return [
         'info' => $data_info,
         // The page query may yield no rows; fall back to null without a notice.
         'page' => isset($data_page[0]) ? $data_page[0] : null,
     ];
 }
예제 #3
0
 /**
  * Fetch this term's course timetable.
  *
  * Rows of the second `#user` table are scraped with one rule per column.
  * A row whose `courseId` cell is empty is dropped. A row whose `courseId`
  * cell compares less than 10 is treated as an extra time slot of the row
  * before it: its cells are shifted, so `plan` holds the weeks, `courseId`
  * the day, `name` the sessions, `credit` the building and `courseType` the
  * classroom.
  *
  * @param boolean $is_full when true, each extra time slot becomes a full
  *                         copy of the previous row with the slot fields
  *                         overwritten; when false, such rows are removed
  * @author mohuishou<*****@*****.**>
  * @return mixed the filtered rows, keyed by their original row index
  */
 protected function spider($is_full = true)
 {
     $url = 'http://202.115.47.141/xkAction.do?actionType=6';
     // Extraction rules for the timetable columns.
     $rule = [
         'plan' => ['td:eq(0)', 'text'],
         'courseId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'lessonId' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'courseType' => ['td:eq(5)', 'text'],
         'examType' => ['td:eq(6)', 'text'],
         'teacher' => ['td:eq(7)', 'text', '', 'cbTeacher'],
         'studyWay' => ['td:eq(9)', 'text'],
         'chooseType' => ['td:eq(10)', 'text'],
         'allWeek' => ['td:eq(11)', 'text', '', 'cbWeek'],
         'day' => ['td:eq(12)', 'text'],
         'session' => ['td:eq(13)', 'text', '', 'cbWeek'],
         'campus' => ['td:eq(14)', 'text'],
         'building' => ['td:eq(15)', 'text'],
         'classroom' => ['td:eq(16)', 'text'],
         'callback' => "removeSpace",
     ];
     $page = $this->get($url);
     $data = QueryList::Query($page, $rule, '#user:eq(1) tr', '', '', true)->data;

     // Handle courses that have more than one class time.
     foreach ($data as $key => $value) {
         if (empty($value['courseId'])) {
             unset($data[$key]);
             continue;
         }
         if (!($value['courseId'] < 10)) {
             // Regular course row: keep as is.
             continue;
         }
         if (!$is_full) {
             unset($data[$key]);
             continue;
         }

         // Start from the previous row (already expanded if it was itself an
         // extra slot) and overwrite the time/place fields.
         // NOTE(review): the day is stored under 'week' while regular rows use
         // 'day' — confirm which key callers expect.
         $a = $data[$key - 1];
         $a['allWeek'] = cbWeek($value['plan']);
         $a['week'] = $value['courseId'];
         $a['session'] = cbWeek($value['name']);
         $a['building'] = $value['credit'];
         $a['classroom'] = $value['courseType'];
         $data[$key] = $a;
     }

     return $data;
 }
예제 #4
0
 /**
  * Run the stored rules against the stored page, scoped to `#hot-list li`,
  * and append each row's `hotimg` and `hoturl` fields to $this->infolist.
  */
 public function getInfo()
 {
     $rows = QueryList::Query($this->pageinfo, $this->reginfo, '#hot-list li')->data;
     foreach ($rows as $row) {
         $entry = array();
         $entry['hotimg'] = $row['hotimg'];
         $entry['hoturl'] = $row['hoturl'];
         $this->infolist[] = $entry;
     }
 }
예제 #5
0
 /**
  * Fetch the page at xjInfoAction.do?oper=xjxx and read profile fields from
  * fixed row/column positions of the first `#tblView` table.
  *
  * @return mixed the first (and only expected) row of extracted fields
  */
 protected function spider()
 {
     // Each field lives at a fixed row (tr) / cell (td) position in the table.
     $fieldRules = [
         "name" => ['tr:eq(0) td:eq(3)', 'text'],
         "en_name" => ['tr:eq(1) td:eq(3)', 'text'],
         "id" => ['tr:eq(2) td:eq(3)', 'text'],
         "sex" => ['tr:eq(3) td:eq(1)', 'text'],
         "type" => ['tr:eq(3) td:eq(3)', 'text'],
         "status" => ['tr:eq(4) td:eq(3)', 'text'],
         "nation" => ['tr:eq(5) td:eq(3)', 'text'],
         "native" => ['tr:eq(6) td:eq(1)', 'text'],
         "birth" => ['tr:eq(6) td:eq(3)', 'text'],
         "political" => ['tr:eq(7) td:eq(1)', 'text'],
         "college" => ['tr:eq(12) td:eq(3)', 'text'],
         "major" => ['tr:eq(13) td:eq(1)', 'text'],
         "year" => ['tr:eq(14) td:eq(1)', 'text'],
         "class" => ['tr:eq(14) td:eq(3)', 'text'],
         "campus" => ['tr:eq(16) td:eq(1)', 'text'],
     ];

     $html = $this->get("http://202.115.47.141/xjInfoAction.do?oper=xjxx");
     $rows = QueryList::Query($html, $fieldRules, '#tblView:eq(0)', '', '', true)->data;

     return $rows[0];
 }
예제 #6
0
 /**
  * Fetch the page at ksApCxAction.do?oper=getKsapXx and return one entry per
  * row of the second `#user` table, dropping a leading row whose
  * `exam_name` is empty.
  *
  * @return array the scraped rows
  */
 protected function spider()
 {
     // One rule per table column, in column order.
     $columnRules = [
         'exam_name' => ['td:eq(0)', 'text'],
         'campus' => ['td:eq(1)', 'text'],
         'building' => ['td:eq(2)', 'text'],
         'classroom' => ['td:eq(3)', 'text'],
         'class_name' => ['td:eq(4)', 'text'],
         'week' => ['td:eq(5)', 'text'],
         'day' => ['td:eq(6)', 'text'],
         'date' => ['td:eq(7)', 'text'],
         'time' => ['td:eq(8)', 'text'],
         'seat' => ['td:eq(9)', 'text'],
     ];

     $html = $this->get("http://202.115.47.141/ksApCxAction.do?oper=getKsapXx");
     $rows = QueryList::Query($html, $columnRules, '#user:eq(1) tr')->data;

     // The first scraped row is usually empty; check it anyway before dropping it.
     $firstIsBlank = empty($rows[0]['exam_name']);
     if ($firstIsBlank) {
         array_shift($rows);
     }

     return $rows;
 }
예제 #7
0
 /**
  * Scrape this term's grades.
  *
  * Fetches bxqcjcxAction.do, reads one entry per `#user tr` row and drops a
  * leading row whose `courseId` is empty.
  *
  * @author mohuishou<*****@*****.**>
  * @return mixed the scraped grade rows
  * @throws \Exception
  */
 protected function spiderThisTermGrade()
 {
     // One rule per table column, in column order.
     $columnRules = [
         'courseId' => ['td:eq(0)', 'text'],
         'lessonId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'enName' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'courseType' => ['td:eq(5)', 'text'],
         'grade' => ['td:eq(6)', 'text'],
     ];

     $html = $this->get("http://202.115.47.141/bxqcjcxAction.do");
     $grades = QueryList::Query($html, $columnRules, '#user tr')->data;

     // The first scraped row is usually empty; check it anyway before dropping it.
     $firstIsBlank = empty($grades[0]['courseId']);
     if ($firstIsBlank) {
         array_shift($grades);
     }

     return $grades;
 }
예제 #8
0
 /**
  * Search the course catalogue and scrape the result rows.
  *
  * The caller's filters are merged over a full set of empty defaults, each
  * value is converted from UTF-8 to GBK, and the query string is completed
  * with a fixed, already percent-encoded list of columns to display. Result
  * rows are read from elements matching `.odd`.
  *
  * @param array $params   search filters overriding the defaults below
  * @param int   $page     value sent as `pageNumber`
  * @param int   $pageSize value sent as `pageSize`
  * @return mixed the scraped rows
  */
 protected function spider($params = [], $page = 0, $pageSize = 50)
 {
     // Build the URL to scrape.
     $params_all = [
         'kch' => '',
         'kcm' => '',
         'jsm' => '',
         'xsjc' => '',
         'skxq' => '',
         'skjc' => '',
         'xaqh' => '',
         'jxlh' => '',
         'jash' => '',
         'pageSize' => $pageSize,
         'pageNumber' => $page,
         'actionType' => 1,
     ];
     $params = array_merge($params_all, $params);

     // Columns to display (pre-encoded; presumably GBK — verify against the site).
     $showColumn = "&showColumn=kkxsjc%23%BF%AA%BF%CE%CF%B5&showColumn=kch%23%BF%CE%B3%CC%BA%C5&showColumn=kcm%23%BF%CE%B3%CC%C3%FB&showColumn=kxh%23%BF%CE%D0%F2%BA%C5&showColumn=xf%23%D1%A7%B7%D6&showColumn=kslxmc%23%BF%BC%CA%D4%C0%E0%D0%CD&showColumn=skjs%23%BD%CC%CA%A6&showColumn=zcsm%23%D6%DC%B4%CE&showColumn=skxq%23%D0%C7%C6%DA&showColumn=skjc%23%BD%DA%B4%CE&showColumn=xqm%23%D0%A3%C7%F8&showColumn=jxlm%23%BD%CC%D1%A7%C2%A5&showColumn=jasm%23%BD%CC%CA%D2&showColumn=bkskrl%23%BF%CE%C8%DD%C1%BF&showColumn=xss%23%D1%A7%C9%FA%CA%FD&showColumn=xkxzsm%23%D1%A1%BF%CE%CF%DE%D6%C6%CB%B5%C3%F7";

     // Convert every parameter to the site's character encoding. Assigning by
     // key (rather than iterating by reference) leaves no dangling reference
     // to the last array element once the loop ends.
     foreach ($params as $key => $value) {
         $params[$key] = iconv('UTF-8', 'GBK//IGNORE', $value);
     }
     $query = http_build_query($params) . $showColumn;
     $url = 'http://202.115.47.141/courseSearchAction.do?' . $query;

     // Build the extraction rules, one per result column.
     $rule = [
         'college' => ['td:eq(0)', 'text'],
         'courseId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'lessonId' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'examType' => ['td:eq(5)', 'text'],
         'teacher' => ['td:eq(6)', 'text', '', 'cbTeacher'],
         'allWeek' => ['td:eq(7)', 'text', '', "cbWeek"],
         'day' => ['td:eq(8)', 'text'],
         'session' => ['td:eq(9)', 'text', '', "cbWeek"],
         'campus' => ['td:eq(10)', 'text'],
         'building' => ['td:eq(11)', 'text'],
         'classroom' => ['td:eq(12)', 'text'],
         'max' => ['td:eq(13)', 'text'],
         'studentNumber' => ['td:eq(14)', 'text'],
         'courseLimit' => ['td:eq(15)', 'text'],
         "callback" => "removeSpace",
     ];

     $html = $this->get($url);
     $data = QueryList::Query($html, $rule, '.odd', '', '', true)->data;

     return $data;
 }
예제 #9
0
 $pids[$i] = pcntl_fork();
 // 创建子进程
 switch ($pids[$i]) {
     case -1:
         alert('创建子进程失败:' . $i);
         exit;
     case 0:
         $key_start = $thread_quantity / $workers * $i;
         $key_end = $thread_quantity / $workers * ($i + 1);
         for ($j = $key_start; $j < $key_end; $j++) {
             $url = $threads[$j]['detail_url'];
             $sourceId = $threads[$j]['source_id'];
             $reg = array('title' => array('.entry-header .entry-name', 'text', '-ins'), 'content' => array('.entry-content', 'html', '-div -script'));
             $rang = '#content';
             // sleep(1);
             $hj = QueryList::Query($url, $reg, $rang);
             $res = $hj->data;
             //var_dump($res[0]['title']);
             //exit('#50-1#');
             // 链接数据库
             $insertData = array('author' => '左岸读书', 'content' => isset($res[0]['content']) ? addslashes($res[0]['content']) : '', 'list_id' => $sourceId, 'add_time' => date('Y-m-d H:i:s'));
             // 插入数据到数据库
             $insert = $db->query("INSERT INTO content(author,content,list_id,add_time) VALUES (:p1,:p2,:p3,:p4)", array("p1" => "左岸读书", "p2" => $insertData['content'], 'p3' => $insertData['list_id'], 'p4' => date('Y-m-d H:i:s')));
             var_dump($insert);
         }
         // 子进程退出
         $curPid = getmypid();
         exit('#子进程退出:' . $curPid . "#\n");
         break;
     default:
         echo 'This is parent Process![' . getmypid() . "]\n";