一个基于phpQuery的通用列表采集类
Author: Jaeger
 /**
  * Fetch the timetable for the current term.
  *
  * Downloads the course-selection result page, extracts one entry per table
  * row and then post-processes "continuation" rows, i.e. rows whose second
  * cell (mapped to `courseId`) is a value below 10. Such rows occur when a
  * single course has more than one class time; their cells are shifted, so
  * the values are read from the `plan`, `courseId`, `name`, `credit` and
  * `courseType` slots.
  *
  * Rows with an empty `courseId` are removed. Keys of the returned array are
  * NOT re-indexed, so gaps may exist where rows were removed.
  *
  * @param boolean $is_full When true, each continuation row is replaced by a
  *                         copy of the preceding entry with its time/place
  *                         fields overwritten; when false, continuation rows
  *                         are dropped.
  * @author mohuishou<*****@*****.**>
  * @return mixed The filtered/expanded row list produced by QueryList.
  */
 protected function spider($is_full = true)
 {
     $url = 'http://202.115.47.141/xkAction.do?actionType=6';
     // Extraction rules for the timetable: one rule per table cell.
     $rule = [
         'plan' => ['td:eq(0)', 'text'],
         'courseId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'lessonId' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'courseType' => ['td:eq(5)', 'text'],
         'examType' => ['td:eq(6)', 'text'],
         'teacher' => ['td:eq(7)', 'text', '', 'cbTeacher'],
         'studyWay' => ['td:eq(9)', 'text'],
         'chooseType' => ['td:eq(10)', 'text'],
         'allWeek' => ['td:eq(11)', 'text', '', 'cbWeek'],
         'day' => ['td:eq(12)', 'text'],
         'session' => ['td:eq(13)', 'text', '', 'cbWeek'],
         'campus' => ['td:eq(14)', 'text'],
         'building' => ['td:eq(15)', 'text'],
         'classroom' => ['td:eq(16)', 'text'],
         'callback' => "removeSpace",
     ];
     $page = $this->get($url);
     $data = QueryList::Query($page, $rule, '#user:eq(1) tr', '', '', true)->data;
     // Handle courses that have several class times (continuation rows).
     foreach ($data as $key => $value) {
         // Rows without a course id carry no usable data.
         if (empty($value['courseId'])) {
             unset($data[$key]);
             continue;
         }
         // A regular course row: keep it untouched.
         if (!($value['courseId'] < 10)) {
             continue;
         }
         if (!$is_full) {
             unset($data[$key]);
             continue;
         }
         // Start from the preceding entry (already expanded if it was itself a
         // continuation row). Fall back to an empty entry when there is none,
         // instead of reading an undefined offset.
         $a = isset($data[$key - 1]) ? $data[$key - 1] : [];
         $a['allWeek'] = cbWeek($value['plan']);
         // NOTE(review): the weekday is stored under 'week' while regular rows
         // use 'day'; kept as-is because callers may rely on it — confirm.
         $a['week'] = $value['courseId'];
         $a['session'] = cbWeek($value['name']);
         $a['building'] = $value['credit'];
         $a['classroom'] = $value['courseType'];
         $data[$key] = $a;
     }
     return $data;
 }
 /**
  * Scrape one page of the teaching-evaluation questionnaire list.
  *
  * @param int $page Page number appended to the list URL.
  * @return array Two keys: 'info' holds the rows extracted with the
  *               'param'/'status' rules, 'page' holds the first row of the
  *               page-number query (or null when nothing was extracted).
  */
 protected function spider($page = 1)
 {
     // Evaluation list page.
     $url = "http://202.115.47.141/jxpgXsAction.do?oper=listWj&page=" . $page;
     $rules = [
         // The image's "name" attribute is a '#@'-separated parameter list.
         'param' => ['#user img', 'name', '-td[align="right"]', function ($str) {
             if ($str) {
                 return explode('#@', $str);
             }
             return null;
         }],
         // Whether the entry has been evaluated: 0 when the click handler is
         // exactly "evaluation(this)", 1 otherwise.
         'status' => ['#user img', 'onclick', '', function ($str) {
             return trim($str) === "evaluation(this)" ? 0 : 1;
         }],
     ];
     $rules_page = ['page' => ['script:eq(3)', 'html', '', function ($page) {
         // Capture the whole number following "pageNo.value>" so that values
         // of 10 or more are not truncated to their first digit. When the next
         // character is not a digit, fall back to capturing that single
         // character, as the previous pattern did.
         $pattern = '/document\.all\.pageNo\.value>(\d+|.)/';
         $no = [];
         preg_match($pattern, $page, $no);
         // No match leaves $no without index 1; report "unknown" as null.
         if (!empty($no[1])) {
             return $no[1];
         }
         return null;
     }]];
     $html = $this->get($url);
     $data_info = QueryList::Query($html, $rules)->data;
     $data_page = QueryList::Query($html, $rules_page)->data;
     return [
         'info' => $data_info,
         // Guard against an empty result instead of reading a missing offset.
         'page' => isset($data_page[0]) ? $data_page[0] : null,
     ];
 }
 /**
  * Log in to the academic system and remember the session cookie.
  *
  * Does nothing when a login cookie is already stored. Otherwise posts the
  * account id and password, checks the (GBK-encoded) response for an
  * `.errorTop` element and, if none is found, stores the JSESSIONID response
  * cookie and sets it on the curl handle for later requests.
  *
  * @author mohuishou<*****@*****.**>
  * @return $this
  * @throws \Exception Code 5001 when the HTTP request fails, code 4011 when
  *                    the response page contains an error message.
  */
 protected function login()
 {
     // Already logged in: nothing to do.
     if (!empty($this->_login_cookie)) {
         return $this;
     }

     // Spoof the origin IP and the referer through request headers.
     $octets = [rand(1, 233), rand(1, 233), rand(1, 233), rand(1, 233)];
     $this->_curl->setHeader("X-Forwarded-For", implode('.', $octets));
     $this->_curl->setHeader("Referer", 'http://202.115.47.141/login.jsp');

     $credentials = [
         "zjh" => $this->_uid,
         "mm" => $this->_password,
     ];
     $this->_curl->post('http://202.115.47.141/loginAction.do', $credentials);
     if ($this->_curl->error) {
         $message = 'Error: ' . $this->_curl->errorCode . ': ' . $this->_curl->errorMessage;
         throw new \Exception($message, 5001);
     }

     // Check whether the login succeeded: an .errorTop element means failure.
     $html = iconv('GBK', 'UTF-8//IGNORE', $this->_curl->response);
     $errors = QueryList::Query($html, ['err' => ['.errorTop', 'text']])->data;
     if (!empty($errors)) {
         throw new \Exception('Error:' . $errors[0]['err'], 4011);
     }

     // Login succeeded: keep the session cookie and reuse it for later requests.
     $this->_login_cookie = $this->_curl->getResponseCookie("JSESSIONID");
     $this->_curl->setCookie('JSESSIONID', $this->_login_cookie);

     return $this;
 }
Example #4
0
 /**
  * Extract the '#hot-list li' entries from the stored page and append each
  * entry's 'hotimg' / 'hoturl' pair to $this->infolist.
  */
 public function getInfo()
 {
     $rows = QueryList::Query($this->pageinfo, $this->reginfo, '#hot-list li')->data;
     foreach ($rows as $row) {
         $entry = [
             'hotimg' => $row['hotimg'],
             'hoturl' => $row['hoturl'],
         ];
         $this->infolist[] = $entry;
     }
 }
 /**
  * Scrape the student record page and return the first extracted record.
  *
  * Each field is read from a fixed cell of the first '#tblView' table,
  * addressed by zero-based row and column index.
  *
  * @return mixed The first element of the QueryList result.
  */
 protected function spider()
 {
     $html = $this->get("http://202.115.47.141/xjInfoAction.do?oper=xjxx");

     // field name => [row index, cell index] inside the table
     $cells = [
         'name' => [0, 3],
         'en_name' => [1, 3],
         'id' => [2, 3],
         'sex' => [3, 1],
         'type' => [3, 3],
         'status' => [4, 3],
         'nation' => [5, 3],
         'native' => [6, 1],
         'birth' => [6, 3],
         'political' => [7, 1],
         'college' => [12, 3],
         'major' => [13, 1],
         'year' => [14, 1],
         'class' => [14, 3],
         'campus' => [16, 1],
     ];

     // Turn every position into a "take the text of this cell" rule.
     $rules = [];
     foreach ($cells as $field => $position) {
         $selector = 'tr:eq(' . $position[0] . ') td:eq(' . $position[1] . ')';
         $rules[$field] = [$selector, 'text'];
     }

     $records = QueryList::Query($html, $rules, '#tblView:eq(0)', '', '', true)->data;

     return $records[0];
 }
Example #6
0
 /**
  * Fetch the odds feed through the QueryList "Request" extension, decode the
  * JSON body into an associative array and dump it.
  */
 public function index()
 {
     $url = 'http://sb.uedwin.com/zh-cn/OddsService/GetOdds?_=1455613746215&sportId=1&programmeId=0&pageType=1&uiBetType=am&displayView=2&oddsType=2&sortBy=1&isFirstLoad=true&MoreBetEvent=null';

     $request = [
         'target' => $url,
         'method' => 'GET',
         'user_agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0',
         'timeout' => '30',
     ];

     $body = QueryList::run('Request', $request)->getHtml(false);
     $decoded = json_decode($body, true);

     dump($decoded);
 }
Example #7
0
 /**
  * Scrape the exam-arrangement table.
  *
  * Every row of the second '#user' table is mapped cell by cell to the field
  * names below; a leading entry without an exam name is removed.
  *
  * @return mixed The list of extracted rows.
  */
 protected function spider()
 {
     // Field names in cell order: the n-th name reads the text of td:eq(n).
     $fields = [
         'exam_name',
         'campus',
         'building',
         'classroom',
         'class_name',
         'week',
         'day',
         'date',
         'time',
         'seat',
     ];
     $rules = [];
     foreach ($fields as $index => $field) {
         $rules[$field] = ['td:eq(' . $index . ')', 'text'];
     }

     $html = $this->get("http://202.115.47.141/ksApCxAction.do?oper=getKsapXx");
     $rows = QueryList::Query($html, $rules, '#user:eq(1) tr')->data;

     // The first scraped entry is usually empty; verify before dropping it.
     if (empty($rows[0]['exam_name'])) {
         array_shift($rows);
     }

     return $rows;
 }
Example #8
0
 /**
  * Scrape the grades of the current term.
  *
  * Each row of the '#user' table is mapped to the fields listed in the rule
  * set; a leading entry without a course id is removed.
  *
  * @author mohuishou<*****@*****.**>
  * @return mixed The list of extracted grade rows.
  * @throws \Exception
  */
 protected function spiderThisTermGrade()
 {
     $html = $this->get("http://202.115.47.141/bxqcjcxAction.do");

     $rules = [
         'courseId' => ['td:eq(0)', 'text'],
         'lessonId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'enName' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'courseType' => ['td:eq(5)', 'text'],
         'grade' => ['td:eq(6)', 'text'],
     ];

     $grades = QueryList::Query($html, $rules, '#user tr')->data;

     // The first scraped entry is usually empty; verify before dropping it.
     $firstIsBlank = empty($grades[0]['courseId']);
     if ($firstIsBlank) {
         array_shift($grades);
     }

     return $grades;
 }
Example #9
0
 /**
  * Query the course-search page and return the matching course rows.
  *
  * @param array $params   Search filters that override the blank defaults
  *                        below (same keys as the defaults).
  * @param int   $page     Value sent as `pageNumber`.
  * @param int   $pageSize Value sent as `pageSize`.
  * @return mixed The rows extracted from the '.odd' elements of the result page.
  */
 protected function spider($params = [], $page = 0, $pageSize = 50)
 {
     // Build the URL to scrape: defaults first, caller-supplied filters on top.
     $params_all = [
         'kch' => '',
         'kcm' => '',
         'jsm' => '',
         'xsjc' => '',
         'skxq' => '',
         'skjc' => '',
         'xaqh' => '',
         'jxlh' => '',
         'jash' => '',
         'pageSize' => $pageSize,
         'pageNumber' => $page,
         'actionType' => 1,
     ];
     $merged = array_merge($params_all, $params);
     // Columns to display (already URL-encoded GBK, appended verbatim).
     $showColumn = "&showColumn=kkxsjc%23%BF%AA%BF%CE%CF%B5&showColumn=kch%23%BF%CE%B3%CC%BA%C5&showColumn=kcm%23%BF%CE%B3%CC%C3%FB&showColumn=kxh%23%BF%CE%D0%F2%BA%C5&showColumn=xf%23%D1%A7%B7%D6&showColumn=kslxmc%23%BF%BC%CA%D4%C0%E0%D0%CD&showColumn=skjs%23%BD%CC%CA%A6&showColumn=zcsm%23%D6%DC%B4%CE&showColumn=skxq%23%D0%C7%C6%DA&showColumn=skjc%23%BD%DA%B4%CE&showColumn=xqm%23%D0%A3%C7%F8&showColumn=jxlm%23%BD%CC%D1%A7%C2%A5&showColumn=jasm%23%BD%CC%CA%D2&showColumn=bkskrl%23%BF%CE%C8%DD%C1%BF&showColumn=xss%23%D1%A7%C9%FA%CA%FD&showColumn=xkxzsm%23%D1%A1%BF%CE%CF%DE%D6%C6%CB%B5%C3%F7";
     // Convert every parameter from UTF-8 to GBK. array_map (keys are kept for
     // a single input array) avoids the dangling reference a by-reference
     // foreach would leave behind.
     $encoded = array_map(function ($value) {
         $value = (string) $value;
         $converted = iconv('UTF-8', 'GBK//IGNORE', $value);
         // iconv() returns false on failure, which http_build_query() would
         // serialise as "0"; send the unconverted value instead.
         return $converted === false ? $value : $converted;
     }, $merged);
     $query = http_build_query($encoded) . $showColumn;
     $url = 'http://202.115.47.141/courseSearchAction.do?' . $query;
     // Extraction rules: one rule per table cell.
     $rule = [
         'college' => ['td:eq(0)', 'text'],
         'courseId' => ['td:eq(1)', 'text'],
         'name' => ['td:eq(2)', 'text'],
         'lessonId' => ['td:eq(3)', 'text'],
         'credit' => ['td:eq(4)', 'text'],
         'examType' => ['td:eq(5)', 'text'],
         'teacher' => ['td:eq(6)', 'text', '', 'cbTeacher'],
         'allWeek' => ['td:eq(7)', 'text', '', "cbWeek"],
         'day' => ['td:eq(8)', 'text'],
         'session' => ['td:eq(9)', 'text', '', "cbWeek"],
         'campus' => ['td:eq(10)', 'text'],
         'building' => ['td:eq(11)', 'text'],
         'classroom' => ['td:eq(12)', 'text'],
         'max' => ['td:eq(13)', 'text'],
         'studentNumber' => ['td:eq(14)', 'text'],
         'courseLimit' => ['td:eq(15)', 'text'],
         "callback" => "removeSpace",
     ];
     $html = $this->get($url);
     return QueryList::Query($html, $rule, '.odd', '', '', true)->data;
 }
 /**
  * Fetch $this->url with the QueryList "Request" (HTTP) extension and extract
  * data using the rules stored in $this->param['rules'].
  *
  * @return mixed The data returned by QueryList for those rules.
  */
 public function getUrls2()
 {
     // Options for the HTTP extension.
     $request = [
         'target' => $this->url,
         'method' => 'GET',
         'user_agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0',
         'timeout' => '30',
     ];

     return QueryList::run('Request', $request)
         ->setQuery($this->param['rules'])
         ->getData();
 }
Example #11
0
 $pids[$i] = pcntl_fork();
 // 创建子进程
 switch ($pids[$i]) {
     case -1:
         alert('创建子进程失败:' . $i);
         exit;
     case 0:
         $key_start = $thread_quantity / $workers * $i;
         $key_end = $thread_quantity / $workers * ($i + 1);
         for ($j = $key_start; $j < $key_end; $j++) {
             $url = $threads[$j]['detail_url'];
             $sourceId = $threads[$j]['source_id'];
             $reg = array('title' => array('.entry-header .entry-name', 'text', '-ins'), 'content' => array('.entry-content', 'html', '-div -script'));
             $rang = '#content';
             // sleep(1);
             $hj = QueryList::Query($url, $reg, $rang);
             $res = $hj->data;
             //var_dump($res[0]['title']);
             //exit('#50-1#');
             // 链接数据库
             $insertData = array('author' => '左岸读书', 'content' => isset($res[0]['content']) ? addslashes($res[0]['content']) : '', 'list_id' => $sourceId, 'add_time' => date('Y-m-d H:i:s'));
             // 插入数据到数据库
             $insert = $db->query("INSERT INTO content(author,content,list_id,add_time) VALUES (:p1,:p2,:p3,:p4)", array("p1" => "左岸读书", "p2" => $insertData['content'], 'p3' => $insertData['list_id'], 'p4' => date('Y-m-d H:i:s')));
             var_dump($insert);
         }
         // 子进程退出
         $curPid = getmypid();
         exit('#子进程退出:' . $curPid . "#\n");
         break;
     default:
         echo 'This is parent Process![' . getmypid() . "]\n";