/**
 * Fetch the course schedule for the current term.
 *
 * Rows are scraped from the second `#user` table. A course with several
 * meeting times appears as one full row followed by short rows whose second
 * cell (scraped as `courseId`) is a number below 10. Those short rows are
 * treated as extra time slots of the preceding course.
 *
 * @param boolean $is_full when true, each extra time slot is expanded into a
 *                         full row copied from the preceding row; when false,
 *                         extra time slots are dropped
 * @author mohuishou<*****@*****.**>
 * @return mixed scraped rows; keys are not re-indexed after rows are removed
 */
protected function spider($is_full = true)
{
    $url = 'http://202.115.47.141/xkAction.do?actionType=6';

    // Scraping rules for the schedule table (one entry per column).
    $rule = [
        'plan' => ['td:eq(0)', 'text'],
        'courseId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'lessonId' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'courseType' => ['td:eq(5)', 'text'],
        'examType' => ['td:eq(6)', 'text'],
        'teacher' => ['td:eq(7)', 'text', '', 'cbTeacher'],
        'studyWay' => ['td:eq(9)', 'text'],
        'chooseType' => ['td:eq(10)', 'text'],
        'allWeek' => ['td:eq(11)', 'text', '', 'cbWeek'],
        'day' => ['td:eq(12)', 'text'],
        'session' => ['td:eq(13)', 'text', '', 'cbWeek'],
        'campus' => ['td:eq(14)', 'text'],
        'building' => ['td:eq(15)', 'text'],
        'classroom' => ['td:eq(16)', 'text'],
        'callback' => "removeSpace",
    ];

    $page = $this->get($url);
    $data = QueryList::Query($page, $rule, '#user:eq(1) tr', '', '', true)->data;

    // Handle courses that have more than one meeting time.
    foreach ($data as $key => $value) {
        // Rows without a course id carry no course data.
        if (empty($value['courseId'])) {
            unset($data[$key]);
            continue;
        }

        // A regular course row: keep it untouched.
        if (!($value['courseId'] < 10)) {
            continue;
        }

        if (!$is_full) {
            unset($data[$key]);
            continue;
        }

        // Start from the preceding row so the course-level fields carry over.
        // Fall back to an empty row when there is no preceding entry.
        $slot = isset($data[$key - 1]) ? $data[$key - 1] : [];

        // In a short row the cells are shifted left by eleven columns, so the
        // time/place values arrive under the names of the first six columns.
        // NOTE(review): the mapping of td:eq(1) -> day and td:eq(3) -> campus
        // follows the same offset the original used for the other four
        // fields — confirm against a live page.
        $slot['allWeek'] = cbWeek($value['plan']);
        $slot['day'] = $value['courseId'];
        // Legacy key: earlier versions exposed the weekday only as "week".
        $slot['week'] = $value['courseId'];
        $slot['session'] = cbWeek($value['name']);
        $slot['campus'] = $value['lessonId'];
        $slot['building'] = $value['credit'];
        $slot['classroom'] = $value['courseType'];

        $data[$key] = $slot;
    }

    return $data;
}
/**
 * Scrape one page of the teaching-evaluation list.
 *
 * @param int $page page number appended to the list URL
 * @return array 'info' holds the scraped rows (each with 'param' and
 *               'status'); 'page' holds the first result of the page rule,
 *               or null when that rule produced nothing
 */
protected function spider($page = 1)
{
    // Evaluation list page.
    $url = "http://202.115.47.141/jxpgXsAction.do?oper=listWj&page=" . $page;

    $rules = [
        'param' => ['#user img', 'name', '-td[align="right"]', function ($str) {
            // Split the image's name attribute on "#@"; empty input yields null.
            if ($str) {
                return explode('#@', $str);
            }
        }],
        'status' => ['#user img', 'onclick', '', function ($str) {
            // 0 when the onclick handler is exactly "evaluation(this)", else 1.
            // The original comment says this distinguishes evaluated entries.
            if (trim($str) === "evaluation(this)") {
                return 0;
            }
            return 1;
        }],
    ];

    $rules_page = [
        'page' => ['script:eq(3)', 'html', '', function ($script) {
            $pattern = "/document\\.all\\.pageNo\\.value>(.)/";
            // Only read the capture group when the pattern actually matched;
            // otherwise the callback yields null, as it did before.
            if (preg_match($pattern, $script, $no) && $no[1]) {
                return $no[1];
            }
        }],
    ];

    $html = $this->get($url);

    $data = [];
    $data['info'] = QueryList::Query($html, $rules)->data;

    $data_page = QueryList::Query($html, $rules_page)->data;
    // The page rule may match nothing (e.g. an unexpected page layout).
    $data['page'] = isset($data_page[0]) ? $data_page[0] : null;

    return $data;
}
/**
 * Log in to the remote system and remember the session cookie.
 *
 * Does nothing when a login cookie is already stored. Otherwise posts the
 * stored credentials, checks the response page for an `.errorTop` element,
 * and on success stores the JSESSIONID cookie and sets it on the curl client.
 *
 * @author mohuishou<*****@*****.**>
 * @return $this
 * @throws \Exception code 5001 when the HTTP request fails, code 4011 when
 *                    the response page contains an error message
 */
protected function login()
{
    // Already logged in: nothing to do.
    if (!empty($this->_login_cookie)) {
        return $this;
    }

    // Send a random X-Forwarded-For address and a fixed Referer.
    $octets = [];
    for ($i = 0; $i < 4; $i++) {
        $octets[] = rand(1, 233);
    }
    $this->_curl->setHeader("X-Forwarded-For", implode('.', $octets));
    $this->_curl->setHeader("Referer", 'http://202.115.47.141/login.jsp');

    $credentials = [
        "zjh" => $this->_uid,
        "mm" => $this->_password,
    ];
    $this->_curl->post('http://202.115.47.141/loginAction.do', $credentials);

    if ($this->_curl->error) {
        $message = 'Error: ' . $this->_curl->errorCode . ': ' . $this->_curl->errorMessage;
        throw new \Exception($message, 5001);
    }

    // Decide whether the login succeeded by looking for an error banner.
    $html = iconv('GBK', 'UTF-8//IGNORE', $this->_curl->response);
    $errors = QueryList::Query($html, ['err' => ['.errorTop', 'text']])->data;
    if (!empty($errors)) {
        throw new \Exception('Error:' . $errors[0]['err'], 4011);
    }

    // Login succeeded: keep the session cookie and reuse it on the client.
    $this->_login_cookie = $this->_curl->getResponseCookie("JSESSIONID");
    $this->_curl->setCookie('JSESSIONID', $this->_login_cookie);

    return $this;
}
/**
 * Scrape the `#hot-list li` entries from the stored page using the stored
 * rules and append each entry's image and URL to `$this->infolist`.
 */
public function getInfo()
{
    $rows = QueryList::Query($this->pageinfo, $this->reginfo, '#hot-list li')->data;

    foreach ($rows as $row) {
        // Keep only the two fields of interest from each scraped row.
        $entry = [];
        $entry['hotimg'] = $row['hotimg'];
        $entry['hoturl'] = $row['hoturl'];
        $this->infolist[] = $entry;
    }
}
/**
 * Scrape the student record page.
 *
 * Each field is read from a fixed row/cell position inside the first
 * `#tblView` element.
 *
 * @return array|null the scraped fields keyed by name, or null when the
 *                    range selector matched nothing
 */
protected function spider()
{
    $url = "http://202.115.47.141/xjInfoAction.do?oper=xjxx";
    $page = $this->get($url);

    $rules = [
        "name" => ['tr:eq(0) td:eq(3)', 'text'],
        "en_name" => ['tr:eq(1) td:eq(3)', 'text'],
        "id" => ['tr:eq(2) td:eq(3)', 'text'],
        "sex" => ['tr:eq(3) td:eq(1)', 'text'],
        "type" => ['tr:eq(3) td:eq(3)', 'text'],
        "status" => ['tr:eq(4) td:eq(3)', 'text'],
        "nation" => ['tr:eq(5) td:eq(3)', 'text'],
        "native" => ['tr:eq(6) td:eq(1)', 'text'],
        "birth" => ['tr:eq(6) td:eq(3)', 'text'],
        "political" => ['tr:eq(7) td:eq(1)', 'text'],
        "college" => ['tr:eq(12) td:eq(3)', 'text'],
        "major" => ['tr:eq(13) td:eq(1)', 'text'],
        "year" => ['tr:eq(14) td:eq(1)', 'text'],
        "class" => ['tr:eq(14) td:eq(3)', 'text'],
        "campus" => ['tr:eq(16) td:eq(1)', 'text'],
    ];

    $data = QueryList::Query($page, $rules, '#tblView:eq(0)', '', '', true)->data;

    // The table may be absent (unexpected page); avoid an undefined offset
    // and return null explicitly, which is what callers received before.
    return isset($data[0]) ? $data[0] : null;
}
/**
 * Fetch the odds feed over HTTP, decode the JSON body into an associative
 * array and dump it.
 */
public function index()
{
    $url = 'http://sb.uedwin.com/zh-cn/OddsService/GetOdds?_=1455613746215&sportId=1&programmeId=0&pageType=1&uiBetType=am&displayView=2&oddsType=2&sortBy=1&isFirstLoad=true&MoreBetEvent=null';

    // Options for the QueryList "Request" extension.
    $request = [
        'target' => $url,
        'method' => 'GET',
        'user_agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0',
        'timeout' => '30',
    ];

    $body = QueryList::run('Request', $request)->getHtml(false);

    dump(json_decode($body, true));
}
/**
 * Scrape the exam schedule from the rows of the second `#user` table.
 *
 * @return array scraped rows; a leading row with an empty `exam_name` is
 *               removed
 */
protected function spider()
{
    $page = $this->get("http://202.115.47.141/ksApCxAction.do?oper=getKsapXx");

    // One rule per table column.
    $rules = [
        'exam_name' => ['td:eq(0)', 'text'],
        'campus' => ['td:eq(1)', 'text'],
        'building' => ['td:eq(2)', 'text'],
        'classroom' => ['td:eq(3)', 'text'],
        'class_name' => ['td:eq(4)', 'text'],
        'week' => ['td:eq(5)', 'text'],
        'day' => ['td:eq(6)', 'text'],
        'date' => ['td:eq(7)', 'text'],
        'time' => ['td:eq(8)', 'text'],
        'seat' => ['td:eq(9)', 'text'],
    ];

    $rows = QueryList::Query($page, $rules, '#user:eq(1) tr')->data;

    // The first scraped row is usually empty; verify before keeping it.
    if (!empty($rows[0]['exam_name'])) {
        return $rows;
    }

    // Drop the empty leading row.
    array_shift($rows);

    return $rows;
}
/**
 * Scrape the grades of the current term from the rows of the `#user` table.
 *
 * @author mohuishou<*****@*****.**>
 * @return mixed scraped rows; a leading row with an empty `courseId` is
 *               removed
 * @throws \Exception
 */
protected function spiderThisTermGrade()
{
    $html = $this->get("http://202.115.47.141/bxqcjcxAction.do");

    // One rule per table column.
    $columns = [
        'courseId' => ['td:eq(0)', 'text'],
        'lessonId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'enName' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'courseType' => ['td:eq(5)', 'text'],
        'grade' => ['td:eq(6)', 'text'],
    ];

    $grades = QueryList::Query($html, $columns, '#user tr')->data;

    // The first scraped row is usually empty; verify before keeping it.
    if (!empty($grades[0]['courseId'])) {
        return $grades;
    }

    // Drop the empty leading row.
    array_shift($grades);

    return $grades;
}
/**
 * Run a course search and scrape the result rows.
 *
 * @param array $params   search filters; merged over the defaults below, so
 *                        any of those keys may be overridden
 * @param int   $page     sent as `pageNumber`
 * @param int   $pageSize sent as `pageSize`
 * @return mixed rows scraped from the `.odd` elements of the result page
 */
protected function spider($params = [], $page = 0, $pageSize = 50)
{
    // Default query parameters; caller-supplied values take precedence.
    $params_all = [
        'kch' => '',
        'kcm' => '',
        'jsm' => '',
        'xsjc' => '',
        'skxq' => '',
        'skjc' => '',
        'xaqh' => '',
        'jxlh' => '',
        'jash' => '',
        'pageSize' => $pageSize,
        'pageNumber' => $page,
        'actionType' => 1,
    ];
    $params = array_merge($params_all, $params);

    // Columns to display (already percent-encoded, appended verbatim).
    $showColumn = "&showColumn=kkxsjc%23%BF%AA%BF%CE%CF%B5&showColumn=kch%23%BF%CE%B3%CC%BA%C5&showColumn=kcm%23%BF%CE%B3%CC%C3%FB&showColumn=kxh%23%BF%CE%D0%F2%BA%C5&showColumn=xf%23%D1%A7%B7%D6&showColumn=kslxmc%23%BF%BC%CA%D4%C0%E0%D0%CD&showColumn=skjs%23%BD%CC%CA%A6&showColumn=zcsm%23%D6%DC%B4%CE&showColumn=skxq%23%D0%C7%C6%DA&showColumn=skjc%23%BD%DA%B4%CE&showColumn=xqm%23%D0%A3%C7%F8&showColumn=jxlm%23%BD%CC%D1%A7%C2%A5&showColumn=jasm%23%BD%CC%CA%D2&showColumn=bkskrl%23%BF%CE%C8%DD%C1%BF&showColumn=xss%23%D1%A7%C9%FA%CA%FD&showColumn=xkxzsm%23%D1%A1%BF%CE%CF%DE%D6%C6%CB%B5%C3%F7";

    // Convert every parameter from UTF-8 to GBK before it is URL-encoded.
    // A fresh array is built instead of iterating by reference, so no
    // dangling reference to the last element is left behind.
    $encoded = [];
    foreach ($params as $name => $value) {
        $encoded[$name] = iconv('UTF-8', 'GBK//IGNORE', (string) $value);
    }

    $url = 'http://202.115.47.141/courseSearchAction.do?' . http_build_query($encoded) . $showColumn;

    // Scraping rules, one per result column.
    $rule = [
        'college' => ['td:eq(0)', 'text'],
        'courseId' => ['td:eq(1)', 'text'],
        'name' => ['td:eq(2)', 'text'],
        'lessonId' => ['td:eq(3)', 'text'],
        'credit' => ['td:eq(4)', 'text'],
        'examType' => ['td:eq(5)', 'text'],
        'teacher' => ['td:eq(6)', 'text', '', 'cbTeacher'],
        'allWeek' => ['td:eq(7)', 'text', '', "cbWeek"],
        'day' => ['td:eq(8)', 'text'],
        'session' => ['td:eq(9)', 'text', '', "cbWeek"],
        'campus' => ['td:eq(10)', 'text'],
        'building' => ['td:eq(11)', 'text'],
        'classroom' => ['td:eq(12)', 'text'],
        'max' => ['td:eq(13)', 'text'],
        'studentNumber' => ['td:eq(14)', 'text'],
        'courseLimit' => ['td:eq(15)', 'text'],
        "callback" => "removeSpace",
    ];

    $html = $this->get($url);

    return QueryList::Query($html, $rule, '.odd', '', '', true)->data;
}
/**
 * Fetch `$this->url` through the QueryList "Request" extension and scrape it
 * with the rules stored in `$this->param['rules']`.
 *
 * @return mixed whatever `getData()` yields for those rules
 */
public function getUrls2()
{
    // Options for the HTTP request extension.
    $request = [
        'target' => $this->url,
        'method' => 'GET',
        'user_agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0',
        'timeout' => '30',
    ];

    $response = QueryList::run('Request', $request);

    return $response->setQuery($this->param['rules'])->getData();
}
$pids[$i] = pcntl_fork(); // 创建子进程 switch ($pids[$i]) { case -1: alert('创建子进程失败:' . $i); exit; case 0: $key_start = $thread_quantity / $workers * $i; $key_end = $thread_quantity / $workers * ($i + 1); for ($j = $key_start; $j < $key_end; $j++) { $url = $threads[$j]['detail_url']; $sourceId = $threads[$j]['source_id']; $reg = array('title' => array('.entry-header .entry-name', 'text', '-ins'), 'content' => array('.entry-content', 'html', '-div -script')); $rang = '#content'; // sleep(1); $hj = QueryList::Query($url, $reg, $rang); $res = $hj->data; //var_dump($res[0]['title']); //exit('#50-1#'); // 链接数据库 $insertData = array('author' => '左岸读书', 'content' => isset($res[0]['content']) ? addslashes($res[0]['content']) : '', 'list_id' => $sourceId, 'add_time' => date('Y-m-d H:i:s')); // 插入数据到数据库 $insert = $db->query("INSERT INTO content(author,content,list_id,add_time) VALUES (:p1,:p2,:p3,:p4)", array("p1" => "左岸读书", "p2" => $insertData['content'], 'p3' => $insertData['list_id'], 'p4' => date('Y-m-d H:i:s'))); var_dump($insert); } // 子进程退出 $curPid = getmypid(); exit('#子进程退出:' . $curPid . "#\n"); break; default: echo 'This is parent Process![' . getmypid() . "]\n";