/**
 * queryHotList: scrape the articles listed under one category.
 *
 * Builds the weixin.sogou.com listing URL for the category, runs it through
 * QueryList with the selector rules below and returns the parsed rows.
 *
 * @param unknown $cat category identifier; it is inserted twice into the page URL
 * @return array the jsonArr property of the QueryList result
 * @access public
 * @author polo<*****@*****.**>
 * @version 2015-3-11 10:35:25 AM
 * @copyright Show More
 */
public static function queryHotList($cat)
{
    $pageUrl = 'http://weixin.sogou.com/pcindex/pc/' . $cat . '/' . $cat . '.html';

    // Output field => array(selector, what to extract from the matched node).
    $rules = array();
    $rules['Title'] = array('.wx-news-info2 h4', 'text');
    $rules['Description'] = array('.wx-news-info2>a', 'text');
    $rules['PicUrl'] = array('.wx-img-box a img', 'src');
    $rules['Url'] = array('.wx-img-box a', 'href');

    return \QueryList::Query($pageUrl, $rules)->jsonArr;
}
/**
 * Refresh the IPv4 address segments of every province.
 *
 * For each province code the matching page on ips.chacuo.net is scraped with
 * QueryList (".v_l" is read as the "begin" and ".v_r" as the "end" of a
 * segment). The rows are collected per province code and the whole map is
 * handed to self::_write() together with the path of ip_segment.php in this
 * file's directory. Echoes "Update OK!" when _write() returns a truthy value.
 */
public static function update()
{
    // require_once instead of require: a second call of update() in the same
    // request would otherwise load the class file again and die with a
    // "cannot redeclare class" fatal error.
    require_once 'QueryList.class.php';

    $province = array('BJ', 'GD', 'SD', 'ZJ', 'JS', 'SH', 'LN', 'SC', 'HA', 'HB', 'FJ', 'HN', 'HE', 'CQ', 'SX', 'JX', 'SN', 'AH', 'HL', 'GX', 'JL', 'YN', 'TJ', 'NM', 'XJ', 'GS', 'GZ', 'HI', 'NX', 'QH', 'XZ');
    $rules = array(
        'begin' => array('.v_l', 'text'),
        'end' => array('.v_r', 'text'),
    );

    $ip_segment = array();
    foreach ($province as $code) {
        // Page that lists the IP address segments of this province
        $url = 'http://ips.chacuo.net/view/s_' . $code;
        $ip_list = QueryList::Query($url, $rules, '', 'UTF-8');
        $ip_segment[$code] = $ip_list->jsonArr;
    }

    // Normalise Windows directory separators before building the target path.
    $path = str_replace('\\', '/', __DIR__);
    $file = $path . '/ip_segment.php';

    if (self::_write($file, $ip_segment)) {
        echo 'Update OK!';
    }
}
/**
 * Classify a page by which marker elements QueryList finds in it.
 *
 * Result:
 *   1 - the query produced no first row at all
 *   2 - neither 'type2' nor 'type4' is set in the first row
 *   3 - 'type2' is set, 'type3' is not
 *   4 - 'type2' and 'type3' are both set
 *   5 - 'type2' is not set, 'type4' is
 *
 * @param mixed $html document source handed to QueryList::Query
 * @return int one of 1..5 as listed above
 */
private function getType($html)
{
    $rules = array(
        'type1' => array('#site-nav a:eq(0)', 'text'),
        'type2' => array('#main-body a:eq(0)', 'text'),
        'type3' => array('.block_all', 'text'),
        'type4' => array('.midsml-rank-stars', 'class'),
    );
    $range = '';
    $result = QueryList::Query($html, $rules, $range);

    if (!isset($result->jsonArr[0])) {
        return 1;
    }

    if (isset($result->jsonArr[0]['type2'])) {
        if (isset($result->jsonArr[0]['type3'])) {
            return 4;
        }
        return 3;
    }

    if (isset($result->jsonArr[0]['type4'])) {
        return 5;
    }
    return 2;
}
<?php
/*
 * Scrape the article shown on meiriyiwen.com (title, author, HTML content)
 * and store it through the Mryw DAO, unless an article with the same md5 of
 * the content is already stored. A short status line is echoed for the
 * "empty" and "exist" outcomes.
 */
require_once '../inc/QueryList/QueryList.class.php';
header('Content-type:text/html;charset=utf-8');

$url = "http://meiriyiwen.com/";
$reg = array(
    "title" => array("#article_show h1", "text"),
    "author" => array("#article_show .article_author span", "text"),
    "content" => array("#article_show .article_text", "html"),
);
$mryw_rst = QueryList::Query($url, $reg);
$mryw_array = $mryw_rst->jsonArr;

if ($mryw_array) {
    // A field missing from the first row is handled like an empty one
    // instead of raising an "undefined index" notice.
    $m_title = isset($mryw_array[0]['title']) ? $mryw_array[0]['title'] : '';
    $m_author = isset($mryw_array[0]['author']) ? $mryw_array[0]['author'] : '';
    $m_content = isset($mryw_array[0]['content']) ? $mryw_array[0]['content'] : '';

    if (empty($m_title) || empty($m_author) || empty($m_content)) {
        echo 'empty mryw, ' . date('Y-m-d H:i:s', time());
    } else {
        // md5() always returns a 32-character string, so the hash itself
        // needs no emptiness check; it is only computed for usable content.
        $m_md5 = md5($m_content);

        require_once '../inc/mryw.inc.php';
        $mryw_dao = new Mryw();
        if ($mryw_dao->is_mryw_exist($m_md5)) {
            echo 'exist mryw, ' . date('Y-m-d H:i:s', time());
        } else {
            $lastest = $mryw_dao->get_lastest_article(true);

            // The new article is dated 23 hours and 57 seconds after the
            // latest stored one. Without a usable previous timestamp
            // strtotime() fails and the sum would land in 1970, so fall back
            // to the current time in that case.
            $last_time = empty($lastest['m_time']) ? false : strtotime($lastest['m_time']);
            $n_time = ($last_time === false) ? time() : $last_time + 3600 * 23 + 57;

            $mryw_dao->insert_mryw($m_title, $m_content, $m_author, $m_md5, date('Y-m-d H:i:s', $n_time));
            // echo 'success mryw, ' . date('Y-m-d H:i:s', time());
            Mysql::closeConn();
        }
    }
}
continue; } $data['title'] = $value['title']; $data['source'] = $value['url']; $data['sourceType'] = 2; // 2表示来源是互联网 $data['classId'] = 3; // 3表示左岸频道 $data['category'] = 1; // 1表示创造之路 $data['isPublish'] = 1; $data['publishTime'] = $nowDate; $data['changeTime'] = $nowDate; $data['createTime'] = $nowDate; $data['isDel'] = 0; $con_hj = QueryList::Query($value['url'], array("content" => array('#content .entry-content', 'html', '-div'))); $docId = $arti->insertData('uctoo_article_article', $data); if ($docId > 0) { $data2 = array(); $data2['docId'] = $docId; $data2['content'] = $con_hj->jsonArr[0]['content']; # 如果存在,则不执行添加操作 $res1 = $arti->select('select "X" from uctoo_article_content_1 where docId=' . $docId); if ($res1) { continue; } $arti->insertData('uctoo_article_content_1', $data2); } } } print_r($hj->jsonArr);
//开启前台发布 if ($config['UserAddArticle'] == 0) { echo "<script>alert('前台会员不支持发布文章');location.href='ucenter.php'</script>"; exit; } //微信文章导入 if ($_POST) { require 'QueryList.class.php'; $long = guolv(trim($_POST['long'])); $type_id = guolv(trim($_POST['type_id'])); $html = get_contents($long); $money = $type_arr[2]; $html = str_replace('data-src', 'src', $html); $caiji = array("title" => array(".rich_media_title:first", "text"), "content" => array("#js_content", "html")); $quyu = ''; $hj = QueryList::Query($html, $caiji, $quyu); $arr = $hj->jsonArr; $title = $arr[0]['title']; $content = $arr[0]['content']; $pic = cut($html, 'var msg_cdn_url = "', '"'); if (url_exists($long) == 1) { echo "<script>alert('网址不存在');location.href='weixin.php'</script>"; exit; } if (is_numeric($type_id) == false) { echo "<script>alert('分类不存在');location.href='weixin.php'</script>"; exit; } $row = $mysql->query("select * from `article` where `title`='{$title}' limit 1"); if (!$row) { $arr = array('top' => 0, 'title' => $title, 'content' => $content, 'pic' => '[weixin]' . $pic, 'type' => $type_id, 'pv' => 0, 'pv_max' => '', 'money' => $money, 'day' => date("Y-m-d", time()));
/**
 * Extract the text of the last ".PageLink" element from $this->html.
 *
 * @return mixed the extracted text, or the string '0' when it is missing or empty
 */
public function getPage()
{
    $rules = array(
        'page' => array('.PageLink:last', 'text'),
    );
    $result = QueryList::Query($this->html, $rules, '');

    if (empty($result->jsonArr[0]['page'])) {
        return '0';
    }

    return $result->jsonArr[0]['page'];
}
/**
 * Crawl the qt.qq.com phone news list and import articles not stored yet.
 *
 * Starting at c12_list_1.shtml, each list page is fetched as JSON and its
 * 'next' entry is followed until it is empty. For every list item that points
 * to a relative "article_" page and whose title is not yet in DcDocument, the
 * cover image is saved, a document row is added, the article body is fetched
 * with phpQuery, its images are saved and re-linked, links are unwrapped and
 * the result is stored through DcArticle.
 *
 * $page counts imported articles, not list pages; it is checked once per list
 * page, so crawling stops before the next list page is fetched once more than
 * 10 articles have been imported.
 */
public function index()
{
    set_time_limit(0);
    import('Org.JAE.QueryList');
    header("Content-type: text/html; charset=utf-8");

    $listurl = "http://qt.qq.com/static/pages/news/phone/c12_list_1.shtml";
    $page = 0;

    while (true) {
        if ($page > 10) {
            break;
        }

        $pageresult = \QueryList::Query($listurl);
        $json = $pageresult->getHtmlJSON();
        if (empty($json[0]['next'])) {
            // No further list page: print where the crawl stopped and leave.
            echo $listurl;
            dump($json[0]);
            break;
        }
        $listurl = "http://qt.qq.com/static/pages/news/phone/" . $json[0]['next'];
        $items = $json[0]['list'];

        foreach ($items as $item) {
            $article_url = $item['article_url'];

            // strpos() returns 0 for a match at the very start of the string,
            // so its result has to be compared against false explicitly.
            // The old truthiness checks skipped every URL that begins with
            // "article_" and let through URLs that begin with "qq.com".
            if (strpos($article_url, 'qq.com') !== false) {
                continue;
            }
            if (strpos($article_url, "article_") === false) {
                continue;
            }
            $article_url = "http://qt.qq.com/static/pages/news/phone/" . $article_url;

            // Skip items whose title has been collected before.
            $map = array('title' => $item['title']);
            $iscollect = D('DcDocument')->where($map)->find();
            if (!empty($iscollect)) {
                continue;
            }

            $data = array();
            $data['create_time'] = strtotime($item['insert_date']);
            $data['title'] = $item['title'];
            $data['description'] = $item['summary'];
            $data['cover_id'] = $this->saveCoverImage($item['image_url_small']);
            if (empty($data['cover_id'])) {
                continue;
            }

            $Document = D('DcDocument');
            $data['title'] = str_replace('掌盟', '群挑', $data['title']);
            $docid = $Document->addDoc($data);

            // Loads the article page as the current phpQuery document used by pq().
            \phpQuery::newDocumentFile($article_url);
            $content = pq(".article_content")->html();

            $imgs = pq($content)->find("img");
            $hasImages = false;
            foreach ($imgs as $img) {
                $hasImages = true;
                $src = pq($img)->attr('src');
                if (empty($src)) {
                    // Fall back to the "jason" attribute when "src" is empty.
                    $src = pq($img)->attr('jason');
                }
                $imgurl = $this->saveArticleImage($src);
                $content = str_replace($src, $imgurl, $content);
            }
            if ($hasImages) {
                // These replacements touch every <img> tag at once, so they
                // run a single time. Inside the image loop the alt attribute
                // was added again for each image found.
                $content = str_replace("jason=", "src=", $content);
                $content = str_replace("<img", "<img alt='" . $item['title'] . "'", $content);
            }

            $content = str_replace('掌盟', '群挑', $content);
            // Unwrap links: keep the link text, drop the <a> element.
            $content = preg_replace("/<a[^>]*>(.*)<\\/a>/isU", '${1}', $content);

            $Article = D('DcArticle');
            $article = array();
            $article['content'] = trim($content);
            $article['id'] = $docid;
            $Article->addArticle($article);

            // Drop the loaded phpQuery documents before the next article.
            \phpQuery::$documents = array();
            $page++;
        }
    }
}
/*
 * Import knowledge-base entries from home.cnblogs.com/kb into cn_kb_blogs.
 *
 * The number of rows already stored decides which listing page to start from
 * (20 entries per page). The four pages after it are fetched from the highest
 * page number down, and the rows of each page are walked from last to first.
 * A row is inserted only when its content_url is not stored yet.
 */
mysql_select_db(DB_NAME);

$sql = "select count(*) from cn_kb_blogs";
$res = mysql_query($sql);
$counts_info = mysql_fetch_array($res);
$exist_max_num = $counts_info['0'];

// 20 entries per page
$start_page = ceil($exist_max_num / 20);
$start_num = intval($exist_max_num % 20);
if ($start_num == 19) {
    $start_page++;
}

for ($page = 4; $page > 0; $page--) {
    // $page is never 0 inside this loop, so the page suffix is always added.
    $fetch_page = $start_page + $page;
    $url = 'http://home.cnblogs.com/kb' . "/page/" . $fetch_page;

    $hj = QueryList::Query($url, $reg, $rang, 'UTF-8');
    $cn_blogs = $hj->data;
    $get_cnblogs_nums = count($cn_blogs) - 1;

    for ($i = $get_cnblogs_nums; $i >= 0; $i--) {
        $cn_blog = $cn_blogs[$i];

        // The values come from a scraped page: escape them before they are
        // placed inside SQL string literals.
        $content_url = mysql_real_escape_string($cn_blog['content_url']);
        $exist_sql = "select id from cn_kb_blogs where content_url='" . $content_url . "'";
        $exist_res = mysql_query($exist_sql);
        if ($exist_res === false) {
            // The existence check failed; inserting blindly could duplicate the row.
            continue;
        }

        $num_rows = mysql_num_rows($exist_res);
        if (empty($num_rows)) {
            // Relies on each row holding exactly title, content_url and
            // kb_type in that order, as the column list below expects.
            $values = implode("','", array_map('mysql_real_escape_string', $cn_blog));
            $sql = "insert into cn_kb_blogs(title, content_url, kb_type) values('{$values}')";
            if (mysql_query($sql)) {
                echo 'success' . "<br>";
            }
        }
    }
}
echo $json, "<hr/>";

// Scrape the content of an OSChina snippet page
$url = "http://www.oschina.net/code/snippet_186288_23816";
$reg = array();
$reg["title"] = array(".QTitle h1", "text");
$reg["con"] = array(".Content", "html");
$hj = QueryList::Query($url, $reg);
$arr = $hj->jsonArr;
echo "<pre>", print_r($arr, true), "</pre><hr/>";

// Scrape the basic information of a site.
// Rules: output field => array(selector, attribute to read)
$reg = array();
$reg["kw"] = array("meta[name=keywords]", "content");
$reg["desc"] = array("meta[name=description]", "content");
$reg["title"] = array("title", "text");
$reg["css1"] = array("link:eq(0)", "href");
$reg["js2"] = array("script[src]:eq(1)", "src");
// Target site
$url = 'http://x.44i.cc/';
// Scrape
$data = QueryList::Query($url, $reg)->jsonArr;
print_r($data);

// Separate demonstration of the callback argument:
// scrape the site keywords and split them into single keywords.
// The fourth rule element names the callback function.
$reg = array();
$reg["kw"] = array("meta[name=keywords]", "content", '', 'fun');

/**
 * Custom callback named in the rule above: split a keyword string on commas.
 *
 * @param string $content the extracted keywords string
 * @param mixed  $key     second callback argument; not used here
 * @return array the comma-separated parts of $content
 */
function fun($content, $key)
{
    // Separate the keywords
    $keywords = explode(',', $content);
    return $keywords;
}

// Target site
$url = 'http://x.44i.cc/';
// Scrape
$data = QueryList::Query($url, $reg)->jsonArr;
print_r($data);
/**
 * Run a QueryList query with the 'UTF-8' argument fixed and return its data.
 *
 * @param mixed $url  page source passed to QueryList::Query
 * @param mixed $reg  extraction rules passed to QueryList::Query
 * @param mixed $rang range argument passed to QueryList::Query
 * @return mixed the data property of the QueryList result
 */
public static function http_query_list($url, $reg, $rang)
{
    return QueryList::Query($url, $reg, $rang, 'UTF-8')->data;
}
/**
 * Query the configured search engine and store the result in $this->jsonArr.
 *
 * Builds the search URL for $this->searcher ('baidu', 'google' or 'sogou'),
 * scrapes the hits with $this->regArr / $this->regRange, rewrites the hit URLs
 * for Google through getGoogleRealURL(), then re-queries the same page with
 * $this->regZnum to read the total number of results.
 *
 * $this->jsonArr ends up as an array with the keys num, page (stored page
 * plus one), zNum (total hits), zPage (total pages), s (search key), other
 * and data (the scraped hits).
 *
 * @throws \InvalidArgumentException when $this->searcher is none of the three
 *         supported engines; without a URL and a count pattern there is
 *         nothing meaningful to query
 */
private function getList()
{
    $s = urlencode($this->key);
    $num = $this->num;
    $getHtmlWay = 'get';
    $start = $this->num * $this->page;

    if ($this->searcher == 'baidu') {
        $url = "http://www.baidu.com/s?pn={$start}&rn={$num}&wd={$s}";
        $reg_znum = '/[\\d,]+/';
    } elseif ($this->searcher == 'google') {
        $url = "https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&num={$num}&start={$start}&q={$s}";
        $reg_znum = '/([\\d,]+) result(s)?/';
        $getHtmlWay = 'curl';
    } elseif ($this->searcher == 'sogou') {
        $url = "http://www.sogou.com/web?query={$s}&num={$num}&page=" . $this->page;
        $reg_znum = '/[\\d,]+/';
    } else {
        throw new \InvalidArgumentException('Unsupported searcher: ' . $this->searcher);
    }

    $searcherObj = QueryList::Query($url, $this->regArr, $this->regRange, $getHtmlWay, false);

    // Only Google hit URLs are rewritten. The Baidu counterpart was disabled:
    // $searcherObj->jsonArr[$i]['url'] = $this->getBaiduRealURL($searcherObj->jsonArr[$i]['url']);
    if ($this->searcher == 'google') {
        $hitCount = count($searcherObj->jsonArr);
        for ($i = 0; $i < $hitCount; $i++) {
            $searcherObj->jsonArr[$i]['url'] = $this->getGoogleRealURL($searcherObj->jsonArr[$i]['url']);
        }
    }
    // Keep the hits before setQuery() replaces the result set.
    $hits = $searcherObj->jsonArr;

    // Read the total number of results
    $searcherObj->setQuery($this->regZnum);
    $countRows = $searcherObj->jsonArr;
    $zNum = 0;
    // The count row can be missing when the page has no result counter;
    // treat that as zero results instead of reading an undefined index.
    if (isset($countRows[0]['zNum']) && preg_match($reg_znum, $countRows[0]['zNum'], $arr)) {
        // Strip thousands separators; the int cast drops any trailing text
        // of the match such as " results".
        $zNum = (int) str_replace(',', '', $arr[0]);
    }

    // Compute the total number of pages; a page size of 0 cannot be divided by.
    $zPage = $this->num > 0 ? ceil($zNum / $this->num) : 0.0;

    $this->jsonArr = array(
        'num' => $this->num,
        'page' => (int) $this->page + 1,
        'zNum' => $zNum,
        'zPage' => $zPage,
        "s" => "{$this->key}",
        'other' => array('author' => 'JAE', 'QQ' => '734708094', 'blog' => 'http://blog.jaekj.com'),
        'data' => $hits,
    );
}
<?php
/*
 * Company:Quguonet.com
 * Author:Myth
 *
 * Scrapes one listing page of wap.ganji.com (name and link of every entry),
 * prints the rows, inserts each of them into crm_list and finally prints the
 * elapsed time.
 */
// Avoid the execution time limit
set_time_limit(0);
// Load the library
__autoload("QueryList");
//require 'Query/QueryList.class.php';
$stime = microtime(true);
// Time at which the script started

// Scraping test
$url = "http://wap.ganji.com/foshan/songshui/shunde/?domain=foshan&url=songshui&d=shunde%2F&page=4";
$arr = array(
    'name' => array('.list-item a:even', 'text', '-.tel'),
    'href' => array('.list-item a:even', 'href', '-.tel'),
);
$data = QueryList::Query($url, $arr, '', 'UTF-8', 'UTF-8', true)->data;
print_r($data);

// The loop variable has its own name so that neither $data nor $url is
// overwritten while iterating.
foreach ($data as $row) {
    // Scraped text goes into SQL string literals: escape quotes and
    // backslashes so a name containing ' cannot break or alter the statement.
    // NOTE(review): if $db offers parameter binding or its own escaping
    // helper, prefer that over addslashes().
    $name = addslashes($row['name']);
    $href = addslashes($row['href']);
    $db->insert("crm_list", "`name`,`href`,`pagename`,`createtime`", "'{$name}','{$href}','赶集顺德送水公司',NOW()");
}

$etime = microtime(true);
// Time at which the script finished
$total = $etime - $stime;
// Elapsed time
echo '<br />耗时:' . $total;
}); return $x; }); print_r($data); */ /* //模拟登陆 $login = QueryList::run('Login',[ 'target' => 'http://doc.querylist.cc/login/login', 'method' => 'post', 'params' => ['username'=>'admin','password'=>'admin'], 'cookiePath' => './cookie123.txt' ]); // print_r($login->html); $rt = $login->post('http://doc.querylist.cc/admin')->setQuery(['title'=>['h1','text']])->data; print_r($rt); */ //乱码终极解决方案 $data = QueryList::Query('http://www.phpddt.com/', ['title' => ['h2', 'text']], '', 'UTF-8', 'UTF-8', true)->data; print_r($data); $etime = microtime(true); //获取程序执行结束的时间 $total = $etime - $stime; //计算差值 // echo '耗时:'.$total;