Exemplo n.º 1
0
 /**
  * Demo action: scrape OSChina code-share pages with QueryList and dump the results.
  *
  * Sends a GB2312 content-type header, then prints, in order: the list rows
  * (title, url, author), a JSON string of portrait image sources taken from the
  * same collector, and the title/content of one snippet page.
  */
 public function index()
 {
     header("Content-type:text/html;charset=GB2312");
     import('Org.JAE.QueryList');

     // Code-share list: title, link and author of every entry
     $listRules = array(
         "title" => array(".code_title a:eq(0)", "text"),
         "url" => array(".code_title a:eq(0)", "href"),
         "author" => array("img", "title"),
     );
     // Fetch the source with curl and output it as GB2312
     $collector = new \QueryList("http://www.oschina.net/code/list", $listRules, ".code_list li", 'curl', 'GB2312');
     echo "<pre>", print_r($collector->jsonArr, true), "</pre><hr/>";

     // The "TOP40 active contributors" portraits on the right side of the same page,
     // obtained as JSON by applying a new rule set to the existing collector
     $collector->setQuery(array("portrait" => array(".hot_top img", "src")));
     echo $collector->getJSON() . "<hr/>";

     // Content of a single OSChina snippet page
     $detailRules = array(
         "title" => array(".QTitle h1", "text"),
         "con" => array(".Content", "html"),
     );
     $collector = new \QueryList("http://www.oschina.net/code/snippet_186288_23816", $detailRules);
     echo "<pre>", print_r($collector->jsonArr, true), "</pre><hr/>";
 }
Exemplo n.º 2
0
 /**
  * [queryHotList] Scrape the articles listed on one Sogou WeChat category page.
  *
  * The page URL is built from $cat, which is used both as the directory and as
  * the file name. The selector rules produce the fields Title, Description,
  * PicUrl and Url.
  *
  * @param string $cat category slug inserted into the URL (presumably a Sogou category id; verify against callers)
  * @return mixed the jsonArr property of the QueryList result
  * @access public
  * @author polo<*****@*****.**>
  * @version 2015-3-11 10:35:25 AM
  * @copyright Show More
  */
 public static function queryHotList($cat)
 {
     $rules = array(
         "Title" => array('.wx-news-info2 h4', 'text'),
         'Description' => array('.wx-news-info2>a', 'text'),
         'PicUrl' => array('.wx-img-box a img', 'src'),
         'Url' => array('.wx-img-box a', 'href'),
     );
     $pageUrl = 'http://weixin.sogou.com/pcindex/pc/' . $cat . '/' . $cat . '.html';

     return \QueryList::Query($pageUrl, $rules)->jsonArr;
 }
Exemplo n.º 3
0
 /**
  * Static entry point: run a query on the shared instance and return that instance.
  *
  * @param string $page           URL of the page to scrape (https supported), or raw HTML source
  * @param array  $regArr         Selector array, in the form
  *                               array("name" => array("selector", "type"[, "tag filter list"][, "callback"]), ...[, "callback" => "global callback"]).
  *                               - selector: any jQuery selector syntax
  *                               - type: "text", "html", or the name of an HTML attribute
  *                               - tag filter list (optional): a tag prefixed with a minus sign (-) may be any
  *                                 element selector and is removed together with its content; otherwise the list
  *                                 names the tags to keep when type is "text", or the tags to strip when type is "html"
  *                               - callback / global callback (optional): a function name, or array("class", "static method");
  *                                 it receives the selected content and the selector-array key; a per-rule callback
  *                                 overrides the global one
  * @param string $regRange       Block selector: first select several large blocks by this rule, then apply the
  *                               selectors inside each block
  * @param string $getHtmlWay     How the source is fetched: via curl or via file_get_contents
  * @param string $outputEncoding Output encoding (UTF-8, GB2312, ...) used to avoid garbled text; a falsy value
  *                               leaves the original string encoding unchanged
  * @return self the shared instance after the query has run
  */
 public static function Query($page, $regArr, $regRange = '', $getHtmlWay = 'curl', $outputEncoding = false)
 {
     // Create the shared instance on first use
     $needsInstance = !(self::$ql instanceof self);
     if ($needsInstance) {
         self::$ql = new self();
     }

     $shared = self::$ql;
     $shared->_query($page, $regArr, $regRange, $getHtmlWay, $outputEncoding);

     return self::$ql;
 }
Exemplo n.º 4
0
 /**
  * Update the IPv4 address segments of every province.
  *
  * For each province code the matching page on ips.chacuo.net is scraped for
  * the 'begin' (.v_l) and 'end' (.v_r) texts; the collected rows are keyed by
  * province code and handed to self::_write() together with the target file
  * ip_segment.php in this directory. Prints 'Update OK!' when _write() returns
  * a truthy value.
  */
 public static function update()
 {
     // require_once instead of require: a plain require would include the class
     // file again (fatal "cannot redeclare" error) if update() is called twice
     // or the class has already been loaded elsewhere.
     require_once 'QueryList.class.php';
     $province = array('BJ', 'GD', 'SD', 'ZJ', 'JS', 'SH', 'LN', 'SC', 'HA', 'HB', 'FJ', 'HN', 'HE', 'CQ', 'SX', 'JX', 'SN', 'AH', 'HL', 'GX', 'JL', 'YN', 'TJ', 'NM', 'XJ', 'GS', 'GZ', 'HI', 'NX', 'QH', 'XZ');
     $ip_segment = array();
     foreach ($province as $code) {
         // Target URL listing the IP address segments of this province
         $url = 'http://ips.chacuo.net/view/s_' . $code;
         // NOTE(review): 'UTF-8' is passed as the 4th argument; depending on the
         // QueryList version that slot may be the fetch method rather than the
         // output encoding. Confirm against the bundled QueryList.class.php.
         $ip_list = QueryList::Query($url, array('begin' => array('.v_l', 'text'), 'end' => array('.v_r', 'text')), '', 'UTF-8');
         $ip_segment[$code] = $ip_list->jsonArr;
     }
     // Normalise Windows directory separators before building the file path
     $file = str_replace('\\', '/', __DIR__) . '/ip_segment.php';
     if (self::_write($file, $ip_segment)) {
         echo 'Update OK!';
     }
 }
Exemplo n.º 5
0
            continue;
        }
        $data['title'] = $value['title'];
        $data['source'] = $value['url'];
        $data['sourceType'] = 2;
        // 2表示来源是互联网
        $data['classId'] = 3;
        // 3表示左岸频道
        $data['category'] = 1;
        // 1表示创造之路
        $data['isPublish'] = 1;
        $data['publishTime'] = $nowDate;
        $data['changeTime'] = $nowDate;
        $data['createTime'] = $nowDate;
        $data['isDel'] = 0;
        $con_hj = QueryList::Query($value['url'], array("content" => array('#content .entry-content', 'html', '-div')));
        $docId = $arti->insertData('uctoo_article_article', $data);
        if ($docId > 0) {
            $data2 = array();
            $data2['docId'] = $docId;
            $data2['content'] = $con_hj->jsonArr[0]['content'];
            # 如果存在,则不执行添加操作
            $res1 = $arti->select('select "X" from uctoo_article_content_1 where docId=' . $docId);
            if ($res1) {
                continue;
            }
            $arti->insertData('uctoo_article_content_1', $data2);
        }
    }
}
print_r($hj->jsonArr);
Exemplo n.º 6
0
// store the user's permission
// Get the QueryList object
if ($canSave) {
    $QUERYLIST = $USER->load_data('saved_searches');
    if (is_null($QUERYLIST)) {
        // no saved data yet
        $QUERYLIST = new QueryList();
    }
    $ACTIVELIST = $USER->load_data('saved_searches_active');
    if (is_null($ACTIVELIST)) {
        $ACTIVELIST = array();
    }
}
// User does not have permission, or saved querylist is broken
if (@$QUERYLIST == NULL) {
    $QUERYLIST = new QueryList();
    $ACTIVELIST = array();
}
// check for a form POST
if (is_array($_POST) && count($_POST) > 0 && $canSave) {
    // determine post type
    if (isset($_POST['save'])) {
        unset($_POST['save']);
        // create the query with the criteria in the post
        $query = QueryFactory::create($MODULE);
        $query->set_criteria_values($_POST);
        if (!$QUERYLIST->contains($query)) {
            if (count($QUERYLIST) >= $CONF['saved_searches_size']) {
                $MESSAGE = $STRINGS['error_saved_searches_limit'];
                $MESSAGE_CLASS = 'error-message';
            } else {
Exemplo n.º 7
0
 /**
  * Read the text of the last ".PageLink" element found in $this->html.
  *
  * @return mixed the 'page' value of the first result row, or the string '0' when that value is empty
  */
 public function getPage()
 {
     $rules = array('page' => array('.PageLink:last', 'text'));
     $result = QueryList::Query($this->html, $rules, '');

     if (empty($result->jsonArr[0]['page'])) {
         return '0';
     }

     return $result->jsonArr[0]['page'];
 }
Exemplo n.º 8
0
                });
        
            return $x;
           
    });

print_r($data);
*/
/*
//模拟登陆
$login = QueryList::run('Login',[
    'target' => 'http://doc.querylist.cc/login/login',
    'method' => 'post',
    'params' => ['username'=>'admin','password'=>'admin'],
    'cookiePath' => './cookie123.txt'
    ]);

// print_r($login->html);

$rt = $login->post('http://doc.querylist.cc/admin')->setQuery(['title'=>['h1','text']])->data;

print_r($rt);
*/
// Ultimate fix for garbled (mis-encoded) text
// NOTE(review): the meaning of the trailing 'UTF-8', 'UTF-8', true arguments
// depends on the QueryList version in use — confirm against the library.
$data = QueryList::Query('http://www.phpddt.com/', ['title' => ['h2', 'text']], '', 'UTF-8', 'UTF-8', true)->data;
print_r($data);
$etime = microtime(true);
// Record the time at which execution finished
// ($stime is set earlier in the script, outside this fragment)
$total = $etime - $stime;
// Elapsed time = end time minus start time
// echo '耗时:'.$total;
Exemplo n.º 9
0
 /**
  * Import news articles from the qt.qq.com phone news list into DcDocument / DcArticle.
  *
  * Walks the list pages by following each page's 'next' entry. For every list
  * item that passes the URL filters and whose title is not stored yet, the
  * cover image is saved, a document row is added, the article page is fetched
  * with phpQuery, its images are re-hosted through saveArticleImage(), links
  * are unwrapped and the content is stored.
  *
  * $page is incremented once per imported article (not per list page) and is
  * only checked before a list page is fetched, so one run may import more
  * than 10 articles.
  */
 public function index()
 {
     set_time_limit(0);
     import('Org.JAE.QueryList');
     header("Content-type: text/html; charset=utf-8");
     $listurl = "http://qt.qq.com/static/pages/news/phone/c12_list_1.shtml";
     $page = 0;
     while (true) {
         if ($page > 10) {
             break;
         }
         $pageresult = \QueryList::Query($listurl);
         $json = $pageresult->getHtmlJSON();
         if (empty($json[0]['next'])) {
             // No further list page: dump what was received and stop
             echo $listurl;
             dump($json[0]);
             break;
         }
         $listurl = "http://qt.qq.com/static/pages/news/phone/" . $json[0]['next'];
         $items = $json[0]['list'];
         foreach ($items as $item) {
             $article_url = $item['article_url'];
             // strpos() returns 0 for a match at the very start of the string, so
             // the result must be compared against false rather than tested for
             // truthiness.
             // Skip URLs that mention qq.com (i.e. are not relative article paths)
             if (strpos($article_url, 'qq.com') !== false) {
                 continue;
             }
             // Skip anything that is not an "article_" page
             if (strpos($article_url, "article_") === false) {
                 continue;
             }
             $article_url = "http://qt.qq.com/static/pages/news/phone/" . $article_url;
             // Skip articles whose title has already been collected
             $map['title'] = $item['title'];
             $iscollect = D('DcDocument')->where($map)->find();
             if (!empty($iscollect)) {
                 continue;
             }
             $data['create_time'] = strtotime($item['insert_date']);
             $data['title'] = $item['title'];
             $data['description'] = $item['summary'];
             $data['cover_id'] = $this->saveCoverImage($item['image_url_small']);
             if (empty($data['cover_id'])) {
                 continue;
             }
             $Document = D('DcDocument');
             $data['title'] = str_replace('掌盟', '群挑', $data['title']);
             $docid = $Document->addDoc($data);
             // Loads the article page as phpQuery's current document for pq()
             \phpQuery::newDocumentFile($article_url);
             $content = pq(".article_content")->html();
             $imgs = pq($content)->find("img");
             $hasImages = false;
             foreach ($imgs as $img) {
                 $hasImages = true;
                 $src = pq($img)->attr('src');
                 if (empty($src)) {
                     // Fall back to the 'jason' attribute when 'src' is empty
                     $src = pq($img)->attr('jason');
                 }
                 if (empty($src)) {
                     // Nothing to download or replace for this image
                     continue;
                 }
                 $imgurl = $this->saveArticleImage($src);
                 $content = str_replace($src, $imgurl, $content);
             }
             if ($hasImages) {
                 // Done once per article: running these replacements inside the image
                 // loop added one extra alt attribute to every <img> per image.
                 $content = str_replace("jason=", "src=", $content);
                 // Escape the title so quotes in it cannot break out of the attribute
                 $alt = htmlspecialchars($item['title'], ENT_QUOTES, 'UTF-8');
                 $content = str_replace("<img", "<img alt='" . $alt . "'", $content);
             }
             $content = str_replace('掌盟', '群挑', $content);
             // Unwrap links: keep the link text, drop the <a> tags
             $content = preg_replace("/<a[^>]*>(.*)<\\/a>/isU", '${1}', $content);
             $Article = D('DcArticle');
             $article['content'] = trim($content);
             $article['id'] = $docid;
             $article_id = $Article->addArticle($article);
             // Release the documents phpQuery keeps in its static registry
             \phpQuery::$documents = array();
             $page++;
         }
     }
 }
Exemplo n.º 10
0
// Print the JSON string produced earlier in the script (outside this fragment)
echo $json . "<hr/>";
// Scrape the content of an OSChina snippet page
$url = "http://www.oschina.net/code/snippet_186288_23816";
$reg = array("title" => array(".QTitle h1", "text"), "con" => array(".Content", "html"));
$hj = QueryList::Query($url, $reg);
$arr = $hj->jsonArr;
echo "<pre>";
print_r($arr);
echo "</pre><hr/>";
// Scrape a site's basic information
// Define the rules
$reg = array("kw" => array("meta[name=keywords]", "content"), "desc" => array("meta[name=description]", "content"), "title" => array("title", "text"), "css1" => array("link:eq(0)", "href"), "js2" => array("script[src]:eq(1)", "src"));
// Target site to scrape
$url = 'http://x.44i.cc/';
// Scrape
$data = QueryList::Query($url, $reg)->jsonArr;
print_r($data);
// The following demonstrates the use of a callback function on its own
// Scrape the site's keywords and split them into individual keywords
// (the 4th rule element names the callback function 'fun')
$reg = array("kw" => array("meta[name=keywords]", "content", '', 'fun'));
/**
 * Custom callback: split a comma-separated keyword string into its parts.
 *
 * @param string $content the selected content
 * @param mixed  $key     key of the rule in the selector array (not used here)
 * @return array the pieces of $content found between commas
 */
function fun($content, $key)
{
    $keywords = explode(',', $content);

    return $keywords;
}
// Target site to scrape
$url = 'http://x.44i.cc/';
// Scrape using the rules in $reg (defined just before this fragment)
$data = QueryList::Query($url, $reg)->jsonArr;
print_r($data);
Exemplo n.º 11
0
 /**
  * Run a QueryList query and return its data.
  *
  * @param string $url  page to scrape
  * @param array  $reg  selector rules
  * @param string $rang block (range) selector
  * @return mixed the data property of the QueryList result
  */
 public static function http_query_list($url, $reg, $rang)
 {
     return QueryList::Query($url, $reg, $rang, 'UTF-8')->data;
 }
 /**
  * Build the response payload from the query list.
  *
  * @return array 'total' => $this->queryList->getTotal(), 'items' => $this->queryList->getItems()
  */
 public function format()
 {
     $payload = [];
     $payload['total'] = $this->queryList->getTotal();
     $payload['items'] = $this->queryList->getItems();

     return $payload;
 }
Exemplo n.º 13
0
 /**
  * Query the configured search engine and store the result page in $this->jsonArr.
  *
  * Builds the search URL for $this->searcher (baidu, google or sogou) from
  * $this->key, $this->num and $this->page, scrapes the result rows with
  * $this->regArr / $this->regRange, resolves the real target URLs for Google
  * results, then scrapes the total hit count with $this->regZnum and derives
  * the number of pages. The final $this->jsonArr holds num, page (1-based),
  * zNum, zPage, s, other and data.
  *
  * @throws \InvalidArgumentException when $this->searcher is none of the supported engines
  */
 private function getList()
 {
     $s = urlencode($this->key);
     $num = $this->num;
     $start = $this->num * $this->page;
     $getHtmlWay = 'get';
     $resolveRealUrl = false;
     switch ($this->searcher) {
         case 'baidu':
             $url = "http://www.baidu.com/s?pn={$start}&rn={$num}&wd={$s}";
             $reg_znum = '/[\\d,]+/';
             break;
         case 'google':
             $url = "https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&num={$num}&start={$start}&q={$s}";
             $reg_znum = '/([\\d,]+) result(s)?/';
             $getHtmlWay = 'curl';
             $resolveRealUrl = true;
             break;
         case 'sogou':
             $url = "http://www.sogou.com/web?query={$s}&num={$num}&page=" . $this->page;
             $reg_znum = '/[\\d,]+/';
             break;
         default:
             // Previously an unknown searcher fell through with $url and $reg_znum
             // undefined, producing a bogus request and an empty-pattern warning.
             throw new \InvalidArgumentException('Unsupported searcher: ' . $this->searcher);
     }
     $searcherObj = QueryList::Query($url, $this->regArr, $this->regRange, $getHtmlWay, false);
     if ($resolveRealUrl) {
         // Only Google result URLs are resolved; resolving Baidu URLs is disabled.
         $rowCount = count($searcherObj->jsonArr);
         for ($i = 0; $i < $rowCount; $i++) {
             $searcherObj->jsonArr[$i]['url'] = $this->getGoogleRealURL($searcherObj->jsonArr[$i]['url']);
         }
     }
     $this->jsonArr = $searcherObj->jsonArr;
     // Get the total number of results
     $searcherObj->setQuery($this->regZnum);
     // The hit-count element may be missing (e.g. no results); treat that as 0
     $zNumText = isset($searcherObj->jsonArr[0]['zNum']) ? $searcherObj->jsonArr[0]['zNum'] : '';
     $zNum = preg_match($reg_znum, $zNumText, $arr) ? $arr[0] : 0;
     // Strip thousands separators; the int cast drops any trailing text
     $zNum = (int) str_replace(',', '', $zNum);
     // Compute the total number of pages
     $zPage = ceil($zNum / $this->num);
     $this->jsonArr = array('num' => $this->num, 'page' => (int) $this->page + 1, 'zNum' => $zNum, 'zPage' => $zPage, "s" => "{$this->key}", 'other' => array('author' => 'JAE', 'QQ' => '734708094', 'blog' => 'http://blog.jaekj.com'), 'data' => $this->jsonArr);
 }
Exemplo n.º 14
0
<?php

/*
 * Company:Quguonet.com
 * Author:Myth
 */
// Avoid the script being stopped by the execution time limit
set_time_limit(0);
// Load the library
// NOTE(review): __autoload() is called directly here and must be defined
// outside this file; the __autoload mechanism is deprecated since PHP 7.2 and
// removed in PHP 8.0 — consider spl_autoload_register() or a plain require.
__autoload("QueryList");
//require 'Query/QueryList.class.php';
$stime = microtime(true);
// Record the time at which execution started
// Scraping test
$url = "http://wap.ganji.com/foshan/songshui/shunde/?domain=foshan&url=songshui&d=shunde%2F&page=4";
$arr = array('name' => array('.list-item a:even', 'text', '-.tel'), 'href' => array('.list-item a:even', 'href', '-.tel'));
$data = QueryList::Query($url, $arr, '', 'UTF-8', 'UTF-8', true)->data;
print_r($data);
// Store every scraped row. The loop variable no longer reuses the name of the
// array being iterated ($data), which silently overwrote the result set.
foreach ($data as $row) {
    // Scraped text is untrusted: escape it before it is interpolated into the
    // SQL value list, otherwise a quote in a name breaks (or injects into) the
    // statement.
    // NOTE(review): prefer the escaping/parameter API of the $db driver if it has one.
    $name = addslashes($row['name']);
    $url = addslashes($row['href']);
    $db->insert("crm_list", "`name`,`href`,`pagename`,`createtime`", "'{$name}','{$url}','赶集顺德送水公司',NOW()");
}
$etime = microtime(true);
// Record the time at which execution finished
$total = $etime - $stime;
// Elapsed time = end time minus start time
echo '<br />耗时:' . $total;
Exemplo n.º 15
0
 /**
  * A QueryList built with a limit of 3 never holds more than 3 entries and
  * shifts its contents as newer queries are added; a QueryList built without
  * a limit keeps all 11 queries.
  */
 function test_limit_new_query_impl()
 {
     // History limited to 3
     $list = new QueryList(3);

     // Set up 11 queries, in this order
     $terms = array('foo', 'bar', 'baz', 'abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx');
     $queries = array();
     foreach ($terms as $term) {
         $queries[] = $this->new_query_shortform(array('q' => $term));
     }

     // Add the first five queries; the size grows to 3 and then stays there
     $expectedSizes = array(1, 2, 3, 3, 3);
     foreach ($expectedSizes as $index => $expectedSize) {
         $list->add($queries[$index]);
         $this->assertTrue(count($list) == $expectedSize);
     }
     // Oldest query is now baz
     $this->assertEqual($list[2]['url'], $this->url_search . '?q=baz');

     // A sixth query pushes the window forward once more
     $list->add($queries[5]);
     $this->assertTrue(count($list) == 3);
     // Oldest query is now abc
     $this->assertEqual($list[2]['url'], $this->url_search . '?q=abc');

     // Reset the query list; the default limit is unlimited
     $list = new QueryList();
     foreach ($queries as $query) {
         $list->add($query);
     }
     $this->assertTrue(count($list) == 11);
 }
Exemplo n.º 16
0
<?php

// Scrape the article shown on meiriyiwen.com and store it unless an article
// with the same content hash is already known.
require_once '../inc/QueryList/QueryList.class.php';
header('Content-type:text/html;charset=utf-8');
$url = "http://meiriyiwen.com/";
// Selector rules: title, author and HTML body of the displayed article
$reg = array("title" => array("#article_show h1", "text"), "author" => array("#article_show .article_author span", "text"), "content" => array("#article_show .article_text", "html"));
$mryw_rst = QueryList::Query($url, $reg);
$mryw_array = $mryw_rst->jsonArr;
if ($mryw_array) {
    $m_title = $mryw_array[0]['title'];
    $m_author = $mryw_array[0]['author'];
    $m_content = $mryw_array[0]['content'];
    // MD5 of the content is the key passed to the duplicate check below
    $m_md5 = md5($m_content);
    if (empty($m_title) || empty($m_author) || empty($m_content) || empty($m_md5)) {
        // Incomplete scrape: report it with a timestamp and store nothing
        echo 'empty mryw, ' . date('Y-m-d H:i:s', time());
    } else {
        require_once '../inc/mryw.inc.php';
        $mryw_dao = new Mryw();
        if ($mryw_dao->is_mryw_exist($m_md5)) {
            echo 'exist mryw, ' . date('Y-m-d H:i:s', time());
        } else {
            $lastest = $mryw_dao->get_lastest_article(true);
            // New article time = m_time of the latest stored article plus 23 hours and 57 seconds
            $n_time = strtotime($lastest['m_time']) + 3600 * 23 + 57;
            $mryw_dao->insert_mryw($m_title, $m_content, $m_author, $m_md5, date('Y-m-d H:i:s', $n_time));
            // Report success with the current timestamp
            echo 'success mryw, ' . date('Y-m-d H:i:s', time());
            // NOTE(review): the connection is closed only on this success path — confirm that is intended
            Mysql::closeConn();
        }
    }
}
Exemplo n.º 17
0
 /**
  * Classify a page by which of four probe selectors produce a value.
  *
  * @param string $html page source (or URL) handed to QueryList
  * @return int 1 when there is no first result row;
  *             4 when type2 and type3 are set, 3 when only type2 is set;
  *             5 when type2 is missing but type4 is set, otherwise 2
  */
 private function getType($html)
 {
     $probes = array(
         'type1' => array('#site-nav a:eq(0)', 'text'),
         'type2' => array('#main-body a:eq(0)', 'text'),
         'type3' => array('.block_all', 'text'),
         'type4' => array('.midsml-rank-stars', 'class'),
     );
     $result = QueryList::Query($html, $probes, '');

     // No row at all
     if (!isset($result->jsonArr[0])) {
         return 1;
     }

     $row = $result->jsonArr[0];
     if (isset($row['type2'])) {
         return isset($row['type3']) ? 4 : 3;
     }

     return isset($row['type4']) ? 5 : 2;
 }
Exemplo n.º 18
0
// Import knowledge-base entries from home.cnblogs.com/kb into cn_kb_blogs.
// NOTE(review): the mysql_* extension used here was removed in PHP 7; it is
// kept because the connection is opened outside this fragment.
mysql_select_db(DB_NAME);
$sql = "select count(*) from cn_kb_blogs";
$res = mysql_query($sql);
$counts_info = mysql_fetch_array($res);
$exist_max_num = $counts_info['0'];
// 20 entries per page
$start_page = ceil($exist_max_num / 20);
$start_num = intval($exist_max_num % 20);
if ($start_num == 19) {
    $start_page++;
}
// Fetch the four pages after $start_page, highest page number first
for ($page = 4; $page > 0; $page--) {
    $fetch_page = $start_page + $page;
    // $page is always 1..4 inside this loop, so the page suffix is always appended
    $url = 'http://home.cnblogs.com/kb' . "/page/" . $fetch_page;
    $hj = QueryList::Query($url, $reg, $rang, 'UTF-8');
    $cn_blogs = $hj->data;
    $get_cnblogs_nums = count($cn_blogs) - 1;
    // Walk the rows of the page from last to first
    for ($i = $get_cnblogs_nums; $i >= 0; $i--) {
        $cn_blog = $cn_blogs[$i];
        // Scraped values are untrusted: escape them before building SQL
        $exist_sql = "select id from cn_kb_blogs where content_url='" . mysql_real_escape_string($cn_blog['content_url']) . "'";
        $exist_res = mysql_query($exist_sql);
        if ($exist_res === false) {
            // The existence check failed; do not risk inserting a duplicate
            continue;
        }
        $num_rows = mysql_num_rows($exist_res);
        if (empty($num_rows)) {
            // Values are inserted in the order title, content_url, kb_type
            $values = implode("','", array_map('mysql_real_escape_string', $cn_blog));
            $sql = "insert into cn_kb_blogs(title, content_url, kb_type) values('{$values}')";
            mysql_query($sql);
            echo 'success' . "<br>";
        }
    }
}
Exemplo n.º 19
0
//开启前台发布
if ($config['UserAddArticle'] == 0) {
    echo "<script>alert('前台会员不支持发布文章');location.href='ucenter.php'</script>";
    exit;
}
//微信文章导入
if ($_POST) {
    require 'QueryList.class.php';
    $long = guolv(trim($_POST['long']));
    $type_id = guolv(trim($_POST['type_id']));
    $html = get_contents($long);
    $money = $type_arr[2];
    $html = str_replace('data-src', 'src', $html);
    $caiji = array("title" => array(".rich_media_title:first", "text"), "content" => array("#js_content", "html"));
    $quyu = '';
    $hj = QueryList::Query($html, $caiji, $quyu);
    $arr = $hj->jsonArr;
    $title = $arr[0]['title'];
    $content = $arr[0]['content'];
    $pic = cut($html, 'var msg_cdn_url = "', '"');
    if (url_exists($long) == 1) {
        echo "<script>alert('网址不存在');location.href='weixin.php'</script>";
        exit;
    }
    if (is_numeric($type_id) == false) {
        echo "<script>alert('分类不存在');location.href='weixin.php'</script>";
        exit;
    }
    $row = $mysql->query("select * from `article` where `title`='{$title}' limit 1");
    if (!$row) {
        $arr = array('top' => 0, 'title' => $title, 'content' => $content, 'pic' => '[weixin]' . $pic, 'type' => $type_id, 'pv' => 0, 'pv_max' => '', 'money' => $money, 'day' => date("Y-m-d", time()));
Exemplo n.º 20
0
 /**
  * Forward to the static QueryList::getInstance().
  *
  * @param string $className class name passed through unchanged (default 'QueryList')
  * @param mixed  $params    parameters passed through unchanged
  * @return mixed whatever QueryList::getInstance() returns
  */
 public function getInstance($className = 'QueryList', $params = null)
 {
     $instance = QueryList::getInstance($className, $params);

     return $instance;
 }