Exemple #1
0
 /**
  * Demo action: downloads a remote image and prints MD5 checksums for comparison.
  *
  * Steps performed by the live code:
  *  1. Fetch a fixed PNG URL through http_client_request().
  *  2. Print the MD5 of the downloaded payload.
  *  3. Store the payload as APP . 'tmp' . DS . 'baidu-save.png'.
  *  4. Print the MD5 of the stored file and of the reference copy
  *     'baidu-org.png' in the same directory, so the values can be compared.
  *
  * The commented-out snippets at the top are phpQuery usage samples kept for
  * reference; they are not executed.
  *
  * @return void
  */
 public function index()
 {
     // echo 'Dedicated Ecshop scraping endpoint';
     // $str = $this->_getTaskList();//print_r($str);print_r($GLOBALS['run_sql']);
     /* phpQuery demo */
     // phpQuery::newDocumentFile('http://www.tomdurrie.com/ding-g63517.html');
     // $test =  pq(".activity")->html();
     // echo trim($test);
     /* Reading XML */
     // phpQuery::newDocumentFile('http://www.helloweba.com/feed');
     // $title_list = pq('item>title');print_r($title_list);exit;
     // foreach($title_list as $li){
     //     echo pq($li)->html();
     //     echo '<br/>';
     // }
     //
     /**
      * Fetch the product link list from the tomdurrie.com search page
      */
     // phpQuery::newDocumentFile('http://www.tomdurrie.com/search.php?page=1');
     // $goods_list = pq('.hoverlist');
     // foreach($goods_list as $li){
     //     $goods[] = array(
     //             'url' => pq($li)->find('a')->attr('href'),
     //             'image' => 'http://www.tomdurrie.com/'.pq($li)->find('img')->attr('src'),
     //         );
     // }
     // print_r($goods);
     /**
      * Fetch the details of a single product
      */
     // phpQuery::newDocumentFile('http://www.tomdurrie.com/ding-g63256.html');
     // $goods_gallerys = pq('.gallery>#demo>#demo1>ul>li');
     // foreach($goods_gallerys as $li){
     //     $goods_images[] = array(
     //         'org_img' => 'http://www.tomdurrie.com/'.pq($li)->find('a')->attr('rev'),
     //         'thumb_img' => 'http://www.tomdurrie.com/'.pq($li)->find('img')->attr('src'),
     //     );
     // }
     // // Note: source ids above 47900 carry no watermark; presumably this should be
     // // checked for pages before http://www.tomdurrie.com/search.php?page=380
     // $goods_info = array(
     //     'title' => pq('h1')->html(),
     //     'cat_name' => pq('#ur_here>.f_l>a:eq(1)')->html(),
     //     'org_id' => pq('input[name="id"]')->attr('value'),
     //     'sn' => pq('.props>dl:eq(0)>dd')->html(),
     //     'brand' => pq('.props>dl:eq(1)>dd')->html(),
     //     'price' => pq('#ECS_SHOPPRICE')->html(),
     //     'detail' => '<table>' . pq('div>table')->html() . '</table>',
     //     'images' => $goods_images,
     // );
     // print_r($goods_info);
     /**
      * Download an image and verify it survives the round trip to disk
      */
     $save_dir = APP . 'tmp' . DS;
     $saved_file = $save_dir . 'baidu-save.png';
     $reference_file = $save_dir . 'baidu-org.png';
     $url = 'http://su.bdimg.com/static/superplus/img/logo_white_ee663702.png';

     $img = http_client_request($url);
     // NOTE(review): assumes http_client_request() returns the response body as a
     // string; anything else (or an empty body) is treated as a failed download.
     if (!is_string($img) || $img === '') {
         echo '<hr>Download failed: ' . $url;
         return;
     }

     // MD5 of the binary payload as received
     echo '<hr>Curl获取的二进制流文件MD5值:<br>' . md5($img);

     // The file must be truncated and rewritten (the original used fopen mode 'w'),
     // otherwise the stored MD5 differs. file_put_contents() without FILE_APPEND
     // does exactly that, and its return value lets us detect a failed or short write.
     $written = file_put_contents($saved_file, $img);
     if ($written === false || $written !== strlen($img)) {
         echo '<hr>Could not write ' . $saved_file;
         return;
     }

     echo "\n";
     $fmd5 = '<hr>通过fwrite保存图片MD5:<br>' . md5_file($saved_file);
     echo $fmd5;

     // The reference copy is optional; avoid md5_file() warnings when it is absent.
     if (is_file($reference_file)) {
         echo '<hr>原图md5:<br>' . md5_file($reference_file);
     } else {
         echo '<hr>原图md5:<br>' . '(reference file not found)';
     }
     // header("content-type:image/jpeg");
     // echo $img;
 }
Exemple #2
0
 // Crawl the list pages of one task and store the content URLs found on them.
 // $v is the current task row provided by the surrounding code (not visible here);
 // this fragment reads its 'id', 'status', 'charset', 'node_name' and 'link_rules' keys.
 $task_list_id = $v['id'];
 $task_status = $v['status'];
 $chatset = $v['charset'];
 $rules = array();
 if ($task_status === 'yes') {
     // Progress messages are printed through the shell so that `echo -e` renders the
     // ANSI colour codes. Every dynamic part goes through escapeshellarg(): node names
     // and scraped URLs must never be able to break out of the quoted echo argument.
     system("echo -e '开始抓取\\033[34m['" . escapeshellarg((string) $v['node_name']) . "']\\033[0m...'");

     $rules = json_decode($v['link_rules'], true);
     if (!is_array($rules)) {
         // Malformed or empty rule JSON: continue with no rules instead of indexing null.
         $rules = array();
     }
     $list_rule = isset($rules['list_rule']) ? $rules['list_rule'] : null;
     $list_area = isset($rules['list_area']) ? $rules['list_area'] : null;

     // Build the list-page URLs to visit
     $link_list = get_link_list($list_rule);
     if (!is_array($link_list)) {
         // NOTE(review): get_link_list() is assumed to return an array of URLs;
         // normalise anything else so count()/foreach below stay safe.
         $link_list = array();
     }
     $link_list_count = count($link_list);
     system("echo -e '获取到\\033[32m[" . $link_list_count . "]\\033[0m个列表列表链接,准备获取文章链接...'");

     foreach ($link_list as $vo) {
         system("echo -e '获取列表内容链接: \\033[32m'" . escapeshellarg((string) $vo) . "'\\033[0m'");

         $content = http_client_request($vo);

         // iconv() returns the converted string and leaves its input untouched, so the
         // result has to be assigned back. Keep the original bytes if conversion fails.
         if (!empty($chatset) && strcasecmp($chatset, 'utf-8') !== 0 && is_string($content)) {
             $converted = iconv($chatset, "UTF-8", $content);
             if ($converted !== false) {
                 $content = $converted;
             }
         }
         // print_r($content);exit;

         $target_urls = get_content_url_list($content, $list_area);
         if (empty($target_urls)) {
             // No content links could be detected on this list page
             system("echo -e '探测链接列表失败: \\033[31m'" . escapeshellarg((string) $vo) . "'\\033[0m'");
             insert_log($vo, '探测链接列表失败');
         } else {
             insert_urls($target_urls, $task_list_id, true);
         }
     }
 }
Exemple #3
0
$url_count = count($url_list);
system('echo -e "\\033[32m 获取到' . $url_count . '条要采集的内容... \\033[0m"');
if (!empty($url_list)) {
    foreach ($url_list as $v) {
        $tmp_url_data[$v['task_list_id']][] = $v;
    }
    foreach ($tmp_url_data as $ko => $vo) {
        $url_info = get_line(prepare('select * from task_list where id=?i limit 1', array($ko)));
        $content_rules = $url_info['content_rules'];
        $content_rules = json_decode($content_rules, true);
        $chatset = $content_rules['charset'];
        if (!empty($content_rules['type'])) {
            foreach ($vo as $va) {
                system("echo -e '获取内容链接: \\033[32m" . $va['url'] . "\\033[0m'");
                $html = '';
                $html = http_client_request($va['url']);
                if ($chatset != 'utf-8') {
                    iconv($chatset, "UTF-8", $html);
                }
                if (empty($html)) {
                    if ($va['error_time'] >= 3) {
                        update('task_url', array('need_push' => 'no'), array('hash' => $va['hash']));
                    } else {
                        update('task_url', array('error_time' => $va['error_time'] + 1), array('hash' => $va['hash']));
                    }
                    system("echo -e '获取内容链接: \\033[32m" . $va['url'] . "\\033[0m失败" . ($va['error_time'] + 1) . "次' ");
                    continue;
                }
                $content_data = get_content($html, $content_rules);
                if (!empty($content_data) && (!empty($content_data['content']) || !empty($content_data['pic']))) {
                    $_save = array('list_id' => $ko, 'content' => $content_data['content'], 'pic' => $content_data['pic'], 'url_id' => $va['id'], 'url_hash' => $va['hash'], 'url_link' => $va['url']);