/**
 * Image-download self check.
 *
 * Fetches a fixed remote PNG through http_client_request(), stores it as
 * baidu-save.png under APP . 'tmp' . DS, and echoes three MD5 values so they
 * can be compared by eye: the in-memory payload, the saved copy, and the
 * reference file baidu-org.png from the same directory.
 *
 * On a failed download or a failed write, a short message is echoed and the
 * method returns early instead of hashing a missing or partial file.
 */
public function index() {
    $save_dir = APP . 'tmp' . DS;
    $saved_file = $save_dir . 'baidu-save.png';
    $reference_file = $save_dir . 'baidu-org.png';
    $url = 'http://su.bdimg.com/static/superplus/img/logo_white_ee663702.png';

    $img = http_client_request($url);
    if (!is_string($img) || $img === '') {
        // Nothing usable came back; hashing or saving it would be meaningless.
        echo '<hr>下载图片失败:<br>' . $url;
        return;
    }

    // MD5 of the raw bytes as received.
    echo '<hr>Curl获取的二进制流文件MD5值:<br>' . md5($img);

    // The file must be overwritten, never appended to, or its MD5 will differ
    // from the payload. file_put_contents() truncates by default and writes the
    // bytes unmodified; the byte count is compared to catch short writes.
    if (!is_dir($save_dir) || file_put_contents($saved_file, $img) !== strlen($img)) {
        echo '<hr>保存图片失败:<br>' . $saved_file;
        return;
    }
    unset($img, $url);

    echo "\n";
    echo '<hr>通过fwrite保存图片MD5:<br>' . md5_file($saved_file);

    // The reference image is optional: print an empty hash when it is absent
    // rather than letting md5_file() raise a warning.
    echo '<hr>原图md5:<br>' . (is_file($reference_file) ? md5_file($reference_file) : '');
}
// Per-task settings read from the current task row in $v.
$task_list_id = $v['id'];
$task_status = $v['status'];
$chatset = $v['charset'];
$rules = array();
if ($task_status == 'yes') {
    // Values below are spliced into single-quoted shell arguments. Any embedded
    // single quote is rewritten as '\'' so it cannot terminate the argument.
    system("echo -e '开始抓取\\033[34m[" . str_replace("'", "'\\''", (string) $v['node_name']) . "]\\033[0m...'");
    $rules = json_decode($v['link_rules'], true);

    // Expand the list rule into the list-page URLs to crawl.
    $link_list = get_link_list($rules['list_rule']);
    $link_list_count = count($link_list);
    system("echo -e '获取到\\033[32m[" . $link_list_count . "]\\033[0m个列表列表链接,准备获取文章链接...'");

    foreach ($link_list as $vo) {
        $vo_quoted = str_replace("'", "'\\''", (string) $vo);
        system("echo -e '获取列表内容链接: \\033[32m" . $vo_quoted . "\\033[0m'");
        $content = http_client_request($vo);

        // iconv() returns the converted text and leaves its argument untouched,
        // so the result has to be assigned back. Conversion is skipped when the
        // charset is missing or already UTF-8 (compared case-insensitively), and
        // the original bytes are kept if iconv() reports failure.
        if (is_string($content) && $content !== '' && !empty($chatset) && strcasecmp($chatset, 'utf-8') !== 0) {
            $converted = iconv($chatset, 'UTF-8', $content);
            if ($converted !== false) {
                $content = $converted;
            }
        }

        // Extract the article URLs from the list page.
        $target_urls = get_content_url_list($content, $rules['list_area']);
        if (empty($target_urls)) {
            // No links found on this list page: report it and log the URL.
            system("echo -e '探测链接列表失败: \\033[31m" . $vo_quoted . "\\033[0m'");
            $result_errr = insert_log($vo, '探测链接列表失败');
        } else {
            insert_urls($target_urls, $task_list_id, true);
        }
    }
}
$url_count = count($url_list); system('echo -e "\\033[32m 获取到' . $url_count . '条要采集的内容... \\033[0m"'); if (!empty($url_list)) { foreach ($url_list as $v) { $tmp_url_data[$v['task_list_id']][] = $v; } foreach ($tmp_url_data as $ko => $vo) { $url_info = get_line(prepare('select * from task_list where id=?i limit 1', array($ko))); $content_rules = $url_info['content_rules']; $content_rules = json_decode($content_rules, true); $chatset = $content_rules['charset']; if (!empty($content_rules['type'])) { foreach ($vo as $va) { system("echo -e '获取内容链接: \\033[32m" . $va['url'] . "\\033[0m'"); $html = ''; $html = http_client_request($va['url']); if ($chatset != 'utf-8') { iconv($chatset, "UTF-8", $html); } if (empty($html)) { if ($va['error_time'] >= 3) { update('task_url', array('need_push' => 'no'), array('hash' => $va['hash'])); } else { update('task_url', array('error_time' => $va['error_time'] + 1), array('hash' => $va['hash'])); } system("echo -e '获取内容链接: \\033[32m" . $va['url'] . "\\033[0m失败" . ($va['error_time'] + 1) . "次' "); continue; } $content_data = get_content($html, $content_rules); if (!empty($content_data) && (!empty($content_data['content']) || !empty($content_data['pic']))) { $_save = array('list_id' => $ko, 'content' => $content_data['content'], 'pic' => $content_data['pic'], 'url_id' => $va['id'], 'url_hash' => $va['hash'], 'url_link' => $va['url']);