echo "the page don't get data\n";
continue;
}

// Look for the pager's "next" anchor. Capture group 1 is its href, which is
// appended to $baseurl to build the URL of the following listing page.
preg_match_all('/<div class="sgoNext">\\s?<a href="(.*?)".*?>/', $str, $next_url);
if (empty($next_url[1][0])) {
    echo "don't get next page url\n";
    // No further page was found, so $url is removed.
    // NOTE(review): presumably the enclosing loop stops once $url is unset - confirm.
    unset($url);
} else {
    $url = $baseurl . $next_url[1][0];
    echo "next page url:", $url, "\n";
}

// Visit every product listed on the current page and read its details.
foreach ($goods_lists as $good) {
    echo "good page url:", $good['url'], "\n";

    // Fetch the product page and normalise the markup in three ordered passes:
    //   1. collapse every run of whitespace (including newlines) to one space,
    //   2. drop spaces that directly follow a "<",
    //   3. strip HTML comments.
    // preg_replace() applies array patterns one after another, each on the
    // result of the previous one.
    $str = preg_replace(
        array("/\\s+/", "/<[ ]+/si", "/<\\!--.*?-->/si"),
        array(" ", "<", ""),
        HttpGet($good['url'])
    );

    // Holds the raw match arrays produced by the regular expressions below,
    // keyed by field name.
    $match_arr = array();

    // Product number (SKU id). The script aborts when it cannot be found.
    preg_match('/<i class="gray09 fArial ml15">(.*?)<\\/i>/', $str, $match_arr['skuId']);
    if (empty($match_arr['skuId'][1])) {
        die("don't get 'skuId' data\n");
    }

    // Contents of the "goods_name_baike" span.
    preg_match('/<span id="goods_name_baike">(.*?)<\\/span>/', $str, $match_arr['name']);
<?php
/*
 * Pages through the Taobao item-list JSON endpoint and hands each page's
 * items to insert_db(), printing a per-page summary to the terminal.
 *
 * Paging is driven by the response itself: after every request $page and
 * $totalPage are refreshed from the decoded payload's "page" section.
 */
$page = 1;      // current page number
$totalPage = 0; // total number of pages
do {
    // Item offset of the requested page; 95 matches the pSize=95 query parameter.
    $s = ($page - 1) * 95;
    $url = "http://list.taobao.com/itemlist/market/nongye1.htm?_input_charset=utf-8&json=on&s=" . $s . "&atype=b&cat=50107919&style=list&at=4673%2C11138&as=0&viewIndex=1&spm=a2106.2206569.0.0.klNaSh&same_info=1&isnew=2&pSize=95&_ksTS=1405255051526_27";

    // Fetch the JSON payload.
    $json = HttpGet($url);

    // Fix up the encoding.
    // NOTE(review): presumably converts to UTF-8, judging by the variable name - confirm.
    $utf8_json = characet($json);

    // Decode the JSON into an associative array.
    $arr = json_decode($utf8_json, true);

    // A failed request or malformed payload decodes to null (or lacks the
    // paging section). Continuing would reset $page to null, which the
    // increment below turns back into 1, so the same page could be requested
    // forever. Stop instead.
    if (!is_array($arr) || !isset($arr['page']['currentPage'], $arr['page']['totalPage'])) {
        echo "failed to fetch or decode page " . $page . ", stopping\n";
        break;
    }

    // Current page number as reported by the server.
    $page = $arr['page']['currentPage'];
    // Total number of pages as reported by the server.
    $totalPage = $arr['page']['totalPage'];

    // Item list; treat a missing or non-array list as an empty page.
    $lists = (isset($arr['itemList']) && is_array($arr['itemList'])) ? $arr['itemList'] : array();
    // Number of items on this page.
    $count = count($lists);

    // Progress output.
    echo "获取....第" . $page . "页,共" . $count . "条信息\n";

    // Insert the items into the database; the return value is used as the
    // number of successfully inserted rows.
    $insert_num = insert_db($lists);

    // Result output.
    echo "\n";
    echo "成功" . $insert_num . "条 失败" . ($count - $insert_num) . "条\n\n";

    $page++;
// Every <a href="...">text</a> pair found in $v becomes one sub-category
// entry (link = capture group 1, name = capture group 2).
preg_match_all('/<a.*?href=\\"(.*?)\\".*?>(.*?)<\\/a>/i', $v, $temp_1);
for ($i = 0; $i < count($temp_1[1]); $i++) {
    $category_2[$k - 1][] = array('name' => $temp_1[2][$i], 'link' => $temp_1[1][$i]);
}
}

// Combine each top-level category with its list of sub-categories.
foreach ($category_1 as $k => $v) {
    $category[$k] = array(
        'fname' => $v[2][0],
        'flink' => $v[1][0],
        // A top-level category may have no sub-category matches; default to an
        // empty list so the count() in the crawl loop below never receives null.
        'children' => isset($category_2[$k]) ? $category_2[$k] : array(),
    );
}

// Crawl the listing page of every sub-category.
foreach ($category as $k => $v) {
    for ($i = 0; $i < count($v['children']); $i++) {
        $url = $v['children'][$i]['link'];
        $str = HttpGet($url);

        // Normalise the markup before matching.
        $str = preg_replace("/\\s+/", " ", $str);           // collapse whitespace runs (incl. newlines)
        $str = preg_replace("/<[ ]+/si", "<", $str);        // drop spaces directly after "<"
        $str = preg_replace("/<\\!--.*?-->/si", "", $str);  // strip HTML comments

        // Price: an integer with an optional decimal part. The dot is escaped
        // so only a literal "." is accepted as the separator, and the decimal
        // part is optional so integer prices are captured too; that keeps this
        // array aligned with the other per-item arrays.
        preg_match_all('/<span class=\\"c-price\\">(\\d+(?:\\.\\d+)?)/', $str, $price);
        // Item name.
        preg_match_all('/<a class="item-name".*?>(.*?)<\\/a>/', $str, $name);
        // Sales count.
        preg_match_all('/<span class="sale-num">(\\d+)<\\/span>/', $str, $sale_num);
        // Item id taken from the detail-page link (dots in the host and path are literal).
        preg_match_all('/<a\\sclass="item-name".*?href=\\"http:\\/\\/detail\\.tmall\\.com\\/item\\.htm\\?id=(\\d+)&/', $str, $skuId);