/**
 * Runs one search-and-click pass for the keyword row whose id is held in the global $nid.
 *
 * For every matching row of the `keyword` table this launches
 * /var/html/casperjs/pcntl/process_single.js through casperjs against a Taobao search URL,
 * then increments clicked_times, stamps last_click_time with the current time and sets
 * run_status to 'free'.
 *
 * The plain Taobao item search URL is used when path1_page <= path2_page; otherwise the
 * URL carrying `tab=mall` is used. Everything else on the command line is identical.
 *
 * Prints "zz" and returns without doing anything when no row matches.
 *
 * Changes compared with the previous revision:
 *  - `continue` was used outside any loop (a fatal error in PHP); it is now `return`.
 *  - The global $nid is cast to int before being placed in SQL.
 *  - Shell arguments go through escapeshellarg() instead of hand-written double quotes.
 *  - The global $nid is no longer overwritten with the row's item nid.
 *
 * @global mixed $nid Id of the `keyword` row to process (read only).
 * @return void
 */
function crawler()
{
    global $nid;

    $proxyObj = new proxy();
    // NOTE(review): database credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');

    // Integer cast keeps an unexpected value in the global from altering the statement.
    $sql = "SELECT * FROM keyword WHERE id = " . (int) $nid;
    $result = $mysqli->query($sql);

    $rows = array();
    if ($result) {
        while ($row = $result->fetch_object()) {
            $rows[] = $row;
        }
        $result->close();
    }

    if (!$rows) {
        echo "zz\n";
        return;
    }

    foreach ($rows as $row) {
        $kwd = urlencode($row->kwd);
        // Kept in a local so the global $nid (the keyword row id) is left untouched.
        $itemNid = $row->nid;
        $date = date('Ymd');
        $ua = 'aa';
        $proxy = $proxyObj->getProxy();

        if ($row->path1_page <= $row->path2_page) {
            // Taobao item search.
            $searchUrl = 'http://s.taobao.com/search?&initiative_id=tbindexz_' . $date
                . '&spm=1.7274553.1997520841.1&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&q='
                . $kwd . '&suggest=0_2';
        } else {
            // Taobao search, Tmall tab.
            $searchUrl = 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date
                . '&tab=mall&q=' . $kwd . '&suggest=0_2';
        }

        $searchSelector = ".item[nid='" . $itemNid . "'] h3 a";
        $nextSelector = ".page-next";

        // Positional script arguments: search URL, item selector, next-page selector,
        // sleep time, user agent.
        $cmd = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk"
            . " --proxy=" . escapeshellarg((string) $proxy)
            . " /var/html/casperjs/pcntl/process_single.js"
            . " " . escapeshellarg($searchUrl)
            . " " . escapeshellarg($searchSelector)
            . " " . escapeshellarg($nextSelector)
            . " " . escapeshellarg((string) $row->sleep_time)
            . " " . escapeshellarg($ua);

        system($cmd);

        $sql = "UPDATE keyword SET clicked_times = clicked_times + 1, last_click_time = " . time()
            . ", run_status = 'free' WHERE id = " . (int) $row->id;
        $mysqli->query($sql);
    }
}
/**
 * Loads a single `keyword` row and prepares the values needed for a click run.
 *
 * NOTE(review): in the visible code nothing is done with the prepared values: no command
 * is built or executed and no row is updated. This looks like an unfinished variant;
 * confirm before relying on it.
 *
 * The row is loaded by a fixed id (16). A disabled query in the earlier revision selected
 * instead the active keyword with the oldest last_click_time that was still under its
 * click quota, whose click interval had elapsed, and whose path1_page or path2_page was
 * between 1 and 4.
 *
 * Prints "zz", sleeps one second and returns when the row cannot be loaded.
 *
 * Changes compared with the previous revision:
 *  - `continue` was used outside any loop (a fatal error in PHP); it is now `return`.
 *  - $obj is initialised so a failed query no longer dereferences an undefined variable.
 *
 * @return void
 */
function crawler()
{
    $proxyObj = new proxy();
    // NOTE(review): database credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');

    // NOTE(review): fixed row id, presumably left over from debugging; confirm.
    $sql = "SELECT * FROM keyword WHERE id = 16";
    $result = $mysqli->query($sql);

    $obj = null;
    if ($result) {
        $obj = $result->fetch_object();
        $result->close();
    }

    if (!$obj || !$obj->id) {
        echo "zz\n";
        sleep(1);
        return;
    }

    $kwd = urlencode($obj->kwd);
    $nid = $obj->nid;
    $date = date('Ymd');
    $sleep_time = $obj->sleep_time;

    // Cumulative thresholds: $path2 is the sum of the path1 and path2 columns.
    $path1 = (int) $obj->path1;
    $path2 = $path1 + (int) $obj->path2;
    if ($obj->path1_page > 5) {
        // A path1_page above 5 zeroes the first threshold and sets the second to 100.
        $path1 = 0;
        $path2 = 100;
    }

    $ua = 'aa';
    $proxy = $proxyObj->getProxy(true);
    $rand = rand(1, 100);
}
/**
 * Probes every not-yet-detected keyword on three search paths and records the result.
 *
 * For each `keyword` row with is_detected = 0 this runs /var/html/casperjs/detector.js
 * through casperjs three times:
 *   - path1_page: Taobao item search
 *   - path2_page: Taobao search, Tmall tab
 *   - path3_page: Tmall search (uses the proxy returned by getProxy(true) and
 *                 --ignore-ssl-errors=true)
 *
 * system() passes the command output through and returns its last line. When that line
 * starts with a digit, its integer value plus one is stored in the matching column, but
 * only if the stored value is larger or is -1. Otherwise "error" is printed and the
 * column is left alone. The row is marked is_detected = 1 after the three probes,
 * whatever their outcome.
 *
 * Terminates the whole script with "Done" when there is nothing to detect.
 *
 * Changes compared with the previous revision:
 *  - The three copy-pasted probe stanzas are driven from one table.
 *  - Shell arguments go through escapeshellarg() instead of hand-written double quotes.
 *  - The Tmall selector had a stray space inside the attribute value
 *    (data-id=' 123'), which cannot match a numeric id; the space is removed.
 *  - A false return from system() is handled explicitly.
 *  - Row ids are cast to int before being placed in SQL.
 *
 * @return void
 */
function detector()
{
    // NOTE(review): database credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');
    $proxyObj = new proxy();

    $result = $mysqli->query("SELECT * FROM keyword WHERE is_detected = 0");
    $rows = array();
    if ($result) {
        while ($row = $result->fetch_object()) {
            $rows[] = $row;
        }
        $result->close();
    }

    if (!$rows) {
        echo "zz\n";
        exit("Done\n");
    }

    foreach ($rows as $row) {
        echo $row->kwd . "\n";

        $id = (int) $row->id;
        $kwd = urlencode($row->kwd);
        $nid = $row->nid;
        $date = date('Ymd');
        $ua = 'aa';
        $proxy = $proxyObj->getProxy();
        $httpsProxy = $proxyObj->getProxy(true);

        $taobaoPrefix = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy="
            . escapeshellarg((string) $proxy) . " /var/html/casperjs/detector.js";
        $tmallPrefix = "/usr/bin/casperjs /var/html/casperjs/detector.js --ignore-ssl-errors=true --proxy="
            . escapeshellarg((string) $httpsProxy) . " --output-encoding=gbk --script-encoding=gbk";

        // Keyed by the `keyword` column that receives the detected page depth.
        // The keys are fixed literals, so interpolating them into SQL below is safe.
        $probes = array(
            'path1_page' => array(
                'prefix' => $taobaoPrefix,
                'url' => 'http://s.taobao.com/search?&initiative_id=tbindexz_' . $date
                    . '&spm=1.7274553.1997520841.1&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&q='
                    . $kwd . '&suggest=0_2',
                'selector' => ".item[nid='" . $nid . "'] h3 a",
                'next' => ".page-next",
            ),
            'path2_page' => array(
                'prefix' => $taobaoPrefix,
                'url' => 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date
                    . '&tab=mall&q=' . $kwd . '&suggest=0_2',
                'selector' => ".item[nid='" . $nid . "'] h3 a",
                'next' => ".page-next",
            ),
            'path3_page' => array(
                'prefix' => $tmallPrefix,
                'url' => 'http://list.tmall.com/search_product.htm?q=' . $kwd
                    . '&type=p&vmarket=&spm=3.7396704.a2227oh.d100&from=mallfp..pc_1_searchbutton',
                'selector' => ".product[data-id='" . $nid . "'] div .productTitle a",
                'next' => "a.ui-page-s-next",
            ),
        );

        foreach ($probes as $column => $probe) {
            // Positional script arguments: search URL, item selector, next-page selector,
            // sleep time, user agent.
            $cmd = $probe['prefix']
                . " " . escapeshellarg($probe['url'])
                . " " . escapeshellarg($probe['selector'])
                . " " . escapeshellarg($probe['next'])
                . " " . escapeshellarg((string) $row->sleep_time)
                . " " . escapeshellarg($ua);

            $lastLine = system($cmd);
            echo $lastLine . "\n";

            if ($lastLine === false || !preg_match('/^[0-9]/', $lastLine)) {
                echo "error\n";
                continue;
            }

            // Only ever lower the stored depth, or replace the -1 placeholder.
            $depth = (int) $lastLine + 1;
            $sql = "UPDATE keyword SET {$column} = {$depth} WHERE id = {$id}"
                . " AND ({$column} > {$depth} OR {$column} = -1)";
            $mysqli->query($sql);
        }

        $mysqli->query("UPDATE keyword SET is_detected = 1 WHERE id = " . $id);
    }
}
/**
 * Loads one `keyword` row, picks one of three search paths at random and runs a click on it.
 *
 * A number from 1 to 100 is drawn and compared with cumulative thresholds taken from the
 * row's path1 and path2 columns:
 *   - draw <= path1          : Taobao item search (URL from keyword::buildSearchUrl())
 *   - draw <= path1 + path2  : Taobao search, Tmall tab
 *   - otherwise              : Tmall search (proxy from getProxy(true),
 *                              --ignore-ssl-errors=true)
 *
 * When the chosen path's *_page column is 5 or more the function returns without running
 * anything. Otherwise it stamps last_click_time, prints and runs the casperjs command
 * for /var/html/casperjs/pcntl/process.js, and increments clicked_times.
 *
 * Prints "zz", sleeps one second and returns when the row cannot be loaded.
 *
 * The row is loaded by a fixed id (13). A disabled query in the earlier revision selected
 * instead the active keyword with the oldest last_click_time that was still under its
 * click quota, whose click interval had elapsed, and with at least one path page between
 * 1 and 4.
 *
 * Changes compared with the previous revision:
 *  - `continue` was used outside any loop (a fatal error in PHP); it is now `return`.
 *  - $obj is initialised so a failed query no longer dereferences an undefined variable.
 *  - The `$rand = 5` override and the `echo $search_url; exit;` short-circuit, which made
 *    most of the function unreachable, are removed. NOTE(review): these looked like
 *    debugging leftovers; confirm that running the real click is intended.
 *  - $proxy was undefined on the first path because its assignment had been commented
 *    out; it is fetched again.
 *  - The Tmall selector had a stray space inside the attribute value
 *    (data-id=' 123'); the space is removed.
 *  - Shell arguments go through escapeshellarg() instead of hand-written double quotes.
 *
 * @return void
 */
function crawler()
{
    $proxyObj = new proxy();
    // NOTE(review): database credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');

    $current = time();

    // NOTE(review): fixed row id, presumably left over from debugging; confirm.
    $sql = "SELECT * FROM keyword WHERE id = 13 LIMIT 1";
    $result = $mysqli->query($sql);

    $obj = null;
    if ($result) {
        $obj = $result->fetch_object();
        $result->close();
    }

    if (!$obj || !$obj->id) {
        echo "zz\n";
        sleep(1);
        return;
    }

    $id = (int) $obj->id;
    $kwd = urlencode($obj->kwd);
    $nid = $obj->nid;
    $date = date('Ymd');
    $ua = 'aa';

    // Cumulative thresholds; any draw above $path2 falls through to the Tmall search.
    $path1 = (int) $obj->path1;
    $path2 = $path1 + (int) $obj->path2;

    $keyword = new keyword();
    $rand = rand(1, 100);

    if ($rand <= $path1) {
        // Taobao item search.
        $search_url = $keyword->buildSearchUrl(array(
            'path' => 'taobao',
            'kwd' => $kwd,
            'date' => $date,
            'region' => $obj->path1_region,
            'price_from' => $obj->path1_price_from,
            'price_to' => $obj->path1_price_to,
        ));
        if ($obj->path1_page >= 5) {
            return;
        }
        $proxy = $proxyObj->getProxy();
        $search_selector = ".item[nid='" . $nid . "'] h3 a";
        $next_selector = ".page-next";
        $prefix = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy="
            . escapeshellarg((string) $proxy) . " /var/html/casperjs/pcntl/process.js";
    } elseif ($rand <= $path2) {
        // Taobao search, Tmall tab.
        if ($obj->path2_page >= 5) {
            return;
        }
        $proxy = $proxyObj->getProxy();
        $search_url = 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date
            . '&tab=mall&q=' . $kwd . '&suggest=0_2';
        $search_selector = ".item[nid='" . $nid . "'] h3 a";
        $next_selector = ".page-next";
        $prefix = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy="
            . escapeshellarg((string) $proxy) . " /var/html/casperjs/pcntl/process.js";
    } else {
        // Tmall search.
        if ($obj->path3_page >= 5) {
            return;
        }
        $proxy = $proxyObj->getProxy(true);
        $search_url = 'http://list.tmall.com/search_product.htm?q=' . $kwd
            . '&type=p&vmarket=&spm=3.7396704.a2227oh.d100&from=mallfp..pc_1_searchbutton';
        $search_selector = ".product[data-id='" . $nid . "'] div .productTitle a";
        $next_selector = "a.ui-page-s-next";
        $prefix = "/usr/bin/casperjs /var/html/casperjs/pcntl/process.js --ignore-ssl-errors=true --proxy="
            . escapeshellarg((string) $proxy) . " --output-encoding=gbk --script-encoding=gbk";
    }

    // Positional script arguments: search URL, item selector, next-page selector,
    // sleep time, user agent.
    $cmd = $prefix
        . " " . escapeshellarg((string) $search_url)
        . " " . escapeshellarg($search_selector)
        . " " . escapeshellarg($next_selector)
        . " " . escapeshellarg((string) $obj->sleep_time)
        . " " . escapeshellarg($ua);

    // Stamp the attempt before launching it, as the previous revision did.
    $sql = "UPDATE keyword SET last_click_time = {$current} WHERE id = {$id}";
    $mysqli->query($sql);

    echo $cmd . "\n";
    system($cmd);

    $sql = "UPDATE keyword SET clicked_times = clicked_times + 1 WHERE id = " . $id;
    $mysqli->query($sql);
}