Exemplo n.º 1
0
/**
 * Run one CasperJS pass for the keyword row whose id is held in the global $nid.
 *
 * Loads the matching row(s) from the `keyword` table, launches
 * process_single.js through a proxy for each of them and afterwards
 * increments clicked_times, stamps last_click_time and sets run_status to
 * 'free' on that row. Prints "zz" and returns when no row matches.
 *
 * @return void
 */
function crawler()
{
    // Row id to process; it is compared against keyword.id below.
    global $nid;

    $proxyObj = new proxy();
    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');

    // The id is cast to int so it can never change the shape of the statement.
    $sql = "SELECT * FROM keyword WHERE id = " . (int) $nid;
    $result = $mysqli->query($sql);
    $data = array();
    if ($result) {
        while ($obj = $result->fetch_object()) {
            $data[] = $obj;
        }
        $result->close();
    }
    if (!$data) {
        echo "zz\n";
        // The surrounding for (;;) loop is disabled, so `continue` is not legal
        // here (fatal error on PHP 7+); leave the function instead.
        return;
    }

    foreach ($data as $obj) {
        $kwd = urlencode($obj->kwd);
        // Kept in a local: assigning to $nid would overwrite the global row id.
        $item_nid = $obj->nid;
        $date = date('Ymd');
        $ua = 'aa';
        $proxy = $proxyObj->getProxy();

        // Only the search URL differs between the two paths; the URL with
        // tab=mall is used when path2_page is the smaller of the two values.
        if ($obj->path1_page <= $obj->path2_page) {
            $search_url = 'http://s.taobao.com/search?&initiative_id=tbindexz_' . $date . '&spm=1.7274553.1997520841.1&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&q=' . $kwd . '&suggest=0_2';
        } else {
            $search_url = 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date . '&tab=mall&q=' . $kwd . '&suggest=0_2';
        }
        $search_selector = ".item[nid='" . $item_nid . "'] h3 a";
        $next_selector = ".page-next";

        // Every dynamic value is passed through escapeshellarg() so that shell
        // metacharacters or an empty value cannot break or shift the arguments.
        $cmd = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk"
            . " --proxy=" . escapeshellarg((string) $proxy)
            . " /var/html/casperjs/pcntl/process_single.js"
            . " " . escapeshellarg($search_url)
            . " " . escapeshellarg($search_selector)
            . " " . escapeshellarg($next_selector)
            . " " . escapeshellarg((string) $obj->sleep_time)
            . " " . escapeshellarg($ua);
        system($cmd);

        $sql = "UPDATE keyword SET clicked_times = clicked_times + 1, last_click_time = " . time() . ", run_status = 'free' WHERE id = " . (int) $obj->id;
        $mysqli->query($sql);
    }
}
Exemplo n.º 2
0
/**
 * Load a single keyword row and derive the per-run parameters from it.
 *
 * Prints "zz", sleeps for one second and returns when the row cannot be
 * loaded.
 *
 * NOTE(review): the values computed at the end of the function are not
 * consumed by anything in the visible body; the snippet looks unfinished.
 *
 * @return void
 */
function crawler()
{
    $proxyObj = new proxy();
    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');

    // Disabled selection query, kept for reference (it needs $current = time()):
    //$sql = "SELECT * FROM keyword WHERE status = 'active' AND clicked_times < times AND ((last_click_time + click_interval) < {$current}) AND ((path1_page < 5 AND path1_page > 0) OR (path2_page < 5 AND path2_page > 0)) ORDER BY last_click_time ASC LIMIT 1";
    $sql = "SELECT * FROM keyword WHERE id = 16";
    $result = $mysqli->query($sql);

    // Initialised up front: the query may fail, and fetch_object() yields
    // null when there is no row, so $obj must not be dereferenced blindly.
    $obj = null;
    if ($result) {
        $obj = $result->fetch_object();
        $result->close();
    }
    if (!$obj || !$obj->id) {
        echo "zz\n";
        sleep(1);
        // The surrounding for (;;) loop is disabled, so `continue` is not legal
        // here (fatal error on PHP 7+); leave the function instead.
        return;
    }

    $kwd = urlencode($obj->kwd);
    $nid = $obj->nid;
    $date = date('Ymd');
    $sleep_time = $obj->sleep_time;

    // $path2 is cumulative: it is the sum of the path1 and path2 column values.
    $path1 = (int) $obj->path1;
    $path2 = $path1 + (int) $obj->path2;
    if ($obj->path1_page > 5) {
        $path1 = 0;
        $path2 = 100;
    }

    $ua = 'aa';
    $proxy = $proxyObj->getProxy(true);
    $rand = rand(1, 100);
}
Exemplo n.º 3
0
/**
 * Probe every keyword row that has is_detected = 0 and record the results.
 *
 * For each row three CasperJS runs of detector.js are started (taobao
 * search, taobao search with the tmall tab, tmall search). When the last
 * output line of a run starts with a digit, that number plus one is stored
 * in the matching pathN_page column, but only if the stored value is larger
 * or equal to -1. The row is then marked with is_detected = 1.
 *
 * Terminates the script with "Done" when there is nothing to process.
 *
 * @return void
 */
function detector()
{
    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');
    $proxyObj = new proxy();

    $result = $mysqli->query("SELECT * FROM keyword WHERE is_detected = 0");
    $data = array();
    if ($result) {
        while ($obj = $result->fetch_object()) {
            $data[] = $obj;
        }
        $result->close();
    }
    if (!$data) {
        echo "zz\n";
        exit("Done\n");
    }

    foreach ($data as $obj) {
        echo $obj->kwd . "\n";
        $kwd = urlencode($obj->kwd);
        $nid = $obj->nid;
        $date = date('Ymd');
        // Cast once so the id can never change the shape of the UPDATE statements.
        $id = (int) $obj->id;
        $sleep_time = $obj->sleep_time;
        $proxy = $proxyObj->getProxy();
        $httpsProxy = $proxyObj->getProxy(true);
        $ua = 'aa';

        // One entry per probe, in execution order. 'prefix' is the command up
        // to (and including) the script and its options; the positional
        // arguments are appended in the loop below.
        $probes = array(
            // taobao search
            array(
                'column' => 'path1_page',
                'prefix' => "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy=" . escapeshellarg((string) $proxy) . " /var/html/casperjs/detector.js",
                'url' => 'http://s.taobao.com/search?&initiative_id=tbindexz_' . $date . '&spm=1.7274553.1997520841.1&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&q=' . $kwd . '&suggest=0_2',
                'selector' => ".item[nid='" . $nid . "'] h3 a",
                'next' => ".page-next",
            ),
            // taobao search tmall tab
            array(
                'column' => 'path2_page',
                'prefix' => "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy=" . escapeshellarg((string) $proxy) . " /var/html/casperjs/detector.js",
                'url' => 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date . '&tab=mall&q=' . $kwd . '&suggest=0_2',
                'selector' => ".item[nid='" . $nid . "'] h3 a",
                'next' => ".page-next",
            ),
            // tmall search
            array(
                'column' => 'path3_page',
                'prefix' => "/usr/bin/casperjs /var/html/casperjs/detector.js --ignore-ssl-errors=true --proxy=" . escapeshellarg((string) $httpsProxy) . " --output-encoding=gbk --script-encoding=gbk",
                'url' => 'http://list.tmall.com/search_product.htm?q=' . $kwd . '&type=p&vmarket=&spm=3.7396704.a2227oh.d100&from=mallfp..pc_1_searchbutton',
                // NOTE(review): there is a space after the opening quote of the
                // data-id value, so the selector only matches ids with a leading
                // space. Kept as-is; confirm whether that is intended.
                'selector' => ".product[data-id=' " . $nid . "'] div .productTitle a",
                'next' => "a.ui-page-s-next",
            ),
        );

        foreach ($probes as $probe) {
            // escapeshellarg() keeps shell metacharacters inert and makes sure
            // an empty value still occupies its argument position.
            $cmd = $probe['prefix']
                . " " . escapeshellarg($probe['url'])
                . " " . escapeshellarg($probe['selector'])
                . " " . escapeshellarg($probe['next'])
                . " " . escapeshellarg((string) $sleep_time)
                . " " . escapeshellarg($ua);

            // system() prints the command output and returns its last line.
            $page = system($cmd);
            echo $page . "\n";
            if (!preg_match('/^[0-9]/', $page)) {
                echo "error\n";
                continue;
            }

            $depth = (int) $page + 1;
            $column = $probe['column'];
            $sql = "UPDATE keyword SET {$column} ={$depth} WHERE id = {$id} AND ({$column} > {$depth} OR {$column} = -1)";
            $mysqli->query($sql);
        }

        $sql = "UPDATE keyword SET is_detected = 1 WHERE id = " . $id;
        $mysqli->query($sql);
    }
}
Exemplo n.º 4
0
/**
 * Load one keyword row, pick one of three search paths and run CasperJS on it.
 *
 * The path is chosen by comparing $rand with the cumulative path1 / path2
 * values of the row. A path is skipped (the function returns without
 * touching the row) when its pathN_page value is 5 or more. Otherwise
 * last_click_time is stamped, the command is echoed and executed, and
 * clicked_times is incremented.
 *
 * Prints "zz", sleeps for one second and returns when the row cannot be
 * loaded.
 *
 * NOTE(review): the function still contains debugging leftovers (a fixed
 * row id, a fixed $rand and an exit in the first branch); they are kept
 * and marked below.
 *
 * @return void
 */
function crawler()
{
    $proxyObj = new proxy();
    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');
    $current = time();

    // Disabled selection query, kept for reference:
    //$sql = "SELECT * FROM keyword WHERE status = 'active' AND clicked_times < times AND ((last_click_time + click_interval) < {$current}) AND ((path1_page < 5 AND path1_page > 0) OR (path2_page < 5 AND path2_page > 0) OR (path3_page < 5 AND path3_page > 0)) ORDER BY last_click_time ASC LIMIT 1";
    $sql = "SELECT * FROM keyword WHERE id = 13 LIMIT 1";
    $result = $mysqli->query($sql);

    // Initialised up front so a failed query cannot leave $obj undefined.
    $obj = null;
    if ($result) {
        $obj = $result->fetch_object();
        $result->close();
    }
    if (!$obj || !$obj->id) {
        echo "zz\n";
        sleep(1);
        // The surrounding for (;;) loop is disabled, so `continue` is not legal
        // anywhere in this function (fatal error on PHP 7+); return instead.
        return;
    }

    $id = (int) $obj->id;
    $kwd = urlencode($obj->kwd);
    $nid = $obj->nid;
    $date = date('Ymd');
    $sleep_time = $obj->sleep_time;
    // $path2 is cumulative: it is the sum of the path1 and path2 column values.
    $path1 = (int) $obj->path1;
    $path2 = $path1 + (int) $obj->path2;
    $ua = 'aa';
    $keyword = new keyword();
    $rand = rand(1, 100);
    // NOTE(review): debugging override - the random draw above is discarded.
    $rand = 5;

    if ($rand <= $path1) {
        //taobao search
        $data = array('path' => 'taobao', 'kwd' => $kwd, 'date' => $date, 'region' => $obj->path1_region, 'price_from' => $obj->path1_price_from, 'price_to' => $obj->path1_price_to);
        $search_url = $keyword->buildSearchUrl($data);
        echo $search_url . "\n";
        // NOTE(review): debugging short-circuit - the script stops here, so the
        // rest of this branch is currently unreachable.
        exit;
        if ($obj->path1_page >= 5) {
            return;
        }
        // $proxy was never assigned on this path in the original code.
        $proxy = $proxyObj->getProxy();
        $search_selector = ".item[nid='" . $nid . "'] h3 a";
        $next_selector = ".page-next";
        $prefix = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy=" . escapeshellarg((string) $proxy) . " /var/html/casperjs/pcntl/process.js";
    } elseif ($rand <= $path2) {
        //taobao search tmall tab
        if ($obj->path2_page >= 5) {
            return;
        }
        $proxy = $proxyObj->getProxy();
        $search_url = 'http://s.taobao.com/search?spm=a230r.1.0.0.9nMSJu&initiative_id=tbindexz_' . $date . '&tab=mall&q=' . $kwd . '&suggest=0_2';
        $search_selector = ".item[nid='" . $nid . "'] h3 a";
        $next_selector = ".page-next";
        $prefix = "/usr/bin/casperjs --output-encoding=gbk --script-encoding=gbk --proxy=" . escapeshellarg((string) $proxy) . " /var/html/casperjs/pcntl/process.js";
    } else {
        //tmall search
        if ($obj->path3_page >= 5) {
            return;
        }
        $proxy = $proxyObj->getProxy(true);
        $search_url = 'http://list.tmall.com/search_product.htm?q=' . $kwd . '&type=p&vmarket=&spm=3.7396704.a2227oh.d100&from=mallfp..pc_1_searchbutton';
        // NOTE(review): there is a space after the opening quote of the data-id
        // value, so the selector only matches ids with a leading space. Kept
        // as-is; confirm whether that is intended.
        $search_selector = ".product[data-id=' " . $nid . "'] div .productTitle a";
        $next_selector = "a.ui-page-s-next";
        $prefix = "/usr/bin/casperjs /var/html/casperjs/pcntl/process.js --ignore-ssl-errors=true --proxy=" . escapeshellarg((string) $proxy) . " --output-encoding=gbk --script-encoding=gbk";
    }

    // escapeshellarg() keeps shell metacharacters inert and makes sure an
    // empty value still occupies its argument position.
    $cmd = $prefix
        . " " . escapeshellarg($search_url)
        . " " . escapeshellarg($search_selector)
        . " " . escapeshellarg($next_selector)
        . " " . escapeshellarg((string) $sleep_time)
        . " " . escapeshellarg($ua);

    $sql = "UPDATE keyword SET last_click_time = {$current} WHERE id = {$id}";
    $mysqli->query($sql);

    echo $cmd . "\n";
    system($cmd);

    $sql = "UPDATE keyword SET clicked_times = clicked_times + 1 WHERE id = " . $id;
    $mysqli->query($sql);
}