示例#1
0
        }
        $data = array_merge($data, $server_data);
        db::update('user', $data, "`username`='{$username}'");
        //file_put_contents("./data/info/".$username.".json", json_encode($data));
        return;
    }
    $data = get_user($response);
    if (empty($data)) {
        db::update('user', $server_data, "`username`='{$username}'");
        file_put_contents("./data/error_emptydata.log", date("Y-m-d H:i:s") . ' ' . $username . " info data not exists --- \n", FILE_APPEND);
        return;
    }
    $data['last_message_week'] = empty($data['last_message_time']) ? 7 : intval(date("w", $data['last_message_time']));
    $data['last_message_hour'] = empty($data['last_message_time']) ? 24 : intval(date("H", $data['last_message_time']));
    $data = array_merge($data, $server_data);
    db::update('user', $data, "`username`='{$username}'");
    //file_put_contents("./data/about/".$username.".json", json_encode($data));
};
// Crawl loop: 10000 rounds. Each round queues up to 10 users (two profile
// URLs per user: ".../about" and the profile root) on $curl and then calls
// $curl->execute() once for that round.
for ($j = 0; $j < 10000; $j++) {
    for ($i = 0; $i < 10; $i++) {
        $username = get_user_queue('info');
        // Nothing was dequeued: skip this slot instead of requesting the
        // malformed URL "http://www.zhihu.com/people//about".
        if ((string) $username === '') {
            continue;
        }
        // NOTE(review): addslashes() is SQL-style escaping, not URL encoding;
        // it is kept because the callback presumably reuses the name taken
        // from the URL in SQL -- confirm before replacing it.
        $username = addslashes($username);
        $url = "http://www.zhihu.com/people/{$username}/about";
        $curl->get($url);
        $url = "http://www.zhihu.com/people/{$username}/";
        $curl->get($url);
    }
    $data = $curl->execute();
    // Sleep 100 ms between rounds; going faster risks being treated as a DDoS.
    usleep(100000);
}
示例#2
0
    //echo $worker->worker_pid . " --- " . $worker->worker_id."\n";
    $cookie = trim(file_get_contents("cookie.txt"));
    $curl = new rolling_curl();
    $curl->set_cookie($cookie);
    $curl->set_gzip(true);
    // Completion callback for one request. It reads $request['url'] to find
    // the username, dumps $info / $error to ./timeout/ when $response is
    // empty, and otherwise stores the parsed "about" data under ./html/.
    $curl->callback = function ($response, $info, $request, $error) {
        // Extract the username once. If the URL does not match, fall back to
        // an empty name instead of reading an undefined $out[1]; the file
        // names produced are the same as before, minus the notice/warning.
        $username = '';
        if (preg_match("@http://www.zhihu.com/people/(.*?)/about@i", $request['url'], $out)) {
            $username = $out[1];
        }

        // No response body: record the transfer info and error for later inspection.
        if (empty($response)) {
            var_dump($info);
            file_put_contents("./timeout/" . $username . "_info.json", json_encode($info) . "\n", FILE_APPEND);
            file_put_contents("./timeout/" . $username . "_error.json", json_encode($error) . "\n", FILE_APPEND);
            return;
        }

        $data = get_user_about($response);
        // Parsing produced nothing: remember the URL so it can be retried.
        if (empty($data)) {
            file_put_contents("./timeout_data.txt", $request['url'] . "\n", FILE_APPEND);
            return;
        }

        file_put_contents("./html/" . $username . ".json", json_encode($data));
    };
    // Fetch up to $count user "about" pages; execute() is called after every
    // queued request, so the requests run one at a time.
    for ($i = 0; $i < $count; $i++) {
        $username = get_user_queue();
        // Nothing was dequeued: skip instead of requesting the malformed URL
        // "http://www.zhihu.com/people//about".
        if ((string) $username === '') {
            continue;
        }
        $username = addslashes($username);
        $url = "http://www.zhihu.com/people/{$username}/about";
        $curl->get($url);
        $data = $curl->execute();
    }
};
$w->run();
示例#3
0
/**
 * Crawl the follow lists of one queued user and store newly found users.
 *
 * Takes one username from the 'index' queue, stamps the crawl time and the
 * current process id on its `user` row, fetches its followees and followers
 * through get_user_index(), and inserts every returned user that is not yet
 * in the `user` table with depth = parent depth + 1.
 *
 * History (from the original author's notes): rows used to be claimed with an
 * "Update ... Order By `index_uptime` Asc Limit 1" statement instead of the
 * queue. That deadlocked with the crawl-time update below, and the sub-select
 * variants were either too slow or invalid SQL, so the queue replaced them.
 *
 * @param object|null $worker Object exposing log($message). When it is not an
 *                            object, log lines are skipped instead of failing.
 * @return void
 * @author seatle <*****@*****.**>
 * @created time :2015-08-02 12:30
 */
function save_user_index($worker = null)
{
    // $worker defaults to null, so every log line goes through this guard
    // rather than calling ->log() on a possibly-null value.
    $log = function ($message) use ($worker) {
        if (is_object($worker)) {
            $worker->log($message);
        }
    };

    $progress_id = posix_getpid();
    $time = time();

    $username = get_user_queue('index');
    if (empty($username)) {
        $log("采集用户 ---  队列不存在");
        return;
    }

    // NOTE(review): addslashes() is the only escaping applied to the SQL
    // below; the db:: API is not visible here -- switch to parameterised
    // queries if it supports them.
    $username = addslashes($username);

    // Read the parent's depth first; new users are stored with depth + 1.
    // A missing row or empty depth counts as 0, which matches the old
    // null + 1 result without reading an undefined index.
    $sql = "Select `depth` From `user` Where `username`='{$username}'";
    $row = db::get_one($sql);
    $depth = empty($row['depth']) ? 0 : intval($row['depth']);

    // Update the crawl time so the queue hands out a different user next time.
    $sql = "Update `user` Set `index_uptime`='{$time}',`index_progress_id`='{$progress_id}' Where `username`='{$username}'";
    db::query($sql);
    $log("采集用户列表 --- " . $username . " --- 开始");

    // Users this user follows.
    $followees_user = get_user_index($username, 'followees', $worker);
    $log("采集用户列表 --- " . $username . " --- 关注了 --- 成功");
    // Users following this user.
    $followers_user = get_user_index($username, 'followers', $worker);
    $log("采集用户列表 --- " . $username . " --- 关注者 --- 成功");

    // Merge both lists. A non-array result is treated as an empty list so
    // that array_merge() never receives a non-array argument.
    $user_rows = array_merge(
        is_array($followers_user) ? $followers_user : array(),
        is_array($followees_user) ? $followees_user : array()
    );
    if (empty($user_rows)) {
        $log("采集用户列表 --- " . $username . " --- 失败");
        return;
    }
    $log("采集用户列表 --- " . $username . " --- 成功");

    foreach ($user_rows as $user_row) {
        // A row without a username cannot be looked up or stored meaningfully.
        if (!isset($user_row['username']) || $user_row['username'] === '') {
            continue;
        }

        // Child user: insert only when no row with this username exists yet.
        $c_username = addslashes($user_row['username']);
        $sql = "Select Count(*) As count From `user` Where `username`='{$c_username}'";
        $row = db::get_one($sql);
        if (!empty($row['count'])) {
            continue;
        }

        $user_row['depth'] = $depth + 1;
        $user_row['parent_username'] = $username;
        $user_row['addtime'] = $user_row['index_uptime'] = $user_row['info_uptime'] = time();
        if (db::insert('user', $user_row)) {
            $log("入库用户 --- " . $c_username . " --- 成功");
        } else {
            $log("入库用户 --- " . $c_username . " --- 失败");
        }
    }
}