Exemplo n.º 1
0
/**
 * Scrape the detail page of a single dianping.com shop.
 *
 * The page is fetched through cls_curl with gzip enabled and the fields are
 * pulled out with regular expressions. Every lookup is guarded, so a failed
 * download or a markup change produces empty fields instead of
 * undefined-offset warnings.
 *
 * @param string $url Path of the shop page, appended to http://www.dianping.com
 * @return array Keys: name, hour, address_detail, tel, province, city, price.
 *               name/address_detail/tel are '' when not found; hour and price
 *               are null when not found. province/city are fixed to Beijing.
 */
function getItemdetails($url)
{
    $baseUrl = "http://www.dianping.com";
    cls_curl::set_gzip(true);
    $content = cls_curl::get($baseUrl . $url);
    // NOTE(review): cls_curl::get presumably returns false when the request
    // fails - normalise so the patterns below simply find nothing.
    if (!is_string($content)) {
        $content = '';
    }

    // Returns capture group $group of the first match of $pattern, or null.
    $firstMatch = function ($pattern, $group) use ($content) {
        if (preg_match($pattern, $content, $m) === 1 && isset($m[$group])) {
            return $m[$group];
        }
        return null;
    };
    // Applies the tag-stripping patterns in order; an absent fragment yields ''.
    $strip = function ($fragment, array $patterns) {
        return $fragment === null ? '' : preg_replace($patterns, "", $fragment);
    };

    $dataResult = array();

    // Shop name: drop embedded <a>...</a> elements entirely, then the <h1> wrapper tags.
    $dataResult['name'] = $strip(
        $firstMatch('#<h1 class=\\"shop-name\\">\\s(.*?)\\s</h1>#s', 0),
        array("/<(a.*?)>(.*?)<(\\/a.*?)>/si", "/<(\\/?h1.*?)>/si")
    );

    // Opening hours: the text of the first "item" span after the label.
    $dataResult['hour'] = $firstMatch('#<span class="info-name">营业时间:(.*?)</span>(.*?)<span class="item">\\s(.*?)\\s</span>#s', 3);

    // Address: keep the text, remove div/span/a tags.
    $dataResult['address_detail'] = $strip(
        $firstMatch('#<div class="expand-info address" itemprop="street-address">\\s(.*?)\\s</div>#s', 0),
        array("/<(\\/?div.*?)>/si", "/<(\\/?span.*?)>/si", "/<(\\/?a.*?)>/si")
    );

    // Phone: remove p/span tags and whole <a>...</a> elements.
    $dataResult['tel'] = $strip(
        $firstMatch('#<p class="expand-info tel">\\s(.*?)\\s</p>#s', 0),
        array("/<(\\/?p.*?)>/si", "/<(\\/?span.*?)>/si", "/<(a.*?)>(.*?)<(\\/a.*?)>/si")
    );

    $dataResult['province'] = '北京市';
    $dataResult['city'] = '北京';

    // Average price per person.
    $dataResult['price'] = $firstMatch('#<span class="item">人均:(.*?)</span>#s', 1);

    return $dataResult;
}
Exemplo n.º 2
0
 /**
  * Perform one HTTP request on the shared cURL handle and return the response.
  *
  * The handle in self::$ch must already exist (see init()). After the request
  * the handle is closed and self::$ch is reset to null so the next request
  * starts from a fresh handle instead of a closed or stale one.
  *
  * @param string       $url    Target URL
  * @param string       $type   'get' or 'post' (case-insensitive)
  * @param array|string $fields Query parameters for GET, body for POST
  * @return mixed Whatever curl_exec() returns: the response (including the
  *               header block when self::$http_raw is set) or false on failure.
  *               Transfer details are stored in self::$info.
  */
 public static function http_request($url, $type = 'get', $fields = array())
 {
     $method = strtolower($type);
     // For GET, append the fields to the URL as a query string
     if ($method === 'get' && !empty($fields)) {
         $url = $url . (strpos($url, "?") === false ? "?" : "&") . http_build_query($fields);
     }
     // Build the header list for THIS request on a copy; writing back into
     // self::$headers made Host / CLIENT-IP entries pile up on every call.
     $headers = is_array(self::$headers) ? self::$headers : array();
     // Pick a random configured IP for the host (simple load balancing) and
     // send the real host name in the Host header
     if (self::$hosts) {
         $host = parse_url($url, PHP_URL_HOST);
         if (!empty($host)) {
             $ip = self::$hosts[array_rand(self::$hosts)];
             // Replace only the authority part, never a copy of the host name
             // that happens to appear in the path or query string
             $authority = strpos($url, '//');
             $pos = strpos($url, $host, $authority === false ? 0 : $authority + 2);
             if ($pos !== false) {
                 $url = substr_replace($url, $ip, $pos, strlen($host));
             }
             $headers = array_merge(array('Host:' . $host), $headers);
         }
     }
     curl_setopt(self::$ch, CURLOPT_URL, $url);
     // POST: send the fields as the request body
     if ($method === 'post') {
         curl_setopt(self::$ch, CURLOPT_POST, true);
         curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields);
     }
     if (self::$useragent) {
         curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
     }
     if (self::$cookie) {
         curl_setopt(self::$ch, CURLOPT_COOKIE, self::$cookie);
     }
     if (self::$cookie_jar) {
         curl_setopt(self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar);
     }
     if (self::$cookie_file) {
         curl_setopt(self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file);
     }
     if (self::$referer) {
         curl_setopt(self::$ch, CURLOPT_REFERER, self::$referer);
     }
     // Announce the configured client IP through the usual forwarding headers
     if (self::$ip) {
         $headers = array_merge(array('CLIENT-IP:' . self::$ip, 'X-FORWARDED-FOR:' . self::$ip), $headers);
     }
     if ($headers) {
         curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $headers);
     }
     if (self::$gzip) {
         curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip');
     }
     if (self::$proxy) {
         curl_setopt(self::$ch, CURLOPT_PROXY, self::$proxy);
     }
     if (self::$http_raw) {
         curl_setopt(self::$ch, CURLOPT_HEADER, true);
     }
     // A failed transfer is reported to the caller as false, unchanged
     $data = curl_exec(self::$ch);
     self::$info = curl_getinfo(self::$ch);
     // Release the handle and forget it, so init() builds a new one next time
     curl_close(self::$ch);
     self::$ch = null;
     return $data;
 }
Exemplo n.º 3
0
 /**
  * Lazily create the shared cURL handle and apply the base options.
  *
  * A new handle is created whenever self::$ch is not a usable one. A closed
  * cURL resource is still non-empty, so the handle is tested explicitly
  * rather than with empty(); otherwise a handle closed after a previous
  * request would be reused.
  *
  * @return mixed The cURL handle stored in self::$ch
  */
 public static function init()
 {
     // Resource on PHP < 8 (is_resource() is false once closed), CurlHandle object on PHP >= 8
     $usable = is_resource(self::$ch) || self::$ch instanceof \CurlHandle;
     if (!$usable) {
         self::$ch = curl_init();
         // Return the response from curl_exec() instead of printing it
         curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true);
         curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout);
         curl_setopt(self::$ch, CURLOPT_HEADER, false);
         curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
         // Overall limit is the connect timeout plus 5 seconds
         curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
     }
     return self::$ch;
 }
Exemplo n.º 4
0
/**
 * Collect the users that a Zhihu account follows, or that follow it.
 *
 * The profile page is parsed first (accounts with fewer than 20 related users
 * cannot be read through the AJAX endpoint), then the node endpoint announced
 * in the page's data-init attribute is queried in steps of 20 until the
 * user count shown on the page is covered.
 *
 * @param string      $username  Zhihu account name used in the profile URL
 * @param string      $user_type 'followees' or 'followers'
 * @param object|null $worker    Object with a log($message) method; progress
 *                               is not logged when null
 * @return array Map of username => array('username' => ..., 'nickname' => ...);
 *               empty when the profile page could not be fetched
 * @author seatle <*****@*****.**> 
 * @created time :2015-07-28 09:46
 */
function get_user_index($username, $user_type = 'followees', $worker = null)
{
    // $worker is optional so the signature no longer puts a required
    // parameter after an optional one; logging is skipped without it.
    $log = function ($message) use ($worker) {
        if ($worker !== null) {
            $worker->log($message);
        }
    };
    // One list entry: group 1 is the username, group 2 the nickname
    $user_pattern = '#<h2 class="zm-list-content-title"><a data-tip=".*?" href="http://www.zhihu.com/people/(.*?)" class="zg-link" title=".*?">(.*?)</a></h2>#';

    $url = "http://www.zhihu.com/people/{$username}/{$user_type}";
    set_cookie();
    cls_curl::set_gzip(true);
    $content = cls_curl::get($url);
    if (empty($content)) {
        return array();
    }
    $users = array();
    // With fewer than 20 users the AJAX endpoint returns nothing, so the
    // users embedded in the profile page itself are collected here
    preg_match_all($user_pattern, $content, $out);
    $count = isset($out[1]) ? count($out[1]) : 0;
    for ($i = 0; $i < $count; $i++) {
        $d_username = empty($out[1][$i]) ? '' : $out[1][$i];
        $d_nickname = empty($out[2][$i]) ? '' : $out[2][$i];
        if (!empty($d_username) && !empty($d_nickname)) {
            $users[$d_username] = array('username' => $d_username, 'nickname' => $d_nickname);
        }
    }
    $keyword = $user_type == 'followees' ? '关注了' : '关注者';
    $log("采集用户 --- " . $username . " --- {$keyword} --- 主页 --- 成功\n");
    // Total number of related users as displayed on the page
    preg_match('#<span class="zg-gray-normal">' . $keyword . '</span><br />\\s<strong>(.*?)</strong><label> 人</label>#', $content, $out);
    $user_count = empty($out[1]) ? 0 : intval($out[1]);
    // CSRF token that has to accompany the POST requests
    preg_match('#<input type="hidden" name="_xsrf" value="(.*?)"/>#', $content, $out);
    $_xsrf = empty($out[1]) ? '' : trim($out[1]);
    // Endpoint name and base parameters, stored as entity-encoded JSON
    preg_match('#<div class="zh-general-list clearfix" data-init="(.*?)">#', $content, $out);
    $url_params = empty($out[1]) ? '' : json_decode(html_entity_decode($out[1]), true);
    if (!empty($_xsrf) && is_array($url_params) && !empty($url_params['nodename'])) {
        $url = "http://www.zhihu.com/node/" . $url_params['nodename'];
        $params = isset($url_params['params']) && is_array($url_params['params']) ? $url_params['params'] : array();
        for ($i = 0; $i < $user_count; $i = $i + 20) {
            // Derive the page number from the offset, so that a failed page
            // does not shift the numbering of the pages logged after it
            $page = (int) ($i / 20) + 1;
            $params['offset'] = $i;
            $post_data = array('method' => 'next', 'params' => json_encode($params), '_xsrf' => $_xsrf);
            $content = cls_curl::post($url, $post_data);
            $rows = empty($content) ? null : json_decode($content, true);
            if (empty($rows['msg']) || !is_array($rows['msg'])) {
                $log("采集用户 --- " . $username . " --- {$keyword} --- 第{$page}页 --- 失败\n");
                continue;
            }
            $log("采集用户 --- " . $username . " --- {$keyword} --- 第{$page}页 --- 成功\n");
            foreach ($rows['msg'] as $row) {
                // Each entry is expected to be an HTML fragment for one user
                if (!is_string($row)) {
                    continue;
                }
                preg_match_all($user_pattern, $row, $out);
                $d_username = empty($out[1][0]) ? '' : $out[1][0];
                $d_nickname = empty($out[2][0]) ? '' : $out[2][0];
                if (!empty($d_username) && !empty($d_nickname)) {
                    $users[$d_username] = array('username' => $d_username, 'nickname' => $d_nickname);
                }
            }
        }
    }
    return $users;
}
Exemplo n.º 5
0
<?php

// Script setup: use China Standard Time for all date functions and show
// PHP errors in the output.
date_default_timezone_set('Asia/Shanghai');
ini_set('display_errors', 1);
// Project-local dependencies. cls_curl and RollingCurl are used below and are
// presumably defined by cls_curl.php and ../rolling/RollingCurl.php - confirm.
include "config.php";
include "cls_curl.php";
include "db.php";
include "cache.php";
include "worker.php";
include "../rolling/RollingCurl.php";
// List of URLs to fetch
$urls = array("http://www.dianping.com/search/category/2/45/g146p2");
// Fetch the first URL directly through cls_curl and dump the raw result
$content = cls_curl::get($urls[0]);
print_r($content);
// a function that will process the returned responses
/**
 * Print the page title and the transfer info of one completed request.
 *
 * @param string $response Response body in which the <title> element is searched
 * @param mixed  $info     Transfer information, dumped with print_r()
 * @param mixed  $request  The originating request (unused here)
 * @return void
 */
function request_callback($response, $info, $request)
{
    // Default to an empty title so a response without <title> (or an empty
    // response) no longer reads an undefined variable.
    $title = '';
    // parse the page title out of the returned HTML
    if (preg_match("~<title>(.*?)</title>~i", $response, $out)) {
        $title = $out[1];
    }
    // NOTE(review): the title is echoed as-is, without HTML escaping.
    echo "<b>{$title}</b><br />";
    print_r($info);
    echo "<br>";
    echo "<hr>";
}
// create a new RollingCurl object and pass it the name of your custom callback function
$rc = new RollingCurl("request_callback");
// the window size determines how many simultaneous requests to allow.
$rc->window_size = 20;
foreach ($urls as $url) {