/**
 * Fetch the configured web pages.
 *
 * Queues one request per entry in $this->urls (built with $this->options) and
 * runs them through RollingCurl, using $this->callback as the response handler.
 *
 * @param int $window_size Maximum number of simultaneous connections for the
 *                         curl multi handle; $this->window_size is used when
 *                         this argument is empty.
 */
public function fetch_html($window_size = NULL)
{
    $roller = new RollingCurl($this->callback);

    foreach ($this->urls as $target) {
        $roller->add(new RollingCurlRequest($target, 'Get', NULL, NULL, $this->options));
    }

    // An empty argument (NULL, 0, '') falls back to the instance-level setting.
    $limit = empty($window_size) ? $this->window_size : $window_size;
    $roller->execute($limit);
}
/**
 * Requests citations or bibliographies for several items from the citation
 * servers in parallel and collects the processed responses.
 *
 * Each request carries its set key in a "setID" query parameter so that the
 * response callback can map the response back to the item it belongs to.
 *
 * @param string $mode  'citation' or 'bib'; selects the URL flags and the
 *                      response processor. Other values produce no data.
 * @param array  $sets  Map of set key => Zotero_Item (one item per set).
 * @param string $style Citation style name sent to the server.
 *
 * @return array Processed responses keyed by "<libraryID>/<key>". Requests
 *               that fail or return invalid JSON are logged and omitted.
 *
 * @throws Exception If a set is not a single Zotero_Item.
 */
public static function multiGetFromCiteServer($mode, $sets, $style = 'chicago-note-bibliography')
{
    require_once "../include/RollingCurl.inc.php";

    $t = microtime(true);

    $setIDs = array();
    $data = array();

    $requestCallback = function ($response, $info) use ($mode, &$setIDs, &$data) {
        if ($info['http_code'] != 200) {
            error_log("WARNING: HTTP {$info['http_code']} from citeserver {$mode} request: " . $response);
            return;
        }

        // Keep the raw body so the warning below logs what the server sent,
        // not the (falsy) decoded value.
        $decoded = json_decode($response);
        if (!$decoded) {
            error_log("WARNING: Invalid response from citeserver {$mode} request: " . $response);
            return;
        }

        // Recover the set key from the request URL. parse_str() must be given
        // a result array: the one-argument form injected variables into the
        // local scope and no longer exists in PHP 8.
        $params = array();
        parse_str((string) parse_url($info['url'], PHP_URL_QUERY), $params);
        if (!isset($params['setID']) || !is_string($params['setID']) || !isset($setIDs[$params['setID']])) {
            error_log("WARNING: Unknown setID in citeserver {$mode} response for " . $info['url']);
            return;
        }
        $setID = $params['setID'];

        if ($mode == 'citation') {
            $data[$setIDs[$setID]] = Zotero_Cite::processCitationResponse($decoded);
        } elseif ($mode == "bib") {
            $data[$setIDs[$setID]] = Zotero_Cite::processBibliographyResponse($decoded);
        }
    };

    $rc = new RollingCurl($requestCallback);
    // Number of simultaneous requests
    $rc->window_size = 20;

    foreach ($sets as $key => $items) {
        $json = self::getJSONFromItems($items);

        $server = Z_CONFIG::$CITATION_SERVERS[array_rand(Z_CONFIG::$CITATION_SERVERS)];

        $url = "http://{$server}/?responseformat=json&style=" . urlencode($style);
        if ($mode == 'citation') {
            $url .= "&citations=1&bibliography=0";
        }
        // Include array position in URL so that the callback can figure
        // out what request this was
        $url .= "&setID=" . urlencode($key);

        // TODO: support multiple items per set, if necessary
        if (!$items instanceof Zotero_Item) {
            throw new Exception("items is not a Zotero_Item");
        }
        $setIDs[$key] = $items->libraryID . "/" . $items->key;

        $request = new RollingCurlRequest($url);
        $request->options = array(
            CURLOPT_POST => 1,
            CURLOPT_POSTFIELDS => $json,
            // An empty Expect header stops curl from sending "Expect: 100-continue".
            CURLOPT_HTTPHEADER => array("Expect:"),
            CURLOPT_CONNECTTIMEOUT => 1,
            CURLOPT_TIMEOUT => 4,
            CURLOPT_HEADER => 0,
            CURLOPT_RETURNTRANSFER => 1
        );
        $rc->add($request);
    }

    $rc->execute();

    error_log(count($sets) . " {$mode} requests in " . round(microtime(true) - $t, 3));

    return $data;
}
/**
 * Runs the queued requests, short-circuiting when nothing is queued.
 *
 * @param int|null $window_size Forwarded unchanged to the parent implementation.
 *
 * @return mixed false when $this->requests is empty, otherwise whatever
 *               parent::execute() returns.
 */
public function execute($window_size = null)
{
    return count($this->requests) === 0
        ? false
        : parent::execute($window_size);
}
// A little example that fetches a bunch of sites in parallel and echoes the
// page title and response info for each request.

/**
 * Response handler: prints the page title (if any) followed by the transfer info.
 *
 * @param string $response Body returned for the request.
 * @param array  $info     Transfer information supplied by RollingCurl.
 */
function request_callback($response, $info)
{
    // Default to an empty title so pages without a <title> tag do not
    // trigger an undefined-variable warning below.
    $title = '';

    // parse the page title out of the returned HTML
    if (preg_match("~<title>(.*?)</title>~i", $response, $out)) {
        $title = $out[1];
    }

    echo "<b>{$title}</b><br />";
    print_r($info);
    echo "<hr>";
}

require "RollingCurl.php";

// single curl request
$rc = new RollingCurl("request_callback");
$rc->request("http://www.msn.com");
$rc->execute();

// another single curl request
$rc = new RollingCurl("request_callback");
$rc->request("http://www.google.com");
$rc->execute();

echo "<hr>";

// top 20 sites according to alexa (11/5/09)
$urls = array(
    "http://www.google.com",
    "http://www.facebook.com",
    "http://www.yahoo.com",
    "http://www.youtube.com",
    "http://www.live.com",
    "http://www.wikipedia.com",
    "http://www.blogger.com",
    "http://www.msn.com",
    "http://www.baidu.com",
    "http://www.yahoo.co.jp",
    "http://www.myspace.com",
    "http://www.qq.com",
    "http://www.google.co.in",
    "http://www.twitter.com",
    "http://www.google.de",
    "http://www.microsoft.com",
    "http://www.google.cn",
    "http://www.sina.com.cn",
    "http://www.wordpress.com",
    "http://www.google.co.uk"
);

$rc = new RollingCurl("request_callback");
$rc->window_size = 20;
foreach ($urls as $url) {
    // NOTE(review): confirm that the bundled RollingCurl.php really declares a
    // class named Request for queued requests.
    $request = new Request($url);
    $rc->add($request);
}
$rc->execute();
/**
 * Overrides the parent's execute() to validate the thread count and to log
 * start/finish debug messages with the elapsed time.
 *
 * @access public
 *
 * @throws AngryCurlException When $window_size is not loosely equal to null
 *                            and is not a positive integer.
 *
 * @param int $window_size Max number of simultaneous connections
 *
 * @return string|bool Whatever parent::execute() returns.
 */
public function execute($window_size = null)
{
    # checking $window_size var
    // Loose comparison on purpose: any value == null (null, 0, '') selects the default.
    $uses_default = ($window_size == null);

    if (!$uses_default && (!($window_size > 0) || !is_int($window_size))) {
        throw new AngryCurlException(" (!) Wrong threads amount in execute():\t{$window_size}");
    }

    if ($uses_default) {
        self::add_debug_msg(" (!) Default threads amount value (5) is used");
    } else {
        self::add_debug_msg(" * Threads set to:\t{$window_size}");
    }

    # writing debug
    self::add_debug_msg(" * Starting connections");
    //var_dump($this->__get('requests'));

    $started_at = microtime(true);
    $outcome = parent::execute($window_size);
    $elapsed = microtime(true) - $started_at;

    # writing debug
    self::add_debug_msg(" * Finished in " . round($elapsed, 2) . "s");

    return $outcome;
}
$result = "fail\n"; if ($response[0] == '<') { $result .= "\t\tReceived: " . $response . "...\n"; } else { $result .= "\t\tReceived: " . base64_encode(substr($response, 0, 50)) . "...\n"; } $result .= "\t\tSent: " . base64_encode(substr($png, 0, 50)) . "...\n"; $results[$request->index] = $result; } }); for ($x = 1; $x <= $files_appearing; $x++) { $results[$x] = 'unknown'; $request = new RollingCurlRequest('http://' . $username . ':' . $password . '@' . $owncloud_remote . '/webdav' . $testdir . '/sample_' . $x . '.png', 'GET'); $request->index = $x; $verify->add($request); } $verify->execute($window); echo "\nVerification:\n"; foreach ($results as $idx => $result) { echo "\t{$idx}: {$result}\n"; if ($result != 'pass') { $hasproblem = true; } } if ($hasproblem) { echo "\nREVIEW THE RESULTS ABOVE FOR A POTENTIAL PROBLEM.\n"; } else { echo "\nNO OBVIOUS PROBLEMS DETECTED.\n"; } // Clean up temp images randpng(0);
/**
 * Callback for a fetched page: extracts the friend links it contains and
 * posts one "add friend" request per link via RollingCurl.
 *
 * Links whose data-hovercard attribute carries no numeric id are skipped,
 * because a request without a target id cannot succeed.
 *
 * @param string $response Raw HTML of the fetched page.
 * @param mixed  $info     Transfer info supplied by the caller; not used here.
 *
 * @return void
 */
public function sendRequestCallBack($response, $info = '')
{
    // Pre-process the content: drop the <code ...><!-- and --></code> wrappers
    // so the markup hidden inside them becomes part of the parsed document.
    $response = preg_replace("/<code[\\s\\S]*><!--/iU", "", $response);
    $response = preg_replace("/--><\\/code>/iU", "", $response);

    // Parse the HTML and collect the data needed for the friend requests.
    \phpQuery::newDocumentHTML($response);

    $timeline_gt = json_decode((string) pq("#pagelet_timeline_main_column")->attr("data-gt"));
    // json_decode() yields null for a missing/invalid attribute; do not
    // dereference it blindly.
    $profile_owner = (is_object($timeline_gt) && isset($timeline_gt->profile_owner))
        ? $timeline_gt->profile_owner
        : null;

    $gt_fields = array('coeff2_registry_key', 'coeff2_info', 'coeff2_action', 'coeff2_pv_signature');

    $requests = array();
    foreach (pq("div.fsl.fwb.fcb > a") as $anchor) {
        $link = pq($anchor);

        $data_hovercard = (string) $link->attr("data-hovercard");
        if (!preg_match("/\\?id=([0-9]*)&/iU", $data_hovercard, $matches) || $matches[1] === '') {
            // No usable target id on this link.
            continue;
        }
        $to_friend = $matches[1];

        $data_gt = json_decode((string) $link->attr("data-gt"));
        $gt = array();
        foreach ($gt_fields as $field) {
            $gt[$field] = (is_object($data_gt) && isset($data_gt->$field)) ? $data_gt->$field : null;
        }

        // Build the friend request payload.
        $query = array(
            "to_friend" => $to_friend,
            "action" => "add_friend",
            "how_found" => "profile_friends",
            "ref_param" => "pb_friends_tl",
            "link_data[gt][coeff2_registry_key]" => $gt['coeff2_registry_key'],
            "link_data[gt][coeff2_info]" => $gt['coeff2_info'],
            "link_data[gt][coeff2_action]" => $gt['coeff2_action'],
            "link_data[gt][coeff2_pv_signature]" => $gt['coeff2_pv_signature'],
            "link_data[gt][profile_owner]" => $profile_owner,
            "link_data[gt][ref]" => "timeline:timeline",
            "outgoing_id" => '',
            "logging_location" => '',
            "no_flyout_on_click" => "true",
            "ego_log_data" => '',
            "http_referer" => '',
            "floc" => "friends_tab",
            "__user" => $this->user_id,
            "__a" => "1",
            "fb_dtsg" => $this->token,
            "__rev" => $this->version
        );

        $capt_opts = $this->curl_opts;
        $url = "https://www.facebook.com/ajax/add_friend/action.php?__pc=EXP1%3ADEFAULT";
        $capt_opts[CURLOPT_POST] = true;
        $capt_opts[CURLOPT_POSTFIELDS] = $query;

        $request = new \RollingCurlRequest($url);
        $request->options = $capt_opts;
        $requests[] = $request;
    }

    if (empty($requests)) {
        return;
    }

    $rc = new \RollingCurl();
    // Never open more than 20 simultaneous connections.
    $rc->window_size = min(count($requests), 20);
    foreach ($requests as $queued) {
        $rc->add($queued);
    }
    $rc->execute();
}
/**
 * Fetches several URLs in parallel, runs each URL's callback on its response
 * and stores every response in the cache.
 *
 * @param array $requests List of arrays, each with a 'url' entry and an
 *                        optional 'func' callable that receives the response.
 *
 * @return array Responses keyed by URL (the contents of cache::$__TempResult).
 */
public static function curl_multi_get($requests)
{
    cache::$__LinkToFunc = array();
    cache::$__TempResult = array();
    cache::$__ApiTime = microtime(true);

    // Nothing to fetch: avoid configuring RollingCurl with a window size of 0.
    if (empty($requests)) {
        return cache::$__TempResult;
    }

    $rc = new RollingCurl(function ($response, $info, $request) {
        $url = $request->url;

        if ($response == null) {
            cache::log("CURL RESULT EMPTY for url: " . $url . " with HTTP code " . $info['http_code']);
            /*$response = file_get_contents($url);
              if( $response == false ) {
                  cache::log("FILE_GET_CONTENTS failed for url: ".$url);
              }*/
        }

        // Cast before strlen(): the response may be null, which strlen()
        // no longer accepts silently on current PHP versions.
        cache::log(sprintf(
            "Got multi result for %s in %s sec, len %s",
            $url,
            number_format(microtime(true) - cache::$__ApiTime, 3),
            strlen((string) $response)
        ));

        $cbtime = microtime(true);
        // !empty() keeps the original truthiness test without raising a
        // notice when no callback was registered for this URL.
        if (!empty(cache::$__LinkToFunc[$url])) {
            call_user_func(cache::$__LinkToFunc[$url], $response);
        }
        cache::log(sprintf("Ran callback for URL in %s sec", number_format(microtime(true) - $cbtime, 3)));

        cache::$__TempResult[$url] = $response;
        cache::put($url, $response);
    });

    foreach ($requests as $request) {
        cache::$__LinkToFunc[$request['url']] = isset($request['func']) ? $request['func'] : null;
        $rc->request($request['url']);
    }

    // Run every request at once.
    $rc->window_size = count($requests);
    $rc->execute();

    cache::log(sprintf("Multi API request took %s sec", number_format(microtime(true) - cache::$__ApiTime, 3)));

    return cache::$__TempResult;
}
//    'dol.gov',
//    'humanresources.about.com',
//    'shrm.org',
//    'diversityinc.com',
//    'stevepavlina.com/blog/',
//    'osha.gov',
//    'hr.com',
//    'ere.net',
//    'cisin.com',
//    'blr.com',
//    'peopleadmin.com',
//    'wageworks.com',
//    'dalecarnegie.com',
//    'doleta.gov',
//    'mercer.com',
//    'astd.org',
//    'brightscope.com',
//    'tmp.com',
//    'trinet.com',
//];

// DETECT CYCLE!
$loader = new FileLoader();
$scanner = new Scanner($loader, __DIR__ . '/apps.json');
$category = new Category($loader, __DIR__ . '/apps.json');

require_once __DIR__ . '/lib/RollingCurl.php';

// Queue one worker request per URL; the target URL travels as an encoded
// "url" query parameter.
$rc = new RollingCurl();
foreach ($urls as $url) {
    $workerUrl = "http://scanner.loc/worker.php?url=" . urlencode($url);
    $rc->add(new RollingCurlRequest($workerUrl));
}

// Run the queue with a window size of 10.
$rc->execute(10);

echo "All request sent";