public function getAllMetadata() {
    $GLOBALS['metastores'] = array();
    $curl_options = array(
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_USERAGENT => 'Parallel Curl google API request'
    );
    $parallel_curl = new ParallelCurl(count($this->videoResults), $curl_options);
    foreach ($this->videoResults as $id) {
        $metastore = new MetadataStore();
        $search_url = "https://www.googleapis.com/youtube/v3/videos?id=" . $id
            . "&key=" . API_KEY
            . "&part=snippet,statistics,contentDetails,recordingDetails";
        $parallel_curl->startRequest($search_url, 'on_request_done', array('id' => $id));
    }
    $parallel_curl->finishAllRequests();
    return $GLOBALS['metastores'];
}
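// The 'on_request_done' callback used above is not part of this snippet. Below is a
// minimal sketch of what it might look like, assuming ParallelCurl's usual
// ($content, $url, $ch, $user_data) callback signature; the MetadataStore wrapper is
// skipped here and the decoded API payload is stored directly, keyed by video id.
function on_request_done($content, $url, $ch, $user_data) {
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpcode !== 200) {
        error_log("Fetch error {$httpcode} for '{$url}'");
        return;
    }
    $response = json_decode($content, true);
    if (empty($response['items'][0])) {
        return;
    }
    // Keep the first (and only) item returned for this video id.
    $GLOBALS['metastores'][$user_data['id']] = $response['items'][0];
}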
/**
 * Parallel (multi-request) curl helper.
 */
public function curlResult($urls) {
    include_once WEB_PATH . 'lib/parallelCurl.class.php';
    // Maximum number of simultaneous requests. Note: $argv is not visible inside a
    // method, so this effectively always falls back to 20.
    $max_requests = isset($argv[1]) ? $argv[1] : 20;
    $curl_options = array(
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_USERAGENT => 'Parallel Curl test script'
    );
    $parallelCurl = new ParallelCurl($max_requests, $curl_options);
    foreach ($urls as $key => $terms) {
        if (empty($terms)) {
            continue;
        }
        // Passes an empty callback and later reads $parallelCurl->result, which relies
        // on a modified parallelCurl.class.php that collects responses itself; the
        // stock ParallelCurl only delivers content through the callback.
        $parallelCurl->startRequest($terms, '', array($key));
    }
    $parallelCurl->finishAllRequests();
    return $parallelCurl->result;
}
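// With the stock ParallelCurl class the same keyed result map can be collected through
// a real callback instead of a modified class. A minimal sketch, assuming
// parallelcurl.php is already loaded; the class and method names here (CurlCollector,
// fetchAll, collectResult) are hypothetical, not from the original code:
class CurlCollector {
    public $result = array();

    public function fetchAll($urls, $max_requests = 20) {
        $curl_options = array(
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => FALSE,
            CURLOPT_USERAGENT => 'Parallel Curl test script'
        );
        $parallelCurl = new ParallelCurl($max_requests, $curl_options);
        foreach ($urls as $key => $url) {
            if (empty($url)) {
                continue;
            }
            $parallelCurl->startRequest($url, array($this, 'collectResult'), $key);
        }
        $parallelCurl->finishAllRequests();
        return $this->result;
    }

    // ParallelCurl invokes the callback as ($content, $url, $ch, $user_data).
    public function collectResult($content, $url, $ch, $key) {
        $this->result[$key] = $content;
    }
}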
$input = $options['input'];
$output = $options['output'];
$max_requests = $options['maxrequests'];
$organization = $options['organization'];
$email = $options['email'];
if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address (found '{$organization}', '{$email}')\n");
}
$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';
$curl_options = array(
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_SSL_VERIFYHOST => FALSE,
    CURLOPT_FOLLOWLOCATION => TRUE,
    CURLOPT_USERAGENT => $agent
);
$urls_string = file_get_contents($input);
// split() was removed in PHP 7; explode() does the same simple newline split.
$urls = explode("\n", $urls_string);
$output_handle = fopen($output, 'w');
$parallel_curl = new ParallelCurl($max_requests, $curl_options);
$count = 0;
foreach ($urls as $url) {
    $count += 1;
    if ($count % 100 == 0) {
        error_log("Completed {$count} urls");
    }
    if (!preg_match('@^/company/@', $url)) {
        continue;
    }
    $full_url = 'http://api.crunchbase.com/v/1' . $url . '.js';
    $data = array('output_handle' => $output_handle);
    $parallel_curl->startRequest($full_url, 'on_request_done', $data);
}
// This should be called when you need to wait for the requests to finish.
// This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
$parallel_curl->finishAllRequests();
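// The 'on_request_done' callback for this crawler is not shown. A minimal sketch,
// assuming the usual ($content, $url, $ch, $user_data) signature and simply writing
// one JSON record per line to the output handle passed in via $data above:
function on_request_done($content, $url, $ch, $user_data) {
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpcode !== 200) {
        error_log("Fetch error {$httpcode} for '{$url}'");
        return;
    }
    $record = json_decode($content, true);
    if ($record === null) {
        error_log("Could not decode response for '{$url}'");
        return;
    }
    fwrite($user_data['output_handle'], json_encode($record) . "\n");
}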
    $responseobject = json_decode($content, true);
    if (empty($responseobject['responseData']['results'])) {
        print "No results found for '{$search}'\n";
        return;
    }
    print "********\n";
    print "{$search}:\n";
    print "********\n";
    $allresponseresults = $responseobject['responseData']['results'];
    foreach ($allresponseresults as $responseresult) {
        $title = $responseresult['title'];
        print "{$title}\n";
    }
}

// The terms to search for on Google
$terms_list = array(
    "John", "Mary", "William", "Anna", "James", "Emma", "George", "Elizabeth",
    "Charles", "Margaret", "Frank", "Minnie", "Joseph", "Ida", "Henry", "Bertha",
    "Robert", "Clara", "Thomas", "Alice", "Edward", "Annie", "Harry", "Florence",
    "Walter", "Bessie", "Arthur", "Grace", "Fred", "Ethel", "Albert", "Sarah",
    "Samuel", "Ella", "Clarence", "Martha", "Louis", "Nellie", "David", "Mabel",
    "Joe", "Laura", "Charlie", "Carrie", "Richard", "Cora", "Ernest", "Helen",
    "Roy", "Maude", "Will", "Lillian", "Andrew", "Gertrude", "Jesse", "Rose",
    "Oscar", "Edna", "Willie", "Pearl", "Daniel", "Edith", "Benjamin", "Jennie",
    "Carl", "Hattie", "Sam", "Mattie", "Alfred", "Eva", "Earl", "Julia",
    "Peter", "Myrtle", "Elmer", "Louise", "Frederick", "Lillie", "Howard", "Jessie",
    "Lewis", "Frances", "Ralph", "Catherine", "Herbert", "Lula", "Paul", "Lena",
    "Lee", "Marie", "Tom", "Ada", "Herman", "Josephine", "Martin", "Fanny",
    "Jacob", "Lucy", "Michael", "Dora"
);

if (isset($argv[1])) {
    $max_requests = $argv[1];
} else {
    $max_requests = 10;
}

$curl_options = array(
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_SSL_VERIFYHOST => FALSE,
    CURLOPT_USERAGENT => 'Parallel Curl test script'
);

$parallel_curl = new ParallelCurl($max_requests, $curl_options);

foreach ($terms_list as $terms) {
    $search = '"' . $terms . ' is a"';
    $search_url = SEARCH_URL_PREFIX . '&q=' . urlencode($terms);
    $parallel_curl->startRequest($search_url, 'on_request_done', $search);
}

// This should be called when you need to wait for the requests to finish.
// This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
$parallel_curl->finishAllRequests();
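// The snippet above begins partway through the 'on_request_done' callback; its opening
// lines are not shown. A minimal sketch of the likely head, assuming ParallelCurl's
// usual ($content, $url, $ch, $user_data) signature with the search string as user data:
function on_request_done($content, $url, $ch, $search) {
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpcode !== 200) {
        print "Fetch error {$httpcode} for '{$url}'\n";
        return;
    }
    // ... continues with the json_decode() block shown at the top of the snippet above.
}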
function scan_queue($profile_list, $gid) {
    if (donator_level(20)) {
        $max_requests = 24;
    } else {
        $max_requests = 2;
    }
    $curl_options = array(
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 7,
        CURLOPT_FOLLOWLOCATION => TRUE
    );
    $parallel_curl = new ParallelCurl($max_requests, $curl_options);
    foreach ($profile_list as $profile) {
        if ($gid == 440) {
            $url = 'http://api.steampowered.com/IEconItems_440/GetPlayerItems/v0001/?key=' . AKey() . '&SteamID=' . $profile['steamid'] . '&format=json';
            $parallel_curl->startRequest($url, 'scan_440_single', $profile);
        }
        if ($gid == 730) {
            $url = 'http://steamcommunity.com/profiles/' . $profile['steamid'] . '/inventory/json/730/2';
            $parallel_curl->startRequest($url, 'scan_730_single', $profile);
        }
        if ($gid == 570) {
            $url = 'http://api.steampowered.com/IEconItems_570/GetPlayerItems/v0001/?key=' . AKey() . '&SteamID=' . $profile['steamid'] . '&format=json';
            $parallel_curl->startRequest($url, 'scan_570_single', $profile);
        }
        //ob_flush();
        //flush();
    }
    // This should be called when you need to wait for the requests to finish.
    // This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
    $parallel_curl->finishAllRequests();
}
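// The per-game callbacks ('scan_440_single', 'scan_730_single', 'scan_570_single') are
// not part of this snippet. A minimal sketch of one of them, assuming the usual
// ($content, $url, $ch, $user_data) signature and the GetPlayerItems response layout
// (a 'result' object containing 'status' and 'items'):
function scan_440_single($content, $url, $ch, $profile) {
    $data = json_decode($content, true);
    if (empty($data['result']) || $data['result']['status'] != 1) {
        // Status 1 means the inventory was returned; anything else is private/unavailable.
        error_log("No TF2 inventory for SteamID {$profile['steamid']}");
        return;
    }
    $item_count = count($data['result']['items']);
    error_log("SteamID {$profile['steamid']}: {$item_count} items");
}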
// Earlier option handling (not shown here) populates $organization, $location and $query.
$email = $options['email'];
$output = $options['output'];
$threads = 2;
if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address\n");
}
$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';
$curloptions = array(
    CURLOPT_USERAGENT => $agent,
    CURLOPT_TIMEOUT => FETCH_TIMEOUT
);
$location_path = strtolower($location);
$location_path = str_replace(',', '', $location_path);
$location_path = str_replace(' ', '-', $location_path);
$query_path = strtolower($query);
$query_path = str_replace(',', '', $query_path);
$query_path = str_replace(' ', '-', $query_path);
$search_url = YELLOW_PAGES_DOMAIN;
$search_url .= '/';
$search_url .= $location_path;
$search_url .= '/';
$search_url .= $query_path;
$search_url .= '?g=' . urlencode($location);
$search_url .= '&q=' . urlencode($query);
$output_handle = fopen($output, 'w') or die("Couldn't open output file '{$output}'\n");
fputcsv($output_handle, array('name', 'address'));
$parallelcurl = new ParallelCurl($threads, $curloptions);
error_log("Starting with '{$search_url}'");
$parallelcurl->startRequest($search_url, 'parse_page', array('output_handle' => $output_handle));
// Important - if you remove this any pending requests may not be processed
$parallelcurl->finishAllRequests();
fclose($output_handle);
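// The 'parse_page' callback is not included in this snippet. A minimal sketch, assuming
// the usual ($content, $url, $ch, $user_data) signature; the listing-extraction regex
// below is a placeholder, not the original scraping logic:
function parse_page($content, $url, $ch, $user_data) {
    $output_handle = $user_data['output_handle'];
    // Hypothetical markup pattern - the real page structure would need to be inspected.
    if (preg_match_all('/<h3 class="business-name">(.*?)<\/h3>\s*<span class="street-address">(.*?)<\/span>/s',
                       $content, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            $name = trim(strip_tags($match[1]));
            $address = trim(strip_tags($match[2]));
            fputcsv($output_handle, array($name, $address));
        }
    }
}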
    $result = array(
        'page_rank' => $main_page_rank,
        'inbound_link_count' => $inbound_link_count,
        'page_count' => $site_page_count,
        'inbound_links' => $inbound_links
    );
    return $result;
}

function blekko_seo_url($domain) {
    return 'http://blekko.com/ws/' . urlencode($domain) . '+/seo';
}

// Take a string with extra cruft in it, and attempt to strip it out and return a number
function pete_as_numeric($input_value) {
    $clean_value = trim($input_value);
    $clean_value = str_replace(',', '', $clean_value);
    $clean_value = str_replace('%', '', $clean_value);
    $clean_value = str_replace('+', '', $clean_value);
    $clean_value = str_replace('$', '', $clean_value);
    if (is_numeric($clean_value)) {
        $result = $clean_value;
    } else {
        $result = null;
    }
    return $result;
}

set_time_limit(0);

$domain = $_GET['domain'];

$curl_options = array(CURLOPT_USERAGENT => 'PageRankGraph - contact pete@petewarden.com');
$max_requests = 3;

$main_url = blekko_seo_url($domain);

$g_parallel_curl = new ParallelCurl($max_requests, $curl_options);
$g_parallel_curl->startRequest($main_url, 'on_main_request_done', $domain);
$g_parallel_curl->finishAllRequests();

print json_encode($g_domain_info);
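// The 'on_main_request_done' callback (which fills $g_domain_info) is not shown above.
// A minimal sketch, assuming the usual ($content, $url, $ch, $user_data) signature and
// reusing the parsing routine whose tail appears at the top of this snippet (referred
// to here by the hypothetical name parse_blekko_seo_page):
function on_main_request_done($content, $url, $ch, $domain) {
    global $g_domain_info;
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpcode !== 200) {
        $g_domain_info = array('error' => "Fetch error {$httpcode} for '{$domain}'");
        return;
    }
    // parse_blekko_seo_page() would scrape page_rank, inbound_link_count, page_count
    // and inbound_links out of the Blekko SEO page, as sketched at the top of the snippet.
    $g_domain_info = parse_blekko_seo_page($content);
}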
            }
        }
        // Don't forget to grab the first count too, just in case.
        if (preg_match('/<div class="count">([\d]{1,10})<\/div>/', $line, $matches)) {
            $count = (int) $matches[1];
        }
    }
    // Stuff them into the global associative array for later
    // assembly into a DB4 table.
    $bro_tips[$tipID] = array('tip' => $tip, 'count' => $count);
}

// Set up asynchronous cURL wrapper
require_once 'parallelcurl.php';
$curl_options = array(
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_SSL_VERIFYHOST => FALSE,
    CURLOPT_AUTOREFERER => TRUE,
    CURLOPT_FOLLOWLOCATION => TRUE,
    CURLOPT_MAXREDIRS => 3,
    CURLOPT_NOPROGRESS => TRUE
);
$maxConcurrent = 10;
$pcurl = new ParallelCurl($maxConcurrent, $curl_options);
// Set to 10 for testing.
$topTip = 1978;

// Perform scraping. Note that there's no way to randomize the wait
// between requests that I can find. Maybe I'll extend that class?
// Set $i to something > 1500 for testing.
for ($i = 1; $i <= $topTip; ++$i) {
    $pcurl->startRequest("http://www.brotips.com/{$i}", 'store_tip', false);
}
$pcurl->finishAllRequests();

// Since DB4 is being a pile of poo, let's also try CSV as a backup.
$writeTipsCSV = True;
if (!($csvHandle = fopen('brotips.csv', 'w'))) {
    echo "Fffuuuu unable to open brotips.csv file.. ";
    $writeTipsCSV = False;
}
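// Only the tail of the 'store_tip' callback appears at the top of this snippet. A
// minimal sketch of its likely opening, assuming the usual ($content, $url, $ch,
// $user_data) signature; the tip-extraction pattern is a placeholder, not the original:
function store_tip($content, $url, $ch, $user_data) {
    global $bro_tips;
    $tip = '';
    $count = 0;
    // Pull the tip id out of the URL, e.g. http://www.brotips.com/1234 -> 1234.
    $tipID = (int) basename($url);
    foreach (explode("\n", $content) as $line) {
        // Placeholder pattern - the real scraper matches the site's tip markup here.
        if (preg_match('/<div class="tip">(.*?)<\/div>/', $line, $matches)) {
            $tip = trim(strip_tags($matches[1]));
        }
        // ... the count extraction and the rest continue as in the fragment above.
    }
}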
    print $userid . "\t" . json_encode($result) . "\n";
}

$cliargs = array(
    'filepattern' => array(
        'short' => 'f',
        'type' => 'required',
        'description' => 'The files to read the URLs from'
    ),
    'organization' => array(
        'short' => 'o',
        'type' => 'required',
        'description' => 'The name of the organization or company running this crawler'
    ),
    'email' => array(
        'short' => 'e',
        'type' => 'required',
        'description' => 'An email address where server owners can report any problems with this crawler'
    ),
    'threads' => array(
        'short' => 't',
        'type' => 'optional',
        'description' => 'How many requests to run at the same time',
        'default' => 1
    )
);

$options = cliargs_get_options($cliargs);
$filepattern = $options['filepattern'];
$organization = $options['organization'];
$email = $options['email'];
$threads = $options['threads'];

if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address\n");
}

$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';

$curloptions = array(
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_SSL_VERIFYHOST => FALSE,
    CURLOPT_FOLLOWLOCATION => TRUE,
    CURLOPT_USERAGENT => $agent,
    CURLOPT_TIMEOUT => FETCH_TIMEOUT
);

$parallelcurl = new ParallelCurl($threads, $curloptions);

// Loop through all the files, extract all the URLs and process them
foreach (glob($filepattern) as $filename) {
    error_log("Reading {$filename}");
    $filehandle = fopen($filename, 'r');
    $usertotal = 0;
    while (!feof($filehandle)) {
        $currentline = fgets($filehandle);
        $currenturl = trim($currentline);
        $usertotal += 1;
        if ($usertotal % 10000 === 0) {
            error_log(number_format($usertotal) . ' users processed');
        }
        if (empty($currenturl)) {
            continue;
        }
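        // --- Assumed continuation (not part of the original snippet): dispatch each URL
        // --- through ParallelCurl; the callback name 'on_request_done' is hypothetical.
        $parallelcurl->startRequest($currenturl, 'on_request_done', $currenturl);
    }
    fclose($filehandle);
}
$parallelcurl->finishAllRequests();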
private function pullSuggestions($callback) {
    $curl = new ParallelCurl(3);
    foreach ([
        "track" => "http://ws.spotify.com/search/1/track.json?q=" . urlencode($this->search),
        "artist" => "http://ws.spotify.com/search/1/artist.json?q=" . urlencode($this->search),
        "album" => "http://ws.spotify.com/search/1/album.json?q=" . urlencode($this->search)
    ] as $type => $url) {
        $curl->startRequest($url, [$this, 'processRequest'], ['type' => $type, 'callback' => $callback]);
    }
    // Wait for all three lookups to complete (this also happens automatically when
    // $curl goes out of scope and the ParallelCurl destructor runs).
    $curl->finishAllRequests();
}
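// The 'processRequest' method is not shown. A minimal sketch, assuming ParallelCurl's
// usual ($content, $url, $ch, $user_data) signature and the old ws.spotify.com JSON
// layout ('tracks'/'artists'/'albums' arrays of objects with a 'name' field):
public function processRequest($content, $url, $ch, $user_data) {
    $data = json_decode($content, true);
    // The result key is the plural of the requested type, e.g. "track" -> "tracks".
    $key = $user_data['type'] . 's';
    $names = array();
    if (!empty($data[$key])) {
        foreach ($data[$key] as $item) {
            $names[] = $item['name'];
        }
    }
    // Hand the suggestions back to the caller-supplied callback.
    call_user_func($user_data['callback'], $user_data['type'], $names);
}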