Example #1
 public function getAllMetadata()
 {
     $GLOBALS['metastores'] = array();
     $curl_options = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_USERAGENT => 'Parallel Curl google API request');
     $parallel_curl = new ParallelCurl(count($this->videoResults), $curl_options);
     foreach ($this->videoResults as $id) {
         // Responses are collected by the on_request_done callback into $GLOBALS['metastores'].
         $search_url = "https://www.googleapis.com/youtube/v3/videos?id=" . $id . "&key=" . API_KEY . "&part=snippet,statistics,contentDetails,recordingDetails";
         $parallel_curl->startRequest($search_url, 'on_request_done', array('id' => $id));
     }
     $parallel_curl->finishAllRequests();
     return $GLOBALS['metastores'];
 }
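The on_request_done callback isn't shown in this excerpt. ParallelCurl invokes callbacks as callback($content, $url, $ch, $user_data); a minimal sketch that fills $GLOBALS['metastores'], assuming a hypothetical MetadataStore::loadFromArray() helper, might look like:

// Minimal sketch of the callback, not the original implementation.
// MetadataStore::loadFromArray() is an assumed helper.
function on_request_done($content, $url, $ch, $user_data)
{
    if (curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200 || empty($content)) {
        return; // skip failed requests
    }
    $metastore = new MetadataStore();
    $metastore->loadFromArray(json_decode($content, true));
    $GLOBALS['metastores'][$user_data['id']] = $metastore;
}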
Example #2
 /**
  * Multi-threaded cURL method
  */
 public function curlResult($urls)
 {
     include_once WEB_PATH . 'lib/parallelCurl.class.php';
     // Maximum number of concurrent requests; note $argv is only set in CLI scope.
     $max_requests = isset($argv[1]) ? $argv[1] : 20;
     $curl_options = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_USERAGENT => 'Parallel Curl test script');
     $parallelCurl = new ParallelCurl($max_requests, $curl_options);
     foreach ($urls as $key => $terms) {
         if (empty($terms)) {
             continue;
         }
         $parallelCurl->startRequest($terms, '', array($key));
     }
     $parallelCurl->finishAllRequests();
     return $parallelCurl->result;
 }
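Two caveats with this method: the stock ParallelCurl neither accepts an empty-string callback nor exposes a $result property, so the code presumably relies on a locally modified class. A hedged sketch of a subclass that would make it work:

// Hypothetical subclass (assumption): collects each response body so that
// $parallelCurl->result can be returned after finishAllRequests().
class CollectingParallelCurl extends ParallelCurl
{
    public $result = array();

    public function collect($content, $url, $ch, $user_data)
    {
        $key = isset($user_data[0]) ? $user_data[0] : $url;
        $this->result[$key] = $content;
    }
}

With such a subclass, the loop above would register the collector explicitly: $parallelCurl->startRequest($terms, array($parallelCurl, 'collect'), array($key)); instead of passing the empty-string callback.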
Example #3
$input = $options['input'];
$output = $options['output'];
$max_requests = $options['maxrequests'];
$organization = $options['organization'];
$email = $options['email'];
if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address (found '{$organization}', '{$email}')\n");
}
$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';
$curl_options = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_USERAGENT => $agent);
$urls_string = file_get_contents($input);
$urls = explode("\n", $urls_string);
$output_handle = fopen($output, 'w');
$parallel_curl = new ParallelCurl($max_requests, $curl_options);
$count = 0;
foreach ($urls as $url) {
    $count += 1;
    if ($count % 100 == 0) {
        error_log("Completed {$count} urls");
    }
    if (!preg_match('@^/company/@', $url)) {
        continue;
    }
    $full_url = 'http://api.crunchbase.com/v/1' . $url . '.js';
    $data = array('output_handle' => $output_handle);
    $parallel_curl->startRequest($full_url, 'on_request_done', $data);
}
// This should be called when you need to wait for the requests to finish.
// This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
$parallel_curl->finishAllRequests();
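The on_request_done callback for this crawler isn't shown; a minimal sketch that appends each successful response to the shared output handle passed via the user data:

// Minimal sketch of the callback, not the original implementation.
function on_request_done($content, $url, $ch, $data)
{
    if (curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200 || empty($content)) {
        error_log("Fetch failed for {$url}");
        return;
    }
    fwrite($data['output_handle'], $content . "\n");
}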
Example #4
function on_request_done($content, $url, $ch, $search)
{
    $responseobject = json_decode($content, true);
    if (empty($responseobject['responseData']['results'])) {
        print "No results found for '{$search}'\n";
        return;
    }
    print "********\n";
    print "{$search}:\n";
    print "********\n";
    $allresponseresults = $responseobject['responseData']['results'];
    foreach ($allresponseresults as $responseresult) {
        $title = $responseresult['title'];
        print "{$title}\n";
    }
}
// The terms to search for on Google
$terms_list = array("John", "Mary", "William", "Anna", "James", "Emma", "George", "Elizabeth", "Charles", "Margaret", "Frank", "Minnie", "Joseph", "Ida", "Henry", "Bertha", "Robert", "Clara", "Thomas", "Alice", "Edward", "Annie", "Harry", "Florence", "Walter", "Bessie", "Arthur", "Grace", "Fred", "Ethel", "Albert", "Sarah", "Samuel", "Ella", "Clarence", "Martha", "Louis", "Nellie", "David", "Mabel", "Joe", "Laura", "Charlie", "Carrie", "Richard", "Cora", "Ernest", "Helen", "Roy", "Maude", "Will", "Lillian", "Andrew", "Gertrude", "Jesse", "Rose", "Oscar", "Edna", "Willie", "Pearl", "Daniel", "Edith", "Benjamin", "Jennie", "Carl", "Hattie", "Sam", "Mattie", "Alfred", "Eva", "Earl", "Julia", "Peter", "Myrtle", "Elmer", "Louise", "Frederick", "Lillie", "Howard", "Jessie", "Lewis", "Frances", "Ralph", "Catherine", "Herbert", "Lula", "Paul", "Lena", "Lee", "Marie", "Tom", "Ada", "Herman", "Josephine", "Martin", "Fanny", "Jacob", "Lucy", "Michael", "Dora");
if (isset($argv[1])) {
    $max_requests = $argv[1];
} else {
    $max_requests = 10;
}
$curl_options = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_USERAGENT => 'Parallel Curl test script');
$parallel_curl = new ParallelCurl($max_requests, $curl_options);
foreach ($terms_list as $terms) {
    $search = '"' . $terms . ' is a"';
    $search_url = SEARCH_URL_PREFIX . '&q=' . urlencode($search);
    $parallel_curl->startRequest($search_url, 'on_request_done', $search);
}
// This should be called when you need to wait for the requests to finish.
// This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
$parallel_curl->finishAllRequests();
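SEARCH_URL_PREFIX is defined outside this snippet; for the (long since retired) Google AJAX Search API whose responseData/results structure the callback reads, the definition would have looked something like:

// Assumed definition; this API has been shut down and is shown only for context.
define('SEARCH_URL_PREFIX', 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0');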
Example #5
function scan_queue($profile_list, $gid)
{
    if (donator_level(20)) {
        $max_requests = 24;
    } else {
        $max_requests = 2;
    }
    $curl_options = array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_CONNECTTIMEOUT => 5, CURLOPT_TIMEOUT => 7, CURLOPT_FOLLOWLOCATION => TRUE);
    $parallel_curl = new ParallelCurl($max_requests, $curl_options);
    foreach ($profile_list as $profile) {
        if ($gid == 440) {
            $url = 'http://api.steampowered.com/IEconItems_440/GetPlayerItems/v0001/?key=' . AKey() . '&SteamID=' . $profile['steamid'] . '&format=json';
            $parallel_curl->startRequest($url, 'scan_440_single', $profile);
        }
        if ($gid == 730) {
            $url = 'http://steamcommunity.com/profiles/' . $profile['steamid'] . '/inventory/json/730/2';
            $parallel_curl->startRequest($url, 'scan_730_single', $profile);
        }
        if ($gid == 570) {
            $url = 'http://api.steampowered.com/IEconItems_570/GetPlayerItems/v0001/?key=' . AKey() . '&SteamID=' . $profile['steamid'] . '&format=json';
            $parallel_curl->startRequest($url, 'scan_570_single', $profile);
        }
        //ob_flush();
        //flush();
    }
    // This should be called when you need to wait for the requests to finish.
    // This will automatically run on destruct of the ParallelCurl object, so the next line is optional.
    $parallel_curl->finishAllRequests();
}
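The scan_440_single, scan_730_single and scan_570_single callbacks aren't shown; each receives the response body plus the $profile row passed as user data. A sketch for the TF2 (app 440) case, with process_440_items() as an assumed helper:

// Hypothetical sketch, not the original callback.
function scan_440_single($content, $url, $ch, $profile)
{
    $response = json_decode($content, true);
    if (empty($response['result']['items'])) {
        return; // private backpack, timeout, or malformed response
    }
    process_440_items($profile['steamid'], $response['result']['items']);
}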
Example #6
$organization = $options['organization'];
$email = $options['email'];
$location = $options['location'];
$query = $options['query'];
$output = $options['output'];
$threads = 2;
if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address\n");
}
$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';
$curloptions = array(CURLOPT_USERAGENT => $agent, CURLOPT_TIMEOUT => FETCH_TIMEOUT);
$location_path = strtolower($location);
$location_path = str_replace(',', '', $location_path);
$location_path = str_replace(' ', '-', $location_path);
$query_path = strtolower($query);
$query_path = str_replace(',', '', $query_path);
$query_path = str_replace(' ', '-', $query_path);
$search_url = YELLOW_PAGES_DOMAIN;
$search_url .= '/';
$search_url .= $location_path;
$search_url .= '/';
$search_url .= $query_path;
$search_url .= '?g=' . urlencode($location);
$search_url .= '&q=' . urlencode($query);
$output_handle = fopen($output, 'w') or die("Couldn't open output file '{$output}'\n");
fputcsv($output_handle, array('name', 'address'));
$parallelcurl = new ParallelCurl($threads, $curloptions);
error_log("Starting with '{$search_url}'");
$parallelcurl->startRequest($search_url, 'parse_page', array('output_handle' => $output_handle));
// Important - if you remove this any pending requests may not be processed
$parallelcurl->finishAllRequests();
fclose($output_handle);
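parse_page isn't shown; a sketch of the shape it would take (the regular expression is a placeholder assumption, not the original markup parser) pulls name/address pairs out of each result page and writes them with fputcsv:

// Hypothetical sketch; the real markup-specific parsing is not part of this snippet.
function parse_page($content, $url, $ch, $data)
{
    if (preg_match_all('@<h3 class="business-name">(.*?)</h3>\s*<p class="adr">(.*?)</p>@s', $content, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            fputcsv($data['output_handle'], array(strip_tags($match[1]), strip_tags($match[2])));
        }
    }
}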
Example #7
    $result = array('page_rank' => $main_page_rank, 'inbound_link_count' => $inbound_link_count, 'page_count' => $site_page_count, 'inbound_links' => $inbound_links);
    return $result;
}
function blekko_seo_url($domain)
{
    return 'http://blekko.com/ws/' . urlencode($domain) . '+/seo';
}
// Take a string with extra cruft in it, and attempt to strip it out and return a number
function pete_as_numeric($input_value)
{
    $clean_value = trim($input_value);
    $clean_value = str_replace(',', '', $clean_value);
    $clean_value = str_replace('%', '', $clean_value);
    $clean_value = str_replace('+', '', $clean_value);
    $clean_value = str_replace('$', '', $clean_value);
    if (is_numeric($clean_value)) {
        $result = $clean_value;
    } else {
        $result = null;
    }
    return $result;
}
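For example, pete_as_numeric handles figures the way they appear on the page:

var_dump(pete_as_numeric(' 1,234 ')); // string(4) "1234"
var_dump(pete_as_numeric('56%'));     // string(2) "56"
var_dump(pete_as_numeric('n/a'));     // NULL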
set_time_limit(0);
$domain = $_GET['domain'];
$curl_options = array(CURLOPT_USERAGENT => 'PageRankGraph - contact pete@petewarden.com');
$max_requests = 3;
$main_url = blekko_seo_url($domain);
$g_parallel_curl = new ParallelCurl($max_requests, $curl_options);
$g_parallel_curl->startRequest($main_url, 'on_main_request_done', $domain);
$g_parallel_curl->finishAllRequests();
print json_encode($g_domain_info);
Example #8
            }
        }
        // Don't forget to grab the first count too, just in case.
        if (preg_match('/<div class="count">([\\d]{1,10})<\\/div>/', $line, $matches)) {
            $count = (int) $matches[1];
        }
    }
    // Stuff them into the global associative array for later
    // assembly into a DB4 table.
    $bro_tips[$tipID] = array('tip' => $tip, 'count' => $count);
}
// Set up the parallel (asynchronous) cURL wrapper
require_once 'parallelcurl.php';
$curl_options = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_AUTOREFERER => TRUE, CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_MAXREDIRS => 3, CURLOPT_NOPROGRESS => TRUE);
$maxConcurrent = 10;
$pcurl = new ParallelCurl($maxConcurrent, $curl_options);
// Set $topTip to something small (e.g. 10) for testing.
$topTip = 1978;
// Perform scraping. Note that there's no way to randomize the wait
// between requests that I can find. Maybe I'll extend that class?
// Set $i to something > 1500 for testing.
for ($i = 1; $i <= $topTip; ++$i) {
    $pcurl->startRequest("http://www.brotips.com/{$i}", 'store_tip', false);
}
$pcurl->finishAllRequests();
// Since DB4 is being a pile of poo, let's also try CSV as a backup.
$writeTipsCSV = true;
if (!($csvHandle = fopen('brotips.csv', 'w'))) {
    echo "Unable to open brotips.csv file.\n";
    $writeTipsCSV = false;
}
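The loop that writes the rows is cut off in this excerpt; given the $bro_tips structure built above, it would plausibly continue along these lines:

// Hypothetical continuation: dump the scraped tips to CSV, keyed by tip id.
if ($writeTipsCSV) {
    fputcsv($csvHandle, array('id', 'tip', 'count'));
    foreach ($bro_tips as $tipID => $row) {
        fputcsv($csvHandle, array($tipID, $row['tip'], $row['count']));
    }
    fclose($csvHandle);
}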
Example #9
    print $userid . "\t" . json_encode($result) . "\n";
}
$cliargs = array(
    'filepattern' => array('short' => 'f', 'type' => 'required', 'description' => 'The files to read the URLs from'),
    'organization' => array('short' => 'o', 'type' => 'required', 'description' => 'The name of the organization or company running this crawler'),
    'email' => array('short' => 'e', 'type' => 'required', 'description' => 'An email address where server owners can report any problems with this crawler'),
    'threads' => array('short' => 't', 'type' => 'optional', 'description' => 'How many requests to run at the same time', 'default' => 1),
);
$options = cliargs_get_options($cliargs);
$filepattern = $options['filepattern'];
$organization = $options['organization'];
$email = $options['email'];
$threads = $options['threads'];
if (empty($organization) || empty($email) || !strpos($email, '@')) {
    die("You need to specify a valid organization and email address\n");
}
$agent = 'Crawler from ' . $organization;
$agent .= ' - contact ' . $email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';
$curloptions = array(CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_USERAGENT => $agent, CURLOPT_TIMEOUT => FETCH_TIMEOUT);
$parallelcurl = new ParallelCurl($threads, $curloptions);
// Loop through all the files, extract all the URLs and process them
foreach (glob($filepattern) as $filename) {
    error_log("Reading {$filename}");
    $filehandle = fopen($filename, 'r');
    $usertotal = 0;
    while (!feof($filehandle)) {
        $currentline = fgets($filehandle);
        $currenturl = trim($currentline);
        $usertotal += 1;
        if ($usertotal % 10000 === 0) {
            error_log(number_format($usertotal) . ' users processed');
        }
        if (empty($currenturl)) {
            continue;
        }
Example #10
 private function pullSuggestions($callback)
 {
     $curl = new ParallelCurl(3);
     foreach (["track" => "http://ws.spotify.com/search/1/track.json?q=" . urlencode($this->search), "artist" => "http://ws.spotify.com/search/1/artist.json?q=" . urlencode($this->search), "album" => "http://ws.spotify.com/search/1/album.json?q=" . urlencode($this->search)] as $type => $url) {
         $curl->startRequest($url, [$this, 'processRequest'], ['type' => $type, 'callback' => $callback]);
     }
 }
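processRequest isn't shown; as a ParallelCurl callback on the same class it would receive the response plus the type/callback pair passed as user data. A minimal sketch, assuming the old ws.spotify.com JSON layout (tracks/artists/albums keys):

 // Hypothetical sketch, not the original method.
 public function processRequest($content, $url, $ch, $userData)
 {
     $decoded = json_decode($content, true);
     $key = $userData['type'] . 's'; // track -> tracks, artist -> artists, album -> albums
     $items = isset($decoded[$key]) ? $decoded[$key] : array();
     call_user_func($userData['callback'], $userData['type'], $items);
 }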