/**
 * Checks all collected links and records the broken ones.
 *
 * Every link in $this->links that passes the NoPseudoUrl validator is fetched
 * with a GET request through RollingCurl (at most 50 requests at a time). The
 * HTTP status code of each response is stored on the link, and every link whose
 * status code is not 200 ends up in $this->brokenLinks.
 *
 * @param FilterCrawlerProcessEvent $event Not read by this method.
 *
 * @return void
 */
public function onFinish(FilterCrawlerProcessEvent $event)
{
    $rollingCurl = new \RollingCurl\RollingCurl();
    $rollingCurl->setSimultaneousLimit(50);

    // Kicks out javascript:void(0) and # urls.
    $validator = new NoPseudoUrl();

    // Queue a GET request per valid link. The link object travels along as
    // option 0 so the callback can find it again. Links are objects, so plain
    // by-value iteration hands the callback the very same instance; iterating
    // by reference would only turn the slots of $this->links into references
    // and leave a dangling $link reference behind.
    foreach ($this->links as $link) {
        if ($validator->isValid($link)) {
            $rollingCurl->get($link->getLinkHref(), null, array($link));
        }
    }

    $brokenLinks = array();

    $rollingCurl->setCallback(
        function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use (&$brokenLinks) {
            $options = $request->getOptions();
            $link = $options[0];
            $responseInfo = $request->getResponseInfo();

            echo "checking -> " . $link->getLinkHref() . "\n";

            $link->setStatusCode($responseInfo['http_code']);

            // Anything but a plain 200 (including redirects) counts as broken.
            if ($link->getStatusCode() != 200) {
                $brokenLinks[] = $link;
            }
        }
    );

    $rollingCurl->execute();

    $this->brokenLinks = $brokenLinks;
}
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

// Fetches a handful of home pages in parallel and prints the <title> of each one.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://yahoo.com')
    ->get('http://google.com')
    ->get('http://hotmail.com')
    ->get('http://msn.com')
    ->get('http://reddit.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Non-greedy + dot-all: stop at the first closing tag and accept titles
        // that span several lines. [^>]* tolerates attributes on the tag.
        // The cast covers requests that came back without a body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // Collapse line breaks and runs of whitespace so the report stays on one line.
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }

        echo "Fetch complete for (" . $request->getUrl() . ") {$title} " . PHP_EOL;
    })
    ->execute();
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

$rollingCurl = new \RollingCurl\RollingCurl();

// Queue one request per result page, e.g.
// https://www.google.com/search?q=curl&start=10
foreach (range(0, 500, 10) as $offset) {
    $rollingCurl->get('https://www.google.com/search?q=curl&start=' . $offset);
}

$results = array();

// Runs once per finished request: extracts every result link from the page and
// stores its text in $results, keyed by the link's "q" query parameter.
$collectResults = function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use (&$results) {
    $found = preg_match_all(
        '#<h3 class="r"><a href="([^"]+)">(.*)</a></h3>#iU',
        $request->getResponseText(),
        $out
    );

    if ($found) {
        foreach ($out[1] as $idx => $url) {
            parse_str(parse_url($url, PHP_URL_QUERY), $params);

            // Links without a "q" parameter are not search results.
            if (!isset($params['q'])) {
                continue;
            }

            $results[$params['q']] = strip_tags($out[2][$idx]);
        }
    }

    echo "Fetch complete for (" . $request->getUrl() . ")" . PHP_EOL;
};

$start = microtime(true);
echo "Fetching..." . PHP_EOL;

$rollingCurl
    ->setCallback($collectResults)
    ->setSimultaneousLimit(10)
    ->execute();

echo "...done in " . (microtime(true) - $start) . PHP_EOL;

echo "All results: " . PHP_EOL;
print_r($results);
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

$rollingCurl = new \RollingCurl\RollingCurl();

// Each site gets its own cURL options (here: an individual timeout).
$optionsByUrl = array(
    'http://yahoo.com'   => array(CURLOPT_TIMEOUT => 15),
    'http://google.com'  => array(CURLOPT_TIMEOUT => 5),
    'http://hotmail.com' => array(CURLOPT_TIMEOUT => 10),
    'http://msn.com'     => array(CURLOPT_TIMEOUT => 10),
    'http://reddit.com'  => array(CURLOPT_TIMEOUT => 25),
);

// Build the Request objects by hand so the options can be attached per request.
foreach ($optionsByUrl as $siteUrl => $curlOptions) {
    $siteRequest = new \RollingCurl\Request($siteUrl);
    $siteRequest->setOptions($curlOptions);
    $rollingCurl->add($siteRequest);
}

// Prints a line for every finished request.
$reportCompletion = function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
    $fetchedUrl = $request->getUrl();
    echo 'Fetch complete for (' . $fetchedUrl . ')' . PHP_EOL;
};

$rollingCurl
    ->setCallback($reportCompletion)
    ->execute();
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

/*
 * This example does the same thing as search scrape, but instead of letting
 * things get processed by the call back, we simply wait until all the HTTP
 * traffic has been run, then we process the request objects one at a time.
 *
 * This is an approach you may wish to take if your callback routine is
 * particularly long running, so as to not tie up the fetching phase as much.
 */

$rollingCurl = new \RollingCurl\RollingCurl();

for ($i = 0; $i <= 500; $i += 10) {
    // https://www.google.com/search?q=curl&start=10
    $rollingCurl->get('https://www.google.com/search?q=curl&start=' . $i);
}

$results = array();

$start = microtime(true);
echo 'Fetching...' . PHP_EOL;
$rollingCurl->setSimultaneousLimit(10)->execute();
echo '...done in ' . (microtime(true) - $start) . PHP_EOL;

// All traffic is done; now walk the completed requests and extract the results.
foreach ($rollingCurl->getCompletedRequests() as $request) {
    $matched = preg_match_all(
        '#<h3 class="r"><a href="([^"]+)">(.*)</a></h3>#iU',
        (string) $request->getResponseText(),
        $out
    );

    if ($matched) {
        foreach ($out[1] as $idx => $url) {
            // parse_url() yields null (or false) when the link has no usable
            // query string; such links cannot carry a "q" parameter.
            $queryString = parse_url($url, PHP_URL_QUERY);
            if (!is_string($queryString)) {
                continue;
            }

            parse_str($queryString, $params);

            // Only links with a "q" parameter are search results. Without this
            // guard a missing key raises a warning and pollutes $results with
            // an entry under the empty key.
            if (isset($params['q'])) {
                $results[$params['q']] = strip_tags($out[2][$idx]);
            }
        }
    }

    echo 'Processsed (' . $request->getUrl() . ')' . PHP_EOL;
}
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

// Using this library to do a single request is a bit silly, but it will work.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://google.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Non-greedy + dot-all: stop at the first closing tag and accept titles
        // that span several lines. [^>]* tolerates attributes on the tag.
        // The cast covers a request that came back without a body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // Collapse line breaks and runs of whitespace so the report stays on one line.
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }

        echo "Fetch complete for (" . $request->getUrl() . ") {$title} " . PHP_EOL;
    })
    ->execute();
/**
 * Downloads the attachment of every given schedule from the Tableau server.
 *
 * Works in two passes:
 *  1. Per schedule: load it, set its process status to 1 ("getting
 *     attachments"), request credentials to impersonate the schedule's user,
 *     prepare its directory, build the export URL and probe the attachment
 *     size. Schedules within the configured size limit are queued in
 *     $this->aRequests; oversized ones get process status -1.
 *  2. All queued requests are fetched in parallel through RollingCurl. The
 *     callback stores the response info in $this->aRequests, writes every
 *     HTTP 200 response to its file and sets process status -1 on failures.
 *
 * @param array $array_to_send Schedule identifiers; each one is handed to
 *                             $this->schedule->get_schedules().
 *
 * @return void
 */
private function get_attachments($array_to_send)
{
    ini_set('memory_limit', $this->config->item('memory_limit'));

    $server_url = $this->config->item('tableau_server_url');
    $admin_user = $this->config->item('tableau_admin_user');
    $admin_password = $this->config->item('tableau_admin_pass');
    $user_agent = 'curl/7.22.0 (i686-pc-linux-gnu) libcurl/7.22.0 OpenSSL/1.0.1 zlib/1.2.3.4 libidn/1.23 librtmp/2.3';

    echo '<pre>' . date("Y-m-d H:i:s") . ' Preparando requests de attachments...</pre>' . PHP_EOL;

    // First pass: create the requests (and prepare the directories).
    foreach ($array_to_send as $schedule) {
        $query = $this->schedule->get_schedules($schedule);
        $schedule = $query->row();

        $schedule_id = $schedule->id;
        $view_name = $schedule->view_name;
        $site_url = $schedule->site_url;
        $view_url = $schedule->view_url;
        $format = $schedule->format;
        $workbook_url = $schedule->workbook_url;
        $user_id = $schedule->user_id;

        // Build the query string from the stored parameters. Element 0 of each
        // entry is used as the name and element 2 as the value. The first pair
        // is introduced by "?", every further pair by "&" (prefixing each pair
        // with "?" would make the server see a single parameter).
        // NOTE(review): names/values are appended as stored, without URL
        // encoding - confirm whether they are already encoded at this point.
        $parametersArray = json_decode($schedule->parameters);
        $parametersString = '';
        if (is_array($parametersArray) || is_object($parametersArray)) {
            foreach ($parametersArray as $value) {
                $parametersString .= ($parametersString === '' ? '?' : '&') . $value[0] . '=' . $value[2];
            }
        }

        // LOG: set status = getting attachments
        $this->schedule->update_schedule_process_status($schedule_id, 1);

        if ($this->config->item('log_attachments_verbose') == true) {
            echo date("Y-m-d H:i:s") . ' Obteniendo credenciales para schedule # ' . $schedule_id . PHP_EOL;
        }

        // Request credentials to impersonate the user (5th parameter).
        $credentials = get_credentials($server_url, $site_url, $admin_user, $admin_password, $user_id);
        $admin_token = $credentials['token'];
        $session_headers = array("Cookie: workgroup_session_id=" . $admin_token);

        $this->delete_previous_schedules($schedule_id);
        $this->make_dir($schedule_id);

        // Only the first and third path segments of the view URL are needed.
        list($workbook, , $view) = explode("/", $view_url);

        // NOTE(review): the backslash separator is hard-coded - presumably this
        // runs on Windows; confirm before switching to DIRECTORY_SEPARATOR.
        $filename = $this->get_dir($schedule_id) . '\\' . $view_name . '.' . $format;

        // View export URL; used for PDF, PNG and any other format except TWBX.
        $extension = strtolower($format);
        $url = $server_url . '/t/' . $site_url . '/views' . '/' . $workbook . '/' . $view . '.' . $extension . $parametersString;

        // Workbooks (TWBX) are downloaded from the workbooks endpoint instead.
        if ($format === "TWBX") {
            if ($site_url != '') {
                $url = $server_url . '/t/' . strtoupper($site_url) . '/workbooks' . '/' . rawurlencode($workbook_url) . '.' . $extension . $parametersString;
            } else {
                $url = $server_url . '/workbooks' . '/' . rawurlencode($workbook_url) . '.' . $extension . $parametersString;
            }
        }

        // Check the attachment size against the limit before downloading it;
        // if it is bigger, it is discarded. Tableau does not allow HEAD
        // requests, so a regular request is started and the progress function
        // aborts it as soon as the first bytes have arrived - by then the
        // Content-Length is known.
        $content_length = null;
        $chHeader = curl_init($url);
        curl_setopt($chHeader, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($chHeader, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($chHeader, CURLOPT_HTTPHEADER, $session_headers);
        curl_setopt($chHeader, CURLOPT_NOPROGRESS, false);
        curl_setopt($chHeader, CURLOPT_BUFFERSIZE, 1); // unclear whether the buffer size has any effect
        curl_setopt($chHeader, CURLOPT_PROGRESSFUNCTION, function ($resource, $download_size, $downloaded, $upload_size, $uploaded) use (&$content_length) {
            if ($downloaded > 0) {
                // Size in MB; a non-zero return value aborts the transfer.
                $content_length = curl_getinfo($resource, CURLINFO_CONTENT_LENGTH_DOWNLOAD) / 1024 / 1024;
                return 1;
            }
            return 0;
        });
        // The transfer is aborted on purpose, so its result is of no interest.
        curl_exec($chHeader);
        curl_close($chHeader);

        if ($content_length > $this->config->item('max_content_length')) {
            echo date("Y-m-d H:i:s") . ' Error: El archivo del schedule ' . "'" . $schedule_id . "'" . ' supera el limite de ' . $this->config->item('max_content_length') . ' MB. Tiene . ' . round($content_length, 2) . ' MB\'s' . PHP_EOL;
            // LOG: set status = error getting attachment
            $this->schedule->update_schedule_process_status($schedule_id, -1);
        } else {
            $this->aRequests[$schedule_id] = array(
                'url' => $url,
                'headers' => $session_headers,
                'options' => array(
                    CURLOPT_USERAGENT => $user_agent,
                    CURLOPT_HEADERFUNCTION => array($this, 'header_callback'),
                ),
                'filename' => $filename,
            );
        }
    }

    // Second pass: download everything in parallel and save the files.
    $rollingCurl = new \RollingCurl\RollingCurl();

    foreach ($this->aRequests as $key => $requestInfo) {
        $request = new \RollingCurl\Request($requestInfo['url'], 'GET');
        $request->setExtraInfo(array('schedule_id' => $key));
        $request->setHeaders($requestInfo['headers']);
        $request->addOptions($requestInfo['options']);
        $rollingCurl->add($request);
    }

    $rollingCurl->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        if ($this->config->item('log_memory_usage') == true) {
            echo date("Y-m-d H:i:s") . ' Memoria usada al entrar al callback ' . round(memory_get_usage() / 1024, 2) . ' KB\'s <br>';
        }

        $info = $request->getResponseInfo();
        $extra = $request->getExtraInfo();
        $schedule_id = $extra['schedule_id'];
        $http_code = $info['http_code'];
        $handler_key = 'Resource id #' . $request->getHandler();

        // All of this exists to obtain the filename, which forces the use of
        // header_callback. The initial aRequests array does not hold the
        // request's ID, but the RollingCurl library was modified to expose the
        // handler number on the Request. header_callback adds records to
        // aRequests using the handler as index, so here aRequests is scanned
        // for the record header_callback pushed for this handler. Once found,
        // all the information is moved to the matching schedule_id index and,
        // if the HTTP code is 200, the file is saved as well.
        foreach ($this->aRequests as $key => $requestInfo) {
            // Compare as strings: integer schedule ids must never be juggled
            // into a match against the "Resource id #N" keys.
            if ((string) $key !== $handler_key) {
                continue;
            }

            $this->aRequests[$schedule_id]['response_info'] = $info;
            $this->aRequests[$schedule_id]['schedule_id'] = $schedule_id;

            if (isset($this->aRequests[$key]['filename'])) {
                $this->aRequests[$schedule_id]['filename'] = $this->aRequests[$key]['filename'];
            }

            if ($http_code == 200) {
                $target = $this->aRequests[$schedule_id]['filename'];

                if (file_put_contents($target, $request->getResponseText()) === false) {
                    echo date("Y-m-d H:i:s") . ' Error: No se pudo grabar el archivo ' . $target . PHP_EOL;
                    // LOG: set status = error getting attachment
                    $this->schedule->update_schedule_process_status($schedule_id, -1);
                } elseif ($this->config->item('log_attachments_verbose') == true) {
                    echo date("Y-m-d H:i:s") . ' File Saved: ' . $target . '. Size: ' . round(filesize($target) / 1024 / 1024, 2) . ' MB\'s' . PHP_EOL;
                }

                // The handler record is no longer needed; release its memory.
                unset($this->aRequests[$key]);
                gc_collect_cycles();
            } else {
                // LOG: set status = error getting attachment
                $this->schedule->update_schedule_process_status($schedule_id, -1);
            }
        }
    })->setSimultaneousLimit($this->config->item('max_parallel_requests'))->setOptions(array(
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_CONNECTTIMEOUT => 0,
        CURLOPT_TIMEOUT => 0,
    ));

    echo '<pre>' . date("Y-m-d H:i:s") . ' Bajando Attachments...</pre>' . PHP_EOL;
    $rollingCurl->execute();
    echo '<pre>' . date("Y-m-d H:i:s") . ' Finalizados todos los downloads...</pre>' . PHP_EOL;
}
<?php

require __DIR__ . '/vendor/autoload.php';

// Downloads every sound of a soundboard.com board into ./sounds/.
// Usage: php <script> <board name>

if (!isset($argv[1])) {
    echo 'No board name given (i.e. "Southpark_Sounds_3").';
    die(1);
}

$savePath = __DIR__ . '/sounds/';
$boardName = $argv[1];

// The board name comes from the command line; encode it so spaces or special
// characters cannot break the query string.
$json = file_get_contents(
    'http://www.soundboard.com/handler/gettrackjson.ashx?boardname=' . urlencode($boardName)
);
$entries = ($json === false) ? null : json_decode($json, true);

if (!is_array($entries) || empty($entries)) {
    echo sprintf('No sounds could be found for board "%s"', $boardName);
    die(1);
}

// file_put_contents() does not create missing directories.
if (!is_dir($savePath) && !mkdir($savePath, 0777, true) && !is_dir($savePath)) {
    echo sprintf('Could not create target directory "%s"', $savePath);
    die(1);
}

$rollingCurl = new \RollingCurl\RollingCurl();
$fileNameMap = [];

foreach ($entries as $entry) {
    // Skip malformed entries instead of failing on a missing key.
    if (!is_array($entry) || !isset($entry['mp3'], $entry['title'])) {
        continue;
    }

    $rollingCurl->get($entry['mp3']);

    // The title is remote data and becomes a file name: neutralise path
    // separators so it can never point outside of $savePath.
    $fileNameMap[$entry['mp3']] = str_replace(['/', '\\', "\0"], '_', (string) $entry['title']);
}

$rollingCurl->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use ($fileNameMap, $savePath) {
    $url = $request->getUrl();
    $fileName = isset($fileNameMap[$url]) ? $fileNameMap[$url] : $url;
    $responseInfo = $request->getResponseInfo();

    if (!isset($responseInfo['content_type']) || 'audio/mpeg' !== $responseInfo['content_type']) {
        echo sprintf('File "%s" is no audio/mpeg file.%s', $fileName, PHP_EOL);
        return;
    }

    $fileNameExt = $fileName . '.mp3';

    if (file_put_contents($savePath . $fileNameExt, $request->getResponseText()) === false) {
        echo sprintf('Could not write: %s%s', $fileNameExt, PHP_EOL);
        return;
    }

    echo sprintf('Downloaded: %s%s', $fileNameExt, PHP_EOL);
})->setSimultaneousLimit(3)->execute();
<?php
/******************************************************************************
 * Author: Petr Suchy (xsuchy09) <*****@*****.**> <http://www.wamos.cz>
 * Subject: WAMOS <http://www.wamos.cz>
 * Project: rollingcurl
 * Copyright: (c) Petr Suchy (xsuchy09) <*****@*****.**> <http://www.wamos.cz>
 *****************************************************************************/

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';

// Fetches a handful of home pages in parallel and prints the <title> of each
// one together with the time the request took.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://yahoo.com')
    ->get('http://google.com')
    ->get('http://hotmail.com')
    ->get('http://msn.com')
    ->get('http://reddit.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Non-greedy + dot-all: stop at the first closing tag and accept titles
        // that span several lines. [^>]* tolerates attributes on the tag.
        // The cast covers requests that came back without a body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // Collapse line breaks and runs of whitespace so the report stays on one line.
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }

        echo 'Fetch complete for (' . $request->getUrl() . ') ' . $title . ' in ' . round($request->getExecutionTime(), 3) . ' seconds' . PHP_EOL;

        // Clear list of completed requests and prune pending request queue to avoid memory growth
        $rollingCurl->clearCompleted();
        $rollingCurl->prunePendingRequestQueue();
    })
    ->execute();