/**
 * Requests every collected link once crawling has finished and records the broken ones.
 *
 * Each link accepted by the NoPseudoUrl validator is queued as a GET request on a
 * RollingCurl instance (simultaneous limit 50). When a response arrives, its HTTP
 * status code is stored on the link object; links whose status is not 200 are
 * collected and finally assigned to $this->brokenLinks.
 *
 * @param FilterCrawlerProcessEvent $event Crawler event; not read by this handler.
 *
 * @return void
 */
public function onFinish(FilterCrawlerProcessEvent $event)
 {
     $rollingCurl = new \RollingCurl\RollingCurl();
     $rollingCurl->setSimultaneousLimit(50);
     // kicks out javascript:void(0) and # urls
     $validator = new NoPseudoUrl();
     // Queue one GET request per valid link. The links are objects, so iterating by
     // value already shares the same instance with the callback; iterating by
     // reference would only leave a dangling reference to the last element.
     foreach ($this->links as $link) {
         if ($validator->isValid($link)) {
             // the link travels with the request so the callback can update it
             $rollingCurl->get($link->getLinkHref(), null, array($link));
         }
     }
     $brokenLinks = array();
     $rollingCurl->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use (&$brokenLinks) {
         // the link object was attached as option 0 when the request was queued
         $link = $request->getOptions()[0];
         echo "checking -> " . $link->getLinkHref() . "\n";
         $link->setStatusCode($request->getResponseInfo()['http_code']);
         // anything other than a plain 200 is reported as broken
         if ($link->getStatusCode() != 200) {
             $brokenLinks[] = $link;
         }
     });
     $rollingCurl->execute();
     $this->brokenLinks = $brokenLinks;
 }
Esempio n. 2
0
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
// Fetch five sites in parallel and print each page's <title> as its response arrives.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://yahoo.com')
    ->get('http://google.com')
    ->get('http://hotmail.com')
    ->get('http://msn.com')
    ->get('http://reddit.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Lazy, dot-all match: tolerates attributes on <title>, titles that span
        // several lines, and further </title> tags (e.g. inside inline SVG) later
        // in the document. The cast covers a failed request with no body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // collapse line breaks/indentation so the title prints on one line
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }
        echo "Fetch complete for (" . $request->getUrl() . ") {$title} " . PHP_EOL;
    })
    ->execute();
Esempio n. 3
0
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
// Scrape search-result pages (start=0,10,...,500) and collect link => title pairs
// inside the per-request callback while the remaining requests are still running.
$rollingCurl = new \RollingCurl\RollingCurl();
foreach (range(0, 500, 10) as $offset) {
    // https://www.google.com/search?q=curl&start=10
    $rollingCurl->get('https://www.google.com/search?q=curl&start=' . $offset);
}
$results = array();
$start = microtime(true);
echo "Fetching..." . PHP_EOL;
// Callback: pull every result link out of the page and remember its title,
// keyed by the "q" parameter of the link's query string.
$collect = function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use (&$results) {
    $found = preg_match_all('#<h3 class="r"><a href="([^"]+)">(.*)</a></h3>#iU', $request->getResponseText(), $matches);
    if ($found) {
        foreach ($matches[1] as $position => $href) {
            parse_str(parse_url($href, PHP_URL_QUERY), $query);
            if (!isset($query['q'])) {
                continue;
            }
            $results[$query['q']] = strip_tags($matches[2][$position]);
        }
    }
    echo "Fetch complete for (" . $request->getUrl() . ")" . PHP_EOL;
};
$rollingCurl->setCallback($collect);
$rollingCurl->setSimultaneousLimit(10);
$rollingCurl->execute();
$elapsed = microtime(true) - $start;
echo "...done in " . $elapsed . PHP_EOL;
echo "All results: " . PHP_EOL;
print_r($results);
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
// Queue several sites, each with its own cURL timeout, and report every completion.
$rollingCurl = new \RollingCurl\RollingCurl();
$sites = array(
    'http://yahoo.com' => array(CURLOPT_TIMEOUT => 15),
    'http://google.com' => array(CURLOPT_TIMEOUT => 5),
    'http://hotmail.com' => array(CURLOPT_TIMEOUT => 10),
    'http://msn.com' => array(CURLOPT_TIMEOUT => 10),
    'http://reddit.com' => array(CURLOPT_TIMEOUT => 25),
);
foreach ($sites as $siteUrl => $curlOptions) {
    // build the request by hand so per-request options can be attached
    $siteRequest = new \RollingCurl\Request($siteUrl);
    $siteRequest->setOptions($curlOptions);
    $rollingCurl->add($siteRequest);
}
$announce = function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
    echo 'Fetch complete for (' . $request->getUrl() . ')' . PHP_EOL;
};
$rollingCurl->setCallback($announce);
$rollingCurl->execute();
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
/*
 * This example does the same thing as search scrape, but instead of letting
 * things get processed by the call back, we simply wait until all the HTTP
 * traffic has been run, then we process the request objects one at a time.
 *
 * This is an approach you may wish to take if your callback routine is
 * particularly long running, so as to not tie up the fetching phase as much.
 */
// Queue one search-result page per offset (start=0,10,...,500), run all requests,
// and only afterwards walk the completed requests to extract link => title pairs.
$rollingCurl = new \RollingCurl\RollingCurl();
for ($i = 0; $i <= 500; $i += 10) {
    // https://www.google.com/search?q=curl&start=10
    $rollingCurl->get('https://www.google.com/search?q=curl&start=' . $i);
}
$results = array();
$start = microtime(true);
echo 'Fetching...' . PHP_EOL;
$rollingCurl->setSimultaneousLimit(10)->execute();
echo '...done in ' . (microtime(true) - $start) . PHP_EOL;
foreach ($rollingCurl->getCompletedRequests() as $request) {
    if (preg_match_all('#<h3 class="r"><a href="([^"]+)">(.*)</a></h3>#iU', $request->getResponseText(), $out)) {
        foreach ($out[1] as $idx => $url) {
            // parse_url() yields null when the link has no query string; cast so
            // parse_str() always receives a string
            parse_str((string) parse_url($url, PHP_URL_QUERY), $params);
            // skip links without a "q" parameter instead of raising an
            // undefined-index notice and storing the title under an empty key
            if (isset($params['q'])) {
                $results[$params['q']] = strip_tags($out[2][$idx]);
            }
        }
    }
    echo 'Processsed (' . $request->getUrl() . ')' . PHP_EOL;
}
Esempio n. 6
0
<?php

require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
// using this library to do a single request is a bit silly, but it will work.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://google.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Lazy, dot-all match: tolerates attributes on <title>, titles that span
        // several lines, and further </title> tags later in the document.
        // The cast covers a failed request with no body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // collapse line breaks/indentation so the title prints on one line
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }
        echo "Fetch complete for (" . $request->getUrl() . ") {$title} " . PHP_EOL;
    })
    ->execute();
Esempio n. 7
0
 /**
  * Downloads the attachment of every given schedule into its schedule directory.
  *
  * Phase 1 (loop): for each schedule, mark it as "getting attachments", obtain
  * credentials, reset its directory, build the download URL and probe the
  * attachment size. Schedules within the configured size limit are queued in
  * $this->aRequests; oversized ones are flagged with status -1.
  * Phase 2: all queued requests are run through RollingCurl. The completion
  * callback stores the response info, writes HTTP 200 responses to disk and
  * flags other responses with status -1.
  *
  * Nothing is returned; outcomes live in $this->aRequests and are echoed as log lines.
  *
  * @param array $array_to_send Schedule references, each passed to $this->schedule->get_schedules().
  *
  * @return void
  */
 private function get_attachments($array_to_send)
 {
     ini_set('memory_limit', $this->config->item('memory_limit'));
     $server_url = $this->config->item('tableau_server_url');
     $admin_user = $this->config->item('tableau_admin_user');
     $admin_password = $this->config->item('tableau_admin_pass');
     $userAgent = 'curl/7.22.0 (i686-pc-linux-gnu) libcurl/7.22.0 OpenSSL/1.0.1 zlib/1.2.3.4 libidn/1.23 librtmp/2.3';
     echo '<pre>' . date("Y-m-d H:i:s") . ' Preparando requests de attachments...</pre>' . PHP_EOL;
     foreach ($array_to_send as $scheduleRef) {
         $schedule = $this->schedule->get_schedules($scheduleRef)->row();
         $schedule_id = $schedule->id;
         $view_name = $schedule->view_name;
         $site_url = $schedule->site_url;
         $view_url = $schedule->view_url;
         $format = $schedule->format;
         $parametersArray = json_decode($schedule->parameters);
         $workbook_url = $schedule->workbook_url;
         $user_id = $schedule->user_id;
         // Build the query string: the first parameter is introduced with '?',
         // every further one with '&' (previously all of them used '?', which
         // produced an invalid query string for two or more parameters).
         // NOTE(review): names/values are concatenated as stored, without URL
         // encoding - confirm they are already encoded upstream.
         $parametersString = '';
         if (is_array($parametersArray) || is_object($parametersArray)) {
             foreach ($parametersArray as $value) {
                 $separator = $parametersString === '' ? '?' : '&';
                 $parametersString .= $separator . $value[0] . '=' . $value[2];
             }
         }
         // LOG: Set status = getting attachments
         $this->schedule->update_schedule_process_status($schedule_id, 1);
         if ($this->config->item('log_attachments_verbose') == true) {
             echo date("Y-m-d H:i:s") . ' Obteniendo credenciales para schedule # ' . $schedule_id . PHP_EOL;
         }
         // request credentials to impersonate the user, via the 5th parameter
         $credentials = get_credentials($server_url, $site_url, $admin_user, $admin_password, $user_id);
         $admin_token = $credentials['token'];
         $sessionHeaders = array("Cookie: workgroup_session_id=" . $admin_token);
         $this->delete_previous_schedules($schedule_id);
         $this->make_dir($schedule_id);
         // view_url has the form "<workbook>/<unused segment>/<view>"
         list($workbook, , $view) = explode("/", $view_url);
         // default URL (used for PDF, PNG and any other format): the rendered view
         $url = $server_url . '/t/' . $site_url . '/views' . '/' . $workbook . '/' . $view . '.' . strtolower($format) . $parametersString;
         $filename = $this->get_dir($schedule_id) . '\\' . $view_name . '.' . $format;
         if ($format === 'TWBX') {
             // packaged workbooks are downloaded from the workbook URL instead
             if ($site_url != '') {
                 $url = $server_url . '/t/' . strtoupper($site_url) . '/workbooks' . '/' . rawurlencode($workbook_url) . '.' . strtolower($format) . $parametersString;
             } else {
                 $url = $server_url . '/workbooks' . '/' . rawurlencode($workbook_url) . '.' . strtolower($format) . $parametersString;
             }
         }
         // Check the attachment size against the limit and discard it if it is larger.
         // Tableau does not allow HEAD requests, so a regular request is issued with a
         // progress function that aborts as soon as the first bytes have arrived,
         // which is enough to read the Content-Length.
         $chHeader = curl_init($url);
         $content_length = null;
         curl_setopt($chHeader, CURLOPT_RETURNTRANSFER, true);
         curl_setopt($chHeader, CURLOPT_USERAGENT, $userAgent);
         curl_setopt($chHeader, CURLOPT_HTTPHEADER, $sessionHeaders);
         curl_setopt($chHeader, CURLOPT_NOPROGRESS, false);
         // not sure whether the buffer size actually helps
         curl_setopt($chHeader, CURLOPT_BUFFERSIZE, 1);
         curl_setopt($chHeader, CURLOPT_PROGRESSFUNCTION, function ($resource, $download_size, $downloaded, $upload_size, $uploaded) use (&$content_length) {
             if ($downloaded > 0) {
                 // size in MB; a non-zero return value aborts the transfer
                 $content_length = curl_getinfo($resource, CURLINFO_CONTENT_LENGTH_DOWNLOAD) / 1024 / 1024;
                 return 1;
             }
         });
         // the response body is irrelevant here; only $content_length is used
         curl_exec($chHeader);
         curl_close($chHeader);
         if ($content_length > $this->config->item('max_content_length')) {
             echo date("Y-m-d H:i:s") . ' Error: El archivo del schedule ' . "'" . $schedule_id . "'" . ' supera el limite de ' . $this->config->item('max_content_length') . ' MB. Tiene . ' . round($content_length, 2) . ' MB\'s' . PHP_EOL;
             //LOG: set status = error getting attachment
             $this->schedule->update_schedule_process_status($schedule_id, -1);
         } else {
             $this->aRequests[$schedule_id] = array(
                 'url' => $url,
                 'headers' => $sessionHeaders,
                 'options' => array(
                     CURLOPT_USERAGENT => $userAgent,
                     CURLOPT_HEADERFUNCTION => array($this, 'header_callback'),
                 ),
                 'filename' => $filename,
             );
         }
     }
     // end foreach
     $rollingCurl = new \RollingCurl\RollingCurl();
     foreach ($this->aRequests as $key => $requestInfo) {
         $request = new \RollingCurl\Request($requestInfo['url'], 'GET');
         $request->setExtraInfo(array('schedule_id' => $key));
         $request->setHeaders($requestInfo['headers']);
         $request->addOptions($requestInfo['options']);
         $rollingCurl->add($request);
     }
     $rollingCurl->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
         if ($this->config->item('log_memory_usage') == true) {
             echo date("Y-m-d H:i:s") . ' Memoria usada al entrar al callback ' . round(memory_get_usage() / 1024, 2) . ' KB\'s <br>';
         }
         $info = $request->getResponseInfo();
         $extra = $request->getExtraInfo();
         $schedule_id = $extra['schedule_id'];
         $http_code = $info['http_code'];
         // All of this exists to obtain the file name that header_callback forces us to use.
         // The initial aRequests array does not know the request id, but the RollingCurl
         // library was modified to expose the handler number on the Request.
         // header_callback adds entries to aRequests indexed by that handler, so here
         // the entry pushed for this request's handler is looked up and its data is
         // moved to the entry of the matching schedule_id. If the HTTP code is 200
         // the file is written as well.
         // A strict key lookup replaces the former loose-comparison scan, which could
         // falsely match an integer key 0 on PHP < 8.
         $handlerKey = 'Resource id #' . $request->getHandler();
         if (array_key_exists($handlerKey, $this->aRequests)) {
             $this->aRequests[$schedule_id]['response_info'] = $info;
             $this->aRequests[$schedule_id]['schedule_id'] = $schedule_id;
             if (isset($this->aRequests[$handlerKey]['filename'])) {
                 $this->aRequests[$schedule_id]['filename'] = $this->aRequests[$handlerKey]['filename'];
             }
             if ($http_code == 200) {
                 file_put_contents($this->aRequests[$schedule_id]['filename'], $request->getResponseText());
                 if ($this->config->item('log_attachments_verbose') == true) {
                     echo date("Y-m-d H:i:s") . ' File Saved: ' . $this->aRequests[$schedule_id]['filename'] . '. Size: ' . round(filesize($this->aRequests[$schedule_id]['filename']) / 1024 / 1024, 2) . ' MB\'s' . PHP_EOL;
                 }
                 // the handler-indexed entry is no longer needed; free it early
                 unset($this->aRequests[$handlerKey]);
                 gc_collect_cycles();
             } else {
                 //LOG: set status = error getting attachment
                 $this->schedule->update_schedule_process_status($schedule_id, -1);
             }
         }
     })->setSimultaneousLimit($this->config->item('max_parallel_requests'))->setOptions(array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_FOLLOWLOCATION => 1, CURLOPT_MAXREDIRS => 5, CURLOPT_CONNECTTIMEOUT => 0, CURLOPT_TIMEOUT => 0));
     echo '<pre>' . date("Y-m-d H:i:s") . ' Bajando Attachments...</pre>' . PHP_EOL;
     $rollingCurl->execute();
     echo '<pre>' . date("Y-m-d H:i:s") . ' Finalizados todos los downloads...</pre>' . PHP_EOL;
 }
Esempio n. 8
0
<?php

require __DIR__ . '/vendor/autoload.php';
// Downloads every sound of a soundboard.com board (name given as first CLI
// argument) into ./sounds/, three downloads at a time.
if (!isset($argv[1])) {
    echo 'No board name given (i.e. "Southpark_Sounds_3").';
    die(1);
}
$savePath = __DIR__ . '/sounds/';
$boardName = $argv[1];
// encode the board name so spaces, '&', '#', ... cannot break the query string
$json = file_get_contents('http://www.soundboard.com/handler/gettrackjson.ashx?boardname=' . rawurlencode($boardName));
$entries = $json === false ? null : json_decode($json, true);
if (empty($entries) || !is_array($entries)) {
    echo sprintf('No sounds could be found for board "%s"', $boardName);
    die(1);
}
// make sure the target directory exists before any download tries to write to it
if (!is_dir($savePath) && !mkdir($savePath, 0777, true) && !is_dir($savePath)) {
    echo sprintf('Could not create directory "%s"', $savePath);
    die(1);
}
$rollingCurl = new \RollingCurl\RollingCurl();
$fileNameMap = [];
foreach ($entries as $entry) {
    // ignore malformed entries instead of queueing a request without a URL
    if (!isset($entry['mp3'], $entry['title'])) {
        continue;
    }
    $rollingCurl->get($entry['mp3']);
    // The title comes from the remote service and becomes a file name: replace
    // path separators and NUL bytes so it can never point outside $savePath.
    $fileNameMap[$entry['mp3']] = str_replace(['/', '\\', "\0"], '_', $entry['title']);
}
$rollingCurl->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) use ($fileNameMap, $savePath) {
    $fileName = $fileNameMap[$request->getUrl()];
    $info = $request->getResponseInfo();
    // only responses explicitly served as audio/mpeg are stored
    if (!isset($info['content_type']) || 'audio/mpeg' !== $info['content_type']) {
        echo sprintf('File "%s" is no audio/mpeg file.%s', $fileName, PHP_EOL);
    } else {
        $fileNameExt = $fileName . '.mp3';
        file_put_contents($savePath . $fileNameExt, $request->getResponseText());
        echo sprintf('Downloaded: %s%s', $fileNameExt, PHP_EOL);
    }
})->setSimultaneousLimit(3)->execute();
Esempio n. 9
0
<?php

/******************************************************************************
 * Author: Petr Suchy (xsuchy09) <*****@*****.**> <http://www.wamos.cz>
 * Subject: WAMOS <http://www.wamos.cz>
 * Project: rollingcurl
 * Copyright: (c) Petr Suchy (xsuchy09) <*****@*****.**> <http://www.wamos.cz>
 *****************************************************************************/
require __DIR__ . '/../src/RollingCurl/RollingCurl.php';
require __DIR__ . '/../src/RollingCurl/Request.php';
// Fetch five sites in parallel, print each page's <title> with its execution
// time, and release finished/pending bookkeeping after every response.
$rollingCurl = new \RollingCurl\RollingCurl();
$rollingCurl
    ->get('http://yahoo.com')
    ->get('http://google.com')
    ->get('http://hotmail.com')
    ->get('http://msn.com')
    ->get('http://reddit.com')
    ->setCallback(function (\RollingCurl\Request $request, \RollingCurl\RollingCurl $rollingCurl) {
        // Lazy, dot-all match: tolerates attributes on <title>, titles that span
        // several lines, and further </title> tags later in the document.
        // The cast covers a failed request with no body.
        if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $request->getResponseText(), $out)) {
            // collapse line breaks/indentation so the title prints on one line
            $title = trim(preg_replace('/\s+/', ' ', $out[1]));
        } else {
            $title = '[No Title Tag Found]';
        }
        echo 'Fetch complete for (' . $request->getUrl() . ') ' . $title . ' in ' . round($request->getExecutionTime(), 3) . ' seconds' . PHP_EOL;
        // Clear list of completed requests and prune pending request queue to avoid memory growth
        $rollingCurl->clearCompleted();
        $rollingCurl->prunePendingRequestQueue();
    })
    ->execute();