Example #1
0
 /**
  * Scrape a product listing page together with every product page it links to.
  *
  * Each anchor matched by the "productInfo" XPath is followed; the title, unit
  * price, page size and description found on the target page are stored on a
  * new Product, which is appended to the returned list.
  *
  * @param string $url URL of the listing page
  * @return ProductList one Product per product link found on the listing page
  */
 public function scrape($url)
 {
     $productList = new ProductList();
     $client = new Client();
     $crawler = $client->request('GET', $url);
     $crawler->filterXPath('//*[@class="productInfo"]/h3/a')->each(function ($node) use ($productList, $client) {
         /** @var Crawler $node */
         $product = new Product();
         $subPage = $client->click($node->link());
         $subPage->filter('.productTitleDescriptionContainer > h1')->first()->each(function ($node) use ($product) {
             /** @var Crawler $node */
             $product->title = trim($node->text());
         });
         // Size of the product page markup in KiB, formatted with two decimals.
         $product->size = sprintf("%.2f", strlen($subPage->html()) / 1024);
         $subPage->filter('.pricePerUnit')->each(function ($node) use ($product) {
             /** @var Crawler $node */
             // Keep only the numeric part of the price text. When the text holds
             // no number the price is null, instead of reading an undefined
             // $price[0] as before.
             $product->unitPrice = preg_match("/[\\d\\.]+/", $node->text(), $price) ? $price[0] : null;
         });
         $subPage->filter('htmlcontent > div')->first()->each(function ($node) use ($product) {
             /** @var Crawler $node */
             $product->description = trim($node->text());
         });
         $productList->addProduct($product);
     });
     return $productList;
 }
Example #2
0
 /**
  * Fetch the configured start URL, process it and optionally walk all following pages.
  *
  * A transfer failure on the first request prints the error and terminates the
  * process with exit code 1. A non-200 response prints "site not available".
  *
  * @param boolean $allPages when truthy, keep following the link returned by
  *                          getNextLink() until it returns a falsy value
  */
 public function run($allPages)
 {
     $client = new Client();
     $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 30);
     $client->setHeader('User-Agent', $this->config['user_agent']);

     try {
         $crawler = $client->request('GET', $this->config['url']);
     } catch (TransferException $e) {
         echo $e->getMessage() . PHP_EOL;
         exit(1);
     }

     if ($client->getResponse()->getStatus() != 200) {
         echo "site not available\n";
         return;
     }

     $this->getUrlsAndDownload($crawler);
     if (!$allPages) {
         return;
     }

     // Page through the results: each click yields the crawler used to find the next link.
     for ($link = $this->getNextLink($crawler); $link; $link = $this->getNextLink($crawler)) {
         $crawler = $client->click($link);
         $this->getUrlsAndDownload($crawler);
     }
 }
Example #3
0
 /**
  * Clicking "Create an account" on the home page must lead to a page with
  * exactly one matching registration heading.
  */
 public function testUserClicksRegLinkAndIsTakenToRegPage()
 {
     $browser = new Client();
     $homePage = $browser->request('GET', 'http://localhost:8000');

     $registrationPage = $browser->click($homePage->selectLink('Create an account')->link());

     $headings = $registrationPage->filter('h1:contains("Create a TODOParrot Account")');
     $this->assertCount(1, $headings);
 }
Example #4
0
 /**
  * The "Contact Us" link must point at the contact URL, and following it must
  * lead to a page with exactly one matching "Contact Us" heading.
  */
 public function testUserClicksContactLinkAndIsTakenToContactPage()
 {
     $browser = new Client();
     $homePage = $browser->request('GET', 'http://homestead.app/');

     $contactLink = $homePage->selectLink('Contact Us')->link();
     $this->assertEquals('http://homestead.app/about/contact', $contactLink->getUri());

     $contactPage = $browser->click($contactLink);
     $headings = $contactPage->filter('h1:contains("Contact Us")');
     $this->assertCount(1, $headings);
 }
Example #5
0
 /**
  * Crawl Google result pages for drupalcode.org repositories that expose a
  * composer.json, then merge the repository URLs found into the satis.json
  * file in the current working directory.
  *
  * At most 25 result pages are visited; crawling stops earlier when the
  * navigation table has no link for the next page number.
  *
  * @param InputInterface  $input  The input instance
  * @param OutputInterface $output The output instance
  */
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $pages = 25;
     $page = 1;
     $output->writeln('<info>Beginning Google crawl</info>');
     $query = new QueryString(array('q' => 'site:drupalcode.org "composer.json" "drupal-module"'));
     $url = 'http://www.google.com/search?' . $query;
     // Load page 1
     $client = new Client();
     $crawler = $client->request('GET', $url);
     // Repository URLs are collected as array keys so duplicates collapse.
     $repos = array();
     // Crawl through search pages.
     do {
         $current = $client->getHistory()->current()->getUri();
         $output->writeln('<info>Crawling:</info> ' . $current);
         // Use a CSS filter to select only the result links:
         $links = $crawler->filter('li h3 a');
         // Search the links for the domain:
         foreach ($links as $link) {
             $href = $link->getAttribute('href');
             $target = QueryString::fromString(parse_url($href, PHP_URL_QUERY))->get('q');
             // A result link without a "q" parameter cannot be a match.
             if (!is_string($target)) {
                 continue;
             }
             // Match pages with composer.json in root and capture the git URL
             // in the same pass. Dots are escaped so that "drupalcode.org" and
             // "composer.json" only match literally.
             $matches = array();
             if (preg_match('#^(http://drupalcode\.org.+\.git)/.+/composer\.json$#i', $target, $matches)) {
                 // Rewrite to git.drupal.org then store unique matches.
                 $repo = str_replace('http://drupalcode.org/', 'http://git.drupal.org/', $matches[1]);
                 $repos[$repo] = null;
                 $output->writeln('<info>Found:</info> ' . $repo);
             }
         }
         // Turn the page.
         $page++;
         $node = $crawler->filter('table#nav')->selectLink((string) $page);
         if (!$node->count()) {
             break;
         }
         $crawler = $client->click($node->link());
     } while ($page < $pages);

     $file = new JsonFile(getcwd() . '/satis.json');
     $data = $file->read();
     // Keep the repositories already listed in satis.json; the key may be
     // absent in a fresh file.
     if (isset($data['repositories']) && is_array($data['repositories'])) {
         foreach ($data['repositories'] as $file_repo) {
             $repos[$file_repo['url']] = null;
         }
     }
     $repos = array_keys($repos);
     sort($repos);
     $data['repositories'] = array();
     foreach ($repos as $repo) {
         $data['repositories'][] = array('url' => $repo, 'type' => 'vcs');
     }
     $file->write((array) $data);
 }
 /**
  * Step through the practical driving test application form for the given
  * licence and test centre, print the slot dates offered and optionally mail them.
  *
  * Stops after printing "Captcha!" when the expected submit button is missing
  * at step 5.
  *
  * @param InputInterface  $input  provides the "licence" and "center" arguments
  *                                and the "filter" and "mail" options
  * @param OutputInterface $output receives progress lines and the resulting dates
  */
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $licenceNumber = $input->getArgument('licence');
     $centreName = $input->getArgument('center');
     $dateFilter = $input->getOption('filter');
     $recipient = $input->getOption('mail');

     $browser = new Client();
     $page = $browser->request('GET', 'https://driverpracticaltest.direct.gov.uk/application');

     $output->writeln('Step 1');
     $page = $browser->submit($page->selectButton('testTypeCar')->form());

     $output->writeln('Step 2');
     $licenceForm = $page->selectButton('drivingLicenceSubmit')->form();
     $licenceForm->setValues(['driverLicenceNumber' => $licenceNumber, 'extendedTest' => 'false', 'specialNeeds' => 'false']);
     $page = $browser->submit($licenceForm);

     $output->writeln('Step 3');
     $centreForm = $page->selectButton('testCentreSubmit')->form();
     $centreForm->setValues(['testCentreName' => $centreName]);
     $page = $browser->submit($centreForm);

     $output->writeln('Step 4');
     $page = $browser->click($page->filter('.test-centre-results > li > a')->first()->link());

     $output->writeln('Step 5');
     $submitButton = $page->selectButton('drivingLicenceSubmit');
     if ($submitButton->count() == 0) {
         $output->writeln('Captcha!');
         //TODO: display captcha image and ask to solve? Use decaptcha?
         return;
     }
     $dateForm = $submitButton->form();
     // Today's date is submitted as the preferred test date.
     $dateForm->setValues(['preferredTestDate' => (new \DateTime())->format('d/m/y')]);
     $page = $browser->submit($dateForm);

     $output->writeln('Step 6');
     $slotDates = $page->filter('.slotDateTime')->each(function ($slot) {
         return $slot->text();
     });
     if ($dateFilter) {
         $slotDates = (new DateFilter())->filterDates($slotDates, $dateFilter);
     }
     foreach ($slotDates as $slotDate) {
         $output->writeln($slotDate);
     }
     if (count($slotDates) && $recipient) {
         (new DateMailer())->mail($recipient, $slotDates);
     }
 }
 /**
  * Log in to an existing driving test booking, open the date/time change
  * page, print the slot dates offered and optionally mail them.
  *
  * Stops after printing "Captcha!" when the expected submit button is missing
  * at step 3.
  *
  * @param InputInterface  $input  provides the "licence" and "reference" arguments
  *                                and the "filter" and "mail" options
  * @param OutputInterface $output receives progress lines and the resulting dates
  */
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $licenceNumber = $input->getArgument('licence');
     $bookingReference = $input->getArgument('reference');
     $dateFilter = $input->getOption('filter');
     $recipient = $input->getOption('mail');

     $browser = new Client();
     $page = $browser->request('GET', 'https://driverpracticaltest.direct.gov.uk/login');

     $output->writeln('Step 1');
     $loginForm = $page->selectButton('booking-login')->form();
     $loginForm->setValues(['username' => $licenceNumber, 'password' => $bookingReference]);
     $page = $browser->submit($loginForm);

     $output->writeln('Step 2');
     $page = $browser->click($page->filter('#date-time-change')->first()->link());

     $output->writeln('Step 3');
     $submitButton = $page->selectButton('drivingLicenceSubmit');
     if ($submitButton->count() == 0) {
         $output->writeln('Captcha!');
         //TODO: display captcha image and ask to solve? Use decaptcha?
         return;
     }
     $page = $browser->submit($submitButton->form());

     $output->writeln('Step 4');
     $slotDates = $page->filter('.slotDateTime')->each(function ($slot) {
         return $slot->text();
     });
     if ($dateFilter) {
         $slotDates = (new DateFilter())->filterDates($slotDates, $dateFilter);
     }
     foreach ($slotDates as $slotDate) {
         $output->writeln($slotDate);
     }
     if (count($slotDates) && $recipient) {
         (new DateMailer())->mail($recipient, $slotDates);
     }
 }
 /**
  * Scrape the title, phone and e-mail behind the "reply" link of the first
  * stored Link and persist them as a Scrap record.
  *
  * Requests are sent through the SOCKS5 proxy at 127.0.0.1:9050. Whenever the
  * page text reports the client as blocked, or the reply page shows a captcha,
  * torNew() is called and the listing page is requested again before retrying.
  * NOTE(review): the retry loop is unbounded, as it was before - consider a cap.
  *
  * @return void
  */
 public function getData()
 {
     $link = Link::first();
     if (!$link) {
         // Nothing stored yet, so there is no page to scrape.
         return;
     }
     $client = new Client();
     $client->setHeader('User-Agent', "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36");
     //Set proxy using tor
     $guzzleClient = new \GuzzleHttp\Client(['curl' => [CURLOPT_PROXY => '127.0.0.1:9050', CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5]]);
     $client->setClient($guzzleClient);
     $crawler = $client->request('GET', $link->url);
     while (true) {
         // Strict comparison: strpos() returns 0 (not false) when the text
         // starts with "blocked", which the loose "!= false" check missed.
         if (strpos($crawler->filter('p')->text(), 'blocked') !== false) {
             $this->torNew();
             $crawler = $client->request('GET', $link->url);
             continue;
         }
         $replyPage = $client->click($crawler->selectLink('reply')->link());
         if ($replyPage->filterXpath("//div[@class='captcha']")->count()) {
             $this->torNew();
             // Start over from the listing page; the captcha page has no
             // "reply" link to follow on the next iteration.
             $crawler = $client->request('GET', $link->url);
             continue;
         }
         // NOTE(review): debug dump of the whole reply page kept from the original.
         var_dump($replyPage->html());
         $title = $replyPage->filter('title')->text();
         $mobile = $replyPage->filter('.mobile-only')->first()->text();
         $email = $replyPage->filter('.mailapp')->first()->text();
         echo $link->url . ' ' . $title . ' ' . $mobile . ' ' . $email;
         Scrap::create(['url' => $link->url, 'title' => $title, 'email' => $email, 'phone' => $mobile]);
         return;
     }
 }
Example #9
0
 /**
  * Follow the given link and read the href of the first ".magnet" element
  * on the page it leads to.
  *
  * @param Link $link link to follow
  * @return string value of the element's href attribute
  */
 protected function findMagnet(Link $link)
 {
     $targetPage = $this->client->click($link);
     $magnetNode = $targetPage->filter('.magnet')->first();

     return $magnetNode->attr('href');
 }
<?php

require_once "vendor/autoload.php";
use Goutte\Client;
// Scrapes the news listing of the Itajaí city hall website and hands the
// collected items to the view included at the end of the script.
$client = new Client();
$url = 'http://www.itajai.sc.gov.br';
// Open the Itajaí city hall website
$crawler = $client->request('GET', $url);
// Select the "Notícias" (news) link
$link = $crawler->selectLink('Notícias')->link();
// Click the "Notícias" link
$crawler = $client->click($link);
// Selector used to reach the news entries
$seletor = '#conteudo > .dpag_noticia > .dpag_noticia_dados > a';
// Collect one array per matched <a> element
$noticias = $crawler->filter($seletor)->each(function ($node) {
    // Date of the news item, found inside the <a> tag
    $data = $node->filter(".dpag_noticia_data")->text();
    // Title of the news item, found inside the <a> tag
    $titulo = $node->filter(".dpag_noticia_titulo")->text();
    // Subtitle of the news item, found inside the <a> tag
    $subtitulo = $node->filter(".dpag_noticia_descricao")->text();
    // Link of the news item
    $link = $node->attr("href");
    // Return an array with the date, title, subtitle and link
    return array('data' => $data, 'titulo' => $titulo, 'subtitulo' => $subtitulo, 'link' => $link);
});
// The included view runs in this scope, so it can read the variables defined
// above (presumably $noticias - verify against the view).
include "views/exemplo1.php";
 /**
  * Connect using curl, crawl the timetable pages and save stops to db.
  *
  * When a cached crawl exists it is passed straight to fill_db(). Otherwise
  * every stop listed on the index page is visited, then every bus linked from
  * that stop, building:
  *   $stops[stop name][bus label] = array with the keys 'route',
  *   'working_days', 'sunday' and 'holiday' (each only when found on the page).
  * Stops or buses whose link cannot be resolved keep their initial value.
  *
  * @return mixed whatever fill_db() returns
  */
 public function gather()
 {
     set_time_limit(600);
     ini_set('memory_limit', '1024M');
     if (Cache::has('sync_stops')) {
         return $this->fill_db(Cache::get('sync_stops'));
     }
     $client = new Client();
     $crawler = $client->request('GET', 'http://rozklady.mpk.krakow.pl/aktualne/przystan.htm');
     $stops = array();
     $crawler->filter('tr ul li')->each(function ($node) use (&$stops) {
         $stops[$node->text()] = $node->text();
     });
     // Day type => (hour column, minute column) of the ".celldepart" table.
     $day_columns = array(
         'working_days' => array(1, 2),
         'sunday' => array(3, 4),
         'holiday' => array(5, 6),
     );
     foreach ($stops as $index => $stop) {
         // Cache after each loop, if
         // site somehow blocks our curl
         // we get at least some of the data.
         Cache::put('sync_stops', $stops, 1440);
         // Handle empty node: without a link there is nothing to crawl for
         // this stop. Previously only one exact exception message was skipped
         // and any other failure fell through with a stale or undefined $link.
         try {
             $link = $crawler->selectLink($stop)->link();
         } catch (Exception $e) {
             continue;
         }
         $stop_crawler = $client->click($link);
         $buses = array();
         $stop_crawler->filter('tr ul li a')->each(function ($node) use (&$buses) {
             $buses[$node->text()] = $node->text();
         });
         // pop last elements as it's always back to all stops
         array_pop($buses);
         $stops[$index] = $buses;
         foreach ($buses as $bus_index => $bus) {
             // Handle empty node, same policy as for stops above.
             try {
                 $link = $stop_crawler->selectLink($bus)->link();
             } catch (Exception $e) {
                 continue;
             }
             // Because URL is protected against crawler
             // we have to replace it manually: every "r" in the last path
             // segment becomes "t".
             $segments = explode('/', $link->getUri());
             $last_segment = str_replace('r', 't', array_pop($segments));
             $segments[] = $last_segment;
             $new_link = implode('/', $segments);
             // crawl with new link
             $bus_crawler = $client->request('GET', $new_link);
             $times = array();
             // gather route
             $bus_crawler->filter('.fontroute')->each(function ($node) use (&$times) {
                 $times['route'] = $node->text();
             });
             foreach ($day_columns as $day => $columns) {
                 $rows = $this->gather_timetable_columns($bus_crawler, $columns[0], $columns[1]);
                 // Cleanup, only when the page had cells for this day type.
                 if ($rows) {
                     $times[$day] = $this->fill_and_validate_timetable($rows);
                 }
             }
             $stops[$index][$bus_index] = $times;
         }
     }
     // The snapshot inside the loop is taken before each stop is processed,
     // so store the finished result too, then save it as the docblock promises.
     Cache::put('sync_stops', $stops, 1440);
     return $this->fill_db($stops);
 }

 /**
  * Read one hour/minute column pair of the ".celldepart" table.
  *
  * Hour cells are stored from index 0. Minute cells are merged from index 1,
  * so minute cell k is combined with the entry at index k + 1: every space in
  * the minute text is replaced by " <entry>:". A missing entry counts as empty.
  *
  * @param mixed $bus_crawler   crawler of the bus timetable page
  * @param int   $hour_column   1-based td position of the hour cells
  * @param int   $minute_column 1-based td position of the minute cells
  * @return array raw rows; empty when neither column produced a cell
  */
 private function gather_timetable_columns($bus_crawler, $hour_column, $minute_column)
 {
     $rows = array();
     $i = 0;
     $bus_crawler->filter('.celldepart tr td:nth-child(' . $hour_column . ')')->each(function ($node) use (&$rows, &$i) {
         $rows[$i] = $node->text();
         $i++;
     });
     $i = 1;
     $bus_crawler->filter('.celldepart tr td:nth-child(' . $minute_column . ')')->each(function ($node) use (&$rows, &$i) {
         $hour = isset($rows[$i]) ? $rows[$i] : null;
         $rows[$i] = str_replace(' ', ' ' . $hour . ':', $node->text());
         $i++;
     });
     return $rows;
 }
Example #12
0
 /**
  * Follow the "next page" pagination link and replace the current crawler
  * with the one for the page it leads to.
  */
 protected function goToNextPage()
 {
     $nextAnchor = $this->crawler->filter('.pagination3 .page_next a');
     $this->crawler = $this->client->click($nextAnchor->link());
 }
 /**
  * Execute the console command.
  *
  * Logs in to the tracker with the RUSTORKA_LOGIN / RUSTORKA_PASSWORD
  * environment values, processes the first result page and then every page
  * linked from "div.bottom_info" except the first link.
  *
  * @return mixed
  */
 public function handle()
 {
     $client = new Client();
     $loginPage = $client->request('GET', 'http://rustorka.com/forum/tracker.php?f[]=-1');
     $loginForm = $loginPage->filter(".borderless.bCenter input")->selectButton('Вход')->form();
     $credentials = array('login_username' => env('RUSTORKA_LOGIN'), 'login_password' => env('RUSTORKA_PASSWORD'));
     $firstPage = $client->submit($loginForm, $credentials);
     $this->processPage($firstPage);

     $pageLinks = $firstPage->filter("div.bottom_info a")->each(function (Crawler $node) {
         return $node->link();
     });
     // The first link is skipped; only the remaining ones are followed.
     foreach (array_slice($pageLinks, 1) as $pageLink) {
         $this->processPage($client->click($pageLink));
     }
 }
Example #14
-8
 /**
  * Scrape one listing page and its reply page and store the result as a lead
  * of the Url identified by $this->urlId.
  *
  * Terminates the script when the page text reports the client as blocked.
  * When the reply page shows a captcha a message is printed and no lead is
  * stored.
  *
  * @param string $link path of the listing, appended to the host of the stored Url
  * @return mixed redirect back to the previous page with a status message
  */
 public function getInfo($link)
 {
     //Get the url name
     // NOTE(review): findOrFail() is expected to throw when the record is
     // missing, so the address is built unconditionally - confirm.
     $url = Url::findOrfail($this->urlId);
     $ul = parse_url($url->name);
     $links = 'http://' . $ul['host'] . $link;
     $crawler = $this->helper_crawler($links);
     $isBlock = $crawler->filter('p')->text();
     // Strict comparison: strpos() returns 0 (not false) when the text starts
     // with "blocked", which the loose "!= false" check missed.
     if (strpos($isBlock, 'blocked') !== false) {
         //next process and change ip
         echo "Ip Address is blocked";
         die;
     } else {
         if ($crawler->filter('title')->count()) {
             $this->title = $crawler->filter('title')->text();
         }
         if ($crawler->filterXPath('//div[@class="mapAndAttrs"]')->count()) {
             $this->mapLocation = $crawler->filterXPath('//div[@class="mapAndAttrs"]')->html();
         }
         if ($crawler->filterXPath('//section[@id="postingbody"]')->count()) {
             $this->body = $crawler->filterXPath('//section[@id="postingbody"]')->html();
         }
         $lnk = $crawler->selectLink('reply')->link();
         //Adding user-agent
         $agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36';
         $client = new Client(['HTTP_USER_AGENT' => $agent]);
         $crawler = $client->click($lnk);
         if ($crawler->filterXpath("//div[@class='captcha']")->count()) {
             //Next process and change ip
             echo "Captcha given wait few hours";
         } else {
             // Each contact field stays empty when its element is absent.
             $name = $email = $mobile = "";
             if ($crawler->filterXPath('//ul[not(@class)]/li[not(div)]')->count()) {
                 $name = $crawler->filterXPath('//ul[not(@class)]/li[not(div)]')->text();
             }
             if ($crawler->filterXPath('//ul/li/a[@class="mailapp"]')->count()) {
                 $email = $crawler->filterXPath('//ul/li/a[@class="mailapp"]')->text();
             }
             if ($crawler->filterXPath('//a[@class="mobile-only replytellink"]')->count()) {
                 $mb = $crawler->filterXPath('//a[@class="mobile-only replytellink"]')->attr('href');
                 $mobile = str_replace("tel:", '', $mb);
             }
             $url->leads()->create(['link' => $link, 'title' => $this->title, 'email' => $email, 'name' => $name, 'phone' => $mobile, 'mapLocation' => $this->mapLocation, 'body' => $this->body]);
         }
     }
     return redirect()->back()->with('message', "Please check scrap data");
 }