/**
 * Scrape a product listing page together with every product page it links to.
 *
 * Each anchor matched by //*[@class="productInfo"]/h3/a on the listing is
 * followed with the same client, and one Product is built per sub-page and
 * added to the returned list.
 *
 * @param string $url Address of the listing page.
 * @return ProductList Products in the order their links were matched.
 */
public function scrape($url)
{
    $productList = new ProductList();
    $client = new Client();
    $crawler = $client->request('GET', $url);

    $crawler->filterXPath('//*[@class="productInfo"]/h3/a')->each(function ($node) use ($productList, $client) {
        /** @var Crawler $node */
        $product = new Product();
        $subPage = $client->click($node->link());

        // first()->each() is used so that a missing element is skipped
        // instead of raising "node list is empty".
        $subPage->filter('.productTitleDescriptionContainer > h1')->first()->each(function ($node) use ($product) {
            /** @var Crawler $node */
            $product->title = trim($node->text());
        });

        // Size of the sub-page markup in kilobytes, formatted with two decimals.
        $product->size = sprintf("%.2f", strlen($subPage->html()) / 1024);

        // Single pass over the price nodes: only the numeric part is stored.
        // The property is left untouched when the text contains no number,
        // rather than reading an undefined $price[0].
        $subPage->filter('.pricePerUnit')->each(function ($node) use ($product) {
            /** @var Crawler $node */
            if (preg_match("/[\\d\\.]+/", $node->text(), $price) === 1) {
                $product->unitPrice = $price[0];
            }
        });

        $subPage->filter('htmlcontent > div')->first()->each(function ($node) use ($product) {
            /** @var Crawler $node */
            $product->description = trim($node->text());
        });

        $productList->addProduct($product);
    });

    return $productList;
}
/**
 * Fetch the configured start URL and download what it lists, optionally
 * following the "next" link until there is none left.
 *
 * A transfer error on the first request prints the message and terminates
 * the process with exit code 1; a non-200 status prints a notice and returns.
 *
 * @param boolean $allPages Whether to keep following next-page links.
 */
public function run($allPages)
{
    $browser = new Client();
    $browser->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 30);
    $browser->setHeader('User-Agent', $this->config['user_agent']);

    try {
        $page = $browser->request('GET', $this->config['url']);
    } catch (TransferException $error) {
        echo $error->getMessage() . PHP_EOL;
        exit(1);
    }

    if ($browser->getResponse()->getStatus() != 200) {
        echo "site not available\n";
        return;
    }

    $this->getUrlsAndDownload($page);

    if (!$allPages) {
        return;
    }

    // Keep clicking "next" for as long as the current page offers one.
    for ($next = $this->getNextLink($page); $next; $next = $this->getNextLink($page)) {
        $page = $browser->click($next);
        $this->getUrlsAndDownload($page);
    }
}
/**
 * Following the "Create an account" link from the home page must land on a
 * page with exactly one matching registration heading.
 */
public function testUserClicksRegLinkAndIsTakenToRegPage()
{
    $browser = new Client();
    $homePage = $browser->request('GET', 'http://localhost:8000');

    $registrationLink = $homePage->selectLink('Create an account')->link();
    $registrationPage = $browser->click($registrationLink);

    $headings = $registrationPage->filter('h1:contains("Create a TODOParrot Account")');
    $this->assertCount(1, $headings);
}
/**
 * The "Contact Us" link on the home page must point at /about/contact and
 * the page behind it must show exactly one "Contact Us" heading.
 */
public function testUserClicksContactLinkAndIsTakenToContactPage()
{
    $browser = new Client();
    $homePage = $browser->request('GET', 'http://homestead.app/');

    $contactLink = $homePage->selectLink('Contact Us')->link();
    $this->assertEquals('http://homestead.app/about/contact', $contactLink->getUri());

    $contactPage = $browser->click($contactLink);
    $headings = $contactPage->filter('h1:contains("Contact Us")');
    $this->assertCount(1, $headings);
}
/**
 * Crawl Google result pages for drupalcode.org repositories that expose a
 * composer.json, then merge the discovered git URLs into ./satis.json.
 *
 * Repositories already listed in satis.json are kept; the combined list is
 * de-duplicated, sorted and written back with type "vcs".
 *
 * @param InputInterface $input The input instance
 * @param OutputInterface $output The output instance
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $lastPage = 25;
    $pageNumber = 1;

    $output->writeln('<info>Beginning Google crawl</info>');

    $search = new QueryString(array('q' => 'site:drupalcode.org "composer.json" "drupal-module"'));

    // Load page 1.
    $client = new Client();
    $crawler = $client->request('GET', 'http://www.google.com/search?' . $search);

    // Repository URLs are stored as keys so duplicates collapse automatically.
    $found = array();

    while (true) {
        $output->writeln('<info>Crawling:</info> ' . $client->getHistory()->current()->getUri());

        // Result links only; the real target is carried in the "q" parameter of each href.
        foreach ($crawler->filter('li h3 a') as $anchor) {
            $hrefQuery = QueryString::fromString(parse_url($anchor->getAttribute('href'), PHP_URL_QUERY));
            $target = $hrefQuery->get('q');

            // Only pages whose path ends in composer.json inside a .git repository are of interest.
            if (!preg_match('/^http:\\/\\/drupalcode.org.+\\.git\\/.+\\/composer.json$/i', $target)) {
                continue;
            }

            // Cut the URL down to the repository and point it at git.drupal.org.
            $gitPart = array();
            preg_match('/^http:\\/\\/drupalcode.org.+\\.git/i', $target, $gitPart);
            $repoUrl = str_replace('http://drupalcode.org/', 'http://git.drupal.org/', $gitPart[0]);

            $found[$repoUrl] = null;
            $output->writeln('<info>Found:</info> ' . $repoUrl);
        }

        // Turn the page: stop when the pager has no link for the next number
        // or when the page limit has been reached.
        $pageNumber++;
        $pagerLink = $crawler->filter('table#nav')->selectLink($pageNumber);
        if (!$pagerLink->count()) {
            break;
        }
        $crawler = $client->click($pagerLink->link());
        if ($pageNumber >= $lastPage) {
            break;
        }
    }

    $satis = new JsonFile(getcwd() . '/satis.json');
    $data = $satis->read();

    // Fold in whatever the file already lists.
    foreach ($data['repositories'] as $existing) {
        $found[$existing['url']] = null;
    }

    $repoUrls = array_keys($found);
    sort($repoUrls);

    $data['repositories'] = array_map(function ($repoUrl) {
        return array('url' => $repoUrl, 'type' => 'vcs');
    }, $repoUrls);

    $satis->write((array) $data);
}
/**
 * Step through the application form on driverpracticaltest.direct.gov.uk and
 * print the slot texts offered on the final page.
 *
 * Reads the "licence" and "center" arguments and the "filter" and "mail"
 * options. When the expected submit button is missing on step 5 the command
 * prints "Captcha!" and stops. If any dates remain and a mail option was
 * given, they are handed to DateMailer.
 *
 * @param InputInterface $input
 * @param OutputInterface $output
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $licenceNumber = $input->getArgument('licence');
    $centreName = $input->getArgument('center');
    $dateFilter = $input->getOption('filter');
    $recipient = $input->getOption('mail');

    $browser = new Client();
    $page = $browser->request('GET', 'https://driverpracticaltest.direct.gov.uk/application');

    $output->writeln('Step 1');
    $testTypeForm = $page->selectButton('testTypeCar')->form();
    $page = $browser->submit($testTypeForm);

    $output->writeln('Step 2');
    $licenceForm = $page->selectButton('drivingLicenceSubmit')->form();
    $licenceForm->setValues([
        'driverLicenceNumber' => $licenceNumber,
        'extendedTest' => 'false',
        'specialNeeds' => 'false',
    ]);
    $page = $browser->submit($licenceForm);

    $output->writeln('Step 3');
    $centreForm = $page->selectButton('testCentreSubmit')->form();
    $centreForm->setValues(['testCentreName' => $centreName]);
    $page = $browser->submit($centreForm);

    $output->writeln('Step 4');
    $firstCentre = $page->filter('.test-centre-results > li > a')->first()->link();
    $page = $browser->click($firstCentre);

    $output->writeln('Step 5');
    $submitButton = $page->selectButton('drivingLicenceSubmit');
    if ($submitButton->count() === 0) {
        $output->writeln('Captcha!');
        //TODO: display captcha image and ask to solve? Use decaptcha?
        return;
    }

    $dateForm = $submitButton->form();
    $today = (new \DateTime())->format('d/m/y');
    $dateForm->setValues(['preferredTestDate' => $today]);
    $page = $browser->submit($dateForm);

    $output->writeln('Step 6');
    $available = $page->filter('.slotDateTime')->each(function ($slot) {
        return $slot->text();
    });

    if ($dateFilter) {
        $filter = new DateFilter();
        $available = $filter->filterDates($available, $dateFilter);
    }

    foreach ($available as $slotText) {
        $output->writeln($slotText);
    }

    if (count($available) && $recipient) {
        $mailer = new DateMailer();
        $mailer->mail($recipient, $available);
    }
}
/**
 * Log in to an existing booking on driverpracticaltest.direct.gov.uk, open
 * the date/time change page and print the slot texts it offers.
 *
 * Reads the "licence" and "reference" arguments and the "filter" and "mail"
 * options. When the expected submit button is missing on step 3 the command
 * prints "Captcha!" and stops. If any dates remain and a mail option was
 * given, they are handed to DateMailer.
 *
 * @param InputInterface $input
 * @param OutputInterface $output
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $licenceNumber = $input->getArgument('licence');
    $bookingReference = $input->getArgument('reference');
    $dateFilter = $input->getOption('filter');
    $recipient = $input->getOption('mail');

    $browser = new Client();
    $page = $browser->request('GET', 'https://driverpracticaltest.direct.gov.uk/login');

    $output->writeln('Step 1');
    $loginForm = $page->selectButton('booking-login')->form();
    $loginForm->setValues(['username' => $licenceNumber, 'password' => $bookingReference]);
    $page = $browser->submit($loginForm);

    $output->writeln('Step 2');
    $changeLink = $page->filter('#date-time-change')->first()->link();
    $page = $browser->click($changeLink);

    $output->writeln('Step 3');
    $submitButton = $page->selectButton('drivingLicenceSubmit');
    if ($submitButton->count() === 0) {
        $output->writeln('Captcha!');
        //TODO: display captcha image and ask to solve? Use decaptcha?
        return;
    }
    $page = $browser->submit($submitButton->form());

    $output->writeln('Step 4');
    $available = $page->filter('.slotDateTime')->each(function ($slot) {
        return $slot->text();
    });

    if ($dateFilter) {
        $filter = new DateFilter();
        $available = $filter->filterDates($available, $dateFilter);
    }

    foreach ($available as $slotText) {
        $output->writeln($slotText);
    }

    if (count($available) && $recipient) {
        $mailer = new DateMailer();
        $mailer->mail($recipient, $available);
    }
}
/**
 * Open the first stored Link through a SOCKS5 proxy on 127.0.0.1:9050,
 * follow its "reply" link and persist title, phone and e-mail as a Scrap row.
 *
 * Whenever the listing reports "blocked" or the reply page shows a captcha,
 * torNew() is called (presumably to obtain a fresh Tor identity - confirm
 * against its definition) and the listing is requested again.
 *
 * NOTE(review): there is no retry limit, so a permanently blocked listing
 * keeps this method looping.
 *
 * @return void
 */
public function getData()
{
    $link = Link::first();

    $client = new Client();
    $client->setHeader('User-Agent', "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36");

    // Send all traffic through the local SOCKS5 proxy.
    $guzzleClient = new \GuzzleHttp\Client(['curl' => [CURLOPT_PROXY => '127.0.0.1:9050', CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5]]);
    $client->setClient($guzzleClient);

    $crawler = $client->request('GET', $link->url);

    while (true) {
        // strpos() returns 0 when the text starts with the needle, so the
        // result has to be compared strictly against false.
        if (strpos($crawler->filter('p')->text(), 'blocked') !== false) {
            $this->torNew();
            $crawler = $client->request('GET', $link->url);
            continue;
        }

        $replyPage = $client->click($crawler->selectLink('reply')->link());

        if ($replyPage->filterXpath("//div[@class='captcha']")->count()) {
            // Start over from the listing: the captcha page has no "reply"
            // link, so retrying on it could never succeed.
            $this->torNew();
            $crawler = $client->request('GET', $link->url);
            continue;
        }

        // Diagnostic dump of the reply page, kept from the original.
        var_dump($replyPage->html());

        $title = $replyPage->filter('title')->text();
        $mobile = $replyPage->filter('.mobile-only')->first()->text();
        $email = $replyPage->filter('.mailapp')->first()->text();

        echo $link->url . ' ' . $title . ' ' . $mobile . ' ' . $email;

        Scrap::create(['url' => $link->url, 'title' => $title, 'email' => $email, 'phone' => $mobile]);

        return;
    }
}
/**
 * Follow the given link and read the href of the first ".magnet" element
 * on the page it leads to.
 *
 * @param Link $link Link to click with the shared client.
 * @return string Value of the href attribute.
 */
protected function findMagnet(Link $link)
{
    $targetPage = $this->client->click($link);
    $magnetAnchor = $targetPage->filter('.magnet')->first();

    return $magnetAnchor->attr('href');
}
<?php

require_once "vendor/autoload.php";

use Goutte\Client;

// Top-level variable names are left as they are because the view included at
// the end shares this scope.

// Selector that leads to each news entry (an <a> wrapping the details).
$seletor = '#conteudo > .dpag_noticia > .dpag_noticia_dados > a';

$url = 'http://www.itajai.sc.gov.br';
$client = new Client();

// Open the Itajaí city hall website.
$crawler = $client->request('GET', $url);

// Locate the "Notícias" (news) link and follow it.
$link = $crawler->selectLink('Notícias')->link();
$crawler = $client->click($link);

// Build one array per news entry: date, title, subtitle and target link.
$noticias = $crawler->filter($seletor)->each(function ($anchor) {
    return array(
        // Date, title and subtitle live in child elements of the <a>.
        'data' => $anchor->filter(".dpag_noticia_data")->text(),
        'titulo' => $anchor->filter(".dpag_noticia_titulo")->text(),
        'subtitulo' => $anchor->filter(".dpag_noticia_descricao")->text(),
        // The news URL is the href of the <a> itself.
        'link' => $anchor->attr("href"),
    );
});

include "views/exemplo1.php";
/**
 * Crawl the timetable site and build a stop => bus => departures structure.
 *
 * When a 'sync_stops' cache entry exists it is passed straight to fill_db()
 * and nothing is crawled. Otherwise every stop listed on the index page is
 * visited, then every bus listed for that stop, and the timetable cells of
 * each bus page are collected per day type.
 *
 * NOTE(review): the gathered data is only written to the cache at the start
 * of each stop iteration and is neither returned nor passed to fill_db()
 * after the crawl - presumably a second call is expected to pick it up from
 * the cache; confirm.
 *
 * @return mixed Result of fill_db() on a cache hit, otherwise nothing.
 */
public function gather()
{
    set_time_limit(600);
    ini_set('memory_limit', '1024M');

    if (Cache::has('sync_stops')) {
        return $this->fill_db(Cache::get('sync_stops'));
    }

    $client = new Client();
    $crawler = $client->request('GET', 'http://rozklady.mpk.krakow.pl/aktualne/przystan.htm');

    $stops = array();
    $crawler->filter('tr ul li')->each(function ($node) use (&$stops) {
        $stops[$node->text()] = $node->text();
    });

    foreach ($stops as $index => $stop) {
        // Cache at the start of every iteration so that at least the data
        // gathered so far survives if the site starts blocking us.
        Cache::put('sync_stops', $stops, 1440);

        // A stop without a matching link is skipped. Any other failure is
        // re-thrown instead of silently carrying on with a stale $link.
        try {
            $link = $crawler->selectLink($stop)->link();
        } catch (Exception $e) {
            if ($e->getMessage() == 'The current node list is empty.') {
                continue;
            }
            throw $e;
        }

        $stop_crawler = $client->click($link);

        $buses = array();
        $stop_crawler->filter('tr ul li a')->each(function ($node) use (&$buses) {
            $buses[$node->text()] = $node->text();
        });

        // The last entry is always the link back to the list of all stops.
        array_pop($buses);
        $stops[$index] = $buses;

        foreach ($buses as $bus_index => $bus) {
            try {
                $link = $stop_crawler->selectLink($bus)->link();
            } catch (Exception $e) {
                if ($e->getMessage() == 'The current node list is empty.') {
                    continue;
                }
                throw $e;
            }

            // The linked URL is protected against crawlers, so the last path
            // segment is rewritten by hand ('r' becomes 't').
            $segments = explode('/', $link->getUri());
            $last_segment = array_pop($segments);
            $segments[] = str_replace('r', 't', $last_segment);
            $new_link = implode('/', $segments);

            $bus_crawler = $client->request('GET', $new_link);

            $times = array();

            $bus_crawler->filter('.fontroute')->each(function ($node) use (&$times) {
                $times['route'] = $node->text();
            });

            // Hour cells sit in columns 1, 3 and 5; the matching minute cells
            // are in the column directly to the right of each.
            $hour_columns = array('working_days' => 1, 'sunday' => 3, 'holiday' => 5);
            foreach ($hour_columns as $day_type => $hour_column) {
                $departures = $this->gather_departures($bus_crawler, $hour_column);
                if (!empty($departures)) {
                    $times[$day_type] = $this->fill_and_validate_timetable($departures);
                }
            }

            $stops[$index][$bus_index] = $times;
        }
    }
}

/**
 * Collect one day type from a bus timetable page.
 *
 * Hours are read from the given column starting at index 0. Minutes are read
 * from the next column starting at index 1, and every space in the minute
 * text is replaced by " <hour>:" using the hour stored at the same index.
 * A missing hour is treated as null, which avoids an undefined-index notice.
 *
 * @param Crawler $bus_crawler Crawler of the bus timetable page.
 * @param int $hour_column 1-based column number holding the hours.
 * @return array Raw entries keyed by index; empty when no cell matched.
 */
private function gather_departures($bus_crawler, $hour_column)
{
    $departures = array();

    $i = 0;
    $bus_crawler->filter('.celldepart tr td:nth-child(' . $hour_column . ')')->each(function ($node) use (&$departures, &$i) {
        $departures[$i] = $node->text();
        $i++;
    });

    $i = 1;
    $bus_crawler->filter('.celldepart tr td:nth-child(' . ($hour_column + 1) . ')')->each(function ($node) use (&$departures, &$i) {
        if (!isset($departures[$i])) {
            $departures[$i] = NULL;
        }
        $departures[$i] = str_replace(' ', ' ' . $departures[$i] . ':', $node->text());
        $i++;
    });

    return $departures;
}
/**
 * Replace the current crawler with the page behind the pagination's
 * "next" anchor ('.pagination3 .page_next a').
 */
protected function goToNextPage()
{
    $nextAnchor = $this->crawler->filter('.pagination3 .page_next a');
    $nextPage = $this->client->click($nextAnchor->link());

    $this->crawler = $nextPage;
}
/**
 * Execute the console command.
 *
 * Submits the login form found on the tracker page with the credentials
 * from RUSTORKA_LOGIN / RUSTORKA_PASSWORD, processes the resulting page and
 * then every page linked from "div.bottom_info a" except the first link.
 *
 * @return mixed
 */
public function handle()
{
    $client = new Client();
    $trackerPage = $client->request('GET', 'http://rustorka.com/forum/tracker.php?f[]=-1');

    $loginForm = $trackerPage->filter(".borderless.bCenter input")->selectButton('Вход')->form();
    $credentials = array(
        'login_username' => env('RUSTORKA_LOGIN'),
        'login_password' => env('RUSTORKA_PASSWORD'),
    );
    $resultPage = $client->submit($loginForm, $credentials);
    $this->processPage($resultPage);

    $pageLinks = $resultPage->filter("div.bottom_info a")->each(function (Crawler $anchor) {
        return $anchor->link();
    });

    // The first link is skipped; only the remaining ones are followed.
    foreach (array_slice($pageLinks, 1) as $pageLink) {
        $this->processPage($client->click($pageLink));
    }
}
/**
 * Scrape a single posting and store its contact details as a lead.
 *
 * The posting address is built from the host of the stored Url record
 * ($this->urlId) plus the given path. Title, map block and body are read
 * from the posting page; name, e-mail and phone are read from the page
 * behind its "reply" link. If the posting page reports "blocked" the script
 * terminates; if the reply page shows a captcha nothing is stored.
 *
 * @param string $link Path of the posting, appended to the stored host.
 * @return mixed Redirect back with a status message.
 */
public function getInfo($link)
{
    // findOrfail() is expected to abort when the record is missing
    // (Eloquent behaviour - confirm Url is an Eloquent model), so no
    // null guard is needed before using $url.
    $url = Url::findOrfail($this->urlId);
    $ul = parse_url($url->name);
    $links = 'http://' . $ul['host'] . $link;

    $crawler = $this->helper_crawler($links);

    // strpos() returns 0 when the text starts with the needle, so the result
    // has to be compared strictly against false.
    if (strpos($crawler->filter('p')->text(), 'blocked') !== false) {
        //next process and change ip
        echo "Ip Address is blocked";
        die;
    }

    $titleNode = $crawler->filter('title');
    if ($titleNode->count()) {
        $this->title = $titleNode->text();
    }

    $mapNode = $crawler->filterXPath('//div[@class="mapAndAttrs"]');
    if ($mapNode->count()) {
        $this->mapLocation = $mapNode->html();
    }

    $bodyNode = $crawler->filterXPath('//section[@id="postingbody"]');
    if ($bodyNode->count()) {
        $this->body = $bodyNode->html();
    }

    $lnk = $crawler->selectLink('reply')->link();

    // The reply page is requested with a fresh client and a browser user agent.
    $agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36';
    $client = new Client(['HTTP_USER_AGENT' => $agent]);
    $crawler = $client->click($lnk);

    if ($crawler->filterXpath("//div[@class='captcha']")->count()) {
        //Next process and change ip
        echo "Captcha given wait few hours";
    } else {
        $name = $email = $mobile = "";

        $nameNode = $crawler->filterXPath('//ul[not(@class)]/li[not(div)]');
        if ($nameNode->count()) {
            $name = $nameNode->text();
        }

        $emailNode = $crawler->filterXPath('//ul/li/a[@class="mailapp"]');
        if ($emailNode->count()) {
            $email = $emailNode->text();
        }

        $phoneNode = $crawler->filterXPath('//a[@class="mobile-only replytellink"]');
        if ($phoneNode->count()) {
            // The href has the form "tel:<number>"; keep the number only.
            $mobile = str_replace("tel:", '', $phoneNode->attr('href'));
        }

        $url->leads()->create([
            'link' => $link,
            'title' => $this->title,
            'email' => $email,
            'name' => $name,
            'phone' => $mobile,
            'mapLocation' => $this->mapLocation,
            'body' => $this->body,
        ]);
    }

    // The message keeps the line break present in the original literal.
    return redirect()->back()->with('message', "Please check \nscrap data");
}