/**
 * Creates the scraping client, stores it on $this->client and configures it.
 *
 * Two request headers are set (the configured user agent and a British-English
 * Accept-Language), then the Guzzle transport is replaced by one that does not
 * follow redirects, keeps cookies, skips TLS certificate verification and
 * routes plain-HTTP traffic through the configured proxy.
 */
private function setUpClient()
{
    $browser = new Client();
    $this->client = $browser;

    $browser->setHeader('User-Agent', $this->user_agent);
    $browser->setHeader('Accept-Language', 'en-gb');

    $transportOptions = [
        'allow_redirects' => false,
        'cookies' => true,
        'verify' => false,
        'proxy' => ['http' => $this->proxy],
    ];
    $browser->setClient(new \GuzzleHttp\Client($transportOptions));
}
/**
 * Requests the configured URL and hands each fetched page to getUrlsAndDownload().
 *
 * A transport failure prints the exception message and terminates the process
 * with exit code 1; any status other than 200 prints "site not available".
 *
 * @param boolean $allPages when truthy, keep clicking the link returned by
 *                          getNextLink() until it returns a falsy value
 */
public function run($allPages)
{
    $browser = new Client();
    $browser->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 30);
    $browser->setHeader('User-Agent', $this->config['user_agent']);

    try {
        $page = $browser->request('GET', $this->config['url']);
    } catch (TransferException $e) {
        echo $e->getMessage() . PHP_EOL;
        exit(1);
    }

    if ($browser->getResponse()->getStatus() != 200) {
        echo "site not available\n";

        return;
    }

    $this->getUrlsAndDownload($page);
    if (!$allPages) {
        return;
    }

    // Walk the pagination: click "next", process the page, look for the next "next".
    for ($next = $this->getNextLink($page); $next; $next = $this->getNextLink($page)) {
        $page = $browser->click($next);
        $this->getUrlsAndDownload($page);
    }
}
/**
 * Posts a client-credentials grant to the authentication URI and returns the decoded JSON reply.
 *
 * @param string $basic value appended to "Basic " in the Authorization header
 * @return mixed the response body decoded as an associative array
 * @throws \Exception when the response body cannot be decoded as JSON
 */
public function postOauth2Token($basic)
{
    $this->client->setHeader('Authorization', 'Basic ' . $basic);
    $this->client->setHeader('Content-Type', 'application/x-www-form-urlencoded;charset=UTF-8');

    $tokenUrl = $this->getApiBaseUrl() . '/' . $this->authenticationUri;
    $this->client->request('POST', $tokenUrl, [], [], [], 'grant_type=client_credentials');

    /** @var Response $reply */
    $reply = $this->client->getResponse();
    $payload = json_decode($reply->getContent(), true);

    $jsonError = json_last_error();
    if ($jsonError === JSON_ERROR_NONE) {
        return $payload;
    }

    throw new \Exception('An error occurred when decoding the response (Error code: ' . $jsonError . ')');
}
/**
 * A User-Agent set through Client::setHeader() must appear on the last request
 * recorded in the history.
 */
public function testCustomUserAgent()
{
    $guzzle = $this->getGuzzle();
    $client = new Client();
    $client->setClient($guzzle);
    $client->setHeader('User-Agent', 'foo');

    // Only the recorded request is inspected, so the returned crawler is not kept.
    $client->request('GET', 'http://www.example.com/');

    $this->assertEquals('foo', $this->history->getLastRequest()->getHeader('User-Agent'));
}
/**
 * Validates the parameters, scrapes the travel-time table from the entry point
 * and triggers the post-processing steps.
 *
 * Table cells are visited in document order. Every cell whose index is a
 * multiple of 7 starts a new row; the remaining cells are stored under the
 * exterior direction while the index is at most 49 and under the interior
 * direction afterwards.
 *
 * @param array $parameters
 */
public function setParameters($parameters = [])
{
    $this->validParameters($parameters);

    // The header must be named "User-Agent": Goutte sends header names verbatim,
    // so the CGI-style key "HTTP_USER_AGENT" would only add an unrelated header
    // and leave the default user agent in place.
    $this->client->setHeader(
        'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0'
    );

    $crawler = $this->client->request('GET', $this->entryPoint);
    $crawler->filter('.tps_parcours.BP .secteurTable tbody tr td')->each(function (Crawler $node, $i) {
        if (!($i % 7)) {
            // First cell of a table row: only advance the row counter.
            $this->row++;

            return;
        }

        $way = $i <= 49 ? self::DIRECTION_EXTERIOR : self::DIRECTION_INTERIOR;

        // "é" becomes "e"; the other search strings map to '' (str_replace pads a
        // shorter replacement array with empty strings).
        $text = str_replace(["é", "\n", "\r", "\t", " "], ['e', ''], $node->text());
        $this->dataFetched[$this->row][$way][] = $text;
    });

    $this->sanitizeContent();
    $this->calculateRoute();
}
/**
 * The custom User-Agent is expected to be appended to the BrowserKit default
 * on the last request recorded in the history.
 */
public function testCustomUserAgent()
{
    $client = new Client();
    $client->setClient($this->getGuzzle());
    $client->setHeader('User-Agent', 'foo');
    $client->request('GET', 'http://www.example.com/');

    $lastTransaction = end($this->history);
    $sentUserAgent = $lastTransaction['request']->getHeaderLine('User-Agent');

    $this->assertEquals('Symfony2 BrowserKit, foo', $sentUserAgent);
}
/**
 * Returns the contents of a URL, served from the shared cache when available.
 *
 * @param string $url     address to fetch
 * @param array  $headers header name => value pairs; applied only when $as_html is true
 * @param bool   $as_html true to fetch through the HTTP client and return the parsed
 *                        document's HTML; false to read the URL with file_get_contents()
 * @return string|false the contents, or false when file_get_contents() fails
 */
protected function getContents($url, $headers = [], $as_html = false)
{
    $key = $this->buildCacheKey($url);
    if (self::$cache->has($key)) {
        return self::$cache->get($key);
    }

    if ($as_html) {
        foreach ($headers as $name => $value) {
            $this->client->setHeader($name, $value);
        }

        try {
            $contents = $this->client->request('GET', $url)->html();
        } finally {
            // The client is shared: always undo the per-call headers, even when
            // the request or the HTML extraction throws.
            foreach ($headers as $name => $value) {
                $this->client->removeHeader($name);
            }
        }
    } else {
        $contents = file_get_contents($url);
    }

    // A failed read is returned to the caller but not cached, so a later call
    // can retry instead of being served the failure from the cache.
    if ($contents !== false) {
        self::$cache->set($key, $contents);
    }

    return $contents;
}
/**
 * Performs the lookup by posting the form to the Sintegra service and parsing the reply.
 *
 * @param string $cnpj         CNPJ
 * @param string $ie           IE - not tested
 * @param string $paramBot     ParamBot, parameter sent for the captcha validation
 * @param string $captcha      CAPTCHA
 * @param string $stringCookie COOKIE
 * @throws Exception
 * @return array Company data
 */
public static function consulta($cnpj, $ie, $paramBot, $captcha, $stringCookie)
{
    $cookieParts = explode(';', $stringCookie);
    if (!Utils::isCnpj($cnpj)) {
        throw new Exception('O CNPJ informado não é válido.');
    }

    $client = new Client();

    $curlDefaults = array(
        'config/curl/' . CURLOPT_TIMEOUT => 0,
        'config/curl/' . CURLOPT_TIMEOUT_MS => 0,
        'config/curl/' . CURLOPT_CONNECTTIMEOUT => 0,
        'config/curl/' . CURLOPT_RETURNTRANSFER => true,
    );
    foreach ($curlDefaults as $optionPath => $optionValue) {
        $client->getClient()->setDefaultOption($optionPath, $optionValue);
    }

    $requestHeaders = array(
        'Host' => 'pfeserv1.fazenda.sp.gov.br',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9, */* ;q=0.8',
        'Accept-Language' => 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding' => 'gzip, deflate',
        'Referer' => 'http://www.sintegra.gov.br/new_bv.html',
        'Cookie' => $cookieParts[0],
        'Connection' => 'keep-alive',
    );
    foreach ($requestHeaders as $headerName => $headerValue) {
        $client->setHeader($headerName, $headerValue);
    }

    // The form is submitted "by CNPJ" whenever a CNPJ string was given, otherwise "by IE".
    if (strlen($cnpj) > 0) {
        $servico = 'cnpj';
        $consultaPor = 'Consulta por CNPJ';
    } else {
        $servico = 'ie';
        $consultaPor = 'Consulta por IE';
    }

    $formFields = array(
        'hidFlag' => '0',
        'cnpj' => Utils::unmask($cnpj),
        'ie' => Utils::unmask($ie),
        'paramBot' => $paramBot,
        'Key' => $captcha,
        'servico' => $servico,
        'botao' => $consultaPor,
    );
    $crawler = $client->request('POST', 'http://pfeserv1.fazenda.sp.gov.br/sintegrapfe/sintegra', $formFields);

    // The literal line break inside this message is part of the text compared below.
    $imageError = 'O valor da imagem esta incorreto ou expirou. 
Verifique novamente a imagem e digite exatamente os 5 caracteres exibidos.';

    $centers = $crawler->filter('body > center');
    $secondCenter = $centers->eq(1);
    if ($secondCenter->count() && $imageError == trim($secondCenter->text())) {
        throw new Exception($imageError, 99);
    }

    if (count($centers) == 0) {
        throw new Exception('Serviço indisponível!. Tente novamente.', 99);
    }

    $html = self::parseContent((string) $client->getResponse());

    return self::parseSelectors(new \Symfony\Component\DomCrawler\Crawler($html));
}
/**
 * Performs the CPF lookup by posting the public-consultation form and reading
 * the result spans.
 *
 * Any failure (invalid CPF, transport error, unexpected page layout) is
 * reported with the same generic message; the original exception is attached
 * as the previous exception so the root cause is not lost.
 *
 * @param string $cpf          CPF
 * @param string $captcha      CAPTCHA
 * @param string $stringCookie COOKIE
 * @throws Exception
 * @return array Data of the person
 */
public static function consulta($cpf, $captcha, $stringCookie)
{
    try {
        $arrayCookie = explode(';', $stringCookie);
        if (!Utils::isCpf($cpf)) {
            throw new Exception();
        }

        $client = new Client();
        $client->setHeader('Host', 'www.receita.fazenda.gov.br');
        $client->setHeader('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0');
        $client->setHeader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
        $client->setHeader('Accept-Language', 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3');
        $client->setHeader('Accept-Encoding', 'gzip, deflate');
        $client->setHeader('Referer', 'http://www.receita.fazenda.gov.br/aplicacoes/atcta/cpf/ConsultaPublica.asp');
        // Only the first ';'-separated segment of the cookie string is sent.
        $client->setHeader('Cookie', $arrayCookie[0]);
        $client->setHeader('Connection', 'keep-alive');

        $param = array(
            'txtCPF' => Utils::unmask($cpf),
            'txtTexto_captcha_serpro_gov_br' => $captcha,
            'Enviar' => 'Consultar',
        );
        $crawler = $client->request(
            'POST',
            'http://www.receita.fazenda.gov.br/aplicacoes/atcta/cpf/ConsultaPublicaExibir.asp',
            $param
        );

        $clConteudoDados = $crawler->filter('span.clConteudoDados');

        return array(
            'cpf' => Utils::unmask($cpf),
            'nome' => trim(str_replace('Nome da Pessoa Física: ', '', $clConteudoDados->eq(1)->html())),
            'situacao_cadastral' => str_replace('Situação Cadastral: ', '', $clConteudoDados->eq(2)->html()),
            'digito_verificador' => str_replace('Digito Verificador: ', '', $clConteudoDados->eq(3)->html()),
        );
    } catch (Exception $e) {
        // Same message and code (0) as before; $e is chained for diagnostics.
        throw new Exception('Aconteceu um erro ao fazer a consulta. Envie os dados novamente.', 0, $e);
    }
}
/**
 * Posts a prepared AMF payload to the /amf endpoint with a JSON content type.
 *
 * @param string $service
 * @param string $method
 * @param array  $parameters
 * @return \Symfony\Component\DomCrawler\Crawler
 */
protected function requestJsonAmf($service, $method, array $parameters = [])
{
    $body = $this->prepareAmfPayload($service, $method, $parameters);

    $http = $this->client;
    $http->setHeader('Content-Type', 'application/json');
    $crawler = $http->request('POST', '/amf', [], [], [], $body);

    return $crawler;
}
/**
 * Builds a new client whose User-Agent header is a fixed Firefox 12 / FirePHP string.
 *
 * @return Client
 */
protected function getHttpClient()
{
    $userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:12.0) Gecko/20100101 Firefox/12.0 FirePHP/0.7.1';

    $browser = new Client();
    $browser->setHeader('User-Agent', $userAgent);

    return $browser;
}
/**
 * Tries username/password combinations against a form protected by a CSRF token.
 *
 * For every combination a fresh client first GETs the target URL, reads the
 * token from the input named by $uri_target->getCsrf(), then submits the
 * parameters plus that token with the target's HTTP method. A response whose
 * text matches $uri_target->getMatchSuccess() is reported as a success.
 *
 * @param FuzzingUri $uri_target
 */
public function attackFuzzedWithCsrf(FuzzingUri $uri_target)
{
    $fuzz_target = $uri_target->getFuzzTarget();
    $url = $this->_guzzle->getBaseUrl() . $uri_target->getUri();
    $http_params = $fuzz_target->getParameters();
    // NOTE(review): presumably returns references into $http_params, so the
    // assignments below also update the submitted parameters — confirm against
    // getUserAndPassRefs().
    $userAndPassRefs = $this->getUserAndPassRefs($http_params);
    $usernames = $this->_easycredentials->getUsernames($this->NB_CREDENTIALS);
    $passwords = $this->_easycredentials->getPasswords($this->NB_CREDENTIALS);
    foreach ($usernames as $one_username) {
        $userAndPassRefs['username'] = $one_username;
        foreach ($passwords as $one_password) {
            /**
             * todo add to DB with array of values and entity->persist() ?
             * todo clariss: verify the connection + check the file
             * todo the choice between the guzzle and goutte APIs is unclear
             */
            $userAndPassRefs['password'] = $one_password;
            // A new client is created for every attempt.
            $client = new Client();
            // Browser-like user agent; redirects are followed.
            $client->setHeader('User-Agent', "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36");
            $client->followRedirects(true);
            // First request: load the page and take the value of the first matching CSRF input.
            $crawler_csrf = $client->request('get', $url);
            $csrf_token = $crawler_csrf->filter('input[name="' . $uri_target->getCsrf() . '"]')->first()->extract('value')[0];
            // The token is merged under the CSRF field name, overwriting the one from the previous attempt.
            $http_params = array_merge($http_params, array($uri_target->getCsrf() => $csrf_token));
            $client->followRedirects(true);
            // Second request: submit the credentials together with the token.
            $crawler_login = $client->request($uri_target->getHttpMethod(), $url, $http_params);
            if (preg_match($uri_target->getMatchSuccess(), $crawler_login->text())) {
                $this->report(true, "Bruteforced " . $url . " with user " . $one_username . " and password " . $one_password);
            }
        }
    }
}
/**
 * restart() must clear both the custom headers and the stored credentials.
 */
public function testRestart()
{
    $client = new Client();
    $client->setHeader('X-Test', 'test');
    $client->setAuth('foo', 'bar');

    // Both properties are not publicly readable, hence the reflection access.
    $headersProperty = new \ReflectionProperty('Goutte\\Client', 'headers');
    $headersProperty->setAccessible(true);
    $authProperty = new \ReflectionProperty('Goutte\\Client', 'auth');
    $authProperty->setAccessible(true);

    $this->assertEquals(['X-Test' => 'test'], $headersProperty->getValue($client));
    $this->assertEquals(['foo', 'bar', 'basic'], $authProperty->getValue($client));

    $client->restart();

    $this->assertEquals([], $headersProperty->getValue($client));
    $this->assertNull($authProperty->getValue($client));
}
/**
 * Performs the CNPJ lookup by posting the validation form and mapping the
 * labelled table cells of the reply to result keys.
 *
 * A label with exactly one bold value yields a string; a label with several
 * bold values yields a list of strings. Unknown labels are ignored.
 *
 * @param string $cnpj         CNPJ
 * @param string $captcha      CAPTCHA
 * @param string $stringCookie COOKIE
 * @throws Exception
 * @return array Company data
 */
public static function consulta($cnpj, $captcha, $stringCookie)
{
    $result = array();
    $cookieParts = explode(';', $stringCookie);
    if (!Utils::isCnpj($cnpj)) {
        throw new Exception('O CNPJ informado não é válido');
    }

    $client = new Client();
    $requestHeaders = array(
        'Host' => 'www.receita.fazenda.gov.br',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9, */* ;q=0.8',
        'Accept-Language' => 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding' => 'gzip, deflate',
        'Referer' => 'http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp',
        'Cookie' => $cookieParts[0],
        'Connection' => 'keep-alive',
    );
    foreach ($requestHeaders as $headerName => $headerValue) {
        $client->setHeader($headerName, $headerValue);
    }

    $formFields = array(
        'origem' => 'comprovante',
        'cnpj' => Utils::unmask($cnpj),
        'txtTexto_captcha_serpro_gov_br' => $captcha,
        'submit1' => 'Consultar',
        'search_type' => 'cnpj',
    );
    $crawler = $client->request(
        'POST',
        'http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp',
        $formFields
    );

    // The literal line break inside the message is part of the emitted text.
    if ($crawler->filter('body > table:nth-child(3) > tr:nth-child(2) > td > b > font')->count() > 0) {
        throw new Exception('Erro ao consultar. 
O CNPJ informado não existe no cadastro.', 99);
    }

    // Label printed on the page => key used in the returned array.
    $fieldNames = array(
        'NOME EMPRESARIAL' => 'razao_social',
        'TÍTULO DO ESTABELECIMENTO (NOME DE FANTASIA)' => 'nome_fantasia',
        'CÓDIGO E DESCRIÇÃO DA ATIVIDADE ECONÔMICA PRINCIPAL' => 'cnae_principal',
        'CÓDIGO E DESCRIÇÃO DAS ATIVIDADES ECONÔMICAS SECUNDÁRIAS' => 'cnaes_secundario',
        'CÓDIGO E DESCRIÇÃO DA NATUREZA JURÍDICA' => 'natureza_juridica',
        'LOGRADOURO' => 'logradouro',
        'NÚMERO' => 'numero',
        'COMPLEMENTO' => 'complemento',
        'CEP' => 'cep',
        'BAIRRO/DISTRITO' => 'bairro',
        'MUNICÍPIO' => 'cidade',
        'UF' => 'uf',
        'SITUAÇÃO CADASTRAL' => 'situacao_cadastral',
        'DATA DA SITUAÇÃO CADASTRAL' => 'situacao_cadastral_data',
        'MOTIVO DE SITUAÇÃO CADASTRAL' => 'motivo_situacao_cadastral',
        'SITUAÇÃO ESPECIAL' => 'situacao_especial',
        'DATA DA SITUAÇÃO ESPECIAL' => 'situacao_especial_data',
        'TELEFONE' => 'telefone',
        'ENDEREÇO ELETRÔNICO' => 'email',
        'ENTE FEDERATIVO RESPONSÁVEL (EFR)' => 'ente_federativo_responsavel',
    );

    $cells = $crawler->filter('body > table:nth-child(3) > tr > td');
    foreach ($cells->filter('td') as $cellNode) {
        $cell = new Crawler($cellNode);

        $labelFont = $cell->filter('font:nth-child(1)');
        if ($labelFont->count() == 0) {
            continue;
        }

        $label = trim(preg_replace('/\\s+/', ' ', $labelFont->html()));
        if (!isset($fieldNames[$label])) {
            continue;
        }
        $field = $fieldNames[$label];

        $valueNodes = $cell->filter('font > b');
        $isSingleValue = $valueNodes->count() == 1;
        foreach ($valueNodes as $valueNode) {
            $valueCrawler = new Crawler($valueNode);
            $collapsed = trim(preg_replace('/\\s+/', ' ', $valueCrawler->html()));
            $decoded = htmlspecialchars_decode($collapsed);

            if ($isSingleValue) {
                $result[$field] = $decoded;
            } else {
                $result[$field][] = $decoded;
            }
        }
    }

    return $result;
}
/**
 * Scrapes the contact details (title, mobile, e-mail) of the first stored link
 * through a Tor SOCKS5 proxy, prints them and stores them as a Scrap record.
 *
 * Whenever the listing page reports a block, or the reply page shows a
 * captcha, torNew() is called and the listing page is requested again before
 * the next attempt.
 *
 * @return void
 */
public function getData()
{
    $link = Link::first();
    if ($link === null) {
        // Nothing to scrape (assumes Link::first() yields null when no link is stored).
        return;
    }

    $client = new Client();
    $client->setHeader('User-Agent', "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36");

    // Route all traffic through the local Tor SOCKS5 proxy.
    $guzzleClient = new \GuzzleHttp\Client([
        'curl' => [
            CURLOPT_PROXY => '127.0.0.1:9050',
            CURLOPT_PROXYTYPE => CURLPROXY_SOCKS5,
        ],
    ]);
    $client->setClient($guzzleClient);

    $crawler = $client->request('GET', $link->url);
    $isBlock = $crawler->filter('p')->text();

    while (true) {
        // Strict comparison: strpos() returns 0 (not false) when the text starts with "blocked".
        if (strpos($isBlock, 'blocked') !== false) {
            $this->torNew();
            $crawler = $client->request('GET', $link->url);
            $isBlock = $crawler->filter('p')->text();
            continue;
        }

        $lnk = $crawler->selectLink('reply')->link();
        $crawler = $client->click($lnk);

        if ($crawler->filterXpath("//div[@class='captcha']")->count()) {
            $this->torNew();
            // Reload the listing page: the captcha page has no "reply" link to
            // click on the next pass.
            $crawler = $client->request('GET', $link->url);
            $isBlock = $crawler->filter('p')->text();
            continue;
        }

        var_dump($crawler->html());
        $title = $crawler->filter('title')->text();
        $mobile = $crawler->filter('.mobile-only')->first()->text();
        $email = $crawler->filter('.mailapp')->first()->text();
        echo $link->url . ' ' . $title . ' ' . $mobile . ' ' . $email;
        Scrap::create(['url' => $link->url, 'title' => $title, 'email' => $email, 'phone' => $mobile]);
        break;
    }
}
<?php

require __DIR__ . '/vendor/autoload.php';

use Goutte\Client;
use WebmasterHacks\Pornhub\Video;
use WebmasterHacks\Pornhub\Pornstar;
use WebmasterHacks\Pornhub\Category;
use WebmasterHacks\Pornhub\Tag;

// Example script: prints the URL of every pornstar, category and tag attached to one video page.

$userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36';

$client = new Client();
$client->setHeader('User-Agent', $userAgent);

$video = new Video($client, 'http://www.pornhub.com/view_video.php?viewkey=ph55f4113f77d67');

// One printer per item type; each writes the item's URL followed by a line break.
$printPornstar = function (Pornstar $pornstar) {
    echo $pornstar->url(), PHP_EOL;
};
$printCategory = function (Category $category) {
    echo $category->url(), PHP_EOL;
};
$printTag = function (Tag $tag) {
    echo $tag->url(), PHP_EOL;
};

echo 'Pornstars:', PHP_EOL;
$video->pornstars()->each($printPornstar);

echo 'Categories:', PHP_EOL;
$video->categories()->each($printCategory);

echo 'Tags:', PHP_EOL;
$video->tags()->each($printTag);
/**
 * Crawls a single URL unless it is empty or already marked as visited.
 *
 * The response status is logged and counted; responses with a status of 400
 * or above (a failed request is treated as 400) are not processed further.
 * Only "text/html" responses are passed to the HTML provider; when extraction
 * yields data the page is marked as visited and its links are crawled.
 *
 * @param string    $url
 * @param int|false $depth remaining depth for internal links; false is passed on unchanged
 */
protected function crawlPages($url, $depth)
{
    if (!$url || !empty($this->pages[$url]['visited'])) {
        return;
    }

    $client = new Client();
    $client->setHeader('User-Agent', $this->parameters['user_agent']);

    try {
        $crawler = $client->request('GET', $url);
        $statusCode = $client->getResponse()->getStatus();
        $this->log(sprintf("%s: %s", $statusCode, $url));
    } catch (\Exception $e) {
        $statusCode = 400;
        $this->log(sprintf("%s: %s", $statusCode, $url));
        $this->log(sprintf("Error page retrieving (%s)", $e->getMessage()));
    }

    $this->setPageStatusStats($statusCode);
    if ($statusCode >= 400) {
        return;
    }

    if (!isset($this->pages[$url])) {
        $this->pages[$url] = array();
    }
    $this->pages[$url]['status_code'] = $statusCode;

    // Strip parameters such as "; charset=utf-8". The header may be absent,
    // so only string values are inspected.
    $contentType = $client->getResponse()->getHeader('Content-Type');
    if (is_string($contentType) && strpos($contentType, ';') !== false) {
        $contentType = substr($contentType, 0, strpos($contentType, ';'));
    }

    if ($contentType !== 'text/html') {
        return;
    }

    $provider = $this->container->get('symbio_fulltext_search.provider.html');

    // Initialised up front so the check below is well-defined when extract() throws.
    $pageInfo = null;
    try {
        $pageInfo = $provider->extract(array(
            HtmlProvider::CONFIG_CRAWLER_PARAMETERS_HANDLER => $this->parameters,
            HtmlProvider::CONFIG_CRAWLER_HANDLER => $crawler,
            HtmlProvider::CONFIG_IS_EXTERNAL_LINK_HANDLER => isset($this->pages[$url]['external_link'])
                ? $this->pages[$url]['external_link']
                : false,
        ));
    } catch (\Exception $e) {
        error_log('Error retrieving data from link: ' . $url . ' (' . $e->getMessage() . ') ');
        $this->pages[$url]['dont_index'] = true;
    }

    if (!$pageInfo) {
        return;
    }

    $this->pages[$url] = array_merge($this->pages[$url], $pageInfo);
    $this->pages[$url]['visited'] = true; // mark current url as visited

    if (!isset($this->pages[$url]['external_link']) || !$this->pages[$url]['external_link']) {
        // Internal URI: follow every link found on the page with one level less depth.
        $links = $this->extractLinks($crawler, $url);
        if (count($links)) {
            $this->crawlChildLinks($links, $depth !== false ? $depth - 1 : false);
        }
    } elseif ($this->parameters[self::CRAWL_EXTERNAL_LINKS] && $this->parameters[self::EXTERNAL_LINKS_DEPTH] > 0) {
        // External URI: followed only when enabled, using the configured external depth.
        $links = $this->extractLinks($crawler, $url);
        if (count($links)) {
            $this->crawlChildLinks($links, $this->parameters[self::EXTERNAL_LINKS_DEPTH]);
        }
    }
}