/**
 * {@inheritdoc}
 *
 * Makes the session named after the MD5 of the proxy URL the default one,
 * registering a new proxied Goutte session first when none exists yet.
 *
 * @param Proxy $proxy
 *
 * @return mixed|void
 * @throws \Exception When the current driver is not a GoutteDriver.
 */
public function setProxy(Proxy $proxy)
{
    $sessionKey = md5($proxy->getUrl());
    $this->browser->resetSessions();

    if (!$this->browser->hasSession($sessionKey)) {
        $driverClass = get_class($this->browser->getSession()->getDriver());

        // Only the Goutte driver can be reconfigured here.
        if ($driverClass !== 'Behat\\Mink\\Driver\\GoutteDriver') {
            throw new \Exception('Error : Proxy configuration is not implemented for class ' . $driverClass . '');
        }

        $client = new Client();
        $guzzle = $client->getClient();
        $client->setClient($guzzle);
        $guzzle->setDefaultOption('proxy', $proxy->getUrl());

        $proxiedSession = new Session(new GoutteDriver($client));
        $this->browser->registerSession($sessionKey, $proxiedSession);
    }

    $this->browser->setDefaultSessionName($sessionKey);
}
/**
 * Stores the Pixie instance, prepares the HTTP client and picks up the payment service.
 *
 * @param Pixie $pixie Its `paymentTest` property is stored as the service.
 */
public function __construct(Pixie $pixie)
{
    $this->pixie = $pixie;

    // Create the client and configure the underlying Guzzle instance once.
    $this->client = new Client();
    $guzzle = $this->client->getClient();
    // NOTE(review): certificate verification is disabled here; confirm this is
    // only ever used against test endpoints.
    $guzzle->setDefaultOption('verify', false);
    $guzzle->setDefaultOption('timeout', 120);

    $this->service = $this->pixie->paymentTest;
}
/**
 * Walks through the stored links in batches of five, downloads every page
 * (and the pages listed in its parsed `links` entry) and saves the parsed result.
 *
 * When the regular parser yields no title, the two promo parsers are tried in
 * turn on the crawler of the main page.
 *
 * @return int Always 0.
 */
public function actionIndex()
{
    /** @var $entity RivegaucheLink */
    $entity = new RivegaucheLink();
    $batchSize = 5;
    $offset = 0;

    while (true) {
        $links = $entity->getLinks($offset, $batchSize);
        if (empty($links)) {
            break;
        }

        foreach ($links as $link) {
            \Yii::info(sprintf('Обрабатываем: %s ', $link['link']), 'cron');

            // A new client is created for every link, exactly as before.
            $client = new Client();
            $guzzle = $client->getClient();
            $guzzle->setDefaultOption('config/curl/' . CURLOPT_PROXY, 'http://141.101.118.147:80');
            $guzzle->setDefaultOption('config/curl/' . CURLOPT_CONNECTTIMEOUT, 10);

            $crawler = $client->request('GET', $link['link']);
            \Yii::info(sprintf('Извлекаем тело: %s ', $link['link']), 'cron');
            $head = $this->getHtml($crawler, true);
            \Yii::info(sprintf('HEAD тело: %s ', $link['link']), 'cron');

            if (!empty($head['links'])) {
                foreach ($head['links'] as $subLink) {
                    // Use a dedicated variable: reusing $crawler here made the promo
                    // fallbacks below parse the last sub-page instead of the main page.
                    $subCrawler = $client->request('GET', $subLink);
                    $subHead = $this->getHtml($subCrawler, false);
                    $subHead['link'] = $subLink;
                    $this->saveResult($subHead, $link);
                }
            }

            if (empty($head['title'])) {
                $head = $this->getPromoHTML($crawler, true);
            }
            if (empty($head['title'])) {
                $head = $this->getPromo2HTML($crawler, true);
            }

            $head['link'] = $link['link'];
            $this->saveResult($head, $link);
        }

        $offset += $batchSize;
    }

    return 0;
}
/**
 * Requests the get_video_info endpoint for the song's video and returns the
 * response body parsed as a query string.
 *
 * @param Song $song
 * @return array
 * @throws \Exception With code 697 when the parsed response has status "fail".
 */
private function getInfo(Song $song)
{
    $url = 'http://youtube.com/get_video_info?video_id=' . $song->videoId();

    /** @var Response $response */
    $response = $this->client->getClient()->get($url);
    parse_str($response->getBody(true), $info);

    if (array_key_exists('status', $info) && $info['status'] == 'fail') {
        // The failure reason is optional in the response; never read a missing
        // index or hand a non-string to the exception constructor.
        $reason = (isset($info['reason']) && is_string($info['reason']))
            ? $info['reason']
            : 'Video info request failed without a reason.';

        throw new \Exception($reason, 697);
    }

    return $info;
}
/**
 * Downloads everything found on the configured start page and, optionally,
 * on every following page reachable through the "next" link.
 *
 * @param boolean $allPages Follow the next-page links when true.
 */
public function run($allPages)
{
    $client = new Client();
    $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 30);
    $client->setHeader('User-Agent', $this->config['user_agent']);

    try {
        $crawler = $client->request('GET', $this->config['url']);
    } catch (TransferException $e) {
        // A transport failure on the first request ends the whole process.
        echo $e->getMessage() . PHP_EOL;
        exit(1);
    }

    if ($client->getResponse()->getStatus() != 200) {
        echo "site not available\n";

        return;
    }

    $this->getUrlsAndDownload($crawler);
    if (!$allPages) {
        return;
    }

    // Keep clicking "next" until no further link is found.
    for ($next = $this->getNextLink($crawler); $next; $next = $this->getNextLink($crawler)) {
        $crawler = $client->click($next);
        $this->getUrlsAndDownload($crawler);
    }
}
/**
 * Returns the original {@see \Goutte\Client} client.
 *
 * On first use a new instance is created and
 * {@see \Diggin\Bridge\Guzzle\AutoCharsetEncodingPlugin\AutoCharsetEncodingPlugin}
 * is subscribed to its underlying client to support various charsets.
 *
 * @return GoutteClient
 */
public function getClient()
{
    if (null !== $this->client) {
        return $this->client;
    }

    $this->client = new GoutteClient();
    $this->client->getClient()->addSubscriber(new AutoCharsetEncodingPlugin());

    return $this->client;
}
/**
 * Returns the shared client, creating it with a 30 second cURL timeout on first use.
 *
 * @return \Goutte\Client
 */
public static function getClient()
{
    if (static::$client) {
        return static::$client;
    }

    static::$client = new Client();
    static::$client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 30);

    return static::$client;
}
/**
 * Execute load request.
 *
 * Opens the web-load page, fills the "SEND LOAD" form with this object's
 * credentials and target number, submits it and passes the resulting HTML
 * to response().
 *
 * @return array
 */
public function execute()
{
    $client = new Client();
    // NOTE(review): certificate verification is disabled for this HTTPS endpoint.
    $client->getClient()->setDefaultOption('verify', false);

    $landingPage = $client->request('GET', 'https://loadxtreme.ph/cgi-bin/webload.cgi?state=webload');
    $form = $landingPage->selectButton('SEND LOAD')->form();

    $fields = [
        'state' => 'webload',
        'step' => '1',
        'webtype' => '',
        'uid' => $this->uid,
        'pik' => $this->pik,
        'pc' => $this->pc,
        'cellno' => $this->cellno,
        'email' => $this->email,
    ];
    $resultPage = $client->submit($form, $fields);

    return $this->response($resultPage->html());
}
/**
 * Scrapes the price table of every product from brickwat.ch and hands the
 * collected prices to writeResults() together with a dated CSV file handle.
 *
 * @param InputInterface  $input
 * @param OutputInterface $output
 *
 * @return int 0 on success, 1 when the output file cannot be opened.
 *
 * @throws \Exception Anything thrown while scraping or writing is re-thrown
 *                    after the file handle has been closed.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $file = fopen("prijzen-" . date('Y-m-d') . ".csv", "w");
    if (!$file) {
        $output->writeln("can not open file");

        // Report the failure through the exit code instead of die (which exits with 0).
        return 1;
    }

    // Initialised up front so writeResults() always receives arrays, even when
    // there are no products or no page answered with 200.
    $prices = [];
    $companys = [];

    try {
        $client = new Client();
        $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 60);

        foreach ($this->getProducts() as $product) {
            $crawler = $client->request('GET', 'http://brickwat.ch/' . $product);

            if ($client->getResponse()->getStatus() == 200) {
                $rows = $crawler->filter('#prices')->filter('tr.row-collapse');
                $rowCount = $rows->count();
                $output->writeln("Sites found for product " . $product . ": " . $rowCount);
                $prices[$product] = [];

                for ($i = 0; $i < $rowCount; $i++) {
                    $cells = $rows->eq($i)->filter('td');
                    $logo = $cells->eq(0)->filter('a img');

                    // The company is either an image (name in its title attribute,
                    // price inside a link) or plain text in the first cell.
                    if ($logo->count()) {
                        $company = $logo->attr('title');
                        $price = $cells->eq(2)->filter('a')->text();
                    } else {
                        $company = $cells->eq(0)->text();
                        $price = $cells->eq(2)->text();
                    }

                    $prices[$product][$company] = $price;
                    $companys[$company] = $company;
                }
            }

            // Pause half a second between requests.
            usleep(500000);
        }

        $this->writeResults($output, $prices, $companys, $file);
    } catch (\Exception $e) {
        // Do not leak the file handle when scraping or writing fails.
        fclose($file);

        throw $e;
    }

    fclose($file);

    return 0;
}
/**
 * Performs the query.
 *
 * Posts the form to the SINTEGRA/SP endpoint with browser-like headers and the
 * first segment of the supplied cookie string, then parses the response.
 *
 * @param string $cnpj CNPJ
 * @param string $ie IE - not tested
 * @param string $paramBot ParamBot parameter sent for captcha validation
 * @param string $captcha CAPTCHA
 * @param string $stringCookie COOKIE
 * @throws Exception When the CNPJ is invalid, the captcha is rejected (code 99)
 *                   or the response has no `body > center` element (code 99).
 * @return array Company data
 */
public static function consulta($cnpj, $ie, $paramBot, $captcha, $stringCookie)
{
    $cookieParts = explode(';', $stringCookie);

    if (!Utils::isCnpj($cnpj)) {
        throw new Exception('O CNPJ informado não é válido.');
    }

    $client = new Client();

    // Zero disables the respective cURL timeouts.
    $curlOptions = [
        CURLOPT_TIMEOUT => 0,
        CURLOPT_TIMEOUT_MS => 0,
        CURLOPT_CONNECTTIMEOUT => 0,
        CURLOPT_RETURNTRANSFER => true,
    ];
    foreach ($curlOptions as $option => $value) {
        $client->getClient()->setDefaultOption('config/curl/' . $option, $value);
    }

    $headers = [
        'Host' => 'pfeserv1.fazenda.sp.gov.br',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9, */* ;q=0.8',
        'Accept-Language' => 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding' => 'gzip, deflate',
        'Referer' => 'http://www.sintegra.gov.br/new_bv.html',
        'Cookie' => $cookieParts[0],
        'Connection' => 'keep-alive',
    ];
    foreach ($headers as $name => $value) {
        $client->setHeader($name, $value);
    }

    if (strlen($cnpj) > 0) {
        $servico = 'cnpj';
        $botao = 'Consulta por CNPJ';
    } else {
        $servico = 'ie';
        $botao = 'Consulta por IE';
    }

    $formData = [
        'hidFlag' => '0',
        'cnpj' => Utils::unmask($cnpj),
        'ie' => Utils::unmask($ie),
        'paramBot' => $paramBot,
        'Key' => $captcha,
        'servico' => $servico,
        'botao' => $botao,
    ];
    $page = $client->request('POST', 'http://pfeserv1.fazenda.sp.gov.br/sintegrapfe/sintegra', $formData);

    // The expected captcha error text contains a line break after the first sentence.
    $imageError = 'O valor da imagem esta incorreto ou expirou. ' . "\n"
        . 'Verifique novamente a imagem e digite exatamente os 5 caracteres exibidos.';

    $centers = $page->filter('body > center');
    $secondCenter = $centers->eq(1);
    if ($secondCenter->count() && $imageError == trim($secondCenter->text())) {
        throw new Exception($imageError, 99);
    }

    if (count($centers) == 0) {
        throw new Exception('Serviço indisponível!. Tente novamente.', 99);
    }

    $html = self::parseContent($client->getResponse()->__toString());

    return self::parseSelectors(new \Symfony\Component\DomCrawler\Crawler($html));
}
/**
 * Looks up a subtitle for an episode file in the configured language and,
 * when requested, downloads it.
 *
 * Progress and every reason for giving up are printed to standard output.
 *
 * @param $episodeFilename string show file name
 * @param $download boolean download the file?
 *
 * @return null|string Subtitle contents when downloaded; null when the language,
 *                     show or a matching subtitle is missing, or when $download is false.
 */
public function findSubtitle($episodeFilename, $download)
{
    $language = $this->config->getSubtitleLanguage();
    if (!isset($this->languages[$language])) {
        printf("Missing language [%s].\n", $language);

        return null;
    }

    $episode = new Episode($episodeFilename);
    if (!isset($this->shows[$episode->sanitizedShowName])) {
        printf("Missing show [%s].\n", $episode->showName);

        return null;
    }

    $languageId = $this->languages[$language];
    $showId = $this->shows[$episode->sanitizedShowName];

    $listingUrl = $this->builder->getAddictedShowAjaxUrl($showId, $episode->season, $languageId);
    printf("Trying to get subtitles from [%s].\n", $listingUrl);

    // Keep rows for the same episode number and release group whose status
    // column contains no percent sign.
    $isMatch = function (Crawler $row) use ($episode) {
        $cells = $row->children();
        $rowEpisode = $cells->getNode(1)->nodeValue;
        $rowGroup = strtolower($cells->getNode(4)->nodeValue);
        $rowStatus = strtolower($cells->getNode(5)->nodeValue);

        if ((int) $rowEpisode !== (int) $episode->ep) {
            return false;
        }
        if (!$episode->inGroups($rowGroup)) {
            return false;
        }

        return strpos($rowStatus, '%') === false;
    };

    $rows = $this->client->request('GET', $listingUrl)->filter('div#season > table > tbody > tr.epeven');
    $candidates = $rows->reduce($isMatch);

    if ($candidates->count() == 0) {
        printf(
            "Missing subtitles for show [%s] season [%s] episode [%s] \n and groups [%s].\n",
            $episode->showName,
            $episode->season,
            $episode->ep,
            implode(', ', $episode->groups)
        );

        return null;
    }

    // The download link is the first child of the tenth cell of the first match.
    $downloadCell = $candidates->first()->children()->getNode(9);
    $subtitleUrl = $this->builder->getSubtitleUrl($downloadCell->firstChild->getAttribute('href'));

    if ($download === false) {
        printf("Chosen subtitle [%s].\n", $subtitleUrl);

        return null;
    }

    printf("Downloading subtitle [%s].\n", $subtitleUrl);
    $requestOptions = ['headers' => $this->builder->getRequestHeaders($showId)];

    return $this->client->getClient()->get($subtitleUrl, $requestOptions)->getBody()->getContents();
}
/**
 * Fetches every URL stored in the `crawling` collection for news.liputan6.com
 * and saves each article paragraph as its own Scraping record.
 *
 * Prints a message for every response that is not 200 and an `<hr>` after each URL.
 */
public function get_scrap()
{
    $client = new Client();
    // NOTE(review): CURLOPT_TIMEOUT is expressed in seconds, so 60000 is a very
    // long limit; confirm milliseconds were not intended.
    $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 60000);

    $sourceHost = "news.liputan6.com";
    // Title selector; not used below.
    $title = "article.hentry > header.entry-header > h1";
    $paragraphSelector = "article.hentry > div.entry-content > div.text-detail > p";

    // Persists the text of one paragraph node.
    $storeParagraph = function ($node) {
        $record = new Scraping();
        $record->article = $node->text();
        $record->save();
    };

    $rows = DB::collection('crawling')->where('refurl', $sourceHost)->get();
    foreach ($rows as $row) {
        $page = $client->request('GET', $row['url']);

        if ($client->getResponse()->getStatus() != 200) {
            echo "we F*****G LOST DUDE !";
        } else {
            $page->filter($paragraphSelector)->each($storeParagraph);
        }

        echo "<hr>";
    }
}
/**
 * Requests every page listed in the site's sitemap and, with the `spider`
 * option, every link found on those pages, then prints the URLs that failed.
 *
 * Sitemap pages count as bad when the GET request throws or answers with
 * anything other than 200; linked URLs count as bad when the HEAD request
 * throws or answers with a status of 400 or above.
 *
 * @param InputInterface  $input
 * @param OutputInterface $output
 *
 * @return int 0 when no bad URL was found, 1 otherwise.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $base_url = $input->getArgument('baseurl');
    $spider = $input->getOption('spider');

    $sitemap_url = new UrlBuilder('/sitemap.xml', $base_url);
    $output->writeln('Crawling: ' . $sitemap_url);
    $sitemap = new SitemapCrawler($sitemap_url, $this->logger);

    $bad_urls = [];
    $progress = new ProgressBar($output, count($sitemap));
    $progress->start();

    $client = new Client();
    $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, $input->getOption('timeout'));

    // Pull in all URLs from the sitemap file(s), and compile a list of linked
    // URLs to check. $linked is keyed by the URL so that we don't have duplicates.
    $linked = [];
    foreach ($sitemap as $page_url) {
        try {
            $crawler = $client->request('GET', $page_url);
            $status = $client->getResponse()->getStatus();
        } catch (\Exception $e) {
            // Treat a transport failure like any other bad page instead of
            // aborting the whole run (the HEAD loop below does the same).
            $bad_urls[] = $page_url;
            $progress->advance();
            continue;
        }

        if ($status != 200) {
            $bad_urls[] = $page_url;
        } elseif ($spider) {
            $page_crawler = new HtmlCrawler($crawler, new UrlBuilder('', $base_url));
            foreach ($page_crawler as $page_crawl_url => $page_crawl) {
                $linked[$page_crawl_url] = null;
            }
        }

        $progress->advance();
    }
    $progress->finish();

    if ($spider) {
        $output->writeln('');
        $output->writeln('Spidering links...');
        $progress = new ProgressBar($output, count($linked));
        $progress->start();

        // Verify all linked URLs.
        foreach (array_keys($linked) as $resource_url) {
            try {
                $client->request('HEAD', $resource_url);
                if ($client->getResponse()->getStatus() >= 400) {
                    $bad_urls[] = $resource_url;
                }
            } catch (\Exception $e) {
                $bad_urls[] = $resource_url;
            }
            $progress->advance();
        }
        $progress->finish();
    }

    $output->writeln('');
    if (empty($bad_urls)) {
        $output->writeln('No errors for any page in ' . $sitemap_url);
    } else {
        foreach ($bad_urls as $item) {
            $output->writeln($item);
        }
    }
    $output->writeln('');
    $output->writeln('<info>Done.</info>');

    // @todo: Determine better exit code.
    return empty($bad_urls) ? 0 : 1;
}
/**
 * {@inheritdoc}
 *
 * Makes the session named after the MD5 of the proxy URL the default one.
 * When no such session exists yet, one is registered for the current driver:
 * Goutte gets a new client with the proxy as default option, Selenium2 gets
 * the proxy's PAC file as a desired capability.
 *
 * @param Proxy $proxy
 *
 * @return mixed|void
 * @throws \Exception When the Selenium2 proxy has no PAC file or the driver
 *                    class is not supported.
 */
public function setProxy(Proxy $proxy)
{
    $sessionName = md5($proxy->getUrl());
    $this->browser->resetSessions();

    if ($this->browser->hasSession($sessionName)) {
        $this->browser->setDefaultSessionName($sessionName);

        return;
    }

    $driver = $this->browser->getSession()->getDriver();
    $driverClass = get_class($driver);

    if ($driverClass === 'Behat\\Mink\\Driver\\GoutteDriver') {
        $client = new Client();
        $guzzle = $client->getClient();
        $guzzle->setDefaultOption('proxy', $proxy->getUrl());
        $client->setClient($guzzle);

        $session = new Session(new GoutteDriver($client));
    } elseif ($driverClass === 'Behat\\Mink\\Driver\\Selenium2Driver') {
        /* @var $driver Selenium2Driver */
        // Todo : use other files than pac file
        // Currently it does only support pac file
        if (empty($proxy->pacFile)) {
            throw new \Exception('Pac file/url is required.');
        }

        // @see https://code.google.com/p/selenium/wiki/JsonWireProtocol#Proxy_JSON_Object
        $proxyCapability = ["proxyType" => "pac", "proxyAutoconfigUrl" => $proxy->pacFile];
        $driver->setDesiredCapabilities(["proxy" => $proxyCapability]);
        $this->browser->stopSessions();

        $session = new Session($driver);
    } else {
        throw new \Exception('Error : Proxy configuration is not implemented for class ' . $driverClass . '');
    }

    $this->browser->registerSession($sessionName, $session);
    $this->browser->setDefaultSessionName($sessionName);
}
curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1); // $output contains the output string $this->result = curl_exec($this->ch); if ($this->result == false) { return false; } return true; } public function getResult() { return $this->result; } } use Goutte\Client; $client = new Client(); $client->getClient()->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 60); $crawler = $client->request('GET', 'http://www.symfony.com/blog/'); $status_code = $client->getResponse()->getStatus(); if ($status_code == 200) { echo $crawler->filterXPath('html/head/title')->text(); echo $crawler->filter('title')->text(); } /* $pageloader = new pageloader(); $doc = new DOMDocument(); $result = $pageloader->loadurl("http://www.brickwatch.net/nl/set/7280/Straight-Crossroad-Plates.html"); if ($result) { $doc->loadHTML($pageloader->getResult());
/**
 * Downloads a page and returns its crawler.
 *
 * Any exception raised by the request is logged to the `cron` category and
 * turned into a null result.
 *
 * @param string $url
 *
 * @return null|\Symfony\Component\DomCrawler\Crawler
 */
private function getData($url)
{
    $client = new Client();
    $guzzle = $client->getClient();

    // Proxy usage is disabled for now:
    // $guzzle->setDefaultOption('config/curl/' . CURLOPT_PROXY, 'http://141.101.118.147:80');

    $guzzle->setDefaultOption('verify', false);
    // Maximum number of seconds the request may take.
    $guzzle->setDefaultOption('config/curl/' . CURLOPT_TIMEOUT, 10);
    // Maximum number of seconds to wait for the connection.
    $guzzle->setDefaultOption('config/curl/' . CURLOPT_CONNECTTIMEOUT, 15);
    $client->setClient($guzzle);

    try {
        return $client->request('GET', $url);
    } catch (\Exception $e) {
        \Yii::error(sprintf('Ошибка обработки: %s %s ', $e->getMessage(), $url), 'cron');
    }

    return null;
}
/**
 * A client constructed without arguments exposes a Guzzle client by default.
 */
public function testCreatesDefaultClient()
{
    $defaultGuzzle = (new Client())->getClient();

    $this->assertInstanceOf('GuzzleHttp\\ClientInterface', $defaultGuzzle);
}
/**
 * Sets the browser driver depending on the javascript select parameter or injected browser driver.
 *
 * An injected session is registered as `custom` and made the default.
 * Otherwise a Goutte session (with the cache subscriber attached) and a
 * Selenium2 Firefox session are registered, and the default is `selenium2`
 * when JavaScript is required, `goutte` when it is not.
 *
 * @param Session $driver
 */
private function setBrowser(Session $driver = null)
{
    if ($driver !== null) {
        $this->browser = new Mink(['custom' => $driver]);
        $this->browser->setDefaultSessionName('custom');

        return;
    }

    $client = new Client();
    $guzzle = $client->getClient();
    CacheSubscriber::attach($guzzle, []);
    $client->setClient($guzzle);

    // init Mink and register sessions
    $goutteSession = new Session(new GoutteDriver($client));
    $seleniumSession = new Session(new Selenium2Driver('firefox', ["permissions.default.image" => 2]));
    $this->browser = new Mink(['goutte' => $goutteSession, 'selenium2' => $seleniumSession]);

    $this->browser->setDefaultSessionName($this->javaScriptRequired ? 'selenium2' : 'goutte');
}