/**
 * Execute the console command.
 *
 * Creates a Crawler record for the URL given in the command's `url`
 * argument and then returns.
 *
 * NOTE(review): this method used to carry additional statements after an
 * unconditional `return` (delete any existing record for the URL, fetch the
 * page with $client, parse it and run a Spider on it). They could never
 * execute, so they were removed. If the immediate fetch-and-parse was the
 * intended behaviour, restore it from version control instead of the
 * early return.
 *
 * @param Client $client Not used by the current implementation; retained so
 *                       the method signature stays unchanged for callers.
 *
 * @return void
 */
public function handle(Client $client)
{
    Crawler::create(['url' => $this->argument('url')]);
}
/**
 * Crawl every link returned by getLinks($id) and register newly found URLs.
 *
 * Each link is requested through $this->client and handled by status:
 *  - 404: the link is deleted;
 *  - any other non-200 status: the link is flagged as not crawled, its
 *    failed_tries is incremented and the status is logged;
 *  - 200: the body is parsed inside a DB transaction. When isProduct()
 *    accepts the link URL, a Spider is run on the page. Every anchor whose
 *    normalized URL is not yet present in CrawlerTable is inserted through
 *    $this->crawler->create(). On success failed_tries and error_log are
 *    reset on the link.
 *
 * An exception while processing a page rolls the transaction back and is
 * recorded on the link (failed_tries, error_log). An exception anywhere else
 * in the iteration is logged and the loop moves on to the next link.
 * After the loop the 'crawler' session value is decremented by one.
 *
 * NOTE(review): if $this->client is a Guzzle client with the default
 * `http_errors` option, 4xx/5xx responses raise exceptions and end up in the
 * outer catch rather than in the status branches below - confirm the client
 * configuration.
 *
 * @param mixed $id Identifier forwarded unchanged to getLinks().
 *
 * @return void
 */
public function crawl($id)
{
    $links = $this->getLinks($id);
    $uuid = uniqid();
    Log::info("Starting to crawl {$uuid}... " . Carbon::now()->toDateTimeString());

    $counter = 0;

    foreach ($links as $link) {
        try {
            $request = $this->client->get($link->url);

            // Read the status once; the cast keeps the strict comparisons
            // below valid even if the client reports it as a numeric string.
            $status = (int) $request->getStatusCode();

            if ($status === 404) {
                $link->delete();
                continue;
            }

            if ($status !== 200) {
                $link->ind_crawled = false;
                $link->failed_tries = $link->failed_tries + 1;
                $link->save();
                Log::info("Erro de requisição: STATUS " . $status);
                continue;
            }

            // Captured before the success path resets the counter in memory:
            // if save()/commit() fails afterwards, the rollback does not
            // restore the model attribute and the previous count would be lost.
            $previousFailures = $link->failed_tries;

            DB::beginTransaction();
            try {
                // Explicit string conversion of the response body before parsing.
                $html = (string) $request->getBody();
                $dom = new DomCrawler();
                $dom->addHtmlContent($html);

                if ($this->isProduct($link->url)) {
                    $spider = new Spider($link, $dom);
                    $spider->get();
                }

                foreach ($dom->filter("a") as $anchor) {
                    $url = $this->normalizeUrl($anchor->getAttribute("href"));
                    if ($url === false) {
                        // normalizeUrl() rejected this href.
                        continue;
                    }

                    $existing = CrawlerTable::where('url', '=', $url)->first();
                    if ($existing !== null) {
                        continue;
                    }

                    try {
                        $this->crawler->create(['url' => $url]);
                        $counter++;
                    } catch (Exception $e) {
                        // A single failed insert must not abort the whole page.
                        Log::info($e->getMessage());
                    }
                }

                $link->failed_tries = 0;
                $link->error_log = null;
                $link->save();
                DB::commit();
            } catch (Exception $e) {
                DB::rollBack();
                Log::info($e->getMessage() . " / Url: " . $link->url);
                Log::info($e->getTraceAsString());

                $link->ind_crawled = false;
                $link->failed_tries = $previousFailures + 1;
                $link->error_log = $e->getMessage() . PHP_EOL . PHP_EOL . $e->getTraceAsString();
                $link->save();
            } finally {
                // Drop the references to the parsed document and raw body
                // before moving on to the next link.
                $dom = null;
                $html = null;
            }
        } catch (Exception $e) {
            // Include the cause and the affected URL; the bare prefix alone
            // made request failures impossible to diagnose from the log.
            Log::info("Houve um erro na requisição: " . $e->getMessage() . " / Url: " . $link->url);
        }
    }

    Log::info("Crawling finished {$uuid}, Links inseridos: {$counter}!");
    session()->set('crawler', session()->get('crawler') - 1);
}