/**
 * Lazily produce one GET request per pending url in the crawl queue.
 *
 * The queue is walked by index. A url is skipped when the crawl profile
 * rejects it or when the queue reports it as already processed. For every
 * remaining url the observer is notified and the url is marked as processed
 * before its request is yielded.
 *
 * @return \Generator
 */
protected function getCrawlRequests() : Generator
{
    // The loop ends as soon as the queue returns a falsy value for the index.
    for ($index = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index); $index++) {
        // Short-circuit keeps the original order: profile check first, queue check second.
        if (
            ! $this->crawlProfile->shouldCrawl($crawlUrl->url)
            || $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
        ) {
            continue;
        }

        $this->crawlObserver->willCrawl($crawlUrl->url);
        $this->crawlQueue->markAsProcessed($crawlUrl);

        // The index only advances once the consumer resumes the generator.
        yield new Request('GET', (string) $crawlUrl->url);
    }
}
/**
 * Fetch a single url and, when it lives on the base host, follow its links.
 *
 * Nothing happens when the crawl profile rejects the url or when it was
 * crawled before. Otherwise the observer is notified before and after the
 * GET request, and the url is recorded as crawled regardless of the outcome.
 *
 * @param \Spatie\Crawler\Url $url
 */
protected function crawlUrl(Url $url)
{
    // Short-circuit keeps the original order: profile check first, history check second.
    if (! $this->crawlProfile->shouldCrawl($url) || $this->hasAlreadyCrawled($url)) {
        return;
    }

    $this->crawlObserver->willCrawl($url);

    try {
        $response = $this->client->request('GET', (string) $url);
    } catch (RequestException $requestException) {
        // Fall back to whatever response the exception carries; the guard
        // below treats a falsy value as "no response available".
        $response = $requestException->getResponse();
    }

    $this->crawlObserver->hasBeenCrawled($url, $response);
    $this->crawledUrls->push($url);

    // Links are only followed for responses that exist and belong to the base host.
    if ($response && $url->host === $this->baseUrl->host) {
        $html = $response->getBody()->getContents();

        $this->crawlAllLinks($html);
    }
}