Example #1
0
 /**
  * Lazily produce a GET request for every pending url that should be crawled.
  *
  * The crawl queue is asked for the pending url at each successive index,
  * starting at 0, until it returns a falsy value. Urls rejected by the crawl
  * profile, or already processed according to the queue, are skipped. For
  * every other url the observer is notified and the url is marked as
  * processed before its request is yielded.
  *
  * @return \Generator
  */
 protected function getCrawlRequests() : Generator
 {
     for ($index = 0; $pending = $this->crawlQueue->getPendingUrlAtIndex($index); $index++) {
         // Short-circuit keeps the original order: the queue is only asked
         // about processing state when the profile accepts the url.
         $isSkippable = !$this->crawlProfile->shouldCrawl($pending->url)
             || $this->crawlQueue->hasAlreadyBeenProcessed($pending);

         if ($isSkippable) {
             continue;
         }

         $this->crawlObserver->willCrawl($pending->url);
         $this->crawlQueue->markAsProcessed($pending);

         yield new Request('GET', (string) $pending->url);
     }
 }
Example #2
0
 /**
  * Crawl the given url.
  *
  * Nothing happens when the crawl profile rejects the url or when it has
  * already been crawled. Otherwise the observer is notified, a GET request
  * is sent, and the observer is handed the resulting response. When the
  * request throws a RequestException, the response attached to that
  * exception is used instead. The url is then recorded as crawled, and the
  * links in the response body are crawled only when the url's host equals
  * the base url's host.
  *
  * @param \Spatie\Crawler\Url $url
  */
 protected function crawlUrl(Url $url)
 {
     if (!$this->crawlProfile->shouldCrawl($url)) {
         return;
     }

     if ($this->hasAlreadyCrawled($url)) {
         return;
     }

     $this->crawlObserver->willCrawl($url);

     try {
         $response = $this->client->request('GET', (string) $url);
     } catch (RequestException $exception) {
         // A failed request may still carry a response (e.g. an error
         // status); the falsy check below covers the case where it does not.
         $response = $exception->getResponse();
     }

     $this->crawlObserver->hasBeenCrawled($url, $response);
     $this->crawledUrls->push($url);

     if (!$response) {
         return;
     }

     if ($url->host !== $this->baseUrl->host) {
         return;
     }

     // Cast the stream instead of calling getContents(): getContents() only
     // returns what is left after the current stream position, so an observer
     // that already read the body in hasBeenCrawled() would leave nothing to
     // extract links from. The string cast reads a seekable stream from the
     // start.
     $this->crawlAllLinks((string) $response->getBody());
 }