/**
  * Logs an emergency-level message indicating that the given crawl job has failed.
  *
  * The failed URL is logged together with the reason, tagged with the job's
  * metadata 'core' so the entry can be attributed to the right Solr core.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob     the job that failed
  * @param string                                    $errorMessage human-readable reason for the failure
  */
 public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
 {
     $meta = $crawlJob->getMetadata();
     $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl(), $meta['core']);
 }
예제 #2
0
 /**
  * Crawls one webpage identified by the job's URL, dispatches discover events
  * for every hyperlink found on the page, and persists the retrieved resource.
  *
  * Anchors whose href cannot be parsed into a valid URI are silently skipped.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;

     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

     $uris = [];

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

     // The base URL is identical for every link on the page, so resolve it
     // once outside the loop instead of on each iteration.
     $baseUrl = $resource->getUri()->toString();

     $crawler = $resource->getCrawler()->filterXPath('//a');

     foreach ($crawler as $node) {
         try {
             $href = $node->getAttribute('href');
             $uris[] = new Uri($href, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed href: skip this link and continue with the next one.
         }
     }

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));

     $this->persistenceHandler->persist($resource, $crawlJob);
 }
 /**
  * Logs an emergency-level message indicating that the given crawl job has failed.
  *
  * The failed URL is logged together with the reason for the failure.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob     the job that failed
  * @param string                                    $errorMessage human-readable reason for the failure
  */
 public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
 {
     $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl());
 }
예제 #4
0
 /**
  * Crawls one webpage identified by the job's URL, collects candidate URIs
  * from anchor tags and sitemap <loc> elements, dispatches discover events,
  * and persists the retrieved resource.
  *
  * Anchors marked rel="nofollow" are ignored, and links whose href cannot be
  * parsed into a valid URI are silently skipped.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws ClientErrorResponseException when the page responds with a 301 redirect
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;

     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

     // A permanent redirect is surfaced as a client error carrying the
     // response, so the caller can reschedule the job for the new location.
     // Strict comparison: getStatusCode() returns an integer.
     if ($resource->getResponse()->getStatusCode() === 301) {
         $exception = new ClientErrorResponseException(sprintf("Page moved to %s", $resource->getResponse()->getInfo('redirect_url')), 301);
         $exception->setResponse($resource->getResponse());

         throw $exception;
     }

     $uris = [];

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

     // The base URL is identical for every link on the page; resolve it once.
     $baseUrl = $resource->getUri()->toString();

     $crawler = $resource->getCrawler()->filterXPath('//a');

     foreach ($crawler as $node) {
         try {
             // Honor rel="nofollow": such anchors must not be queued.
             if ($node->getAttribute("rel") === "nofollow") {
                 continue;
             }

             $href = $node->getAttribute('href');
             $uris[] = new Uri($href, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed href: skip this link and continue with the next one.
         }
     }

     // Sitemap documents expose their URLs in <loc> elements.
     $crawler = $resource->getCrawler()->filterXPath('//loc');

     foreach ($crawler as $node) {
         try {
             $uris[] = new Uri($node->nodeValue, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed location: skip it and continue with the next one.
         }
     }

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));

     $this->persistenceHandler->persist($resource, $crawlJob);
 }