Exemplo n.º 1
0
 /**
  * Crawls a single webpage identified by the given crawl job's URL.
  *
  * Discovered anchor URIs are collected and broadcast via the
  * SPIDER_CRAWL_POST_DISCOVER event; the fetched resource is then persisted
  * together with its crawl job.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;
     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

     $uris = [];
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

     // Relative hrefs are resolved against the resource URI; compute the
     // base once instead of re-serializing the URI on every anchor node.
     $baseUrl = $resource->getUri()->toString();

     $crawler = $resource->getCrawler()->filterXPath('//a');
     foreach ($crawler as $node) {
         try {
             $href = $node->getAttribute('href');
             $uris[] = new Uri($href, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed href: skip this link and keep discovering the rest.
         }
     }

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));
     $this->persistenceHandler->persist($resource, $crawlJob);
 }
Exemplo n.º 2
0
 /**
  * Crawls a single webpage identified by the given crawl job's URL.
  *
  * A 301 response aborts the crawl by throwing a ClientErrorResponseException
  * carrying the redirect target, so the caller can handle the moved page.
  * Otherwise, anchor hrefs (excluding rel="nofollow" links) and sitemap-style
  * <loc> entries are collected, broadcast via the SPIDER_CRAWL_POST_DISCOVER
  * event, and the resource is persisted together with its crawl job.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws ClientErrorResponseException when the page responds with a 301 redirect
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;
     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

     // Surface permanent redirects to the caller instead of indexing the
     // stale URL. Strict comparison: Guzzle's getStatusCode() returns an int.
     if ($resource->getResponse()->getStatusCode() === 301) {
         $exception = new ClientErrorResponseException(sprintf("Page moved to %s", $resource->getResponse()->getInfo('redirect_url')), 301);
         $exception->setResponse($resource->getResponse());
         throw $exception;
     }

     $uris = [];
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

     $baseUrl = $resource->getUri()->toString();

     // Regular hyperlinks; rel="nofollow" anchors are deliberately skipped.
     $crawler = $resource->getCrawler()->filterXPath('//a');
     foreach ($crawler as $node) {
         try {
             if ($node->getAttribute("rel") === "nofollow") {
                 continue;
             }
             $href = $node->getAttribute('href');
             $uris[] = new Uri($href, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed href: skip this link and keep discovering the rest.
         }
     }

     // Sitemap-style <loc> elements carry the URL as their text content.
     $crawler = $resource->getCrawler()->filterXPath('//loc');
     foreach ($crawler as $node) {
         try {
             $href = $node->nodeValue;
             $uris[] = new Uri($href, $baseUrl);
         } catch (UriSyntaxException $e) {
             // Malformed location value: skip it.
         }
     }

     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));
     $this->persistenceHandler->persist($resource, $crawlJob);
 }