/**
  * Grabs the content from the crawled page and publishes a job on the queue.
  *
  * @param \VDB\Spider\Resource                      $resource
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
  */
 public function persist(Resource $resource, CrawlJob $crawlJob)
 {
     if (strlen($resource->getResponse()->getBody()) >= $this->maximumResourceSize) {
         throw new InvalidContentException(sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize));
     }
     $document = $this->documentResolver->getDocumentByResource($resource);
     $persistenceEvent = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
     $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $persistenceEvent);
     $message = new AMQPMessage(
         json_encode(['document' => $document->toArray(), 'metadata' => $crawlJob->getMetadata()]),
         ['delivery_mode' => 1]
     );
     $this->queue->publish($message);
 }
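
For context, here is a minimal sketch of how a consumer on the other end of the queue might read the message published above, assuming php-amqplib; only the payload shape (a 'document' array plus the crawl job 'metadata') is taken from the code, the callback wiring itself is hypothetical.

use PhpAmqpLib\Message\AMQPMessage;

// Hypothetical consumer callback; the payload shape mirrors persist() above.
$callback = function (AMQPMessage $message) {
    $payload = json_decode($message->getBody(), true);

    $document = $payload['document']; // the resolved document as an array
    $metadata = $payload['metadata']; // the crawl job metadata

    // ... hand $document over to an indexer here ...
};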
 /**
  * Writes the found URL as a job on the queue.
  *
  * A URL is only persisted to the queue when it has not been indexed yet.
  *
  * @param \Symfony\Component\EventDispatcher\GenericEvent $event
  */
 public function onDiscoverUrl(Event $event)
 {
     $crawlJob = $event->getSubject()->getCurrentCrawlJob();
     foreach ($event['uris'] as $uri) {
         // Strip the fragment part of the URL, if any; a strict comparison
         // against false also covers a '#' at position zero.
         if (($position = strpos($uri, '#')) !== false) {
             $uri = new Uri(substr($uri, 0, $position));
         }
         $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());
         if ($isBlacklisted) {
             // URL is blacklisted, so skip it and continue with the next one.
             $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));
             continue;
         }
         if (!$this->indexer->isUrlIndexedandNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
             $job = new CrawlJob(
                 UrlCheck::fixUrl($uri->normalize()->toString()),
                 (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                 $crawlJob->getBlacklist(),
                 $crawlJob->getMetadata(),
                 $crawlJob->getWhitelist()
             );
             if ($job->isAllowedToCrawl()) {
                 $this->queue->publishJob($job);
             }
         }
     }
 }
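
As a standalone illustration of the fragment handling above (plain PHP, no project classes involved): everything from '#' onwards is dropped before the URI is normalized and checked against the blacklist.

// Minimal sketch of the fragment stripping performed in onDiscoverUrl().
$uri = 'http://example.com/page.html#section-2';

if (($position = strpos($uri, '#')) !== false) {
    $uri = substr($uri, 0, $position); // 'http://example.com/page.html'
}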
Example #3
 /**
  * Function that crawls one webpage based on the given URL.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;
     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));
     $uris = [];
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);
     $crawler = $resource->getCrawler()->filterXPath('//a');
     foreach ($crawler as $node) {
         try {
             $href = $node->getAttribute('href');
             $baseUrl = $resource->getUri()->toString();
             $uri = new Uri($href, $baseUrl);
             $uris[] = $uri;
         } catch (UriSyntaxException $e) {
             // Skip links whose href does not parse as a valid URI.
         }
     }
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));
     $this->persistenceHandler->persist($resource, $crawlJob);
 }
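
The POST_DISCOVER event dispatched above is what feeds the onDiscoverUrl() listener shown earlier; a hedged wiring sketch, in which $queueingListener is an assumed name for the object that holds that method.

use Symfony\Component\EventDispatcher\EventDispatcher;

// Hypothetical wiring; only the event name and the listener signature
// come from the code above, the variable names are assumptions.
$eventDispatcher = new EventDispatcher();
$eventDispatcher->addListener(
    SpiderEvents::SPIDER_CRAWL_POST_DISCOVER,
    [$queueingListener, 'onDiscoverUrl']
);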
Example #4
 /**
  * Function that crawls one webpage based on the given URL.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  */
 public function crawl(CrawlJob $crawlJob)
 {
     $this->currentCrawlJob = $crawlJob;
     $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));
     if ($resource->getResponse()->getStatusCode() == 301) {
         $exception = new ClientErrorResponseException(
             sprintf("Page moved to %s", $resource->getResponse()->getInfo('redirect_url')),
             301
         );
         $exception->setResponse($resource->getResponse());
         throw $exception;
     }
     $uris = [];
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);
     $baseUrl = $resource->getUri()->toString();
     $crawler = $resource->getCrawler()->filterXPath('//a');
     foreach ($crawler as $node) {
         try {
             if ($node->getAttribute("rel") === "nofollow") {
                 continue;
             }
             $href = $node->getAttribute('href');
             $uri = new Uri($href, $baseUrl);
             $uris[] = $uri;
         } catch (UriSyntaxException $e) {
             // Skip links whose href does not parse as a valid URI.
         }
     }
     // Also collect URLs listed in sitemap <loc> elements, if present.
     $crawler = $resource->getCrawler()->filterXPath('//loc');
     foreach ($crawler as $node) {
         try {
             $href = $node->nodeValue;
             $uri = new Uri($href, $baseUrl);
             $uris[] = $uri;
         } catch (UriSyntaxException $e) {
             // Skip entries whose value does not parse as a valid URI.
         }
     }
     $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_POST_DISCOVER, new GenericEvent($this, ['uris' => $uris]));
     $this->persistenceHandler->persist($resource, $crawlJob);
 }
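
Because this variant turns a 301 response into a thrown ClientErrorResponseException, a caller presumably wraps crawl(); a hedged sketch of that shape, where treating the redirect as a failed job is an assumption rather than something shown in the source.

try {
    $crawler->crawl($crawlJob);
} catch (ClientErrorResponseException $e) {
    // A 301 surfaces here with the redirect target in the exception message.
    $crawler->markAsFailed($crawlJob, $e->getMessage());
}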
 /**
  * Logs a message telling that the job has failed.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  * @param string                                    $errorMessage
  */
 public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
 {
     $meta = $crawlJob->getMetadata();
     $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl(), $meta['core']);
 }
Example #6
 /**
  * Serializes a crawl job and publishes it as a message on the queue.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  */
 public function publishJob(CrawlJob $crawlJob)
 {
     $message = new AMQPMessage(json_encode($crawlJob->toArray()), ['delivery_mode' => 1]);
     return $this->publish($message);
 }
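
A usage sketch for publishJob(); the constructor argument order (URL, base URL, blacklist, metadata, whitelist) follows the CrawlJob constructions earlier in this section, and the concrete values are placeholders.

// Hypothetical usage; the argument order mirrors the CrawlJob calls above.
$crawlJob = new CrawlJob(
    'http://example.com/page.html', // URL to crawl
    'http://example.com/',          // base URL
    [],                             // blacklist
    ['core' => 'mycore'],           // metadata; the 'core' key is read by markAsFailed() earlier in this section
    []                              // whitelist
);

$queue->publishJob($crawlJob);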
 /**
  * Logs a message telling that the job has failed.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  * @param string                                    $errorMessage
  */
 public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
 {
     $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl());
 }
 /**
  * @test
  * @testdox Tests if getBaseUrl() returns the correct value.
  */
 public function ifGetBaseUrlReturnCorrectValue()
 {
     $crawlJob = new CrawlJob('', 'http://dummy.nl/', [], [], []);
     $this->assertSame('http://dummy.nl/', $crawlJob->getBaseUrl());
 }