/**
 * Extracts the content from the crawled page and publishes it as a job on the queue.
 *
 * @param \VDB\Spider\Resource                      $resource
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the response body reaches the configured size limit
 */
public function persist(Resource $resource, CrawlJob $crawlJob)
{
    $bodySize = strlen($resource->getResponse()->getBody());

    if ($bodySize >= $this->maximumResourceSize) {
        throw new InvalidContentException(
            sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize)
        );
    }

    $document = $this->documentResolver->getDocumentByResource($resource);
    $metadata = $crawlJob->getMetadata();

    $this->eventDispatcher->dispatch(
        PersistenceEvents::PRE_PERSIST,
        new PersistenceEvent($document, $resource, $metadata)
    );

    // Payload combines the resolved document with the job metadata.
    $payload = [
        'document' => $document->toArray(),
        'metadata' => $metadata,
    ];

    $this->queue->publish(new AMQPMessage(json_encode($payload), ['delivery_mode' => 1]));
}
/**
 * Writes each found URL as a job on the queue.
 *
 * A URL is only persisted to the queue when it has not been indexed yet
 * (or its index entry has expired) and it is allowed to be crawled.
 *
 * @param \Symfony\Component\EventDispatcher\GenericEvent $event event carrying the discovered 'uris'
 */
public function onDiscoverUrl(Event $event)
{
    $crawlJob = $event->getSubject()->getCurrentCrawlJob();

    foreach ($event['uris'] as $uri) {
        // Strip the fragment part of the URL. Note: a '#' at position 0 is
        // deliberately skipped, since substr(..., 0, 0) would yield an empty URI.
        $position = strpos($uri, '#');
        if ($position !== false && $position > 0) {
            $uri = new Uri(substr($uri, 0, $position));
        }

        $normalizedUrl = $uri->normalize()->toString();

        if (UrlCheck::isUrlBlacklisted($normalizedUrl, $crawlJob->getBlacklist())) {
            $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));

            continue; //url blacklisted, so go to next one
        }

        if (!$this->indexer->isUrlIndexedandNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
            $job = new CrawlJob(
                UrlCheck::fixUrl($normalizedUrl),
                (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                $crawlJob->getBlacklist(),
                $crawlJob->getMetadata(),
                $crawlJob->getWhitelist()
            );

            if ($job->isAllowedToCrawl()) {
                $this->queue->publishJob($job);
            }
        }
    }
}
/**
 * Crawls one webpage based on the URL of the given crawl job.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 */
public function crawl(CrawlJob $crawlJob)
{
    $this->currentCrawlJob = $crawlJob;

    $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

    $uris = [];

    $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

    // The base URL is identical for every link on the page, so resolve it once
    // instead of on every loop iteration.
    $baseUrl = $resource->getUri()->toString();

    $crawler = $resource->getCrawler()->filterXPath('//a');

    foreach ($crawler as $node) {
        try {
            $uris[] = new Uri($node->getAttribute('href'), $baseUrl);
        } catch (UriSyntaxException $e) {
            //too bad
        }
    }

    $this->eventDispatcher->dispatch(
        SpiderEvents::SPIDER_CRAWL_POST_DISCOVER,
        new GenericEvent($this, ['uris' => $uris])
    );

    $this->persistenceHandler->persist($resource, $crawlJob);
}
/**
 * Crawls one webpage based on the URL of the given crawl job.
 *
 * Discovers links from anchor tags (honouring rel="nofollow") and from
 * sitemap <loc> elements, then hands the resource to the persistence handler.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws ClientErrorResponseException when the page responds with a 301 redirect
 */
public function crawl(CrawlJob $crawlJob)
{
    $this->currentCrawlJob = $crawlJob;

    $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));
    $response = $resource->getResponse();

    if ($response->getStatusCode() == 301) {
        $exception = new ClientErrorResponseException(
            sprintf("Page moved to %s", $response->getInfo('redirect_url')),
            301
        );
        $exception->setResponse($response);

        throw $exception;
    }

    $discoveredUris = [];

    $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

    $pageBaseUrl = $resource->getUri()->toString();

    // Anchor tags: skip links the page explicitly marked as nofollow.
    foreach ($resource->getCrawler()->filterXPath('//a') as $anchor) {
        try {
            if ($anchor->getAttribute("rel") === "nofollow") {
                continue;
            }

            $discoveredUris[] = new Uri($anchor->getAttribute('href'), $pageBaseUrl);
        } catch (UriSyntaxException $e) {
            //too bad
        }
    }

    // Sitemap <loc> elements: their text content is the URL itself.
    foreach ($resource->getCrawler()->filterXPath('//loc') as $location) {
        try {
            $discoveredUris[] = new Uri($location->nodeValue, $pageBaseUrl);
        } catch (UriSyntaxException $e) {
            //too bad
        }
    }

    $this->eventDispatcher->dispatch(
        SpiderEvents::SPIDER_CRAWL_POST_DISCOVER,
        new GenericEvent($this, ['uris' => $discoveredUris])
    );

    $this->persistenceHandler->persist($resource, $crawlJob);
}
/**
 * Logs an emergency message that tells the given crawl job has failed.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 * @param string                                    $errorMessage description of the failure, embedded in the log line
 */
public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
{
    $meta = $crawlJob->getMetadata();

    // NOTE(review): assumes the job metadata always contains a 'core' entry — confirm with the producers of CrawlJob.
    $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl(), $meta['core']);
}
/**
 * Serializes the given crawl job and publishes it as a message on the queue.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @return mixed result of publishing the message
 */
public function publishJob(CrawlJob $crawlJob)
{
    $payload = json_encode($crawlJob->toArray());

    return $this->publish(new AMQPMessage($payload, ['delivery_mode' => 1]));
}
/**
 * Logs an emergency message that tells the given crawl job has failed.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 * @param string                                    $errorMessage description of the failure, embedded in the log line
 */
public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
{
    $this->logMessage('emergency', sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()), $crawlJob->getUrl());
}
/**
 * @test
 * @testdox Tests if get base url returns correct value.
 */
public function ifGetBaseUrlReturnCorrectValue()
{
    $expectedBaseUrl = 'http://dummy.nl/';

    $job = new CrawlJob('', $expectedBaseUrl, [], [], []);

    $this->assertSame($expectedBaseUrl, $job->getBaseUrl());
}