/**
 * Logs a message indicating that the given crawl job has failed.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 * @param string $errorMessage
 */
public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
{
    $meta = $crawlJob->getMetadata();

    $this->logMessage(
        'emergency',
        sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()),
        $crawlJob->getUrl(),
        $meta['core']
    );
}
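A hypothetical call site for markAsFailed, assuming a consumer that wraps the crawl in a try/catch ($spider and the surrounding consumer class are assumptions for illustration, not part of the code above):

try {
    $spider->crawl($crawlJob);
} catch (\Exception $e) {
    // Produces an emergency log entry such as:
    //   Failed (404 Not Found) http://example.com/missing
    // attributed to the core named in the job's metadata.
    $this->markAsFailed($crawlJob, $e->getMessage());
}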
/**
 * Crawls one webpage based on the given URL.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 */
public function crawl(CrawlJob $crawlJob)
{
    $this->currentCrawlJob = $crawlJob;

    $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

    $uris = [];

    $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

    // Discover links by collecting the href of every anchor on the page.
    $crawler = $resource->getCrawler()->filterXPath('//a');

    foreach ($crawler as $node) {
        try {
            $href = $node->getAttribute('href');
            $baseUrl = $resource->getUri()->toString();
            $uri = new Uri($href, $baseUrl);
            $uris[] = $uri;
        } catch (UriSyntaxException $e) {
            // Skip anchors whose href cannot be parsed as a URI.
        }
    }

    $this->eventDispatcher->dispatch(
        SpiderEvents::SPIDER_CRAWL_POST_DISCOVER,
        new GenericEvent($this, ['uris' => $uris])
    );

    $this->persistenceHandler->persist($resource, $crawlJob);
}
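The discovery loop above hinges on new Uri($href, $baseUrl) resolving relative hrefs against the page's own URI. A minimal sketch of that behavior, assuming Uri is VDB\Uri\Uri from the vdb/uri package used by php-spider (the example values are illustrative):

use VDB\Uri\Uri;

$baseUrl = 'http://example.com/news/index.html';

// A relative href resolves against the base URL of the page...
$uri = new Uri('/contact', $baseUrl);
echo $uri->toString(); // http://example.com/contact

// ...while an absolute href passes through unchanged.
$uri = new Uri('http://other.example.org/page', $baseUrl);
echo $uri->toString(); // http://other.example.org/page

// Hrefs that fail to parse throw UriSyntaxException, which is why the
// discovery loop catches it and simply skips the link.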
/**
 * Logs a message indicating that the given crawl job has failed.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 * @param string $errorMessage
 */
public function markAsFailed(CrawlJob $crawlJob, $errorMessage)
{
    $this->logMessage(
        'emergency',
        sprintf("Failed (%s) %s", $errorMessage, $crawlJob->getUrl()),
        $crawlJob->getUrl()
    );
}
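logMessage itself is not shown in this section. A minimal hypothetical sketch of what such a helper could look like, assuming the class holds a PSR-3 logger (the property name, parameters, and context shape are assumptions):

use Psr\Log\LoggerInterface;

/**
 * Hypothetical sketch of a logMessage helper backed by PSR-3.
 */
private function logMessage($level, $message, $url)
{
    // LoggerInterface::log() accepts an RFC 5424 level name such as
    // 'emergency' together with a message and arbitrary context.
    $this->logger->log($level, $message, ['url' => $url]);
}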
/**
 * Crawls one webpage based on the given URL.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws ClientErrorResponseException when the page responds with a 301 redirect
 */
public function crawl(CrawlJob $crawlJob)
{
    $this->currentCrawlJob = $crawlJob;

    $resource = $this->requestHandler->request(new Uri($crawlJob->getUrl()));

    // A permanent redirect is not followed; the new location is exposed
    // to the caller through the exception message and attached response.
    if ($resource->getResponse()->getStatusCode() == 301) {
        $exception = new ClientErrorResponseException(
            sprintf("Page moved to %s", $resource->getResponse()->getInfo('redirect_url')),
            301
        );
        $exception->setResponse($resource->getResponse());

        throw $exception;
    }

    $uris = [];

    $this->eventDispatcher->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_DISCOVER);

    $baseUrl = $resource->getUri()->toString();

    // Discover links from anchors, honoring rel="nofollow".
    $crawler = $resource->getCrawler()->filterXPath('//a');

    foreach ($crawler as $node) {
        try {
            if ($node->getAttribute("rel") === "nofollow") {
                continue;
            }

            $href = $node->getAttribute('href');
            $uri = new Uri($href, $baseUrl);
            $uris[] = $uri;
        } catch (UriSyntaxException $e) {
            // Skip anchors whose href cannot be parsed as a URI.
        }
    }

    // Also discover URLs listed in sitemap <loc> elements.
    $crawler = $resource->getCrawler()->filterXPath('//loc');

    foreach ($crawler as $node) {
        try {
            $href = $node->nodeValue;
            $uri = new Uri($href, $baseUrl);
            $uris[] = $uri;
        } catch (UriSyntaxException $e) {
            // Skip entries that cannot be parsed as a URI.
        }
    }

    $this->eventDispatcher->dispatch(
        SpiderEvents::SPIDER_CRAWL_POST_DISCOVER,
        new GenericEvent($this, ['uris' => $uris])
    );

    $this->persistenceHandler->persist($resource, $crawlJob);
}
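Since this version of crawl() refuses to follow a permanent redirect, the decision moves to the caller. A hypothetical sketch of handling the thrown exception, assuming ClientErrorResponseException is Guzzle 3's class as the getInfo('redirect_url') call suggests ($spider and the requeueing step are schematic; the actual queue API is outside this section):

use Guzzle\Http\Exception\ClientErrorResponseException;

try {
    $spider->crawl($crawlJob);
} catch (ClientErrorResponseException $e) {
    if (301 === $e->getResponse()->getStatusCode()) {
        // The new location was captured when the exception was built.
        $redirectUrl = $e->getResponse()->getInfo('redirect_url');

        // A real consumer would publish a fresh CrawlJob for $redirectUrl
        // here instead of re-crawling the moved page under its old URL.
    }
}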