/**
 * Scrapes a single URL with the scraper named by the "scraper" argument.
 *
 * Honors the "async" option (switches the scraper to async mode) and the
 * "no-limit" option (disables the crawler's rate limit when it supports
 * being disabled). At normal verbosity or higher a ScrapeOutputSubscriber
 * is attached so scrape events are written to the console output.
 *
 * @inheritdoc
 *
 * @return int 0 when the scrape completed, 1 when a CrawlException was caught
 *
 * @throws \RuntimeException when findScraper() returns null for the argument
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $scraperArgument = $input->getArgument('scraper');
    $entity = $this->findScraper($scraperArgument);
    if (null === $entity) {
        throw new \RuntimeException(sprintf('Scraper %d not found', $scraperArgument));
    }

    $scraper = $this->factory->createScraper($entity);

    if ($input->getOption('async')) {
        $scraper->setAsync(true);
    }

    if ($input->getOption('no-limit')) {
        // Only rate limits that expose an on/off switch can be disabled.
        $limit = $scraper->getCrawler()->getRateLimit();
        if ($limit instanceof EnablingRateLimitInterface) {
            $limit->disable();
        }
    }

    if ($output->getVerbosity() >= OutputInterface::VERBOSITY_NORMAL) {
        $dispatcher = $scraper->getEventDispatcher();
        $dispatcher->addSubscriber(new ScrapeOutputSubscriber($output));
    }

    try {
        $scraper->scrape($entity, $input->getArgument('url'));

        return 0;
    } catch (CrawlException $e) {
        // writeln() does not substitute printf-style placeholders, so the
        // message has to be formatted before it is handed to the output.
        $output->writeln(
            sprintf("<error>Error scraping url: %s\n\n%s</error>", $e->getUrl(), $e->getMessage())
        );

        return 1;
    }
}
/**
 * Processes one queued scrape job in async mode.
 *
 * The payload is a two-element list: the scraper entity followed by the URL
 * to scrape. A RateLimitException is converted into a RescheduleException
 * (carrying the retry date when one is available); any other CrawlException
 * is logged and reported as a failed job.
 *
 * @inheritdoc
 *
 * @return bool true when the scrape finished, false when a CrawlException was logged
 *
 * @throws RescheduleException when the scrape was rate limited
 */
public function execute(array $payload)
{
    /** @var ScraperEntity $entity */
    /** @var string $url */
    list($entity, $url) = $payload;

    $scraper = $this->factory->createScraper($entity);
    $scraper->setAsync(true);

    try {
        $scraper->scrape($entity, $url);
    } catch (RateLimitException $rateLimited) {
        // Ask for the job to be rescheduled rather than failing it.
        $reschedule = new RescheduleException();

        $retryDate = $rateLimited->getRetryDate();
        if ($retryDate) {
            $reschedule->setRescheduleDate($retryDate);
        }

        throw $reschedule;
    } catch (CrawlException $crawlFailure) {
        $this->logger->error($crawlFailure->getMessage(), ['url' => $crawlFailure->getUrl()]);

        return false;
    }

    return true;
}
/**
 * Returns the scraper for the given entity, building and caching it on first use.
 *
 * Scrapers are cached in $this->scrapers keyed by the entity id. The rate
 * limit is only touched when the scraper is first created; a cached scraper
 * is returned as-is regardless of $disableLimit.
 *
 * @param ScraperEntity $scraperEntity entity describing the scraper to build
 * @param bool          $disableLimit  disable the crawler's rate limit on creation,
 *                                     if the limit supports being disabled
 *
 * @return ScraperInterface
 */
protected function createScraper(ScraperEntity $scraperEntity, $disableLimit = false)
{
    $id = $scraperEntity->getId();

    // Cache hit: hand back the previously built instance.
    if (array_key_exists($id, $this->scrapers)) {
        return $this->scrapers[$id];
    }

    $scraper = $this->factory->createScraper($scraperEntity);

    if ($disableLimit) {
        $rateLimit = $scraper->getCrawler()->getRateLimit();
        if ($rateLimit instanceof EnablingRateLimitInterface) {
            $rateLimit->disable();
        }
    }

    $this->scrapers[$id] = $scraper;

    return $scraper;
}
/**
 * Runs the scraper for one entity against the entity's own URL and records the run.
 *
 * The "async" option switches the scraper to async mode; the "no-limit"
 * option disables the crawler's rate limit when it supports being disabled.
 * After scrape() returns, the entity's "last started" timestamp is set to
 * the current time and the entity is flushed through the Doctrine manager.
 *
 * NOTE(review): the timestamp is written after scrape() returns, not before
 * it starts — confirm this matches the intended meaning of "last started".
 *
 * @param InputInterface $input
 * @param Scraper        $scraperEntity
 */
protected function scrape(InputInterface $input, $scraperEntity)
{
    $scraper = $this->factory->createScraper($scraperEntity);

    $runAsync = $input->getOption('async');
    if ($runAsync) {
        $scraper->setAsync(true);
    }

    $skipLimit = $input->getOption('no-limit');
    if ($skipLimit) {
        $rateLimit = $scraper->getCrawler()->getRateLimit();
        if ($rateLimit instanceof EnablingRateLimitInterface) {
            $rateLimit->disable();
        }
    }

    $startUrl = $scraperEntity->getUrl();
    $scraper->scrape($scraperEntity, $startUrl);

    $scraperEntity->setDatetimeLastStarted(new \DateTime());

    $manager = $this->doctrine->getManager();
    $manager->flush($scraperEntity);
}
/**
 * Requesting the handler named "foo" from the factory must raise an
 * \OutOfBoundsException with the message given in the annotation below.
 *
 * Presumably the test fixture never registers a "foo" handler — verify
 * against the test's setUp().
 *
 * @expectedException \OutOfBoundsException
 * @expectedExceptionMessage Handler "foo" is not registered
 */
public function testMissingHandler() { $this->factory->getHandler('foo'); }
/**
 * Looks up a crawler by delegating to the factory's getCrawler().
 *
 * @param string $crawler identifier passed through to the factory unchanged
 *
 * @return CrawlerInterface
 */
protected function findCrawler($crawler)
{
    $resolvedCrawler = $this->factory->getCrawler($crawler);

    return $resolvedCrawler;
}