/**
 * Removes the sources of a feed that have not been visited since the feed's last full import.
 *
 * The cleanup is skipped when no last full import date is available for the feed. When the
 * number of sources to remove exceeds the threshold calculated from the feed's total number
 * of sources, the voter decides whether the cleanup may proceed; a negative vote halts it.
 *
 * Events dispatched: FEED_CLEANUP_SKIP when skipped, PRE_CLEAN_FEED before counting,
 * FEED_CLEANUP_HALT when the voter rejects the cleanup, POST_CLEAN_FEED after cleaning.
 *
 * @param DelegatingSourceCleaner $cleaner cleaner that executes the actual removal query
 * @param Feed                    $feed    feed whose sources are checked
 * @param ThresholdVoterInterface $voter   consulted only when the threshold is exceeded
 *
 * @return int|false false when the cleanup was skipped or halted, otherwise the result of
 *                   the cleaner (passed to the POST_CLEAN_FEED event as the cleaned count)
 */
public function cleanFeed(DelegatingSourceCleaner $cleaner, Feed $feed, ThresholdVoterInterface $voter)
{
    $lastImport = $this->getLastFullImportDate($feed);

    // without a reference date there is nothing to compare the sources against
    if ($lastImport === null) {
        $skipMessage = sprintf('Skipping %s, because it has no recent imports', $feed);
        $this->logger->debug($skipMessage);

        $skipEvent = new FeedCleanupEvent($feed, 0);
        $this->eventDispatcher->dispatch(IoEvents::FEED_CLEANUP_SKIP, $skipEvent);

        return false;
    }

    $preEvent = new FeedEvent($feed);
    $this->eventDispatcher->dispatch(IoEvents::PRE_CLEAN_FEED, $preEvent);

    $formattedDate = $lastImport->format('Y-m-d H:i:s');
    $this->logger->debug(
        sprintf('Checking sources of %s that have not been visited since %s', $feed, $formattedDate)
    );

    // number of sources that were not visited since the last full import
    $repository = $this->sourceManager->getRepository();
    $staleCount = $repository->countByFeedAndUnvisitedSince($feed, $lastImport);

    // fail safe: the amount of sources to remove must not exceed the threshold
    $totalCount = $repository->countByFeed($feed);
    $limit = $this->getThreshold($totalCount);

    if ($staleCount > $limit) {
        $reason = sprintf(
            'Stopping cleanup for %s, because %s of %s sources were to be deleted, %s is the maximum.',
            $feed,
            $staleCount,
            $totalCount,
            $limit
        );

        // the voter has the final say on whether to continue past the threshold
        $approved = $voter->vote($staleCount, $totalCount, $limit, $reason);

        if (!$approved) {
            $haltEvent = new FeedCleanupHaltEvent($feed, $staleCount, $totalCount, $limit);
            $this->eventDispatcher->dispatch(IoEvents::FEED_CLEANUP_HALT, $haltEvent);

            return false;
        }
    }

    $this->logger->debug(sprintf('Cleaning %d sources for %s', $staleCount, $feed));

    $staleQuery = $repository->queryByFeedAndUnvisitedSince($feed, $lastImport)->getQuery();
    $cleaned = $cleaner->cleanByQuery($staleQuery);

    $postEvent = new FeedCleanupEvent($feed, $cleaned);
    $this->eventDispatcher->dispatch(IoEvents::POST_CLEAN_FEED, $postEvent);

    return $cleaned;
}
/**
 * Revisits the sources of the requested scrapers that are due for a revisit.
 *
 * For every scraper matched by the "scraper" argument, all sources that have not been
 * visited within the scraper's revisit frequency (in hours) are handed to the revisitor.
 * A crawl failure for a single source is reported on the output and does not stop the run.
 *
 * @inheritdoc
 *
 * @return int the command exit status; always 0
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $async = $input->getOption('async');
    $noLimit = $input->getOption('no-limit');
    $scrapers = $this->findScrapers($input->getArgument('scraper'));

    foreach ($scrapers as $scraperEntity) {
        // sources last visited before this moment are due for a revisit
        $date = new \DateTime(sprintf('-%d hours', $scraperEntity->getRevisitFrequency()));
        $builder = $this->sourceManager->getRepository()->queryByScraperAndUnvisitedSince($scraperEntity, $date);

        // iterate() yields one row at a time; the source is the first element of each row
        foreach ($builder->getQuery()->iterate() as list($source)) {
            /* @var SourceInterface $source */
            try {
                $output->writeln(sprintf('Revisiting <info>%s</info>', $source->getOriginalUrl()));
                $this->revisitor->revisit($source, $async, $noLimit);
            } catch (CrawlException $e) {
                // report the failure and continue with the next source
                $output->writeln(sprintf('<error>%s</error>', $e->getMessage()));
            }
        }
    }

    // explicit exit status: the console component expects an int from execute()
    return 0;
}
/**
 * Returns the repository provided by the source manager.
 *
 * @inheritdoc
 */
public function getRepository()
{
    $repository = $this->sourceManager->getRepository();

    return $repository;
}
/**
 * Cleans every source matched by the repository's orphaned-sources query.
 *
 * The threshold voter is not consulted by this method; the query is passed to the
 * cleaner as-is and the cleaner's result is returned unchanged.
 *
 * @inheritdoc
 */
public function clean(DelegatingSourceCleaner $cleaner, ThresholdVoterInterface $voter)
{
    $repository = $this->sourceManager->getRepository();
    $orphanedQuery = $repository->queryOrphaned()->getQuery();

    return $cleaner->cleanByQuery($orphanedQuery);
}