/**
 * Cleans the sources of a feed that have not been visited since the date
 * returned by getLastFullImportDate().
 *
 * The feed is skipped when that date is null. When the number of unvisited
 * sources exceeds the threshold computed from the feed's total, the voter
 * decides whether the cleanup may still proceed.
 *
 * @param DelegatingSourceCleaner $cleaner
 * @param Feed                    $feed
 * @param ThresholdVoterInterface $voter
 *
 * @return int|false the value reported by the cleaner, or false when the
 *                   cleanup was skipped or halted
 */
public function cleanFeed(DelegatingSourceCleaner $cleaner, Feed $feed, ThresholdVoterInterface $voter)
{
    $expireDate = $this->getLastFullImportDate($feed);

    // Without a reference date there is nothing to compare visits against.
    if (null === $expireDate) {
        $this->logger->debug(sprintf('Skipping %s, because it has no recent imports', $feed));
        $this->eventDispatcher->dispatch(IoEvents::FEED_CLEANUP_SKIP, new FeedCleanupEvent($feed, 0));

        return false;
    }

    $this->eventDispatcher->dispatch(IoEvents::PRE_CLEAN_FEED, new FeedEvent($feed));
    $this->logger->debug(
        sprintf(
            'Checking sources of %s that have not been visited since %s',
            $feed,
            $expireDate->format('Y-m-d H:i:s')
        )
    );

    // Count the candidates for removal and derive the fail-safe limit from the feed's total.
    $repository = $this->sourceManager->getRepository();
    $staleCount = $repository->countByFeedAndUnvisitedSince($feed, $expireDate);
    $totalCount = $repository->countByFeed($feed);
    $threshold = $this->getThreshold($totalCount);

    // The voter is only consulted when the limit is exceeded; a negative vote halts the cleanup.
    $halted = $staleCount > $threshold && !$voter->vote(
        $staleCount,
        $totalCount,
        $threshold,
        sprintf(
            'Stopping cleanup for %s, because %s of %s sources were to be deleted, %s is the maximum.',
            $feed,
            $staleCount,
            $totalCount,
            $threshold
        )
    );

    if ($halted) {
        $this->eventDispatcher->dispatch(
            IoEvents::FEED_CLEANUP_HALT,
            new FeedCleanupHaltEvent($feed, $staleCount, $totalCount, $threshold)
        );

        return false;
    }

    $this->logger->debug(sprintf('Cleaning %d sources for %s', $staleCount, $feed));

    $staleQuery = $repository->queryByFeedAndUnvisitedSince($feed, $expireDate)->getQuery();
    $cleaned = $cleaner->cleanByQuery($staleQuery);

    $this->eventDispatcher->dispatch(IoEvents::POST_CLEAN_FEED, new FeedCleanupEvent($feed, $cleaned));

    return $cleaned;
}
/**
 * Processes the source identified by the "id" argument, linking it first
 * when the processor reports it as not yet linked.
 *
 * Returns 1 when the source cannot be found, 0 otherwise.
 *
 * @inheritdoc
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $sourceId = $input->getArgument('id');
    $source = $this->sourceManager->findById($sourceId);

    if (null === $source) {
        $output->writeln(sprintf('<error>Could not find source with id %d</error>', $sourceId));

        return 1;
    }

    // Make sure the source is linked before it gets processed.
    if (!$this->sourceProcessor->isLinked($source)) {
        $output->writeln('Linking source first');
        $this->sourceProcessor->link($source);
    }

    $this->sourceProcessor->process($source);
    $this->sourceManager->flush($source);

    $output->writeln(sprintf('Source <info>%d</info> has been processed', $sourceId));

    return 0;
}
/**
 * Revisits the sources of the selected scrapers that have not been visited
 * within each scraper's revisit frequency (in hours).
 *
 * Crawl failures are written to the output and do not abort the run.
 *
 * @inheritdoc
 *
 * @return int the command exit status; always 0
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $async = $input->getOption('async');
    $noLimit = $input->getOption('no-limit');

    $scrapers = $this->findScrapers($input->getArgument('scraper'));

    foreach ($scrapers as $scraperEntity) {
        // Sources not visited since this moment are due for a revisit.
        $date = new \DateTime(sprintf('-%d hours', $scraperEntity->getRevisitFrequency()));
        $builder = $this->sourceManager->getRepository()->queryByScraperAndUnvisitedSince($scraperEntity, $date);

        foreach ($builder->getQuery()->iterate() as list($source)) {
            /* @var SourceInterface $source */
            try {
                $output->writeln(sprintf('Revisiting <info>%s</info>', $source->getOriginalUrl()));
                $this->revisitor->revisit($source, $async, $noLimit);
            } catch (CrawlException $e) {
                // A failing source is reported, then the remaining sources are still revisited.
                $output->writeln(sprintf('<error>%s</error>', $e->getMessage()));
            }
        }
    }

    // Return an explicit integer exit status instead of falling off the end of the method.
    return 0;
}
/**
 * A SourceProcessException thrown while processing must make the executor
 * return false and leave an error message on the source under "process".
 */
public function testProcessException()
{
    $logger = new NullLogger();
    $executor = new SourceProcessExecutor($this->manager, $this->processor, $logger);
    $source = new SourceMock(12345);

    $this->manager
        ->expects($this->once())
        ->method('findById')
        ->will($this->returnValue($source));

    $this->processor
        ->expects($this->once())
        ->method('isLinked')
        ->will($this->returnValue(false));

    $failure = new SourceProcessException('Foobar');
    $this->processor
        ->expects($this->once())
        ->method('process')
        ->will($this->throwException($failure));

    $payload = $this->getPayload($executor, $source);
    $this->assertFalse($executor->execute($payload));

    // The exception message should have been recorded on the source.
    $messages = $source->getMessages();
    $this->assertInternalType('array', $messages);
    $this->assertArrayHasKey('process', $messages);

    $processMessages = $messages['process'];
    $this->assertArrayHasKey(LogLevel::ERROR, $processMessages);
    $this->assertContains('Foobar', $processMessages[LogLevel::ERROR]);
}
/**
 * Removes every source yielded by the given query, dispatching a pre- and
 * post-clean event around each removal.
 *
 * The source manager is flushed and cleared after every 50 removals, and
 * once more at the end when at least one source was removed.
 *
 * @param AbstractQuery $query
 *
 * @throws \LogicException when the query yields something other than a SourceInterface
 * @return int the number of removed sources
 */
public function cleanByQuery(AbstractQuery $query)
{
    $batchSize = 50;
    $cleaned = 0;

    /** @var SourceInterface $source */
    foreach ($query->iterate() as list($source)) {
        if (!($source instanceof SourceInterface)) {
            $encountered = is_object($source) ? get_class($source) : gettype($source);

            throw new \LogicException(
                sprintf('Invalid iterator given, encountered %s instead of SourceInterface', $encountered)
            );
        }

        $this->eventDispatcher->dispatch(IoEvents::PRE_CLEAN_SOURCE, new SourceEvent($source));
        $this->sourceManager->remove($source);
        $this->eventDispatcher->dispatch(IoEvents::POST_CLEAN_SOURCE, new SourceEvent($source));

        $cleaned++;

        // Flush and clear after each full batch of removals.
        if (0 === $cleaned % $batchSize) {
            $this->sourceManager->flush();
            $this->sourceManager->clear();
        }
    }

    // Nothing was removed, so there is nothing left to flush.
    if ($cleaned <= 0) {
        return $cleaned;
    }

    $this->sourceManager->flush();
    $this->sourceManager->clear();

    return $cleaned;
}
/**
 * Removes the given source through the source manager and flushes that
 * removal right away.
 *
 * @param SourceInterface $source
 */
protected function removeSource(SourceInterface $source)
{
    $manager = $this->sourceManager;

    $manager->remove($source);
    $manager->flush($source);
}
/**
 * Looks up a source by its id through the source manager.
 *
 * @param int $sourceId
 *
 * @return SourceInterface|null whatever the manager's findById() returns
 *                              (presumably null when no source matches — confirm)
 */
protected function findSource($sourceId)
{
    $source = $this->sourceManager->findById($sourceId);

    return $source;
}
/**
 * Clears the source manager and resets the locally kept source lists.
 *
 * @inheritdoc
 */
public function clear()
{
    $this->sourceManager->clear();

    // Both local lists start over empty.
    $this->sources = $this->originSources = [];
}
/**
 * Cleans all sources matched by the repository's orphaned-sources query.
 *
 * The voter is accepted to satisfy the signature but is not consulted here.
 *
 * @inheritdoc
 */
public function clean(DelegatingSourceCleaner $cleaner, ThresholdVoterInterface $voter)
{
    $repository = $this->sourceManager->getRepository();
    $orphanedQuery = $repository->queryOrphaned()->getQuery();

    return $cleaner->cleanByQuery($orphanedQuery);
}