/**
 * Start a consumer that listens for documents that have to be saved to the index.
 *
 * Blocks inside Queue::listen(); every received message is settled exactly
 * once: rejected when its body is empty, otherwise handed to the indexer
 * and acknowledged.
 *
 * @param \Symfony\Component\Console\Input\InputInterface   $input
 * @param \Symfony\Component\Console\Output\OutputInterface $output
 *
 * @return integer
 */
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $this->queue->listen(function ($message) {
         // A message without a payload cannot be indexed; drop it.
         if (strlen($message->body) === 0) {
             $this->queue->rejectMessage($message);

             return;
         }

         $this->indexer->prepareDocument($message);
         $this->queue->acknowledge($message);
     });

     // NOTE(review): only reached when listen() returns; a non-zero exit
     // code normally signals failure to the shell — confirm intentional.
     return 1;
 }
 /**
  * Calculates what the desired revisit factor should be.
  *
  * Documents that are new to the index (or were never scheduled for a
  * revisit) receive the default factor plus a matching expiration stamp;
  * for known documents the factor grows when the content is unchanged
  * and shrinks when it changed.
  *
  * @param \Simgroep\ConcurrentSpiderBundle\PersistenceEvent $event
  */
 public function onPrePersistDocument(PersistenceEvent $event)
 {
     $document = $event->getDocument();
     $existing = $this->indexer->findDocumentByUrl($document['url'], $event->getMetadata());

     $isFirstVisit = (null === $existing || $existing['revisit_after'] == 0);
     if ($isFirstVisit) {
         $document['revisit_after'] = $this->defaultRevisitFactor;

         $expiresAt = new DateTime();
         $expiresAt->modify(sprintf('+%s minute', $document['revisit_after']));
         $document['revisit_expiration'] = $expiresAt->format('Y-m-d\\TH:i:s\\Z');

         return;
     }

     $contentUnchanged = $this->createDocumentChecksum($existing) === $this->createDocumentChecksum($document);
     if ($contentUnchanged) {
         $this->increaseRevisitFactor($document, $existing['revisit_after']);

         return;
     }

     $this->decreaseRevisitFactor($document, $existing['revisit_after']);
 }
 /**
  * Consume a message, extracts the URL from it and crawls the webpage.
  *
  * The message is always settled exactly once: acknowledged after a
  * successful crawl, rejected (or rejected-and-requeued for unexpected
  * HTTP errors) otherwise.
  *
  * @param \PhpAmqpLib\Message\AMQPMessage $message
  */
 public function crawlUrl(AMQPMessage $message)
 {
     $data = json_decode($message->body, true);
     $crawlJob = new CrawlJob($data['url'], $data['base_url'], $data['blacklist'], $data['metadata'], $data['whitelist']);

     // A disallowed URL is also removed from the index so stale copies
     // do not linger after a blacklist change.
     if (false === $crawlJob->isAllowedToCrawl()) {
         $this->indexer->deleteDocument($message);
         $this->queue->rejectMessage($message);
         $this->markAsSkipped($crawlJob, 'info', 'Not allowed to crawl');

         return;
     }

     // Still fresh in the index: nothing to do until it expires.
     if ($this->indexer->isUrlIndexedAndNotExpired($crawlJob->getUrl(), $crawlJob->getMetadata())) {
         $this->queue->rejectMessage($message);
         $this->markAsSkipped($crawlJob, 'info', 'Not expired yet');

         return;
     }

     try {
         $this->spider->getRequestHandler()->getClient()->setUserAgent($this->userAgent);
         // Redirects are disabled so a 301 surfaces as a ClientErrorResponseException
         // and can be handled explicitly below.
         $this->spider->getRequestHandler()->getClient()->getConfig()->set('request.params', ['redirect.disable' => true]);
         $this->spider->crawl($crawlJob);
         $this->logMessage('info', sprintf("Crawling %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);
         $this->queue->acknowledge($message);
     } catch (ClientErrorResponseException $e) {
         switch ($e->getResponse()->getStatusCode()) {
             // Moved permanently: drop the old document and queue a fresh
             // crawl job for the redirect target.
             case 301:
                 $this->indexer->deleteDocument($message);
                 $this->queue->rejectMessage($message);
                 $this->markAsSkipped($crawlJob, 'warning', $e->getMessage());
                 $newCrawlJob = new CrawlJob($e->getResponse()->getInfo('redirect_url'), $crawlJob->getBaseUrl(), $crawlJob->getBlacklist(), $crawlJob->getMetadata(), $crawlJob->getWhitelist());
                 $this->queue->publishJob($newCrawlJob);
                 break;
             // Access denied or server error: skip, but keep any indexed copy.
             case 403:
             case 401:
             case 500:
                 $this->queue->rejectMessage($message);
                 $this->markAsSkipped($crawlJob, 'warning', 'status: ' . $e->getResponse()->getStatusCode());
                 break;
             // Gone (404) or teapot (418): remove the document from the index.
             case 404:
             case 418:
                 $this->indexer->deleteDocument($message);
                 $this->logMessage('warning', sprintf("Deleted %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);
                 $this->queue->rejectMessage($message);
                 break;
             // Any other client error is treated as transient: requeue it.
             default:
                 $this->queue->rejectMessageAndRequeue($message);
                 $this->markAsFailed($crawlJob, $e->getResponse()->getStatusCode());
                 break;
         }
     } catch (Exception $e) {
         $this->queue->rejectMessage($message);
         $this->markAsFailed($crawlJob, $e->getMessage());
     }

     // Long-running consumer: release references and force a GC cycle to
     // keep memory usage flat across many crawled pages.
     unset($crawlJob, $message, $data);
     gc_collect_cycles();
 }
 /**
  * Writes the found URL as a job on the queue.
  *
  * A URL is only persisted to the queue when it has not been indexed yet
  * (or its indexed copy is expired), is not blacklisted, and the resulting
  * job is allowed to crawl.
  *
  * @param \Symfony\Component\EventDispatcher\GenericEvent $event
  */
 public function onDiscoverUrl(Event $event)
 {
     $crawlJob = $event->getSubject()->getCurrentCrawlJob();

     foreach ($event['uris'] as $uri) {
         // Strip the fragment; it never changes the resource identity.
         // A '#' at position 0 is deliberately skipped (matching the
         // original truthy check) — stripping would leave an empty URI.
         $fragmentPosition = strpos($uri, '#');
         if (false !== $fragmentPosition && $fragmentPosition > 0) {
             $uri = new Uri(substr($uri, 0, $fragmentPosition));
         }

         $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());
         if ($isBlacklisted) {
             // URL blacklisted, so go to the next one.
             $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));

             continue;
         }

         // NOTE(review): the indexed-check uses the un-normalized URI while
         // the job is built from the normalized one — confirm intentional.
         if (!$this->indexer->isUrlIndexedAndNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
             $job = new CrawlJob(
                 UrlCheck::fixUrl($uri->normalize()->toString()),
                 (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                 $crawlJob->getBlacklist(),
                 $crawlJob->getMetadata(),
                 $crawlJob->getWhitelist()
             );

             if ($job->isAllowedToCrawl()) {
                 $this->queue->publishJob($job);
             }
         }
     }
 }
 /**
  * Removes every document from the given core whose URL matches the blacklist.
  *
  * @param array $blacklist List of blacklist patterns.
  * @param mixed $metadata  Metadata object exposing the core name.
  */
 public function dropBlacklistedDocuments(array $blacklist, $metadata)
 {
     $documents = $this->indexer->getDocumentUrlsInCore(['core' => $metadata->core]);

     // First collect all blacklisted documents, then delete them, so the
     // result set is never modified while it is still being iterated.
     $blacklisted = [];
     foreach ($documents as $document) {
         if (UrlCheck::isUrlBlacklisted($document->url, $blacklist)) {
             $blacklisted[] = [
                 'core' => $metadata->core,
                 'id' => $document->id,
                 'url' => $document->url,
             ];
         }
     }

     foreach ($blacklisted as $document) {
         $this->indexer->deleteDocumentById(['core' => $document['core']], $document['id']);
         $this->logMessage("info", sprintf("Delete document %s. URL: %s", $document['id'], $document['url']), $document['url']);
     }
 }
 /**
  * @test
  * @testdox Tests if the given url is not indexed in solr.
  */
 public function ifUrlIsNotIndexedReturnTrue()
 {
     $url = 'https://github.com';

     // The indexer must build and execute exactly one select query.
     $solrQuery = $this->getMockBuilder('Solarium\\QueryType\\Select\\Query\\Query')
         ->disableOriginalConstructor()
         ->setMethods(['setQuery'])
         ->getMock();
     $solrQuery->expects($this->once())->method('setQuery');

     // Zero hits means the URL is not present in the index.
     $solrResult = $this->getMockBuilder('Solarium\\Core\\Query\\Result\\Result')
         ->disableOriginalConstructor()
         ->setMethods(['getNumFound'])
         ->getMock();
     $solrResult->expects($this->once())->method('getNumFound')->will($this->returnValue(0));

     $solrClient = $this->getMockBuilder('Solarium\\Client')
         ->setConstructorArgs([])
         ->setMethods(['createSelect', 'select'])
         ->getMock();
     $solrClient->expects($this->once())->method('createSelect')->will($this->returnValue($solrQuery));
     $solrClient->expects($this->once())->method('select')->will($this->returnValue($solrResult));

     $indexer = new Indexer($solrClient, [], 50);

     $actual = $indexer->isUrlNotIndexedOrIndexedAndExpired($url, ['core' => 'coreName']);

     $this->assertTrue($actual);
 }
 /**
  * @test
  * @testdox Tests that emptying a core issues the match-all delete query.
  */
 public function isCorrectQueryPhraseUsedForDeletingAllDocuments()
 {
     // The update query must receive the match-all phrase exactly once.
     $updateQuery = $this->getMockBuilder('Solarium\\QueryType\\Update\\Query\\Query')
         ->setMethods(['addDeleteQuery', 'addCommit'])
         ->getMock();
     $updateQuery->expects($this->once())
         ->method('addDeleteQuery')
         ->with($this->equalTo('*:*'))
         ->will($this->returnValue($updateQuery));

     $result = $this->getMockBuilder('Solarium\\QueryType\\Select\\Result\\Result')
         ->disableOriginalConstructor()
         ->getMock();

     $solrClient = $this->getMockBuilder('Solarium\\Client')
         ->setMethods(['createUpdate', 'update'])
         ->getMock();
     $solrClient->expects($this->once())->method('createUpdate')->will($this->returnValue($updateQuery));
     $solrClient->expects($this->once())->method('update')->will($this->returnValue($result));

     $indexer = new Indexer($solrClient, [], 50);
     $indexer->emptyCore('core');
 }