/**
 * Start a consumer that retrieves documents that have to be saved to the index.
 *
 * Listens on the queue; empty messages are rejected outright, valid messages
 * are handed to the indexer and then acknowledged.
 *
 * @param \Symfony\Component\Console\Input\InputInterface $input
 * @param \Symfony\Component\Console\Output\OutputInterface $output
 *
 * @return integer
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $this->queue->listen(function ($message) {
        // An empty payload cannot be indexed; drop it immediately.
        if (strlen($message->body) == 0) {
            $this->queue->rejectMessage($message);

            return;
        }

        // NOTE(review): a dead `$data = json_decode($message->body, true);`
        // was removed — prepareDocument() receives the raw AMQP message and
        // the decoded payload was never referenced.
        $this->indexer->prepareDocument($message);
        $this->queue->acknowledge($message);
    });

    return 1;
}
/**
 * Calculates what the desired revisit factor should be.
 *
 * @param \Simgroep\ConcurrentSpiderBundle\PersistenceEvent $event
 */
public function onPrePersistDocument(PersistenceEvent $event)
{
    $document = $event->getDocument();
    $indexed = $this->indexer->findDocumentByUrl($document['url'], $event->getMetadata());

    // Not seen before (or a zeroed factor): start from the configured default
    // and stamp a fresh expiration moment that many minutes from now.
    if (null === $indexed || $indexed['revisit_after'] == 0) {
        $document['revisit_after'] = $this->defaultRevisitFactor;

        $expiresAt = new DateTime();
        $expiresAt->modify(sprintf('+%s minute', $document['revisit_after']));
        $document['revisit_expiration'] = $expiresAt->format('Y-m-d\\TH:i:s\\Z');

        return;
    }

    // Identical content -> revisit less often; changed content -> more often.
    $contentUnchanged = $this->createDocumentChecksum($indexed) === $this->createDocumentChecksum($document);

    if ($contentUnchanged) {
        $this->increaseRevisitFactor($document, $indexed['revisit_after']);
    } else {
        $this->decreaseRevisitFactor($document, $indexed['revisit_after']);
    }
}
/**
 * Consume a message, extracts the URL from it and crawls the webpage.
 *
 * Flow: decode the job payload, bail out early when crawling is disallowed or
 * the URL is still fresh in the index, then crawl. HTTP error responses are
 * handled per status code; all other exceptions reject the message.
 *
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 */
public function crawlUrl(AMQPMessage $message)
{
    $data = json_decode($message->body, true);
    $crawlJob = new CrawlJob($data['url'], $data['base_url'], $data['blacklist'], $data['metadata'], $data['whitelist']);

    // Disallowed URL: purge any previously indexed copy and drop the job.
    if (false === $crawlJob->isAllowedToCrawl()) {
        $this->indexer->deleteDocument($message);
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not allowed to crawl');

        return;
    }

    // Already indexed and not yet due for a revisit: skip without requeueing.
    if ($this->indexer->isUrlIndexedAndNotExpired($crawlJob->getUrl(), $crawlJob->getMetadata())) {
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not expired yet');

        return;
    }

    try {
        $this->spider->getRequestHandler()->getClient()->setUserAgent($this->userAgent);
        // Redirects are disabled so a 301 surfaces as an exception and can be
        // turned into a new crawl job below.
        $this->spider->getRequestHandler()->getClient()->getConfig()->set('request.params', ['redirect.disable' => true]);
        $this->spider->crawl($crawlJob);
        $this->logMessage('info', sprintf("Crawling %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);
        $this->queue->acknowledge($message);
    } catch (ClientErrorResponseException $e) {
        switch ($e->getResponse()->getStatusCode()) {
            // Moved permanently: delete the stale document and re-publish a
            // job for the redirect target with the same job settings.
            case 301:
                $this->indexer->deleteDocument($message);
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($crawlJob, 'warning', $e->getMessage());

                $newCrawlJob = new CrawlJob($e->getResponse()->getInfo('redirect_url'), $crawlJob->getBaseUrl(), $crawlJob->getBlacklist(), $crawlJob->getMetadata(), $crawlJob->getWhitelist());
                $this->queue->publishJob($newCrawlJob);

                break;
            // Access denied / server error: skip the URL without requeueing.
            case 403:
            case 401:
            case 500:
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($crawlJob, 'warning', 'status: ' . $e->getResponse()->getStatusCode());

                break;
            // Gone (or teapot): remove the document from the index.
            case 404:
            case 418:
                $this->indexer->deleteDocument($message);
                $this->logMessage('warning', sprintf("Deleted %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);
                $this->queue->rejectMessage($message);

                break;
            // Any other HTTP error: requeue so the job can be retried later.
            default:
                $this->queue->rejectMessageAndRequeue($message);
                $this->markAsFailed($crawlJob, $e->getResponse()->getStatusCode());

                break;
        }
    } catch (Exception $e) {
        // Non-HTTP failure: drop the job and record the reason.
        $this->queue->rejectMessage($message);
        $this->markAsFailed($crawlJob, $e->getMessage());
    }

    // Long-running consumer: release references and force a GC cycle to keep
    // memory usage flat across many messages.
    unset($crawlJob, $message, $data);
    gc_collect_cycles();
}
/**
 * Writes the found URL as a job on the queue.
 *
 * A URL is only persisted to the queue when it has not been indexed yet
 * (or its index entry has expired).
 *
 * @param \Symfony\Component\EventDispatcher\GenericEvent $event
 */
public function onDiscoverUrl(Event $event)
{
    $crawlJob = $event->getSubject()->getCurrentCrawlJob();

    foreach ($event['uris'] as $uri) {
        // Strip the fragment part. The previous truthiness check
        // (`if ($position = strpos(...))`) silently skipped a '#' found at
        // position 0; an explicit false comparison handles every position.
        $position = strpos($uri, '#');
        if (false !== $position) {
            $uri = new Uri(substr($uri, 0, $position));
        }

        $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());

        if ($isBlacklisted) {
            $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));

            continue; //url blacklisted, so go to next one
        }

        // Method casing normalized to isUrlIndexedAndNotExpired() to match the
        // consumer code; PHP method names are case-insensitive, so this is a
        // pure consistency fix.
        if (!$this->indexer->isUrlIndexedAndNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
            $job = new CrawlJob(
                UrlCheck::fixUrl($uri->normalize()->toString()),
                (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                $crawlJob->getBlacklist(),
                $crawlJob->getMetadata(),
                $crawlJob->getWhitelist()
            );

            if ($job->isAllowedToCrawl()) {
                $this->queue->publishJob($job);
            }
        }
    }
}
/**
 * Removes every indexed document in the core whose URL matches the blacklist.
 *
 * @param array $blacklist
 * @param mixed $metadata metadata object exposing the core name
 */
public function dropBlacklistedDocuments(array $blacklist, $metadata)
{
    $documents = $this->indexer->getDocumentUrlsInCore(['core' => $metadata->core]);

    // Phase 1: collect matches first so deletion never runs while the result
    // set is still being iterated.
    $blacklisted = [];
    foreach ($documents as $doc) {
        if (UrlCheck::isUrlBlacklisted($doc->url, $blacklist)) {
            $blacklisted[] = ['core' => $metadata->core, 'id' => $doc->id, 'url' => $doc->url];
        }
    }

    // Phase 2: delete and log each collected document.
    foreach ($blacklisted as $doc) {
        $this->indexer->deleteDocumentById(['core' => $doc['core']], $doc['id']);
        $this->logMessage("info", sprintf("Delete document %s. URL: %s", $doc['id'], $doc['url']), $doc['url']);
    }
}
/**
 * @test
 * @testdox Tests if the given url is not indexed in solr.
 */
public function ifUrlIsNotIndexedReturnTrue()
{
    $url = 'https://github.com';

    // Select query mock: only the query string is expected to be set.
    $solrQuery = $this->getMockBuilder('Solarium\\QueryType\\Select\\Query\\Query')
        ->disableOriginalConstructor()
        ->setMethods(['setQuery'])
        ->getMock();
    $solrQuery->expects($this->once())->method('setQuery');

    // Result mock reporting zero hits, i.e. the URL is not in the index.
    $solrResult = $this->getMockBuilder('Solarium\\Core\\Query\\Result\\Result')
        ->disableOriginalConstructor()
        ->setMethods(['getNumFound'])
        ->getMock();
    $solrResult->expects($this->once())->method('getNumFound')->will($this->returnValue(0));

    $solrClient = $this->getMockBuilder('Solarium\\Client')
        ->setConstructorArgs([])
        ->setMethods(['createSelect', 'select'])
        ->getMock();
    $solrClient->expects($this->once())->method('createSelect')->will($this->returnValue($solrQuery));
    $solrClient->expects($this->once())->method('select')->will($this->returnValue($solrResult));

    // NOTE(review): an unused `$expiresBeforeDate` DateTime (built and
    // modified but never asserted against) was removed as dead code.
    $indexer = new Indexer($solrClient, [], 50);
    $actual = $indexer->isUrlNotIndexedOrIndexedAndExpired($url, ['core' => 'coreName']);

    $this->assertTrue($actual);
}
/**
 * @test
 * Verifies that emptying a core issues a delete query for all documents ('*:*').
 */
public function isCorrectQueryPhraseUsedForDeletingAllDocuments()
{
    // Update query mock: must receive the match-all delete phrase exactly once.
    $updateQuery = $this->getMockBuilder('Solarium\\QueryType\\Update\\Query\\Query')
        ->setMethods(['addDeleteQuery', 'addCommit'])
        ->getMock();
    $updateQuery
        ->expects($this->once())
        ->method('addDeleteQuery')
        ->with($this->equalTo('*:*'))
        ->will($this->returnValue($updateQuery));

    $result = $this->getMockBuilder('Solarium\\QueryType\\Select\\Result\\Result')
        ->disableOriginalConstructor()
        ->getMock();

    $solrClient = $this->getMockBuilder('Solarium\\Client')
        ->setMethods(['createUpdate', 'update'])
        ->getMock();
    $solrClient->expects($this->once())->method('createUpdate')->will($this->returnValue($updateQuery));
    $solrClient->expects($this->once())->method('update')->will($this->returnValue($result));

    (new Indexer($solrClient, [], 50))->emptyCore('core');
}