/**
 * Handles a single crawl-job message taken from the queue.
 *
 * The JSON body of the message is decoded into a CrawlJob. Jobs that are not
 * allowed to be crawled, or whose URL the indexer reports as indexed and not
 * expired, are rejected and marked as skipped. Otherwise the spider crawls the
 * job and the message is acknowledged. Failures are handled per HTTP status
 * code; see the comments in the switch below.
 *
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 */
public function crawlUrl(AMQPMessage $message)
{
    $payload = json_decode($message->body, true);
    $job = new CrawlJob(
        $payload['url'],
        $payload['base_url'],
        $payload['blacklist'],
        $payload['metadata'],
        $payload['whitelist']
    );

    // Disallowed URLs are removed from the index and dropped from the queue.
    if ($job->isAllowedToCrawl() === false) {
        $this->indexer->deleteDocument($message);
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($job, 'info', 'Not allowed to crawl');

        return;
    }

    // Nothing to do while the indexed copy is still considered fresh.
    if ($this->indexer->isUrlIndexedAndNotExpired($job->getUrl(), $job->getMetadata())) {
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($job, 'info', 'Not expired yet');

        return;
    }

    try {
        $client = $this->spider->getRequestHandler()->getClient();
        $client->setUserAgent($this->userAgent);
        // Redirects are not followed by the client; the 301 branch below
        // publishes the redirect target as a new job instead.
        $client->getConfig()->set('request.params', ['redirect.disable' => true]);

        $this->spider->crawl($job);

        $this->logMessage(
            'info',
            sprintf("Crawling %s", $job->getUrl()),
            $job->getUrl(),
            $payload['metadata']['core']
        );
        $this->queue->acknowledge($message);
    } catch (ClientErrorResponseException $exception) {
        $statusCode = $exception->getResponse()->getStatusCode();

        switch ($statusCode) {
            case 301:
                // Drop the old document and enqueue a new job for the redirect target.
                $this->indexer->deleteDocument($message);
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($job, 'warning', $exception->getMessage());

                $redirectJob = new CrawlJob(
                    $exception->getResponse()->getInfo('redirect_url'),
                    $job->getBaseUrl(),
                    $job->getBlacklist(),
                    $job->getMetadata(),
                    $job->getWhitelist()
                );
                $this->queue->publishJob($redirectJob);
                break;

            case 403:
            case 401:
            case 500:
                // Rejected without requeueing; the indexed document is left untouched.
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($job, 'warning', 'status: ' . $statusCode);
                break;

            case 404:
            case 418:
                // Remove the document from the index and drop the message.
                $this->indexer->deleteDocument($message);
                $this->logMessage(
                    'warning',
                    sprintf("Deleted %s", $job->getUrl()),
                    $job->getUrl(),
                    $payload['metadata']['core']
                );
                $this->queue->rejectMessage($message);
                break;

            default:
                // Any other status: put the message back on the queue for a retry.
                $this->queue->rejectMessageAndRequeue($message);
                $this->markAsFailed($job, $statusCode);
                break;
        }
    } catch (Exception $exception) {
        $this->queue->rejectMessage($message);
        $this->markAsFailed($job, $exception->getMessage());
    }

    // Only reached when neither early return above was taken: release the
    // per-message references and trigger a cycle collection.
    unset($job, $message, $payload);
    gc_collect_cycles();
}
/**
 * Checks that Indexer::deleteDocument() asks the update query to delete
 * exactly one document, identified by the SHA-1 hash of the lower-cased URL
 * carried in the message body.
 *
 * @test
 * @testdox Test if docuemnt are deleted from solr
 */
public function ifDocumentAreDeleted()
{
    $url = 'https://www.github.com';
    $expectedDocumentId = sha1(strtolower($url));

    // Update query mock: the delete-by-id call is the behaviour under test.
    $updateQuery = $this
        ->getMockBuilder('Solarium\\QueryType\\Update\\Query\\Query')
        ->disableOriginalConstructor()
        ->setMethods(['addDeleteById', 'addCommit'])
        ->getMock();
    $updateQuery
        ->expects($this->once())
        ->method('addDeleteById')
        ->with($expectedDocumentId);

    // Client mock that hands out the update query mock above.
    $client = $this
        ->getMockBuilder('Solarium\\Client')
        ->setMethods(['createUpdate', 'update'])
        ->getMock();
    $client
        ->expects($this->any())
        ->method('createUpdate')
        ->will($this->returnValue($updateQuery));

    $fieldMapping = [];
    $indexer = new Indexer($client, $fieldMapping, 50);

    $payload = json_encode(['url' => $url, 'metadata' => ['core' => 'core2']]);
    $message = new AMQPMessage($payload);

    $indexer->deleteDocument($message);
}