/**
 * Consumes a queue message, extracts the crawl job from its JSON body and crawls the webpage.
 *
 * Outcomes, in the order they are checked:
 *  - malformed payload (not a JSON object, or no "url"): the message is rejected and a warning is logged;
 *  - job not allowed to be crawled: the document is deleted from the index, the message is rejected
 *    and the job is marked as skipped;
 *  - URL already indexed and not expired: the message is rejected and the job is marked as skipped;
 *  - otherwise the page is crawled and the message acknowledged. A client error response is routed by
 *    HTTP status code; any other exception rejects the message and marks the job as failed.
 *
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 */
public function crawlUrl(AMQPMessage $message)
{
    $data = json_decode($message->body, true);

    // json_decode() returns null for invalid JSON and a scalar for non-object payloads. Building a
    // CrawlJob from such data would fail outside the try block below, leaving the message neither
    // acknowledged nor rejected. Reject it explicitly instead.
    if (!is_array($data) || !isset($data['url'])) {
        $this->queue->rejectMessage($message);
        // No URL or core is known for a malformed payload, hence the empty placeholders.
        $this->logMessage('warning', 'Rejected malformed crawl message', '', '');

        return;
    }

    $crawlJob = new CrawlJob(
        $data['url'],
        $data['base_url'],
        $data['blacklist'],
        $data['metadata'],
        $data['whitelist']
    );

    if (false === $crawlJob->isAllowedToCrawl()) {
        $this->indexer->deleteDocument($message);
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not allowed to crawl');

        return;
    }

    if ($this->indexer->isUrlIndexedAndNotExpired($crawlJob->getUrl(), $crawlJob->getMetadata())) {
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not expired yet');

        return;
    }

    try {
        $this->spider->getRequestHandler()->getClient()->setUserAgent($this->userAgent);
        // Redirects are not followed by the client; a 301 is handled by publishing a new job instead.
        $this->spider->getRequestHandler()->getClient()->getConfig()->set(
            'request.params',
            ['redirect.disable' => true]
        );

        $this->spider->crawl($crawlJob);
        $this->logMessage(
            'info',
            sprintf("Crawling %s", $crawlJob->getUrl()),
            $crawlJob->getUrl(),
            $data['metadata']['core']
        );
        $this->queue->acknowledge($message);
    } catch (ClientErrorResponseException $e) {
        $this->handleClientErrorResponse($e, $crawlJob, $message, $data);
    } catch (Exception $e) {
        $this->queue->rejectMessage($message);
        $this->markAsFailed($crawlJob, $e->getMessage());
    }

    // Only reached when a crawl was attempted; the early returns above skip this cleanup.
    unset($crawlJob, $message, $data);
    gc_collect_cycles();
}

/**
 * Decides what happens to a message whose crawl ended in an error response, based on the status code.
 *
 *  - 301: delete the document, reject the message, mark the job as skipped and publish a new job
 *    for the redirect target with the same base URL, blacklist, metadata and whitelist;
 *  - 401, 403, 500: reject the message and mark the job as skipped;
 *  - 404, 418: delete the document, log the deletion and reject the message;
 *  - anything else: reject the message with requeue and mark the job as failed.
 *
 * NOTE(review): 301 and 500 are not 4xx statuses; whether they can arrive here depends on how the
 * HTTP client classifies responses when it throws ClientErrorResponseException - confirm.
 *
 * @param ClientErrorResponseException    $e
 * @param CrawlJob                        $crawlJob
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 * @param array                           $data     Decoded message payload; "metadata.core" is read for logging.
 */
private function handleClientErrorResponse(
    ClientErrorResponseException $e,
    CrawlJob $crawlJob,
    AMQPMessage $message,
    array $data
) {
    switch ($e->getResponse()->getStatusCode()) {
        case 301:
            $this->indexer->deleteDocument($message);
            $this->queue->rejectMessage($message);
            $this->markAsSkipped($crawlJob, 'warning', $e->getMessage());

            $newCrawlJob = new CrawlJob(
                $e->getResponse()->getInfo('redirect_url'),
                $crawlJob->getBaseUrl(),
                $crawlJob->getBlacklist(),
                $crawlJob->getMetadata(),
                $crawlJob->getWhitelist()
            );
            $this->queue->publishJob($newCrawlJob);
            break;

        case 403:
        case 401:
        case 500:
            $this->queue->rejectMessage($message);
            $this->markAsSkipped($crawlJob, 'warning', 'status: ' . $e->getResponse()->getStatusCode());
            break;

        case 404:
        case 418:
            $this->indexer->deleteDocument($message);
            $this->logMessage(
                'warning',
                sprintf("Deleted %s", $crawlJob->getUrl()),
                $crawlJob->getUrl(),
                $data['metadata']['core']
            );
            $this->queue->rejectMessage($message);
            break;

        default:
            // Unknown status: put the message back on the queue so it can be retried.
            $this->queue->rejectMessageAndRequeue($message);
            $this->markAsFailed($crawlJob, $e->getResponse()->getStatusCode());
            break;
    }
}
/**
 * Checks that Indexer::isUrlIndexedAndNotExpired() returns true when the Solr select
 * result reports one found document.
 *
 * The Solr client, query and result are mocks: the test expects exactly one
 * createSelect(), one setQuery(), one select() and one getNumFound() call.
 * The query string passed to setQuery() is not asserted here.
 *
 * @test
 * @testdox Tests if the given url is checked upon a checksum in Solr.
 */
public function ifUrlIsSha1Checksum()
{
    $url = 'https://github.com';

    // Query mock: only verifies that a query string is set exactly once.
    $solrQuery = $this
        ->getMockBuilder('Solarium\\QueryType\\Select\\Query\\Query')
        ->disableOriginalConstructor()
        ->setMethods(['setQuery'])
        ->getMock();
    $solrQuery
        ->expects($this->once())
        ->method('setQuery');

    // Result mock: reports a single matching document.
    $solrResult = $this
        ->getMockBuilder('Solarium\\Core\\Query\\Result\\Result')
        ->disableOriginalConstructor()
        ->setMethods(['getNumFound'])
        ->getMock();
    $solrResult
        ->expects($this->once())
        ->method('getNumFound')
        ->will($this->returnValue(1));

    // Client mock: hands out the query mock and returns the result mock on select().
    $solrClient = $this
        ->getMockBuilder('Solarium\\Client')
        ->setConstructorArgs([])
        ->setMethods(['createSelect', 'select'])
        ->getMock();
    $solrClient
        ->expects($this->once())
        ->method('createSelect')
        ->will($this->returnValue($solrQuery));
    $solrClient
        ->expects($this->once())
        ->method('select')
        ->will($this->returnValue($solrResult));

    $indexer = new Indexer($solrClient, [], 50);

    $actual = $indexer->isUrlIndexedAndNotExpired($url, ['core' => 'coreName']);

    $this->assertTrue($actual);
}