/**
 * Grabs the content from the crawled page and publishes a job on the queue.
 *
 * @param \VDB\Spider\Resource $resource
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the response body exceeds the size limit
 */
public function persist(Resource $resource, CrawlJob $crawlJob)
{
    $bodySize = strlen($resource->getResponse()->getBody());

    // Refuse oversized responses before doing any extraction work.
    if ($bodySize >= $this->maximumResourceSize) {
        throw new InvalidContentException(
            sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize)
        );
    }

    $document = $this->documentResolver->getDocumentByResource($resource);

    // Let listeners inspect/modify the document before it is queued.
    $prePersistEvent = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
    $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $prePersistEvent);

    $payload = json_encode([
        'document' => $document->toArray(),
        'metadata' => $crawlJob->getMetadata(),
    ]);

    $this->queue->publish(new AMQPMessage($payload, ['delivery_mode' => 1]));
}
/**
 * Start a consumer that retrieves documents that have to be saved to the index.
 *
 * Empty messages are rejected outright; any non-empty message is handed to the
 * indexer for preparation and acknowledged afterwards.
 *
 * @param \Symfony\Component\Console\Input\InputInterface $input
 * @param \Symfony\Component\Console\Output\OutputInterface $output
 *
 * @return integer
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $this->queue->listen(function ($message) {
        // An empty body can never be indexed; drop it without requeueing.
        if (strlen($message->body) === 0) {
            $this->queue->rejectMessage($message);

            return;
        }

        // The indexer receives the raw message and decodes it itself; the
        // previous json_decode() into an unused local was dead work per message.
        $this->indexer->prepareDocument($message);
        $this->queue->acknowledge($message);
    });

    return 1;
}
/**
 * Consume a recrawl message and drop indexed documents matching its blacklist.
 *
 * The message is acknowledged on success; on any failure it is rejected and
 * the error is logged at emergency level.
 *
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 */
public function recrawl(AMQPMessage $message)
{
    // Decode outside the try so $body is always defined for the catch block.
    // json_decode() returns null for malformed payloads instead of throwing.
    $body = json_decode($message->body);

    try {
        // Checking blacklist; the isset() guard covers payloads that are
        // malformed or simply lack a blacklist entry.
        if (isset($body->blacklist) && is_array($body->blacklist) && count($body->blacklist) > 0) {
            $this->dropBlacklistedDocuments($body->blacklist, $body->metadata);
        }

        $this->queue->acknowledge($message);
    } catch (Exception $e) {
        $this->queue->rejectMessage($message);

        // Guard the metadata lookup: on a malformed payload $body is null and
        // dereferencing $body->metadata->core here would make the error
        // handler itself fail.
        $core = isset($body->metadata->core) ? $body->metadata->core : null;
        $this->logMessage("emergency", $e->getMessage(), $core);
    }
}
/**
 * Verifies that acknowledging a message performs a basic_ack on the channel
 * the message was delivered on, using the message's delivery tag.
 */
public function testAcknowledge()
{
    $queueName = 'queue5';

    $amqpChannel = $this->getMockBuilder('PhpAmqpLib\\Channel\\AMQPChannel')
        ->disableOriginalConstructor()
        ->setMethods(['basic_ack'])
        ->getMock();
    $amqpChannel
        ->expects($this->once())
        ->method('basic_ack')
        ->with($this->equalTo('dummyTag3'));

    // The queue reads the channel and delivery tag from the message's delivery_info.
    $amqpMessage = $this->getMockBuilder('PhpAmqpLib\\Message\\AMQPMessage')->getMock();
    $amqpMessage->delivery_info = [
        'channel' => $amqpChannel,
        'delivery_tag' => 'dummyTag3',
    ];

    $amqpConnection = $this->getMockBuilder('PhpAmqpLib\\Connection\\AMQPConnection')
        ->disableOriginalConstructor()
        ->setMethods(['isConnected', 'close'])
        ->getMock();
    $amqpConnection->expects($this->once())->method('isConnected')->will($this->returnValue(true));
    $amqpConnection->expects($this->once())->method('close');

    $queue = new Queue($amqpConnection, $queueName);
    $queue->acknowledge($amqpMessage);
}
/**
 * Consume a message, extracts the URL from it and crawls the webpage.
 *
 * Skips the job when crawling is not allowed or the URL is already indexed
 * and not yet expired. HTTP client errors are mapped per status code:
 * 301 deletes the document and queues a new job for the redirect target,
 * 401/403/500 skip the job, 404/418 delete the indexed document, and any
 * other status requeues the job as a failure.
 *
 * @param \PhpAmqpLib\Message\AMQPMessage $message
 */
public function crawlUrl(AMQPMessage $message)
{
    $data = json_decode($message->body, true);
    $crawlJob = new CrawlJob($data['url'], $data['base_url'], $data['blacklist'], $data['metadata'], $data['whitelist']);

    // Jobs that fail the whitelist/blacklist rules are removed from the index and dropped.
    if (false === $crawlJob->isAllowedToCrawl()) {
        $this->indexer->deleteDocument($message);
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not allowed to crawl');

        return;
    }

    // A document that is still fresh in the index does not need a recrawl yet.
    if ($this->indexer->isUrlIndexedAndNotExpired($crawlJob->getUrl(), $crawlJob->getMetadata())) {
        $this->queue->rejectMessage($message);
        $this->markAsSkipped($crawlJob, 'info', 'Not expired yet');

        return;
    }

    try {
        $this->spider->getRequestHandler()->getClient()->setUserAgent($this->userAgent);
        // Redirects are handled manually in the 301 case below, so disable automatic following.
        $this->spider->getRequestHandler()->getClient()->getConfig()->set('request.params', ['redirect.disable' => true]);
        $this->spider->crawl($crawlJob);

        $this->logMessage('info', sprintf("Crawling %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);

        $this->queue->acknowledge($message);
    } catch (ClientErrorResponseException $e) {
        switch ($e->getResponse()->getStatusCode()) {
            case 301:
                // Moved permanently: drop the stale document and queue a fresh
                // job pointing at the redirect target instead.
                $this->indexer->deleteDocument($message);
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($crawlJob, 'warning', $e->getMessage());

                $newCrawlJob = new CrawlJob(
                    $e->getResponse()->getInfo('redirect_url'),
                    $crawlJob->getBaseUrl(),
                    $crawlJob->getBlacklist(),
                    $crawlJob->getMetadata(),
                    $crawlJob->getWhitelist()
                );
                $this->queue->publishJob($newCrawlJob);
                break;
            case 403:
            case 401:
            case 500:
                // Access denied or server error: skip the job but keep any
                // previously indexed document intact.
                $this->queue->rejectMessage($message);
                $this->markAsSkipped($crawlJob, 'warning', 'status: ' . $e->getResponse()->getStatusCode());
                break;
            case 404:
            case 418:
                // The page is gone: remove its document from the index.
                $this->indexer->deleteDocument($message);
                $this->logMessage('warning', sprintf("Deleted %s", $crawlJob->getUrl()), $crawlJob->getUrl(), $data['metadata']['core']);
                $this->queue->rejectMessage($message);
                break;
            default:
                // Unexpected status: requeue so the job can be retried later.
                $this->queue->rejectMessageAndRequeue($message);
                $this->markAsFailed($crawlJob, $e->getResponse()->getStatusCode());
                break;
        }
    } catch (Exception $e) {
        $this->queue->rejectMessage($message);
        $this->markAsFailed($crawlJob, $e->getMessage());
    }

    // Long-running consumer process: release references and force a GC cycle
    // to keep memory usage flat between messages.
    unset($crawlJob, $message, $data);
    gc_collect_cycles();
}
/**
 * Writes the found URL as a job on the queue.
 *
 * A URL is only persisted to the queue when it has not been indexed yet
 * (or its indexed document has expired) and it is not blacklisted.
 *
 * @param \Symfony\Component\EventDispatcher\GenericEvent $event
 */
public function onDiscoverUrl(Event $event)
{
    $crawlJob = $event->getSubject()->getCurrentCrawlJob();

    foreach ($event['uris'] as $uri) {
        // Strip the fragment so the same page is not queued once per anchor.
        // Explicit check instead of the previous assignment-in-condition;
        // the > 0 guard preserves the original behavior of leaving a URI
        // untouched when '#' is its very first character.
        $position = strpos((string) $uri, '#');
        if (false !== $position && $position > 0) {
            $uri = new Uri(substr($uri, 0, $position));
        }

        $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());

        if ($isBlacklisted) {
            $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));

            continue; //url blacklisted, so go to next one
        }

        // Method name casing aligned with crawlUrl's isUrlIndexedAndNotExpired;
        // PHP method lookup is case-insensitive, so behavior is unchanged.
        if (!$this->indexer->isUrlIndexedAndNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
            $job = new CrawlJob(
                UrlCheck::fixUrl($uri->normalize()->toString()),
                (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                $crawlJob->getBlacklist(),
                $crawlJob->getMetadata(),
                $crawlJob->getWhitelist()
            );

            if ($job->isAllowedToCrawl()) {
                $this->queue->publishJob($job);
            }
        }
    }
}