/**
  * Builds a persistable document by handing the resource to a type-specific extractor.
  *
  * The response content type selects the extractor. Word content types are
  * split further: a URI containing ".docx" (case-insensitive) goes to the
  * Word 2007 extractor, anything else to the legacy .doc extractor. Every
  * content type without a dedicated case is handled as HTML.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
  */
 public function getDocumentByResource(Resource $resource)
 {
     $contentType = $resource->getResponse()->getContentType();

     switch ($contentType) {
         case 'application/pdf':
         case 'application/octet-stream':
             $extractor = $this->pdf;
             break;
         case 'application/msword':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
             $isDocx = false !== stripos($resource->getUri()->toString(), '.docx');
             $extractor = $isDocx ? $this->word2007 : $this->msdoc;
             break;
         case 'application/rtf':
             $extractor = $this->rtf;
             break;
         case 'application/vnd.oasis.opendocument.text':
             $extractor = $this->odt;
             break;
         case 'text/html':
         default:
             $extractor = $this->html;
             break;
     }

     return new PersistableDocument($extractor->getData($resource));
 }
 /**
  * Writes the raw response of the resource to a file below the result path.
  *
  * The file is named after the URL-encoded URI of the resource and opened
  * in write mode; the value returned by fwrite() is added to the running
  * total of persisted bytes.
  *
  * @param Resource $resource
  */
 public function persist(Resource $resource)
 {
     $encodedUri = urlencode($resource->getUri()->toString());
     $target = new \SplFileObject($this->getResultPath() . $encodedUri, 'w');
     $this->totalSizePersisted += $target->fwrite($resource->getResponse()->__toString());
 }
Пример #3
0
 /**
  * Tells whether the resource must be filtered out because of its mime type.
  *
  * When the response does not have the allowed content type, the resource
  * is flagged as filtered with a reason naming the offending mime type.
  *
  * @param Resource $resource
  * @return bool true when the resource was filtered, false when it is allowed
  */
 public function match(Resource $resource)
 {
     $response = $resource->getResponse();
     if ($response->isContentType($this->allowedMimeType)) {
         return false;
     }

     $mime = $response->getContentType();
     $resource->setFiltered(true, "Mime type '{$mime}' not allowed");

     return true;
 }
Пример #4
0
 /**
  * A Resource must survive a serialize/unserialize round trip with its
  * response, URI, body and crawler markup intact.
  *
  * @covers VDB\Spider\Resource
  */
 public function testSerialization()
 {
     $original = $this->resource;
     $restored = unserialize(serialize($original));

     // The restored object graph must consist of the expected types.
     $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);
     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restored->getResponse());
     $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restored->getUri());

     // The restored values must equal those of the original resource.
     $this->assertEquals($original->getUri()->__toString(), $restored->getUri()->__toString());
     $this->assertEquals($this->html, $restored->getResponse()->getBody()->__toString());
     $this->assertEquals($original->getCrawler()->html(), $restored->getCrawler()->html());
 }
 /**
  * Turns the crawled resource into a document and publishes it on the queue.
  *
  * Resources whose body length reaches the configured maximum are rejected.
  * Otherwise the document is resolved, a PRE_PERSIST event is dispatched,
  * and a JSON message holding the document and the crawl job metadata is
  * published.
  *
  * @param \VDB\Spider\Resource                      $resource
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the body length is at or above the maximum resource size
  */
 public function persist(Resource $resource, CrawlJob $crawlJob)
 {
     $bodyLength = strlen($resource->getResponse()->getBody());
     if ($bodyLength >= $this->maximumResourceSize) {
         throw new InvalidContentException(sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize));
     }

     $document = $this->documentResolver->getDocumentByResource($resource);

     $this->eventDispatcher->dispatch(
         PersistenceEvents::PRE_PERSIST,
         new PersistenceEvent($document, $resource, $crawlJob->getMetadata())
     );

     // The payload is built after the event was dispatched, as before.
     $payload = [
         'document' => $document->toArray(),
         'metadata' => $crawlJob->getMetadata(),
     ];

     $this->queue->publish(new AMQPMessage(json_encode($payload), ['delivery_mode' => 1]));
 }
Пример #6
0
 /**
  * Collects URIs from the nodes matching the configured CSS selector.
  *
  * A Uri is built from each matched node's href attribute and the document
  * URI. Hrefs rejected with a UriSyntaxException are reported to the
  * spider's stats handler as failed and left out of the result.
  *
  * @param Spider $spider
  * @param Resource $document
  * @return UriInterface[]
  */
 public function discover(Spider $spider, Resource $document)
 {
     $found = array();
     $matches = $document->getCrawler()->filter($this->cssSelector);

     foreach ($matches as $element) {
         $href = $element->getAttribute('href');
         try {
             $found[] = new Uri($href, $document->getUri()->toString());
         } catch (UriSyntaxException $invalid) {
             $spider->getStatsHandler()->addToFailed($href, 'Invalid URI: ' . $invalid->getMessage());
         }
     }

     return $found;
 }
 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded through the
  * reader, rendered by the writer, and reduced to text by stripping binary
  * content and tags. The temporary file is removed and the error reporting
  * level is restored even when loading fails; exceptions raised while
  * loading still propagate to the caller.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempFile = tempnam(sys_get_temp_dir(), 'docx');
     $errorReportingLevel = error_reporting();

     try {
         file_put_contents($tempFile, $resource->getResponse()->getBody());
         $reader = $this->getReader();

         // Silence notices raised by the library. The bit is cleared with
         // "& ~" because XOR would switch notices ON when they were off.
         error_reporting($errorReportingLevel & ~E_NOTICE);
         $phpword = $reader->load($tempFile);
     } finally {
         // Put error reporting back to its previous state and clean up.
         error_reporting($errorReportingLevel);
         if (false !== $tempFile && is_file($tempFile)) {
             unlink($tempFile);
         }
     }

     $writer = $this->getWriter($phpword);

     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }
Пример #8
0
 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded through the
  * reader, rendered by the writer, and reduced to text by stripping binary
  * content and tags. A document the reader cannot load yields an empty
  * string. The temporary file is removed and the error reporting level is
  * restored on every path.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string extracted text, or an empty string when the document could not be loaded
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempFile = $this->getTempFileName('rtf');
     $errorReportingLevel = error_reporting();
     $phpword = null;

     try {
         file_put_contents($tempFile, $resource->getResponse()->getBody());
         $reader = $this->getReader();

         // Silence notices raised by the library. The bit is cleared with
         // "& ~" because XOR would switch notices ON when they were off.
         error_reporting($errorReportingLevel & ~E_NOTICE);
         try {
             $phpword = $reader->load($tempFile);
         } catch (\Exception $e) {
             // Loading stays best-effort: an unreadable document has no
             // content, instead of an undefined variable reaching the writer.
             $phpword = null;
         }
     } finally {
         // Put error reporting back to its previous state and clean up.
         error_reporting($errorReportingLevel);
         if (is_string($tempFile) && is_file($tempFile)) {
             unlink($tempFile);
         }
     }

     if (null === $phpword) {
         return '';
     }

     $writer = $this->getWriter($phpword);

     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }
Пример #9
0
 /**
  * Extracts all text content from the crawled resource, except JavaScript.
  *
  * Nodes matching the CSS blacklist (when one is configured) are removed
  * from the document first. The text nodes below body that are not direct
  * children of a script element are then trimmed, and the non-empty ones
  * are joined with single spaces.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     if (null !== $this->cssBlacklist) {
         foreach ($crawler->filter($this->cssBlacklist) as $blacklisted) {
             $blacklisted->parentNode->removeChild($blacklisted);
         }
     }

     $fragments = $crawler
         ->filterXpath('//body//*[not(self::script)]/text()')
         ->each(function (Crawler $textNode) {
             return trim($textNode->text());
         });

     $nonEmpty = array();
     foreach ($fragments as $fragment) {
         if ('' !== $fragment) {
             $nonEmpty[] = $fragment;
         }
     }

     return trim(implode(' ', $nonEmpty));
 }
Пример #10
0
 /**
  * Runs every registered discoverer on the resource and returns the new URIs.
  *
  * The resource URI is marked as seen first; nothing is discovered once it
  * is at the maximum depth. The merged results are normalized, stripped of
  * duplicates and already seen URIs, and filtered. Each remaining URI gets
  * a depth one greater than the resource URI's and is marked as seen.
  *
  * @param Resource $resource
  * @return UriInterface[]
  */
 public function discover(Resource $resource)
 {
     $sourceUri = $resource->getUri();
     $this->markSeen($sourceUri);

     if ($this->isAtMaxDepth($resource->getUri())) {
         return [];
     }

     $candidates = [];
     foreach ($this->discoverers as $discoverer) {
         $candidates = array_merge($candidates, $discoverer->discover($resource));
     }

     // Clean-up pipeline; the order of these steps is kept as it was.
     $this->normalize($candidates);
     $this->removeDuplicates($candidates);
     $this->filterAlreadySeen($candidates);
     $this->filter($candidates);

     foreach ($candidates as $candidate) {
         $candidate->setDepthFound($resource->getUri()->getDepthFound() + 1);
         $this->markSeen($candidate);
     }

     return $candidates;
 }
Пример #11
0
 /**
  *
  * Extract content from resource
  *
  * The response body is written to a temporary PDF file, the configured
  * pdf-to-text command is run on it, and the text is read from the file
  * named after the temporary PDF with ".txt" appended. Both temporary
  * files are removed on every path, including when an exception is thrown.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  * @throws InvalidContentException when the converted text file cannot be read or is empty
  * @throws ProcessFailedException when the conversion command does not finish successfully
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempPdfFile = $this->getTempFileName('pdf');
     $tempTxtFile = sprintf('%s.txt', $tempPdfFile);

     try {
         file_put_contents($tempPdfFile, $resource->getResponse()->getBody());

         $command = sprintf('%s %s', $this->pdfToTxtCommand, $tempPdfFile);
         $process = $this->processCommand($command);
         // executes after the command finishes
         if (!$process->isSuccessful()) {
             throw new ProcessFailedException($process);
         }

         //read txt file created by pdftotext command
         $content = file_get_contents($tempTxtFile);
         // Loose check kept on purpose: empty output is rejected just like
         // a failed read, so callers keep getting the same exception.
         if ($content == false) {
             throw new InvalidContentException(sprintf("PDF: Cant read temporary txt file converted from pdf: %s", $tempTxtFile));
         }

         return $content;
     } finally {
         // Remove whichever temporary files exist; checking first avoids
         // an unlink warning when the command produced no text file.
         foreach ([$tempPdfFile, $tempTxtFile] as $tempFile) {
             if (is_file($tempFile)) {
                 unlink($tempFile);
             }
         }
     }
 }
Пример #12
0
 /**
  * Narrows the crawler of the resource down with the configured selector.
  *
  * @param Resource $resource
  * @return mixed whatever filter() on the resource's crawler returns for $this->selector
  */
 protected function getFilteredCrawler(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     return $crawler->filter($this->selector);
 }
Пример #13
0
 /**
  * Tells whether the resource must be filtered out because of its mime type.
  *
  * An exact match between the Content-Type header line and the allowed
  * mime type is always accepted. Otherwise only the media type is compared:
  * parameters such as "; charset=UTF-8" are dropped and case is ignored, so
  * "text/html; charset=UTF-8" is accepted when "text/html" is allowed.
  *
  * @param Resource $resource
  * @return bool true when the resource must be filtered out, false when it is allowed
  */
 public function match(Resource $resource)
 {
     $contentType = $resource->getResponse()->getHeaderLine('Content-Type');
     if ($contentType === $this->allowedMimeType) {
         return false;
     }

     // Reduces a header value to its lower-cased "type/subtype" part.
     $mediaType = function ($value) {
         $parts = explode(';', (string) $value, 2);

         return strtolower(trim($parts[0]));
     };

     $allowed = $mediaType($this->allowedMimeType);
     if ('' === $allowed) {
         // Nothing usable to compare against: only the exact match above passes.
         return true;
     }

     return $mediaType($contentType) !== $allowed;
 }
Пример #14
0
 /**
  * The identifier of the fixture resource must be its URI string.
  *
  * @covers VDB\Spider\Resource::getIdentifier
  */
 public function testGetIdentifier()
 {
     $expectedIdentifier = 'http://example.org/domains/special';
     $actualIdentifier = $this->resource->getIdentifier();

     $this->assertEquals($expectedIdentifier, $actualIdentifier);
 }
Пример #15
0
 /**
  * Queues a Resource for processing, unless the queue is already full.
  *
  * A maximum queue size of 0 means no limit. When the limit is reached the
  * resource is flagged as filtered, reported to the stats handler, and a
  * QueueException is thrown. Otherwise the queue counter is incremented,
  * the resource is persisted, and its URI is reported as queued.
  *
  * @param Resource $resource
  * @return void
  * @throws QueueException when the maximum queue size has been reached
  */
 protected function addToProcessQueue(Resource $resource)
 {
     $limit = $this->maxQueueSize;
     $queueIsFull = $limit != 0 && $this->currentQueueSize >= $limit;

     if ($queueIsFull) {
         $reason = 'Maximum Queue Size of ' . $limit . ' reached';
         $resource->setFiltered(true, $reason);
         $this->getStatsHandler()->addToFiltered($resource);

         throw new QueueException($reason);
     }

     ++$this->currentQueueSize;
     $this->getPersistenceHandler()->persist($resource);
     $this->getStatsHandler()->addToQueued($resource->getUri());
 }
Пример #16
0
 /**
  * The fixture resource must expose a PSR-7 response.
  *
  * @covers VDB\Spider\Resource::getResponse
  */
 public function testGetResponse()
 {
     $response = $this->resource->getResponse();

     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $response);
 }
Пример #17
0
 /**
  *
  * Extract content from resource
  *
  * The response body is handed to the PDF parser and the text of the parsed
  * document is returned with binary content stripped.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $rawBody = $resource->getResponse()->getBody(true);
     $parsedPdf = $this->pdfParser->parseContent($rawBody);
     $text = $parsedPdf->getText();

     return $this->stripBinaryContent($text);
 }