Example #1
0
 /**
  * Filters out resources whose response content type is not the allowed one.
  *
  * When the response does not match $this->allowedMimeType the resource is
  * flagged as filtered, with a reason that names the offending mime type.
  *
  * @param Resource $resource
  *
  * @return bool true when the resource was filtered, false otherwise
  */
 public function match(Resource $resource)
 {
     // Guard clause: an allowed content type needs no further handling.
     if ($resource->getResponse()->isContentType($this->allowedMimeType)) {
         return false;
     }

     $rejectedMime = $resource->getResponse()->getContentType();
     $reason = "Mime type '{$rejectedMime}' not allowed";
     $resource->setFiltered(true, $reason);

     return true;
 }
 /**
  * Writes the raw response of a resource to a file below the result path.
  *
  * The file is named after the URL-encoded URI of the resource. Whatever
  * the write reports as written is added to $this->totalSizePersisted.
  *
  * @param Resource $resource
  */
 public function persist(Resource $resource)
 {
     // URL-encoding the URI yields a name without path separators.
     $encodedUri = urlencode($resource->getUri()->toString());
     $targetPath = $this->getResultPath() . $encodedUri;

     // Mode 'w' truncates an existing file for the same URI.
     $output = new \SplFileObject($targetPath, 'w');
     $bytesWritten = $output->fwrite($resource->getResponse()->__toString());

     $this->totalSizePersisted += $bytesWritten;
 }
 /**
  * Builds a persistable document from a resource.
  *
  * The content type of the response selects the extractor that reads the
  * resource; any content type that is not listed is handled as HTML.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
  */
 public function getDocumentByResource(Resource $resource)
 {
     // First pick the extractor, then run it once at the bottom.
     switch ($resource->getResponse()->getContentType()) {
         case 'application/pdf':
         case 'application/octet-stream':
             $extractor = $this->pdf;
             break;

         case 'application/msword':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
             // A ".docx" anywhere in the URI selects the Word 2007 extractor;
             // every other Word resource goes to the legacy extractor.
             $isDocx = false !== stripos($resource->getUri()->toString(), '.docx');
             $extractor = $isDocx ? $this->word2007 : $this->msdoc;
             break;

         case 'application/rtf':
             $extractor = $this->rtf;
             break;

         case 'application/vnd.oasis.opendocument.text':
             $extractor = $this->odt;
             break;

         case 'text/html':
         default:
             $extractor = $this->html;
             break;
     }

     return new PersistableDocument($extractor->getData($resource));
 }
 /**
  * Grabs the content from the crawled page and publishes a job on the queue.
  *
  * The resource is rejected when its body is at least
  * $this->maximumResourceSize bytes long. Otherwise a document is resolved
  * for it, a PRE_PERSIST event is dispatched, and the document is published
  * as JSON together with the metadata of the crawl job.
  *
  * @param \VDB\Spider\Resource                      $resource
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the
  *         resource is too large or the document cannot be encoded as JSON
  */
 public function persist(Resource $resource, CrawlJob $crawlJob)
 {
     if (strlen($resource->getResponse()->getBody()) >= $this->maximumResourceSize) {
         throw new InvalidContentException(sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize));
     }

     $document = $this->documentResolver->getDocumentByResource($resource);

     // The event is dispatched before the document is serialised, so the
     // payload below is built from the document as it is after dispatch.
     $persistenceEvent = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
     $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $persistenceEvent);

     $payload = json_encode([
         'document' => $document->toArray(),
         'metadata' => $crawlJob->getMetadata(),
     ]);

     // json_encode() returns false on failure (for example on malformed
     // UTF-8). Publishing that value would enqueue a message without a
     // usable body, so the resource is reported as invalid instead.
     if (false === $payload) {
         throw new InvalidContentException(sprintf('Document could not be encoded as JSON (json error code %d)', json_last_error()));
     }

     // NOTE(review): delivery_mode 1 is presumably the non-persistent AMQP
     // delivery mode - confirm against the AMQP library in use.
     $message = new AMQPMessage($payload, ['delivery_mode' => 1]);
     $this->queue->publish($message);
 }
 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded through the
  * reader, rendered through the writer and returned without tags and
  * binary content. The temporary file is removed and the error reporting
  * level is restored even when loading the document fails; an exception
  * raised by the reader still propagates to the caller.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $reader = $this->getReader();
     $tempFile = tempnam(sys_get_temp_dir(), 'docx');
     $errorReportingLevel = error_reporting();

     try {
         file_put_contents($tempFile, $resource->getResponse()->getBody());

         // Silence notices while the library loads the document. The bit is
         // cleared with "& ~E_NOTICE"; XOR would switch notices ON whenever
         // they were disabled to begin with.
         error_reporting($errorReportingLevel & ~E_NOTICE);
         $phpword = $reader->load($tempFile);
     } finally {
         // Runs on success and on failure alike, so neither the temporary
         // file nor the lowered reporting level can leak.
         error_reporting($errorReportingLevel);
         unlink($tempFile);
     }

     $writer = $this->getWriter($phpword);

     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }
 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded through the
  * reader, rendered through the writer and returned without tags and
  * binary content. Loading is best effort: when the reader throws an
  * exception, an empty string is returned.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string the extracted text, or an empty string when the document
  *                could not be loaded
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempFile = $this->getTempFileName('rtf');
     file_put_contents($tempFile, $resource->getResponse()->getBody());
     $reader = $this->getReader();

     // Silence notices while the library loads the document. The bit is
     // cleared with "& ~E_NOTICE"; XOR would switch notices ON whenever
     // they were disabled to begin with.
     $errorReportingLevel = error_reporting();
     error_reporting($errorReportingLevel & ~E_NOTICE);

     $phpword = null;
     try {
         $phpword = $reader->load($tempFile);
     } catch (\Exception $e) {
         // Unreadable document: $phpword stays null and is handled below.
     } finally {
         // Restore the reporting level and drop the temporary file no
         // matter how loading ended.
         error_reporting($errorReportingLevel);
         unlink($tempFile);
     }

     // Without a loaded document there is nothing to hand to the writer;
     // previously an undefined variable was passed on at this point.
     if (null === $phpword) {
         return '';
     }

     $writer = $this->getWriter($phpword);

     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }
 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file and handed to the
  * command configured in $this->pdfToTxtCommand. The text is then read from
  * the file named like the temporary file with ".txt" appended. Both
  * temporary files are removed on every exit path, including failures.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  * @throws InvalidContentException when the converted text is unreadable or empty
  * @throws ProcessFailedException  when the conversion command is not successful
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempPdfFile = $this->getTempFileName('pdf');
     // File the converted text is read from.
     // NOTE(review): presumably the command derives this name from its
     // input file - confirm against the configured command.
     $tempTxtFile = sprintf('%s.txt', $tempPdfFile);

     try {
         file_put_contents($tempPdfFile, $resource->getResponse()->getBody());

         $command = sprintf('%s %s', $this->pdfToTxtCommand, $tempPdfFile);
         $process = $this->processCommand($command);
         if (!$process->isSuccessful()) {
             throw new ProcessFailedException($process);
         }

         $content = file_get_contents($tempTxtFile);
         // Loose comparison kept on purpose: a failed read (false) and an
         // empty result are both rejected, exactly as before.
         if ($content == false) {
             throw new InvalidContentException(sprintf("PDF: Cant read temporary txt file converted from pdf: %s", $tempTxtFile));
         }

         return $content;
     } finally {
         // Single cleanup point for every exit path. The is_file() guard
         // avoids a warning for a file that was never created.
         foreach ([$tempPdfFile, $tempTxtFile] as $temporaryFile) {
             if (is_file($temporaryFile)) {
                 unlink($temporaryFile);
             }
         }
     }
 }
Example #8
0
 /**
  * The resource must return a Psr\Http\Message\ResponseInterface whose body
  * equals the HTML fixture held by the test case.
  *
  * @covers VDB\Spider\Resource
  */
 public function testGetResponse()
 {
     $expectedInterface = 'Psr\\Http\\Message\\ResponseInterface';
     $this->assertInstanceOf($expectedInterface, $this->resource->getResponse());

     $actualBody = $this->resource->getResponse()->getBody()->__toString();
     $this->assertEquals($this->html, $actualBody);
 }
Example #9
0
 /**
  * The resource must return a Psr\Http\Message\ResponseInterface instance.
  *
  * @covers VDB\Spider\Resource::getResponse
  */
 public function testGetResponse()
 {
     $expectedInterface = 'Psr\\Http\\Message\\ResponseInterface';
     $actualResponse = $this->resource->getResponse();

     $this->assertInstanceOf($expectedInterface, $actualResponse);
 }
Example #10
0
 /**
  * Tells whether a resource must be filtered out because of its mime type.
  *
  * The Content-Type header is accepted when it equals
  * $this->allowedMimeType exactly, or when its media type (the part before
  * the first ";", compared case-insensitively) equals the allowed value.
  * A header such as "text/html; charset=UTF-8" therefore passes for an
  * allowed type of "text/html".
  *
  * @param Resource $resource
  *
  * @return bool true when the resource is not of the allowed mime type
  */
 public function match(Resource $resource)
 {
     $contentType = $resource->getResponse()->getHeaderLine('Content-Type');

     // Exact match first, so a configuration holding a complete header
     // value keeps working as before.
     if ($contentType === $this->allowedMimeType) {
         return false;
     }

     // Drop parameters such as "; charset=UTF-8" before comparing.
     $headerParts = explode(';', $contentType, 2);
     $mediaType = strtolower(trim($headerParts[0]));
     $allowedType = strtolower(trim((string) $this->allowedMimeType));

     return $mediaType !== $allowedType;
 }
Example #11
0
 /**
  * Extract content from resource
  *
  * The response body is parsed with the PDF parser and the text of the
  * parsed document is returned with binary content stripped.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $parser = $this->pdfParser;

     // NOTE(review): the `true` argument presumably requests the body as a
     // string - confirm against the HTTP client in use.
     $rawBody = $resource->getResponse()->getBody(true);

     $parsedPdf = $parser->parseContent($rawBody);
     $extractedText = $parsedPdf->getText();

     return $this->stripBinaryContent($extractedText);
 }