/**
 * Flags the resource as filtered when its response is not of the allowed mime type.
 *
 * @param Resource $resource resource whose response content type is inspected
 *
 * @return bool true when the resource was marked as filtered, false when it is allowed
 */
public function match(Resource $resource)
{
    $response = $resource->getResponse();

    // Allowed content type: nothing to filter.
    if ($response->isContentType($this->allowedMimeType)) {
        return false;
    }

    // Record the offending content type as the filter reason.
    $mime = $response->getContentType();
    $resource->setFiltered(true, "Mime type '{$mime}' not allowed");

    return true;
}
/**
 * Writes the string representation of the resource's response to a file in
 * the result path and adds the written byte count to the running total.
 *
 * The file name is the URL-encoded URI of the resource.
 *
 * @param Resource $resource resource whose response is written to disk
 */
public function persist(Resource $resource)
{
    $encodedUri = urlencode($resource->getUri()->toString());
    $target = new \SplFileObject($this->getResultPath() . $encodedUri, 'w');

    $payload = $resource->getResponse()->__toString();
    $bytesWritten = $target->fwrite($payload);

    $this->totalSizePersisted += $bytesWritten;
}
/**
 * Builds a persistable document from the resource, choosing the data
 * extractor by the content type of the response.
 *
 * Word content types are routed to the Word 2007 extractor when the URI
 * contains ".docx" and to the legacy MS Word extractor otherwise. Any
 * content type that is not listed falls back to the HTML extractor.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
 */
public function getDocumentByResource(Resource $resource)
{
    $contentType = $resource->getResponse()->getContentType();

    $pdfTypes = [
        'application/pdf',
        'application/octet-stream',
    ];
    $wordTypes = [
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
    ];

    // Loose comparisons are used on purpose: they mirror switch/case semantics.
    if (in_array($contentType, $pdfTypes)) {
        $extractor = $this->pdf;
    } elseif (in_array($contentType, $wordTypes)) {
        $looksLikeDocx = false !== stripos($resource->getUri()->toString(), '.docx');
        $extractor = $looksLikeDocx ? $this->word2007 : $this->msdoc;
    } elseif ($contentType == 'application/rtf') {
        $extractor = $this->rtf;
    } elseif ($contentType == 'application/vnd.oasis.opendocument.text') {
        $extractor = $this->odt;
    } else {
        // 'text/html' and every unknown content type.
        $extractor = $this->html;
    }

    return new PersistableDocument($extractor->getData($resource));
}
/**
 * Turns the crawled resource into a document, dispatches the pre-persist
 * event and publishes the document together with the job metadata on the queue.
 *
 * @param \VDB\Spider\Resource                      $resource
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the response body
 *                                                                  reaches the configured maximum size
 */
public function persist(Resource $resource, CrawlJob $crawlJob)
{
    $bodyLength = strlen($resource->getResponse()->getBody());

    if ($bodyLength >= $this->maximumResourceSize) {
        $reason = sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize);

        throw new InvalidContentException($reason);
    }

    $document = $this->documentResolver->getDocumentByResource($resource);

    // The event is dispatched before the payload is built, so the payload is
    // assembled from the document and metadata as they are after dispatch.
    $event = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
    $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $event);

    $payload = [
        'document' => $document->toArray(),
        'metadata' => $crawlJob->getMetadata(),
    ];

    $this->queue->publish(new AMQPMessage(json_encode($payload), ['delivery_mode' => 1]));
}
/**
 * Extracts the plain-text content of a Word document resource.
 *
 * The response body is written to a temporary file, loaded through the
 * reader, rendered by the writer and finally stripped of binary content
 * and markup.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $tempFile = tempnam(sys_get_temp_dir(), 'docx');
    file_put_contents($tempFile, $resource->getResponse()->getBody());

    $reader = $this->getReader();

    // Silence notices raised by the library while loading. The bit is cleared
    // with "& ~E_NOTICE": XOR would switch notices ON when they were already off.
    $errorReportingLevel = error_reporting();
    error_reporting($errorReportingLevel & ~E_NOTICE);

    try {
        $phpword = $reader->load($tempFile);
    } finally {
        // Always restore the previous error level and remove the temporary
        // file, even when the reader throws.
        error_reporting($errorReportingLevel);
        unlink($tempFile);
    }

    $writer = $this->getWriter($phpword);

    return strip_tags($this->stripBinaryContent($writer->getContent()));
}
/**
 * Extracts the plain-text content of an RTF resource.
 *
 * The response body is written to a temporary file, loaded through the
 * reader, rendered by the writer and finally stripped of binary content
 * and markup. Loading is best effort: when the reader cannot load the
 * document, an empty string is returned instead of passing an undefined
 * document on to the writer.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string extracted text, or an empty string when the document could not be loaded
 */
public function extractContentFromResource(Resource $resource)
{
    $tempFile = $this->getTempFileName('rtf');
    file_put_contents($tempFile, $resource->getResponse()->getBody());

    $reader = $this->getReader();

    // Silence notices raised by the library while loading. The bit is cleared
    // with "& ~E_NOTICE": XOR would switch notices ON when they were already off.
    $errorReportingLevel = error_reporting();
    error_reporting($errorReportingLevel & ~E_NOTICE);

    $phpword = null;

    try {
        $phpword = $reader->load($tempFile);
    } catch (\Exception $e) {
        // Deliberately ignored: an unreadable document yields no content (see below).
    } finally {
        // Always restore the previous error level and remove the temporary file.
        error_reporting($errorReportingLevel);
        unlink($tempFile);
    }

    if (null === $phpword) {
        return '';
    }

    $writer = $this->getWriter($phpword);

    return strip_tags($this->stripBinaryContent($writer->getContent()));
}
/**
 * Extracts the text content of a PDF resource with the configured
 * pdf-to-text command.
 *
 * The response body is written to a temporary PDF file, the command is run
 * on it and the text is read from "<temporary pdf file>.txt". Both temporary
 * files are removed before the method returns or throws.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 * @throws InvalidContentException when the converted text file is missing, unreadable or empty
 * @throws ProcessFailedException  when the conversion command is not successful
 */
public function extractContentFromResource(Resource $resource)
{
    $tempPdfFile = $this->getTempFileName('pdf');
    $tempTxtFile = sprintf('%s.txt', $tempPdfFile);

    file_put_contents($tempPdfFile, $resource->getResponse()->getBody());

    // Quote the path so a temporary directory containing spaces or shell
    // metacharacters cannot break (or alter) the command line.
    $command = sprintf('%s %s', $this->pdfToTxtCommand, escapeshellarg($tempPdfFile));
    $process = $this->processCommand($command);

    // The PDF copy is no longer needed, whatever the outcome of the command.
    unlink($tempPdfFile);

    if (!$process->isSuccessful()) {
        // A failed run may still have left a partial txt file behind.
        if (is_file($tempTxtFile)) {
            unlink($tempTxtFile);
        }

        throw new ProcessFailedException($process);
    }

    // Read the txt file created by the command; guard against it not existing
    // so neither the read nor the cleanup emits a warning.
    $content = false;
    if (is_file($tempTxtFile)) {
        $content = file_get_contents($tempTxtFile);
        unlink($tempTxtFile);
    }

    // Unreadable and empty output are rejected, but a literal "0" is valid text
    // (a loose "== false" comparison would reject it as well).
    if ($content === false || $content === '') {
        throw new InvalidContentException(sprintf("PDF: Cant read temporary txt file converted from pdf: %s", $tempTxtFile));
    }

    return $content;
}
/**
 * The resource exposes a PSR-7 response whose body equals the fixture HTML.
 *
 * @covers VDB\Spider\Resource
 */
public function testGetResponse()
{
    $response = $this->resource->getResponse();
    $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $response);

    $actualBody = $this->resource->getResponse()->getBody()->__toString();
    $this->assertEquals($this->html, $actualBody);
}
/**
 * The resource exposes its response as a PSR-7 response object.
 *
 * @covers VDB\Spider\Resource::getResponse
 */
public function testGetResponse()
{
    $expectedType = 'Psr\\Http\\Message\\ResponseInterface';
    $response = $this->resource->getResponse();

    $this->assertInstanceOf($expectedType, $response);
}
/**
 * Tells whether the resource must be filtered out because its response is
 * not of the allowed mime type.
 *
 * Only the media type of the Content-Type header is compared: parameters
 * such as "; charset=UTF-8" are ignored and the comparison is
 * case-insensitive, so "text/html; charset=UTF-8" matches an allowed type
 * of "text/html". A header line that is exactly equal to the configured
 * value is always accepted.
 *
 * @param Resource $resource resource whose Content-Type header is inspected
 *
 * @return bool true when the content type is NOT allowed, false when it is
 */
public function match(Resource $resource)
{
    $contentType = $resource->getResponse()->getHeaderLine('Content-Type');

    // Exact match: keeps every configuration that was accepted before.
    if ($contentType === $this->allowedMimeType) {
        return false;
    }

    // Drop parameters (everything after the first ';') and normalise case
    // and surrounding whitespace before comparing the bare media types.
    $parts = explode(';', $contentType, 2);
    $mediaType = strtolower(trim($parts[0]));
    $allowed = strtolower(trim((string) $this->allowedMimeType));

    return $mediaType !== $allowed;
}
/**
 * Extracts the text of a PDF resource with the PDF parser and strips
 * binary content from the result.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    // NOTE(review): getBody(true) presumably returns the body as a string — confirm against the HTTP client in use.
    $rawBody = $resource->getResponse()->getBody(true);

    $parsedPdf = $this->pdfParser->parseContent($rawBody);
    $text = $parsedPdf->getText();

    return $this->stripBinaryContent($text);
}