/**
 * Returns a document that can be persisted based on the resource.
 *
 * The extractor is selected from the media type of the response's
 * Content-Type value. Parameters such as "; charset=UTF-8" are stripped and
 * the comparison is case-insensitive, so "Application/PDF; charset=binary"
 * is handled like "application/pdf". Any media type that is not listed
 * below is handed to the HTML extractor.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
 */
public function getDocumentByResource(Resource $resource)
{
    // Only the part before the first ';' identifies the media type; the rest
    // are optional parameters that must not influence extractor selection.
    $contentTypeParts = explode(';', (string) $resource->getResponse()->getContentType(), 2);
    $mediaType = strtolower(trim($contentTypeParts[0]));

    switch ($mediaType) {
        case 'application/pdf':
        case 'application/octet-stream':
            $data = $this->pdf->getData($resource);
            break;

        case 'application/msword':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
            // The media type alone does not decide between the two Word
            // extractors: a URI containing ".docx" (any case) goes to the
            // Word 2007 extractor, everything else to the legacy one.
            if (false !== stripos($resource->getUri()->toString(), '.docx')) {
                $data = $this->word2007->getData($resource);
                break;
            }

            $data = $this->msdoc->getData($resource);
            break;

        case 'application/rtf':
            $data = $this->rtf->getData($resource);
            break;

        case 'application/vnd.oasis.opendocument.text':
            $data = $this->odt->getData($resource);
            break;

        case 'text/html':
        default:
            $data = $this->html->getData($resource);
            break;
    }

    return new PersistableDocument($data);
}
/**
 * Extracting data from a PDF whose parsed text is empty must be rejected
 * with an InvalidContentException.
 *
 * @test
 * @expectedException \Simgroep\ConcurrentSpiderBundle\InvalidContentException
 * @expectedExceptionMessage PDF didn't contain enough content (minimal chars is 3)
 */
public function throwExceptionOnLessThenMinimalContentLength()
{
    // Parsed PDF document that yields no text at all.
    $document = $this->getMockBuilder('Smalot\\PdfParser\\Document')
        ->setMethods(['getText'])
        ->getMock();
    $document->expects($this->once())
        ->method('getText')
        ->will($this->returnValue(''));

    // Parser stub that hands back the empty document.
    $pdfType = $this->getMockBuilder('Smalot\\PdfParser\\Parser')
        ->disableOriginalConstructor()
        ->setMethods(['getText', 'parseContent'])
        ->getMock();
    $pdfType->expects($this->once())
        ->method('parseContent')
        ->will($this->returnValue($document));

    $response = $this->getMockBuilder('Guzzle\\Http\\Message\\Response')
        ->disableOriginalConstructor()
        ->setMethods(['getBody'])
        ->getMock();
    $response->expects($this->once())
        ->method('getBody')
        ->with(true);

    $resource = $this->getMockBuilder('VDB\\Spider\\Resource')
        ->disableOriginalConstructor()
        ->setMethods(['getResponse'])
        ->getMock();
    $resource->expects($this->once())
        ->method('getResponse')
        ->will($this->returnValue($response));

    $type = new Pdf($pdfType);

    // The expected exception is raised by this call, so nothing placed after
    // it would ever execute; the annotations above carry the assertion.
    $type->getData($resource);
}