/**
 * Returns a document that can be persisted based on the resource.
 *
 * The extractor is selected from the media type of the response's
 * Content-Type value. Parameters following the media type (for example
 * "; charset=utf-8"), surrounding whitespace and letter case are ignored,
 * so "Application/PDF; charset=binary" is dispatched like "application/pdf".
 * Any media type that is not listed below is handled by the HTML extractor.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
 */
public function getDocumentByResource(Resource $resource)
{
    // Keep only the "type/subtype" part; a missing header becomes an empty
    // string and therefore ends up in the default (HTML) branch.
    $headerParts = explode(';', (string) $resource->getResponse()->getContentType(), 2);
    $mediaType = strtolower(trim($headerParts[0]));

    switch ($mediaType) {
        case 'application/pdf':
        case 'application/octet-stream':
            $data = $this->pdf->getData($resource);
            break;

        case 'application/msword':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
            // The media type alone does not decide between the two Word
            // extractors: a URI containing ".docx" goes to the Word 2007
            // extractor, everything else to the legacy .doc extractor.
            if (false !== stripos($resource->getUri()->toString(), '.docx')) {
                $data = $this->word2007->getData($resource);
                break;
            }

            $data = $this->msdoc->getData($resource);
            break;

        case 'application/rtf':
            $data = $this->rtf->getData($resource);
            break;

        case 'application/vnd.oasis.opendocument.text':
            $data = $this->odt->getData($resource);
            break;

        case 'text/html':
        default:
            $data = $this->html->getData($resource);
            break;
    }

    return new PersistableDocument($data);
}
/**
 * Extracting a .docx resource with the Word2007 type must yield the ten
 * expected document fields, use the file name as title and produce
 * non-empty content.
 *
 * @test
 */
public function retrieveValidDataFromResource()
{
    // Response stub: serves the sample .docx fixture as body together with a
    // fixed last-modified timestamp.
    $response = $this->getMockBuilder('Guzzle\\Http\\Message\\Response')
        ->disableOriginalConstructor()
        ->setMethods(['getBody', 'getLastModified'])
        ->getMock();
    $response->expects($this->once())
        ->method('getLastModified')
        ->will($this->returnValue('2015-06-18T23:49:41Z'));
    $response->expects($this->once())
        ->method('getBody')
        ->will($this->returnValue(file_get_contents(__DIR__ . '/../../Mock/Documents/sample.docx')));

    $uri = $this->getMockBuilder('VDB\\Uri\\Uri')
        ->disableOriginalConstructor()
        ->setMethods(['toString'])
        ->getMock();
    $uri->expects($this->exactly(2))
        ->method('toString')
        ->will($this->returnValue('http://blabdummy.de/dummydir/sample.docx'));

    $resource = $this->getMockBuilder('VDB\\Spider\\Resource')
        ->disableOriginalConstructor()
        ->setMethods(['getResponse', 'getUri', 'getBody'])
        ->getMock();
    $resource->expects($this->exactly(2))
        ->method('getResponse')
        ->will($this->returnValue($response));
    $resource->expects($this->exactly(2))
        ->method('getUri')
        ->will($this->returnValue($uri));

    $type = new Word2007();
    $data = $type->getData($resource);

    $this->assertCount(10, $data);

    $expectedKeys = [
        'id',
        'url',
        'content',
        'title',
        'tstamp',
        'contentLength',
        'lastModified',
        'date',
        'publishedDate',
        'updatedDate',
    ];

    foreach ($expectedKeys as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $data);
    }

    $this->assertEquals('sample.docx', $data['title']);

    // Check the extracted text itself; passing it as the second argument
    // would only make it the failure message of an assertion on $data.
    $this->assertNotEmpty($data['content']);
}