/**
 * Extracts content from a webpage and returns document data.
 *
 * The four generated timestamp fields (tstamp, date, publishedDate and
 * updatedDate) come from a single clock reading rendered in UTC, so they
 * are always equal to each other and the trailing "Z" designator is accurate.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return array
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
 *         Not thrown directly here; presumably raised by
 *         extractContentFromResource() - confirm against that helper.
 */
public function getData(Resource $resource)
{
    $content = $this->extractContentFromResource($resource);
    $dataExtractor = new DocumentDataExtractor($resource);

    // The format ends in a literal "Z" (UTC designator), so the value has to
    // be produced with gmdate(): date() would format the server's local time
    // and mislabel it as UTC. Reading the clock once also keeps the four
    // timestamp fields consistent with each other.
    $now = gmdate('Y-m-d\\TH:i:s\\Z');

    return [
        'id' => $dataExtractor->getId(),
        'url' => $dataExtractor->getUrl(),
        'content' => $content,
        'title' => $dataExtractor->getTitle(),
        'tstamp' => $now,
        'type' => $dataExtractor->getType(),
        // Byte length of the extracted content (strlen is not multibyte-aware).
        'contentLength' => strlen($content),
        'lastModified' => $dataExtractor->getLastModified(),
        'date' => $now,
        // The language is hard-coded; it is not read from the page.
        'lang' => 'nl-NL',
        'author' => $dataExtractor->getAuthor(),
        'publishedDate' => $now,
        'updatedDate' => $now,
        'strippedContent' => strip_tags($content),
        'description' => $dataExtractor->getDescription(),
        'keywords' => $dataExtractor->getKeywords(),
    ];
}
/**
 * Extracts content from a word2007 and returns document data.
 *
 * The four generated timestamp fields (tstamp, date, publishedDate and
 * updatedDate) come from a single clock reading rendered in UTC, so they
 * are always equal to each other and the trailing "Z" designator is accurate.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return array
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
 *         When the extracted content is shorter than
 *         self::MINIMAL_CONTENT_LENGTH bytes.
 */
public function getData(Resource $resource)
{
    $content = $this->extractContentFromResource($resource);
    $contentLength = strlen($content);

    // Reject documents that yield too little text to be worth returning.
    if ($contentLength < self::MINIMAL_CONTENT_LENGTH) {
        throw new InvalidContentException(
            sprintf(
                "Word2007 didn't contain enough content (minimal chars is %s)",
                self::MINIMAL_CONTENT_LENGTH
            )
        );
    }

    $dataExtractor = new DocumentDataExtractor($resource);
    $url = $dataExtractor->getUrl();

    // The title is derived from the URL; any falsy result becomes ''.
    $title = $this->getTitleByUrl($url) ?: '';

    // The format ends in a literal "Z" (UTC designator), so the value has to
    // be produced with gmdate(): date() would format the server's local time
    // and mislabel it as UTC. Reading the clock once also keeps the four
    // timestamp fields consistent with each other.
    $now = gmdate('Y-m-d\\TH:i:s\\Z');

    return [
        'id' => $dataExtractor->getId(),
        'url' => $url,
        'content' => $content,
        'title' => $title,
        'tstamp' => $now,
        'contentLength' => $contentLength,
        'lastModified' => $dataExtractor->getLastModified(),
        'date' => $now,
        'publishedDate' => $now,
        'updatedDate' => $now,
    ];
}
/**
 * Checks that every getter of DocumentDataExtractor returns the value taken
 * from the mocked response, the mocked URI or the meta tags of the fixture
 * page.
 *
 * The mocks also pin how often the extractor touches its collaborators:
 * getCrawler() 17 times, getResponse() and getUri() twice each, the URI's
 * toString() twice, and each response method once.
 *
 * @test
 */
public function ifDocuemntDataExtractorReturnAllCorrectValues()
{
    $pageUrl = 'http://dummy.xxx/dummydir/somewebpagedummyfile.html';
    $lastModified = '2014-06-18T23:49:41Z';

    // HTTP response: only the content type and last-modified header are stubbed.
    $responseMock = $this->getMockBuilder('Guzzle\\Http\\Message\\Response')
        ->disableOriginalConstructor()
        ->setMethods(['getContentType', 'getLastModified'])
        ->getMock();
    $responseMock->expects($this->once())
        ->method('getContentType')
        ->will($this->returnValue('text/html'));
    $responseMock->expects($this->once())
        ->method('getLastModified')
        ->will($this->returnValue($lastModified));

    // URI of the crawled page.
    $uriMock = $this->getMockBuilder('VDB\\Uri\\Uri')
        ->disableOriginalConstructor()
        ->setMethods(['toString'])
        ->getMock();
    $uriMock->expects($this->exactly(2))
        ->method('toString')
        ->will($this->returnValue($pageUrl));

    // Fixture page: a title, the standard meta tags and the SIM / DCTERMS tags.
    $html = '<html><head><title>Site dummy 1</title>'
        . '<meta name="description" content="Dummy description" />'
        . '<meta name="keywords" content="keyword1,keyword2" />'
        . '<meta name="author" content="Dummy Author" />'
        . '<meta name="SIM_archief" content="yes" />'
        . '<meta name="SIM.simfaq" content="yes" />'
        . '<meta name="SIM.item_trefwoorden" content="trefwoorden" />'
        . '<meta name="DCTERMS.title" content="Title 2" />'
        . '<meta name="DCTERMS.language" content="pl-PL" />'
        . '<meta name="DCTERMS.type" content="dummytype" />'
        . '<meta name="SIM.simloket_synoniemen" content="synoniemen" />'
        . '<meta name="SIM.spatial" content="spatial" />'
        . '<meta name="DCTERMS.identifier" content="identifierurl" />'
        . '<meta name="SIM.audience" content="audience" />'
        . '<meta name="SIM.subject" content="subject" />'
        . '<meta name="DCTERMS.available" content="2015-06-18T23:49:41Z" />'
        . '<meta name="DCTERMS.modified" content="2015-06-18T23:49:41Z" />'
        . '</head><body><p>This is the text value.</p></body></html>';

    $domCrawler = new Crawler('', 'https://github.com');
    $domCrawler->addContent($html);

    // Spider resource that hands out the crawler, response and URI above.
    $resourceMock = $this->getMockBuilder('VDB\\Spider\\Resource')
        ->disableOriginalConstructor()
        ->setMethods(['getCrawler', 'getResponse', 'getUri'])
        ->getMock();
    $resourceMock->expects($this->exactly(17))
        ->method('getCrawler')
        ->will($this->returnValue($domCrawler));
    $resourceMock->expects($this->exactly(2))
        ->method('getResponse')
        ->will($this->returnValue($responseMock));
    $resourceMock->expects($this->exactly(2))
        ->method('getUri')
        ->will($this->returnValue($uriMock));

    $extractor = new DocumentDataExtractor($resourceMock);

    // Values backed by the URI, the response and the standard HTML tags.
    $this->assertSame('3575e8f273b468d70a0e54a62e5c10b0d80f28a5', $extractor->getId());
    $this->assertSame('Site dummy 1', $extractor->getTitle());
    $this->assertSame($pageUrl, $extractor->getUrl());
    $this->assertSame(['text/html', 'text', 'html'], $extractor->getType());
    $this->assertSame($lastModified, $extractor->getLastModified());
    $this->assertSame('Dummy Author', $extractor->getAuthor());
    $this->assertSame('Dummy description', $extractor->getDescription());
    $this->assertSame('keyword1,keyword2', $extractor->getKeywords());

    // Values backed by the SIM and DCTERMS meta tags.
    $this->assertSame('yes', $extractor->getSimArchief());
    $this->assertSame(['yes'], $extractor->getSimfaq());
    $this->assertSame('2015-06-18T23:49:41Z', $extractor->getDctermsModified());
    $this->assertSame('identifierurl', $extractor->getDctermsIdentifier());
    $this->assertSame('Title 2', $extractor->getDctermsTitle());
    $this->assertSame('2015-06-18T23:49:41Z', $extractor->getDctermsAvailable());
    $this->assertSame('pl-PL', $extractor->getDctermsLanguage());
    $this->assertSame('dummytype', $extractor->getDctermsType());
    $this->assertSame('trefwoorden', $extractor->getSimItemTrefwoorden());
    $this->assertSame('synoniemen', $extractor->getSimSimloketSynoniemen());
    $this->assertSame('spatial', $extractor->getDctermsSpatial());
    $this->assertSame('audience', $extractor->getDctermsAudience());
    $this->assertSame('subject', $extractor->getDctermsSubject());
}