/**
  * Extracts content from a webpage and returns document data.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return array
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
  */
 public function getData(Resource $resource)
 {
     $content = $this->extractContentFromResource($resource);
     $dataExtractor = new DocumentDataExtractor($resource);

     // The format ends in a literal "Z" (the UTC designator), so the value has
     // to be rendered in UTC: gmdate() does that regardless of the configured
     // default timezone, whereas date() would label local time as UTC.
     // It is computed once so every date field of this document is identical,
     // even when the call happens right on a second boundary.
     $now = gmdate('Y-m-d\\TH:i:s\\Z');

     return [
         'id' => $dataExtractor->getId(),
         'url' => $dataExtractor->getUrl(),
         'content' => $content,
         'title' => $dataExtractor->getTitle(),
         'tstamp' => $now,
         'type' => $dataExtractor->getType(),
         // Byte length of the extracted content (strlen is byte-wise).
         'contentLength' => strlen($content),
         'lastModified' => $dataExtractor->getLastModified(),
         'date' => $now,
         // Language is hard-coded for every document produced here.
         'lang' => 'nl-NL',
         'author' => $dataExtractor->getAuthor(),
         'publishedDate' => $now,
         'updatedDate' => $now,
         'strippedContent' => strip_tags($content),
         'description' => $dataExtractor->getDescription(),
         'keywords' => $dataExtractor->getKeywords(),
     ];
 }
 /**
  * Extracts content from a Word 2007 document and returns document data.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return array
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
  */
 public function getData(Resource $resource)
 {
     $content = $this->extractContentFromResource($resource);
     $contentLength = strlen($content);

     // Reject documents whose extracted text is shorter (in bytes) than the
     // class-level minimum before doing any further work.
     if ($contentLength < self::MINIMAL_CONTENT_LENGTH) {
         throw new InvalidContentException(sprintf("Word2007 didn't contain enough content (minimal chars is %s)", self::MINIMAL_CONTENT_LENGTH));
     }

     $dataExtractor = new DocumentDataExtractor($resource);
     $url = $dataExtractor->getUrl();

     // Fall back to an empty title when getTitleByUrl() yields a falsy value.
     $title = $this->getTitleByUrl($url) ?: '';

     // The format ends in a literal "Z" (the UTC designator), so the value has
     // to be rendered in UTC: gmdate() does that regardless of the configured
     // default timezone, whereas date() would label local time as UTC.
     // It is computed once so every date field of this document is identical.
     $now = gmdate('Y-m-d\\TH:i:s\\Z');

     return [
         'id' => $dataExtractor->getId(),
         'url' => $url,
         'content' => $content,
         'title' => $title,
         'tstamp' => $now,
         'contentLength' => $contentLength,
         'lastModified' => $dataExtractor->getLastModified(),
         'date' => $now,
         'publishedDate' => $now,
         'updatedDate' => $now,
     ];
 }
 /**
  * @test
  */
 public function ifDocuemntDataExtractorReturnAllCorrectValues()
 {
     // Fixture values that are both fed into the stubs and asserted on below.
     $pageUrl = 'http://dummy.xxx/dummydir/somewebpagedummyfile.html';
     $lastModified = '2014-06-18T23:49:41Z';
     $html = '<html><head><title>Site dummy 1</title><meta name="description" content="Dummy description" /><meta name="keywords" content="keyword1,keyword2" /><meta name="author" content="Dummy Author" /><meta name="SIM_archief" content="yes" /><meta name="SIM.simfaq" content="yes" /><meta name="SIM.item_trefwoorden" content="trefwoorden" /><meta name="DCTERMS.title" content="Title 2" /><meta name="DCTERMS.language" content="pl-PL" /><meta name="DCTERMS.type" content="dummytype" /><meta name="SIM.simloket_synoniemen" content="synoniemen" /><meta name="SIM.spatial" content="spatial" /><meta name="DCTERMS.identifier" content="identifierurl" /><meta name="SIM.audience" content="audience" /><meta name="SIM.subject" content="subject" /><meta name="DCTERMS.available" content="2015-06-18T23:49:41Z" /><meta name="DCTERMS.modified" content="2015-06-18T23:49:41Z" /></head><body><p>This is the text value.</p></body></html>';

     // HTTP response stub: content type and last-modified are each read once.
     $httpResponse = $this->getMockBuilder('Guzzle\\Http\\Message\\Response')
         ->disableOriginalConstructor()
         ->setMethods(['getContentType', 'getLastModified'])
         ->getMock();
     $httpResponse->expects($this->once())
         ->method('getContentType')
         ->will($this->returnValue('text/html'));
     $httpResponse->expects($this->once())
         ->method('getLastModified')
         ->will($this->returnValue($lastModified));

     // URI stub: must be stringified exactly twice
     // (presumably once for getId() and once for getUrl() - TODO confirm).
     $pageUri = $this->getMockBuilder('VDB\\Uri\\Uri')
         ->disableOriginalConstructor()
         ->setMethods(['toString'])
         ->getMock();
     $pageUri->expects($this->exactly(2))
         ->method('toString')
         ->will($this->returnValue($pageUrl));

     // Real crawler loaded with the HTML fixture carrying all meta tags.
     $domCrawler = new Crawler('', 'https://github.com');
     $domCrawler->addContent($html);

     // Resource stub wiring crawler, response and URI together. The exact
     // call counts are part of the test: 17 crawler reads (presumably one per
     // HTML-backed getter asserted below), 2 response reads and 2 URI reads.
     $spiderResource = $this->getMockBuilder('VDB\\Spider\\Resource')
         ->disableOriginalConstructor()
         ->setMethods(['getCrawler', 'getResponse', 'getUri'])
         ->getMock();
     $spiderResource->expects($this->exactly(17))
         ->method('getCrawler')
         ->will($this->returnValue($domCrawler));
     $spiderResource->expects($this->exactly(2))
         ->method('getResponse')
         ->will($this->returnValue($httpResponse));
     $spiderResource->expects($this->exactly(2))
         ->method('getUri')
         ->will($this->returnValue($pageUri));

     $extractor = new DocumentDataExtractor($spiderResource);

     // Values derived from the URI and the HTTP response.
     $this->assertSame('3575e8f273b468d70a0e54a62e5c10b0d80f28a5', $extractor->getId());
     $this->assertSame('Site dummy 1', $extractor->getTitle());
     $this->assertSame($pageUrl, $extractor->getUrl());
     $this->assertSame(['text/html', 'text', 'html'], $extractor->getType());
     $this->assertSame($lastModified, $extractor->getLastModified());

     // Standard HTML meta tags.
     $this->assertSame('Dummy Author', $extractor->getAuthor());
     $this->assertSame('Dummy description', $extractor->getDescription());
     $this->assertSame('keyword1,keyword2', $extractor->getKeywords());

     // SIM / DCTERMS meta tags.
     $this->assertSame('yes', $extractor->getSimArchief());
     $this->assertSame(['yes'], $extractor->getSimfaq());
     $this->assertSame('2015-06-18T23:49:41Z', $extractor->getDctermsModified());
     $this->assertSame('identifierurl', $extractor->getDctermsIdentifier());
     $this->assertSame('Title 2', $extractor->getDctermsTitle());
     $this->assertSame('2015-06-18T23:49:41Z', $extractor->getDctermsAvailable());
     $this->assertSame('pl-PL', $extractor->getDctermsLanguage());
     $this->assertSame('dummytype', $extractor->getDctermsType());
     $this->assertSame('trefwoorden', $extractor->getSimItemTrefwoorden());
     $this->assertSame('synoniemen', $extractor->getSimSimloketSynoniemen());
     $this->assertSame('spatial', $extractor->getDctermsSpatial());
     $this->assertSame('audience', $extractor->getDctermsAudience());
     $this->assertSame('subject', $extractor->getDctermsSubject());
 }