Beispiel #1
0
 /**
  * Round-trips the fixture resource through serialize()/unserialize() and
  * checks that the restored copy has the expected types and carries the same
  * URI string, response body and crawler markup as the original.
  *
  * @covers VDB\Spider\Resource
  */
 public function testSerialization()
 {
     $restored = unserialize(serialize($this->resource));

     // The restored object and its collaborators must come back with the expected types.
     $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);
     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restored->getResponse());
     $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restored->getUri());

     // The URI string must survive the round trip unchanged.
     $expectedUri = $this->resource->getUri()->__toString();
     $this->assertEquals($expectedUri, $restored->getUri()->__toString());

     // The response body must still match the HTML held by the test case.
     $restoredBody = $restored->getResponse()->getBody()->__toString();
     $this->assertEquals($this->html, $restoredBody);

     // The crawler of the restored copy must render the same markup.
     $expectedMarkup = $this->resource->getCrawler()->html();
     $this->assertEquals($expectedMarkup, $restored->getCrawler()->html());
 }
 /**
  * Writes the string form of the resource's response to a file and adds the
  * number of bytes written to the running total in $totalSizePersisted.
  *
  * The file name is the URL-encoded URI of the resource, appended directly to
  * the value returned by getResultPath(). The file is opened in 'w' mode, so
  * an existing file with the same name is truncated.
  *
  * @param Resource $resource The resource whose response is stored
  * @return void
  */
 public function persist(Resource $resource)
 {
     // urlencode() turns the URI into a single path segment without slashes.
     $encodedUri = urlencode($resource->getUri()->toString());
     $target = new \SplFileObject($this->getResultPath() . $encodedUri, 'w');

     $bytesWritten = $target->fwrite($resource->getResponse()->__toString());
     $this->totalSizePersisted += $bytesWritten;
 }
Beispiel #3
0
 /**
  * Collects the URIs that the registered discoverers find in a resource.
  *
  * The resource's own URI is marked as seen first. If isAtMaxDepth() reports
  * true for that URI, an empty array is returned. Otherwise the merged results
  * of all discoverers are passed through normalize(), removeDuplicates(),
  * filterAlreadySeen() and filter(); every URI that remains gets a depth one
  * greater than the resource's URI and is marked as seen.
  *
  * @param Resource $resource The resource to search for further URIs
  * @return UriInterface[] The URIs left after normalizing and filtering
  */
 public function discover(Resource $resource)
 {
     $sourceUri = $resource->getUri();
     $this->markSeen($sourceUri);

     if ($this->isAtMaxDepth($sourceUri)) {
         return [];
     }

     // Gather the findings of every discoverer into one array.
     $candidates = [];
     foreach ($this->discoverers as $discoverer) {
         $candidates = array_merge($candidates, $discoverer->discover($resource));
     }

     // NOTE(review): the return values of these helpers are not used, so they
     // presumably modify $candidates by reference - confirm against their signatures.
     $this->normalize($candidates);
     $this->removeDuplicates($candidates);
     $this->filterAlreadySeen($candidates);
     $this->filter($candidates);

     foreach ($candidates as $candidate) {
         $candidate->setDepthFound($sourceUri->getDepthFound() + 1);
         $this->markSeen($candidate);
     }

     return $candidates;
 }
 /**
  * Builds a Uri for the href attribute of every node matched by the
  * configured CSS selector in the document's crawler.
  *
  * Each href is passed to the Uri constructor together with the string form
  * of the document's own URI. An href that triggers a UriSyntaxException is
  * left out of the result and reported to the spider's stats handler.
  *
  * @param Spider $spider The spider whose stats handler records failed hrefs
  * @param Resource $document The document whose crawler is searched
  * @return UriInterface[]
  */
 public function discover(Spider $spider, Resource $document)
 {
     $found = array();
     $matches = $document->getCrawler()->filter($this->cssSelector);

     foreach ($matches as $element) {
         $href = $element->getAttribute('href');

         try {
             $found[] = new Uri($href, $document->getUri()->toString());
         } catch (UriSyntaxException $syntaxError) {
             // Record the failure and continue with the remaining nodes.
             $reason = 'Invalid URI: ' . $syntaxError->getMessage();
             $spider->getStatsHandler()->addToFailed($href, $reason);
         }
     }

     return $found;
 }
Beispiel #5
0
 /**
  * Checks that the fixture resource returns a DiscoveredUri whose string
  * form is the URL the fixture is expected to carry.
  *
  * @covers VDB\Spider\Resource::getUri
  */
 public function testGetUri()
 {
     $uri = $this->resource->getUri();

     $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $uri);
     $this->assertEquals('http://example.org/domains/special', $uri->toString());
 }