/**
 * A Resource must survive a serialize()/unserialize() round trip: the restored
 * object keeps its class, its response and URI types, and yields the same URI
 * string, response body and crawler HTML as the fixture it was built from.
 *
 * @covers VDB\Spider\Resource
 */
public function testSerialization()
{
    $payload = serialize($this->resource);
    $restored = unserialize($payload);

    // Type checks on the restored object and its collaborators.
    $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);
    $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restored->getResponse());
    $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restored->getUri());

    // The URI string is unchanged by the round trip.
    $expectedUri = $this->resource->getUri()->__toString();
    $actualUri = $restored->getUri()->__toString();
    $this->assertEquals($expectedUri, $actualUri);

    // The response body still matches the HTML fixture.
    $actualBody = $restored->getResponse()->getBody()->__toString();
    $this->assertEquals($this->html, $actualBody);

    // The crawler of the restored resource renders the same markup.
    $expectedMarkup = $this->resource->getCrawler()->html();
    $actualMarkup = $restored->getCrawler()->html();
    $this->assertEquals($expectedMarkup, $actualMarkup);
}
/**
 * Builds a Uri for every node of the document that matches the configured
 * CSS selector, using the node's "href" attribute and the document's own URI
 * as the base.
 *
 * An href that raises a UriSyntaxException is not returned; it is reported
 * to the spider's stats handler as failed instead.
 *
 * @param Spider $spider
 * @param Resource $document
 * @return UriInterface[]
 */
public function discover(Spider $spider, Resource $document)
{
    $matches = $document->getCrawler()->filter($this->cssSelector);
    $discovered = array();

    foreach ($matches as $element) {
        $href = $element->getAttribute('href');

        try {
            $baseUri = $document->getUri()->toString();
            $discovered[] = new Uri($href, $baseUri);
        } catch (UriSyntaxException $exception) {
            // Record the offending href and carry on with the remaining nodes.
            $reason = 'Invalid URI: ' . $exception->getMessage();
            $spider->getStatsHandler()->addToFailed($href, $reason);
        }
    }

    return $discovered;
}
/**
 * Extracts the text content of the crawled resource, except JavaScript.
 *
 * When a CSS blacklist is configured, every node matching it is first removed
 * from the resource's DOM (the resource's crawler is modified in place).
 * Afterwards the text nodes that are direct children of any non-<script>
 * element below <body> are collected, trimmed, stripped of empty entries and
 * joined with single spaces. Text placed directly inside <body>, outside any
 * child element, is not matched by the query.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $dom = $resource->getCrawler();

    if (null !== $this->cssBlacklist) {
        // Detach every blacklisted node from its parent before reading text.
        foreach ($dom->filter($this->cssBlacklist) as $unwanted) {
            $unwanted->parentNode->removeChild($unwanted);
        }
    }

    $textNodes = $dom->filterXpath('//body//*[not(self::script)]/text()');

    // each() hands back the closure results in document order.
    $pieces = $textNodes->each(function (Crawler $textNode) {
        return trim($textNode->text());
    });

    // Keep only non-empty fragments ("0" is a valid fragment, so compare explicitly).
    $nonEmpty = array();
    foreach ($pieces as $piece) {
        if (strlen($piece) > 0) {
            $nonEmpty[] = $piece;
        }
    }

    return trim(implode(' ', $nonEmpty));
}
/**
 * Returns the resource's crawler narrowed down by this object's selector.
 *
 * @param Resource $resource
 *
 * @return \Symfony\Component\DomCrawler\Crawler
 */
protected function getFilteredCrawler(Resource $resource)
{
    $crawler = $resource->getCrawler();
    $filtered = $crawler->filter($this->selector);

    return $filtered;
}
/**
 * getCrawler() must hand back a Symfony DomCrawler instance.
 *
 * @covers VDB\Spider\Resource::getCrawler
 */
public function testGetCrawler()
{
    $crawler = $this->resource->getCrawler();

    $this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $crawler);
}