コード例 #1
0
ファイル: ResourceTest.php プロジェクト: aktuba/php-spider
 /**
  * A resource must survive a serialize()/unserialize() round trip.
  *
  * Verifies that the restored object is still a Resource exposing a PSR-7
  * response and a discovered URI, and that its URI string, response body and
  * crawler HTML equal those of the fixture.
  *
  * @covers VDB\Spider\Resource
  */
 public function testSerialization()
 {
     $original = $this->resource;
     $restored = unserialize(serialize($original));

     $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);

     $restoredResponse = $restored->getResponse();
     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restoredResponse);

     $restoredUri = $restored->getUri();
     $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restoredUri);

     // The URI must render to the same string before and after the round trip.
     $expectedUri = $original->getUri()->__toString();
     $this->assertEquals($expectedUri, $restoredUri->__toString());

     // The response body must still be the HTML the fixture was built from.
     $restoredBody = $restoredResponse->getBody()->__toString();
     $this->assertEquals($this->html, $restoredBody);

     // The crawler of the restored resource must render the same markup.
     $expectedMarkup = $original->getCrawler()->html();
     $this->assertEquals($expectedMarkup, $restored->getCrawler()->html());
 }
コード例 #2
0
 /**
  * Builds a URI for every node of the document matching this discoverer's CSS selector.
  *
  * The "href" attribute of each matched node is passed to Uri together with
  * the document's own URI string. When that raises a UriSyntaxException the
  * href is reported to the spider's stats handler as failed and the node is
  * skipped.
  *
  * @param Spider $spider spider whose stats handler records the rejected hrefs
  * @param Resource $document resource whose crawler is searched
  * @return UriInterface[] the URIs that could be built, in crawler iteration order
  */
 public function discover(Spider $spider, Resource $document)
 {
     $discovered = array();
     $matches = $document->getCrawler()->filter($this->cssSelector);

     foreach ($matches as $element) {
         $href = $element->getAttribute('href');

         try {
             $baseUri = $document->getUri()->toString();
             $discovered[] = new Uri($href, $baseUri);
         } catch (UriSyntaxException $invalid) {
             $reason = 'Invalid URI: ' . $invalid->getMessage();
             $spider->getStatsHandler()->addToFailed($href, $reason);
         }
     }

     return $discovered;
 }
コード例 #3
0
 /**
  * Extracts all text content from the crawled resource except JavaScript.
  *
  * When a CSS blacklist is configured, every node matching it is first
  * removed from the DOM behind the resource's crawler. The text nodes selected
  * by the XPath query are then trimmed, empty ones are dropped, and the
  * remaining fragments are joined with single spaces.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     if (null !== $this->cssBlacklist) {
         foreach ($crawler->filter($this->cssBlacklist) as $node) {
             // A node without a parent is already detached; there is nothing to remove.
             if (null !== $node->parentNode) {
                 $node->parentNode->removeChild($node);
             }
         }
     }

     // Text nodes of every element below <body>, skipping <script> elements.
     // NOTE(review): text placed directly inside <body> is not matched by this
     // query - confirm that this is intended.
     $query = '//body//*[not(self::script)]/text()';

     // each() collects the closure's return values, so no by-reference
     // accumulator is needed.
     $fragments = $crawler->filterXPath($query)->each(function (Crawler $textNode) {
         return trim($textNode->text());
     });

     // Compare against '' explicitly so that a fragment such as '0' is kept.
     $nonEmpty = array_filter($fragments, function ($fragment) {
         return '' !== $fragment;
     });

     return implode(' ', $nonEmpty);
 }
コード例 #4
0
 /**
  * Applies this object's selector to the crawler of the given resource.
  *
  * @param Resource $resource resource whose crawler is filtered
  *
  * @return mixed the value returned by the crawler's filter() call
  *               (presumably a filtered crawler; confirm against Resource::getCrawler())
  */
 protected function getFilteredCrawler(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     return $crawler->filter($this->selector);
 }
コード例 #5
0
ファイル: ResourceTest.php プロジェクト: ggnet/php-spider
 /**
  * The fixture resource must hand out a Symfony DomCrawler instance.
  *
  * @covers VDB\Spider\Resource::getCrawler
  */
 public function testGetCrawler()
 {
     $expectedClass = 'Symfony\\Component\\DomCrawler\\Crawler';
     $crawler = $this->resource->getCrawler();

     $this->assertInstanceOf($expectedClass, $crawler);
 }