/**
 * Exercises DefaultAdapter::normalizeLink() with several base urls
 * (assigned through $currentUrl) and several kinds of link.
 */
 public function testNormalizeLink()
 {
     $normalizer = new Adapters\DefaultAdapter();

     // --- base url: a page inside a sub-folder ---
     $normalizer->currentUrl = 'http://example.com/subfolder/';
     // a root-relative path becomes an absolute url on the base host
     $this->assertEquals(
         'http://example.com/another-url',
         $normalizer->normalizeLink('/another-url')
     );
     // an already absolute url is returned untouched
     $this->assertEquals(
         'http://example2.com/whatever',
         $normalizer->normalizeLink('http://example2.com/whatever')
     );
     // a bare relative name is expected directly under the host root
     $this->assertEquals(
         'http://example.com/in-sub',
         $normalizer->normalizeLink('in-sub')
     );

     // --- base url: https host ---
     $normalizer->currentUrl = 'https://securedurl.com/';
     // a protocol-relative link takes over the scheme of the base url
     $this->assertEquals(
         'https://example3.com',
         $normalizer->normalizeLink('//example3.com')
     );

     // --- base url: host without a trailing slash ---
     $normalizer->currentUrl = 'http://example5.com';
     $this->assertEquals(
         'http://example5.com/img.png',
         $normalizer->normalizeLink('img.png')
     );

     // --- base url: a local file path ---
     $normalizer->currentUrl = __DIR__ . '/../../data/jsonld2.html';
     // a sibling file name is expected in the directory of the base file
     $this->assertEquals(
         __DIR__ . '/../../data/jsonld.js',
         $normalizer->normalizeLink('jsonld.js')
     );

     // with the second argument set to true the hash part is expected to be dropped
     $this->assertEquals(
         'http://example.com/subpage',
         $normalizer->normalizeLink('http://example.com/subpage#hash', true)
     );
     // without it the hash part is expected to be kept
     $this->assertEquals(
         'http://example.com/subpage#hash',
         $normalizer->normalizeLink('http://example.com/subpage#hash')
     );
 }
Пример #2
0
 /**
  * Scrape one source of news.
  *
  * Requests $baseUrl, selects the nodes matching $linkSelector and passes the
  * normalized href of each of them to getLinkData(). Nodes without a href are
  * skipped and a link that occurs several times is scraped only once.
  *
  * @param string   $baseUrl      url to scrape the list of news from
  * @param string   $linkSelector CSS selector or XPath expression for the news links in the page
  * @param int|null $limit        maximum number of articles to scrape,
  *      if not set every link matching the selector is scraped
  * @return array scraped article items, keyed by their normalized link
  */
 public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
 {
     $crawler = $this->scrapClient->request('GET', $baseUrl);
     $scrap_result = array();
     $theAdapter = new Adapters\DefaultAdapter();
     $theAdapter->currentUrl = $baseUrl;
     // the selector may be an XPath expression instead of a CSS selector
     $isXpath = Selector::isXPath($linkSelector);
     $method = $isXpath === false ? 'filter' : 'filterXPath';
     $crawler->{$method}($linkSelector)->each(function (Crawler $link_node) use (&$scrap_result, $theAdapter, $limit) {
         if (!is_null($limit) && count($scrap_result) >= $limit) {
             return;
         }
         $href = $link_node->attr('href');
         if ($href === null || $href === '') {
             // the matched node carries no link target, there is nothing to scrape
             return;
         }
         // make the link absolute and remove its hash before scraping
         $link = $theAdapter->normalizeLink($href, true);
         if (array_key_exists($link, $scrap_result)) {
             // same article linked more than once: do not request it again
             return;
         }
         $article_info = $this->getLinkData($link);
         // reset default adapter after scraping one link
         $this->setAdapter('');
         $scrap_result[$link] = $article_info;
     });
     return $scrap_result;
 }