/**
 * Checks the URLs produced by DefaultAdapter::normalizeLink() for several
 * combinations of the adapter's current URL and the link handed to it.
 */
public function testNormalizeLink()
{
    $normalizer = new Adapters\DefaultAdapter();

    // Current page lives in a sub-folder of a remote host.
    $normalizer->currentUrl = 'http://example.com/subfolder/';

    // A root-relative link is expected to become absolute on the same host.
    $this->assertEquals(
        'http://example.com/another-url',
        $normalizer->normalizeLink("/another-url")
    );

    // An already absolute link is expected to come back unchanged.
    $this->assertEquals(
        "http://example2.com/whatever",
        $normalizer->normalizeLink("http://example2.com/whatever")
    );

    // A bare relative link is expected to resolve to the host root.
    $this->assertEquals(
        'http://example.com/in-sub',
        $normalizer->normalizeLink('in-sub')
    );

    // A protocol-relative link is expected to take the scheme of the current URL.
    $normalizer->currentUrl = 'https://securedurl.com/';
    $this->assertEquals(
        'https://example3.com',
        $normalizer->normalizeLink("//example3.com")
    );

    // The current URL has no trailing slash; a separator is expected in the result.
    $normalizer->currentUrl = 'http://example5.com';
    $this->assertEquals(
        'http://example5.com/img.png',
        $normalizer->normalizeLink("img.png")
    );

    // The current "URL" is a local file path; a sibling file is expected to
    // resolve into the same directory.
    $normalizer->currentUrl = __DIR__ . '/../../data/jsonld2.html';
    $this->assertEquals(
        __DIR__ . '/../../data/jsonld.js',
        $normalizer->normalizeLink('jsonld.js')
    );

    // With the second argument set to true the fragment is expected to be removed...
    $this->assertEquals(
        'http://example.com/subpage',
        $normalizer->normalizeLink('http://example.com/subpage#hash', true)
    );

    // ...and without it the fragment is expected to be kept.
    $this->assertEquals(
        'http://example.com/subpage#hash',
        $normalizer->normalizeLink('http://example.com/subpage#hash')
    );
}
/**
 * Scrape the articles linked from a single listing page.
 *
 * The page at $baseUrl is requested, the nodes matching $linkSelector are
 * visited in document order, and each node's href is normalized against
 * $baseUrl (with its #fragment removed) before being handed to getLinkData().
 * Nodes without a usable href are skipped, and a link that appears more than
 * once on the page is scraped only the first time it is seen.
 *
 * @param string   $baseUrl      url of the page to collect news links from
 * @param string   $linkSelector CSS selector or XPath expression matching the
 *                               link nodes in the page
 * @param int|null $limit        maximum number of distinct links to scrape;
 *                               when null every matching link is scraped
 * @return array article data as returned by getLinkData(), keyed by the
 *               normalized link url
 */
public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
{
    $crawler = $this->scrapClient->request('GET', $baseUrl);

    // Links found in the page are resolved relative to the listing page itself.
    $theAdapter = new Adapters\DefaultAdapter();
    $theAdapter->currentUrl = $baseUrl;

    // The selector may be CSS or XPath; choose the matching crawler method.
    $method = Selector::isXPath($linkSelector) === false ? 'filter' : 'filterXPath';

    $scrap_result = array();

    $crawler->{$method}($linkSelector)->each(
        function (Crawler $link_node) use (&$scrap_result, $theAdapter, $limit) {
            // each() cannot be stopped early, so remaining nodes are ignored
            // once enough distinct links have been collected.
            if ($limit !== null && count($scrap_result) >= $limit) {
                return;
            }

            // The selector may match nodes that carry no href at all.
            $href = $link_node->attr('href');
            if ($href === null || trim($href) === '') {
                return;
            }

            // Remove the hash before scrapping so that anchors into the same
            // page collapse to a single result key.
            $link = $theAdapter->normalizeLink($href, true);

            // Already scraped: avoid fetching the same page again just to
            // overwrite an existing entry.
            if (array_key_exists($link, $scrap_result)) {
                return;
            }

            $article_info = $this->getLinkData($link);
            $this->setAdapter(''); //reset default adapter after scrapping one link
            $scrap_result[$link] = $article_info;
        }
    );

    return $scrap_result;
}