/**
 * Extract the article title, falling back to the secondary adapter
 * when the configured title selector yields nothing.
 *
 * @param Crawler $crawler page crawler to read the title from
 * @return mixed title text from the selector, or the fallback adapter's result
 */
public function extractTitle(Crawler $crawler)
{
    $title = $this->getElementText($crawler, $this->titleSelector);
    if (empty($title)) {
        $title = $this->fallbackAdapter->extractTitle($crawler);
    }
    return $title;
}
/**
 * scrap one source of news
 *
 * @param string   $baseUrl      url to scrap list of news from
 * @param string   $linkSelector css (or xpath) selector for news links in page
 * @param int|null $limit        limit of news article to scrap,
 *                               if not set it will scrap all matching the selector
 * @return array array of article items scrapped, keyed by normalized link
 */
public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
{
    $crawler = $this->scrapClient->request('GET', $baseUrl);

    $scrapResult = [];
    $theAdapter = new Adapters\DefaultAdapter();
    $theAdapter->currentUrl = $baseUrl;

    // The selector may be CSS or XPath; pick the matching Crawler filter method.
    $method = Selector::isXPath($linkSelector) === false ? 'filter' : 'filterXPath';

    // NOTE: $limit is captured by value — it is only read inside the closure,
    // so the original by-reference capture was unnecessary.
    $crawler->{$method}($linkSelector)->each(function (Crawler $linkNode) use (&$scrapResult, $theAdapter, $limit) {
        // Crawler::each() cannot break early, so once the limit is reached
        // the remaining nodes are simply skipped.
        if ($limit !== null && count($scrapResult) >= $limit) {
            return;
        }
        $link = $theAdapter->normalizeLink($linkNode->attr('href'), true); //remove hash before scrapping
        $articleInfo = $this->getLinkData($link);
        $this->setAdapter(''); //reset default adapter after scrapping one link
        $scrapResult[$link] = $articleInfo;
    });

    return $scrapResult;
}
/**
 * normalizeBodyLinks() should rewrite relative hrefs in the HTML body to
 * absolute URLs based on the adapter's currentUrl, and return an empty
 * result for empty input.
 */
public function testNormalizeBodyLinks()
{
    $adapter = new Adapters\DefaultAdapter();
    $adapter->currentUrl = 'http://example.com';

    $html = $this->getHtmlContent();
    $htmlNormalized = $adapter->normalizeBodyLinks($html);

    // assertContains() on strings was deprecated in PHPUnit 8 and removed in
    // PHPUnit 9; assertStringContainsString() is the correct substring assertion.
    $this->assertStringContainsString("http://example.com/relative-url", $htmlNormalized);
    $this->assertStringContainsString("http://example.com/another-sub/url", $htmlNormalized);

    $htmlNormalized2 = $adapter->normalizeBodyLinks('');
    $this->assertEmpty($htmlNormalized2);
}