/**
 * Scrap one source of news: request the listing page and extract article
 * data for every link matched by the selector.
 *
 * @param string   $baseUrl      url to scrap list of news from
 * @param string   $linkSelector css selector (or xpath expression) for news links in page
 * @param int|null $limit        limit of news articles to scrap;
 *                               if not set it will scrap all matching the selector
 * @return array array of article items scrapped, keyed by normalized link url
 */
public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
{
    $crawler = $this->scrapClient->request('GET', $baseUrl);
    $scrap_result = [];

    $theAdapter = new Adapters\DefaultAdapter();
    $theAdapter->currentUrl = $baseUrl;

    // Pick the crawler filter method based on the selector flavour (css vs xpath).
    $isXpath = Selector::isXPath($linkSelector);
    $method = $isXpath === false ? 'filter' : 'filterXPath';

    $crawler->{$method}($linkSelector)
        ->each(function (Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
            // each() cannot break out of iteration, so once the limit is
            // reached we simply skip every remaining node.
            if (!is_null($limit) && count($scrap_result) >= $limit) {
                return;
            }

            $link = $theAdapter->normalizeLink($link_node->attr('href'), true); //remove hash before scrapping

            // Skip links already scrapped in this run: listing pages often
            // repeat the same article link (e.g. image + headline anchors),
            // and re-scrapping only re-issues the HTTP request to overwrite
            // the same array key with the same data.
            if (array_key_exists($link, $scrap_result)) {
                return;
            }

            $article_info = $this->getLinkData($link);
            $this->setAdapter(''); //reset default adapter after scrapping one link

            $scrap_result[$link] = $article_info;
        });

    return $scrap_result;
}
/**
 * Extract an image source ("src" attribute) using the given selector.
 *
 * @param Crawler $crawler  document fragment to search in
 * @param string  $selector css selector or xpath expression pointing at the image element
 * @return string|null normalized image url, or null when nothing matched
 */
protected function getSrcByImgSelector(Crawler $crawler, $selector)
{
    $src = null;

    // Capture the src attribute; with multiple matches the last one wins.
    $grabSrc = function (Crawler $imgNode) use (&$src) {
        $src = $imgNode->attr('src');
    };

    $filterMethod = Selector::isXPath($selector) ? 'filterXPath' : 'filter';
    $crawler->{$filterMethod}($selector)->each($grabSrc);

    if (empty($src) === false) {
        return $this->normalizeLink($src);
    }

    return null;
}
/**
 * Get the inner html of an element matched by selector (css selector or xpath).
 *
 * @param Crawler       $crawler        document fragment to search in
 * @param string        $selector       css selector or xpath expression
 * @param \Closure|null $extractClosure callback function to be used for extraction
 * @return string|null extracted html of the (last) matched node, or null when
 *                     the selector is empty, nothing matched, or a custom
 *                     closure was supplied (it reports through its own captures)
 */
protected function getElementText(Crawler $crawler, $selector, $extractClosure = null)
{
    if (empty($selector) === true) {
        return null;
    }

    $text = null;
    if ($extractClosure === null) {
        // Default extractor: keep the inner html of the matched node.
        $extractClosure = function (Crawler $matchedNode) use (&$text) {
            $text = $matchedNode->html();
        };
    }

    $filterMethod = Selector::isCSS($selector) ? 'filter' : 'filterXPath';
    $crawler->{$filterMethod}($selector)->each($extractClosure);

    return $text;
}