Example 1
 /**
  * Scrape one source of news: fetch $baseUrl, collect the article links
  * matching $linkSelector and scrape each linked article.
  *
  * @param string   $baseUrl      url to scrap the list of news from
  * @param string   $linkSelector css selector or xpath expression for news links in page
  * @param int|NULL $limit        limit of news articles to scrap;
  *      if not set it will scrap all links matching the selector
  * @return array article items scrapped, keyed by normalized link url
  *      (duplicate links overwrite each other)
  */
 public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
 {
     $crawler = $this->scrapClient->request('GET', $baseUrl);
     $scrap_result = [];
     $theAdapter = new Adapters\DefaultAdapter();
     $theAdapter->currentUrl = $baseUrl;
     // pick the Crawler filter method matching the selector flavour
     $method = Selector::isXPath($linkSelector) ? 'filterXPath' : 'filter';
     // NOTE: Crawler::each() cannot break early, so the limit is re-checked on
     // every node and nodes past the limit are skipped cheaply. $limit is
     // captured by value — the closure never modifies it.
     $crawler->{$method}($linkSelector)->each(function (Crawler $link_node) use (&$scrap_result, $theAdapter, $limit) {
         if ($limit !== null && count($scrap_result) >= $limit) {
             return;
         }
         // normalizeLink(..., true) presumably removes the hash fragment
         // before scrapping — TODO confirm against the adapter implementation
         $link = $theAdapter->normalizeLink($link_node->attr('href'), true);
         $article_info = $this->getLinkData($link);
         // reset default adapter after scrapping one link
         $this->setAdapter('');
         $scrap_result[$link] = $article_info;
     });
     return $scrap_result;
 }
 /**
  * Extract an image "src" attribute by selector (css or xpath).
  * When the selector matches several images, the src of the LAST match wins,
  * because the extraction closure overwrites $ret on every node.
  *
  * @param  Crawler $crawler
  * @param  string  $selector css selector or xpath expression
  * @return string|NULL normalized image url, or null when nothing matched
  *      or the matched element has no usable src
  */
 protected function getSrcByImgSelector(Crawler $crawler, $selector)
 {
     $ret = null;
     // each() ignores the callback's return value, so the attribute is
     // captured by reference instead
     $imgExtractClosure = function (Crawler $node) use (&$ret) {
         $ret = $node->attr('src');
     };
     if (Selector::isXPath($selector)) {
         $crawler->filterXPath($selector)->each($imgExtractClosure);
     } else {
         $crawler->filter($selector)->each($imgExtractClosure);
     }
     // explicit null/'' check instead of empty(): empty() would also reject
     // the unusual-but-valid attribute value "0"
     if ($ret !== null && $ret !== '') {
         return $this->normalizeLink($ret);
     }
     return null;
 }
Example 3
 /**
  * Get the html content of an element located by selector (css or xpath).
  *
  * @param Crawler  $crawler
  * @param string   $selector css selector or xpath expression
  * @param \Closure $extractClosure optional callback used for extraction;
  *      note that a custom closure must capture its own result — this
  *      method then returns null
  * @return string|NULL inner html of the last matching element, or null
  *      when the selector is empty or nothing matched
  */
 protected function getElementText(Crawler $crawler, $selector, $extractClosure = null)
 {
     // nothing to look up without a selector
     if (empty($selector)) {
         return null;
     }
     $result = null;
     // default extraction grabs the node's inner html by reference
     $callback = $extractClosure ?: function (Crawler $node) use (&$result) {
         $result = $node->html();
     };
     $filterMethod = Selector::isCSS($selector) ? 'filter' : 'filterXPath';
     $crawler->{$filterMethod}($selector)->each($callback);
     return $result;
 }