コード例 #1
0
 /**
  * Extract the article title from the crawled page.
  *
  * The text matched by this adapter's title selector is used when it is
  * non-empty; otherwise the extraction is delegated to the fallback adapter.
  * Note that PHP's empty() also treats the string "0" as empty.
  *
  * @param Crawler $crawler crawler positioned on the article page
  * @return mixed whatever getElementText() or the fallback adapter returns
  */
 public function extractTitle(Crawler $crawler)
 {
     $title = $this->getElementText($crawler, $this->titleSelector);

     // Guard clause: the adapter's own selector produced something usable.
     if (!empty($title)) {
         return $title;
     }

     return $this->fallbackAdapter->extractTitle($crawler);
 }
コード例 #2
0
ファイル: Client.php プロジェクト: zrashwani/news-scrapper
 /**
  * Scrape one source of news: request the listing page, then scrape every
  * article link matched by the selector.
  *
  * @param string   $baseUrl      url to scrape the list of news from
  * @param string   $linkSelector CSS selector or XPath expression for the news links in the page
  * @param int|null $limit        maximum number of articles to scrape,
  *      if not set every link matching the selector is scraped
  * @return array scraped article items, keyed by their normalized link
  */
 public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
 {
     $crawler = $this->scrapClient->request('GET', $baseUrl);
     $scrap_result = [];
     $theAdapter = new Adapters\DefaultAdapter();
     $theAdapter->currentUrl = $baseUrl;
     // Pick the crawler filter method that matches the selector syntax.
     $isXpath = Selector::isXPath($linkSelector);
     $method = $isXpath === false ? 'filter' : 'filterXPath';
     // $limit is only read inside the callback, so it is captured by value;
     // only the result array needs to be shared by reference.
     $crawler->{$method}($linkSelector)->each(function (Crawler $link_node) use (&$scrap_result, $theAdapter, $limit) {
         // each() visits every matched node, so nodes past the limit are skipped one by one.
         if ($limit !== null && count($scrap_result) >= $limit) {
             return;
         }
         // remove hash before scraping
         $link = $theAdapter->normalizeLink($link_node->attr('href'), true);
         // Results are keyed by link, so a repeated link would only overwrite
         // its own entry; skip it instead of fetching the same article again.
         if (isset($scrap_result[$link])) {
             return;
         }
         $article_info = $this->getLinkData($link);
         // reset default adapter after scraping one link
         $this->setAdapter('');
         $scrap_result[$link] = $article_info;
     });
     return $scrap_result;
 }
コード例 #3
0
 /**
  * normalizeBodyLinks() must emit the expected absolute URLs for the HTML
  * fixture and must yield an empty result for empty input.
  */
 public function testNormalizeBodyLinks()
 {
     $defaultAdapter = new Adapters\DefaultAdapter();
     $defaultAdapter->currentUrl = 'http://example.com';

     $normalizedBody = $defaultAdapter->normalizeBodyLinks($this->getHtmlContent());

     // Absolute URLs that must appear in the normalized fixture body.
     $expectedUrls = array(
         "http://example.com/relative-url",
         "http://example.com/another-sub/url",
     );
     foreach ($expectedUrls as $expectedUrl) {
         $this->assertContains($expectedUrl, $normalizedBody);
     }

     // Empty input must stay empty.
     $this->assertEmpty($defaultAdapter->normalizeBodyLinks(''));
 }