/**
 * Run the scraper against several remote article URLs and check whether
 * it reports relevant content.
 *
 * @group online
 */
public function testUrlScraper()
{
    // Pages for which a default scraper is expected to find relevant content.
    $articleUrls = array(
        'http://theonion.com.feedsportal.com/c/34529/f/632231/s/309a7fe4/sc/20/l/0L0Stheonion0N0Carticles0Cobama0Ethrows0Eup0Eright0Ethere0Eduring0Esyria0Emeeting0H336850C/story01.htm',
        'http://www.lemonde.fr/proche-orient/article/2013/08/30/la-france-nouvelle-plus-ancienne-alliee-des-etats-unis_3469218_3218.html',
        'http://www.inc.com/suzanne-lucas/why-employee-turnover-is-so-costly.html',
        'http://arstechnica.com/information-technology/2013/08/sysadmin-security-fail-nsa-finds-snowden-hijacked-officials-logins/',
    );

    foreach ($articleUrls as $articleUrl) {
        // A fresh scraper (and fresh config) is built for every URL.
        $scraper = new Scraper(new Config());
        $scraper->setUrl($articleUrl);
        $scraper->execute();

        $this->assertTrue($scraper->hasRelevantContent());
    }

    // With the candidate parser disabled, this page must yield no relevant content.
    $scraper = new Scraper(new Config());
    $scraper->disableCandidateParser();
    $scraper->setUrl('http://linuxfr.org/news/grammalecte-correcteur-grammatical');
    $scraper->execute();

    $this->assertFalse($scraper->hasRelevantContent());
}
/**
 * Replace the item content with the content scraped from the item's URL.
 *
 * Nothing is done when the grabber is not enabled or when the item URL is
 * listed in the ignored URLs. The item content is only overwritten when the
 * scraper reports relevant content.
 *
 * @param Item $item Item object
 */
public function scrapWebsite(Item $item)
{
    if (!$this->enable_grabber) {
        return;
    }

    // in_array() is called without the strict flag, so the comparison is loose.
    if (in_array($item->getUrl(), $this->grabber_ignore_urls)) {
        return;
    }

    $scraper = new Scraper($this->config);
    $scraper->setUrl($item->getUrl());

    // When a rule file is required, the candidate parser is switched off
    // before the scraper runs.
    if ($this->grabber_needs_rule_file) {
        $scraper->disableCandidateParser();
    }

    $scraper->execute();

    if (!$scraper->hasRelevantContent()) {
        return;
    }

    $item->content = $scraper->getFilteredContent();
}