/**
 * Runs the scraper against one comic page and one news post and checks
 * the relevant content extracted from each.
 *
 * NOTE(review): tagged "online" - execute() presumably performs live HTTP
 * requests, so the expected values depend on the remote pages staying
 * unchanged.
 *
 * @group online
 */
public function testGrabContentRegex()
{
    $comicUrl = 'http://penny-arcade.com/comic/2015/04/13/101-part-one';
    $newsUrl = 'http://penny-arcade.com/news/post/2015/04/15/101-part-two';

    $expectedComicMarkup = '<img src="http://art.penny-arcade.com/photos/i-tBMHkzG/0/1050x10000/i-tBMHkzG-1050x10000.jpg" alt="101, Part One"/>';
    $expectedNewsFragment = '101, Part Two';

    $scraper = new Scraper(new Config());

    // First page: the extracted content must match the expected markup exactly.
    $scraper->setUrl($comicUrl);
    $scraper->execute();

    $this->assertTrue($scraper->hasRelevantContent());
    $this->assertEquals($expectedComicMarkup, $scraper->getRelevantContent());

    // Second page: the same scraper instance is reused with a new URL, and
    // only a fragment of the extracted content is checked.
    $scraper->setUrl($newsUrl);
    $scraper->execute();

    $this->assertTrue($scraper->hasRelevantContent());
    $this->assertContains($expectedNewsFragment, $scraper->getRelevantContent());
}
/**
 * Fetch item content with the content grabber.
 *
 * Nothing happens when the grabber is disabled or when the item URL is
 * listed in the ignore list. Otherwise the item URL is scraped and, only
 * if the scraper reports relevant content, the item content is replaced
 * by the scraper's filtered content.
 *
 * @param Item $item Item object (its content property may be overwritten)
 */
public function scrapWebsite(Item $item)
{
    if (!$this->enable_grabber) {
        return;
    }

    $url = $item->getUrl();

    // Strict comparison: a URL is ignored only when it is identical to an
    // entry of the ignore list, never through PHP's loose type juggling
    // (e.g. null == '' or numeric-looking strings comparing equal).
    if (in_array($url, $this->grabber_ignore_urls, true)) {
        return;
    }

    $grabber = new Scraper($this->config);
    $grabber->setUrl($url);

    // When a rule file is required, the candidate parser is switched off
    // before running the scraper.
    if ($this->grabber_needs_rule_file) {
        $grabber->disableCandidateParser();
    }

    $grabber->execute();

    // Keep the original item content when nothing relevant was found.
    if ($grabber->hasRelevantContent()) {
        $item->content = $grabber->getFilteredContent();
    }
}
/**
 * Scrape the given URL and return its filtered content.
 *
 * @param  string $url URL handed to the scraper
 * @return string      Filtered content, or an empty string when the scraper
 *                     reports no relevant content
 */
function download_content_url($url)
{
    $scraper = new Scraper(Config\get_reader_config());
    $scraper->setUrl($url);
    $scraper->execute();

    if (!$scraper->hasRelevantContent()) {
        return '';
    }

    return $scraper->getFilteredContent();
}