Author: Frederic Guillot
Inheritance: extends PicoFeed\Base
 /**
  * Runs one Scraper instance against two penny-arcade.com pages in a row and
  * checks the relevant content reported for each of them.
  *
  * Requires network access, hence the "online" group.
  *
  * @group online
  */
 public function testGrabContentRegex()
 {
     $comicUrl = 'http://penny-arcade.com/comic/2015/04/13/101-part-one';
     $newsUrl = 'http://penny-arcade.com/news/post/2015/04/15/101-part-two';
     $expectedComicMarkup = '<img src="http://art.penny-arcade.com/photos/i-tBMHkzG/0/1050x10000/i-tBMHkzG-1050x10000.jpg" alt="101, Part One"/>';

     $scraper = new Scraper(new Config());

     // Comic page: the relevant content must be exactly the comic image tag.
     $scraper->setUrl($comicUrl);
     $scraper->execute();
     $this->assertTrue($scraper->hasRelevantContent());
     $this->assertEquals($expectedComicMarkup, $scraper->getRelevantContent());

     // News page, reusing the same instance: only the title has to appear.
     $scraper->setUrl($newsUrl);
     $scraper->execute();
     $this->assertTrue($scraper->hasRelevantContent());
     $this->assertContains('101, Part Two', $scraper->getRelevantContent());
 }
Example #2
0
 /**
  * Fetch item content with the content grabber.
  *
  * Nothing happens when the grabber is disabled or when the item URL is
  * listed in the ignore list. Otherwise a Scraper is run on the item URL and
  * the item content is overwritten only if the scraper reports relevant
  * content; in every other case the item is left untouched.
  *
  * @param Item $item Item object; its content property may be replaced
  */
 public function scrapWebsite(Item $item)
 {
     if (!$this->enable_grabber) {
         return;
     }

     $url = $item->getUrl();

     // Strict comparison: a loose in_array() applies PHP type juggling and
     // could match an ignore-list entry that is not the same URL string.
     if (in_array($url, $this->grabber_ignore_urls, true)) {
         return;
     }

     $grabber = new Scraper($this->config);
     $grabber->setUrl($url);

     // When this flag is set, the scraper's candidate parser is turned off
     // before running.
     if ($this->grabber_needs_rule_file) {
         $grabber->disableCandidateParser();
     }

     $grabber->execute();

     if ($grabber->hasRelevantContent()) {
         $item->content = $grabber->getFilteredContent();
     }
 }
Example #3
0
/**
 * Run the scraper on a URL and return the filtered content it extracted.
 *
 * The scraper is configured with the result of Config\get_reader_config().
 *
 * @param string $url URL handed to the scraper
 *
 * @return string Filtered content, or an empty string when the scraper
 *                reports no relevant content
 */
function download_content_url($url)
{
    $scraper = new Scraper(Config\get_reader_config());
    $scraper->setUrl($url);
    $scraper->execute();

    if (!$scraper->hasRelevantContent()) {
        return '';
    }

    return $scraper->getFilteredContent();
}