disableCandidateParser() public method

Disables the candidate parser.
public disableCandidateParser ( ) : Scraper
return Scraper
Exemplo n.º 1
0
 /**
  * Runs the scraper against several live article pages and checks whether
  * relevant content is reported for each of them.
  *
  * @group online
  */
 public function testUrlScraper()
 {
     // Pages for which the scraper is expected to report relevant content
     // in its default configuration.
     $articleUrls = array(
         'http://theonion.com.feedsportal.com/c/34529/f/632231/s/309a7fe4/sc/20/l/0L0Stheonion0N0Carticles0Cobama0Ethrows0Eup0Eright0Ethere0Eduring0Esyria0Emeeting0H336850C/story01.htm',
         'http://www.lemonde.fr/proche-orient/article/2013/08/30/la-france-nouvelle-plus-ancienne-alliee-des-etats-unis_3469218_3218.html',
         'http://www.inc.com/suzanne-lucas/why-employee-turnover-is-so-costly.html',
         'http://arstechnica.com/information-technology/2013/08/sysadmin-security-fail-nsa-finds-snowden-hijacked-officials-logins/',
     );

     foreach ($articleUrls as $articleUrl) {
         // A fresh scraper (and config) per URL, so no state is shared between runs.
         $scraper = new Scraper(new Config());
         $scraper->setUrl($articleUrl);
         $scraper->execute();
         $this->assertTrue($scraper->hasRelevantContent());
     }

     // With the candidate parser disabled, this page is expected to yield
     // no relevant content.
     $scraper = new Scraper(new Config());
     $scraper->disableCandidateParser();
     $scraper->setUrl('http://linuxfr.org/news/grammalecte-correcteur-grammatical');
     $scraper->execute();
     $this->assertFalse($scraper->hasRelevantContent());
 }
Exemplo n.º 2
0
 /**
  * Fetch item content with the content grabber.
  *
  * Does nothing when the grabber is disabled or when the item's URL is
  * present in the ignore list. Otherwise a Scraper is executed against the
  * item's URL and, if it reports relevant content, the item's content is
  * replaced with the scraper's filtered content.
  *
  * @param Item $item Item object; its content property may be overwritten
  */
 public function scrapWebsite(Item $item)
 {
     if (!$this->enable_grabber) {
         return;
     }

     $url = $item->getUrl();

     // Strict comparison: with loose (==) matching, a non-string entry in the
     // ignore list (e.g. 0 or true) could match every URL.
     if (in_array($url, $this->grabber_ignore_urls, true)) {
         return;
     }

     $grabber = new Scraper($this->config);
     $grabber->setUrl($url);

     // When this flag is set, candidate parsing is turned off on the scraper
     // before it runs.
     if ($this->grabber_needs_rule_file) {
         $grabber->disableCandidateParser();
     }

     $grabber->execute();

     // Only overwrite the item's content when the scraper found something relevant.
     if ($grabber->hasRelevantContent()) {
         $item->content = $grabber->getFilteredContent();
     }
 }