Example #1
0
 /**
  * Run the cleaning pipeline over the supplied article's document.
  *
  * The document obtained from $article->getDoc() is handed to document(), after
  * which a fixed sequence of removeXPath(), replace(), remove(), removeBadTags()
  * and convertToParagraph() calls is issued in the order written below.
  *
  * NOTE(review): all of those helpers are defined outside this method; later
  * steps presumably observe the changes made by earlier ones, so do not reorder
  * the calls without confirming that.
  *
  * @param Article $article Article whose document is to be cleaned
  *
  * @return void
  */
 public function run(Article $article)
 {
     $this->document($article->getDoc());

     // Comment nodes first: the XPath comment() test selects every one of them.
     $this->removeXPath('//comment()');

     // Callback for inline-formatting tags: truthy only when find('img') on the
     // node matches nothing.
     // NOTE(review): presumably replace() unwraps a node when the callback is
     // truthy, so formatting tags wrapping an image are kept - confirm in replace().
     $lacksImage = function ($node) {
         $images = $node->find('img');

         return !$images->count();
     };
     $this->replace('em, strong, b, i, strike, del, ins', $lacksImage);

     // Drop-cap spans are passed to replace() with no callback.
     $this->replace('span[class~=dropcap], span[class~=drop_cap]');

     // Plain tag-name selectors, removed before removeBadTags() runs.
     $tagSelectors = [
         'script, style',
         'header, footer, input, form, button, aside',
     ];
     foreach ($tagSelectors as $selector) {
         $this->remove($selector);
     }

     $this->removeBadTags();

     // Attribute-based selectors, removed after removeBadTags(), in this order.
     $attributeSelectors = [
         // id/class exactly equal to "caption".
         "[id='caption'],[class='caption']",
         // NOTE(review): the literal is ' google ' including the surrounding
         // spaces - confirm that is intended.
         "[id*=' google '],[class*=' google ']",
         // Contains "more", unless the value starts with "entry-".
         "[id*='more']:not([id^=entry-]),[class*='more']:not([class^=entry-])",
         // Contains "facebook", unless it also contains "-facebook".
         "[id*='facebook']:not([id*='-facebook']),[class*='facebook']:not([class*='-facebook'])",
         // Contains "facebook-broadcasting" (not caught by the selector above,
         // because of its :not() exclusion).
         "[id*='facebook-broadcasting'],[class*='facebook-broadcasting']",
         // Contains "twitter", unless it also contains "-twitter".
         "[id*='twitter']:not([id*='-twitter']),[class*='twitter']:not([class*='-twitter'])",
     ];
     foreach ($attributeSelectors as $selector) {
         $this->remove($selector);
     }

     // Callback for spans: false when the span has no parent, otherwise whatever
     // the parent's is('p') check yields.
     $this->replace('span', function ($node) {
         $parent = $node->parent();

         return is_null($parent) ? false : $parent->is('p');
     });

     $this->convertToParagraph('div, span, article');
 }
 /**
  * Collect the nodes of the article's document that qualify as top-node candidates.
  *
  * Every node matched by the selector 'p, td, pre' is examined. A node is kept
  * when the configured stop-word counter reports more than two stop words for
  * its text and isHighLinkDensity() does not flag it.
  *
  * @param Article $article Article whose document is searched
  *
  * @return array Qualifying nodes, in the order find() returned them
  */
 private function getTopNodeCandidatesByContents(Article $article)
 {
     $candidates = [];

     foreach ($article->getDoc()->find('p, td, pre') as $element) {
         $stats = $this->config()->getStopWords()->getStopwordCount($element->text());
         $linkHeavy = $this->isHighLinkDensity($element);

         // Skip nodes with too few stop words or with a high link density.
         if ($stats->getStopWordCount() <= 2 || $linkHeavy) {
             continue;
         }

         $candidates[] = $element;
     }

     return $candidates;
 }