/**
 * Clean the document belonging to the given article.
 *
 * The clean-up steps run in a fixed order: comments are stripped, inline
 * formatting wrappers are replaced, unwanted elements are removed by
 * selector, and finally div/span/article elements are handed to
 * convertToParagraph().
 *
 * @param Article $article Article whose document (getDoc()) is cleaned
 *
 * @return void Nothing is returned explicitly
 */
public function run(Article $article)
{
    // Predicate for inline formatting tags: true when the element holds no <img>.
    $hasNoImages = function ($element) {
        $imageCount = $element->find('img')->count();

        return !$imageCount;
    };

    // Predicate for spans: true only when the element's parent matches 'p'.
    $isChildOfParagraph = function ($element) {
        $parent = $element->parent();

        if ($parent === null) {
            return false;
        }

        return $parent->is('p');
    };

    // Selectors removed before removeBadTags() runs (order preserved).
    $structuralSelectors = [
        'script, style',
        'header, footer, input, form, button, aside',
    ];

    // Selectors removed after removeBadTags() runs (order preserved).
    $attributeSelectors = [
        "[id='caption'],[class='caption']",
        "[id*=' google '],[class*=' google ']",
        "[id*='more']:not([id^=entry-]),[class*='more']:not([class^=entry-])",
        "[id*='facebook']:not([id*='-facebook']),[class*='facebook']:not([class*='-facebook'])",
        "[id*='facebook-broadcasting'],[class*='facebook-broadcasting']",
        "[id*='twitter']:not([id*='-twitter']),[class*='twitter']:not([class*='-twitter'])",
    ];

    $this->document($article->getDoc());

    $this->removeXPath('//comment()');

    $this->replace('em, strong, b, i, strike, del, ins', $hasNoImages);
    $this->replace('span[class~=dropcap], span[class~=drop_cap]');

    foreach ($structuralSelectors as $selector) {
        $this->remove($selector);
    }

    $this->removeBadTags();

    foreach ($attributeSelectors as $selector) {
        $this->remove($selector);
    }

    $this->replace('span', $isChildOfParagraph);

    $this->convertToParagraph('div, span, article');
}
/**
 * Collect the nodes of the article document that qualify as top-node
 * candidates based on their text content.
 *
 * Every element matching 'p, td, pre' is inspected. A node is kept when its
 * text yields a stop-word count greater than 2 and isHighLinkDensity() does
 * not flag it.
 *
 * @param Article $article Article whose document (getDoc()) is searched
 *
 * @return array Qualifying nodes, in the order the find() result yields them
 */
private function getTopNodeCandidatesByContents(Article $article)
{
    $results = [];

    $nodes = $article->getDoc()->find('p, td, pre');

    foreach ($nodes as $node) {
        $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());

        // The link-density check sits on the right-hand side of && so it is
        // skipped for nodes already rejected by the stop-word threshold.
        // NOTE(review): assumes isHighLinkDensity() is a side-effect-free
        // predicate - confirm against its definition.
        if ($wordStats->getStopWordCount() > 2 && !$this->isHighLinkDensity($node)) {
            $results[] = $node;
        }
    }

    return $results;
}