/**
 * Clean the document of the supplied article.
 *
 * Loads the article's document into this cleaner and then applies a fixed
 * sequence of removeXPath()/replace()/remove() calls followed by a final
 * convertToParagraph() pass. The order of the calls is significant and must
 * not be rearranged.
 *
 * @param Article $article Article whose document is cleaned
 *
 * @return null Nothing is returned explicitly
 */
public function run(Article $article)
{
    $this->document($article->getDoc());

    // Comment nodes go first.
    $this->removeXPath('//comment()');

    // Callback is true only for inline formatting nodes without an <img> descendant.
    $hasNoImages = function ($element) {
        $images = $element->find('img');

        return !$images->count();
    };
    $this->replace('em, strong, b, i, strike, del, ins', $hasNoImages);

    $this->replace('span[class~=dropcap], span[class~=drop_cap]');

    // Selectors handed to remove() before the bad-tag pass, in order.
    $structuralSelectors = [
        'script, style',
        'header, footer, input, form, button, aside',
    ];
    foreach ($structuralSelectors as $selector) {
        $this->remove($selector);
    }

    $this->removeBadTags();

    // Selectors handed to remove() after the bad-tag pass, in order.
    $attributeSelectors = [
        "[id='caption'],[class='caption']",
        "[id*=' google '],[class*=' google ']",
        "[id*='more']:not([id^=entry-]),[class*='more']:not([class^=entry-])",
        "[id*='facebook']:not([id*='-facebook']),[class*='facebook']:not([class*='-facebook'])",
        "[id*='facebook-broadcasting'],[class*='facebook-broadcasting']",
        "[id*='twitter']:not([id*='-twitter']),[class*='twitter']:not([class*='-twitter'])",
    ];
    foreach ($attributeSelectors as $selector) {
        $this->remove($selector);
    }

    // Callback is true only for spans whose parent exists and is a <p>.
    $isInsideParagraph = function ($element) {
        if ($element->parent() === null) {
            return false;
        }

        return $element->parent()->is('p');
    };
    $this->replace('span', $isInsideParagraph);

    $this->convertToParagraph('div, span, article');
}
/**
 * Verifies that getDateFromURL yields the expected value for an article
 * whose final URL is the provided one.
 *
 * @dataProvider getDateFromURLProvider
 *
 * @param mixed  $expected Value getDateFromURL is expected to produce
 * @param string $url      Final URL assigned to the article under test
 * @param string $message  Failure message passed to the assertion
 */
public function testGetDateFromURL($expected, $url, $message)
{
    // Arrange: an article that only carries the final URL.
    $subject = new Article();
    $subject->setFinalUrl($url);
    $this->setArticle($subject);

    // Act
    $actual = $this->call('getDateFromURL');

    // Assert
    $this->assertEquals($expected, $actual, $message);
}
/**
 * Collect the p, td and pre nodes of the article document that look like
 * body text.
 *
 * A node is kept when its text yields a stop-word count greater than 2 and
 * isHighLinkDensity() does not flag it.
 *
 * @param Article $article Article whose document is searched
 *
 * @return array Matching nodes, in the order they were found
 */
private function getTopNodeCandidatesByContents(Article $article)
{
    $candidates = [];

    foreach ($article->getDoc()->find('p, td, pre') as $element) {
        $stats = $this->config()->getStopWords()->getStopwordCount($element->text());
        $linkHeavy = $this->isHighLinkDensity($element);

        // Skip nodes with too few stop words or too many links.
        if ($stats->getStopWordCount() <= 2 || $linkHeavy) {
            continue;
        }

        $candidates[] = $element;
    }

    return $candidates;
}
/**
 * Build an Article for the given URL by running the cleaner, extractor and
 * formatter modules over its HTML.
 *
 * When $rawHTML is empty the HTML is obtained through getHTML() using the
 * cleaned URL; otherwise the supplied markup is used as-is.
 *
 * libxml internal error handling is switched on for the duration of the
 * crawl. The previous setting is restored in a finally block, so an
 * exception raised while fetching, parsing or running a module no longer
 * leaves the process-wide libxml error mode changed for the caller.
 *
 * @param string      $url     URL of the article
 * @param string|null $rawHTML Optional pre-fetched HTML for the URL
 *
 * @return Article
 */
public function crawl($url, $rawHTML = null)
{
    $article = new Article();

    $parseCandidate = Helper::getCleanedUrl($url);

    // Remember the caller's libxml error mode so it can be restored afterwards.
    $xmlInternalErrors = libxml_use_internal_errors(true);

    try {
        if (empty($rawHTML)) {
            $rawHTML = $this->getHTML($parseCandidate->url);
        }

        // Generate document
        $doc = $this->getDocument($rawHTML);

        // Set core mutators
        $article->setFinalUrl($parseCandidate->url);
        $article->setDomain($parseCandidate->parts->host);
        $article->setLinkhash($parseCandidate->linkhash);
        $article->setRawHtml($rawHTML);
        $article->setDoc($doc);
        // Keep an untouched copy; the modules below operate on $doc.
        $article->setRawDoc(clone $doc);

        // Pre-extraction document cleaning
        $this->modules('cleaners', $article);

        // Extract content
        $this->modules('extractors', $article);

        // Post-extraction content formatting
        $this->modules('formatters', $article);
    } finally {
        // Always restore the previous libxml error mode, even on failure.
        libxml_use_internal_errors($xmlInternalErrors);
    }

    return $article;
}
/**
 * Create an Article whose document is built from the given HTML via
 * document().
 *
 * @param string $html Markup handed to document()
 *
 * @return Article Fresh article with only its document set
 */
private function generate($html)
{
    $result = new Article();

    $doc = $this->document($html);
    $result->setDoc($doc);

    return $result;
}