/**
 * Checks the result of the 'getDateFromURL' call for an article whose
 * final URL is the supplied one.
 *
 * @dataProvider getDateFromURLProvider
 *
 * @param mixed  $expected Value the 'getDateFromURL' call is expected to produce
 * @param string $url      URL assigned to the article as its final URL
 * @param string $message  Message forwarded to the assertion
 */
public function testGetDateFromURL($expected, $url, $message)
{
    // Arrange: an otherwise empty article that only carries the final URL.
    $subject = new Article();
    $subject->setFinalUrl($url);
    $this->setArticle($subject);

    // Act: invoke the method under test through the call() helper.
    $actual = $this->call('getDateFromURL');

    // Assert
    $this->assertEquals($expected, $actual, $message);
}
/**
 * Builds an Article for the given URL: resolves the cleaned URL, obtains the
 * HTML (fetching it via getHTML() when none is supplied), turns it into a
 * document, populates the article's core fields and then runs the
 * 'cleaners', 'extractors' and 'formatters' module groups over it, in that
 * order.
 *
 * libxml's internal error handling is switched on for the duration of the
 * call and restored to its previous setting afterwards, including when an
 * exception is thrown part-way through.
 *
 * @param string      $url     URL of the page to crawl
 * @param string|null $rawHTML Optional HTML to use instead of fetching the
 *                             URL; an empty() value triggers a fetch
 *
 * @return Article
 */
public function crawl($url, $rawHTML = null)
{
    $article = new Article();

    $parseCandidate = Helper::getCleanedUrl($url);

    // This flips process-global libxml state; remember the previous value
    // so it can be put back no matter how this method exits.
    $xmlInternalErrors = libxml_use_internal_errors(true);

    try {
        if (empty($rawHTML)) {
            $rawHTML = $this->getHTML($parseCandidate->url);
        }

        // Generate document
        $doc = $this->getDocument($rawHTML);

        // Set core mutators
        $article->setFinalUrl($parseCandidate->url);
        $article->setDomain($parseCandidate->parts->host);
        $article->setLinkhash($parseCandidate->linkhash);
        $article->setRawHtml($rawHTML);
        $article->setDoc($doc);
        // Keep a separate copy so the modules below work on $doc without
        // touching the raw document.
        $article->setRawDoc(clone $doc);

        // Pre-extraction document cleaning
        $this->modules('cleaners', $article);

        // Extract content
        $this->modules('extractors', $article);

        // Post-extraction content formatting
        $this->modules('formatters', $article);
    } finally {
        // Restore the caller's libxml error-handling mode even if fetching,
        // parsing or a module threw.
        libxml_use_internal_errors($xmlInternalErrors);
    }

    return $article;
}