Example #1
0
 /**
  * Build an Article for the given URL by running the full processing pipeline.
  *
  * The URL is normalised through Helper::getCleanedUrl(). When no raw HTML is
  * supplied (or it is empty), the markup is obtained via $this->getHTML() using
  * the cleaned URL. The markup is turned into a document, the core article
  * fields are populated, and then the 'cleaners', 'extractors' and 'formatters'
  * module groups are run against the article, in that order.
  *
  * libxml's internal error handling is switched on for the duration of the call
  * and the previous setting is always restored, even if a step throws. Any such
  * exception is not caught here and propagates to the caller.
  *
  * @param string $url
  * @param string|null $rawHTML Optional pre-fetched markup; when empty it is fetched.
  *
  * @return Article
  */
 public function crawl($url, $rawHTML = null)
 {
     $article = new Article();
     $parseCandidate = Helper::getCleanedUrl($url);
     // Remember the caller's libxml setting so it can be put back afterwards.
     $xmlInternalErrors = libxml_use_internal_errors(true);
     try {
         if (empty($rawHTML)) {
             $rawHTML = $this->getHTML($parseCandidate->url);
         }
         // Generate document
         $doc = $this->getDocument($rawHTML);
         // Set core mutators
         $article->setFinalUrl($parseCandidate->url);
         $article->setDomain($parseCandidate->parts->host);
         $article->setLinkhash($parseCandidate->linkhash);
         $article->setRawHtml($rawHTML);
         $article->setDoc($doc);
         // Keep an untouched copy of the document before the modules work on it.
         $article->setRawDoc(clone $doc);
         // Pre-extraction document cleaning
         $this->modules('cleaners', $article);
         // Extract content
         $this->modules('extractors', $article);
         // Post-extraction content formatting
         $this->modules('formatters', $article);
         return $article;
     } finally {
         // Runs on both normal return and exception, so the process-wide libxml
         // error mode never leaks out of this method in a changed state.
         libxml_use_internal_errors($xmlInternalErrors);
     }
 }
Example #2
0
 /**
  * Create a new Article whose document is built from the given HTML.
  *
  * The HTML is passed to $this->document() and the result is stored on the
  * article via setDoc(); no other article fields are set here.
  *
  * @param string $html
  *
  * @return Article
  */
 private function generate($html)
 {
     $result = new Article();

     $parsed = $this->document($html);
     $result->setDoc($parsed);

     return $result;
 }