Example #1
0
 /**
  * @inheritdoc
  */
 protected function extraction(UrlInterface $url, $html)
 {
     // Collects every <a ... href=...>...</a> element found in $html and
     // appends one Link per match to $this->links. Nothing is returned.
     //
     // Capture groups (numbering relied upon below):
     //   0 = whole anchor element, 1 = opening tag, 2 = tag name,
     //   3 = href value, 4 = inner markup, 5 = closing tag.
     //
     // Pattern details:
     //   - `(a)(?=\s)` pins the tag name to exactly "a", so look-alike tags
     //     such as <abbr> or <area> are not treated as anchors.
     //   - `[^>]*?\shref` is lazy and requires whitespace before "href", so
     //     the first real href attribute wins and `data-href=` is ignored.
     //   - `\s*=\s*` tolerates whitespace around the equals sign.
     //   - the `s` modifier lets `.` match newlines, so anchors whose inner
     //     markup spans several lines are still found.
     $anchorPattern = '/(<(a)(?=\s)[^>]*?\shref\s*=\s*["\']?([^"\'\s>]+)[^>]*>)(.*?)(<\/\2>)/is';

     $matches = [];
     // preg_match_all() returns false on a PCRE failure (for example when
     // the backtrack limit is hit); in that case there is nothing to add.
     if (preg_match_all($anchorPattern, (string) $html, $matches, PREG_SET_ORDER) === false) {
         return;
     }

     foreach ($matches as $match) {
         // $match[3] is the raw href; it is passed through
         // $url->getAbsoluteUrl() before being stored together with the
         // complete anchor markup ($match[0]).
         $this->links[] = new Link($url->getAbsoluteUrl($match[3]), $match[0]);
     }
 }
Example #2
0
 /**
  * @inheritdoc
  */
 public function getContent(UrlInterface $url)
 {
     // Downloads the document at $url->getUrl() with cURL and returns its
     // body with all "\n", "\t" and "\r" characters removed. When the
     // transfer yields nothing usable, curl_exec()'s value is returned as-is.
     // Side effect: $url is updated with the URL and HTTP status code that
     // cURL reports for the transfer.
     $handle = curl_init($url->getUrl());
     curl_setopt($handle, CURLOPT_RETURNTRANSFER, true); // hand the body back as a string
     curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true); // follow redirects
     $body = curl_exec($handle);

     // Store what cURL reports after the transfer (with redirects followed,
     // 'url' is the last effective URL).
     // NOTE(review): the URL is lower-cased before being stored; paths can be
     // case-sensitive, so confirm this normalisation is intended.
     $transfer = curl_getinfo($handle);
     $effectiveUrl = strtolower($transfer['url']);
     $url->setUrl($effectiveUrl)->setHttpCode($transfer['http_code']);

     // Nothing to clean up on a failed or empty response.
     if (empty($body)) {
         return $body;
     }

     // Strip line feeds, tabs and carriage returns in one pass.
     return str_replace(["\n", "\t", "\r"], '', $body);
 }