/**
 * @inheritdoc
 *
 * Scans the given HTML for anchor-like elements carrying an href attribute
 * and appends one Link per match to $this->links.
 *
 * @param UrlInterface $url  Used to turn each captured href into an absolute URL.
 * @param string       $html Markup to scan.
 */
protected function extraction(UrlInterface $url, $html)
{
    // Capture groups: 1 = opening tag, 2 = tag name, 3 = href value,
    // 4 = inner content, 5 = closing tag (back-reference to group 2).
    // NOTE(review): the tag-name group is `[a]+` followed by `[^>]*`, so any
    // opening tag that starts with "a" and contains `href=` can match.
    $pattern = '/(<([a]+)[^>]*href=["\']?([^"\'\\s>]+)[^>]*>)(.*?)(<\\/\\2>)/i';

    $matches = [];
    preg_match_all($pattern, $html, $matches, PREG_SET_ORDER);

    foreach ($matches as $match) {
        // $match[3] is the raw href; $match[0] is the full matched element.
        $absoluteUrl = $url->getAbsoluteUrl($match[3]);
        $this->links[] = new Link($absoluteUrl, $match[0]);
    }
}
/**
 * @inheritdoc
 *
 * Fetches the document at the given URL with cURL, following redirects.
 * After the transfer, $url is updated with the lower-cased URL reported by
 * cURL and with the HTTP status code. Line feeds, carriage returns and tabs
 * are removed from a non-empty response body before it is returned.
 *
 * @param UrlInterface $url URL to fetch; mutated via setUrl()/setHttpCode().
 *
 * @return string|bool The cleaned body, or false when the handle could not be
 *                     created or the transfer failed.
 */
public function getContent(UrlInterface $url)
{
    $ch = curl_init($url->getUrl());
    if ($ch === false) {
        // No handle: nothing can be transferred and there is no transfer info
        // to copy onto $url. Report failure the same way curl_exec() does.
        return false;
    }

    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $html = curl_exec($ch);
    $info = curl_getinfo($ch);

    // Release the handle. Before PHP 8 it is a resource that must be closed
    // explicitly; from PHP 8 on it is an object freed automatically, so the
    // guard skips the call there.
    if (is_resource($ch)) {
        curl_close($ch);
    }

    // Update URL
    // NOTE(review): the whole URL is lower-cased, including path and query —
    // confirm this is intended, kept as-is for compatibility.
    $url->setUrl(strtolower($info['url']))->setHttpCode($info['http_code']);

    if (!empty($html)) {
        // Clean up carriage returns and tabulations
        $html = str_replace(["\n", "\t", "\r"], '', $html);
    }

    return $html;
}