/**
 * Typical bank grabbing flow.
 *
 * Resolves the page URL via getURL(), downloads the page with cURL, parses it
 * with SimpleHTMLDom, extracts the cells via grabCells(), hands them to
 * grabValues() and finally returns whatever returnValues() produces.
 * Every step that yields an empty result aborts the run with an exception.
 *
 * {@inheritdoc}
 *
 * @throws \LogicException when the URL, the downloaded document, the parsed
 *                         HTML or the extracted cells are empty
 */
public function execute()
{
    // resolve the bank exchange page URL, check
    $url = $this->getURL();
    if (empty($url)) {
        throw new \LogicException('broken class:no url');
    }

    // download the page, following redirects and posing as a desktop browser
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36');
    $str = curl_exec($curl);
    // release the handle before any of the checks below can throw
    curl_close($curl);

    // curl_exec() yields false on transfer failure; empty() covers that and an empty body
    if (empty($str)) {
        throw new \LogicException('broken remote:no document on link');
    }

    // parse the markup, check
    $html = SimpleHTMLDom::str_get_html($str);
    if (empty($html)) {
        throw new \LogicException('broken markup:no html');
    }

    // grab exchange table, check
    $cells = $this->grabCells($html);
    if (empty($cells)) {
        throw new \LogicException('broken markup:no cells');
    }

    // grab exchange values
    $this->grabValues($cells);

    // return
    return $this->returnValues();
}
/**
 * Get HTML
 *
 * Fetches remote HTML from the given URL using cURL and builds a DOM object
 * from it. Redirects are followed and cookies are read from / written to the
 * relative path `cookies.txt`.
 *
 * @param string $url address of the page to download
 * @return mixed the result of SimpleHTMLDom::str_get_html() when the final
 *               HTTP status is 200; otherwise a diagnostic string containing
 *               the URL, the JSON-encoded cURL transfer info and the body
 *               that was received
 */
public function getHtml($url)
{
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($c, CURLOPT_USERAGENT, "My Scrap bot");
    curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($c, CURLOPT_COOKIEJAR, 'cookies.txt');
    curl_setopt($c, CURLOPT_COOKIEFILE, 'cookies.txt');
    $result = curl_exec($c);
    $status = curl_getinfo($c);
    curl_close($c);

    if ($status['http_code'] == 200) {
        // the body is converted from utf-8 to HTML entities before parsing
        return \serhatozles\simplehtmldom\SimpleHTMLDom::str_get_html(mb_convert_encoding($result, 'HTML-ENTITIES', 'utf-8'));
    }

    // return criteria above not met: report the failure to the caller.
    // The received body lives in $result (the original referenced an undefined $data).
    return "\nERRORCODE22 with {$url}!!\nLast status codes:\n" . json_encode($status) . "\n\nLast data got:\n{$result}\n";
}