/**
 * Run the crawl for one or more API types and report on the outcome.
 *
 * For each API type a cron_log row is created, every store returned by
 * Store->get_stores() is crawled through $this->index(), and the row is
 * closed with the finish time. Afterwards the google_count of the newest
 * log row is compared with the log row that was newest before the crawl,
 * and a warning is logged and e-mailed when the crawl looks too small.
 *
 * This method ends the script with exit; it never returns to its caller.
 *
 * @param mixed  $api  One API type or a list of them (normalised with ensure_array()).
 * @param string $from Stored as 'run_from' in the cron log row.
 *
 * @return void
 */
protected function _run_cron_job($api, $from = 'no')
{
    //if ( ! in_array($from, $this->fromAr)) {
    //    echo "You are not allowed to run this cron";
    //    exit;
    //}

    // Count the active proxies and send a warning when $threshold or fewer
    // are available.
    // NOTE(review): the arguments to getProxyIPS() are presumably
    // "active only" and a row limit -- confirm against the helper.
    $threshold = 10;
    $proxies = getProxyIPS(TRUE, $threshold + 1);
    if (count($proxies) <= $threshold) {
        $this->email_alert(
            'Low on proxies',
            'The active proxy count has reached ' . $threshold . ' or less as of ' . date('Y-m-d H:i:s')
        );
    }

    $apis = ensure_array($api);
    $storesData = $this->Store->get_stores(TRUE);
    $currentDate = date('Y-m-d H:i:s');
    $this->Crowl_model->stats = array();

    // Send a notification if the crawl completed this percent (or less) of the last crawl.
    $percentThresh = 25;
    // Send a notification if the crawl completed fewer than this many products.
    $smallThresh = 10;
    $percentDrop = 100 - $percentThresh;

    // Cycle through API crons
    foreach ($apis as $apiType) {
        $this->Crowl_model->stats[$apiType] = array('data_found' => 0, 'price_found' => 0);
        $this->Crowl_model->load_sticky_api($apiType);

        $data = array(
            'api_type'       => $apiType,
            'key'            => generate_rand(32),
            'run_from'       => $from,
            'start_datetime' => $currentDate,
        );

        // Reset per API type: if the lookup below throws, the comparison
        // after the catch must not see an undefined variable or the log
        // row that belongs to the previous API type.
        $lastLog = null;

        try {
            // Update cron log information
            $lastLog = $this->getMaxCronLog($data['api_type']);
            $lastRun = empty($lastLog) ? '0000-00-00 00:00:00' : $lastLog->datetime;

            // Close log rows of this API type that still carry the zero date.
            if (!empty($lastLog) && $lastLog->datetime === '0000-00-00 00:00:00') {
                $update = array('datetime' => $currentDate, 'end_datetime' => $currentDate);
                $this->db
                    ->where('datetime', '0000-00-00 00:00:00')
                    ->where('api_type', $data['api_type'])
                    ->update('cron_log', $update);
            }

            // Crawl when there is no usable previous run, or when the previous
            // run is not dated after the current time (string comparison of
            // 'Y-m-d H:i:s' values).
            if ($lastRun == '0000-00-00 00:00:00' || $currentDate >= $lastRun) {
                $insert_id = $this->create_cron_log($data);
                $this->resetViolationCount();

                // Crawl the products for each store
                foreach ($storesData as $store) {
                    $this->index($store->id, $data['api_type'], $insert_id);
                }

                $finishedAt = date('Y-m-d H:i:s');
                $data_edit = array('datetime' => $finishedAt, 'end_datetime' => $finishedAt);
                $this->update_cron_log($insert_id, $data_edit);
            }
        } catch (Exception $e) {
            email_alertToTeam('Run Cron Job ' . ucwords($apiType), $e->getMessage());
        }

        // Now that the crawl has finished,
        // compare these crawl numbers with the last
        // and send out notifications if needed.
        $currentLog = $this->getMaxCronLog($data['api_type']);
        $lastCount = $lastLog ? $lastLog->google_count : 0;
        // No log row at all counts as an empty crawl.
        $currentCount = $currentLog ? $currentLog->google_count : 0;

        $msg = '';
        if ($lastCount > 0 && $currentCount * 100 / $lastCount <= $percentThresh) {
            $msg = ' crawl completed ' . $percentDrop . '% less products than its previous crawl.';
        } elseif ($currentCount < $smallThresh) {
            $msg = ' crawl completed less than ' . $smallThresh . ' products.';
        }

        if (!empty($msg)) {
            $msg_txt = 'Crawl Warning: ' . $data['api_type'] . $msg;
            $msg_html = 'Crawl Warning: ' . '<b>' . $data['api_type'] . '</b>' . $msg;
            log_message('error', $msg_txt);
            send_email(
                $this->config->item('alerts'),
                'TrackStreet Errors',
                'TrackStreet Crawl Warning',
                $msg_html,
                $msg_txt
            );
        }
    }

    $this->_checkNotFoundRatio();

    // todo: should we not return to the main script here?
    exit;
}
/**
 * Fetch a URL through the proxy pool and return its body.
 *
 * Proxies from getProxyIPS() are tried in order. A proxy whose failure rate
 * is above 25% and whose last warning is more recent than $this->warn_period
 * is skipped. The first proxy that produces an HTTP response ends the
 * search, whatever the status; a proxy that raises an HTTP_Request2
 * exception is recorded as a failure and the next one is tried.
 *
 * Side effects on the proxy table ($this->_table_proxy_ips):
 *  - every response increments `connects`;
 *  - phantom pages and 4xx/5xx statuses increment `warns`, refresh
 *    `last_warn_time` and may disable the proxy (use_flag = 0);
 *  - connection exceptions increment `fails`, refresh `last_warn_time` and
 *    may disable the proxy.
 *
 * @param string $url Address to request.
 *
 * @return string|false Response body (non-printable characters removed) for
 *                      status 200/304, otherwise false.
 */
private function _scrape_page($url)
{
    // NOTE(review): the arguments are presumably "active only" and a row
    // limit -- confirm against getProxyIPS().
    $this->proxies = getProxyIPS(TRUE, 500);
    require_once 'HTTP/Request2.php';

    if (count($this->proxies) == 0) {
        log_message('error', 'We are out of proxies');
        email_alertToTeam('amzecs _scrape_page() error', 'We are out of proxies');
        return false;
    }

    // Known phantom sites, recognised by their page title.
    $phantomSites = array('onlinecollegesuniversity.com', 'articlesdigest.info', 'ihowandwhy.com');

    // parse_url() can return false or omit the host for malformed URLs.
    $source = parse_url($url);
    $host = (is_array($source) && isset($source['host'])) ? $source['host'] : '';

    // Defined up front so the fall-through report below can always use it.
    $proxyString = '';

    foreach ($this->proxies as $index => $proxy) {
        $totalAttempts = $proxy->fails + $proxy->connects;
        $failureRate = $totalAttempts == 0 ? 0 : $proxy->fails / $totalAttempts * 100;
        $gracePeriod = time() - strtotime($proxy->last_warn_time);

        // Request2 only takes http, but an https proxy should still work.
        // NOTE(review): $scheme only ends up in log messages; it is not
        // passed to setConfig() below -- confirm whether socks proxies are
        // meant to be configured differently.
        $scheme = in_array($proxy->scheme, array('socks4', 'socks5'), true) ? 'socks5' : 'http';
        $proxyString = $scheme . '://' . $proxy->proxy_host . ":" . $proxy->proxy_port;

        // We don't want to mess with the ones that have too many warnings:
        // a failure rate above 25% excludes the proxy temporarily.
        if ($failureRate > 25 && $this->warn_period > $gracePeriod) {
            log_message('error', "Skipping {$proxyString} due to {$failureRate}% failure rate.\nTime calc: {$gracePeriod} period: {$this->warn_period}");
            continue;
        }

        // Chosen before the try block because the catch block records it too.
        $aKey = array_rand($this->agents);
        $agent = $this->agents[$aKey];

        try {
            $options = array("timeout" => "5", "follow_redirects" => true, "max_redirects" => 3);
            $scrape = new HTTP_Request2($url, HTTP_Request2::METHOD_GET, $options);
            $scrape->setAdapter('curl');
            $scrape->setConfig(array(
                'proxy_host' => trim($proxy->proxy_host),
                'proxy_port' => trim($proxy->proxy_port),
            ));
            $scrape->setHeader(array('Accept-Charset' => 'utf-8', 'User-Agent' => $agent));

            // send http request
            $response = $scrape->send();
            $body = $response->getBody();
            $status = $response->getStatus();
            $isOk = ($status == '200' || $status == '304');

            // The proxy answered: count the connection.
            $this->CI->db->where("id", $proxy->id);
            $this->CI->db->set('connects', 'connects+1', FALSE);
            $this->CI->db->update($this->_table_proxy_ips);

            $title = '';
            if ($isOk) {
                $body = preg_replace('/[^[:print:]]/', '', $body);
                $html = str_get_html($body);
                if (!$html) {
                    $headerLog = '';
                    foreach ($response->getHeader() as $k => $v) {
                        $headerLog .= "\t{$k}: {$v}\n";
                    }
                    log_message('error', "!method_exists\n" . $status . "\ntitle: {$title}\nheaders: {$headerLog}\n{$proxyString}\n{$url}");
                } else {
                    $titleNode = $html->find('title', 0);
                    $title = $titleNode ? strtolower($titleNode->plaintext) : '';
                    $html->clear();
                    unset($html);
                }
            }

            // strpos() returns 0 when the title starts with the needle, so
            // the result has to be compared against false explicitly.
            $isPhantom = false;
            foreach ($phantomSites as $site) {
                if (strpos($title, $site) !== false) {
                    $isPhantom = true;
                    break;
                }
            }

            // Columns written when this response gets the proxy banned.
            $banData = array(
                'use_flag'   => 0,
                'ban_source' => $host,
                'ban_type'   => $status,
                'ban_agent'  => $agent,
            );
            $updateFlag = false;
            $upData = array();

            if ($isPhantom) {
                // NOTE(review): a phantom page served with status 200 is
                // still returned to the caller below -- confirm whether the
                // next proxy should be tried instead.
                $updateFlag = true;
                if ((int) $proxy->warns >= $this->warn_max * 2) {
                    log_message('error', "Ban status Phantom {$title}:\n{$proxyString}\n{$url}");
                    $upData = $banData;
                    $upData['ban_source'] = $host . ' - ' . $title;
                    unset($this->proxies[$index]);
                } else {
                    log_message('error', "Phantom site {$title}:\n{$proxyString}\n{$url}");
                }
            } elseif ($status >= 500) {
                // Server Error -- assume this is ban
                $updateFlag = true;
                log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                $upData = $banData;
                unset($this->proxies[$index]);
            } elseif ($status == 404) {
                // 404s are tolerated for twice as many warnings as other 4xx.
                $updateFlag = true;
                if ((int) $proxy->warns >= $this->warn_max * 2) {
                    log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                    $upData = $banData;
                } else {
                    log_message('error', "Warning {$status}:\n{$proxyString}\n{$url}");
                }
            } elseif ($status >= 400) {
                $updateFlag = true;
                if ((int) $proxy->warns >= $this->warn_max) {
                    log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                    $upData = $banData;
                } else {
                    log_message('error', "Warning {$status}:\n{$proxyString}\n{$url}");
                }
            }

            if ($updateFlag) {
                $this->CI->db->set('warns', 'warns+1', FALSE);
                $this->CI->db->set('last_warn_time', 'now()', FALSE);
                $this->CI->db->where("id", $proxy->id);
                $this->CI->db->update($this->_table_proxy_ips, $upData);
            }

            unset($scrape);

            return $isOk ? $body : false;
        } catch (HTTP_Request2_Exception $e) {
            // Once we have a good sample and the connection failure rate is
            // above 75% (or the proxy never connected) - kill the proxy.
            $shouldBan = (int) $proxy->fails > 10
                && ($failureRate > 75 || (int) $proxy->connects == 0);

            if ($shouldBan) {
                log_message('error', "Connection Ban status {$e->getNativeCode()}:\n{$proxyString}\n{$url}\n" . $e->getMessage() . "\nFails: {$proxy->fails} - {$failureRate}%");
            } else {
                log_message('error', "Connection Warning {$e->getNativeCode()}:\n{$proxyString}\nFails: {$proxy->fails} rate: {$failureRate}\n{$url}");
            }

            $this->CI->db->where('id', $proxy->id);
            $this->CI->db->set('fails', 'fails+1', FALSE);
            $this->CI->db->set('last_warn_time', 'now()', FALSE);

            if ($shouldBan) {
                $this->CI->db->update($this->_table_proxy_ips, array(
                    'use_flag'   => 0,
                    'ban_source' => $proxy->proxy_host . ':' . $proxy->proxy_port,
                    'ban_type'   => "Connection: " . $e->getNativeCode(),
                    'ban_agent'  => $agent,
                ));
                unset($this->proxies[$index]);
            } else {
                $this->CI->db->update($this->_table_proxy_ips);
            }

            unset($scrape);
            // Fall through to the next proxy.
        }
    }

    // Every proxy was either skipped or failed to connect.
    log_message('error', "amzecs _scrape_page() error - Neither success or failure\n{$proxyString}");
    email_alertToTeam('amzecs _scrape_page() error', "Neither success or failure\n{$proxyString}");

    return false;
}