Example #1
0
 /**
  * Run the crawl cron for one or more API types, then terminate the script.
  *
  * Steps performed:
  *  - emails an alert when the number of proxies returned by getProxyIPS()
  *    is at or below the low-proxy threshold;
  *  - for every API type: closes any unfinished cron log row, creates a new
  *    cron log row, crawls every store via index(), and stamps the row's end time;
  *  - compares the google_count of the newest cron log row with the one read
  *    before the crawl and sends a warning when the count dropped sharply or is
  *    very small;
  *  - calls _checkNotFoundRatio() and exits.
  *
  * @param mixed  $api  API type or list of API types (normalised with ensure_array()).
  * @param string $from Value stored in the cron log's 'run_from' column.
  *
  * @return void Never returns to the caller: the method ends with exit.
  */
 protected function _run_cron_job($api, $from = 'no')
 {
     //if ( ! in_array($from, $this->fromAr)) {
     //	echo "You are not allowed to run this cron";
     //	exit;
     //}
     // Send a warning if $threshold or fewer proxies come back.
     // One more than the threshold is requested, which is enough to tell
     // "at or below the threshold" apart from "above it".
     $threshold = 10;
     $proxies = getProxyIPS(TRUE, $threshold + 1);
     if (count($proxies) <= $threshold) {
         $this->email_alert('Low on proxies', 'The active proxy count has reached ' . $threshold . ' or less as of ' . date('Y-m-d H:i:s'));
     }
     $apis = ensure_array($api);
     $storesData = $this->Store->get_stores(TRUE);
     $currentDate = date('Y-m-d H:i:s');
     $this->Crowl_model->stats = array();
     // Cycle Through API crons
     for ($j = 0, $n = count($apis); $j < $n; $j++) {
         $this->Crowl_model->stats[$apis[$j]] = array('data_found' => 0, 'price_found' => 0);
         $this->Crowl_model->load_sticky_api($apis[$j]);
         $data = array('api_type' => $apis[$j], 'key' => generate_rand(32), 'run_from' => $from, 'start_datetime' => $currentDate);
         // Reset per API: if getMaxCronLog() throws below, the comparison after
         // the try/catch must not see an undefined variable or the previous
         // API's log row.
         $lastLog = null;
         try {
             // Update cron log information
             $lastLog = $this->getMaxCronLog($data['api_type']);
             $lastRun = empty($lastLog) ? '0000-00-00 00:00:00' : $lastLog->datetime;
             if (!empty($lastLog) && $lastLog->datetime === "0000-00-00 00:00:00") {
                 // Close rows for this API that still carry the zero datetime.
                 $update = array('datetime' => $currentDate, 'end_datetime' => $currentDate);
                 $this->db->where('datetime', '0000-00-00 00:00:00')->where('api_type', $data['api_type'])->update('cron_log', $update);
             }
             // Run when there is no usable previous datetime, or the previous
             // datetime is not later than now (string comparison is valid for
             // the fixed 'Y-m-d H:i:s' format).
             if ($lastRun == '0000-00-00 00:00:00' || $currentDate >= $lastRun) {
                 $insert_id = $this->create_cron_log($data);
                 $this->resetViolationCount();
                 // Crawl the products for each store
                 for ($i = 0, $z = count($storesData); $i < $z; $i++) {
                     $this->index($storesData[$i]->id, $data['api_type'], $insert_id);
                 }
                 $finishedAt = date("Y-m-d H:i:s");
                 $data_edit = array('datetime' => $finishedAt, 'end_datetime' => $finishedAt);
                 $this->update_cron_log($insert_id, $data_edit);
             }
         } catch (Exception $e) {
             email_alertToTeam('Run Cron Job ' . ucwords($apis[$j]), $e->getMessage());
         }
         // Now that the crawl has finished,
         // compare these crawl numbers with the last
         // and send out notifications if needed.
         $currentLog = $this->getMaxCronLog($data['api_type']);
         $lastCount = $lastLog ? $lastLog->google_count : 0;
         // A missing log row counts as zero completed products instead of
         // dereferencing a non-object.
         $currentCount = $currentLog ? $currentLog->google_count : 0;
         // send a notification if completed this percent (or less) of the last crawl
         $percentThresh = 25;
         // send a notification if complete less than this many products
         $smallThresh = 10;
         $msg = '';
         $percentDrop = 100 - $percentThresh;
         if ($lastCount > 0 and $currentCount * 100 / $lastCount <= $percentThresh) {
             $msg = ' crawl completed ' . $percentDrop . '% less products than its previous crawl.';
         } elseif ($currentCount < $smallThresh) {
             $msg = ' crawl completed less than ' . $smallThresh . ' products.';
         }
         if (!empty($msg)) {
             $msg_txt = 'Crawl Warning: ' . $data['api_type'] . $msg;
             $msg_html = 'Crawl Warning: ' . '<b>' . $data['api_type'] . '</b>' . $msg;
             log_message('error', $msg_txt);
             send_email($this->config->item('alerts'), 'TrackStreet Errors', 'TrackStreet Crawl Warning', $msg_html, $msg_txt);
         }
     }
     $this->_checkNotFoundRatio();
     // todo: should we not return to the main script here?
     exit;
 }
Example #2
0
 /**
  * Fetch a URL through the proxy pool and return its body.
  *
  * Proxies returned by getProxyIPS() are tried in order. A proxy is skipped
  * when its failure rate is above 25% and its last warning is more recent
  * than $this->warn_period seconds. The first proxy that yields any HTTP
  * response ends the attempt: the proxy's connect/warn counters are updated
  * and the method returns. Only a HTTP_Request2_Exception (connection-level
  * failure) moves on to the next proxy.
  *
  * @param string $url URL to request.
  *
  * @return string|false The response body with non-printable characters
  *                      stripped when the status is 200 or 304; false for any
  *                      other status, or when no proxy produced a response.
  */
 private function _scrape_page($url)
 {
     $this->proxies = getProxyIPS(TRUE, 500);
     require_once 'HTTP/Request2.php';
     // Initialised up front so the fall-through log at the bottom never reads
     // an undefined variable when the proxy list is empty.
     $proxyString = '';
     if (count($this->proxies) > 0) {
         foreach ($this->proxies as $index => $proxy) {
             $now = time();
             $totalAttempts = $proxy->fails + $proxy->connects;
             $failureRate = $totalAttempts == 0 ? 0 : $proxy->fails / $totalAttempts * 100;
             $gracePeriod = $now - strtotime($proxy->last_warn_time);
             // NOTE(review): $scheme only feeds the proxy string used in log
             // messages; setConfig() below passes host and port only.
             $scheme = 'http';
             switch ($proxy->scheme) {
                 case 'https':
                 case 'HTTPS':
                     //Request2 only takes http, but proxy should still work
                     $scheme = 'http';
                     break;
                 case 'socks4':
                 case 'socks5':
                     $scheme = 'socks5';
                     break;
             }
             $proxyString = $scheme . '://' . $proxy->proxy_host . ":" . $proxy->proxy_port;
             //we don't want to mess with the ones that have too many warnings
             //let's try 25% failure/connect ratio should exclude this proxy temporarily
             if ($failureRate > 25 && $this->warn_period > $gracePeriod) {
                 log_message('error', "Skipping {$proxyString} due to {$failureRate}% failure rate.\nTime calc: {$gracePeriod} period: {$this->warn_period}");
                 continue;
             }
             // Chosen before the try block because the catch handler records
             // the agent as ban_agent; an exception thrown while building the
             // request would otherwise leave $aKey undefined there.
             $aKey = array_rand($this->agents);
             try {
                 $options = array("timeout" => "5", "follow_redirects" => true, "max_redirects" => 3);
                 $scrape = new HTTP_Request2($url, HTTP_Request2::METHOD_GET, $options);
                 $scrape->setAdapter('curl');
                 $scrape->setConfig(array('proxy_host' => trim($proxy->proxy_host), 'proxy_port' => trim($proxy->proxy_port)));
                 $scrape->setHeader(array('Accept-Charset' => 'utf-8', 'User-Agent' => $this->agents[$aKey]));
                 // send http request
                 $response = $scrape->send();
                 $body = $response->getBody();
                 $status = $response->getStatus();
                 $source = parse_url($url);
                 $upData = array();
                 //echo "From: $proxyString\nResponse status: ".$response->getStatus()."\n";
                 // A response came back, so count a successful connect for this proxy.
                 $this->CI->db->where("id", $proxy->id);
                 $this->CI->db->set('connects', 'connects+1', FALSE);
                 $this->CI->db->update($this->_table_proxy_ips);
                 $updateFlag = $html = false;
                 $title = '';
                 if ($status == '200' || $status == '304') {
                     $body = preg_replace('/[^[:print:]]/', '', $body);
                     $html = str_get_html($body);
                     if (!$html) {
                         $headerLog = '';
                         foreach ($response->getHeader() as $k => $v) {
                             $headerLog .= "\t{$k}: {$v}\n";
                         }
                         log_message('error', "!method_exists\n" . $response->getStatus() . "\ntitle: {$title}\nheaders: {$headerLog}\n{$proxyString}\n{$url}");
                     } else {
                         $title = $html->find('title', 0);
                         $title = $title ? strtolower($title->plaintext) : '';
                         $html->clear();
                         unset($html);
                         //echo "got: $url\ntitle: $title\n";
                     }
                 }
                 //find any known phantom sites
                 // strpos() returns 0 for a match at the start of the title, so
                 // the result must be compared against false explicitly.
                 $isPhantom = false;
                 foreach (array('onlinecollegesuniversity.com', 'articlesdigest.info', 'ihowandwhy.com') as $phantomDomain) {
                     if (strpos($title, $phantomDomain) !== false) {
                         $isPhantom = true;
                         break;
                     }
                 }
                 if ($isPhantom) {
                     $updateFlag = true;
                     if ((int) $proxy->warns >= $this->warn_max * 2) {
                         log_message('error', "Ban status Phantom {$title}:\n{$proxyString}\n{$url}");
                         $upData = array('use_flag' => 0, 'ban_source' => $source['host'] . ' - ' . $title, 'ban_type' => $status, 'ban_agent' => $this->agents[$aKey]);
                         unset($this->proxies[$index]);
                     } else {
                         log_message('error', "Phantom site {$title}:\n{$proxyString}\n{$url}");
                     }
                 } elseif ($status >= 500) {
                     // Server Error -- assume this is ban
                     $updateFlag = true;
                     log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                     $upData = array('use_flag' => 0, 'ban_source' => $source['host'], 'ban_type' => $status, 'ban_agent' => $this->agents[$aKey]);
                     unset($this->proxies[$index]);
                 } elseif ($status == 404) {
                     // 404s are tolerated up to twice the normal warning limit
                     $updateFlag = true;
                     if ((int) $proxy->warns >= $this->warn_max * 2) {
                         log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                         $upData = array('use_flag' => 0, 'ban_source' => $source['host'], 'ban_type' => $status, 'ban_agent' => $this->agents[$aKey]);
                     } else {
                         log_message('error', "Warning {$status}:\n{$proxyString}\n{$url}");
                     }
                 } elseif ($status >= 400) {
                     $updateFlag = true;
                     if ((int) $proxy->warns >= $this->warn_max) {
                         log_message('error', "Ban status {$status}:\n{$proxyString}\n{$url}");
                         $upData = array('use_flag' => 0, 'ban_source' => $source['host'], 'ban_type' => $status, 'ban_agent' => $this->agents[$aKey]);
                     } else {
                         log_message('error', "Warning {$status}:\n{$proxyString}\n{$url}");
                     }
                 }
                 if ($updateFlag) {
                     // Record the warning; $upData additionally disables the
                     // proxy when one of the ban branches above filled it in.
                     $this->CI->db->set('warns', 'warns+1', FALSE);
                     $this->CI->db->set('last_warn_time', 'now()', FALSE);
                     $this->CI->db->where("id", $proxy->id);
                     $this->CI->db->update($this->_table_proxy_ips, $upData);
                 }
                 // NOTE(review): the cookies are added to a request object that
                 // is unset immediately afterwards, so this loop appears to
                 // have no lasting effect.
                 foreach ($response->getCookies() as $c) {
                     $scrape->addCookie($c['name'], $c['value']);
                 }
                 unset($scrape);
                 return $status == '200' || $status == '304' ? $body : false;
             } catch (HTTP_Request2_Exception $e) {
                 //do proxy deactivation here...
                 //once we have a good sample & connection failure is > 75% - kill proxy
                 if ((int) $proxy->fails > 10 && ($failureRate > 75 || (int) $proxy->connects == 0)) {
                     log_message('error', "Connection Ban status {$e->getNativeCode()}:\n{$proxyString}\n{$url}\n" . $e->getMessage() . "\nFails: {$proxy->fails} - {$failureRate}%");
                     $this->CI->db->where('id', $proxy->id);
                     $this->CI->db->set('fails', 'fails+1', FALSE);
                     $this->CI->db->set('last_warn_time', 'now()', FALSE);
                     $this->CI->db->update($this->_table_proxy_ips, array('use_flag' => 0, 'ban_source' => $proxy->proxy_host . ':' . $proxy->proxy_port, 'ban_type' => "Connection: " . $e->getNativeCode(), 'ban_agent' => $this->agents[$aKey]));
                     unset($this->proxies[$index]);
                 } else {
                     log_message('error', "Connection Warning {$e->getNativeCode()}:\n{$proxyString}\nFails: {$proxy->fails} rate: {$failureRate}\n{$url}");
                     $this->CI->db->where('id', $proxy->id);
                     $this->CI->db->set('fails', 'fails+1', FALSE);
                     $this->CI->db->set('last_warn_time', 'now()', FALSE);
                     $this->CI->db->update($this->_table_proxy_ips);
                 }
                 unset($scrape);
                 // Fall through to the next proxy instead of returning.
             }
         }
     } else {
         log_message('error', 'We are out of proxies');
         email_alertToTeam('amzecs _scrape_page() error', 'We are out of proxies');
     }
     // Reached when there were no proxies, or every proxy was skipped or
     // failed to connect; $proxyString holds the last proxy considered (or '').
     log_message('error', "amzecs _scrape_page() error - Neither success or failure\n{$proxyString}");
     email_alertToTeam('amzecs _scrape_page() error', "Neither success or failure\n{$proxyString}");
     return false;
 }