Beispiel #1
0
 /**
  * Runs the crawl for one or more APIs and reports on the outcome.
  *
  * For each API this loads it into Crowl_model, opens a cron log entry,
  * crawls every store returned by Store::get_stores(TRUE) via index(),
  * closes the cron log entry, and then compares the google_count of the
  * newest cron log with the one read before the crawl, sending a warning
  * when the crawl shrank sharply or found very few products.
  *
  * @param mixed  $api  API name or list of API names (normalised with ensure_array()).
  * @param string $from Label stored under the 'run_from' key of the cron log data.
  *
  * @return void This method does not return; it terminates the script with exit.
  */
 protected function _run_cron_job($api, $from = 'no')
 {
     // NOTE(review): the caller whitelist check below is disabled.
     //if ( ! in_array($from, $this->fromAr)) {
     //	echo "You are not allowed to run this cron";
     //	exit;
     //}

     // Count the active proxies and send a warning when $threshold or fewer are active.
     $threshold = 10;
     $proxies = getProxyIPS(TRUE, $threshold + 1);
     if (count($proxies) <= $threshold) {
         $this->email_alert('Low on proxies', 'The active proxy count has reached ' . $threshold . ' or less as of ' . date('Y-m-d H:i:s'));
     }

     $apis = ensure_array($api);
     $storesData = $this->Store->get_stores(TRUE);
     $currentDate = date('Y-m-d H:i:s');
     $this->Crowl_model->stats = array();

     // Send a notification if the crawl completed this percent (or less) of the last crawl.
     $percentThresh = 25;
     // Send a notification if the crawl completed fewer than this many products.
     $smallThresh = 10;
     $percentDrop = 100 - $percentThresh;

     // Cycle through API crons
     for ($j = 0, $n = count($apis); $j < $n; $j++) {
         $this->Crowl_model->stats[$apis[$j]] = array('data_found' => 0, 'price_found' => 0);
         $this->Crowl_model->load_sticky_api($apis[$j]);
         $data = array('api_type' => $apis[$j], 'key' => generate_rand(32), 'run_from' => $from, 'start_datetime' => $currentDate);

         // Reset per API so a failure inside the try block can never leave the
         // previous API's log (or an undefined variable) to be compared below.
         $lastLog = null;
         try {
             // Update cron log information
             $lastLog = $this->getMaxCronLog($data['api_type']);
             $lastRun = empty($lastLog) ? '0000-00-00 00:00:00' : $lastLog->datetime;
             if (!empty($lastLog) && $lastLog->datetime === '0000-00-00 00:00:00') {
                 // Close out log rows of this API that still carry the zero datetime.
                 $update = array('datetime' => $currentDate, 'end_datetime' => $currentDate);
                 $this->db->where('datetime', '0000-00-00 00:00:00')->where('api_type', $data['api_type'])->update('cron_log', $update);
             }
             if ($lastRun == '0000-00-00 00:00:00' || $currentDate >= $lastRun) {
                 $insert_id = $this->create_cron_log($data);
                 $this->resetViolationCount();
                 // Crawl the products for each store
                 for ($i = 0, $z = count($storesData); $i < $z; $i++) {
                     $this->index($storesData[$i]->id, $data['api_type'], $insert_id);
                 }
                 $finishedAt = date('Y-m-d H:i:s');
                 $data_edit = array('end_datetime' => $finishedAt, 'datetime' => $finishedAt);
                 $this->update_cron_log($insert_id, $data_edit);
             }
         } catch (Exception $e) {
             email_alertToTeam('Run Cron Job ' . ucwords($apis[$j]), $e->getMessage());
         }

         // Now that the crawl has finished, compare these crawl numbers with
         // the last and send out notifications if needed.
         $currentLog = $this->getMaxCronLog($data['api_type']);
         $lastCount = !empty($lastLog) ? $lastLog->google_count : 0;
         // A missing current log counts as zero products so the warning still fires.
         $currentCount = !empty($currentLog) ? $currentLog->google_count : 0;

         $msg = '';
         if ($lastCount > 0 and $currentCount * 100 / $lastCount <= $percentThresh) {
             $msg = ' crawl completed ' . $percentDrop . '% less products than its previous crawl.';
         } elseif ($currentCount < $smallThresh) {
             $msg = ' crawl completed less than ' . $smallThresh . ' products.';
         }
         if (!empty($msg)) {
             $msg_txt = 'Crawl Warning: ' . $data['api_type'] . $msg;
             $msg_html = 'Crawl Warning: ' . '<b>' . $data['api_type'] . '</b>' . $msg;
             log_message('error', $msg_txt);
             send_email($this->config->item('alerts'), 'TrackStreet Errors', 'TrackStreet Crawl Warning', $msg_html, $msg_txt);
         }
     }
     $this->_checkNotFoundRatio();
     // todo: should we not return to the main script here?
     exit;
 }
Beispiel #2
0
 /**
  * Fetches an offer-listing page and extracts one item per offer row.
  *
  * The page is requested up to 10 times through _scrape_page(); if none of
  * those attempts yields a document containing div[id=olpOfferList], another
  * 10 attempts are made through file_get_html(). A page that contains the
  * offer list but no div[class=olpOffer] rows is treated as a valid, empty
  * listing and ends the method immediately.
  *
  * @param string $url   Page to scrape.
  * @param int    $order Starting list position; each returned item gets the next value as 'ListOrder'.
  * @param mixed  $upc   UPC, used only in alert and log messages.
  *
  * @return array List of arrays with the keys price, shipping, status, SellerID,
  *               SellerName, SellerImage, condition, FBRank and ListOrder.
  *               Empty when nothing could be scraped.
  */
 function parsingData($url, $order = 0, $upc = 0)
 {
     $items = array();
     $offers = array();
     $html = null;
     $maxTries = 10;
     $alertSubject = __CLASS__ . '::' . __FUNCTION__ . '() Alert';
     $where = ' for UPC ' . $upc . ' at ' . $url;

     // The primary scraper first, then file_get_html() as plan B.
     $sources = array(
         array('scraper' => true, 'label' => 'Try', 'attempts' => 'attempts'),
         array('scraper' => false, 'label' => 'file_get_html() try', 'attempts' => 'file_get_html() attempts'),
     );

     foreach ($sources as $source) {
         // Bounded retry: the attempt counter alone decides when to give up.
         for ($try = 1; $try <= $maxTries; $try++) {
             if ($source['scraper']) {
                 $body = $this->_scrape_page($url);
             } else {
                 $dom = file_get_html($url);
                 $body = false;
                 if ($dom) {
                     // Keep only the markup and release the parsed document right away.
                     $body = (string) $dom;
                     $dom->clear();
                 }
                 unset($dom);
             }
             if (!$body) {
                 continue;
             }

             $prefix = "{$source['label']} {$try} - ";
             $body = preg_replace('/[^[:print:]]/', '', $body);
             $candidate = str_get_html($body);
             if (!$candidate) {
                 email_alertToTeam($alertSubject, $prefix . 'unable to find html = str_get_html()' . $where, 1, $candidate);
                 continue;
             }

             //don't assume we have what we need...
             $results = $candidate->find('div[id=olpOfferList]', 0);
             if (!$results) {
                 email_alertToTeam($alertSubject, $prefix . 'div[id=olpOfferList] not found' . $where, 1, $candidate);
                 $candidate->clear();
                 unset($candidate);
                 continue;
             }

             $offers = $candidate->find('div[class=olpOffer]');
             if (!$offers) {
                 //this should be okay because we landed on a page that is valid, but contains no table
                 email_alertToTeam($alertSubject, $prefix . 'div[class=olpOffer] not found in' . $where, 1, $results);
                 $candidate->clear();
                 unset($candidate);
                 return $items;
             }

             $html = $candidate;
             break 2;
         }
         log_message('error', "Unable to scrape UPC {$upc} after {$maxTries} {$source['attempts']}\n{$url}");
     }

     if (empty($html)) {
         email_alertToTeam($alertSubject, 'HTML not found' . $where, 1, var_export($html, true));
         return $items;
     }

     foreach ($offers as $offer) {
         $price = $offer->find('span[class=olpOfferPrice]', 0);
         if (!$price) {
             // Rows without a price are skipped.
             continue;
         }
         $condition = $seller_name = $seller_image = $sellerId = $seller_rank = '';

         $shipping = $offer->find('p span[class=olpShippingPrice]', 0);
         $ship = $shipping ? ltrim(trim($shipping->plaintext), '$') : '0.00';

         // The seller is rendered either as bold text or as a logo image.
         if ($seller = $offer->find('p[class=olpSellerName] a b', 0)) {
             $seller_name = $seller->plaintext;
         } elseif ($imgSeller = $offer->find('p[class=olpSellerName] a img', 0)) {
             $seller_name = $imgSeller->alt;
             $seller_image = $imgSeller->src;
         } elseif ($imgSeller = $offer->find('p[class=olpSellerName] img', 0)) {
             $seller_name = $imgSeller->alt;
             $seller_image = $imgSeller->src;
         }
         if (!empty($seller_name)) {
             $seller_name = clearnSellerName($seller_name);
         }
         if (strtolower($seller_name) == 'amazon.com' || strtolower($seller_name) == 'amazon') {
             $sellerId = 'Amazon.com';
         }
         if ($rank = $offer->find('p span[class=olpSellerRating] a b', 0)) {
             $seller_rank = substr($rank->plaintext, 0, 4);
         }

         $link = $offer->find('p[class=olpSellerName] a', 0);
         if ($seller_name != 'Amazon.com' && isset($link->href)) {
             //now 2 types of seller link:
             //w/o image: http://www.amazon.com/gp/aag/main/ref=olp_merch_name_1?ie=UTF8&asin=B001B4TS7U&isAmazonFulfilled=0&seller=A3QNLMW248O1LJ
             //w/ image: http://www.amazon.com/shops/A1KTH924LYM2YH/ref=olp_merch_name_2
             // Parsed into its own variable so the $url parameter stays intact.
             $linkParts = parse_url($link->href);
             $linkPath = isset($linkParts['path']) ? $linkParts['path'] : '';
             if (strpos($linkPath, 'shops') !== FALSE) {
                 $segments = explode('/', $linkPath);
                 $sellerId = isset($segments[2]) ? $segments[2] : '';
             } elseif (isset($linkParts['query']) && strpos($linkParts['query'], 'seller') !== FALSE) {
                 // Accept both raw '&' and HTML-encoded '&amp;' separators.
                 foreach (preg_split('/&(?:amp;)?/', $linkParts['query']) as $param) {
                     $pair = explode('=', $param);
                     if ($pair[0] == 'seller') {
                         $sellerId = isset($pair[1]) ? $pair[1] : '';
                         break;
                     }
                 }
             }
             $sellerId = clearnSellerId($sellerId);
         } elseif ($seller_name == 'Amazon.com') {
             //not sure...
             $sellerId = 'Amazon.com';
         } else {
             log_message('error', "Unable to find seller link\n" . var_export($seller_name, true));
         }

         // The heading of the enclosing container decides the offer status;
         // a missing heading is treated like one that does not contain 'New'.
         $parent = $offer->parent();
         $h2 = $parent ? $parent->find('h2', 0) : null;
         $heading = $h2 ? $h2->innertext : '';

         if ($cond = $offer->find('h3[class=olpCondition]', 0)) {
             $condition = trim($cond->plaintext);
         }

         $items[] = array(
             'price' => ltrim(trim($price->plaintext), '$'),
             'shipping' => $ship,
             'status' => strpos($heading, 'New') !== FALSE ? 'Secondary' : 'Featured',
             'SellerID' => $sellerId,
             'SellerName' => $seller_name,
             'SellerImage' => $seller_image,
             'condition' => $condition,
             'FBRank' => trim($seller_rank),
             'ListOrder' => ++$order,
         );
     }

     $html->clear();
     unset($html);
     return $items;
 }
Beispiel #3
0
 /**
  * Looks up (or creates) the merchant row for an offer and stamps the
  * merchant/UPC pair in the product list table.
  *
  * For the 'google' marketplace the merchant row is matched by merchant name,
  * for every other marketplace by seller id. An existing row gets its
  * seller_id (and merchant_url, when given) refreshed; otherwise a new row is
  * inserted. Afterwards the product-list row for this merchant, UPC and
  * marketplace has its last_date set to the current time, or is created.
  *
  * @param string $merchant_name Merchant name as crawled.
  * @param string $upc           Product UPC.
  * @param string $marketplace   Marketplace identifier.
  * @param string $real_name     Stored as original_name on newly inserted rows.
  * @param string $seller_id     Marketplace seller id.
  * @param string $merchant_url  Optional merchant URL.
  *
  * @return string|false The stored merchant name, or false when the merchant
  *                      name or seller id is missing.
  */
 public function getMerchantNameForKeyOld($merchant_name, $upc, $marketplace = '', $real_name = '', $seller_id = '', $merchant_url = '')
 {
     if (trim($merchant_name) == '' || $seller_id == '') {
         email_alertToTeam('Seller ID', 'Seller ID not found against UPC No(' . $merchant_name . ',' . $seller_id . ').: ' . $upc . ' in Marketplace: ' . $marketplace, 1);
         return false;
     }

     // NOTE(review): the queries below interpolate crawled values directly into
     // SQL. No escaping/binding API of $this->myDB is visible from here, so the
     // statements are left as they were; they should be parameterised.
     if ($marketplace == 'google') {
         $qStr = "select * from  " . $this->_table_crowl_merchant_name . " where merchant_name = '{$merchant_name}' and marketplace = '{$marketplace}'";
     } else {
         $qStr = "select * from  " . $this->_table_crowl_merchant_name . " where seller_id = '{$seller_id}' and marketplace = '{$marketplace}'";
     }
     $row = $this->myDB->select($qStr);

     if (is_array($row) && count($row) > 0) {
         $pid = $row[0]['id'];
         $ret = $row[0]['merchant_name'];
         $update = array('seller_id' => $seller_id);
         if (!empty($merchant_url)) {
             $update['merchant_url'] = $merchant_url;
         }
         // Always persist the update; merchant_url is only an optional extra column.
         $this->myDB->where('id', $pid)->update($this->_table_crowl_merchant_name, $update);
     } else {
         $insert = array('merchant_name' => $merchant_name, 'original_name' => $real_name, 'seller_id' => $seller_id, 'created' => date('Y-m-d H:i:s'), 'marketplace' => $marketplace);
         if (!empty($merchant_url)) {
             $insert['merchant_url'] = $merchant_url;
         }
         // Always create the row so $pid is defined for the product-list queries below.
         $this->myDB->insert($this->_table_crowl_merchant_name, $insert);
         $pid = $this->myDB->insertid();
         $ret = $merchant_name;
     }

     $qStr = "select * from `" . $this->_table_crowl_product_list . "`\n                     where\n                        merchant_name_id = '{$pid}' and\n                        upc = '{$upc}' and\n                        marketplace = '{$marketplace}'\n                     limit 1";
     $rs = $this->myDB->select($qStr);
     if (is_array($rs) && count($rs) > 0) {
         $id = $rs[0]['id'];
         $qStr = "update " . $this->_table_crowl_product_list . " set last_date  = " . time() . " where id = " . $id;
     } else {
         $qStr = "insert into " . $this->_table_crowl_product_list . "\n                       set\n                          upc = '{$upc}',\n                          last_date  = " . time() . ",\n                          marketplace  = '{$marketplace}',\n                          merchant_name_id = '{$pid}'\n                ";
     }
     $this->myDB->simpleQuery($qStr);
     return $ret;
 }
Beispiel #4
0
 /**
  * Runs every configured crawler over all tracked products.
  *
  * For each crawler the matching class file is loaded and instantiated, then
  * each product's offers are validated, written to the product trends table,
  * checked against the price floor and, when in violation, recorded together
  * with a screenshot reference. A failure while processing a product is
  * logged through CrawlerErrorLog, mailed to the team, and the run continues
  * with the next product. Finally the run log is closed and one cron log
  * entry is created per marketplace seen.
  *
  * @return void This method does not return; it terminates the script with exit.
  *
  * @throws Exception When a crawler's file or class cannot be found.
  */
 public function run()
 {
     $this->_start_time = date('Y-m-d H:i:s');
     $this->_cron_log_id = $this->Log->create_log($this->_start_time);

     //need to get all products
     $productCount = $this->Products->get_all_tracked_products_count();
     $offset = 0;
     //queue up products -- so we don't crash the machine with a lot of products
     // NOTE(review): $offset is advanced but never passed to
     // get_all_tracked_products(), so every pass appears to load the same
     // rows again — confirm that method's signature and pass the offset/limit.
     do {
         $products = $this->Products->get_all_tracked_products();
         $this->_products = array_merge($products, $this->_products);
         $this->_numproducts += count($products);
         $offset += 500;
     } while ($offset < $productCount);

     // Screenshot path already requested for a given offer URL during this run.
     $uniquArr = array();

     if (count($this->_crawlers) > 0) {
         foreach ($this->_crawlers as $crawler) {
             $crawlerName = strtolower(trim($crawler['name']));
             $filePath = FCPATH . 'system/application/crawlers/' . $crawlerName . '.php';
             //check to see if crawler exists
             if (@file_exists($filePath) === false) {
                 throw new Exception('The crawler ' . $crawlerName . ' file does not exist. Please add it.');
             }
             require_once $filePath;
             if (class_exists($crawlerName) === false) {
                 throw new Exception('The crawler ' . $crawlerName . ' crawler class is not available. Please check that its defined within the file');
             }
             $this->_currentCrawler = $crawlerName;
             $this->_crawlerClass = new $crawlerName($crawler);

             // Resolved once up front so the array pointer of the list being
             // iterated is not moved from inside the loop.
             $lastProduct = end($this->_products);

             foreach ($this->_products as $product) {
                 if ($product === $lastProduct) {
                     $this->_last_upc = $product->upc_code;
                 }
                 try {
                     $this->_current_product_id = $product->id;
                     $this->_crawlerClass->setIdentifier($product->upc_code);
                     $productData = $this->_crawlerClass->getProduct();
                     $offers = $this->_crawlerClass->getAllOffers();
                     $this->_numfoundproducts++;
                     foreach ($offers as $offer) {
                         $this->_numofferings++;
                         $crawled_products = array();
                         if (empty($offer['merchant'])) {
                             throw new Exception('Merchant not defined.');
                         }
                         $marketplace = $offer['marketplace'];
                         if (empty($marketplace)) {
                             throw new Exception('Marketplace not defined');
                         }
                         // Drop the last dot-suffix (e.g. the TLD) and lowercase the rest.
                         $r = preg_split('/(?=\\.[^.]+$)/', $marketplace);
                         $marketplace = strtolower($r[0]);
                         $merchant = $this->Crowl->getMerchantNameForKey($offer['merchant'], $product->upc_code, $marketplace, $offer['merchant'], $offer['merchant'], extractDomainByURL($offer['url']));
                         $this->_marketplaces[] = $offer['marketplace'];
                         if (empty($offer['price_floor']) || !is_numeric($offer['price_floor'])) {
                             throw new Exception('Price floor is undefined');
                         }
                         $crawled_products['ap'] = $offer['price_floor'];
                         $title = $productData->get("title");
                         if (empty($title)) {
                             throw new Exception('Title is undefined');
                         }
                         $crawled_products['t'] = $title;
                         $crawled_products['ar'] = $marketplace;
                         $crawled_products['il'] = null;
                         if (empty($offer['url'])) {
                             throw new Exception('Url is undefined');
                         }
                         $crawled_products['l'] = $offer['url'];
                         $crawled_products['mu'] = $offer['merchant_url'];
                         if (empty($offer['price']) || !is_numeric($offer['price'])) {
                             throw new Exception('Price is undefined');
                         }
                         $crawled_products['mpo'] = $offer['price'];
                         $hashKey = $marketplace . '#' . $product->upc_code;
                         $crawled_products['um'] = $hashKey;
                         $crawled_products['dt'] = time();

                         // Get the price from the last crawl (must happen before the new record is inserted)
                         $dynamo = new AmazonDynamoDB();
                         $lastRecordResponse = $dynamo->query(array('TableName' => $this->_dynamo_products_trends, 'HashKeyValue' => array(AmazonDynamoDB::TYPE_STRING => $crawled_products['um']), 'ConsistentRead' => true, 'Limit' => 1, 'ScanIndexForward' => false));
                         var_dump($crawled_products);
                         $insert_response = $this->amzdb->insertData($this->_dynamo_products_trends, $crawled_products, $marketplace);
                         if (isset($insert_response->status) && $insert_response->status == 200) {
                             $this->Crowl->insertUPCMerchant($merchant, $product->upc_code, $marketplace, $crawled_products['mpo']);
                         }
                         $crowlMerchantName = $this->Crawl_data->crowlMerchantByMerchantName($merchant);
                         if (empty($crowlMerchantName)) {
                             log_message('error', __FILE__ . ' Crowl_m::amazon_lookup() Line ' . __LINE__ . ': crowl_merchant_name record not found for merchant ' . $offer['merchant']);
                         }
                         $crowlMerchantNameID = isset($crowlMerchantName->id) ? $crowlMerchantName->id : 0;
                         $violatedPrice = (double) $crawled_products['mpo'];
                         $dataVio = $this->Products->get_products_by_floor($product->upc_code, $violatedPrice, $product->store_id);
                         if ($dataVio) {
                             $violation = array();
                             $violation['um'] = $hashKey;
                             $violation['dt'] = $crawled_products['dt'];
                             $violation['ss'] = date('Ymd', $violation['dt']) . '/' . md5($hashKey . $violation['dt']) . '.png';
                             $this->Crowl->updateViolationSummary($dataVio);
                             $this->Violator->updatePriceViolator($crowlMerchantNameID, $product->upc_code, 1, $crawled_products['dt']);

                             // Price and keys of the record stored by the previous crawl, if any.
                             $lastCrawlPrice = (double) 0;
                             $lastHashKey = null;
                             $lastRangeKey = null;
                             if ($lastRecordResponse->isOK()) {
                                 // Merchant Price Offered
                                 $lastCrawlPrice = isset($lastRecordResponse->body->Items->mpo->N) ? (double) $lastRecordResponse->body->Items->mpo->N : (double) 0;
                                 $lastHashKey = isset($lastRecordResponse->body->Items->um->S) ? $lastRecordResponse->body->Items->um->S : null;
                                 $lastRangeKey = isset($lastRecordResponse->body->Items->dt->N) ? $lastRecordResponse->body->Items->dt->N : null;
                             }

                             // Check if the price has changed
                             if ($lastCrawlPrice != $violatedPrice) {
                                 if (!isset($uniquArr[$crawled_products['l']])) {
                                     $uniquArr[$crawled_products['l']] = $violation['ss'];
                                     $this->Crowl->addScreenShot($crawled_products['l'], $violation['ss'], false, $violatedPrice);
                                 } else {
                                     // Same URL already queued in this run: reuse that screenshot.
                                     $violation['ss'] = $uniquArr[$crawled_products['l']];
                                 }
                             } else {
                                 // NOTE(review): when no earlier violation record with a
                                 // screenshot is found, no new screenshot is taken although
                                 // $violation['ss'] points at a new file — confirm this is intended.
                                 $takeNewScreenShot = false;
                                 // Get the violation screen shot
                                 if (!is_null($lastHashKey) and !is_null($lastRangeKey)) {
                                     $lastViolationResponse = $dynamo->query(array('TableName' => $this->_dynamo_violations, 'HashKeyValue' => array(AmazonDynamoDB::TYPE_STRING => (string) $lastHashKey), 'RangeKeyCondition' => array('ComparisonOperator' => AmazonDynamoDB::CONDITION_EQUAL, 'AttributeValueList' => array(array(AmazonDynamoDB::TYPE_NUMBER => (string) $lastRangeKey)))));
                                     if ($lastViolationResponse->isOK() and $lastViolationResponse->body->Count == 1) {
                                         if (isset($lastViolationResponse->body->Items->ss->S)) {
                                             // Probe whether the earlier screenshot can still be opened,
                                             // and release the handle right away.
                                             $screenShotHandle = @fopen(get_instance()->config->item('s3_cname') . 'stickyvision/violations/' . $lastViolationResponse->body->Items->ss->S, 'r');
                                             if ($screenShotHandle) {
                                                 fclose($screenShotHandle);
                                                 $violation['ss'] = $lastViolationResponse->body->Items->ss->S;
                                                 $takeNewScreenShot = false;
                                             } else {
                                                 $takeNewScreenShot = true;
                                             }
                                         }
                                     }
                                 }
                                 if ($takeNewScreenShot === true) {
                                     $this->Crowl->addScreenShot($crawled_products['l'], $violation['ss'], false, $violatedPrice);
                                 }
                             }
                             $this->amzdb->insertData($this->_dynamo_violations, $violation, $marketplace);
                         } else {
                             // update price violators for products not in violation
                             $this->Violator->updatePriceViolator($crowlMerchantNameID, $product->upc_code, 0, $crawled_products['dt']);
                         }
                         $this->Crowl->updateUPCFlag($product->upc_code, $marketplace, '1');
                     }
                     //end offers iteration
                     $this->_crawlerClass->reset();
                 } catch (Exception $e) {
                     $exceptionDetail = array('crawler_log_id' => $this->_cron_log_id, 'crawler_name' => $this->_currentCrawler, 'message' => $e->getMessage(), 'created_at' => date('Y-m-d H:i:s'), 'product_id' => $this->_current_product_id);
                     echo 'exception detected..';
                     var_dump($exceptionDetail);
                     $this->CrawlerErrorLog->create_log($exceptionDetail);
                     //send an alert to the team (subject, then message)
                     email_alertToTeam('Run Cron Job - Error Cron Log ID ' . $this->_cron_log_id, $e->getMessage());
                 }
             }
         }
     }

     $this->_end_time = date('Y-m-d H:i:s');
     $this->Log->update_log($this->_cron_log_id, array('end_datetime' => $this->_end_time, 'products_count' => $this->_numfoundproducts, 'offerings_count' => $this->_numofferings));

     //create cron_log -- this is needed to handle code that still thinks we need to use old crawler
     $this->_marketplaces = array_unique($this->_marketplaces);
     foreach ($this->_marketplaces as $marketplace) {
         $domain = trim($marketplace);
         $r = preg_split('/(?=\\.[^.]+$)/', $domain);
         $marketplace = strtolower($r[0]);
         $this->CronLog->create_log(array('datetime' => date('Y-m-d H:i:s'), 'key' => generate_rand(32), 'api_type' => $marketplace, 'start_datetime' => $this->_start_time, 'end_datetime' => $this->_end_time, 'google_count' => 0, 'last_UPC' => $this->_last_upc, 'run_from' => 'cronTab1'));
         $this->Marketplace->add_retailer(strtolower($marketplace), strtolower($domain));
     }
     exit;
 }