/**
 * Run the crawl for one or more APIs and send alerts about the outcome.
 *
 * For each API type this creates a cron_log row, crawls every store via
 * $this->index(), stamps the log row with the finish time, and then compares
 * the google_count of the newest log row with the one that was newest before
 * the crawl, emailing a warning when the count dropped sharply or is very low.
 *
 * NOTE: this method terminates the script with exit and therefore never
 * returns to its caller.
 *
 * @param mixed  $api  One API type or a list of them (normalised through ensure_array()).
 * @param string $from Label stored in the cron log's run_from column.
 */
protected function _run_cron_job($api, $from = 'no')
{
    //if ( ! in_array($from, $this->fromAr)) {
    //    echo "You are not allowed to run this cron";
    //    exit;
    //}

    // Count the active proxies and send a warning when $threshold or fewer
    // are active. One more than the threshold is requested so that
    // "above the threshold" can be told apart from "at the threshold".
    $threshold = 10;
    $proxies = getProxyIPS(TRUE, $threshold + 1);
    if (count($proxies) <= $threshold) {
        $this->email_alert(
            'Low on proxies',
            'The active proxy count has reached ' . $threshold . ' or less as of ' . date('Y-m-d H:i:s')
        );
    }

    $apis = ensure_array($api);
    $storesData = $this->Store->get_stores(TRUE);
    $currentDate = date('Y-m-d H:i:s');
    $this->Crowl_model->stats = array();

    // Cycle through API crons. foreach (rather than an index loop) keeps this
    // working even if ensure_array() hands back non-sequential keys.
    foreach ($apis as $apiType) {
        $this->Crowl_model->stats[$apiType] = array('data_found' => 0, 'price_found' => 0);
        $this->Crowl_model->load_sticky_api($apiType);

        $data = array(
            'api_type' => $apiType,
            'key' => generate_rand(32),
            'run_from' => $from,
            'start_datetime' => $currentDate,
        );

        // Reset per API: if getMaxCronLog() throws below, the comparison after
        // the try block must not see the previous API's log (or an undefined
        // variable on the first iteration).
        $lastLog = null;

        try {
            // Update cron log information
            $lastLog = $this->getMaxCronLog($data['api_type']);
            $lastRun = empty($lastLog) ? '0000-00-00 00:00:00' : $lastLog->datetime;

            // Stamp rows of this API type whose datetime is still the zero date.
            if (!empty($lastLog) && $lastLog->datetime === "0000-00-00 00:00:00") {
                $update = array('datetime' => $currentDate, 'end_datetime' => $currentDate);
                $this->db
                    ->where('datetime', '0000-00-00 00:00:00')
                    ->where('api_type', $data['api_type'])
                    ->update('cron_log', $update);
            }

            if ($lastRun == '0000-00-00 00:00:00' || $currentDate >= $lastRun) {
                $insert_id = $this->create_cron_log($data);
                $this->resetViolationCount();

                // Crawl the products for each store
                foreach ($storesData as $store) {
                    $this->index($store->id, $data['api_type'], $insert_id);
                }

                $finished = date("Y-m-d H:i:s");
                $data_edit = array('datetime' => $finished, 'end_datetime' => $finished);
                $this->update_cron_log($insert_id, $data_edit);
            }
        } catch (Exception $e) {
            email_alertToTeam('Run Cron Job ' . ucwords($apiType), $e->getMessage());
        }

        // Now that the crawl has finished,
        // compare these crawl numbers with the last
        // and send out notifications if needed.
        $currentLog = $this->getMaxCronLog($data['api_type']);
        $lastCount = $lastLog ? $lastLog->google_count : 0;
        // A missing log row counts as zero crawled products, which still
        // triggers the "less than $smallThresh" warning below.
        $currentCount = $currentLog ? $currentLog->google_count : 0;

        $percentThresh = 25; // send a notification if completed this percent of the last crawl
        $smallThresh = 10;   // send a notification if complete less than this many products
        $percentDrop = 100 - $percentThresh;

        $msg = '';
        if ($lastCount > 0 and $currentCount * 100 / $lastCount <= $percentThresh) {
            $msg = ' crawl completed ' . $percentDrop . '% less products than its previous crawl.';
        } elseif ($currentCount < $smallThresh) {
            $msg = ' crawl completed less than ' . $smallThresh . ' products.';
        }

        if (!empty($msg)) {
            $msg_txt = 'Crawl Warning: ' . $data['api_type'] . $msg;
            $msg_html = 'Crawl Warning: ' . '<b>' . $data['api_type'] . '</b>' . $msg;
            log_message('error', $msg_txt);
            send_email($this->config->item('alerts'), 'TrackStreet Errors', 'TrackStreet Crawl Warning', $msg_html, $msg_txt);
        }
    }

    $this->_checkNotFoundRatio();

    // todo: should we not return to the main script here?
    exit;
}
/**
 * Scrape an offer-listing page and return one row per offer that has a price.
 *
 * The page is fetched with $this->_scrape_page() up to 10 times; if none of
 * those attempts yields a usable page, file_get_html() is tried up to 10 more
 * times. A page is usable when it parses and contains div#olpOfferList with at
 * least one div.olpOffer. A page that has the offer list but no offers ends
 * the whole call immediately with an empty result.
 *
 * @param string     $url   Page to scrape.
 * @param int        $order Last list position already used; each returned row
 *                          gets the next value as 'ListOrder'.
 * @param string|int $upc   UPC used only in alert and log messages.
 *
 * @return array List of arrays with keys price, shipping, status, SellerID,
 *               SellerName, SellerImage, condition, FBRank, ListOrder.
 */
function parsingData($url, $order = 0, $upc = 0)
{
    $items = array();
    $maxAttempts = 10;
    $html = null;

    // Pass 1 fetches through _scrape_page(), pass 2 falls back to file_get_html().
    // Both passes are bounded so a page that never loads cannot loop forever.
    foreach (array(false, true) as $useFileGetHtml) {
        for ($try = 1; $try <= $maxAttempts; $try++) {
            $page = $this->_parsingDataFetch($url, $upc, $try, $useFileGetHtml);

            if ($page === false) {
                // Valid page without any offers: nothing to parse, nothing to retry.
                return $items;
            }

            if ($page !== null) {
                $html = $page;
                break 2;
            }
        }

        $label = $useFileGetHtml ? ' file_get_html()' : '';
        log_message('error', "Unable to scrape UPC {$upc} after {$maxAttempts}{$label} attempts\n{$url}");
    }

    if (empty($html)) {
        email_alertToTeam(
            __CLASS__ . '::' . __FUNCTION__ . '() Alert',
            'HTML not found for UPC ' . $upc . ' at ' . $url,
            1,
            var_export($html, true)
        );

        return $items;
    }

    foreach ($html->find('div[class=olpOffer]') as $offer) {
        $row = $this->_parsingDataOfferRow($offer);
        if ($row === null) {
            continue; // offers without a price element are skipped
        }

        $row['ListOrder'] = ++$order;
        $items[] = $row;
    }

    $html->clear();
    unset($html);

    return $items;
}

/**
 * Make one attempt at loading and validating the offer-listing page.
 *
 * @param string     $url            Page to load.
 * @param string|int $upc            UPC used in alert messages.
 * @param int        $try            1-based attempt number, used in alert messages.
 * @param bool       $useFileGetHtml Fetch with file_get_html() instead of _scrape_page().
 *
 * @return object|null|false The parsed DOM on success, null when the attempt
 *                           failed and may be retried, false when the page is
 *                           valid but lists no offers.
 */
protected function _parsingDataFetch($url, $upc, $try, $useFileGetHtml)
{
    $subject = __CLASS__ . '::parsingData() Alert';
    $where = ' for UPC ' . $upc . ' at ' . $url;

    if ($useFileGetHtml) {
        $prefix = "file_get_html() try {$try} - ";
        $dom = file_get_html($url);
        if (!$dom) {
            return null;
        }
        // Keep only the markup so it can be cleaned and re-parsed like the
        // primary path, and release the intermediate DOM straight away.
        $body = (string) $dom;
        $dom->clear();
        unset($dom);
    } else {
        $prefix = "Try {$try} - ";
        $body = $this->_scrape_page($url);
        if (!$body) {
            return null;
        }
    }

    // Strip non-printable characters before parsing.
    $body = preg_replace('/[^[:print:]]/', '', $body);
    $html = str_get_html($body);

    if (!$html) {
        // There is no DOM object here, so there is nothing to clear().
        email_alertToTeam($subject, $prefix . 'unable to find html = str_get_html()' . $where, 1, $html);

        return null;
    }

    //don't assume we have what we need...
    $results = $html->find('div[id=olpOfferList]', 0);
    if (!$results) {
        email_alertToTeam($subject, $prefix . 'div[id=olpOfferList] not found' . $where, 1, $html);
        $html->clear();

        return null;
    }

    if (!$html->find('div[class=olpOffer]')) {
        //this should be okay because we landed on a page that is valid, but contains no table
        email_alertToTeam($subject, $prefix . 'div[class=olpOffer] not found in' . $where, 1, $results);
        $html->clear();

        return false;
    }

    return $html;
}

/**
 * Turn one div.olpOffer node into a result row (without 'ListOrder').
 *
 * @param object $offer DOM node of a single offer.
 *
 * @return array|null The row, or null when the offer has no price element.
 */
protected function _parsingDataOfferRow($offer)
{
    $price = $offer->find('span[class=olpOfferPrice]', 0);
    if (!$price) {
        return null;
    }

    $condition = $seller_name = $seller_image = $sellerId = $seller_rank = '';

    $shipping = $offer->find('p span[class=olpShippingPrice]', 0);
    $ship = $shipping ? ltrim(trim($shipping->plaintext), '$') : '0.00';

    // The seller is shown either as bold text or as a logo image.
    if ($seller = $offer->find('p[class=olpSellerName] a b', 0)) {
        $seller_name = $seller->plaintext;
    } elseif ($imgSeller = $offer->find('p[class=olpSellerName] a img', 0)) {
        $seller_name = $imgSeller->alt;
        $seller_image = $imgSeller->src;
    } elseif ($imgSeller = $offer->find('p[class=olpSellerName] img', 0)) {
        $seller_name = $imgSeller->alt;
        $seller_image = $imgSeller->src;
    }

    if (!empty($seller_name)) {
        $seller_name = clearnSellerName($seller_name);
    }

    if (strtolower($seller_name) == 'amazon.com' || strtolower($seller_name) == 'amazon') {
        $sellerId = 'Amazon.com';
    }

    if ($rank = $offer->find('p span[class=olpSellerRating] a b', 0)) {
        $seller_rank = substr($rank->plaintext, 0, 4);
    }

    $link = $offer->find('p[class=olpSellerName] a', 0);
    if ($seller_name != 'Amazon.com' && isset($link->href)) {
        // Two link shapes are known:
        //w/o image: http://www.amazon.com/gp/aag/main/ref=olp_merch_name_1?ie=UTF8&asin=B001B4TS7U&isAmazonFulfilled=0&seller=A3QNLMW248O1LJ
        //w/ image: http://www.amazon.com/shops/A1KTH924LYM2YH/ref=olp_merch_name_2
        $linkParts = parse_url($link->href);

        if (isset($linkParts['path']) && strpos($linkParts['path'], 'shops') !== FALSE) {
            $path = explode('/', $linkParts['path']);
            $sellerId = isset($path[2]) ? $path[2] : '';
        } elseif (isset($linkParts['query']) && strpos($linkParts['query'], 'seller') !== FALSE) {
            foreach (explode('&', $linkParts['query']) as $param) {
                $pair = explode('=', $param);
                if ($pair[0] == 'seller') {
                    $sellerId = isset($pair[1]) ? $pair[1] : '';
                    break;
                }
            }
        }

        $sellerId = clearnSellerId($sellerId);
    } elseif ($seller_name == 'Amazon.com') {
        //not sure...
        $sellerId = 'Amazon.com';
    } else {
        log_message('error', "Unable to find seller link\n" . var_export($seller_name, true));
    }

    // The status comes from the heading of the offer's parent container.
    $parent = $offer->parent();
    $h2 = $parent ? $parent->find('h2', 0) : null;
    $isSecondary = $h2 && strpos($h2->innertext, 'New') !== FALSE;

    if ($cond = $offer->find('h3[class=olpCondition]', 0)) {
        $condition = trim($cond->plaintext);
    }

    return array(
        'price' => ltrim(trim($price->plaintext), '$'),
        'shipping' => $ship,
        'status' => $isSecondary ? 'Secondary' : 'Featured',
        'SellerID' => $sellerId,
        'SellerName' => $seller_name,
        'SellerImage' => $seller_image,
        'condition' => $condition,
        'FBRank' => trim($seller_rank),
    );
}
/**
 * Look up (or create) the merchant-name record for an offer and record that
 * the merchant was seen for the given UPC.
 *
 * For the 'google' marketplace the record is matched on merchant name; for
 * every other marketplace it is matched on seller id. The matching record is
 * updated, or a new one inserted, and then the product-list row for
 * (merchant, upc, marketplace) gets its last_date set to the current time,
 * being inserted first if it does not exist.
 *
 * NOTE(review): the queries below are assembled by string interpolation from
 * values that appear to come from scraped pages. A quote character in any of
 * them breaks the SQL (and allows injection). They should be escaped or bound
 * through whatever mechanism $this->myDB offers; that API is not visible from
 * this block, so the queries are left as they were.
 *
 * @param string $merchant_name Cleaned merchant name.
 * @param string $upc           Product UPC.
 * @param string $marketplace   Marketplace identifier.
 * @param string $real_name     Stored as 'original_name' on newly created records.
 * @param string $seller_id     Marketplace seller id.
 * @param string $merchant_url  Optional merchant URL stored on the record.
 *
 * @return string|false The stored merchant name, or false when the merchant
 *                      name or seller id is missing.
 */
public function getMerchantNameForKeyOld($merchant_name, $upc, $marketplace = '', $real_name = '', $seller_id = '', $merchant_url = '')
{
    if (trim($merchant_name) == '' || $seller_id == '') {
        email_alertToTeam(
            'Seller ID',
            'Seller ID not found against UPC No(' . $merchant_name . ',' . $seller_id . ').: ' . $upc . ' in Marketplace: ' . $marketplace,
            1
        );

        return false;
    }

    if ($marketplace == 'google') {
        $qStr = "select * from " . $this->_table_crowl_merchant_name . " where merchant_name = '{$merchant_name}' and marketplace = '{$marketplace}'";
    } else {
        $qStr = "select * from " . $this->_table_crowl_merchant_name . " where seller_id = '{$seller_id}' and marketplace = '{$marketplace}'";
    }

    $row = $this->myDB->select($qStr);

    if (is_array($row) && count($row) > 0) {
        $pid = $row[0]['id'];
        $ret = $row[0]['merchant_name'];

        $update = array('seller_id' => $seller_id);
        if (!empty($merchant_url)) {
            $update['merchant_url'] = $merchant_url;
        }
        // The update must not depend on a merchant URL being supplied,
        // otherwise the seller id is never refreshed for URL-less offers.
        $this->myDB->where('id', $pid)->update($this->_table_crowl_merchant_name, $update);
    } else {
        $insert = array(
            'merchant_name' => $merchant_name,
            'original_name' => $real_name,
            'seller_id' => $seller_id,
            'created' => date('Y-m-d H:i:s'),
            'marketplace' => $marketplace,
        );
        if (!empty($merchant_url)) {
            $insert['merchant_url'] = $merchant_url;
        }
        // Always create the record: $pid is required by the queries below
        // whether or not a merchant URL was supplied.
        $this->myDB->insert($this->_table_crowl_merchant_name, $insert);
        $pid = $this->myDB->insertid();
        $ret = $merchant_name;
    }

    $qStr = "select * from `" . $this->_table_crowl_product_list . "`\n where\n merchant_name_id = '{$pid}' and\n upc = '{$upc}' and\n marketplace = '{$marketplace}'\n limit 1";
    $rs = $this->myDB->select($qStr);

    if (is_array($rs) && count($rs) > 0) {
        $id = $rs[0]['id'];
        $qStr = "update " . $this->_table_crowl_product_list . " set last_date = " . time() . " where id = " . $id;
    } else {
        $qStr = "insert into " . $this->_table_crowl_product_list . "\n set\n upc = '{$upc}',\n last_date = " . time() . ",\n marketplace = '{$marketplace}',\n merchant_name_id = '{$pid}'\n ";
    }

    $this->myDB->simpleQuery($qStr);

    return $ret;
}
/**
 * Run every configured crawler over all tracked products.
 *
 * For each offer a crawler returns, the offer is validated, written to the
 * product-trends table, checked against the price floor via
 * get_products_by_floor(), and recorded as a violation (with a screenshot
 * reference) when that check returns data. Exceptions raised while processing
 * a product are logged and emailed, and processing moves on to the next
 * product. Afterwards the crawler log is closed and one cron_log row per
 * marketplace seen is created.
 *
 * NOTE: this method terminates the script with exit and therefore never
 * returns to its caller.
 *
 * @throws Exception When a configured crawler's file or class cannot be found.
 */
public function run()
{
    $this->_start_time = date('Y-m-d H:i:s');
    $this->_cron_log_id = $this->Log->create_log($this->_start_time);

    //need to get all products
    $productCount = $this->Products->get_all_tracked_products_count();
    $offset = 0;

    //queue up products -- so we don't crash the machine with a lot of products
    // NOTE(review): $offset is advanced but never handed to
    // get_all_tracked_products(), so every pass appears to fetch the same set
    // and merge it in again. Confirm that method's signature and pass the
    // offset/limit if it supports paging.
    do {
        $products = $this->Products->get_all_tracked_products();
        $this->_products = array_merge($products, $this->_products);
        $this->_numproducts += count($products);
        $offset += 500;
    } while ($offset < $productCount);

    // Maps an offer URL to the screenshot path already requested for it
    // during this run, so the same page is not queued twice.
    $uniquArr = array();

    if (count($this->_crawlers) > 0) {
        foreach ($this->_crawlers as $crawler) {
            $crawlerName = strtolower(trim($crawler['name']));
            $filePath = FCPATH . 'system/application/crawlers/' . $crawlerName . '.php';

            //check to see if crawler exists
            if (@file_exists($filePath) === false) {
                throw new Exception('The crawler ' . $crawlerName . ' file does not exist. Please add it.');
            }

            require_once $filePath;

            if (class_exists($crawlerName) === false) {
                throw new Exception('The crawler ' . $crawlerName . " crawler class is not available. \nPlease check that its defined within the file");
            }

            $this->_currentCrawler = $crawlerName;
            $this->_crawlerClass = new $crawlerName($crawler);

            // Remember the UPC of the last queued product. This is resolved
            // once, up front: calling end() on the array while it is being
            // iterated moves its internal pointer, which disturbs the loop
            // on PHP 5.
            $lastProduct = end($this->_products);
            if ($lastProduct !== false) {
                $this->_last_upc = $lastProduct->upc_code;
            }

            foreach ($this->_products as $product) {
                try {
                    $this->_current_product_id = $product->id;
                    $this->_crawlerClass->setIdentifier($product->upc_code);
                    $productData = $this->_crawlerClass->getProduct();
                    $offers = $this->_crawlerClass->getAllOffers();
                    $this->_numfoundproducts++;

                    foreach ($offers as $offer) {
                        $this->_numofferings++;
                        $crawled_products = array();

                        if (empty($offer['merchant'])) {
                            throw new Exception('Merchant not defined.');
                        }

                        $marketplace = $offer['marketplace'];
                        if (empty($marketplace)) {
                            throw new Exception('Marketplace not defined');
                        }

                        // Drop the last dot-separated segment, e.g. the TLD of a domain.
                        $r = preg_split('/(?=\\.[^.]+$)/', $marketplace);
                        $marketplace = strtolower($r[0]);

                        $merchant = $this->Crowl->getMerchantNameForKey(
                            $offer['merchant'],
                            $product->upc_code,
                            $marketplace,
                            $offer['merchant'],
                            $offer['merchant'],
                            extractDomainByURL($offer['url'])
                        );
                        $this->_marketplaces[] = $offer['marketplace'];

                        if (empty($offer['price_floor']) || !is_numeric($offer['price_floor'])) {
                            throw new Exception('Price floor is undefined');
                        }
                        $crawled_products['ap'] = $offer['price_floor'];

                        $title = $productData->get("title");
                        if (empty($title)) {
                            throw new Exception('Title is undefined');
                        }
                        $crawled_products['t'] = $title;
                        $crawled_products['ar'] = $marketplace;
                        $crawled_products['il'] = null;

                        if (empty($offer['url'])) {
                            throw new Exception('Url is undefined');
                        }
                        $crawled_products['l'] = $offer['url'];
                        $crawled_products['mu'] = isset($offer['merchant_url']) ? $offer['merchant_url'] : null;

                        if (empty($offer['price']) || !is_numeric($offer['price'])) {
                            throw new Exception('Price is undefined');
                        }
                        $crawled_products['mpo'] = $offer['price'];

                        $hashKey = $marketplace . '#' . $product->upc_code;
                        $crawled_products['um'] = $hashKey;
                        $crawled_products['dt'] = time();

                        // Get the price from the last crawl
                        $dynamo = new AmazonDynamoDB();
                        $lastRecordResponse = $dynamo->query(array(
                            'TableName' => $this->_dynamo_products_trends,
                            'HashKeyValue' => array(AmazonDynamoDB::TYPE_STRING => $crawled_products['um']),
                            'ConsistentRead' => true,
                            'Limit' => 1,
                            'ScanIndexForward' => false,
                        ));

                        // Diagnostic output of the row about to be stored.
                        var_dump($crawled_products);

                        $insert_response = $this->amzdb->insertData($this->_dynamo_products_trends, $crawled_products, $marketplace);
                        if (isset($insert_response->status) && $insert_response->status == 200) {
                            $this->Crowl->insertUPCMerchant($merchant, $product->upc_code, $marketplace, $crawled_products['mpo']);
                        }

                        $crowlMerchantName = $this->Crawl_data->crowlMerchantByMerchantName($merchant);
                        if (empty($crowlMerchantName)) {
                            log_message('error', __FILE__ . ' Crowl_m::amazon_lookup() Line ' . __LINE__ . ': crowl_merchant_name record not found for merchant ' . $offer['merchant']);
                        }
                        $crowlMerchantNameID = isset($crowlMerchantName->id) ? $crowlMerchantName->id : 0;

                        $violatedPrice = (double) $crawled_products['mpo'];
                        $dataVio = $this->Products->get_products_by_floor($product->upc_code, $violatedPrice, $product->store_id);

                        if ($dataVio) {
                            //$price_floor > $crowled_products['mpo'])
                            $violation = array();
                            $violation['um'] = $hashKey;
                            $violation['dt'] = $crawled_products['dt']; //time();
                            $violation['ss'] = date('Ymd', $violation['dt']) . '/' . md5($hashKey . $violation['dt']) . '.png';

                            $this->Crowl->updateViolationSummary($dataVio);
                            $this->Violator->updatePriceViolator($crowlMerchantNameID, $product->upc_code, 1, $crawled_products['dt']);

                            // Keys and price of the previous trend record, if there is one.
                            $lastCrawlPrice = (double) 0;
                            $hashKey = null;
                            $rangeKey = null;
                            if ($lastRecordResponse->isOK()) {
                                $mpo = isset($lastRecordResponse->body->Items->mpo->N) ? (double) $lastRecordResponse->body->Items->mpo->N : (double) 0; // Merchant Price Offered
                                $lastCrawlPrice = $mpo;
                                $hashKey = isset($lastRecordResponse->body->Items->um->S) ? $lastRecordResponse->body->Items->um->S : null;
                                $rangeKey = isset($lastRecordResponse->body->Items->dt->N) ? $lastRecordResponse->body->Items->dt->N : null;
                            }

                            // Check if the price has changed
                            if ($lastCrawlPrice != $violatedPrice) {
                                if (!isset($uniquArr[$crawled_products['l']])) {
                                    $uniquArr[$crawled_products['l']] = $violation['ss'];
                                    $this->Crowl->addScreenShot($crawled_products['l'], $violation['ss'], false, $violatedPrice);
                                } else {
                                    $violation['ss'] = $uniquArr[$crawled_products['l']];
                                }
                            } else {
                                $takeNewScreenShot = false;

                                // Get the violation screen shot
                                if (!is_null($hashKey) and !is_null($rangeKey)) {
                                    $lastViolationResponse = $dynamo->query(array(
                                        'TableName' => $this->_dynamo_violations,
                                        'HashKeyValue' => array(AmazonDynamoDB::TYPE_STRING => (string) $hashKey),
                                        'RangeKeyCondition' => array(
                                            'ComparisonOperator' => AmazonDynamoDB::CONDITION_EQUAL,
                                            'AttributeValueList' => array(array(AmazonDynamoDB::TYPE_NUMBER => (string) $rangeKey)),
                                        ),
                                    ));

                                    if ($lastViolationResponse->isOK() and $lastViolationResponse->body->Count == 1) {
                                        if (isset($lastViolationResponse->body->Items->ss->S)) {
                                            // Probe whether the earlier screenshot can still be
                                            // opened; the handle is only needed for the check
                                            // and is closed right away.
                                            $probe = @fopen(get_instance()->config->item('s3_cname') . 'stickyvision/violations/' . $lastViolationResponse->body->Items->ss->S, 'r');
                                            if ($probe) {
                                                fclose($probe);
                                                $violation['ss'] = $lastViolationResponse->body->Items->ss->S;
                                                $takeNewScreenShot = false;
                                            } else {
                                                $takeNewScreenShot = true;
                                            }
                                        }
                                    }
                                }

                                if ($takeNewScreenShot === true) {
                                    $this->Crowl->addScreenShot($crawled_products['l'], $violation['ss'], false, $violatedPrice);
                                }
                            }

                            $this->amzdb->insertData($this->_dynamo_violations, $violation, $marketplace);
                        } else {
                            // update price violators for products not in violation
                            $this->Violator->updatePriceViolator($crowlMerchantNameID, $product->upc_code, 0, $crawled_products['dt']);
                        }

                        $this->Crowl->updateUPCFlag($product->upc_code, $marketplace, '1');
                    } //end offers iteration

                    $this->_crawlerClass->reset();
                } catch (Exception $e) {
                    $exceptionDetail = array(
                        'crawler_log_id' => $this->_cron_log_id,
                        'crawler_name' => $this->_currentCrawler,
                        'message' => $e->getMessage(),
                        'created_at' => date('Y-m-d H:i:s'),
                        'product_id' => $this->_current_product_id,
                    );

                    // Diagnostic output of the failure details.
                    echo 'exception detected..';
                    var_dump($exceptionDetail);

                    $this->CrawlerErrorLog->create_log($exceptionDetail);

                    //send an alert to the team
                    email_alertToTeam('Run Cron Job - Error Cron Log ID ' . $this->_cron_log_id . ' : ' . $e->getMessage());
                }
            }
        }
    }

    $this->_end_time = date('Y-m-d H:i:s');
    $this->Log->update_log($this->_cron_log_id, array(
        'end_datetime' => $this->_end_time,
        'products_count' => $this->_numfoundproducts,
        'offerings_count' => $this->_numofferings,
    ));

    //create cron_log -- this is needed to handle code that still thinks we need to use old crawler
    $this->_marketplaces = array_unique($this->_marketplaces);
    foreach ($this->_marketplaces as $marketplace) {
        $domain = trim($marketplace);
        $r = preg_split('/(?=\\.[^.]+$)/', $domain);
        $marketplace = strtolower($r[0]);

        $cron_log_id = $this->CronLog->create_log(array(
            'datetime' => date('Y-m-d H:i:s'),
            'key' => generate_rand(32),
            'api_type' => $marketplace,
            'start_datetime' => $this->_start_time,
            'end_datetime' => $this->_end_time,
            'google_count' => 0,
            'last_UPC' => $this->_last_upc,
            'run_from' => 'cronTab1',
        ));

        $this->Marketplace->add_retailer(strtolower($marketplace), strtolower($domain));
    }

    exit;
}