Exemple #1
0
 /**
  * Ask a keyword server to crawl a single product URL and register the job.
  *
  * POSTs the product URL together with its crawler name to
  * http://{server}:6543/ranking_data/ and stores the exchange in `curl_logs`.
  * When the resulting HTTP status matches a 2xx/3xx code, a
  * `keyword_cron_jobs` row and its first `keyword_cron_jobs_log` entry are
  * inserted as well.
  *
  * @param object $product product; its `id` and `url` properties are read
  * @param string $server keyword server (used as the host part of the request URL)
  * @return bool true when a cron job was created, false otherwise
  * @author Stom
  */
 protected function create_product_crawl_request($product, $server)
 {
     // The host without a leading "www." selects the crawler for this URL.
     $host = parse_url(strtolower($product->url), PHP_URL_HOST);
     $host = preg_replace("/^www\\./", '', $host);
     if (empty($product->id) || empty($host)) {
         return false;
     }

     $crawler = Ranking_model::getCrawlerName($host);
     $endpoint = "http://{$server}:6543/ranking_data/";
     $payload = array(
         'site' => $crawler,
         'product_url' => $product->url,
     );

     $curl = curl_init();
     curl_setopt($curl, CURLOPT_URL, $endpoint);
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($curl, CURLOPT_POST, true);
     curl_setopt($curl, CURLOPT_POSTFIELDS, $payload);
     // Follow at most one redirect.
     curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
     curl_setopt($curl, CURLOPT_MAXREDIRS, 1);
     $response = curl_exec($curl);
     $http_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);

     // Every request is logged, whatever its outcome.
     $curl_log_id = $this->ranking_model->create_('curl_logs', array(
         'type' => 'keyword_cron_job',
         'method' => 'POST',
         'url' => $endpoint,
         'params' => json_encode($payload),
         'http_code' => $http_code,
         'curl_info' => json_encode(curl_getinfo($curl)),
         'result' => $response,
     ));

     // Anything but a 2xx / 3xx code (usually 202 or 302) means no job.
     if (!preg_match('/[23]\\d\\d/', $http_code)) {
         curl_close($curl);
         return false;
     }

     // The request id is the second-to-last "/"-separated piece of the
     // effective URL.
     $effective_url = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
     $pieces = explode('/', $effective_url);
     $request_id = $pieces[count($pieces) - 2];

     // Prefer the pending redirect target, fall back to the effective URL.
     $redirect_url = curl_getinfo($curl, CURLINFO_REDIRECT_URL);
     $job_url = $redirect_url ?: $effective_url;

     // Create the cron job ...
     $job_id = $this->ranking_model->create_('keyword_cron_jobs', array(
         'url' => $job_url,
         'request_id' => $request_id,
         'status' => $this->ranking_model->CRON_STATUS_WAITING,
         'product_list_item_id' => $product->id,
     ));

     // ... and its initial log entry, linked to the curl log written above.
     $log_payload = array(
         'id' => $request_id,
         'crawler_name' => $crawler,
         'product_url' => $product->url,
         'status' => "STATUS REQUESTED ({$http_code})",
     );
     $this->ranking_model->create_('keyword_cron_jobs_log', array(
         'job_id' => $job_id,
         'message' => json_encode($log_payload),
         'child_job_id' => $request_id,
         'job_phase' => $this->ranking_model->JOB_PHASE_REQUEST,
         'job_phase_status' => 'true',
         'curl_log_id' => $curl_log_id,
     ));

     curl_close($curl);
     return true;
 }
 /**
  * Create ranking-data crawl jobs and push pending ones to the SQS queue.
  *
  * Steps, in order:
  *  1. insert a job for every search term returned by getSearchTermsToBeCrawled();
  *  2. insert a job for every product returned by getProductsToBeCrawled();
  *  3. re-insert failed jobs that still qualify for another attempt;
  *  4. fetch jobs via getJobsToQueue(self::JOBS_TO_QUEUE), send them with
  *     sendMessageBatch in chunks of self::SQS_MAX_MESSAGES and record the
  *     outcome of every sent job.
  *
  * Echoes 'ranking data jobs disabled' and returns early when
  * _check_setting() is falsy; otherwise echoes 'ok' when finished. The number
  * of queued messages and the elapsed time are written to the logger entry
  * created at the start of the run.
  *
  * @return void
  */
 public function create_jobs()
 {
     if (!$this->_check_setting()) {
         echo 'ranking data jobs disabled';
         return;
     }
     // NOTE(review): $this->ranking_data_model and $this->aws are used below
     // but not loaded here - presumably loaded elsewhere; confirm.
     $this->load->model('ranking_model');
     $this->load->library('logger');
     $start_time = microtime(true);
     $log_message = array('number_of_messages_queued' => 0, 'time' => 0);
     $log_id = $this->logger->info('ranking_data_create_jobs', json_encode($log_message));

     // create 'search term' jobs (rows missing any of the three fields are skipped)
     $search_terms_to_crawl = $this->ranking_data_model->getSearchTermsToBeCrawled();
     $ranking_data_jobs_batch = array();
     foreach ($search_terms_to_crawl as $search_term_to_crawl) {
         if (!empty($search_term_to_crawl->search_terms_group_id) && !empty($search_term_to_crawl->search_term) && !empty($search_term_to_crawl->site_id)) {
             $ranking_data_jobs_batch[] = array(
                 'search_term_group_id' => $search_term_to_crawl->search_terms_group_id,
                 'search_term' => $search_term_to_crawl->search_term,
                 'site_id' => $search_term_to_crawl->site_id,
             );
         }
     }
     if (!empty($ranking_data_jobs_batch)) {
         $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
     }

     // create 'product' jobs
     $products_to_crawl = $this->ranking_data_model->getProductsToBeCrawled();
     $ranking_data_jobs_batch = array();
     foreach ($products_to_crawl as $product) {
         $ranking_data_jobs_batch[] = array('product_list_item_id' => $product->id);
     }
     if (!empty($ranking_data_jobs_batch)) {
         $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
     }

     // re-create failed jobs
     $failed_jobs_res = $this->ranking_data_model->getFailedJobs();
     $failed_job_key_fields = array_flip(array('search_term_group_id', 'search_term', 'site_id', 'product_list_item_id'));
     $failed_jobs = array();
     foreach ($failed_jobs_res as $failed_job_res) {
         // de-duplicate by "search_term_group_id|search_term|site_id|product_list_item_id";
         // the last row seen for a key wins
         $failed_job_key = implode("|", array_intersect_key((array) $failed_job_res, $failed_job_key_fields));
         $failed_jobs[$failed_job_key] = $failed_job_res;
     }
     if (!empty($failed_jobs)) {
         $ranking_data_jobs_batch = array();
         foreach ($failed_jobs as $failed_job) {
             $failed_jobs_same_params = array(
                 'search_term_group_id' => $failed_job->search_term_group_id,
                 'search_term' => $failed_job->search_term,
                 'site_id' => $failed_job->site_id,
                 'product_list_item_id' => $failed_job->product_list_item_id,
             );
             $failed_jobs_same = $this->ranking_data_model->getFailedJobSame($failed_jobs_same_params);
             // Fold over all jobs with the same parameters: lowest and highest
             // status, and whether every one of them has age_1_hour === 't'.
             // $age_1_hour stays null (falsy) when there are no such jobs.
             $max_status = Ranking_data_model::STATUS_CREATED;
             $min_status = Ranking_data_model::STATUS_RESULTS_IMPORTED;
             $age_1_hour = null;
             foreach ($failed_jobs_same as $failed_job_same) {
                 $max_status = max($max_status, $failed_job_same->status);
                 $min_status = min($min_status, $failed_job_same->status);
                 $age_1_hour = ($age_1_hour === null || $age_1_hour) && $failed_job_same->age_1_hour === 't';
             }
             // Retry only when every attempt got past STATUS_CREATED, none
             // reached STATUS_DONE, the attempt limit is not exhausted and
             // all attempts carry the age_1_hour flag.
             if ($min_status > Ranking_data_model::STATUS_CREATED && $max_status < Ranking_data_model::STATUS_DONE && count($failed_jobs_same) < self::FAILED_JOB_ATTEMPTS && $age_1_hour) {
                 $ranking_data_jobs_batch[] = $failed_jobs_same_params;
             }
         }
         if (!empty($ranking_data_jobs_batch)) {
             $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
         }
     }

     // queue jobs
     $jobs_to_queue = $this->ranking_data_model->getJobsToQueue(self::JOBS_TO_QUEUE);
     // get jobs' ids (falsy ids dropped, duplicates removed)
     $jobs_ids = array_values(array_unique(array_filter(array_map(function ($job) {
         return $job->id;
     }, $jobs_to_queue)), SORT_NUMERIC));
     // get corresponding sites, indexed by id
     $sites_ids = array_values(array_unique(array_filter(array_map(function ($job) {
         return $job->site_id;
     }, $jobs_to_queue)), SORT_NUMERIC));
     $sites = array();
     if (!empty($sites_ids)) {
         foreach ($this->ranking_model->get_('sites', array('id' => $sites_ids)) as $site) {
             $sites[$site->id] = $site;
         }
     }
     // get corresponding product list items, indexed by id
     $product_list_item_ids = array_values(array_unique(array_filter(array_map(function ($job) {
         return $job->product_list_item_id;
     }, $jobs_to_queue)), SORT_NUMERIC));
     $product_list_items = array();
     if (!empty($product_list_item_ids)) {
         foreach ($this->ranking_data_model->get_('product_list_items', array('id' => $product_list_item_ids)) as $product_list_item) {
             $product_list_items[$product_list_item->id] = $product_list_item;
         }
     }

     // build one queue message per job; a job is either a search-term job
     // (needs a known site) or a product job (needs a known product list item)
     $entries = array();
     foreach ($jobs_to_queue as $job_to_queue) {
         $message = array('server_name' => $this->_get_server_name(), 'task_id' => $job_to_queue->id);
         if (!empty($job_to_queue->search_term) && !empty($job_to_queue->site_id) && !empty($sites[$job_to_queue->site_id])) {
             $job_site = $sites[$job_to_queue->site_id];
             $message['searchterms_str'] = $job_to_queue->search_term;
             $message['site'] = Ranking_model::getCrawlerName($job_site->name);
             $message['cmd_args'] = array('quantity' => 1000);
         } elseif (!empty($job_to_queue->product_list_item_id) && !empty($product_list_items[$job_to_queue->product_list_item_id])) {
             $job_product = $product_list_items[$job_to_queue->product_list_item_id];
             $product_url_host = preg_replace("/^www\\./", '', parse_url(strtolower($job_product->url), PHP_URL_HOST));
             $message['site'] = Ranking_model::getCrawlerName($product_url_host);
             $message['url'] = $job_product->url;
         }
         // jobs for which neither a search term nor a URL could be resolved are not sent
         if (!(empty($message['searchterms_str']) && empty($message['url']))) {
             $entries[] = array('Id' => $job_to_queue->id, 'MessageBody' => json_encode($message));
         }
     }

     if (!empty($entries)) {
         foreach (array_chunk($entries, self::SQS_MAX_MESSAGES) as $entries_sub) {
             $sqs_call_args = array('QueueUrl' => $this->_get_queue_url(), 'Entries' => $entries_sub);
             // presumably filled in by call() (by reference) - confirm against the aws wrapper
             $aws_log_id = null;
             $sqs_result = $this->aws->sqs->call('sendMessageBatch', $sqs_call_args, $this->_get_log_callback(), $aws_log_id);

             // ids of this chunk that were confirmed as queued
             $queued_ids = array();
             if (!empty($sqs_result['Successful'])) {
                 foreach ($sqs_result['Successful'] as $job_queue_result) {
                     // NOTE(review): loose in_array kept on purpose - the returned Id
                     // may be a string while $jobs_ids may hold integers; confirm
                     // before switching to a strict comparison.
                     if (!empty($job_queue_result['Id']) && in_array($job_queue_result['Id'], $jobs_ids)) {
                         // change ranking_data_jobs status
                         $update_data = array('status' => Ranking_data_model::STATUS_QUEUED, 'updated_at' => 'now()');
                         $where_data = array('id' => $job_queue_result['Id']);
                         $this->ranking_data_model->update_('ranking_data_jobs', $update_data, $where_data);
                         // create ranking_data_job_logs
                         $ranking_data_job_logs_data = array_filter(array('ranking_data_job_id' => $job_queue_result['Id'], 'phase' => Ranking_data_model::STATUS_QUEUED, 'aws_log_id' => $aws_log_id));
                         $this->ranking_data_model->create_('ranking_data_job_logs', $ranking_data_job_logs_data);
                         $log_message['number_of_messages_queued']++;
                         $queued_ids[(string) $job_queue_result['Id']] = true;
                     }
                 }
             }

             // Record a failure for every entry of THIS chunk that was not
             // confirmed. Only the jobs actually sent in the failing request
             // are touched - jobs from other chunks (which may have been
             // queued successfully) and jobs that were never sent are left
             // alone. This also covers partially failed batches.
             foreach ($entries_sub as $entry) {
                 if (empty($entry['Id']) || isset($queued_ids[(string) $entry['Id']])) {
                     continue;
                 }
                 // touch ranking_data_jobs without changing its status
                 $update_data = array('updated_at' => 'now()');
                 $where_data = array('id' => $entry['Id']);
                 $this->ranking_data_model->update_('ranking_data_jobs', $update_data, $where_data);
                 // create ranking_data_job_logs
                 $ranking_data_job_logs_data = array_filter(array('ranking_data_job_id' => $entry['Id'], 'phase' => Ranking_data_model::STATUS_QUEUED, 'success' => 'false', 'aws_log_id' => $aws_log_id));
                 $this->ranking_data_model->create_('ranking_data_job_logs', $ranking_data_job_logs_data);
             }
         }
     }

     // update log
     $log_message['time'] = number_format(microtime(true) - $start_time, 2);
     $this->logger->update($log_id, array('message' => json_encode($log_message)));
     echo 'ok';
 }