/**
 * Ask a keyword server to scrape a single product URL and record the outcome.
 *
 * The request is POSTed to http://{$server}:6543/ranking_data/ (following at most
 * one redirect). Every attempt is written to `curl_logs`. When the final HTTP
 * status matches 2xx/3xx, a `keyword_cron_jobs` row and a matching
 * `keyword_cron_jobs_log` row are created as well.
 *
 * @param object $product product; its `id` and `url` properties are read
 * @param string $server  keyword server host
 * @return bool true when a cron job was created, false otherwise
 * @author Stom
 */
protected function create_product_crawl_request($product, $server)
{
    // Host of the product URL, lower-cased and without a leading "www.".
    $host = preg_replace("/^www\\./", '', parse_url(strtolower($product->url), PHP_URL_HOST));
    if (empty($product->id) || empty($host)) {
        return false;
    }

    $crawler_name = Ranking_model::getCrawlerName($host);
    $endpoint = "http://{$server}:6543/ranking_data/";
    $payload = array('site' => $crawler_name, 'product_url' => $product->url);

    $curl = curl_init();
    curl_setopt_array($curl, array(
        CURLOPT_URL => $endpoint,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 1,
    ));

    $response = curl_exec($curl);
    $http_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);

    // The attempt is logged regardless of its outcome.
    $curl_log_id = $this->ranking_model->create_('curl_logs', array(
        'type' => 'keyword_cron_job',
        'method' => 'POST',
        'url' => $endpoint,
        'params' => json_encode($payload),
        'http_code' => $http_code,
        'curl_info' => json_encode(curl_getinfo($curl)),
        'result' => $response,
    ));

    // Anything other than a 2xx / 3xx status (usually 202 or 302) means no job.
    if (!preg_match('/[23]\\d\\d/', $http_code)) {
        curl_close($curl);
        return false;
    }

    // The request id is taken from the second-to-last "/"-separated segment
    // of the effective URL.
    $effective_url = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
    $segments = explode('/', $effective_url);
    $request_id = $segments[count($segments) - 2];

    // Prefer the redirect URL reported by cURL; fall back to the effective URL.
    $redirect_url = curl_getinfo($curl, CURLINFO_REDIRECT_URL);
    $job_url = $redirect_url ? $redirect_url : $effective_url;

    // Create the cron job.
    $keyword_cron_job_id = $this->ranking_model->create_('keyword_cron_jobs', array(
        'url' => $job_url,
        'request_id' => $request_id,
        'status' => $this->ranking_model->CRON_STATUS_WAITING,
        'product_list_item_id' => $product->id,
    ));

    // Create the log entry for the job's request phase, linked to the cURL log row.
    $log_message = json_encode(array(
        'id' => $request_id,
        'crawler_name' => $crawler_name,
        'product_url' => $product->url,
        'status' => "STATUS REQUESTED ({$http_code})",
    ));
    $this->ranking_model->create_('keyword_cron_jobs_log', array(
        'job_id' => $keyword_cron_job_id,
        'message' => $log_message,
        'child_job_id' => $request_id,
        'job_phase' => $this->ranking_model->JOB_PHASE_REQUEST,
        'job_phase_status' => 'true',
        'curl_log_id' => $curl_log_id,
    ));

    curl_close($curl);

    return true;
}
/**
 * Create ranking-data jobs and send a batch of them to the SQS queue.
 *
 * Steps, in order:
 *  1. insert one job per search term returned by getSearchTermsToBeCrawled();
 *  2. insert one job per product returned by getProductsToBeCrawled();
 *  3. re-create failed jobs that still qualify for another attempt;
 *  4. fetch jobs via getJobsToQueue(self::JOBS_TO_QUEUE) and send them to SQS in
 *     chunks of self::SQS_MAX_MESSAGES; jobs SQS reports as successful are set to
 *     STATUS_QUEUED, and every other job of the same chunk gets a failure log row.
 *
 * Echoes 'ranking data jobs disabled' and returns early when _check_setting()
 * is falsy; otherwise echoes 'ok' once finished.
 *
 * @return void
 */
public function create_jobs()
{
    if (!$this->_check_setting()) {
        echo 'ranking data jobs disabled';
        return;
    }

    // NOTE(review): ranking_data_model is used below but is not loaded here;
    // presumably it is loaded by the constructor or autoload - confirm.
    $this->load->model('ranking_model');
    $this->load->library('logger');

    $start_time = microtime(true);
    $log_message = array('number_of_messages_queued' => 0, 'time' => 0);
    $log_id = $this->logger->info('ranking_data_create_jobs', json_encode($log_message));

    // create 'search term' jobs (rows missing any of the three fields are skipped)
    $search_terms_to_crawl = $this->ranking_data_model->getSearchTermsToBeCrawled();
    $ranking_data_jobs_batch = array();
    foreach ($search_terms_to_crawl as $search_term_to_crawl) {
        if (!empty($search_term_to_crawl->search_terms_group_id)
            && !empty($search_term_to_crawl->search_term)
            && !empty($search_term_to_crawl->site_id)
        ) {
            $ranking_data_jobs_batch[] = array(
                'search_term_group_id' => $search_term_to_crawl->search_terms_group_id,
                'search_term' => $search_term_to_crawl->search_term,
                'site_id' => $search_term_to_crawl->site_id,
            );
        }
    }
    if (!empty($ranking_data_jobs_batch)) {
        $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
    }

    // create 'product' jobs
    $products_to_crawl = $this->ranking_data_model->getProductsToBeCrawled();
    $ranking_data_jobs_batch = array();
    foreach ($products_to_crawl as $product) {
        $ranking_data_jobs_batch[] = array('product_list_item_id' => $product->id);
    }
    if (!empty($ranking_data_jobs_batch)) {
        $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
    }

    // re-create failed jobs
    // Failed jobs are de-duplicated by
    // "search_term_group_id|search_term|site_id|product_list_item_id".
    $failed_job_fields = array_flip(array('search_term_group_id', 'search_term', 'site_id', 'product_list_item_id'));
    $failed_jobs = array();
    foreach ($this->ranking_data_model->getFailedJobs() as $failed_job_res) {
        $failed_job_key = implode("|", array_intersect_key((array) $failed_job_res, $failed_job_fields));
        $failed_jobs[$failed_job_key] = $failed_job_res;
    }

    if (!empty($failed_jobs)) {
        $ranking_data_jobs_batch = array();
        foreach ($failed_jobs as $failed_job) {
            $failed_jobs_same_params = array(
                'search_term_group_id' => $failed_job->search_term_group_id,
                'search_term' => $failed_job->search_term,
                'site_id' => $failed_job->site_id,
                'product_list_item_id' => $failed_job->product_list_item_id,
            );
            $failed_jobs_same = $this->ranking_data_model->getFailedJobSame($failed_jobs_same_params);

            // check same jobs count and status
            $max_status = Ranking_data_model::STATUS_CREATED;
            $min_status = Ranking_data_model::STATUS_RESULTS_IMPORTED;
            // Stays null (falsy) when there are no matching jobs; otherwise true only
            // if every matching job has age_1_hour === 't'.
            // NOTE(review): 't' looks like a database boolean meaning "older than
            // one hour" - confirm against the model query.
            $age_1_hour = null;
            foreach ($failed_jobs_same as $failed_job_same) {
                $max_status = max($max_status, $failed_job_same->status);
                $min_status = min($min_status, $failed_job_same->status);
                $age_1_hour = ($age_1_hour === null || $age_1_hour) && $failed_job_same->age_1_hour === 't';
            }

            if ($min_status > Ranking_data_model::STATUS_CREATED
                && $max_status < Ranking_data_model::STATUS_DONE
                && count($failed_jobs_same) < self::FAILED_JOB_ATTEMPTS
                && $age_1_hour
            ) {
                $ranking_data_jobs_batch[] = $failed_jobs_same_params;
            }
        }
        if (!empty($ranking_data_jobs_batch)) {
            $this->ranking_data_model->create_batch_('ranking_data_jobs', $ranking_data_jobs_batch);
        }
    }

    // queue jobs
    $jobs_to_queue = $this->ranking_data_model->getJobsToQueue(self::JOBS_TO_QUEUE);

    // get corresponding sites, indexed by id
    $sites_ids = array_values(array_unique(array_filter(array_map(function ($job) {
        return $job->site_id;
    }, $jobs_to_queue)), SORT_NUMERIC));
    $sites = array();
    if (!empty($sites_ids)) {
        foreach ($this->ranking_model->get_('sites', array('id' => $sites_ids)) as $site) {
            $sites[$site->id] = $site;
        }
    }

    // get corresponding product list items, indexed by id
    $product_list_item_ids = array_values(array_unique(array_filter(array_map(function ($job) {
        return $job->product_list_item_id;
    }, $jobs_to_queue)), SORT_NUMERIC));
    $product_list_items = array();
    if (!empty($product_list_item_ids)) {
        foreach ($this->ranking_data_model->get_('product_list_items', array('id' => $product_list_item_ids)) as $product_list_item) {
            $product_list_items[$product_list_item->id] = $product_list_item;
        }
    }

    // Build one SQS entry per job: a search-term message when the job has a search
    // term and a known site, otherwise a product-URL message when it has a known
    // product list item. Jobs that yield neither are not sent.
    $entries = array();
    foreach ($jobs_to_queue as $job_to_queue) {
        $message = array('server_name' => $this->_get_server_name(), 'task_id' => $job_to_queue->id);

        if (!empty($job_to_queue->search_term)
            && !empty($job_to_queue->site_id)
            && !empty($sites[$job_to_queue->site_id])
        ) {
            $job_site = $sites[$job_to_queue->site_id];
            $message['searchterms_str'] = $job_to_queue->search_term;
            $message['site'] = Ranking_model::getCrawlerName($job_site->name);
            $message['cmd_args'] = array('quantity' => 1000);
        } elseif (!empty($job_to_queue->product_list_item_id)
            && !empty($product_list_items[$job_to_queue->product_list_item_id])
        ) {
            $job_product = $product_list_items[$job_to_queue->product_list_item_id];
            $product_url_host = preg_replace("/^www\\./", '', parse_url(strtolower($job_product->url), PHP_URL_HOST));
            $message['site'] = Ranking_model::getCrawlerName($product_url_host);
            $message['url'] = $job_product->url;
        }

        if (!empty($message['searchterms_str']) || !empty($message['url'])) {
            $entries[] = array('Id' => $job_to_queue->id, 'MessageBody' => json_encode($message));
        }
    }

    foreach (array_chunk($entries, self::SQS_MAX_MESSAGES) as $entries_sub) {
        $sqs_call_args = array('QueueUrl' => $this->_get_queue_url(), 'Entries' => $entries_sub);
        $aws_log_id = null;
        $sqs_result = $this->aws->sqs->call('sendMessageBatch', $sqs_call_args, $this->_get_log_callback(), $aws_log_id);

        // Ids sent in THIS chunk that SQS has not yet confirmed. Tracking per chunk
        // keeps a failed chunk from logging failures for jobs that belong to other
        // chunks or were never sent at all.
        $unconfirmed_ids = array();
        foreach ($entries_sub as $entry) {
            if (!empty($entry['Id'])) {
                $unconfirmed_ids[$entry['Id']] = $entry['Id'];
            }
        }

        $successful = !empty($sqs_result['Successful']) ? $sqs_result['Successful'] : array();
        foreach ($successful as $job_queue_result) {
            if (empty($job_queue_result['Id']) || !isset($unconfirmed_ids[$job_queue_result['Id']])) {
                continue;
            }
            $queued_job_id = $job_queue_result['Id'];
            unset($unconfirmed_ids[$queued_job_id]);

            // change ranking_data_jobs status
            $update_data = array('status' => Ranking_data_model::STATUS_QUEUED, 'updated_at' => 'now()');
            $where_data = array('id' => $queued_job_id);
            $this->ranking_data_model->update_('ranking_data_jobs', $update_data, $where_data);

            // create ranking_data_job_logs (array_filter drops a null aws_log_id)
            $ranking_data_job_logs_data = array_filter(array(
                'ranking_data_job_id' => $queued_job_id,
                'phase' => Ranking_data_model::STATUS_QUEUED,
                'aws_log_id' => $aws_log_id,
            ));
            $this->ranking_data_model->create_('ranking_data_job_logs', $ranking_data_job_logs_data);

            $log_message['number_of_messages_queued']++;
        }

        // Whatever is left in this chunk was not accepted (whole-batch failure or
        // individually rejected entries): touch updated_at and log the failure.
        foreach ($unconfirmed_ids as $failed_job_id) {
            $update_data = array('updated_at' => 'now()');
            $where_data = array('id' => $failed_job_id);
            $this->ranking_data_model->update_('ranking_data_jobs', $update_data, $where_data);

            $ranking_data_job_logs_data = array_filter(array(
                'ranking_data_job_id' => $failed_job_id,
                'phase' => Ranking_data_model::STATUS_QUEUED,
                'success' => 'false',
                'aws_log_id' => $aws_log_id,
            ));
            $this->ranking_data_model->create_('ranking_data_job_logs', $ranking_data_job_logs_data);
        }
    }

    // update log
    $log_message['time'] = number_format(microtime(true) - $start_time, 2);
    $this->logger->update($log_id, array('message' => json_encode($log_message)));

    echo 'ok';
}