/**
 * Process waiting child keyword cron jobs and import their results.
 *
 * Flow, as implemented below:
 *  - The parent job id is read from the POST field "job".
 *  - If the number of in-process URL jobs does not exceed
 *    self::CRON_JOBS_TO_RUN, a batch of waiting child jobs is fetched and
 *    marked IN PROCESS.
 *  - Each job's `url` is requested with cURL. On HTTP 302 the redirect target
 *    is requested separately and, when it answers 200 with a non-empty body,
 *    the body is passed to import_ranking_data() and the job (and its parent,
 *    if any) is marked READY. An empty body yields WARNING, any other code on
 *    the redirect target yields ERROR.
 *  - A 404, a 5xx or an empty body on the first request yields ERROR; any
 *    other answer puts the job back to WAITING.
 *  - Every request is stored in `curl_logs`, every phase in
 *    `keyword_cron_jobs_log`, and the batch timing is stored through
 *    ranking_timing_model::add_request_timing().
 *  - Regardless of the branch taken, jobs reported by
 *    getStuckChildCronJobs() are reset to WAITING.
 *
 * The response is a JSON object with a single "status" key, written through
 * $this->output.
 *
 * @return void
 * @author Ruslan Ushakov
 */
function keyword_url_cron_job() {
    $this->load->model('sites_model');
    $this->load->model('settings_model');
    $this->load->model('ranking_model');

    // Local alias so the helper closure below can use the model without
    // relying on $this being bound inside closures.
    $ranking_model = $this->ranking_model;

    // Re-key a list of row objects by their `id` property.
    $index_by_id = function ($rows) {
        $indexed = array();
        foreach ($rows as $row) {
            $indexed[$row->id] = $row;
        }
        return $indexed;
    };

    // Render a whole number of seconds as "<m> min <s> sec".
    $format_duration = function ($seconds) {
        return floor($seconds / 60) . " min " . ($seconds % 60) . " sec";
    };

    // Set $status on the child job and, when a parent object is given and it
    // is not already in that status, on the parent as well. The in-memory
    // parent is only updated when the model reports success, so later jobs
    // sharing the same parent do not repeat the write.
    $set_status = function ($cron_job, $cron_job_parent, $status) use ($ranking_model) {
        $ranking_model->changeCronJobStatus($cron_job->id, $status);
        if (!empty($cron_job_parent) && $cron_job_parent->status != $status) {
            if ($ranking_model->changeCronJobStatus($cron_job_parent->id, $status)) {
                $cron_job_parent->status = $status;
            }
        }
    };

    $parent_id = (int) $this->input->post('job');
    $jobs_in_process = $this->ranking_model->getInProcessUrlCronJobsCount();

    if ($jobs_in_process <= self::CRON_JOBS_TO_RUN) {
        $cron_jobs = $this->ranking_model->getWaitingChildCronJobs($parent_id, self::CRON_JOBS_TO_RUN);

        // change status of selected jobs to IN PROCESS
        $cron_jobs_ids = array_map(function ($cron_job) {
            return $cron_job->id;
        }, $cron_jobs);
        if ($cron_jobs_ids && $this->ranking_model->changeCronJobStatus($cron_jobs_ids, $this->ranking_model->CRON_STATUS_INPROCESS)) {
            foreach ($cron_jobs as $cron_job) {
                $cron_job->status = $this->ranking_model->CRON_STATUS_INPROCESS;
            }
        }

        // get parent jobs
        $cron_jobs_parents_ids = array_values(array_filter(array_unique(array_map(function ($cron_job) {
            return $cron_job->parent_id;
        }, $cron_jobs))));
        $cron_jobs_parents = array();
        if (!empty($cron_jobs_parents_ids)) {
            $cron_jobs_parents = $index_by_id(
                $this->ranking_model->get_('keyword_cron_jobs', array('id' => $cron_jobs_parents_ids))
            );
        }

        // get corresponding sites
        $sites_ids = array_values(array_unique(array_filter(array_map(function ($cron_job) {
            return $cron_job->site_id;
        }, $cron_jobs_parents)), SORT_NUMERIC));
        $sites = array();
        if (!empty($sites_ids)) {
            $sites = $index_by_id($this->ranking_model->get_('sites', array('id' => $sites_ids)));
        }

        // get search_terms_groups
        $cron_jobs_group_ids = array_values(array_unique(array_filter(array_map(function ($cron_job) {
            return $cron_job->group_id;
        }, $cron_jobs_parents)), SORT_NUMERIC));
        $search_terms_groups = array();
        if (!empty($cron_jobs_group_ids)) {
            $search_terms_groups = $index_by_id(
                $this->ranking_model->get_('search_terms_groups', array('id' => $cron_jobs_group_ids))
            );
        }

        // get product list items (de-duplicated and re-indexed like the
        // other id lists above)
        $product_list_item_ids = array_values(array_unique(array_filter(array_map(function ($cron_job) {
            return $cron_job->product_list_item_id;
        }, $cron_jobs)), SORT_NUMERIC));
        $product_list_items = array();
        if (!empty($product_list_item_ids)) {
            $product_list_items = $index_by_id(
                $this->ranking_model->get_('product_list_items', array('id' => $product_list_item_ids))
            );
        }

        // The email library is not used directly in this method.
        // NOTE(review): presumably import_ranking_data() relies on it being
        // initialised here - confirm before removing.
        $this->load->library('email');
        $config = array(
            'protocol' => 'sendmail',
            'mailpath' => '/usr/sbin/sendmail',
            'charset' => 'UTF-8',
            'wordwrap' => TRUE,
            'mailtype' => 'html',
        );
        $this->email->initialize($config);

        $begin_requests_time = gmdate('Y-m-d H:i:s');
        $imported_cron_jobs = 0;

        if (!empty($cron_jobs)) {
            foreach ($cron_jobs as $cron_job) {
                // change status of current job to IN PROCESS (repeats the bulk
                // update above per job, so a failed bulk update is retried)
                if (!empty($cron_job->id) && $this->ranking_model->changeCronJobStatus($cron_job->id, $this->ranking_model->CRON_STATUS_INPROCESS)) {
                    $cron_job->status = $this->ranking_model->CRON_STATUS_INPROCESS;
                }

                // NOTE(review): jobs skipped by this and the following
                // `continue` statements keep the IN PROCESS status set above;
                // presumably the stuck-job reset at the end of this method is
                // what eventually releases them - confirm.
                if (!$cron_job->url) {
                    continue;
                }

                $keyword_cron_jobs_log_data = array(
                    'job_id' => null,
                    'message' => null,
                    'child_job_id' => $cron_job->request_id,
                    'job_phase' => $this->ranking_model->JOB_PHASE_RESPONSE,
                    'job_phase_status' => 'true',
                );

                if (empty($cron_job->product_list_item_id)) {
                    // cron job per search term
                    $cron_job_parent = isset($cron_jobs_parents[$cron_job->parent_id])
                        ? $cron_jobs_parents[$cron_job->parent_id]
                        : null;
                    if (!$cron_job_parent) {
                        continue;
                    }

                    $site = isset($sites[$cron_job_parent->site_id])
                        ? $sites[$cron_job_parent->site_id]
                        : null;
                    $search_terms_group = isset($search_terms_groups[$cron_job_parent->group_id])
                        ? $search_terms_groups[$cron_job_parent->group_id]
                        : null;
                    if (!($site && $search_terms_group)) {
                        continue;
                    }

                    $site_name = strtolower($site->name);
                    $location = '';

                    // get crawler name
                    $crawler_name = Ranking_model::getCrawlerName($site_name, $location);
                    $group_name = $search_terms_group->name;

                    $keyword_cron_jobs_log_message = array(
                        'id' => $cron_job->request_id,
                        'site' => $crawler_name,
                        'keyword' => $cron_job->keyword,
                        'group' => $group_name,
                        'status' => 'RESPONSE GET',
                    );
                    $keyword_cron_jobs_log_data['job_id'] = $cron_job_parent->id;
                    $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                } else {
                    // cron job per product: there is no parent job to update,
                    // the log entry is attached to the job itself
                    $cron_job_parent = null;
                    $product_list_item = isset($product_list_items[$cron_job->product_list_item_id])
                        ? $product_list_items[$cron_job->product_list_item_id]
                        : null;

                    $keyword_cron_jobs_log_message = array(
                        'id' => $cron_job->request_id,
                        'product_url' => empty($product_list_item->url) ? null : $product_list_item->url,
                        'status' => 'RESPONSE GET',
                    );
                    $keyword_cron_jobs_log_data['job_id'] = $cron_job->id;
                    $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                }

                // First request: ask the job URL whether the result is ready.
                $ch = curl_init($cron_job->url);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
                $ch_result = curl_exec($ch);

                // Read everything needed from the handle up front so it can
                // be released immediately.
                $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
                $curl_log_data = array(
                    'type' => 'keyword_cron_job',
                    'url' => curl_getinfo($ch, CURLINFO_EFFECTIVE_URL),
                    'http_code' => $http_code,
                    'curl_info' => json_encode(curl_getinfo($ch)),
                    'result' => $ch_result,
                );
                $request_time_seconds = round(curl_getinfo($ch, CURLINFO_TOTAL_TIME));
                $redirect_url = curl_getinfo($ch, CURLINFO_REDIRECT_URL);
                curl_close($ch);

                $curl_log_id = $this->ranking_model->create_($this->ranking_model->tables['curl_logs'], $curl_log_data);

                $keyword_cron_jobs_log_message['request time'] = $format_duration($request_time_seconds);
                $keyword_cron_jobs_log_data['curl_log_id'] = $curl_log_id;

                if ($http_code === 302) {
                    // Log the RESPONSE phase before following the redirect.
                    $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                    $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);

                    // Second request: fetch the redirect target.
                    $ch2 = curl_init($redirect_url);
                    curl_setopt($ch2, CURLOPT_RETURNTRANSFER, true);
                    $results = curl_exec($ch2);

                    $results_code = curl_getinfo($ch2, CURLINFO_HTTP_CODE);
                    $curl_log_data = array(
                        'type' => 'keyword_cron_job',
                        'url' => curl_getinfo($ch2, CURLINFO_EFFECTIVE_URL),
                        'http_code' => $results_code,
                        'curl_info' => json_encode(curl_getinfo($ch2)),
                        'result' => $results,
                    );
                    $request_time_seconds = round(curl_getinfo($ch2, CURLINFO_TOTAL_TIME));
                    curl_close($ch2);

                    $curl2_log_id = $this->ranking_model->create_('curl_logs', $curl_log_data);

                    $keyword_cron_jobs_log_message['request time'] = $format_duration($request_time_seconds);
                    $keyword_cron_jobs_log_data['job_phase'] = $this->ranking_model->JOB_PHASE_IMPORT_BEGIN;
                    $keyword_cron_jobs_log_data['curl_log_id'] = $curl2_log_id;

                    if ($results_code === 200) {
                        if ($results) {
                            // got results, set CRON_STATUS_READY, make import
                            $keyword_cron_jobs_log_message['status'] = 'IMPORT BEGIN';
                            $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                            $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);

                            $import_time_begin = microtime(true);
                            if (empty($cron_job->product_list_item_id)) {
                                // import data per search term
                                $imported_count = $this->import_ranking_data($results, $cron_job, $search_terms_group->id, $site->id);
                            } else {
                                // import data per product
                                $imported_count = $this->import_ranking_data($results, $cron_job, null, null, $product_list_item);
                            }
                            $import_time = number_format(microtime(true) - $import_time_begin, 2);
                            $imported_cron_jobs++;

                            $set_status($cron_job, $cron_job_parent, $this->ranking_model->CRON_STATUS_READY);

                            $keyword_cron_jobs_log_message['status'] = 'STATUS READY';
                            // The final entry reports import time instead of
                            // request time.
                            unset($keyword_cron_jobs_log_message['request time']);
                            $keyword_cron_jobs_log_message['import time'] = "{$import_time} sec";
                            $keyword_cron_jobs_log_message['imported products'] = $imported_count;
                            $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                            $keyword_cron_jobs_log_data['job_phase'] = $this->ranking_model->JOB_PHASE_IMPORT_END;
                            $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);
                        } else {
                            // result is empty, set CRON_STATUS_WARNING
                            $set_status($cron_job, $cron_job_parent, $this->ranking_model->CRON_STATUS_WARNING);

                            $keyword_cron_jobs_log_message['status'] = 'STATUS WARNING - Response is empty';
                            $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                            $keyword_cron_jobs_log_data['job_phase_status'] = 'false';
                            $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);
                        }
                    } else {
                        // result wasn't received, set CRON_STATUS_ERROR
                        $set_status($cron_job, $cron_job_parent, $this->ranking_model->CRON_STATUS_ERROR);

                        $keyword_cron_jobs_log_message['status'] = "STATUS ERROR - Result wasn't received ({$results_code})";
                        $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                        $keyword_cron_jobs_log_data['job_phase_status'] = 'false';
                        $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);
                    }
                } elseif ($http_code === 404 || ($http_code >= 500 && $http_code < 600) || empty($ch_result)) {
                    // not found / server error / empty body, set CRON_STATUS_ERROR
                    $set_status($cron_job, $cron_job_parent, $this->ranking_model->CRON_STATUS_ERROR);

                    if ($http_code === 404) {
                        $keyword_cron_jobs_log_message['status'] = 'STATUS ERROR - The resource could not be found.';
                    } elseif (!empty($ch_result)) {
                        // Non-empty body here means the branch was entered
                        // because of a 5xx code.
                        $keyword_cron_jobs_log_message['status'] = "STATUS ERROR - REST API Server Error ({$http_code})";
                    } else {
                        $keyword_cron_jobs_log_message['status'] = "STATUS ERROR - Empty response from keyword server";
                    }
                    $keyword_cron_jobs_log_data['message'] = json_encode($keyword_cron_jobs_log_message);
                    $keyword_cron_jobs_log_data['job_phase_status'] = 'false';
                    $this->ranking_model->create_('keyword_cron_jobs_log', $keyword_cron_jobs_log_data);
                } else {
                    // Any other answer: put the job back in the queue.
                    $this->ranking_model->changeCronJobStatus($cron_job->id, $this->ranking_model->CRON_STATUS_WAITING);
                }
            }
        }

        $end_requests_time = gmdate('Y-m-d H:i:s');
        $requests_count = empty($cron_jobs) ? 0 : count($cron_jobs);

        $this->load->model('ranking_timing_model');
        $this->ranking_timing_model->add_request_timing($begin_requests_time, $end_requests_time, $requests_count, $imported_cron_jobs);

        $result = array('status' => 'ok');
    } else {
        $result = array('status' => "too many jobs are in process ({$jobs_in_process})");
    }

    // fix 'stuck' jobs, reset their status to CRON_STATUS_WAITING
    $stuck_jobs = $this->ranking_model->getStuckChildCronJobs();
    if ($stuck_jobs) {
        $stuck_jobs_ids = array_map(function ($stuck_job) {
            return $stuck_job->id;
        }, $stuck_jobs);
        $this->ranking_model->changeCronJobStatus($stuck_jobs_ids, $this->ranking_model->CRON_STATUS_WAITING);
    }

    $this->output->set_content_type('application/json')->set_output(json_encode($result));
}