コード例 #1
0
ファイル: crawl.php プロジェクト: janladaking/CodeIgniter
 /**
  * Cron entry point that schedules and dispatches keyword (search term) ranking jobs.
  *
  * Steps, in order:
  *  1. Registers a cron job (CRON_PERIOD_DAY) for every keyword group returned by
  *     getKeywordGroupsUnscheduled().
  *  2. For the jobs returned by getKeywordGroupCronJobsActual(): updates their
  *     'last_update' and creates one 'parent' keyword_cron_jobs row (status
  *     CRON_STATUS_WAITING) per group/site relation whose group has a non-zero
  *     search_terms_count.
  *  3. Selects the jobs to process: the ones named in POST 'job_ids' (manual request),
  *     or otherwise sends product crawl requests first and fetches waiting parent jobs
  *     only when fewer than self::PRODUCTS_TO_CRAWL products were returned.
  *  4. For each selected job, calls get_new_keywords() once per search term and marks the
  *     job CRON_STATUS_REQUESTED when every request succeeded, CRON_STATUS_ERROR otherwise.
  *
  * Nothing is returned; the response body is set to the JSON document {"status":"ok"}.
  *
  * @return void
  * @author Ruslan Ushakov
  */
 function keyword_cron_job()
 {
     $this->load->model('sites_model');
     $this->load->model('ranking_model');
     $this->load->model('product_model');
     $this->load->model('settings_model');
     // schedule unscheduled keyword groups
     // (the result is used as the condition, so a falsy result simply skips this step)
     if ($unscheduled_jobs = $this->ranking_model->getKeywordGroupsUnscheduled()) {
         foreach ($unscheduled_jobs as $unscheduled_job) {
             $this->ranking_model->addKeywordGroupCronJob($unscheduled_job->id, $this->ranking_model->CRON_PERIOD_DAY);
         }
     }
     // get scheduled (per search term groups) tasks, indexed by job id
     $scheduled_jobs = array();
     foreach ($this->ranking_model->getKeywordGroupCronJobsActual() as $scheduled_job) {
         $scheduled_jobs[$scheduled_job->id] = $scheduled_job;
     }
     if (!empty($scheduled_jobs)) {
         // stamp all picked-up jobs in one update
         // NOTE(review): 'now()' is passed as a plain string; confirm update_() sends it
         // to the database as an SQL expression rather than an escaped literal.
         $update_data = array('last_update' => 'now()');
         $where_data = array('id' => array_keys($scheduled_jobs));
         $this->ranking_model->update_('keyword_group_cron_jobs', $update_data, $where_data);
         $keyword_group_ids = array();
         foreach ($scheduled_jobs as $scheduled_job) {
             // a falsy keyword_group_id is replaced by -1
             // (presumably an id that matches no group — TODO confirm)
             $keyword_group_ids[] = $scheduled_job->keyword_group_id ?: -1;
         }
         // get groups_sites relations by group ids to have associated sites
         $groups_sites = $this->ranking_model->get_('groups_sites', array('group_id' => $keyword_group_ids));
         // get count of search_terms for every search_terms_groups record to avoid empty search_terms_groups
         $search_terms_groups = array();
         foreach ($this->ranking_model->getSearchTermsGroupsWithCount(array('id' => $keyword_group_ids)) as $search_terms_group) {
             $search_terms_groups[$search_terms_group->id] = $search_terms_group;
         }
         // create 'parent' cron jobs: one per group/site pair, skipping groups that are
         // unknown or have a zero search_terms_count
         $keyword_cron_jobs_batch = array();
         foreach ($groups_sites as $groups_sites_rel) {
             if (isset($search_terms_groups[$groups_sites_rel->group_id]) && $search_terms_groups[$groups_sites_rel->group_id]->search_terms_count) {
                 $keyword_cron_jobs_batch[] = array('group_id' => $groups_sites_rel->group_id, 'site_id' => $groups_sites_rel->site_id, 'status' => $this->ranking_model->CRON_STATUS_WAITING);
             }
         }
         if (!empty($keyword_cron_jobs_batch)) {
             // NOTE(review): $create_batch_result is assigned but never read in this method.
             $create_batch_result = $this->ranking_model->create_batch_('keyword_cron_jobs', $keyword_cron_jobs_batch);
         }
     }
     //Issue #2956
     // explicit job ids in the POST body turn this run into a manual request for those jobs
     $job_ids = $this->input->post('job_ids');
     if (!empty($job_ids)) {
         $cron_jobs = $this->ranking_model->getCronJobsById($job_ids);
         $manual_request = true;
     } else {
         // regular run: resolve the list of keyword servers from settings, falling back to
         // the single server named by the 'keyword_rest_api' config item
         $keyword_servers = $this->settings_model->get_value($this->settings_model->system_user, 'restapi_servers');
         if (empty($keyword_servers) || !is_array($keyword_servers)) {
             $keyword_servers = array(array('name' => $this->config->item('keyword_rest_api')));
         }
         // presumably per-server counters of pending requests, indexed like
         // $keyword_servers — TODO confirm against get_keyword_servers_pending_status()
         $servers_pending_statuses = $this->get_keyword_servers_pending_status($keyword_servers);
         // get products that need to be crawled
         $products_to_crawl = $this->product_model->getProductsToBeCrawled(self::PRODUCTS_TO_CRAWL);
         foreach ($products_to_crawl as $product_to_crawl) {
             // check_keyword_server() yields the index of the server to use
             // NOTE(review): the index is used without validation here, unlike the
             // keyword loop below which checks $server before sending.
             $server_num = $this->check_keyword_server($servers_pending_statuses);
             $server = $keyword_servers[$server_num]['name'];
             $create_crawl_request_result = $this->create_product_crawl_request($product_to_crawl, $server);
             if ($create_crawl_request_result) {
                 // account for the request just sent so later picks see the updated counter
                 $servers_pending_statuses[$server_num]++;
             }
         }
         // get waiting cron jobs without url (parent)
         // only when the product batch was not full; otherwise no keyword jobs are processed in this run
         $cron_jobs = count($products_to_crawl) < self::PRODUCTS_TO_CRAWL ? $this->ranking_model->getWaitingParentCronJobs(20) : array();
         $manual_request = false;
     }
     if (!empty($cron_jobs)) {
         // the manual-request branch above does not set $keyword_servers or
         // $servers_pending_statuses, so resolve them here (same fallback as above)
         if (empty($keyword_servers)) {
             $keyword_servers = $this->settings_model->get_value($this->settings_model->system_user, 'restapi_servers');
             if (empty($keyword_servers) || !is_array($keyword_servers)) {
                 $keyword_servers = array(array('name' => $this->config->item('keyword_rest_api')));
             }
         }
         if (empty($servers_pending_statuses)) {
             $servers_pending_statuses = $this->get_keyword_servers_pending_status($keyword_servers);
         }
         // get search terms groups ids, to get corresponding search terms (aka keywords)
         // (falsy ids dropped, duplicates removed, list re-indexed from 0)
         $cron_jobs_group_ids = array_values(array_unique(array_filter(array_map(function ($cron_job) {
             return $cron_job->group_id;
         }, $cron_jobs)), SORT_NUMERIC));
         // get search_terms_groups, indexed by group id
         $search_terms_groups = array();
         if (!empty($cron_jobs_group_ids)) {
             foreach ($this->ranking_model->get_('search_terms_groups', array('id' => $cron_jobs_group_ids)) as $search_terms_group) {
                 $search_terms_groups[$search_terms_group->id] = $search_terms_group;
             }
         }
         // get corresponding search terms (aka keywords), group them by group_id
         $search_terms_by_group = array();
         if (!empty($cron_jobs_group_ids)) {
             foreach ($this->ranking_model->get_('search_terms', array('group_id' => $cron_jobs_group_ids)) as $search_term) {
                 $search_terms_by_group[$search_term->group_id][] = $search_term;
             }
         }
         // get child keyword_cron_jobs by parent ids, indexed as [parent_id][keyword]
         $cron_jobs_ids = array_map(function ($cron_job) {
             return $cron_job->id;
         }, $cron_jobs);
         $cron_jobs_childs = array();
         if (!empty($cron_jobs_ids)) {
             foreach ($this->ranking_model->get_('keyword_cron_jobs', array('parent_id' => $cron_jobs_ids)) as $keyword_cron_job) {
                 $cron_jobs_childs[$keyword_cron_job->parent_id][$keyword_cron_job->keyword] = $keyword_cron_job;
             }
         }
         // get corresponding sites, indexed by site id
         $sites_ids = array_values(array_unique(array_filter(array_map(function ($cron_job) {
             return $cron_job->site_id;
         }, $cron_jobs)), SORT_NUMERIC));
         $sites = array();
         if (!empty($sites_ids)) {
             foreach ($this->ranking_model->get_('sites', array('id' => $sites_ids)) as $site) {
                 $sites[$site->id] = $site;
             }
         }
         // mark all jobs as 'in progress'
         // (the in-memory objects are updated only when the status change reports success)
         if ($this->ranking_model->changeCronJobStatus($cron_jobs_ids, $this->ranking_model->CRON_STATUS_INPROCESS)) {
             foreach ($cron_jobs as $key => $cron_job) {
                 $cron_jobs[$key]->status = $this->ranking_model->CRON_STATUS_INPROCESS;
             }
         }
         foreach ($cron_jobs as $cron_job) {
             $cron_job_childs = isset($cron_jobs_childs[$cron_job->id]) ? $cron_jobs_childs[$cron_job->id] : array();
             $cron_job_search_terms = isset($search_terms_by_group[$cron_job->group_id]) ? $search_terms_by_group[$cron_job->group_id] : array();
             if (!empty($manual_request)) {
                 // manual requests re-send every search term of the group
                 $cron_job_search_terms_to_create = $cron_job_search_terms;
             } else {
                 $cron_job_search_terms_to_create = array();
                 foreach ($cron_job_search_terms as $keyword) {
                     // filter keywords, get those which hadn't been processed
                     // (i.e. no child job exists yet for this keyword title)
                     if (!isset($cron_job_childs[$keyword->title])) {
                         $cron_job_search_terms_to_create[] = $keyword;
                     }
                 }
             }
             // number of requests still outstanding; decremented on every successful request
             $cron_jobs_to_create = count($cron_job_search_terms_to_create);
             if (!empty($cron_job_search_terms_to_create)) {
                 if (isset($sites[$cron_job->site_id]) && ($cron_job_site = $sites[$cron_job->site_id])) {
                     $cron_job_site_name = strtolower($cron_job_site->name);
                     // NOTE(review): $location and $user_agent start empty and are forwarded to
                     // get_new_keywords(); possibly filled in by reference by getCrawlerName() — confirm.
                     $location = '';
                     $user_agent = '';
                     $zipCode = !empty($cron_job_site->zip_code) ? $cron_job_site->zip_code : '';
                     // get crawler name
                     $crawler_name = Ranking_model::getCrawlerName($cron_job_site_name, $location, $user_agent);
                     if (!empty($crawler_name)) {
                         foreach ($cron_job_search_terms_to_create as $keyword) {
                             $server_num = $this->check_keyword_server($servers_pending_statuses);
                             $server = $keyword_servers[$server_num]['name'];
                             // keywords with an empty title, or without a resolved server name, are
                             // skipped; they keep the counter above zero, so the job ends up as ERROR
                             if ($keyword->title && $server) {
                                 $group_name = isset($search_terms_groups[$keyword->group_id]) ? $search_terms_groups[$keyword->group_id]->name : null;
                                 // NOTE(review): the meaning of the literal 1000 is defined by get_new_keywords()
                                 $get_new_keywords_result = $this->get_new_keywords($crawler_name, $keyword->title, $cron_job, $location, $server, 1000, $group_name, $zipCode, $user_agent);
                                 if ($get_new_keywords_result) {
                                     $cron_jobs_to_create--;
                                     $servers_pending_statuses[$server_num]++;
                                 }
                             }
                         }
                     }
                 }
             }
             // nothing left outstanding (or nothing to request at all) => REQUESTED;
             // otherwise flag the job as ERROR unless it already carries that status
             if ($cron_jobs_to_create < 1) {
                 $this->ranking_model->changeCronJobStatus($cron_job->id, $this->ranking_model->CRON_STATUS_REQUESTED);
             } elseif ($cron_job->status != $this->ranking_model->CRON_STATUS_ERROR) {
                 if ($this->ranking_model->changeCronJobStatus($cron_job->id, $this->ranking_model->CRON_STATUS_ERROR)) {
                     $cron_job->status = $this->ranking_model->CRON_STATUS_ERROR;
                 }
             }
         }
     }
     // respond with a fixed JSON acknowledgement
     $this->output->set_content_type('application/json')->set_output(json_encode(array('status' => 'ok')));
 }