Пример #1
0
 /**
  * Queue a URL for crawling unless a row with the same URL hash already exists.
  *
  * Rows are looked up by `hash_url` (the MD5 of the URL). When a match is found
  * it is returned in `data` and nothing is written. Otherwise a new row is
  * inserted with status -2 and identical created/updated timestamps.
  *
  * @param string $url URL to enqueue.
  * @return array Result of the form ['code' => int, 'data' => mixed]:
  *               - existing row: code 200, data = the SpiderQueue record;
  *               - newly inserted: code 200, data = [] (unchanged legacy shape);
  *               - insert failed: code 500, data = [], plus a 'msg' entry.
  */
 public static function add($url)
 {
     $ret = ['code' => 200, 'data' => []];

     // Hash once; it is used both for the duplicate lookup and for the new row.
     $hash_url = md5($url);

     $has_in = SpiderQueue::findOne(['hash_url' => $hash_url]);
     if ($has_in) {
         $ret['data'] = $has_in;
         return $ret;
     }

     $date_now = date("Y-m-d H:i:s");
     $model_queue = new SpiderQueue();
     $model_queue->url = $url;
     $model_queue->hash_url = $hash_url;
     $model_queue->status = -2;
     $model_queue->updated_time = $date_now;
     $model_queue->created_time = $date_now;

     // Validation is skipped (as before), but the outcome of the write is no
     // longer ignored: a failed insert must not be reported as success.
     if (!$model_queue->save(false)) {
         $ret['code'] = 500;
         $ret['msg'] = 'failed to save queue item';
     }

     return $ret;
 }
Пример #2
0
 /**
  * Process the oldest queue item whose status is -2.
  *
  * Flow for the selected item:
  *  1. status is set to -1 before any work starts;
  *  2. the URL's host is looked up in SpiderService::$allow_hosts, whose value
  *     names the `crawl_<action>` method on this object to call;
  *  3. the crawl result's 'content' and 'title' are passed to save2blog();
  *  4. on success the returned post id is stored and status becomes 1;
  *     on any failure (unparsable/disallowed host, missing crawler method,
  *     empty crawl result, empty post id) status becomes 0.
  *
  * NOTE(review): an exception thrown by a crawler or by save2blog() still
  * propagates and leaves the item at status -1 — confirm whether that is intended.
  *
  * @return void
  */
 public function actionRobot()
 {
     $date_now = date("Y-m-d H:i:s");
     $queue_list = SpiderQueue::find()->where(['status' => -2])->orderBy("id asc")->limit(1)->all();
     if (!$queue_list) {
         $this->echoLog("{$date_now} -- no data");
         return;
     }

     $route_mapping = SpiderService::$allow_hosts;
     foreach ($queue_list as $_info) {
         // Flag the item before doing any work.
         $_info->status = -1;
         $_info->update(false);

         $url = $_info['url'];

         // parse_url() yields false/null for malformed URLs or URLs without a
         // host; ask for the host component directly and treat anything that
         // is not a string as "not allowed" instead of indexing into a
         // non-array / missing key.
         $host = parse_url((string)$url, PHP_URL_HOST);
         if (!is_string($host) || !isset($route_mapping[$host])) {
             $this->echoLog("-------queue_id:{$_info['id']},date:{$date_now},not allow host url:{$_info['url']}----------");
             $_info->status = 0;
             $_info->update(false);
             continue;
         }

         $this->echoLog("-------queue_id:{$_info['id']},date:{$date_now},url:{$_info['url']}----------");

         // A mapping entry without a matching crawl_* method would otherwise
         // raise an error and leave the item at status -1.
         $handler = [$this, "crawl_{$route_mapping[$host]}"];
         if (!is_callable($handler)) {
             $this->echoLog("-------queue_id:{$_info['id']},date:{$date_now},no crawler for host:{$host}----------");
             $_info->status = 0;
             $_info->update(false);
             continue;
         }

         $ret = call_user_func($handler, $url);

         // Only attempt to save when the crawler returned something; an empty
         // crawl result and an empty post id are both failures.
         $post_id = $ret ? $this->save2blog($ret['content'], $ret['title'], $url) : null;
         if (!$post_id) {
             $_info->status = 0;
             $_info->update(false);
             continue;
         }

         $_info->post_id = $post_id;
         $_info->status = 1;
         $_info->update(false);
     }
 }