Beispiel #1
0
 /**
  * Entry point of the spider.
  *
  * Parses the command line, resets the per-run counters, validates the
  * runtime requirements (pcntl / Redis / export settings / scan URLs),
  * removes leftover files from PATH_DATA/status, pushes every configured
  * scan URL onto the queue and then collects pages until the queue is
  * empty. When more than one task is configured, the additional tasks are
  * forked once the queue holds more than twice as many links as tasks.
  *
  * Any failed validation is logged through log::error() and terminates the
  * script with `exit`.
  *
  * @return void
  */
 public function start()
 {
     $this->parse_command();
     // Spider start time
     self::$time_start = time();
     // Current task ID
     self::$taskid = 1;
     // Process ID of the current task (falls back to 1 when posix is unavailable)
     self::$taskpid = function_exists('posix_getpid') ? posix_getpid() : 1;
     // The current task is the master task
     self::$taskmaster = true;
     self::$collect_succ = 0;
     self::$collect_fail = 0;
     //--------------------------------------------------------------------------------
     // Pre-run validation
     //--------------------------------------------------------------------------------
     // Multitasking requires the pcntl extension
     if (self::$tasknum > 1 && !function_exists('pcntl_fork')) {
         log::error("When the task number greater than 1 need pcntl extension");
         exit;
     }
     // Saving the running state requires Redis
     if (self::$save_running_state && !cls_redis::init()) {
         log::error("Save the running state need Redis support,Error: " . cls_redis::$error . "\n\nPlease check the configuration file config/inc_config.php\n");
         exit;
     }
     // Multitasking requires Redis
     if (self::$tasknum > 1 && !cls_redis::init()) {
         log::error("Multitasking need Redis support,Error: " . cls_redis::$error . "\n\nPlease check the configuration file config/inc_config.php\n");
         exit;
     }
     // Validate the export configuration
     $this->export_auth();
     // At least one scan URL is required
     if (empty(self::$configs['scan_urls'])) {
         log::error("No scan url to start\n");
         exit;
     }
     // Invoked at this point so that the callback can add entry pages
     if ($this->on_start) {
         call_user_func($this->on_start, $this);
     }
     // Every configured scan URL must pass is_scan_page()
     foreach (self::$configs['scan_urls'] as $url) {
         if (!$this->is_scan_page($url)) {
             log::error("Domain of scan_urls (\"{$url}\") does not match the domains of the domain name\n");
             exit;
         }
     }
     // The panel cannot be displayed on Windows, so force log output there
     if (util::is_win()) {
         log::$log_show = true;
     } else {
         log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
     }
     if (log::$log_show) {
         log::info("\n[ " . self::$configs['name'] . " Spider ] is started...\n");
         log::warn("Task Number:" . self::$tasknum . "\n");
         log::warn("!Documentation:\nhttps://doc.phpspider.org\n");
     }
     // Remove status files left over from a previous run. The directory may
     // not exist yet; scandir() would then warn and return false, so guard
     // against that instead of iterating over a non-array.
     $status_dir = PATH_DATA . "/status";
     $status_files = is_dir($status_dir) ? scandir($status_dir) : false;
     if ($status_files !== false) {
         foreach ($status_files as $v) {
             if ($v == '.' || $v == '..') {
                 continue;
             }
             // Best-effort cleanup: a failed delete is deliberately ignored
             @unlink($status_dir . "/" . $v);
         }
     }
     //--------------------------------------------------------------------------------
     // Spawn multiple tasks
     //--------------------------------------------------------------------------------
     // Multitasking without a saved running state: clear the data held in Redis
     if (self::$tasknum > 1 && !self::$save_running_state) {
         $this->cache_clear();
     }
     // Seed the queue with the entry pages
     foreach (self::$configs['scan_urls'] as $url) {
         $link = array(
             'url' => $url,
             'url_type' => 'scan_page',
             'method' => 'get',
             'headers' => array(),
             'params' => array(),
             'context_data' => '',
             'proxy' => self::$configs['proxy'],
             'try_num' => 0,
             'max_try' => self::$configs['max_try'],
         );
         $this->queue_lpush($link);
     }
     while ($this->queue_lsize()) {
         // Fetch a page
         $this->collect_page();
         // Multitasking, and the master task has not spawned its subtasks yet
         if (self::$tasknum > 1 && !self::$taskmaster_status) {
             // Once the queue holds more than twice as many pages as there
             // are tasks, spawn the subtasks to collect together
             if ($this->queue_lsize() > self::$tasknum * 2) {
                 // Mark the master task as ready so this happens only once
                 self::$taskmaster_status = true;
                 // The Redis connection must be closed before forking,
                 // otherwise the processes compete for the same Redis fd
                 cls_redis::close();
                 // Task IDs start at 2; ID 1 is used by the master process
                 for ($i = 2; $i <= self::$tasknum; $i++) {
                     $this->fork_one_task($i);
                 }
             }
         }
         // Refresh the panel after each collected page (only when logs are not shown)
         if (!log::$log_show) {
             $this->display_ui();
         }
     }
     // Show the final result
     log::$log_show = true;
     log::info("Spider is finished\n");
     $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
     log::info("Spider running time:{$spider_time_run}\n");
     $count_collected_url = $this->count_collected_url();
     log::info("Total pages:{$count_collected_url} \n\n");
     // Finally: multitasking without a saved running state clears the Redis data
     if (self::$tasknum > 1 && !self::$save_running_state) {
         $this->cache_clear();
     }
 }