/**
 * Start the spider: validate the environment, seed the queue with the
 * configured scan URLs and collect pages until the queue is empty.
 *
 * Sequence, as implemented below:
 *  1. Parse the command line and reset the per-run static state.
 *  2. Check preconditions (pcntl for multitasking, Redis for multitasking
 *     or for saving the running state, export settings, scan_urls) and
 *     exit when one of them fails.
 *  3. Invoke the optional on_start callback with this instance.
 *  4. Remove leftover files from PATH_DATA/status.
 *  5. Push every scan URL onto the queue and loop on collect_page() while
 *     the queue is not empty, forking the extra tasks once the queue has
 *     grown past twice the task number.
 *  6. Log the final summary.
 *
 * The process is terminated with `exit` on any failed precondition.
 */
public function start()
{
    $this->parse_command();

    // Spider start time.
    self::$time_start = time();
    // Current task ID.
    self::$taskid = 1;
    // Current task process ID (falls back to 1 when posix is unavailable).
    self::$taskpid = function_exists('posix_getpid') ? posix_getpid() : 1;
    // The current task is the master task.
    self::$taskmaster = true;
    self::$collect_succ = 0;
    self::$collect_fail = 0;

    //--------------------------------------------------------------------------------
    // Pre-run validation
    //--------------------------------------------------------------------------------

    // Multitasking requires the pcntl extension.
    if (self::$tasknum > 1 && !function_exists('pcntl_fork')) {
        log::error("When the task number greater than 1 need pcntl extension");
        exit;
    }

    // Saving the running state requires Redis.
    if (self::$save_running_state && !cls_redis::init()) {
        log::error("Save the running state need Redis support,Error: " . cls_redis::$error . "\n\nPlease check the configuration file config/inc_config.php\n");
        exit;
    }

    // Multitasking requires Redis.
    if (self::$tasknum > 1 && !cls_redis::init()) {
        log::error("Multitasking need Redis support,Error: " . cls_redis::$error . "\n\nPlease check the configuration file config/inc_config.php\n");
        exit;
    }

    // Validate the export configuration.
    $this->export_auth();

    // At least one scan URL is required.
    if (empty(self::$configs['scan_urls'])) {
        log::error("No scan url to start\n");
        exit;
    }

    // Called at this point so that the callback can add entry pages.
    if ($this->on_start) {
        call_user_func($this->on_start, $this);
    }

    foreach (self::$configs['scan_urls'] as $url) {
        if (!$this->is_scan_page($url)) {
            log::error("Domain of scan_urls (\"{$url}\") does not match the domains of the domain name\n");
            exit;
        }
    }

    // The panel cannot be displayed on Windows, so log output is forced there.
    if (util::is_win()) {
        log::$log_show = true;
    } else {
        log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
    }

    if (log::$log_show) {
        log::info("\n[ " . self::$configs['name'] . " Spider ] is started...\n");
        log::warn("Task Number:" . self::$tasknum . "\n");
        log::warn("!Documentation:\nhttps://doc.phpspider.org\n");
    }

    // Remove leftover status files. scandir() returns false when the
    // directory is missing or unreadable, so guard before iterating.
    $status_dir = PATH_DATA . "/status";
    $status_files = is_dir($status_dir) ? scandir($status_dir) : false;
    if ($status_files !== false) {
        foreach ($status_files as $v) {
            if ($v === '.' || $v === '..') {
                continue;
            }
            $filepath = $status_dir . "/" . $v;
            // Best-effort removal; failures are deliberately ignored.
            @unlink($filepath);
        }
    }

    //--------------------------------------------------------------------------------
    // Multitasking setup
    //--------------------------------------------------------------------------------

    // When multitasking without keeping the running state, clear the cache
    // first (per the original comment, this empties the data held in Redis).
    if (self::$tasknum > 1 && !self::$save_running_state) {
        $this->cache_clear();
    }

    // Seed the queue with the configured scan URLs.
    foreach (self::$configs['scan_urls'] as $url) {
        $link = array(
            'url'          => $url,
            'url_type'     => 'scan_page',
            'method'       => 'get',
            'headers'      => array(),
            'params'       => array(),
            'context_data' => '',
            'proxy'        => self::$configs['proxy'],
            'try_num'      => 0,
            'max_try'      => self::$configs['max_try'],
        );
        $this->queue_lpush($link);
    }

    while ($this->queue_lsize()) {
        // Collect one page.
        $this->collect_page();

        // Multitasking and the extra tasks have not been spawned yet.
        if (self::$tasknum > 1 && !self::$taskmaster_status) {
            // Once the queue holds more than twice the task number of pages,
            // spawn the other tasks to collect in parallel.
            if ($this->queue_lsize() > self::$tasknum * 2) {
                // Mark the master as ready so the tasks are forked only once.
                self::$taskmaster_status = true;

                // The Redis connection must be closed before forking,
                // otherwise the processes would compete for the same Redis fd.
                cls_redis::close();

                // Task ids start at 2; id 1 is used by the master process.
                for ($i = 2; $i <= self::$tasknum; $i++) {
                    $this->fork_one_task($i);
                }
            }
        }

        // Refresh the panel after each collected page when logs are hidden.
        if (!log::$log_show) {
            $this->display_ui();
        }
    }

    // Show the final result.
    log::$log_show = true;
    log::info("Spider is finished\n");

    $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
    log::info("Spider running time:{$spider_time_run}\n");

    $count_collected_url = $this->count_collected_url();
    log::info("Total pages:{$count_collected_url} \n\n");

    // Finally: when multitasking without keeping the running state, clear the cache again.
    if (self::$tasknum > 1 && !self::$save_running_state) {
        $this->cache_clear();
    }
}
/**
 * Run the worker instance.
 *
 * Records the start time, registers this process as worker 0, sets the
 * process title, invokes the optional on_start callback, forks
 * `$this->count` workers and finally hands over to monitor_workers().
 */
public function run()
{
    $this->time_start = microtime(true);
    $this->worker_id  = 0;
    $this->worker_pid = posix_getpid();
    $this->set_process_title($this->title);

    // Assigned here so that the forked worker processes get the value too.
    if ($this->log_show) {
        log::$log_show = true;
    }

    if ($this->on_start) {
        call_user_func($this->on_start, $this);
    }

    // Worker ids start at 1; id 0 is used by the master process.
    $worker_no = 1;
    while ($worker_no <= $this->count) {
        $this->fork_one_worker($worker_no);
        $worker_no++;
    }

    $this->monitor_workers();
}