/**
 * Collect the article links found on a list page.
 *
 * Strategy:
 *  1. Ask match_rules() for a rule set matching this URL/content. When one is
 *     found, extract links with it (page_get_type 1 => dom_page_link(),
 *     page_get_type 2 => string_page_link()) and return them if non-empty.
 *  2. Otherwise fall back to a heuristic: strip every link from the page,
 *     keep those accepted by check_fastpick_viewurl(), find the most common
 *     URL length among them and keep only links whose length is within
 *     5 bytes of it, then run the survivors through filter_url_callback().
 *
 * @param string $content Page source to scan.
 * @param string $url     URL the page was fetched from; also used as the base
 *                        URL when get_base_url() finds none in the page.
 * @param mixed  $list    Unused. Kept so existing three-argument callers keep
 *                        working (the previous code only defaulted it to $url
 *                        and never read it).
 *
 * @return array Extracted links; an empty array when nothing usable is found.
 *               Keys in the heuristic path are whatever array_resolve() and
 *               array_filter() leave in place.
 */
function evo_get_pagelink($content, $url, $list = array())
{
    // Start from a defined value so the "did the rules produce links?" check
    // below never reads an unset variable when no rule matches.
    $link_arr = array();

    $rules_info = match_rules($url, $content, 4, 0);
    if ($rules_info && is_array($rules_info)) {
        if ($rules_info['page_get_type'] == 1) {
            $link_arr = dom_page_link($content, array('page_link_rules' => $rules_info['page_link_rules'], 'url' => $url));
        } elseif ($rules_info['page_get_type'] == 2) {
            $link_arr = string_page_link($content, trim($rules_info['page_link_rules']), $url);
        }
    }
    if ($link_arr) {
        return $link_arr;
    }

    // Heuristic fallback: harvest every link on the page.
    $base_url = get_base_url($content);
    $base_url = $base_url ? $base_url : $url;
    $link_arr = _striplinks($content, $base_url);
    if (!$link_arr) {
        return array();
    }

    // Drop links that do not look like article URLs and remember the byte
    // length of each remaining one.
    $length_by_key = array();
    foreach ((array) $link_arr as $k => $v_url) {
        if (!check_fastpick_viewurl($v_url, $url)) {
            unset($link_arr[$k]);
            continue;
        }
        $length_by_key[$k] = strlen($v_url);
    }

    // Every candidate was rejected: there is no "typical length" to compute,
    // and array_count_values() must not be handed an undefined/null value.
    if (!$length_by_key) {
        return array();
    }

    // The most frequent URL length is taken as the shape of an article URL.
    $length_counts = array_count_values($length_by_key);
    arsort($length_counts);
    $lengths_by_frequency = array_keys($length_counts);
    $view_lenth = array_shift($lengths_by_frequency);

    $link_arr = array_resolve($link_arr);
    foreach ($link_arr as $k => $v) {
        // Links whose length is far from the typical one are treated as noise.
        if (abs(strlen($v) - $view_lenth) > 5) {
            unset($link_arr[$k]);
        }
    }

    return array_filter($link_arr, 'filter_url_callback');
}
// Fragment of a request handler that tests picking rules: it copies the rule
// settings from the query string into $arr, works out one URL to test, then
// fetches a single article from it. The fragment is truncated: it ends inside
// the `else` branch of the final `if`.

// Rule settings taken straight from the request.
$arr['content'] = format_url($_GET['content']);
$arr['article_get_type'] = $_GET['article_get_type'];
$arr['url_page_range'] = $_GET['url_page_range'];
$arr['page_link_rules'] = format_url($_GET['page_link_rules']);
$url_range_type = $_GET['url_range_type'];
$page_test_url = $_GET['page_test_url'];
$arr['url'] = $_GET['url'];
$arr['auto'] = $_GET['auto'];
$page_get_type = $_GET['page_get_type'];
$range_arr = get_url_range($arr['url_page_range']);
// Build $link_arr; only its first element is used below.
if ($page_test_url) {
    // An explicit test URL wins over any list/range expansion.
    $link_arr[0] = $page_test_url;
} else {
    if ($url_range_type == 'page') {
        // NOTE(review): $range_arr[0] (first entry returned by get_url_range())
        // is passed where other callers of dom_page_link() pass page content;
        // confirm which one these helpers expect.
        if ($page_get_type == 'dom') {
            $link_arr = dom_page_link($range_arr[0], $arr);
        } else {
            $link_arr = regexp_page_link($range_arr[0], $arr['page_link_rules']);
        }
    } else {
        // NOTE(review): $auto is not assigned anywhere in the visible code,
        // while $arr['auto'] is filled from the request above; this looks like
        // it was meant to test $arr['auto']. Confirm against the full file.
        $auto = $auto == 'yes' ? true : false;
        $link_arr = get_url_range($arr['url_page_range'], $auto);
    }
}
$url = url_auto($link_arr[0]);
// An explicitly supplied article URL overrides the derived one.
if ($arr['url']) {
    $url = $arr['url'];
}
if ($arr['article_get_type'] == 'dom') {
    // NOTE(review): $arr['title'] is never set in the visible code; verify it
    // is populated elsewhere, otherwise the title rule passed here is empty.
    $a_info = dom_single_article($url, array('title' => $arr['title'], 'content' => $arr['content']));
} else {
/**
 * Fetch one list page and extract its links with a single rule set.
 *
 * The rule set is read key by key instead of through extract(), so an
 * unexpected key (for example 'start_url' or 'rules_arr') can no longer
 * overwrite this function's own variables, and a missing key no longer
 * produces an undefined-variable notice.
 *
 * @param array  $rules_arr Rule set. Keys read here:
 *                          'test'         URL used when $start_url is empty,
 *                          'rules'        link rule; slashes are stripped,
 *                          'login_cookie' forwarded to get_contents(),
 *                          'type'         1 selects dom_page_link(), anything
 *                                         else selects string_page_link().
 * @param string $start_url URL to fetch; falls back to $rules_arr['test'].
 *
 * @return mixed Whatever dom_page_link() / string_page_link() returns.
 */
function many_list_get_page($rules_arr, $start_url = '')
{
    $rules_arr = (array) $rules_arr;
    $test_url = isset($rules_arr['test']) ? $rules_arr['test'] : null;
    $raw_rules = isset($rules_arr['rules']) ? $rules_arr['rules'] : '';
    $login_cookie = isset($rules_arr['login_cookie']) ? $rules_arr['login_cookie'] : null;
    $type = isset($rules_arr['type']) ? $rules_arr['type'] : null;

    $url = $start_url ? $start_url : $test_url;
    $rules = stripslashes($raw_rules);

    // 'cache' => -1 is passed through unchanged from the original call.
    $content = get_contents($url, array('login_cookie' => $login_cookie, 'cache' => -1));

    if ($type == 1) {
        // DOM-based extraction.
        return dom_page_link($content, array('page_link_rules' => $rules, 'url_page_range' => $url));
    }

    return string_page_link($content, $rules, $url);
}
/**
 * Crawl every URL queued for one level, then descend to the level below.
 *
 * Levels count down towards 1:
 *  - On a level above 1 each URL is a list page. Its links are extracted
 *    (DOM rule, string rule, or evo_get_pagelink() heuristics), stored as the
 *    queue for level - 1, and robot() recurses into that level.
 *  - On level 1 each URL is an article page. It is parsed, turned into a
 *    paginated or ordinary article, stored, and optionally its replies are
 *    collected.
 *
 * Progress counters ($this->i, $this->a, $this->v_a, $this->v_i) and the
 * per-level URL queues are mirrored into $this->pick_cache_data as the crawl
 * advances. The method returns early when the pick quota is reached, when
 * flip() returns a falsy value, or when the cached URL queues are empty.
 *
 * Changes from the previous revision:
 *  - the link count is taken only from arrays, so an extractor returning
 *    null/false no longer makes count() throw on PHP 8;
 *  - the duplicated "== quota" / "> quota" test is a single ">=";
 *  - the unused `global $_G` / $pick_config lookup and empty branches are gone.
 *
 * @param int $level Level to process (1 = article pages).
 *
 * @return void
 */
function robot($level)
{
    // NOTE(review): $del_flag is set once and never reset per URL, so after
    // the first visitable URL the trailing del_session_arr() call at the
    // bottom of the loop is skipped for all later URLs. Confirm this is
    // intended.
    $del_flag = 0;
    $this->now_level = $level;

    if (!$this->now_url_arr) {
        $this->restart_robot($this->now_level);
    }
    if (!$this->pick_cache_data['url_arr'][$this->now_level]) {
        $this->pick_cache_data['url_arr'][$this->now_level] = $this->now_url_arr;
    }

    foreach ((array) $this->now_url_arr as $url) {
        d_s('run');

        // Stop once the configured number of picks (plus 2) has been reached.
        if ($this->p_arr['pick_num'] && $this->i >= $this->p_arr['pick_num'] + 2) {
            return;
        }

        $this->pick_cache_data['now_level'] = $this->now_level;
        $this->now_url = $url;
        if ($this->p_arr['url_range_type'] == 3 || $this->now_level == $this->p_arr['manyou_max_level']) {
            $host_arr = $this->GetHostInfo($url);
            $this->base_url = $host_arr['host'];
        }
        $this->format_url();

        $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $this->i));
        show_pick_info(array(milu_lang('read_link'), $this->now_url), 'url', $show_args);

        $this->i++;
        $this->temp_arr['have_reply'] = 0;
        $this->pick_cache_data['i'] = $this->i;

        $visit_flag = $this->check_visit_url();
        if ($visit_flag > 0) {
            if ($this->now_level == 1) {
                // ---- Article level -------------------------------------
                if ($this->p_arr['rules_type'] == 3) {
                    // One-click picking: skip URLs that are not article pages.
                    if (!$this->check_fastpick_viewurl($this->now_url)) {
                        continue;
                    }
                }

                $content = $this->parse_page();
                $this->status_arr['now'] = $this->i;
                show_pick_info('', 'success', $this->status_arr);
                if ($this->p_arr['stop_time'][0]) {
                    sleep($this->p_arr['stop_time'][0]);
                }

                $get = 0;
                $this->temp_arr['have_page'] = 0;

                // Paginated article. Not attempted when reply rules are
                // configured; in that case the page is handled as an ordinary
                // article below.
                if ($this->p_arr['content_page_rules']) {
                    if (!($this->p_arr['reply_rules'] || $this->p_arr['reply_is_extend'])) {
                        $content_page_arr = $this->get_content_page($content);
                        if ($content_page_arr) {
                            $get = 1;
                            $this->a++;
                            $this->pick_cache_data['a'] = $this->a;
                            $this->temp_arr['have_page'] = 1;
                            $article_info_arr = $this->page_get_content($content, array(), array(), $content_page_arr);
                            if ($article_info_arr) {
                                // Fetch the remaining (non-body) fields.
                                $other_arr = $this->get_article_other($content);
                                $other_arr = $other_arr ? $other_arr : array();
                                $article_info_arr = array_merge($article_info_arr, $other_arr);
                                // Store the paginated article.
                                $this->create_page_article($article_info_arr);
                            } else {
                                $this->v_a++;
                                $this->pick_cache_data['v_a'] = $this->v_a;
                            }
                        }
                    }
                }

                if ($get == 0) {
                    // Ordinary (single page) article.
                    $ori_title = $this->get_ori_title($content);
                    $now = '-' . ($this->i - 1) . time();
                    $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                    show_pick_info(array(milu_lang('read_content'), cutstr($ori_title, 85)), 'left', $show_args);

                    $article_info = $this->get_article($content);
                    $this->status_arr['now'] = $now;
                    show_pick_info('', 'success', $this->status_arr);

                    $article_info = $this->format_article($article_info);
                    $this->get_pick_status();
                    // Recomputed rather than reusing $now, exactly as before;
                    // the two can differ if the clock ticks in between.
                    $this->status_arr['now'] = '-' . ($this->i - 1) . time();
                    $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                    $this->temp_arr['normal_now'] = $now;
                    show_pick_info(array(milu_lang('article'), cutstr(trim($article_info['title']), 85)), 'left', $show_args);

                    if ($this->check_article($article_info)) {
                        $this->create_article($article_info);
                    } else {
                        $this->v_a++;
                        $this->pick_cache_data['v_a'] = $this->v_a;
                    }
                }

                $this->insert_url();

                if ($this->aid || $this->public_info['insert_aid']) {
                    if ($this->p_arr['reply_rules'] || $this->p_arr['reply_is_extend']) {
                        // The article has replies. They are not collected when
                        // publishing directly without storing and the target
                        // is not the forum.
                        $skip_replies = $this->p_arr['is_public_del'] == 1 && $this->p_arr['is_auto_public'] == 1 && $this->p_arr['public_type'] != 2;
                        if (!$skip_replies) {
                            $now = '-' . ($this->i - 1) . time();
                            $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                            $this->temp_arr['reply_now'] = $now;
                            show_pick_info(array(milu_lang('pick_reply')), 'left', $show_args);

                            // reply_max_num is either "min,max" (a random
                            // value in that range is drawn) or a plain number.
                            if (strexists($this->p_arr['reply_max_num'], ',')) {
                                $arr = explode(',', $this->p_arr['reply_max_num']);
                                $this->reply_max_num = rand($arr[0], $arr[1]);
                            } else {
                                $this->reply_max_num = intval($this->p_arr['reply_max_num']);
                            }

                            $this->oldurl_arr = NULL;
                            $reply_arr = $this->page_get_reply($content, array($this->now_url));
                            // Remove duplicate replies.
                            $reply_arr = sarray_unique($reply_arr);
                            $this->create_reply($reply_arr);
                            $this->oldurl_arr = NULL;
                            $this->temp_arr['have_reply'] = 1;
                        }
                    }
                }
            }

            // ---- Link discovery for the next (lower) level -------------
            $msg = '';
            $link_count = 0;
            $next_link = array();

            if ($this->now_level > 1) {
                if ($this->p_arr['url_range_type'] == 1 || $this->p_arr['url_range_type'] == 5 || $this->p_arr['rules_type'] == 1) {
                    // Paged list, multi-level list, or built-in rules.
                    if ($this->p_arr['url_range_type'] == 5) {
                        $key_level = abs($this->now_level - 1 - count($this->p_arr['many_page_list'])) + 1;
                        $rules_arr = $this->p_arr['many_page_list'][$key_level];
                    } else {
                        // Reaching here implies url_range_type == 1 or
                        // rules_type == 1, so the former inner test was
                        // always true.
                        $rules_arr['type'] = $this->p_arr['page_get_type'];
                        $rules_arr['rules'] = $this->p_arr['page_link_rules'];
                    }

                    $content = $this->parse_page();
                    if ($rules_arr['type'] == 1) {
                        $next_link = dom_page_link($content, array('page_link_rules' => $rules_arr['rules'], 'url' => $this->now_url));
                    } elseif ($rules_arr['type'] == 2) {
                        $next_link = string_page_link($content, trim($rules_arr['rules']), $this->now_url);
                    } else {
                        $next_link = evo_get_pagelink($content, $this->now_url);
                    }

                    if ($this->p_arr['url_range_type'] == 1 && !$rules_arr['rules']) {
                        $msg = ' : ' . milu_lang('no_set_list_rules');
                    }
                    // Count only real arrays: count() on null/false throws on
                    // PHP 8.
                    $link_count = $this->temp_arr['per_num'] = is_array($next_link) ? count($next_link) : 0;
                    if ($link_count == 0 && $rules_arr['rules']) {
                        $msg = ' : ' . milu_lang('check_list_rules');
                    }
                    $this->get_pick_count();
                } elseif ($this->p_arr['rules_type'] == 3) {
                    // One-click picking.
                    $content = $this->parse_page();
                    $next_link = evo_get_pagelink($content, $this->now_url, $this->pick_cache_data['lilely_page']);
                    $link_count = is_array($next_link) ? count($next_link) : 0;
                }

                $this->get_pick_status(1);
                show_pick_info(milu_lang('get_link_c', array('c' => $link_count)) . $msg, $link_count > 0 ? 'success' : 'err', $this->status_arr);

                if ($next_link) {
                    // The extracted links become the queue of the level below.
                    $this->pick_cache_data['url_arr'][$this->now_level - 1] = $this->now_url_arr = $next_link;
                }
            } else {
                $next_link = $this->now_url_arr = $this->pick_cache_data['url_arr'][$this->now_level];
            }

            if (!$this->flip()) {
                return;
            }
            $this->del_session_arr($this->now_level);
            if (!$this->pick_cache_data['url_arr']) {
                return;
            }
            $del_flag = 1;

            if ($this->now_level > 1 && $next_link) {
                $this->now_level -= 1;
                $this->robot($level - 1);
            }
        } else {
            // The URL may not be visited: count it as skipped and report the
            // reason code returned by check_visit_url().
            $this->v_i++;
            $this->pick_cache_data['v_i'] = $this->v_i;
            $this->get_pick_status(1);
            show_pick_info(milu_lang('no_visit_err' . $visit_flag), 'err', $this->status_arr);
            if (!$this->flip()) {
                return;
            }
        }

        if ($del_flag != 1) {
            $this->del_session_arr($this->now_level);
        }
    }

    // This level is exhausted: step back up and reload the queue there.
    $this->now_level += 1;
    $this->restart_robot($this->now_level);
}