/**
 * Extract the article ("view") links contained in a list page.
 *
 * Strategy:
 *  1. Ask match_rules() for a rule matching this URL/content. If one is found,
 *     extract links with dom_page_link() (page_get_type == 1) or
 *     string_page_link() (page_get_type == 2) and return them when non-empty.
 *  2. Otherwise fall back to a heuristic: take every link on the page, keep the
 *     ones accepted by check_fastpick_viewurl(), find the most common URL
 *     length among them and keep only links whose length is within 5
 *     characters of it, then run the survivors through filter_url_callback.
 *
 * @param string $content Raw content of the list page.
 * @param string $url     URL the content was fetched from.
 * @param mixed  $list    Unused; kept so existing callers keep working.
 *
 * @return array Extracted links; an empty array when nothing usable is found.
 */
function evo_get_pagelink($content, $url, $list = array())
{
    // Always defined, so the emptiness check below never touches an unset variable.
    $link_arr = array();

    $rules_info = match_rules($url, $content, 4, 0);
    if ($rules_info && is_array($rules_info)) {
        if ($rules_info['page_get_type'] == 1) {
            $link_arr = dom_page_link($content, array(
                'page_link_rules' => $rules_info['page_link_rules'],
                'url' => $url,
            ));
        } elseif ($rules_info['page_get_type'] == 2) {
            $link_arr = string_page_link($content, trim($rules_info['page_link_rules']), $url);
        }
    }
    if ($link_arr) {
        return $link_arr;
    }

    // Heuristic fallback: resolve links against the base URL reported by
    // get_base_url(), or against the page URL when that returns nothing.
    $base_url = get_base_url($content);
    $base_url = $base_url ? $base_url : $url;

    $link_arr = _striplinks($content, $base_url);
    if (!$link_arr) {
        return array();
    }
    // Normalise once so the unset() calls below always operate on an array.
    $link_arr = (array) $link_arr;

    // Record the length of every link that passes the view-URL check.
    $length_by_key = array();
    foreach ($link_arr as $k => $v_url) {
        if (!check_fastpick_viewurl($v_url, $url)) {
            unset($link_arr[$k]);
            continue;
        }
        $length_by_key[$k] = strlen($v_url);
    }

    // Nothing passed the check: there is no dominant length to compute, and
    // array_count_values() must not be called without an array.
    if (!$length_by_key) {
        return array();
    }

    // The most frequent URL length is taken as the "typical" article URL length.
    $length_counts = array_count_values($length_by_key);
    arsort($length_counts);
    $ranked_lengths = array_keys($length_counts);
    $view_lenth = $ranked_lengths[0];

    // NOTE(review): array_resolve() is defined elsewhere; presumably it
    // de-duplicates / re-indexes the list - confirm.
    $link_arr = array_resolve($link_arr);

    // Drop links whose length differs from the typical length by more than 5.
    foreach ($link_arr as $k => $v) {
        if (abs(strlen($v) - $view_lenth) > 5) {
            unset($link_arr[$k]);
        }
    }

    return array_filter($link_arr, 'filter_url_callback');
}
/**
 * Build the initial URL list ($this->now_url_arr) and the crawl depth
 * ($this->max_level) from the task settings in $this->p_arr.
 *
 * Behaviour by $this->p_arr['rules_type']:
 *  - 1 (built-in rules): delegates to $this->parse_rules().
 *  - 2 (custom rules): depends on 'url_range_type':
 *      1 / 2 - expand a URL range with convert_url_range();
 *      4     - read the URLs from an RSS address via get_rss_url();
 *      5     - multi-level list starting at 'many_list_start_url'.
 *  - 3 (one-click collection): fetches the first start URL and tries to derive
 *    a paginated URL pattern, either from a numeric "page" query parameter or
 *    from links on the page that are at least 90% similar to the start URL.
 *
 * Afterwards, when 'page_fiter' == 1 and extra URLs are configured in
 * 'page_url_other', they are prepended to the list. The final depth is also
 * stored in $this->pick_cache_data['max_level'].
 *
 * @return void
 */
function get_start_url()
{
    $rules_type = $this->p_arr['rules_type'];

    if ($rules_type == 1) {
        // The collector uses built-in rules.
        $this->parse_rules();
    } elseif ($rules_type == 2) {
        // Custom rules.
        $range_type = $this->p_arr['url_range_type'];

        if ($range_type == 1 || $range_type == 2) {
            // Collect articles from a paginated list (1) or from a URL range (2).
            $args = array(
                'step' => $this->p_arr['page_url_auto_step'],
                'start' => $this->p_arr['page_url_auto_start'],
                'end' => $this->p_arr['page_url_auto_end'],
                'url' => $this->p_arr['url_page_range'],
                'auto' => $this->p_arr['page_url_auto'],
            );
            $this->now_url_arr = convert_url_range($args);

            if ($range_type == 2) {
                $this->max_level = 1;
                $this->temp_arr['per_num'] = 1;
            } else {
                $this->max_level = 2;
                $this->temp_arr['page_num'] = count($this->now_url_arr);
            }
        } elseif ($range_type == 4) {
            // From an RSS address.
            $this->now_url_arr = get_rss_url(2, $this->p_arr['rss_url']);
            $this->max_level = 1;
        } elseif ($range_type == 5) {
            // Multi-level list: one level per configured list plus one more.
            $this->now_url_arr = array($this->p_arr['many_list_start_url']);
            $this->max_level = count($this->p_arr['many_page_list']) + 1;
        }
    } elseif ($rules_type == 3) {
        // One-click collection.
        $start_arr = format_wrap($this->p_arr['manyou_start_url']);
        $this->now_url = $start_arr[0];
        $content = $this->parse_page();

        $rules_info = match_rules($this->now_url, $content, 4, 0);
        if ($rules_info && is_array($rules_info)) {
            $this->pick_cache_data['lilely_page'][] = $this->now_url;
            // NOTE(review): the list assigned here is always replaced further
            // down (by the generated range or by $start_arr) - confirm whether
            // rule-based links were meant to take precedence.
            if ($rules_info['page_get_type'] == 1) {
                $this->now_url_arr = dom_page_link($content, array(
                    'page_link_rules' => $rules_info['page_link_rules'],
                    'url' => $this->now_url,
                ));
            } else {
                $this->now_url_arr = string_page_link($content, trim($rules_info['page_link_rules']), $this->now_url);
            }
        }

        // Parse the query string defensively: the URL may have no query part,
        // and parse_url() returns false for seriously malformed URLs.
        $page_url_arr = parse_url($this->now_url);
        $query = isset($page_url_arr['query']) ? $page_url_arr['query'] : '';
        $url_info = array();
        parse_str($query, $url_info);

        $var_url = '';
        $index_url = 0;
        $auto = 0;

        if (isset($url_info['page']) && is_numeric($url_info['page'])) {
            // The start URL carries an explicit page number: turn it into the
            // "(*)" placeholder that is passed to convert_url_range() below.
            $var_url = str_replace('page=' . $url_info['page'], 'page=(*)', $this->now_url);
            $this->pick_cache_data['lilely_page'][] = $this->now_url;
        } else {
            // No page parameter: look for links that closely resemble the
            // start URL and derive the varying part from them.
            $page_all_link = $this->parse_page('link', $content);
            $page_all_link = is_array($page_all_link)
                ? array_filter($page_all_link, 'filter_url_callback')
                : array();

            $likely_arr = array($this->now_url);
            foreach ($page_all_link as $v) {
                similar_text($v, $this->now_url, $percent);
                if ($percent >= 90) {
                    $likely_arr[] = $v;
                }
            }
            $likely_arr = array_resolve($likely_arr);

            $var_arr = get_url_diff($likely_arr);
            $var_url = $var_arr['url'];
            $index_url = $var_arr['index'];
            $auto = $var_arr['auto'];

            if ($var_url && is_array($likely_arr)) {
                // Remember one random look-alike page in the cache.
                $key = array_rand($likely_arr);
                $this->pick_cache_data['lilely_page'][] = $likely_arr[$key];
            }
        }

        if ($var_url) {
            // When a separate index URL was detected it is queued first and
            // the generated range starts at 2 instead of 1.
            $this->now_url_arr = convert_url_range(array(
                'url' => $var_url,
                'step' => 1,
                'start' => $index_url ? 2 : 1,
                'end' => 99,
                'auto' => $auto,
            ));
            if ($index_url) {
                array_unshift($this->now_url_arr, $index_url);
            }
        } else {
            $this->now_url_arr = $start_arr;
        }

        // For one-click collection the depth always comes from the task
        // setting, defaulting to 2.
        $this->max_level = $this->p_arr['manyou_max_level'] ? $this->p_arr['manyou_max_level'] : 2;
    }

    // URL filtering is enabled: prepend the additionally configured URLs.
    if ($this->p_arr['page_fiter'] == 1 && $this->now_url_arr && $this->p_arr['page_url_other']) {
        $this->now_url_arr = array_merge(format_wrap($this->p_arr['page_url_other']), $this->now_url_arr);
        $this->temp_arr['page_num'] = count($this->now_url_arr);
    }

    $this->pick_cache_data['max_level'] = $this->max_level;
}