Exemplo n.º 1
0
/**
 * Extract the article links found on a list page.
 *
 * A rule matched by match_rules() is tried first: depending on its
 * 'page_get_type' the links are taken with dom_page_link() (type 1) or
 * string_page_link() (type 2). When that yields nothing, every link in the
 * page is collected with _striplinks(), links rejected by
 * check_fastpick_viewurl() are dropped, and only links whose length is within
 * 5 characters of the most frequent link length are kept.
 *
 * @param string $content HTML of the page to scan.
 * @param string $url     URL the page was fetched from; used as base URL when
 *                        get_base_url() finds none.
 * @param array  $list    Accepted for backward compatibility; not used.
 *
 * @return array Link list; an empty array when nothing usable is found.
 */
function evo_get_pagelink($content, $url, $list = array())
{
    // Must be initialised: neither branch below runs when no rule matches or
    // the rule has an unexpected page_get_type.
    $link_arr = array();
    $rules_info = match_rules($url, $content, 4, 0);
    if ($rules_info && is_array($rules_info)) {
        if ($rules_info['page_get_type'] == 1) {
            $link_arr = dom_page_link($content, array('page_link_rules' => $rules_info['page_link_rules'], 'url' => $url));
        } elseif ($rules_info['page_get_type'] == 2) {
            $link_arr = string_page_link($content, trim($rules_info['page_link_rules']), $url);
        }
    }
    if ($link_arr) {
        return $link_arr;
    }

    // Fallback: take every link in the page, resolved against the base URL.
    $base_url = get_base_url($content);
    $base_url = $base_url ? $base_url : $url;
    $link_arr = _striplinks($content, $base_url);
    if (!$link_arr) {
        return array();
    }

    // Record the length of every link that passes the article-URL check.
    $c_arr = array();
    foreach ((array) $link_arr as $k => $v_url) {
        if (!check_fastpick_viewurl($v_url, $url)) {
            unset($link_arr[$k]);
            continue;
        }
        $c_arr[$k] = strlen($v_url);
    }
    if (!$c_arr) {
        // Every link was rejected; there is no length to compare against.
        return array();
    }

    // The most frequent length is the reference length.
    $value_count_arr = array_count_values($c_arr);
    arsort($value_count_arr);
    $value_count_arr = array_keys($value_count_arr);
    $view_lenth = array_shift($value_count_arr);

    $link_arr = array_resolve($link_arr);
    foreach ($link_arr as $k => $v) {
        // Drop links whose length differs from the reference by more than 5.
        if (abs(strlen($v) - $view_lenth) > 5) {
            unset($link_arr[$k]);
        }
    }
    $link_arr = array_filter($link_arr, 'filter_url_callback');
    return $link_arr;
}
Exemplo n.º 2
0
 // Collect the rule-test parameters from the query string. Most go into $arr,
 // which is later handed to the link/article helpers; a few stay as locals.
 $arr['content'] = format_url($_GET['content']);
 $arr['article_get_type'] = $_GET['article_get_type'];
 $arr['url_page_range'] = $_GET['url_page_range'];
 $arr['page_link_rules'] = format_url($_GET['page_link_rules']);
 $url_range_type = $_GET['url_range_type'];
 $page_test_url = $_GET['page_test_url'];
 $arr['url'] = $_GET['url'];
 $arr['auto'] = $_GET['auto'];
 $page_get_type = $_GET['page_get_type'];
 $range_arr = get_url_range($arr['url_page_range']);
 // Build $link_arr: an explicit test URL wins; otherwise links come either
 // from the first page of the range or from the range itself.
 if ($page_test_url) {
     $link_arr[0] = $page_test_url;
 } else {
     if ($url_range_type == 'page') {
         // Extract links from the first entry of the range, by DOM rule or by
         // regular expression depending on page_get_type.
         if ($page_get_type == 'dom') {
             $link_arr = dom_page_link($range_arr[0], $arr);
         } else {
             $link_arr = regexp_page_link($range_arr[0], $arr['page_link_rules']);
         }
     } else {
         // NOTE(review): $auto is not assigned anywhere in the visible code;
         // the request value was stored in $arr['auto'] above. Confirm whether
         // $arr['auto'] was intended here.
         $auto = $auto == 'yes' ? true : false;
         $link_arr = get_url_range($arr['url_page_range'], $auto);
     }
 }
 // Use the first link as the article URL unless one was given explicitly.
 $url = url_auto($link_arr[0]);
 if ($arr['url']) {
     $url = $arr['url'];
 }
 if ($arr['article_get_type'] == 'dom') {
     $a_info = dom_single_article($url, array('title' => $arr['title'], 'content' => $arr['content']));
 } else {
Exemplo n.º 3
0
/**
 * Fetch one list page and extract its links using the given rule set.
 *
 * Keys read from $rules_arr:
 *  - 'test'         URL used when $start_url is empty.
 *  - 'rules'        Link extraction rule; slashes are stripped before use.
 *  - 'type'         1 selects dom_page_link(); anything else selects
 *                   string_page_link().
 *  - 'login_cookie' Passed through to get_contents().
 *
 * The keys are read explicitly instead of through extract(), so an unexpected
 * key in $rules_arr can no longer overwrite $start_url or any other local, and
 * a missing key no longer produces an undefined variable.
 *
 * @param array  $rules_arr Rule set as described above.
 * @param string $start_url URL to fetch; falls back to $rules_arr['test'].
 *
 * @return mixed Whatever dom_page_link() / string_page_link() returns.
 */
function many_list_get_page($rules_arr, $start_url = '')
{
    $rules_arr = is_array($rules_arr) ? $rules_arr : array();
    $test = isset($rules_arr['test']) ? $rules_arr['test'] : '';
    $rules = isset($rules_arr['rules']) ? stripslashes($rules_arr['rules']) : '';
    $type = isset($rules_arr['type']) ? $rules_arr['type'] : null;
    $login_cookie = isset($rules_arr['login_cookie']) ? $rules_arr['login_cookie'] : null;

    $url = $start_url ? $start_url : $test;
    $content = get_contents($url, array('login_cookie' => $login_cookie, 'cache' => -1));
    if ($type == 1) {
        // DOM-based rule.
        return dom_page_link($content, array('page_link_rules' => $rules, 'url_page_range' => $url));
    }
    return string_page_link($content, $rules, $url);
}
Exemplo n.º 4
0
 /**
  * Process every URL queued in $this->now_url_arr for the given level.
  *
  * At level 1 each URL is treated as an article page: it is parsed, an
  * article (paginated or regular) is created and, when reply rules are
  * configured, replies are collected. At levels above 1 each URL is treated
  * as a list page: the next batch of links is extracted from it and the
  * method recurses one level down. When the loop finishes, the level is
  * raised by one and restart_robot() is called.
  *
  * The method returns early (without a value) when the pick limit is
  * reached, when flip() returns false, or when the cached URL list is empty.
  *
  * @param int $level Level to process; 1 is handled as the article level.
  *
  * @return void
  */
 function robot($level)
 {
     global $_G;
     // NOTE(review): $pick_config is assigned but not read in this method.
     $pick_config = $_G['cache']['evn_milu_pick']['pick_config'];
     $del_flag = 0;
     $this->now_level = $level;
     if (!$this->now_url_arr) {
         $this->restart_robot($this->now_level);
     }
     // Seed the cached URL list for this level on first use.
     if (!$this->pick_cache_data['url_arr'][$this->now_level]) {
         $this->pick_cache_data['url_arr'][$this->now_level] = $this->now_url_arr;
     }
     foreach ((array) $this->now_url_arr as $k => $url) {
         d_s('run');
         // Stop once a pick limit is set and the counter has reached limit + 2.
         if ($this->p_arr['pick_num'] && $this->i == $this->p_arr['pick_num'] + 2 || $this->p_arr['pick_num'] && $this->i > $this->p_arr['pick_num'] + 2) {
             return;
         }
         $this->pick_cache_data['now_level'] = $this->now_level;
         $this->now_url = $url;
         // For range type 3, or at the configured maximum level, take the base
         // URL from the host of the current URL.
         if ($this->p_arr['url_range_type'] == 3 || $this->now_level == $this->p_arr['manyou_max_level']) {
             $host_arr = $this->GetHostInfo($url);
             $this->base_url = $host_arr['host'];
         }
         $this->format_url();
         $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $this->i));
         show_pick_info(array(milu_lang('read_link'), $this->now_url), 'url', $show_args);
         $this->i++;
         $this->temp_arr['have_reply'] = 0;
         $this->pick_cache_data['i'] = $this->i;
         // A positive flag means the URL may be visited; any other value is
         // reported as an error in the else branch below.
         $visit_flag = $this->check_visit_url();
         if ($visit_flag > 0) {
             if ($this->now_level == 1) {
                 if ($this->p_arr['rules_type'] == 3) {
                     // One-click pick: check whether this URL is an article page.
                     if (!$this->check_fastpick_viewurl($this->now_url)) {
                         continue;
                     } else {
                         //exit($this->now_url);
                     }
                 }
                 $content = $this->parse_page();
                 $this->status_arr['now'] = $this->i;
                 show_pick_info('', 'success', $this->status_arr);
                 // Optional pause between requests.
                 if ($this->p_arr['stop_time'][0]) {
                     sleep($this->p_arr['stop_time'][0]);
                 }
                 // $get becomes 1 when the page was handled as a paginated article.
                 $get = 0;
                 $this->temp_arr['have_page'] = 0;
                 if ($this->p_arr['content_page_rules']) {
                     // Paginated article.
                     if ($this->p_arr['reply_rules'] || $this->p_arr['reply_is_extend']) {
                         // Replies: the paginated path is skipped when reply rules are set.
                     } else {
                         $content_page_arr = $this->get_content_page($content);
                         if ($content_page_arr) {
                             $get = 1;
                             $this->a++;
                             $this->pick_cache_data['a'] = $this->a;
                             $this->temp_arr['have_page'] = 1;
                             $article_info_arr = $this->page_get_content($content, array(), array(), $content_page_arr);
                             if ($article_info_arr) {
                                 // Fetch the other article fields.
                                 $other_arr = $this->get_article_other($content);
                                 $other_arr = $other_arr ? $other_arr : array();
                                 $article_info_arr = array_merge($article_info_arr, $other_arr);
                                 $this->create_page_article($article_info_arr);
                                 // Store the paginated article.
                             } else {
                                 $this->v_a++;
                                 $this->pick_cache_data['v_a'] = $this->v_a;
                             }
                         }
                     }
                 }
                 if ($get == 0) {
                     // Regular (non-paginated) article.
                     $ori_title = $this->get_ori_title($content);
                     $now = '-' . ($this->i - 1) . time();
                     $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                     show_pick_info(array(milu_lang('read_content'), cutstr($ori_title, 85)), 'left', $show_args);
                     $article_info = $this->get_article($content);
                     $this->status_arr['now'] = $now;
                     show_pick_info('', 'success', $this->status_arr);
                     $article_info = $this->format_article($article_info);
                     $this->get_pick_status();
                     $this->status_arr['now'] = '-' . ($this->i - 1) . time();
                     $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                     $this->temp_arr['normal_now'] = $now;
                     show_pick_info(array(milu_lang('article'), cutstr(trim($article_info['title']), 85)), 'left', $show_args);
                     // Create the article only when it passes check_article();
                     // otherwise count it in v_a.
                     if ($this->check_article($article_info)) {
                         $this->create_article($article_info);
                     } else {
                         $this->v_a++;
                         $this->pick_cache_data['v_a'] = $this->v_a;
                     }
                 }
                 $this->insert_url();
                 if ($this->aid || $this->public_info['insert_aid']) {
                     if ($this->p_arr['reply_rules'] || $this->p_arr['reply_is_extend']) {
                         // The article has replies.
                         if ($this->p_arr['is_public_del'] == 1 && $this->p_arr['is_auto_public'] == 1 && $this->p_arr['public_type'] != 2) {
                             // When publishing directly, without storing, and not to the forum,
                             // there is no need to collect replies.
                         } else {
                             $now = '-' . ($this->i - 1) . time();
                             $show_args = array_merge($this->msg_args, array('li_no_end' => 1, 'no_border' => 1, 'now' => $now));
                             $this->temp_arr['reply_now'] = $now;
                             show_pick_info(array(milu_lang('pick_reply')), 'left', $show_args);
                             // reply_max_num is either a single number or "a,b", in which
                             // case a random value between a and b is used.
                             if (strexists($this->p_arr['reply_max_num'], ',')) {
                                 $arr = explode(',', $this->p_arr['reply_max_num']);
                                 $this->reply_max_num = rand($arr[0], $arr[1]);
                             } else {
                                 $this->reply_max_num = intval($this->p_arr['reply_max_num']);
                             }
                             $this->oldurl_arr = NULL;
                             $reply_arr = $this->page_get_reply($content, array($this->now_url));
                             $reply_arr = sarray_unique($reply_arr);
                             // Remove duplicates.
                             $this->create_reply($reply_arr);
                             $this->oldurl_arr = NULL;
                             $this->temp_arr['have_reply'] = 1;
                         }
                     }
                 }
             }
             $msg = '';
             $link_count = 0;
             $next_link = array();
             if ($this->now_level > 1) {
                 if ($this->p_arr['url_range_type'] == 1 || $this->p_arr['url_range_type'] == 5 || $this->p_arr['rules_type'] == 1) {
                     // Paginated list or multi-level list; links are fetched with the built-in rules.
                     if ($this->p_arr['url_range_type'] == 5) {
                         // Select the rule set for this level from many_page_list.
                         $key_level = abs($this->now_level - 1 - count($this->p_arr['many_page_list'])) + 1;
                         $rules_arr = $this->p_arr['many_page_list'][$key_level];
                     } else {
                         if ($this->p_arr['url_range_type'] == 1 || $this->p_arr['rules_type'] == 1) {
                             $rules_arr['type'] = $this->p_arr['page_get_type'];
                             $rules_arr['rules'] = $this->p_arr['page_link_rules'];
                         }
                     }
                     $content = $this->parse_page();
                     // Rule type 1 uses the DOM extractor, type 2 the string extractor;
                     // any other type falls back to evo_get_pagelink().
                     if ($rules_arr['type'] == 1) {
                         $next_link = dom_page_link($content, array('page_link_rules' => $rules_arr['rules'], 'url' => $this->now_url));
                     } else {
                         if ($rules_arr['type'] == 2) {
                             $next_link = string_page_link($content, trim($rules_arr['rules']), $this->now_url);
                         } else {
                             $next_link = evo_get_pagelink($content, $this->now_url);
                         }
                     }
                     if ($this->p_arr['url_range_type'] == 1 && !$rules_arr['rules']) {
                         $msg = ' : ' . milu_lang('no_set_list_rules');
                     }
                     $link_count = $this->temp_arr['per_num'] = count($next_link);
                     if ($link_count == 0 && $rules_arr['rules']) {
                         $msg = ' : ' . milu_lang('check_list_rules');
                     }
                     $this->get_pick_count();
                 } else {
                     if ($this->p_arr['rules_type'] == 3) {
                         // One-click pick.
                         $content = $this->parse_page();
                         $next_link = evo_get_pagelink($content, $this->now_url, $this->pick_cache_data['lilely_page']);
                         $link_count = count($next_link);
                     }
                 }
                 $this->get_pick_status(1);
                 show_pick_info(milu_lang('get_link_c', array('c' => $link_count)) . $msg, $link_count > 0 ? 'success' : 'err', $this->status_arr);
                 // The extracted links become the work list of the level below.
                 if ($next_link) {
                     $this->pick_cache_data['url_arr'][$this->now_level - 1] = $this->now_url_arr = $next_link;
                 }
             } else {
                 $next_link = $this->now_url_arr = $this->pick_cache_data['url_arr'][$this->now_level];
             }
             // Abort the whole run when flip() returns false.
             if (!$this->flip()) {
                 return;
             }
             $this->del_session_arr($this->now_level);
             if (!$this->pick_cache_data['url_arr']) {
                 return;
             }
             // Remember that del_session_arr() has already been called, so the
             // check at the end of the loop body does not call it again.
             $del_flag = 1;
             // Descend one level to process the links that were just collected.
             if ($this->now_level > 1 && $next_link) {
                 $this->now_level -= 1;
                 $this->robot($level - 1);
             }
         } else {
             // URL must not be visited: count it and report the reason.
             $this->v_i++;
             $this->pick_cache_data['v_i'] = $this->v_i;
             $this->get_pick_status(1);
             show_pick_info(milu_lang('no_visit_err' . $visit_flag), 'err', $this->status_arr);
             if (!$this->flip()) {
                 return;
             }
         }
         if ($del_flag != 1) {
             $this->del_session_arr($this->now_level);
         }
     }
     // All URLs of this level are done: move up one level and restart.
     $this->now_level += 1;
     $this->restart_robot($this->now_level);
 }