Esempio n. 1
0
/**
 * Collect the article links contained in a list page.
 *
 * A rule matched via match_rules() (type 4) is tried first: depending on its
 * 'page_get_type' the links are extracted by dom_page_link() (1) or
 * string_page_link() (2). When that yields nothing, every link is stripped
 * from the page, filtered through check_fastpick_viewurl(), and only the
 * links whose string length is within 5 characters of the most common link
 * length are kept.
 *
 * @param string $content HTML of the list page.
 * @param string $url     URL the page was fetched from.
 * @param mixed  $list    Unused; kept so existing callers keep working.
 *
 * @return array Extracted links; an empty array when nothing usable is found.
 */
function evo_get_pagelink($content, $url, $list = array())
{
    // Must exist even when no rule matches, otherwise the check below reads
    // an undefined variable.
    $link_arr = array();

    $rules_info = match_rules($url, $content, 4, 0);
    if ($rules_info && is_array($rules_info)) {
        if ($rules_info['page_get_type'] == 1) {
            $link_arr = dom_page_link($content, array('page_link_rules' => $rules_info['page_link_rules'], 'url' => $url));
        } elseif ($rules_info['page_get_type'] == 2) {
            $link_arr = string_page_link($content, trim($rules_info['page_link_rules']), $url);
        }
    }
    if ($link_arr) {
        return $link_arr;
    }

    // Fall back to the length heuristic over every link on the page.
    $base_url = get_base_url($content);
    if (!$base_url) {
        $base_url = $url;
    }
    $link_arr = _striplinks($content, $base_url);
    if (!$link_arr) {
        return array();
    }
    $link_arr = (array) $link_arr;

    $length_by_key = array();
    foreach ($link_arr as $k => $v_url) {
        if (!check_fastpick_viewurl($v_url, $url)) {
            unset($link_arr[$k]);
            continue;
        }
        $length_by_key[$k] = strlen($v_url);
    }
    if (!$length_by_key) {
        // Every candidate was rejected: there is no "typical" length to
        // compare against, and array_count_values() must not be fed null.
        return array();
    }

    // The most frequent link length is taken as the length of a view URL.
    $value_count_arr = array_count_values($length_by_key);
    arsort($value_count_arr);
    $length_ranking = array_keys($value_count_arr);
    $view_lenth = $length_ranking[0];

    $link_arr = array_resolve($link_arr);
    foreach ($link_arr as $k => $v) {
        if (abs(strlen($v) - $view_lenth) > 5) {
            unset($link_arr[$k]);
        }
    }

    return array_filter($link_arr, 'filter_url_callback');
}
Esempio n. 2
0
 /**
  * Extract the article for $this->url from the page source in $this->str.
  *
  * Rules are looked up locally with match_rules() (type 3, then type 5).
  * When no local rule matches, cloud_match_rules() is asked instead and its
  * result is returned directly. When a local rule matches, the article is
  * extracted with evo_rules_get_article(); an extraction without content is
  * reported through write_evo_errlog(), and the rule's hit counter is
  * increased unless the page is the rule's own 'detail_ID_test' URL.
  *
  * A missing title is filled from $this->get_title()['html'] when available.
  *
  * @return array Article data, or an empty array when no rule matched and the
  *               server returned no content.
  */
 function evo_get()
 {
     // The result is not used here; the call is kept in case
     // pick_common_get() has side effects this method relies on.
     pick_common_get();

     // Fetch from the locally learned rules.
     $rules_info = match_rules($this->url, $this->str, 3, 0);
     if (!is_array($rules_info) || !$rules_info) {
         // Search detail pages only: try the local built-in rules.
         $rules_info = match_rules($this->url, $this->str, 5, 0);
     }

     if (!is_array($rules_info) || !$rules_info) {
         // Fetch from the server side.
         $data = cloud_match_rules(3, $this->url, $this->str);
         if (empty($data['content'])) {
             return array();
         }
         if (empty($data['title'])) {
             $re_title = $this->get_title();
             if (!empty($re_title['html'])) {
                 $data['title'] = $re_title['html'];
             }
         }
         return $data;
     }

     // NOTE(review): the original set $data['evo'] = 2 before this point, but
     // the value was overwritten on every path and never returned — confirm
     // whether callers were meant to receive it.
     $data = evo_rules_get_article($this->str, $rules_info);
     if (empty($data['content'])) {
         // A rule matched but produced no content, so the rule itself is
         // broken: record it.
         pload('F:fastpick');
         write_evo_errlog($data, $this->url, $rules_info);
     }
     if (empty($data['title'])) {
         $re_title = $this->get_title();
         // Only take the fallback when it actually carries a title, matching
         // the server-side branch above.
         if (!empty($re_title['html'])) {
             $data['title'] = $re_title['html'];
         }
     }
     if ($rules_info['detail_ID_test'] != $this->url) {
         DB::update("strayer_evo", array('hit_num' => $rules_info['hit_num'] + 1), array("id" => $rules_info['id']));
     }
     return $data;
 }
Esempio n. 3
0
/**
 * Find the rule set matching the URL passed in $_GET['url'].
 *
 * The page is downloaded and matched against local rules (match_rules(),
 * type 2). Without a local match, pick_match_coloud_rules() is consulted;
 * a response with 'data_type' == 1 has its 'data' stored via rules_add()
 * and the search index (2) is cleared.
 *
 * @return string JSON array [rules_type, rules_hash], or the literal 'no'
 *                when no rule set is available.
 */
function pick_match_rules()
{
    $target_url = format_url($_GET['url']);
    d_s();
    $page_source = get_contents($target_url);

    $rules = match_rules($target_url, $page_source, 2, 0);
    $has_rules = $rules && is_array($rules);

    if (!$has_rules) {
        $rules = pick_match_coloud_rules($target_url);
        // NOTE(review): a response with any other 'data_type' is used as the
        // rule set as-is — confirm that such responses carry 'rules_type'
        // and 'rules_hash' themselves.
        if ($rules['data_type'] == 1) {
            pload('F:rules');
            $rules = $rules['data'];
            rules_add($rules);
            del_search_index(2);
        }
        $has_rules = $rules && is_array($rules);
    }

    if ($has_rules) {
        return json_encode(array($rules['rules_type'], $rules['rules_hash']));
    }
    return 'no';
}
Esempio n. 4
0
 /**
  * Prepare the starting URL list and the crawl depth for this pick task.
  *
  * The strategy depends on $this->p_arr['rules_type']:
  *  - 1: delegate to $this->parse_rules();
  *  - 2: expand the configured URL range, RSS address or multi-level list;
  *  - 3: load the first start URL and try to derive a pagination pattern,
  *       either from a numeric "page" query parameter or from links on the
  *       page that are at least 90% similar to the start URL.
  *
  * Results are written to $this->now_url_arr and $this->max_level; the level
  * is also stored in $this->pick_cache_data['max_level'].
  *
  * @return void
  */
 function get_start_url()
 {
     if ($this->p_arr['rules_type'] == 1) {
         // The picker uses built-in rules.
         $this->parse_rules();
     } elseif ($this->p_arr['rules_type'] == 2) {
         // Custom rules.
         if ($this->p_arr['url_range_type'] == 1 || $this->p_arr['url_range_type'] == 2) {
             // Collect articles from a paginated list, or from a URL range.
             $args = array(
                 'step' => $this->p_arr['page_url_auto_step'],
                 'start' => $this->p_arr['page_url_auto_start'],
                 'end' => $this->p_arr['page_url_auto_end'],
                 'url' => $this->p_arr['url_page_range'],
                 'auto' => $this->p_arr['page_url_auto'],
             );
             $this->now_url_arr = convert_url_range($args);
             $this->max_level = 2;
             if ($this->p_arr['url_range_type'] == 2) {
                 $this->max_level = 1;
                 $this->temp_arr['per_num'] = 1;
             } else {
                 $this->temp_arr['page_num'] = count($this->now_url_arr);
             }
         } elseif ($this->p_arr['url_range_type'] == 4) {
             // From an RSS address.
             $this->now_url_arr = get_rss_url(2, $this->p_arr['rss_url']);
             $this->max_level = 1;
         } elseif ($this->p_arr['url_range_type'] == 5) {
             // Multi-level list.
             $this->now_url_arr = array($this->p_arr['many_list_start_url']);
             $this->max_level = count($this->p_arr['many_page_list']) + 1;
         }
     } elseif ($this->p_arr['rules_type'] == 3) {
         // One-click collection.
         $start_arr = format_wrap($this->p_arr['manyou_start_url']);
         $this->now_url = $start_arr[0];
         $content = $this->parse_page();
         $rules_info = match_rules($this->now_url, $content, 4, 0);
         if ($rules_info && is_array($rules_info)) {
             $this->pick_cache_data['lilely_page'][] = $this->now_url;
             // NOTE(review): the list assigned here is always replaced
             // further down (by the generated range or by $start_arr) —
             // confirm whether the rule-based links were meant to be kept.
             if ($rules_info['page_get_type'] == 1) {
                 $this->now_url_arr = dom_page_link($content, array('page_link_rules' => $rules_info['page_link_rules'], 'url' => $this->now_url));
             } else {
                 $this->now_url_arr = string_page_link($content, trim($rules_info['page_link_rules']), $this->now_url);
             }
         }

         // The start URL may have no query string (or fail to parse at all).
         $url_info = array();
         $page_url_arr = parse_url($this->now_url);
         if (is_array($page_url_arr) && isset($page_url_arr['query'])) {
             parse_str($page_url_arr['query'], $url_info);
         }

         $index_url = $auto = 0;
         if (isset($url_info['page']) && is_numeric($url_info['page'])) {
             // Explicit page number: turn it into the range placeholder.
             $var_url = str_replace('page=' . $url_info['page'], 'page=(*)', $this->now_url);
             $this->pick_cache_data['lilely_page'][] = $this->now_url;
         } else {
             // Otherwise compare the start URL with similar links on the page.
             $page_all_link = $this->parse_page('link', $content);
             $page_all_link = is_array($page_all_link) ? array_filter($page_all_link, 'filter_url_callback') : array();
             $likely_arr = array($this->now_url);
             foreach ($page_all_link as $k => $v) {
                 similar_text($v, $this->now_url, $percent);
                 if ($percent < 90) {
                     continue;
                 }
                 $likely_arr[] = $v;
             }
             $likely_arr = array_resolve($likely_arr);
             $var_arr = get_url_diff($likely_arr);
             $var_url = $var_arr['url'];
             $index_url = $var_arr['index'];
             $auto = $var_arr['auto'];
             if ($var_url && is_array($likely_arr)) {
                 $key = array_rand($likely_arr);
                 $this->pick_cache_data['lilely_page'][] = $likely_arr[$key];
             }
         }

         if ($var_url) {
             // $index_url is used rather than $var_arr['index'] because
             // $var_arr does not exist on the "page=" branch above.
             $this->now_url_arr = convert_url_range(array('url' => $var_url, 'step' => 1, 'start' => $index_url ? 2 : 1, 'end' => 99, 'auto' => $auto));
             if ($index_url) {
                 array_unshift($this->now_url_arr, $index_url);
             }
         } else {
             $this->now_url_arr = $start_arr;
         }
         // The configured depth always wins for one-click collection, so no
         // intermediate max_level is assigned above.
         $this->max_level = $this->p_arr['manyou_max_level'] ? $this->p_arr['manyou_max_level'] : 2;
     }

     // URL filtering is enabled: prepend the extra page URLs.
     if ($this->p_arr['page_fiter'] == 1 && $this->now_url_arr && $this->p_arr['page_url_other']) {
         $this->now_url_arr = array_merge(format_wrap($this->p_arr['page_url_other']), $this->now_url_arr);
         $this->temp_arr['page_num'] = count($this->now_url_arr);
     }
     $this->pick_cache_data['max_level'] = $this->max_level;
 }