예제 #1
0
 /**
  * Collect one page: fetch its HTML, extract the configured fields and,
  * when pagination is configured, follow the pager links and merge the
  * content of the sub-pages into the result.
  *
  * @param string  $url    Address of the page to collect.
  * @param array   $config Collection configuration (extraction rules, pagination and download options).
  * @param integer $page   Pagination mode of this call:
  *                        0 = top-level call (extracts meta fields, follows pagination, post-processes images);
  *                        1 = sub-page fetched in "list all pages" mode (content only, pagination not followed);
  *                        2 = sub-page fetched in "previous/next" mode (content only, next-page links followed).
  * @return array|null Collected fields keyed by field name, or null when the page HTML could not be fetched.
  */
 public static function get_content($url, $config, $page = 0)
 {
     set_time_limit(300);
     // URLs already followed in "previous/next" mode. Static, so it is shared by the
     // recursive calls (and by every later call made during the same request).
     static $oldurl = array();
     $page = intval($page);

     $html = self::get_html($url, $config);
     if (!$html) {
         return null;
     }

     $data = array();

     if ($page == 0) {
         // Title, author and source share the same "<name>_rule" / "<name>_html_rule" layout.
         foreach (array('title', 'author', 'comeform') as $field) {
             if (empty($config[$field . '_rule'])) {
                 continue;
             }
             $rule = self::replace_sg($config[$field . '_rule']);
             $html_rule = isset($config[$field . '_html_rule']) ? $config[$field . '_html_rule'] : null;
             $data[$field] = self::replace_item(self::cut_html($html, $rule[0], $rule[1]), $html_rule);
         }

         // Publication time; falls back to the current system time when missing or unparsable.
         if (!empty($config['time_rule'])) {
             $time_rule = self::replace_sg($config['time_rule']);
             $time_html_rule = isset($config['time_html_rule']) ? $config['time_html_rule'] : null;
             $data['time'] = strtotime(self::replace_item(self::cut_html($html, $time_rule[0], $time_rule[1]), $time_html_rule));
         }
         if (empty($data['time'])) {
             $data['time'] = SYS_TIME;
         }

         // User-defined fields. The decoded list is written back into $config, as before,
         // so the configuration stored in self::$config below carries the decoded form.
         $config['customize_config'] = string2array(isset($config['customize_config']) ? $config['customize_config'] : null);
         if (!empty($config['customize_config']) && is_array($config['customize_config'])) {
             foreach ($config['customize_config'] as $custom) {
                 if (empty($custom['rule'])) {
                     continue;
                 }
                 $rule = self::replace_sg($custom['rule']);
                 $custom_html_rule = isset($custom['html_rule']) ? $custom['html_rule'] : null;
                 $data[$custom['en_name']] = self::replace_item(self::cut_html($html, $rule[0], $rule[1]), $custom_html_rule);
             }
         }
     }

     // Main content of this page.
     if (!empty($config['content_rule'])) {
         $content_rule = self::replace_sg($config['content_rule']);
         $content_html_rule = isset($config['content_html_rule']) ? $config['content_html_rule'] : null;
         $data['content'] = self::replace_item(self::cut_html($html, $content_rule[0], $content_rule[1]), $content_html_rule);
     }

     // Pagination: only the top-level call and "previous/next" sub-pages look for more pages.
     if (in_array($page, array(0, 2), true) && !empty($config['content_page_start']) && !empty($config['content_page_end'])) {
         $oldurl[] = $url;
         $tmp = array(isset($data['content']) ? $data['content'] : '');
         $page_html = self::cut_html($html, $config['content_page_start'], $config['content_page_end']);
         $page_rule = isset($config['content_page_rule']) ? $config['content_page_rule'] : 0;

         // "Previous/next" mode: follow the link whose text contains the configured next-page label.
         if ($page_rule == 2 && $page_html) {
             $next_label = isset($config['content_nextpage']) ? (string) $config['content_nextpage'] : '';
             preg_match_all('/<a[^>]*href=[\'"]?([^>\'" ]*)[\'"]?[^>]*>([^<\\/]*)<\\/a>/i', $page_html, $out);
             // An empty label matches nothing; strpos() with an empty needle would
             // otherwise match every link on PHP 8+.
             if ($next_label !== '' && !empty($out[1]) && !empty($out[2])) {
                 foreach ($out[2] as $k => $text) {
                     if (strpos($text, $next_label) === false) {
                         continue;
                     }
                     if ($out[1][$k] == '#') {
                         continue;
                     }
                     $link = self::url_check($out[1][$k], $url, $config);
                     if (in_array($link, $oldurl)) {
                         continue;
                     }
                     $oldurl[] = $link;
                     $results = self::get_content($link, $config, 2);
                     // The recursive call returns null when the sub-page could not be fetched.
                     if (is_array($results) && isset($results['content']) && !in_array($results['content'], $tmp)) {
                         $tmp[] = $results['content'];
                     }
                 }
             }
         }

         // "List all pages" mode: fetch every distinct link found in the pager (top-level call only).
         if ($page_rule == 1 && $page == 0 && $page_html) {
             preg_match_all('/<a[^>]*href=[\'"]?([^>\'" ]*)[\'"]?/i', $page_html, $out);
             if (!empty($out[1]) && is_array($out[1])) {
                 foreach (array_unique($out[1]) as $link) {
                     // Skip placeholder anchors. The test must look at the current link itself.
                     if ($link == '#') {
                         continue;
                     }
                     $link = self::url_check($link, $url, $config);
                     $results = self::get_content($link, $config, 1);
                     if (is_array($results) && isset($results['content']) && !in_array($results['content'], $tmp)) {
                         $tmp[] = $results['content'];
                     }
                 }
             }
         }

         // content_page == 1 keeps the pages separated by a [page] marker; otherwise they are concatenated.
         $separator = (isset($config['content_page']) && $config['content_page'] == 1) ? '[page]' : '';
         $data['content'] = implode($separator, $tmp);
     }

     if ($page == 0) {
         // download_img() is handed only the matched text, so the current URL and
         // configuration are published through the class statics first.
         self::$url = $url;
         self::$config = $config;

         $content = isset($data['content']) ? $data['content'] : '';
         // preg_replace_callback replaces the former /e modifier, which was removed in PHP 7.
         // The callback also receives the raw match, without the quote escaping /e applied.
         $rewritten = preg_replace_callback('/<img[^>]*src=[\'"]?([^>\'"\\s]*)[\'"]?[^>]*>/i', array(__CLASS__, 'download_img_match'), $content);
         // A PCRE failure yields null; keep the unprocessed content in that case.
         $data['content'] = $rewritten === null ? $content : $rewritten;

         // Download the images referenced in the content to local storage.
         if (!empty($data['content']) && isset($config['down_attachment']) && $config['down_attachment'] == 1) {
             pc_base::load_sys_class('attachment', '', 0);
             $attachment = new attachment('collection', '0', get_siteid());
             $watermark = isset($config['watermark']) ? $config['watermark'] : null;
             $data['content'] = $attachment->download('content', $data['content'], $watermark);
         }
     }

     return $data;
 }

 /**
  * preg_replace_callback adapter: forwards one <img> match to download_img().
  *
  * @param array $matches $matches[0] is the whole <img> tag, $matches[1] its src value.
  * @return mixed Whatever download_img() returns for that tag.
  */
 private static function download_img_match($matches)
 {
     return self::download_img($matches[0], $matches[1]);
 }