/**
 * Collect the content of one remote page.
 *
 * The page is fetched with self::get_html(). On a top-level call ($page = 0)
 * the title, author, source, time and any custom fields are extracted in
 * addition to the body. When paging markers are configured, the remaining
 * pages are collected by calling this method recursively and their bodies
 * are merged into $data['content'].
 *
 * @param string  $url    Address of the page to collect.
 * @param array   $config Collection settings (extraction rules, paging options, ...).
 * @param integer $page   Paging mode: 0 = top-level call (default),
 *                        1 = recursive call made by the "list all pages" mode,
 *                        2 = recursive call made by the "previous/next" mode.
 * @return array|null Collected fields keyed by name, or null when
 *                    self::get_html() returned nothing for $url.
 */
public static function get_content($url, $config, $page = 0) {
    set_time_limit(300);
    // Static so that recursive calls share one list of already-visited URLs;
    // it is never cleared inside this method.
    static $oldurl = array();
    $page = intval($page);

    $html = self::get_html($url, $config);
    if (!$html) {
        return null;
    }

    $data = array();

    if ($page === 0) {
        // Title, author and source share the same "<name>_rule" / "<name>_html_rule" layout.
        foreach (array('title', 'author', 'comeform') as $field) {
            if (!empty($config[$field . '_rule'])) {
                $data[$field] = self::extract_by_rule(
                    $html,
                    $config[$field . '_rule'],
                    isset($config[$field . '_html_rule']) ? $config[$field . '_html_rule'] : null
                );
            }
        }

        // Publication time; falls back to SYS_TIME when missing or unparsable.
        if (!empty($config['time_rule'])) {
            $data['time'] = strtotime(self::extract_by_rule(
                $html,
                $config['time_rule'],
                isset($config['time_html_rule']) ? $config['time_html_rule'] : null
            ));
        }
        if (empty($data['time'])) {
            $data['time'] = SYS_TIME;
        }

        // Custom fields. The decoded list is stored back into $config, so the
        // recursive calls and self::$config see the decoded form.
        $config['customize_config'] = string2array($config['customize_config']);
        if ($config['customize_config']) {
            foreach ($config['customize_config'] as $custom) {
                if (empty($custom['rule'])) {
                    continue;
                }
                $data[$custom['en_name']] = self::extract_by_rule($html, $custom['rule'], $custom['html_rule']);
            }
        }
    }

    // Body of the current page.
    if (!empty($config['content_rule'])) {
        $data['content'] = self::extract_by_rule(
            $html,
            $config['content_rule'],
            isset($config['content_html_rule']) ? $config['content_html_rule'] : null
        );
    }

    // Paging: only the top-level call and "next page" recursion follow links.
    if (($page === 0 || $page === 2) && !empty($config['content_page_start']) && !empty($config['content_page_end'])) {
        $oldurl[] = $url;
        $tmp = array(isset($data['content']) ? $data['content'] : '');
        $page_html = self::cut_html($html, $config['content_page_start'], $config['content_page_end']);
        $page_rule = isset($config['content_page_rule']) ? $config['content_page_rule'] : 0;

        // Previous/next mode: follow every link whose text contains the
        // configured "next page" label. An empty label is skipped explicitly
        // because strpos() warns on it before PHP 8 and matches everything
        // from PHP 8 on.
        if ($page_rule == 2 && $page_html && !empty($config['content_nextpage'])) {
            $next_label = (string) $config['content_nextpage'];
            preg_match_all('/<a[^>]*href=[\'"]?([^>\'" ]*)[\'"]?[^>]*>([^<\\/]*)<\\/a>/i', $page_html, $out);
            if (!empty($out[1]) && !empty($out[2])) {
                foreach ($out[2] as $k => $link_text) {
                    if (strpos($link_text, $next_label) === false) {
                        continue;
                    }
                    if ($out[1][$k] == '#') {
                        continue;
                    }
                    $next_url = self::url_check($out[1][$k], $url, $config);
                    if (in_array($next_url, $oldurl)) {
                        continue;
                    }
                    $oldurl[] = $next_url;
                    $results = self::get_content($next_url, $config, 2);
                    // A failed fetch (null) or an empty body would only add an empty page.
                    $piece = isset($results['content']) ? (string) $results['content'] : '';
                    if ($piece !== '' && !in_array($piece, $tmp, true)) {
                        $tmp[] = $piece;
                    }
                }
            }
        }

        // List-all mode: every distinct link inside the paging area is one page.
        if ($page_rule == 1 && $page === 0 && $page_html) {
            preg_match_all('/<a[^>]*href=[\'"]?([^>\'" ]*)[\'"]?/i', $page_html, $out);
            if (!empty($out[1])) {
                foreach (array_unique($out[1]) as $link) {
                    // Test the link itself: the previous code indexed the
                    // de-duplicated list as if it were still the match array.
                    if ($link == '#') {
                        continue;
                    }
                    $link = self::url_check($link, $url, $config);
                    $results = self::get_content($link, $config, 1);
                    $piece = isset($results['content']) ? (string) $results['content'] : '';
                    if ($piece !== '' && !in_array($piece, $tmp, true)) {
                        $tmp[] = $piece;
                    }
                }
            }
        }

        $separator = (isset($config['content_page']) && $config['content_page'] == 1) ? '[page]' : '';
        $data['content'] = implode($separator, $tmp);
    }

    if ($page === 0) {
        // download_img() takes only the tag and its src, so the current URL
        // and config are published on the class before it runs.
        self::$url = $url;
        self::$config = $config;

        // preg_replace_callback() replaces the former /e modifier, which was
        // removed in PHP 7. Unlike /e, the callback receives the matched text
        // verbatim (no addslashes-style escaping of quotes).
        // NOTE(review): if download_img() compensated for that escaping,
        // confirm it still behaves as intended.
        $content = isset($data['content']) ? (string) $data['content'] : '';
        $rewritten = preg_replace_callback(
            '/<img[^>]*src=[\'"]?([^>\'"\\s]*)[\'"]?[^>]*>/i',
            array(__CLASS__, 'download_img_match'),
            $content
        );
        // Keep the unmodified content if PCRE reports an error (null result).
        $data['content'] = ($rewritten === null) ? $content : $rewritten;

        // Download the images referenced in the content to the local site.
        if (!empty($data['content']) && isset($config['down_attachment']) && $config['down_attachment'] == 1) {
            pc_base::load_sys_class('attachment', '', 0);
            $attachment = new attachment('collection', '0', get_siteid());
            $data['content'] = $attachment->download('content', $data['content'], $config['watermark']);
        }
    }

    return $data;
}

/**
 * Cut one field out of a page and post-process it.
 *
 * self::replace_sg() turns $rule into a two-element list whose items are
 * passed to self::cut_html() as its second and third arguments; the cut
 * fragment is then passed through self::replace_item() with $html_rule.
 *
 * @param string $html      Page source.
 * @param string $rule      Extraction rule as stored in the configuration.
 * @param mixed  $html_rule Second argument for self::replace_item().
 * @return mixed Whatever self::replace_item() returns.
 */
private static function extract_by_rule($html, $rule, $html_rule) {
    $bounds = self::replace_sg($rule);
    return self::replace_item(self::cut_html($html, $bounds[0], $bounds[1]), $html_rule);
}

/**
 * preg_replace_callback() adapter for self::download_img().
 *
 * @param array $matches Index 0 is the whole <img> tag, index 1 its src value.
 * @return mixed Replacement for the matched tag, as returned by self::download_img().
 */
private static function download_img_match($matches) {
    return self::download_img($matches[0], $matches[1]);
}