/**
 * Collect the article links found on a listing source.
 *
 * The page is fetched with self::get_html() and then interpreted according
 * to $config['sourcetype'] (compared loosely, so "4" and 4 are equivalent):
 *  - 4: the document is decoded as RSS and every rss/channel/item
 *       contributes its 'link' and 'title';
 *  - 5: the document is parsed as HTML; every element whose class list
 *       contains "list-div" contributes the text and href of its first <a>
 *       and, when it has one, the src of its first <img> (stored as 'pic');
 *  - anything else: the fragment between $config['url_start'] and
 *       $config['url_end'] is scanned for <a> tags, optionally filtered by
 *       $config['url_contain'] (href must contain it) and
 *       $config['url_except'] (href must not contain it).
 *
 * NOTE(review): the anchor regex in the default branch rejects links whose
 * body starts with "/", "a"/"A" or ">" (character class plus the /i flag).
 * It is left untouched here because changing it alters which links are
 * collected; confirm whether that exclusion is intended.
 *
 * @param string $url    Address of the page or feed to collect from.
 * @param array  $config Collection settings; taken by reference and handed
 *                       on to get_html() and url_check().
 * @return array|false   Entries with 'url' and 'title' (plus 'pic' for
 *                       sourcetype 5), or false when get_html() returned a
 *                       falsy value.
 */
public static function get_url_lists($url, &$config)
{
    $html = self::get_html($url, $config);
    if (!$html) {
        return false;
    }

    $sourcetype = isset($config['sourcetype']) ? $config['sourcetype'] : null;
    $data = array();

    if ($sourcetype == 4) {
        // RSS feed.
        if (!class_exists('xml')) {
            include WX_PATH . 'include' . DIRECTORY_SEPARATOR . 'xml.class.php';
        }
        $xml = new xml();
        $feed = $xml->xml_unserialize($html);

        // isset() first so a document without rss/channel/item yields an
        // empty result instead of undefined-index warnings.
        if (isset($feed['rss']['channel']['item']) && is_array($feed['rss']['channel']['item'])) {
            foreach ($feed['rss']['channel']['item'] as $k => $item) {
                $data[$k]['url'] = $item['link'];
                $data[$k]['title'] = $item['title'];
            }
        }

        return $data;
    }

    if ($sourcetype == 5) {
        // HTML list: one entry per element carrying the list class.
        $siteRoot = 'http://www.chinacatholic.org';
        $absoluteUrl = '#^[a-z][a-z0-9+.\-]*://#i';
        $classname = 'list-div';

        $doc = self::loadNprepare($html, 'utf-8');
        $finder = new DomXPath($doc);
        $nodes = $finder->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' {$classname} ')]");

        $k = 0;
        foreach ($nodes as $node) {
            $anchors = $node->getElementsByTagName("a");
            if ($anchors->length == 0) {
                // Elements without a link produce no entry.
                continue;
            }

            $aNode = $anchors->item(0);
            $href = $aNode->getAttribute("href");
            $data[$k]['title'] = $aNode->textContent;
            // Only prefix the site root onto relative paths; an href that
            // already carries a scheme would otherwise become invalid.
            $data[$k]['url'] = preg_match($absoluteUrl, $href) ? $href : $siteRoot . $href;

            $images = $node->getElementsByTagName("img");
            if ($images->length > 0) {
                $src = $images->item(0)->getAttribute("src");
                $data[$k]['pic'] = preg_match($absoluteUrl, $src) ? $src : $siteRoot . $src;
            }

            $k++;
        }

        return $data;
    }

    // Default: regex scan of the configured fragment of the page.
    $html = self::cut_html($html, $config['url_start'], $config['url_end']);
    $html = str_replace(array("\r", "\n"), '', $html);
    // One anchor per line, so the greedy ".*" below cannot span two links.
    $html = str_replace(array("</a>", "</A>"), "</a>\n", $html);
    preg_match_all('/<a([^>]*)>([^\\/a>].*)<\\/a>/i', $html, $out);

    // De-duplicate on the anchor attributes only. array_unique() keeps the
    // first key of each distinct value, so de-duplicating the titles
    // separately (as was done before) removed the title of every link whose
    // text repeated an earlier link's text.
    $titles = $out[2];
    foreach (array_unique($out[1]) as $k => $attrs) {
        if (!preg_match('/href=[\'"]?([^\'" ]*)[\'"]?/i', $attrs, $match_out)) {
            continue;
        }
        $href = $match_out[1];

        if (!empty($config['url_contain']) && strpos($href, $config['url_contain']) === false) {
            continue;
        }
        if (!empty($config['url_except']) && strpos($href, $config['url_except']) !== false) {
            continue;
        }

        $data[$k]['url'] = self::url_check($href, $url, $config);
        $data[$k]['title'] = strip_tags($titles[$k]);
    }

    return $data;
}
/**
 * Collect the movie links found on a listing source.
 *
 * The page is fetched with collect::get_html() and then interpreted
 * according to $config['sourcetype'] (compared loosely):
 *  - 4: the document is decoded as RSS and every rss/channel/item
 *       contributes its 'link' and 'title';
 *  - 3: the content page is collected directly, so the result is the single
 *       entry array('url' => $url);
 *  - anything else: the fragment between $config['url_start'] and
 *       $config['url_end'] is scanned for <a> tags, optionally filtered by
 *       $config['url_contain'] / $config['url_except']. When
 *       $config['picmode'] is '1' and $config['picurl_rule'] is set, a
 *       picture URL is also extracted from the anchor body and stored as
 *       'picurl'. When $config['colmode'] is 'desc' the result is passed
 *       through get_collect_krsort() (defined elsewhere).
 *
 * NOTE(review): the anchor regex rejects links whose body starts with "/",
 * "a"/"A" or ">" (character class plus the /i flag). It is left untouched
 * here because changing it alters which links are collected; confirm
 * whether that exclusion is intended.
 *
 * @param string $url    Address of the page or feed to collect from.
 * @param array  $config Collection settings; taken by reference and handed
 *                       on to the collect:: helpers.
 * @return array|false   Entries with 'url' (and, except for sourcetype 3,
 *                       'title'; optionally 'picurl'), or false when
 *                       get_html() returned a falsy value.
 */
function GetArtlist($url, &$config)
{
    $html = collect::get_html($url, $config);
    if (!$html) {
        return false;
    }

    $sourcetype = isset($config['sourcetype']) ? $config['sourcetype'] : null;
    $data = array();

    if ($sourcetype == 4) {
        // RSS feed. Call the decoder on an instance: invoking a non-static
        // method statically is an Error on PHP 8, while an instance call
        // works whether or not the method is declared static.
        $xml = new xml();
        $feed = $xml->xml_unserialize($html);

        // isset() first so a document without rss/channel/item yields an
        // empty result instead of undefined-index warnings.
        if (isset($feed['rss']['channel']['item']) && is_array($feed['rss']['channel']['item'])) {
            foreach ($feed['rss']['channel']['item'] as $k => $item) {
                $data[$k]['url'] = $item['link'];
                $data[$k]['title'] = $item['title'];
            }
        }

        return $data;
    }

    if ($sourcetype == 3) {
        // Collect straight from the content page: the address itself is the
        // only entry.
        $data[] = array('url' => $url);

        return $data;
    }

    // Default: regex scan of the configured fragment of the page.
    $html = collect::cut_html($html, $config['url_start'], $config['url_end']);
    $html = str_replace(array("\r", "\n"), '', $html);
    // One anchor per line, so the greedy ".*" below cannot span two links.
    $html = str_replace(array("</a>", "</A>"), "</a>\n", $html);
    preg_match_all('/<a([^>]*)>([^\\/a>].*)<\\/a>/i', $html, $out);

    // Picture extraction on the list page: the pattern depends only on the
    // configuration, so it is built once instead of once per anchor.
    // Assumes replace_sg() yields the [start, end] delimiters of the rule —
    // TODO confirm against its definition.
    $picPattern = null;
    if (isset($config['picmode']) && $config['picmode'] == '1' && !empty($config['picurl_rule'])) {
        $ArrRule = collect::replace_sg($config['picurl_rule']);
        foreach ($ArrRule as $key => $val) {
            $ArrRule[$key] = collect::str_replace_all($val);
        }
        $picPattern = "/" . $ArrRule[0] . "([\\s\\S]*?)" . $ArrRule[1] . "/";
    }

    // Remove duplicates by anchor attributes only. array_unique() keeps the
    // first key of each distinct value, so de-duplicating the titles
    // separately (as was done before) removed the title of every link whose
    // text repeated an earlier link's text.
    $anchorBodies = $out[2];
    foreach (array_unique($out[1]) as $k => $attrs) {
        if (!preg_match('/href=[\'"]?([^\'" ]*)[\'"]?/i', $attrs, $match_out)) {
            continue;
        }
        $href = $match_out[1];

        if (!empty($config['url_contain']) && strpos($href, $config['url_contain']) === false) {
            continue;
        }
        if (!empty($config['url_except']) && strpos($href, $config['url_except']) !== false) {
            continue;
        }

        // Attach the picture only to links that are actually kept, so the
        // result never contains entries that have a 'picurl' but no 'url'.
        if ($picPattern !== null && preg_match($picPattern, $anchorBodies[$k], $match_pic_out)) {
            $pic = collect::replace_item($match_pic_out[1], $config['picurl_filter']);
            $data[$k]['picurl'] = collect::url_check($pic, $url, $config);
        }

        $data[$k]['url'] = collect::url_check($href, $url, $config);
        // Strip any markup from the anchor body to get a plain-text title.
        $data[$k]['title'] = strip_tags($anchorBodies[$k]);
    }

    if (isset($config['colmode']) && $config['colmode'] == 'desc') {
        $data = get_collect_krsort($data);
    }

    return $data;
}