Ejemplo n.º 1
0
 /**
  * Build the list of article links found on a listing page.
  *
  * The markup returned by self::get_html($url, $config) is interpreted
  * according to $config['sourcetype']:
  *  - 4: RSS. The payload is decoded with xml::xml_unserialize() and every
  *       rss/channel/item contributes its `link` and `title`.
  *  - 5: HTML list. Every element carrying the CSS class "list-div" contributes
  *       the text and href of its first <a>, plus the src of its first <img>
  *       (as `pic`) when it has one. Non-absolute addresses are prefixed with a
  *       site base that is hard-coded in this method.
  *  - anything else: the markup is first passed through self::cut_html() with
  *       $config['url_start'] / $config['url_end'], then every <a> element is
  *       extracted, filtered by $config['url_contain'] / $config['url_except']
  *       and its href is passed through self::url_check().
  *
  * @param string $url    Address of the listing page.
  * @param array  $config Collection settings. Keys read here: sourcetype,
  *                       url_start, url_end, url_contain, url_except. It is
  *                       also handed to get_html() and url_check().
  * @return array|false   Entries of the form ['url' => ..., 'title' => ...]
  *                       (plus 'pic' for sourcetype 5); keys may be sparse.
  *                       False when get_html() returns a falsy value.
  */
 public static function get_url_lists($url, &$config)
 {
     $html = self::get_html($url, $config);
     if (!$html) {
         return false;
     }

     // Loose comparison on purpose: the setting may arrive as an int or as a
     // numeric string.
     if ($config['sourcetype'] == 4) {
         // RSS
         if (!class_exists('xml')) {
             include WX_PATH . 'include' . DIRECTORY_SEPARATOR . 'xml.class.php';
         }
         $xml = new xml();
         $feed = $xml->xml_unserialize($html);

         $data = array();
         if (isset($feed['rss']['channel']['item']) && is_array($feed['rss']['channel']['item'])) {
             $items = $feed['rss']['channel']['item'];
             // NOTE(review): XML-to-array converters commonly collapse a lone
             // <item> into one associative array instead of a one-element
             // list; confirm against xml::xml_unserialize(). A string key
             // 'link' can only exist on that collapsed shape.
             if (isset($items['link'])) {
                 $items = array($items);
             }
             foreach ($items as $k => $v) {
                 if (!is_array($v) || !isset($v['link'])) {
                     continue;
                 }
                 $data[$k]['url'] = $v['link'];
                 $data[$k]['title'] = isset($v['title']) ? $v['title'] : '';
             }
         }
         return $data;
     }

     if ($config['sourcetype'] == 5) {
         // HTML list
         $siteBase = "http://www.chinacatholic.org";
         $classname = 'list-div';
         $doc = self::loadNprepare($html, 'utf-8');
         $finder = new DomXPath($doc);
         // Match the class as a whole word inside the class attribute.
         $nodes = $finder->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' {$classname} ')]");

         $data = array();
         foreach ($nodes as $node) {
             $listA = $node->getElementsByTagName("a");
             if ($listA->length == 0) {
                 continue;
             }
             $aNode = $listA->item(0);
             $href = $aNode->getAttribute("href");
             $entry = array(
                 'title' => $aNode->textContent,
                 // Only prefix the site base when the address is not already absolute.
                 'url' => preg_match('#^https?://#i', $href) ? $href : $siteBase . $href,
             );
             $listPic = $node->getElementsByTagName("img");
             if ($listPic->length > 0) {
                 $src = $listPic->item(0)->getAttribute("src");
                 $entry['pic'] = preg_match('#^https?://#i', $src) ? $src : $siteBase . $src;
             }
             $data[] = $entry;
         }
         return $data;
     }

     // Generic page: put every anchor on its own line, then match per line
     // ('.' does not cross the inserted line breaks).
     $html = self::cut_html($html, $config['url_start'], $config['url_end']);
     $html = str_replace(array("\r", "\n"), '', $html);
     $html = str_replace(array("</a>", "</A>"), "</a>\n", $html);
     // \b keeps <abbr>, <article>, ... from being mistaken for anchors, and the
     // inner text may start with any character (the previous pattern rejected
     // link texts beginning with "a", "/" or ">").
     preg_match_all('#<a\b([^>]*)>(.+?)</a>#i', $html, $out);

     // De-duplicate on the attribute string only. array_unique() keeps the key
     // of the first occurrence, so $out[2][$k] is still the matching title;
     // de-duplicating the titles separately would drop titles of distinct
     // links that happen to share the same text.
     $attributes = array_unique($out[1]);

     $data = array();
     foreach ($attributes as $k => $v) {
         if (!preg_match('/href=[\'"]?([^\'" ]*)[\'"]?/i', $v, $match_out)) {
             continue;
         }
         $link = $match_out[1];
         if (!empty($config['url_contain']) && strpos($link, $config['url_contain']) === false) {
             continue;
         }
         if (!empty($config['url_except']) && strpos($link, $config['url_except']) !== false) {
             continue;
         }
         $data[$k]['url'] = self::url_check($link, $url, $config);
         $data[$k]['title'] = strip_tags($out[2][$k]);
     }
     return $data;
 }
Ejemplo n.º 2
0
 /**
  * Build the list of movie links found on a listing page.
  *
  * The markup returned by collect::get_html($url, $config) is interpreted
  * according to $config['sourcetype']:
  *  - 4: RSS. The payload is decoded with xml::xml_unserialize() and every
  *       rss/channel/item contributes its `link` and `title`.
  *  - 3: collect straight from the content page; the result is a single
  *       entry holding $url itself.
  *  - anything else: the markup is first passed through collect::cut_html()
  *       with $config['url_start'] / $config['url_end'], then every <a>
  *       element is extracted, filtered by $config['url_contain'] /
  *       $config['url_except'] and its href is passed through
  *       collect::url_check(). When $config['picmode'] is '1' and
  *       $config['picurl_rule'] is set, a thumbnail address is additionally
  *       extracted from the anchor's inner markup and stored as `picurl`.
  *       With $config['colmode'] == 'desc' the result is passed through
  *       get_collect_krsort().
  *
  * @param string $url    Address of the listing page.
  * @param array  $config Collection settings. Keys read here: sourcetype,
  *                       url_start, url_end, url_contain, url_except, picmode,
  *                       picurl_rule, picurl_filter, colmode. It is also
  *                       handed to get_html() and url_check().
  * @return array|false   Entries of the form ['url' => ..., 'title' => ...]
  *                       (only 'url' for sourcetype 3; plus 'picurl' when a
  *                       thumbnail was found); keys may be sparse. False when
  *                       get_html() returns a falsy value.
  */
 function GetArtlist($url, &$config)
 {
     $html = collect::get_html($url, $config);
     if (!$html) {
         return false;
     }

     // Loose comparisons on purpose: the settings may arrive as ints or as
     // numeric strings.
     if ($config['sourcetype'] == 4) {
         // RSS
         $feed = xml::xml_unserialize($html);

         $data = array();
         if (isset($feed['rss']['channel']['item']) && is_array($feed['rss']['channel']['item'])) {
             $items = $feed['rss']['channel']['item'];
             // NOTE(review): XML-to-array converters commonly collapse a lone
             // <item> into one associative array instead of a one-element
             // list; confirm against xml::xml_unserialize(). A string key
             // 'link' can only exist on that collapsed shape.
             if (isset($items['link'])) {
                 $items = array($items);
             }
             foreach ($items as $k => $v) {
                 if (!is_array($v) || !isset($v['link'])) {
                     continue;
                 }
                 $data[$k]['url'] = $v['link'];
                 $data[$k]['title'] = isset($v['title']) ? $v['title'] : '';
             }
         }
         return $data;
     }

     if ($config['sourcetype'] == 3) {
         // Collect directly from the content page: the page is its own list.
         return array(array('url' => $url));
     }

     // Generic page: put every anchor on its own line, then match per line
     // ('.' does not cross the inserted line breaks).
     $html = collect::cut_html($html, $config['url_start'], $config['url_end']);
     $html = str_replace(array("\r", "\n"), '', $html);
     $html = str_replace(array("</a>", "</A>"), "</a>\n", $html);
     // \b keeps <abbr>, <article>, ... from being mistaken for anchors, and the
     // inner text may start with any character (the previous pattern rejected
     // link texts beginning with "a", "/" or ">").
     preg_match_all('#<a\b([^>]*)>(.+?)</a>#i', $html, $out);

     // The thumbnail pattern depends only on the configuration, so build it
     // once instead of once per anchor.
     $picPattern = null;
     if (isset($config['picmode']) && $config['picmode'] == '1' && !empty($config['picurl_rule'])) {
         $ArrRule = collect::replace_sg($config['picurl_rule']);
         foreach ($ArrRule as $key => $val) {
             $ArrRule[$key] = collect::str_replace_all($val);
         }
         $picPattern = "/" . $ArrRule[0] . "([\\s\\S]*?)" . $ArrRule[1] . "/";
     }

     // De-duplicate on the attribute string only. array_unique() keeps the key
     // of the first occurrence, so $out[2][$k] is still the matching inner
     // markup; de-duplicating the titles separately would drop titles of
     // distinct links that happen to share the same text.
     $attributes = array_unique($out[1]);

     $data = array();
     foreach ($attributes as $k => $v) {
         if (!preg_match('/href=[\'"]?([^\'" ]*)[\'"]?/i', $v, $match_out)) {
             continue;
         }
         $link = $match_out[1];
         if (!empty($config['url_contain']) && strpos($link, $config['url_contain']) === false) {
             continue;
         }
         if (!empty($config['url_except']) && strpos($link, $config['url_except']) !== false) {
             continue;
         }
         $data[$k]['url'] = collect::url_check($link, $url, $config);
         // Strip the markup to obtain the plain title.
         $data[$k]['title'] = strip_tags($out[2][$k]);

         // Attach the thumbnail only to links that survived the filters, so the
         // result never contains entries that have a picurl but no url.
         if ($picPattern !== null && preg_match($picPattern, $out[2][$k], $match_pic_out)) {
             $pic = collect::replace_item($match_pic_out[1], $config['picurl_filter']);
             $data[$k]['picurl'] = collect::url_check($pic, $url, $config);
         }
     }

     if (isset($config['colmode']) && $config['colmode'] == 'desc') {
         $data = get_collect_krsort($data);
     }
     return $data;
 }