/**
 * AJAX endpoint: fetch one article URL, extract its fields and store them.
 *
 * Reads from POST: `category` (column ID, required), `url` (required),
 * `sf` (key selecting one of the per-site selector rule sets below) and
 * `scheme` (saved with the history row on success).
 *
 * The response is sent through ajaxReturn() and carries `url` plus an `info`
 * HTML snippet describing the outcome, or `alertinfo` when no category was
 * posted. A request without `url` terminates silently; a non-AJAX request
 * with valid parameters produces no output.
 *
 * @return void
 */
public function crawlcontent()
{
    $json = array();
    $sort_id = I('post.category', '', 'intval');
    $url = I('post.url');

    if (!$sort_id) {
        $json['alertinfo'] = "请选择栏目";
        $this->ajaxReturn($json);
        // ajaxReturn() is expected to end the request; the return makes sure
        // nothing below runs if it ever does not.
        return;
    }
    if (!$url) {
        exit;
    }
    if (!IS_AJAX) {
        return;
    }

    // CSS selectors per supported site; 'title' is optional and falls back to <h1>.
    $sf = array(
        'ifeng' => array('content' => '#main_content p'),
        'qq' => array('content' => '#Cnt-Main-Article-QQ p'),
        'weixin' => array('title' => '#activity-name', 'content' => '#js_content p'),
        'sina' => array('content' => '#artibody p', 'title' => '#artibodyTitle'),
        '163' => array('content' => '#endText p'),
        'toutiao' => array('content' => '.article-content p,.article-content div p'),
    );

    // Skip URLs that were already imported (history rows are keyed by md5 of the URL).
    $check = array('link' => md5($url));
    if (M('history_list')->where($check)->find()) {
        $json['url'] = htmlspecialchars_decode($url);
        $json['info'] = "<span class='pink'>已存在,跳过</span>";
        $this->ajaxReturn($json);
        return;
    }

    // An unknown or missing `sf` yields an empty rule set instead of
    // undefined-index notices; such a request ends as "no content".
    $current_sf = I('post.sf');
    $rules = isset($sf[$current_sf]) ? $sf[$current_sf] : array();
    $title_selector = !empty($rules['title']) ? $rules['title'] : "h1";
    $content_selector = !empty($rules['content']) ? $rules['content'] : '';

    // Download the page, normalise it to UTF-8 and prepend a charset hint for the parser.
    $dom = $this->_curl($url);
    $data = mb_convert_encoding($dom, 'utf-8', 'GBK,UTF-8,ASCII');
    $data = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $data;
    vendor("HtmlParser/ParserDom");
    $obj = new \ParserDom($data);

    $detail = array();
    try {
        $title = $obj->find($title_selector);
        $keywords = $obj->find('meta[name=keywords]');
        $description = $obj->find('meta[name=description]');
        $content = $content_selector !== '' ? $obj->find($content_selector) : array();

        if (!empty($title)) {
            $detail['title'] = $title[0]->getPlainText();
        }
        if (!empty($keywords)) {
            // Read the keywords from the keywords meta tag itself, not from
            // the description node.
            $detail['seo_keywords'] = $keywords[0]->getAttr('content');
        }
        if (!empty($description)) {
            $detail['description'] = $description[0]->getAttr('content');
        }
        if (!empty($content)) {
            $detail['content'] = "";
            foreach ($content as $node) {
                // Strip href attributes so imported paragraphs carry no live links.
                $detail['content'] .= preg_replace('/href=[\'\\"]?[:\\/\\w#\\.]*[\'\\"]?/i', '', $node->outerHtml());
            }
        }
    } catch (\Exception $e) {
        // Best effort: a parsing failure leaves $detail partially filled and the
        // request is then reported as "no content" below. The class name is fully
        // qualified so the catch also matches when this file is namespaced.
    }

    // Source tagging is currently disabled: $detail['source'] = $current_sf;
    if (!empty($detail['content'])) {
        // NOTE(review): the original comment says the content is published to the
        // column given by the posted category, but $sort_id is never written into
        // $detail — confirm whether crawl_content needs it.
        if (M('crawl_content')->add($detail)) {
            $history = array(
                'link' => md5($url),
                'scheme' => I('post.scheme'),
            );
            M('history_list')->add($history);
            $json['info'] = "<span class='green'>已入库</span>";
        } else {
            $json['info'] = "<span class='blue'>系统错误</span>";
        }
    } else {
        $json['info'] = "<span class='red'>无内容,跳过</span>";
    }

    $json['url'] = htmlspecialchars_decode($url);
    $this->ajaxReturn($json);
}
/**
 * Extract values from an HTML string according to a list of selector rules.
 *
 * Each rule in $node is an array with:
 *  - 'el'      selector passed to ParserDom::find()
 *  - 'name'    key under which the extracted value is stored
 *  - 'attr'    optional; empty or 'text' takes the node's plain text,
 *              anything else is read as that attribute of the node
 *  - 'replace' optional; search value handed to str_replace()
 *  - 'content' optional; replacement used with 'replace' (defaults to '')
 *
 * Results are grouped by match position: $arr[$i][$name] holds the value of
 * the i-th node matched by the rule named $name.
 *
 * @param string $html_dom HTML markup to parse
 * @param array  $node     list of extraction rules as described above
 * @return array extracted values, or an empty array when nothing matched
 */
function dom($html_dom, $node = array())
{
    // Nothing to extract: avoid loading and running the parser at all.
    if (empty($node)) {
        return array();
    }

    if (!class_exists('ParserDom')) {
        include_once ROOT_PATH . 'inc/class/dom/ParserInterface.php';
        include_once ROOT_PATH . 'inc/class/dom/ParserAbstract.php';
        include_once ROOT_PATH . 'inc/class/dom/ParserDom.php';
    }

    // Prepend a charset hint so the parser treats the markup as UTF-8.
    $html_dom = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' . $html_dom;
    $dom = new ParserDom($html_dom);

    $arr = array();
    foreach ($node as $rule) {
        $find = $dom->find($rule['el']);
        // Defensive: skip rules whose selector produced nothing to iterate over.
        if (empty($find)) {
            continue;
        }

        // Optional keys are read through isset()/empty() so rules that omit
        // them do not raise undefined-index notices.
        $attr = isset($rule['attr']) ? $rule['attr'] : '';
        $search = !empty($rule['replace']) ? $rule['replace'] : null;
        $replacement = !empty($rule['content']) ? $rule['content'] : '';
        $name = $rule['name'];

        foreach ($find as $index => $element) {
            if ($attr == '' || $attr == 'text') {
                $value = $element->getPlainText();
            } else {
                $value = $element->getAttr($attr);
            }
            if ($search !== null) {
                $value = str_replace($search, $replacement, $value);
            }
            $arr[$index][$name] = $value;
        }
    }

    unset($dom);
    return $arr;
}