/**
 * Render the "add / edit spider project" admin view.
 *
 * When $this->pid is set, the project row is loaded through spider::project()
 * and its cid / rid / poid are used to pre-select the category, rule and post
 * options. Without a project the category falls back to $this->cid and the
 * rule / post option builders receive null.
 *
 * NOTE: the local variable names are intentionally left untouched — the view
 * is pulled in with `include`, so it executes in this scope and may read any
 * of them.
 */
function do_addproject() {
    $rs = array();
    $this->pid && ($rs = spider::project($this->pid));

    $cid = empty($rs['cid']) ? $this->cid : $rs['cid'];

    $categoryApp = iACP::app('category', iCMS_APP_ARTICLE);
    $cata_option = $categoryApp->select(false, $cid);

    // $rs is an empty array when no project is being edited; guard the keys
    // instead of reading undefined indexes (same null value, no notice).
    $rule_option = $this->rule_opt(isset($rs['rid']) ? $rs['rid'] : null);
    $post_option = $this->post_opt(isset($rs['poid']) ? $rs['poid'] : null);

    //$rs['sleep'] OR $rs['sleep'] = 30;
    include iACP::view("spider.addproject");
}
/**
 * Crawl the current spider target and hand the result to the project's
 * publishing app.
 *
 * Steps, as performed below:
 *  1. spiderData::crawl() is run and its result REPLACES $_POST (the
 *     publishing app is invoked without arguments afterwards, so it
 *     presumably reads its input from $_POST — confirm against the app).
 *  2. In shell mode (spider::$work) an empty title or body aborts.
 *  3. spider::checker() must return true, otherwise its result is returned.
 *  4. A spider_url row is looked up by URL (or created) unless spider::$sid
 *     is already set; its id is passed to the app callbacks.
 *  5. The app method named by spider::postArgs() is called.
 *
 * @param string|null $work Work mode. NULL triggers iPHP::success() on a
 *                          successful publish; "shell" and "WEB@AUTO" get the
 *                          app result returned.
 * @return mixed false when the shell-mode title/body check fails; the
 *               spider::checker() result when it is not true; the app result
 *               (with a 'work' key added) for "shell" / "WEB@AUTO"; nothing
 *               otherwise.
 */
public static function publish($work = null) {
    $_POST = spiderData::crawl();

    if (spider::$work == 'shell') {
        if (empty($_POST['title'])) {
            echo "标题不能为空\n";
            return false;
        }
        if (empty($_POST['body'])) {
            echo "内容不能为空\n";
            return false;
        }
    }

    $checker = spider::checker($work, spider::$pid, $_POST['reurl'], $_POST['title']);
    if ($checker !== true) {
        return $checker;
    }

    $project = spider::project(spider::$pid);
    if (!isset($_POST['cid'])) {
        $_POST['cid'] = $project['cid'];
    }
    $postArgs = spider::postArgs($project['poid']);

    // Re-publishing over an existing article: indexid is optional in the
    // query string, so test it with empty() rather than reading it blindly.
    if (!empty($_GET['indexid'])) {
        $aid = (int) $_GET['indexid'];
        $_POST['aid'] = $aid;
        $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='{$aid}'");
    }

    $title = iS::escapeStr($_POST['title']);
    $url = iS::escapeStr($_POST['reurl']);
    $hash = md5($url);

    if (!empty(spider::$sid)) {
        $suid = spider::$sid;
    } else {
        // NOTE(review): $url is interpolated into SQL after iS::escapeStr();
        // whether that helper is sufficient SQL escaping cannot be seen here.
        $spider_url = iDB::row("SELECT `id`,`publish`,`indexid` FROM `#iCMS@__spider_url` where `url`='{$url}'", ARRAY_A);
        if (empty($spider_url)) {
            // First time this URL is seen: register it as not yet published.
            $spider_url_data = array(
                'cid' => $project['cid'],
                'rid' => spider::$rid,
                'pid' => spider::$pid,
                'title' => addslashes($title),
                'url' => $url,
                'hash' => $hash,
                'status' => '1',
                'addtime' => time(),
                'publish' => '0',
                'indexid' => '0',
                'pubdate' => '',
            );
            $suid = iDB::insert('spider_url', $spider_url_data);
        } else {
            if ($spider_url['indexid']) {
                // URL already linked to an article: update it instead of
                // creating a new one.
                $_POST['aid'] = $spider_url['indexid'];
                $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='" . $spider_url['indexid'] . "'");
            }
            $suid = $spider_url['id'];
        }
    }

    // Optional user hook that may rewrite the data before it is posted.
    if (!empty(spider::$callback['post']) && is_callable(spider::$callback['post'])) {
        $_POST = call_user_func_array(spider::$callback['post'], array($_POST));
    }
    iS::slashes($_POST);

    $app = iACP::app($postArgs->app);
    $method = $postArgs->fun;
    $app->callback['code'] = '1001';
    // Primary-table callback: update the linked article id on the spider_url row.
    $app->callback['primary'] = array(array('spider', 'update_spider_url_indexid'), array('suid' => $suid));
    // Data-table callback: mark the spider_url row as published.
    $app->callback['data'] = array(array('spider', 'update_spider_url_publish'), array('suid' => $suid));

    $result = $app->{$method}();

    // Interactive mode only: report success to the browser.
    if ($work === NULL && $result['code'] == $app->callback['code']) {
        $script = spider::$sid
            ? 'js:1'
            : 'js:parent.$("#' . $hash . '").remove();';
        iPHP::success("发布成功!", $script);
    }

    if ($work == "shell" || $work == "WEB@AUTO") {
        $result['work'] = $work;
        return $result;
    }
}
/**
 * Fetch one content page and extract every data item configured in the rule.
 *
 * The target is taken from the spider_url row identified by spider::$sid when
 * that is set; otherwise from spider::$rid/$pid/$title/$url, each of which can
 * be overridden by the matching non-NULL argument.
 *
 * Data item names ($data['name']) may carry these markers:
 *  - "UNSET:name" : remove the previously collected value of "name".
 *  - "DATA:name"  : use the already collected value of "name" as the raw
 *                   input for this item.
 *  - "PRE:name"   : a value stored under "PRE:name" becomes the raw input of
 *                   the later item called "name" and is then dropped.
 *  - "EMPTY:name" : only run this item when "name" is still empty; the result
 *                   is stored under "name".
 *  - "ARRAY:name" : transpose the two-level result array and store it under
 *                   "name".
 *  - "a.b"        : store the result under $responses['a']['b'].
 * Repeated names are merged (arrays via array_merge, scalars concatenated).
 *
 * After extraction the iFS download / watermark settings are reset from the
 * global config and then overridden by the rule, and the optional
 * spider::$callback['data'] hook may rewrite the result.
 *
 * @param mixed $_pid   Project id override (ignored when spider::$sid is set).
 * @param mixed $_rid   Rule id override (ignored when spider::$sid is set).
 * @param mixed $_url   URL override (ignored when spider::$sid is set).
 * @param mixed $_title Title override (ignored when spider::$sid is set).
 * @return array|false Collected values keyed by item name (plus 'reurl' and
 *                     '__title__'); false in shell mode when the page body is
 *                     empty.
 */
public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL) {
    ini_get('safe_mode') or set_time_limit(0);

    // Defined up front so the no-project path never reads undefined variables.
    $project = null;
    $prule_list_url = null;

    $sid = spider::$sid;
    if ($sid) {
        $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
        $title = $sRs->title;
        $pid = $sRs->pid;
        $url = $sRs->url;
        $rid = $sRs->rid;
    } else {
        $rid = ($_rid === NULL) ? spider::$rid : $_rid;
        $pid = ($_pid === NULL) ? spider::$pid : $_pid;
        $title = ($_title === NULL) ? spider::$title : $_title;
        $url = ($_url === NULL) ? spider::$url : $_url;
    }

    if ($pid) {
        $project = spider::project($pid);
        $prule_list_url = $project['list_url'];
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];
    $dataArray = $rule['data'];
    if ($prule_list_url) {
        // The project may override the rule's URL composition pattern.
        $rule['list_url'] = $prule_list_url;
    }

    if (spider::$dataTest) {
        echo "<b>抓取规则信息</b><pre>";
        print_r(iS::escapeStr($ruleA));
        print_r(iS::escapeStr($project));
        echo "</pre><hr />";
    }

    spider::$curl_proxy = $rule['proxy'];
    $responses = array();

    $html = spiderTools::remote($url);
    if (empty($html)) {
        $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
        if (spider::$work == 'shell') {
            echo "{$msg}\n";
            return false;
        } else {
            iPHP::alert($msg);
        }
    }

    spider::$allHtml = "";
    // NOTE(review): these record spider::$url, while the page was fetched
    // from the local $url (which differs when spider::$sid or $_url is used).
    // Left unchanged — confirm which one callers expect.
    $rule['__url__'] = spider::$url;
    $responses['reurl'] = spider::$url;
    $responses['__title__'] = $title;

    foreach ((array) $dataArray as $data) {
        $content_html = $html;
        $dname = $data['name'];

        // [UNSET:name] drop the value collected for "name" so far.
        if (strpos($dname, 'UNSET:') !== false) {
            $_dname = str_replace('UNSET:', '', $dname);
            unset($responses[$_dname]);
            continue;
        }

        // [DATA:name] re-process the value already collected for "name".
        if (strpos($dname, 'DATA:') !== false) {
            $_dname = str_replace('DATA:', '', $dname);
            $content_html = isset($responses[$_dname]) ? $responses[$_dname] : null;
            unset($responses[$dname]);
        }

        // [PRE:name] a value stored under "PRE:name" is the raw input for
        // this item (typically used for downloaded content).
        $pre_dname = 'PRE:' . $dname;
        if (isset($responses[$pre_dname])) {
            $content_html = $responses[$pre_dname];
            unset($responses[$pre_dname]);
        }

        // [EMPTY:name] fallback item: only runs while "name" is still empty.
        if (strpos($dname, 'EMPTY:') !== false) {
            $_dname = str_replace('EMPTY:', '', $dname);
            if (!empty($responses[$_dname])) {
                continue; // already has a value, skip the fallback
            }
            $dname = $_dname;
        }

        $content = spiderContent::crawl($content_html, $data, $rule, $responses);
        unset($content_html);

        // [ARRAY:name] transpose rows/columns of a two-level result.
        if (strpos($dname, 'ARRAY:') !== false) {
            $dname = str_replace('ARRAY:', '', $dname);
            $cArray = array();
            foreach ((array) $content as $rowKey => $row) {
                foreach ((array) $row as $colKey => $val) {
                    $cArray[$colKey][$rowKey] = $val;
                }
            }
            if ($cArray) {
                $content = $cArray;
                unset($cArray);
            }
        }

        if (strpos($dname, '.') !== false) {
            // [name.xxx] store as a nested array element.
            $f_key = substr($dname, 0, stripos($dname, "."));
            $s_key = substr(strrchr($dname, "."), 1);
            if (isset($responses[$f_key][$s_key])) {
                if (is_array($responses[$f_key][$s_key])) {
                    // Cast: array_merge() rejects a scalar/null second
                    // argument, which used to wipe the collected value.
                    $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], (array) $content);
                } else {
                    $responses[$f_key][$s_key] .= $content;
                }
            } else {
                $responses[$f_key][$s_key] = $content;
            }
        } else {
            // Several items with the same name are merged.
            if (isset($responses[$dname])) {
                if (is_array($responses[$dname])) {
                    $responses[$dname] = array_merge($responses[$dname], (array) $content);
                } else {
                    $responses[$dname] .= $content;
                }
            } else {
                $responses[$dname] = $content;
            }
        }

        // De-duplicate comma separated multi-match results. The isset()
        // guard covers dotted names, which are stored nested and therefore
        // have no $responses[$dname] entry.
        if (!empty($data['multi']) && isset($responses[$dname]) && !is_array($responses[$dname])) {
            if (strpos($responses[$dname], ',') !== false) {
                $parts = array_map('trim', explode(',', $responses[$dname]));
                $parts = array_unique(array_filter($parts));
                $responses[$dname] = implode(',', $parts);
                unset($parts);
            }
        }
        gc_collect_cycles();
    }

    // A title item that matched nothing falls back to the list title.
    if (isset($responses['title']) && empty($responses['title'])) {
        $responses['title'] = $responses['__title__'];
    }

    spider::$allHtml = null;
    unset($html);
    gc_collect_cycles();

    if (spider::$dataTest) {
        echo "<pre style='width:99%;word-wrap: break-word;'>";
        print_r(iS::escapeStr($responses));
        echo '<hr />';
        echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
        echo "</pre>";
    }

    // Reset file-download settings to the global defaults, then apply the
    // rule's overrides.
    iFS::$CURLOPT_ENCODING = '';
    iFS::$CURLOPT_REFERER = '';
    iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
    iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
    iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
    iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];

    if (!empty($rule['fs']['encoding'])) {
        iFS::$CURLOPT_ENCODING = $rule['fs']['encoding'];
    }
    if (!empty($rule['fs']['referer'])) {
        iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
    }
    if (!empty($rule['watermark_mode'])) {
        iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
        iFS::$watermark_config['x'] = $rule['watermark']['x'];
        iFS::$watermark_config['y'] = $rule['watermark']['y'];
        if (!empty($rule['watermark']['img'])) {
            iFS::$watermark_config['img'] = $rule['watermark']['img'];
        }
    }

    // Optional user hook that may rewrite the collected data.
    if (!empty(spider::$callback['data']) && is_callable(spider::$callback['data'])) {
        $responses = call_user_func_array(spider::$callback['data'], array($responses));
    }
    return $responses;
}
/**
 * Crawl the list pages of a project / rule and act on every entry found.
 *
 * List URLs come, in increasing priority, from the rule ('list_urls'), the
 * project ('urls'), spiderUrls::$urls and the $_urls argument (one URL per
 * line). Two line formats are expanded before crawling:
 *  - "RULE@rid@url"  : crawl "url" with rule "rid" and use the URLs it yields.
 *  - "...<spec>..."  : comma separated spec (format,begin,num,step,zeroize,
 *                      reverse) handed to spiderTools::mkurls().
 *
 * Behaviour per $work:
 *  - "shell"      : publish every entry accepted by spider::checker(), print
 *                   progress and counters, update the project's lastupdate.
 *  - "WEB@MANUAL" : return the raw list matches per list URL.
 *  - "WEB@AUTO"   : return the spider_url data of every accepted entry.
 *  - "DATA@RULE"  : crawl each accepted entry with spiderData::crawl(),
 *                   record it in spider_url (unless spider::$dataTest) and
 *                   return the collected data.
 *
 * @param string|null $work     Work mode, see above.
 * @param mixed       $pid      Project id; NULL means spider::$pid.
 * @param mixed       $_rid     Rule id used only when no other rule id is known.
 * @param string|null $_urls    Newline separated list URLs overriding all others.
 * @param string|null $callback 'CALLBACK@URL' returns the entry URLs of the
 *                              first list page that could be fetched.
 * @return mixed See $work / $callback; false in shell mode when there is no
 *               list URL; nothing for "shell" on completion.
 */
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null) {
    $pid === NULL && ($pid = spider::$pid);

    // Defined up front so no branch below reads an undefined variable.
    $project = array();
    $prule_list_url = null;
    $listsArray = array();
    $contentArray = array();
    // NOTE(review): the original returned an undefined $sid for WEB@MANUAL
    // (i.e. null). Kept as null — confirm whether spider::$sid was intended.
    $sid = null;

    if ($pid) {
        $project = spider::project($pid);
        $cid = $project['cid'];
        $rid = $project['rid'];
        $prule_list_url = $project['list_url'];
    } else {
        $cid = spider::$cid;
        $rid = spider::$rid;
    }
    // NOTE(review): $_rid is only a fallback, so a "RULE@rid@url" sub-crawl
    // uses spider::$rid whenever that is non-empty — confirm this is intended.
    if (empty($rid) && $_rid !== NULL) {
        $rid = $_rid;
    }

    if ($work == 'shell') {
        $lastupdate = isset($project['lastupdate']) ? $project['lastupdate'] : null;
        if (!empty($project['psleep'])) {
            // Throttle: skip projects crawled less than psleep seconds ago.
            if (time() - $lastupdate < $project['psleep']) {
                echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . ($project['psleep'] / 3600) . "小时后在继续采集\n";
                return;
            }
        }
        // ANSI green; written with explicit ESC escapes so the colour codes
        // are not printed literally.
        echo "\033[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\033[0m\n";
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];
    $isDomMode = isset($rule['mode']) && $rule['mode'] == "2";

    $urls = $rule['list_urls'];
    !empty($project['urls']) && ($urls = $project['urls']);
    spiderUrls::$urls && ($urls = spiderUrls::$urls);
    $_urls && ($urls = $_urls);

    $urlsArray = array_filter(explode("\n", $urls));
    if ($work == 'shell') {
        print_r($urlsArray);
    }

    // Expand "RULE@rid@url" and "<...>" template lines. Iterate over a copy
    // because expanded lines are removed from $urlsArray on the fly.
    $expanded = array();
    $pending = $urlsArray;
    foreach ($pending as $lineKey => $line) {
        $line = htmlspecialchars_decode($line);
        if (strpos($line, 'RULE@') !== false) {
            // Limit of 3 keeps any '@' inside the target URL intact.
            $parts = array_pad(explode('@', $line, 3), 3, null);
            $subRid = $parts[1];
            $subUrls = $parts[2];
            if (spider::$ruleTest) {
                print_r('<b>使用[rid:' . $subRid . ']规则抓取列表</b>:' . $subUrls);
                echo "<hr />";
            }
            $found = spiderUrls::crawl($work, false, $subRid, $subUrls, 'CALLBACK@URL');
            // The sub-crawl returns a non-array when nothing could be
            // fetched; never feed that to array_merge().
            if (is_array($found)) {
                $expanded = array_merge($expanded, $found);
            }
            unset($urlsArray[$lineKey]);
        } else {
            preg_match('|.*<(.*)>.*|is', $line, $tplMatch);
            if ($tplMatch) {
                list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $tplMatch[1]), 6, null);
                $template = str_replace($tplMatch[1], '*', trim($tplMatch[0]));
                $generated = spiderTools::mkurls($template, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$lineKey]);
                $expanded = array_merge($expanded, $generated);
            }
        }
    }
    $expanded && ($urlsArray = array_merge($urlsArray, $expanded));
    unset($pending, $expanded, $lineKey, $line, $tplMatch, $generated, $found);
    $urlsArray = array_unique($urlsArray);

    if (empty($urlsArray)) {
        if ($work == 'shell') {
            echo "采集列表为空!请填写!\n";
            return false;
        }
        iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
    }

    if ($isDomMode) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        spider::$ruleTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
    }

    $pubArray = array();
    $pubCount = array();
    $pubAllCount = array();
    spider::$curl_proxy = $rule['proxy'];
    spider::$urlslast = null;

    foreach ($urlsArray as $url) {
        $url = trim($url);
        spider::$urlslast = $url;
        if ($work == 'shell') {
            echo '开始采集列表:' . $url . "\n";
        }
        if (spider::$ruleTest) {
            echo '<b>抓取列表:</b>' . $url . "<br />";
        }

        $html = spiderTools::remote($url);
        if (empty($html)) {
            continue;
        }

        // Copy of the list area kept only for the rule-test dump below; the
        // working variable is released early in regex mode.
        $listAreaDump = null;

        if ($isDomMode) {
            // DOM mode: select the list area / entries with phpQuery.
            $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
            $list_area = $doc[trim($rule['list_area_rule'])];
            if (!empty($rule['list_area_format'])) {
                $list_area_format = trim($rule['list_area_format']);
                if (strpos($list_area_format, 'ARRAY::') !== false) {
                    $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                    $lists = array();
                    foreach ($list_area as $la) {
                        $lists[] = phpQuery::pq($list_area_format, $la);
                    }
                } else {
                    $lists = phpQuery::pq($list_area_format, $list_area);
                }
            } else {
                $lists = $list_area;
            }
            if (spider::$ruleTest) {
                $listAreaDump = $list_area;
            }
        } else {
            // Regex mode: cut out the list area, then match the entries.
            $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
            if ($list_area_rule) {
                // preg_match() takes no PREG_SET_ORDER flag; the original
                // passed an undefined variable here.
                preg_match('|' . $list_area_rule . '|is', $html, $matches);
                $list_area = isset($matches['content']) ? $matches['content'] : null;
            } else {
                $list_area = $html;
            }
            $html = null;
            unset($html);

            if (spider::$ruleTest) {
                echo iS::escapeStr($rule['list_area_rule']);
                echo "<hr />";
            }
            if (!empty($rule['list_area_format'])) {
                $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
            }
            preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
            if (spider::$ruleTest) {
                $listAreaDump = $list_area;
            }
            $list_area = null;
            unset($list_area);

            $sort = isset($rule['sort']) ? $rule['sort'] : null;
            if ($sort == "1") {
                // descending sort is disabled: //arsort($lists);
            } elseif ($sort == "2") {
                asort($lists);
            } elseif ($sort == "3") {
                shuffle($lists);
            }
        }

        if (spider::$ruleTest) {
            echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
            echo "<hr />";
            echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($listAreaDump);
            echo "<hr />";
            echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
            echo "<hr />";
            echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
            echo "<hr />";
        }
        unset($listAreaDump);

        // Applied after the dump above on purpose: the first page's dump
        // shows the rule's own pattern, as before.
        if ($prule_list_url) {
            $rule['list_url'] = $prule_list_url;
        }

        // Sub-crawl for "RULE@rid@url": only the entry URLs are wanted.
        if ($callback == 'CALLBACK@URL') {
            $cbListUrl = array();
            foreach ($lists as $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $cbListUrl[] = spider::$url;
            }
            return $cbListUrl;
        }

        if ($work == "shell") {
            $pubCount[$url]['count'] = count($lists);
            isset($pubAllCount['count']) or $pubAllCount['count'] = 0;
            $pubAllCount['count'] += $pubCount[$url]['count'];
            echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";

            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                echo "title:" . spider::$title . "\n";
                echo "url:" . spider::$url . "\n";
                spider::$rid = $rid;

                if (spider::checker($work) === true) {
                    echo "开始采集....";
                    // Kept in its own variable so the $callback parameter is
                    // not overwritten for the following list pages.
                    $published = spider::publish("shell");
                    if ($published['code'] == "1001") {
                        $pubCount[$url]['success']++;
                        $pubAllCount['success']++;
                        echo "....√\n";
                        if (!empty($project['sleep'])) {
                            echo "sleep:" . $project['sleep'] . "s\n";
                            if (!$isDomMode) {
                                unset($lists[$lkey]);
                            }
                            gc_collect_cycles();
                            sleep($project['sleep']);
                        }
                    } else {
                        $pubCount[$url]['error']++;
                        $pubAllCount['error']++;
                        echo "error\n\n";
                        continue;
                    }
                }
                // Counts entries that were published now or rejected by the
                // checker; failed publishes skipped this via `continue`.
                $pubCount[$url]['published']++;
                $pubAllCount['published']++;
            }

            if ($isDomMode) {
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                unset($lists);
            }
        }

        if ($work == "WEB@MANUAL") {
            $listsArray[$url] = $lists;
        }

        if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);

                if (spider::$ruleTest) {
                    echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                    echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                    echo spider::$url . "<br />";
                    echo $hash . "<br /><hr />";
                    continue;
                }

                if (spider::checker($work) === true || spider::$dataTest) {
                    $suData = array(
                        'sid' => 0,
                        'url' => spider::$url,
                        'title' => spider::$title,
                        'cid' => $cid,
                        'rid' => $rid,
                        'pid' => $pid,
                        'hash' => $hash,
                    );
                    switch ($work) {
                        case 'DATA@RULE':
                            $contentArray[$lkey] = spiderData::crawl();
                            unset($suData['sid']);
                            $suData['title'] = addslashes($suData['title']);
                            $suData += array(
                                'addtime' => time(),
                                'status' => '2',
                                'publish' => '2',
                                'indexid' => '0',
                                'pubdate' => '0',
                            );
                            // No row is written while testing data rules.
                            $suid = null;
                            spider::$dataTest or $suid = iDB::insert('spider_url', $suData);
                            $contentArray[$lkey]['spider_url'] = $suid;
                            break;
                        case 'WEB@AUTO':
                            $pubArray[] = $suData;
                            break;
                    }
                }
            }
        }
    }

    $lists = null;
    unset($lists);
    gc_collect_cycles();

    switch ($work) {
        case 'WEB@AUTO':
            return $pubArray;
        case 'DATA@RULE':
            return $contentArray;
        case 'WEB@MANUAL':
            return array(
                'cid' => $cid,
                'rid' => $rid,
                'pid' => $pid,
                'sid' => $sid,
                'work' => $work,
                'rule' => $rule,
                'listsArray' => $listsArray,
            );
        case "shell":
            echo "采集数据统结果:\n";
            print_r($pubCount);
            print_r($pubAllCount);
            echo "全部采集完成....\n";
            iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
            break;
    }
}