/** * 抓取资源 * @param [string] $html [抓取结果] * @param [array] $data [数据项] * @param [array] $rule [规则] * @param [array] $responses [已经抓取资源] * @return [array] [返回处理结果] */ public static function crawl($html, $data, $rule, $responses) { if (trim($data['rule']) === '') { return; } $name = $data['name']; if (spider::$dataTest) { print_r('<b>[' . $name . ']规则:</b>' . iS::escapeStr($data['rule'])); echo "<hr />"; } if (strpos($data['rule'], 'RULE@') !== false) { spider::$rid = str_replace('RULE@', '', $data['rule']); $_urls = trim($html); if (spider::$dataTest) { print_r('<b>使用[rid:' . spider::$rid . ']规则抓取</b>:' . $_urls); echo "<hr />"; } return spiderUrls::crawl('DATA@RULE', false, spider::$rid, $_urls); } /** * RAND@10,0 * 返回随机数 */ if (strpos($data['rule'], 'RAND@') !== false) { $random = str_replace('RAND@', '', $data['rule']); list($length, $numeric) = explode(',', $random); return random($length, empty($numeric) ? 0 : 1); } $contentArray = array(); $contentHash = array(); $_content = null; $_content = spiderContent::match($html, $data, $rule); $cmd5 = md5($_content); $contentArray[] = $_content; $contentHash[$cmd5] = true; if ($data['page']) { if (empty($rule['page_url'])) { $rule['page_url'] = $rule['list_url']; } if (empty(spider::$allHtml)) { $page_url_array = array(); $page_area_rule = trim($rule['page_area_rule']); if ($page_area_rule) { if (strpos($page_area_rule, 'DOM::') !== false) { iPHP::import(iPHP_LIB . '/phpQuery.php'); $doc = phpQuery::newDocumentHTML($html, 'UTF-8'); $pq_dom = str_replace('DOM::', '', $page_area_rule); $pq_array = phpQuery::pq($pq_dom); foreach ($pq_array as $pn => $pq_val) { $href = phpQuery::pq($pq_val)->attr('href'); if ($href) { if ($rule['page_url_rule']) { if (strpos($rule['page_url_rule'], '<%') !== false) { $page_url_rule = spiderTools::pregTag($rule['page_url_rule']); if (!preg_match('|' . $page_url_rule . '|is', $href)) { continue; } } else { $cleanhref = spiderTools::dataClean($rule['page_url_rule'], $href); if ($cleanhref) { $href = $cleanhref; unset($cleanhref); } else { continue; } } } $href = str_replace('<%url%>', $href, $rule['page_url']); $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href); } } phpQuery::unloadDocuments($doc->getDocumentID()); } else { $page_area_rule = spiderTools::pregTag($page_area_rule); if ($page_area_rule) { preg_match('|' . $page_area_rule . '|is', $html, $matches, $PREG_SET_ORDER); $page_area = $matches['content']; } else { $page_area = $html; } if ($rule['page_url_rule']) { $page_url_rule = spiderTools::pregTag($rule['page_url_rule']); preg_match_all('|' . $page_url_rule . '|is', $page_area, $page_url_matches, PREG_SET_ORDER); foreach ($page_url_matches as $pn => $row) { $href = str_replace('<%url%>', $row['url'], $rule['page_url']); $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href); gc_collect_cycles(); } } unset($page_area); } } else { // 逻辑方式 if ($rule['page_url_parse'] == '<%url%>') { $page_url = str_replace('<%url%>', $rule['__url__'], $rule['page_url']); } else { $page_url_rule = spiderTools::pregTag($rule['page_url_parse']); preg_match('|' . $page_url_rule . '|is', $rule['__url__'], $matches, $PREG_SET_ORDER); $page_url = str_replace('<%url%>', $matches['url'], $rule['page_url']); } if (stripos($page_url, '<%step%>') !== false) { for ($pn = $rule['page_no_start']; $pn <= $rule['page_no_end']; $pn = $pn + $rule['page_no_step']) { $page_url_array[$pn] = str_replace('<%step%>', $pn, $page_url); gc_collect_cycles(); } } } //URL去重清理 if ($page_url_array) { $page_url_array = array_filter($page_url_array); $page_url_array = array_unique($page_url_array); $puk = array_search($rule['__url__'], $page_url_array); if ($puk !== false) { unset($page_url_array[$puk]); } } if (spider::$dataTest) { echo "<b>内容页网址:</b>" . $rule['__url__'] . "<br />"; echo "<b>分页:</b>" . $rule['page_url'] . "<br />"; echo iS::escapeStr($page_url_rule); echo "<hr />"; } if (spider::$dataTest) { echo "<b>分页列表:</b><pre>"; print_r($page_url_array); echo "</pre><hr />"; } spider::$content_right_code = trim($rule['page_url_right']); spider::$content_error_code = trim($rule['page_url_error']); spider::$curl_proxy = $rule['proxy']; $pageurl = array(); foreach ($page_url_array as $pukey => $purl) { //usleep(100); $phtml = spiderTools::remote($purl); if (empty($phtml)) { break; } $md5 = md5($phtml); if ($pageurl[$md5]) { break; } $check_content = spiderTools::check_content_code($phtml); if ($check_content === false) { unset($check_content, $phtml); break; } $_content = spiderContent::match($phtml, $data, $rule); $cmd5 = md5($_content); if ($contentHash[$cmd5]) { break; } $contentArray[] = $_content; $contentHash[$cmd5] = true; $pageurl[$md5] = $purl; spider::$allHtml[$md5] = $phtml; } gc_collect_cycles(); unset($check_content, $phtml); if (spider::$dataTest) { echo "<b>最终分页列表:</b><pre>"; print_r($pageurl); echo "</pre><hr />"; } } else { foreach ((array) spider::$allHtml as $ahkey => $phtml) { $contentArray[] = spiderContent::match($phtml, $data, $rule); } } } $content = implode('#--iCMS.PageBreak--#', $contentArray); $html = null; unset($html, $contentArray, $contentHash, $_content); $content = stripslashes($content); if (spider::$dataTest) { print_r('<b>[' . $name . ']匹配结果:</b>' . htmlspecialchars($content)); echo "<hr />"; } if ($data['cleanbefor']) { $content = spiderTools::dataClean($data['cleanbefor'], $content); } /** * 在数据项里调用之前采集的数据[DATA@name][DATA@name.key] */ if (strpos($content, '[DATA@') !== false) { $content = spiderTools::getDATA($responses, $content); } if ($data['cleanhtml']) { $content = stripslashes($content); $content = preg_replace('/<[\\/\\!]*?[^<>]*?>/is', '', $content); } if ($data['format'] && $content) { $content = autoformat($content); } if ($data['img_absolute'] && $content) { // $content = stripslashes($content); preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $img_match); if ($img_match[1]) { $_img_array = array_unique($img_match[1]); $_img_urls = array(); foreach ((array) $_img_array as $_img_key => $_img_src) { $_img_urls[$_img_key] = spiderTools::url_complement($rule['__url__'], $_img_src); } $content = str_replace($_img_array, $_img_urls, $content); } unset($img_match, $_img_array, $_img_urls, $_img_src); } if ($data['trim']) { $content = trim($content); } if ($data['capture']) { // $content = stripslashes($content); $content = spiderTools::remote($content); } if ($data['download']) { // $content = stripslashes($content); $content = iFS::http($content); } if ($data['cleanafter']) { $content = spiderTools::dataClean($data['cleanafter'], $content); // $content = stripslashes($content); } if ($data['autobreakpage']) { $content = spiderTools::autoBreakPage($content); } if ($data['mergepage']) { $content = spiderTools::mergePage($content); } if ($data['empty'] && empty($content)) { $emptyMsg = '[' . $name . ']规则设置了不允许为空.当前抓取结果为空!请检查,规则是否正确!'; if (spider::$dataTest) { exit('<h1>' . $emptyMsg . '</h1>'); } if (spider::$work) { echo "\n{$emptyMsg}\n"; return false; } else { iPHP::alert($emptyMsg); } } if ($data['json_decode']) { $content = json_decode($content, true); } if ($data['array']) { return (array) $content; } if (spider::$callback['content'] && is_callable(spider::$callback['content'])) { $content = call_user_func_array(spider::$callback['content'], array($content)); } return $content; }
public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL) { ini_get('safe_mode') or set_time_limit(0); $sid = spider::$sid; if ($sid) { $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;"); $title = $sRs->title; $cid = $sRs->cid; $pid = $sRs->pid; $url = $sRs->url; $rid = $sRs->rid; } else { $rid = spider::$rid; $pid = spider::$pid; $title = spider::$title; $url = spider::$url; $_rid === NULL or $rid = $_rid; $_pid === NULL or $pid = $_pid; $_title === NULL or $title = $_title; $_url === NULL or $url = $_url; } if ($pid) { $project = spider::project($pid); $prule_list_url = $project['list_url']; } $ruleA = spider::rule($rid); $rule = $ruleA['rule']; $dataArray = $rule['data']; if ($prule_list_url) { $rule['list_url'] = $prule_list_url; } if (spider::$dataTest) { echo "<b>抓取规则信息</b><pre>"; print_r(iS::escapeStr($ruleA)); print_r(iS::escapeStr($project)); echo "</pre><hr />"; } spider::$curl_proxy = $rule['proxy']; $responses = array(); $html = spiderTools::remote($url); if (empty($html)) { $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则'; if (spider::$work == 'shell') { echo "{$msg}\n"; return false; } else { iPHP::alert($msg); } } // $http = spider::check_content_code($html); // // if($http['match']==false){ // return false; // } // $content = $http['content']; spider::$allHtml = ""; $rule['__url__'] = spider::$url; $responses['reurl'] = spider::$url; $responses['__title__'] = $title; foreach ((array) $dataArray as $key => $data) { $content_html = $html; $dname = $data['name']; /** * [UNSET:name] * 注销[name] * @var string */ if (strpos($dname, 'UNSET:') !== false) { $_dname = str_replace('UNSET:', '', $dname); unset($responses[$_dname]); continue; } /** * [DATA:name] * 把之前[name]处理完的数据当作原始数据 * 如果之前有数据会叠加 * 用于数据多次处理 * @var string */ if (strpos($dname, 'DATA:') !== false) { $_dname = str_replace('DATA:', '', $dname); $content_html = $responses[$_dname]; unset($responses[$dname]); } /** * [PRE:name] * 把PRE:name采集到的数据 当做原始数据 * 一般用于下载内容 * @var string */ $pre_dname = 'PRE:' . $dname; if (isset($responses[$pre_dname])) { $content_html = $responses[$pre_dname]; unset($responses[$pre_dname]); } /** * [EMPTY:name] * 如果[name]之前抓取结果数据为空使用这个数据项替换 * @var string */ if (strpos($dname, 'EMPTY:') !== false) { $_dname = str_replace('EMPTY:', '', $dname); if (empty($responses[$_dname])) { $dname = $_dname; } else { //有值不执行抓取 continue; } } $content = spiderContent::crawl($content_html, $data, $rule, $responses); unset($content_html); if (strpos($dname, 'ARRAY:') !== false) { // if(strpos($data['rule'], 'RULE@')!==false){ $dname = str_replace('ARRAY:', '', $dname); // $contentArray = $responses[$dname]; // // $contentArray = $responses[$dname]; $cArray = array(); foreach ((array) $content as $k => $value) { foreach ((array) $value as $key => $val) { $cArray[$key][$k] = $val; } } if ($cArray) { $content = $cArray; unset($cArray); } } /** * [name.xxx] * 采集内容做为数组 */ if (strpos($dname, '.') !== false) { $f_key = substr($dname, 0, stripos($dname, ".")); $s_key = substr(strrchr($dname, "."), 1); if (isset($responses[$f_key][$s_key])) { if (is_array($responses[$f_key][$s_key])) { $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], $content); } else { $responses[$f_key][$s_key] .= $content; } } else { $responses[$f_key][$s_key] = $content; } } else { /** * 多个name 内容合并 */ if (isset($responses[$dname])) { if (is_array($responses[$dname])) { $responses[$dname] = array_merge($responses[$dname], $content); } else { $responses[$dname] .= $content; } } else { $responses[$dname] = $content; } } /** * 对匹配多条的数据去重过滤 */ if (!is_array($responses[$dname]) && $data['multi']) { if (strpos($responses[$dname], ',') !== false) { $_dnameArray = explode(',', $responses[$dname]); $dnameArray = array(); foreach ((array) $_dnameArray as $key => $value) { $value = trim($value); $value && ($dnameArray[] = $value); } $dnameArray = array_filter($dnameArray); $dnameArray = array_unique($dnameArray); $responses[$dname] = implode(',', $dnameArray); unset($dnameArray, $_dnameArray); } } gc_collect_cycles(); } if (isset($responses['title']) && empty($responses['title'])) { $responses['title'] = $responses['__title__']; } spider::$allHtml = null; unset($html); gc_collect_cycles(); if (spider::$dataTest) { echo "<pre style='width:99%;word-wrap: break-word;'>"; print_r(iS::escapeStr($responses)); echo '<hr />'; echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's'; echo "</pre>"; } iFS::$CURLOPT_ENCODING = ''; iFS::$CURLOPT_REFERER = ''; iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos']; iFS::$watermark_config['x'] = iCMS::$config['watermark']['x']; iFS::$watermark_config['y'] = iCMS::$config['watermark']['y']; iFS::$watermark_config['img'] = iCMS::$config['watermark']['img']; $rule['fs']['encoding'] && (iFS::$CURLOPT_ENCODING = $rule['fs']['encoding']); $rule['fs']['referer'] && (iFS::$CURLOPT_REFERER = $rule['fs']['referer']); if ($rule['watermark_mode']) { iFS::$watermark_config['pos'] = $rule['watermark']['pos']; iFS::$watermark_config['x'] = $rule['watermark']['x']; iFS::$watermark_config['y'] = $rule['watermark']['y']; $rule['watermark']['img'] && (iFS::$watermark_config['img'] = $rule['watermark']['img']); } if (spider::$callback['data'] && is_callable(spider::$callback['data'])) { $responses = call_user_func_array(spider::$callback['data'], array($responses)); } return $responses; }
public static function remote($url, $_count = 0) { $url = str_replace('&', '&', $url); if (empty(spider::$referer)) { $uri = parse_url($url); spider::$referer = $uri['scheme'] . '://' . $uri['host']; } $options = array(CURLOPT_URL => $url, CURLOPT_ENCODING => spider::$encoding, CURLOPT_REFERER => spider::$referer, CURLOPT_USERAGENT => spider::$useragent, CURLOPT_TIMEOUT => 10, CURLOPT_CONNECTTIMEOUT => 10, CURLOPT_RETURNTRANSFER => 1, CURLOPT_FAILONERROR => 1, CURLOPT_HEADER => 0, CURLOPT_NOSIGNAL => true, CURLOPT_DNS_USE_GLOBAL_CACHE => true, CURLOPT_DNS_CACHE_TIMEOUT => 86400, CURLOPT_SSL_VERIFYPEER => false, CURLOPT_SSL_VERIFYHOST => false); spider::$cookie && ($options[CURLOPT_COOKIE] = spider::$cookie); if (spider::$curl_proxy) { $proxy = spiderTools::proxy_test(); $proxy && ($options = spiderTools::proxy($options, $proxy)); } $ch = curl_init(); curl_setopt_array($ch, $options); $responses = curl_exec($ch); $info = curl_getinfo($ch); if (spider::$dataTest || spider::$ruleTest) { echo "<b>{$url} 头信息:</b><pre>"; print_r($info); echo '</pre><hr />'; if ($_GET['breakinfo']) { exit; } } if (in_array($info['http_code'], array(301, 302)) && $_count < 5) { $_count++; $newurl = $info['redirect_url']; if (empty($newurl)) { curl_setopt($ch, CURLOPT_HEADER, 1); $header = curl_exec($ch); preg_match('|Location: (.*)|i', $header, $matches); $newurl = ltrim($matches[1], '/'); if (empty($newurl)) { return false; } if (!strstr($newurl, 'http://')) { $host = $uri['scheme'] . '://' . $uri['host']; $newurl = $host . '/' . $newurl; } } $newurl = trim($newurl); curl_close($ch); unset($responses, $info); return spiderTools::remote($newurl, $_count); } if (in_array($info['http_code'], array(404, 500))) { curl_close($ch); unset($responses, $info); return false; } if ((empty($responses) || $info['http_code'] != 200) && $_count < 5) { $_count++; if (spider::$dataTest || spider::$ruleTest) { echo $url . '<br />'; echo "获取内容失败,重试第{$_count}次...<br />"; } curl_close($ch); unset($responses, $info); return spiderTools::remote($url, $_count); } $pos = stripos($info['content_type'], 'charset='); $pos !== false && ($content_charset = trim(substr($info['content_type'], $pos + 8))); $responses = spiderTools::charsetTrans($responses, $content_charset, spider::$charset); curl_close($ch); unset($info); if (spider::$dataTest || spider::$ruleTest) { echo '<pre>'; print_r(htmlspecialchars(substr($responses, 0, 800))); echo '</pre><hr />'; } spider::$url = $url; return $responses; }
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null) { $pid === NULL && ($pid = spider::$pid); if ($pid) { $project = spider::project($pid); $cid = $project['cid']; $rid = $project['rid']; $prule_list_url = $project['list_url']; $lastupdate = $project['lastupdate']; } else { $cid = spider::$cid; $rid = spider::$rid; } if (empty($rid) && $_rid !== NULL) { $rid = $_rid; } if ($work == 'shell') { $lastupdate = $project['lastupdate']; if ($project['psleep']) { if (time() - $lastupdate < $project['psleep']) { echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . $project['psleep'] / 3600 . "小时后在继续采集\n"; return; } } echo "[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "][0m\n"; } $ruleA = spider::rule($rid); $rule = $ruleA['rule']; $urls = $rule['list_urls']; $project['urls'] && ($urls = $project['urls']); spiderUrls::$urls && ($urls = spiderUrls::$urls); $_urls && ($urls = $_urls); $urlsArray = explode("\n", $urls); $urlsArray = array_filter($urlsArray); $_urlsArray = $urlsArray; $urlsList = array(); if ($work == 'shell') { // echo "$urls\n"; print_r($urlsArray); } foreach ($_urlsArray as $_key => $_url) { $_url = htmlspecialchars_decode($_url); $_urlsList = array(); /** * RULE@rid@url * url使用[rid]规则采集并返回列表结果 */ if (strpos($_url, 'RULE@') !== false) { list($___s, $_rid, $_urls) = explode('@', $_url); if (spider::$ruleTest) { print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls); echo "<hr />"; } $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL'); $urlsList = array_merge($urlsList, $_urlsList); unset($urlsArray[$_key]); } else { preg_match('|.*<(.*)>.*|is', $_url, $_matches); if ($_matches) { list($format, $begin, $num, $step, $zeroize, $reverse) = explode(',', $_matches[1]); $url = str_replace($_matches[1], '*', trim($_matches[0])); $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse); unset($urlsArray[$_key]); $urlsList = array_merge($urlsList, $_urlsList); } } } $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList)); unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList); $urlsArray = array_unique($urlsArray); // spider::$useragent = $rule['user_agent']; // spider::$encoding = $rule['curl']['encoding']; // spider::$referer = $rule['curl']['referer']; // spider::$charset = $rule['charset']; if (empty($urlsArray)) { if ($work == 'shell') { echo "采集列表为空!请填写!\n"; return false; } iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();'); } // if(spider::$ruleTest){ // echo "<pre>"; // print_r(iS::escapeStr($project)); // print_r(iS::escapeStr($rule)); // echo "</pre>"; // echo "<hr />"; // } if ($rule['mode'] == "2") { iPHP::import(iPHP_LIB . '/phpQuery.php'); spider::$ruleTest && $_GET['pq_debug'] && (phpQuery::$debug = 1); } $pubArray = array(); $pubCount = array(); $pubAllCount = array(); spider::$curl_proxy = $rule['proxy']; spider::$urlslast = null; foreach ($urlsArray as $key => $url) { $url = trim($url); spider::$urlslast = $url; if ($work == 'shell') { echo '开始采集列表:' . $url . "\n"; } if (spider::$ruleTest) { echo '<b>抓取列表:</b>' . $url . "<br />"; } $html = spiderTools::remote($url); if (empty($html)) { continue; } if ($rule['mode'] == "2") { $doc = phpQuery::newDocumentHTML($html, 'UTF-8'); $list_area = $doc[trim($rule['list_area_rule'])]; // if(strpos($rule['list_area_format'], 'DOM::')!==false){ // $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area); // } if ($rule['list_area_format']) { $list_area_format = trim($rule['list_area_format']); if (strpos($list_area_format, 'ARRAY::') !== false) { $list_area_format = str_replace('ARRAY::', '', $list_area_format); $lists = array(); foreach ($list_area as $la_key => $la) { $lists[] = phpQuery::pq($list_area_format, $la); } } else { $lists = phpQuery::pq($list_area_format, $list_area); } } else { $lists = $list_area; } // $lists = $list_area; //echo 'list:getDocumentID:'.$lists->getDocumentID()."\n"; } else { $list_area_rule = spiderTools::pregTag($rule['list_area_rule']); if ($list_area_rule) { preg_match('|' . $list_area_rule . '|is', $html, $matches, $PREG_SET_ORDER); $list_area = $matches['content']; } else { $list_area = $html; } $html = null; unset($html); if (spider::$ruleTest) { echo iS::escapeStr($rule['list_area_rule']); // echo iS::escapeStr($list_area); echo "<hr />"; } if ($rule['list_area_format']) { $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area); } preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER); $list_area = null; unset($list_area); if ($rule['sort'] == "1") { //arsort($lists); } elseif ($rule['sort'] == "2") { asort($lists); } elseif ($rule['sort'] == "3") { shuffle($lists); } } if (spider::$ruleTest) { echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']); echo "<hr />"; echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area); echo "<hr />"; echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']); echo "<hr />"; echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']); echo "<hr />"; } if ($prule_list_url) { $rule['list_url'] = $prule_list_url; } //PID@xx 返回URL列表 if ($callback == 'CALLBACK@URL') { $cbListUrl = array(); foreach ($lists as $lkey => $row) { list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url); if (spider::$url === false) { continue; } // if(spider::checker($work)===true){ $cbListUrl[] = spider::$url; // } } return $cbListUrl; } if ($work == "shell") { $pubCount[$url]['count'] = count($lists); $pubAllCount['count'] += $pubCount[$url]['count']; echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n"; foreach ($lists as $lkey => $row) { list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url); if (spider::$url === false) { continue; } $hash = md5(spider::$url); echo "title:" . spider::$title . "\n"; echo "url:" . spider::$url . "\n"; spider::$rid = $rid; $checker = spider::checker($work); if ($checker === true) { echo "开始采集...."; $callback = spider::publish("shell"); if ($callback['code'] == "1001") { $pubCount[$url]['success']++; $pubAllCount['success']++; echo "....√\n"; if ($project['sleep']) { echo "sleep:" . $project['sleep'] . "s\n"; if ($rule['mode'] != "2") { unset($lists[$lkey]); } gc_collect_cycles(); sleep($project['sleep']); } else { //sleep(1); } } else { $pubCount[$url]['error']++; $pubAllCount['error']++; echo "error\n\n"; continue; } } $pubCount[$url]['published']++; $pubAllCount['published']++; } if ($rule['mode'] == "2") { phpQuery::unloadDocuments($doc->getDocumentID()); } else { unset($lists); } } if ($work == "WEB@MANUAL") { $listsArray[$url] = $lists; } if ($work == "WEB@AUTO" || $work == 'DATA@RULE') { foreach ($lists as $lkey => $row) { list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url); if (spider::$url === false) { continue; } $hash = md5(spider::$url); if (spider::$ruleTest) { echo '<b>列表抓取结果:</b>' . $lkey . '<br />'; echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />'; echo spider::$url . "<br />"; echo $hash . "<br /><hr />"; } else { if (spider::checker($work) === true || spider::$dataTest) { $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash); switch ($work) { case 'DATA@RULE': $contentArray[$lkey] = spiderData::crawl(); // $contentArray[$lkey] = spiderUrls::crawl($work,$_pid); unset($suData['sid']); $suData['title'] = addslashes($suData['title']); $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0'); spider::$dataTest or $suid = iDB::insert('spider_url', $suData); $contentArray[$lkey]['spider_url'] = $suid; break; case 'WEB@AUTO': $pubArray[] = $suData; break; } } } } } } $lists = null; unset($lists); gc_collect_cycles(); switch ($work) { case 'WEB@AUTO': return $pubArray; break; case 'DATA@RULE': return $contentArray; break; case 'WEB@MANUAL': return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray); break; case "shell": echo "采集数据统结果:\n"; print_r($pubCount); print_r($pubAllCount); echo "全部采集完成....\n"; iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid)); break; } }