/**
 * Fetch one page and extract every data item configured in its spider rule.
 *
 * The job (rule id, project id, title, URL) is loaded from the spider_url
 * table when spider::$sid is set; otherwise it comes from the static
 * spider::$rid/$pid/$title/$url values, each of which may be overridden by
 * the matching non-null argument.
 *
 * Item names may carry a processing prefix:
 *  - "UNSET:name" removes the result stored under "name" and extracts nothing.
 *  - "DATA:name"  uses the result stored under "name" as input instead of the page HTML.
 *  - "PRE:name"   results are consumed as input by the later item called "name".
 *  - "EMPTY:name" runs only when "name" is still empty and then stores into "name".
 *  - "ARRAY:name" transposes the extracted rows/columns and stores into "name".
 *  - "a.b"        stores the result in $responses['a']['b'].
 *
 * Side effects: writes spider::$curl_proxy and spider::$allHtml, resets the
 * iFS cURL encoding/referer and watermark settings from the rule, and echoes
 * debug output when spider::$dataTest is set.
 *
 * @param mixed $_pid   Project id override (ignored when spider::$sid is set).
 * @param mixed $_rid   Rule id override (ignored when spider::$sid is set).
 * @param mixed $_url   URL override (ignored when spider::$sid is set).
 * @param mixed $_title Title override (ignored when spider::$sid is set).
 * @return array|false  Extracted data keyed by item name (passed through the
 *                      optional spider::$callback['data'] hook), or false when
 *                      the page is empty in shell mode.
 */
public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL) {
    ini_get('safe_mode') or set_time_limit(0);

    $sid = spider::$sid;
    if ($sid) {
        // NOTE(review): $sid is interpolated into the SQL text; confirm callers
        // always supply a trusted/integer id.
        $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
        $title = $sRs->title;
        $pid = $sRs->pid;
        $url = $sRs->url;
        $rid = $sRs->rid;
    } else {
        $rid = spider::$rid;
        $pid = spider::$pid;
        $title = spider::$title;
        $url = spider::$url;
        $_rid === NULL or $rid = $_rid;
        $_pid === NULL or $pid = $_pid;
        $_title === NULL or $title = $_title;
        $_url === NULL or $url = $_url;
    }

    // Defined up front so the debug dump and the list_url override below never
    // read an undefined variable when there is no project.
    $project = null;
    $prule_list_url = null;
    if ($pid) {
        $project = spider::project($pid);
        $prule_list_url = $project['list_url'];
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];
    $dataArray = $rule['data'];
    if ($prule_list_url) {
        // The project's list URL takes precedence over the rule's own.
        $rule['list_url'] = $prule_list_url;
    }

    if (spider::$dataTest) {
        echo "<b>抓取规则信息</b><pre>";
        print_r(iS::escapeStr($ruleA));
        print_r(iS::escapeStr($project));
        echo "</pre><hr />";
    }

    spider::$curl_proxy = $rule['proxy'];
    $responses = array();
    $html = spiderTools::remote($url);
    if (empty($html)) {
        $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
        if (spider::$work == 'shell') {
            echo "{$msg}\n";
            return false;
        } else {
            // NOTE(review): presumably iPHP::alert() ends the request; if it
            // returns, extraction below runs against empty HTML - confirm.
            iPHP::alert($msg);
        }
    }

    spider::$allHtml = "";
    $rule['__url__'] = spider::$url;
    $responses['reurl'] = spider::$url;
    $responses['__title__'] = $title;

    foreach ((array) $dataArray as $data) {
        $content_html = $html;
        $dname = $data['name'];

        // [UNSET:name] - drop the result collected for [name].
        if (strpos($dname, 'UNSET:') !== false) {
            $_dname = str_replace('UNSET:', '', $dname);
            unset($responses[$_dname]);
            continue;
        }

        // [DATA:name] - feed the already-processed [name] result back in as
        // the raw input, which allows multi-pass processing. The result is
        // stored under the prefixed key; any earlier value under that same
        // prefixed key is discarded first.
        if (strpos($dname, 'DATA:') !== false) {
            $_dname = str_replace('DATA:', '', $dname);
            $content_html = $responses[$_dname];
            unset($responses[$dname]);
        }

        // [PRE:name] - data collected by an earlier "PRE:name" item becomes
        // the raw input for this item (typically downloaded content).
        $pre_dname = 'PRE:' . $dname;
        if (isset($responses[$pre_dname])) {
            $content_html = $responses[$pre_dname];
            unset($responses[$pre_dname]);
        }

        // [EMPTY:name] - fallback item: only runs while [name] is still empty.
        if (strpos($dname, 'EMPTY:') !== false) {
            $_dname = str_replace('EMPTY:', '', $dname);
            if (empty($responses[$_dname])) {
                $dname = $_dname;
            } else {
                // A value already exists, so skip this extraction.
                continue;
            }
        }

        $content = spiderContent::crawl($content_html, $data, $rule, $responses);
        unset($content_html);

        // [ARRAY:name] - transpose the result so $content[$field][$row] holds
        // what was $content[$row][$field].
        if (strpos($dname, 'ARRAY:') !== false) {
            $dname = str_replace('ARRAY:', '', $dname);
            $cArray = array();
            foreach ((array) $content as $row => $value) {
                foreach ((array) $value as $field => $val) {
                    $cArray[$field][$row] = $val;
                }
            }
            if ($cArray) {
                $content = $cArray;
                unset($cArray);
            }
        }

        if (strpos($dname, '.') !== false) {
            // [name.xxx] - store as a nested array entry: text before the
            // first dot is the outer key, text after the last dot the inner key.
            $f_key = substr($dname, 0, strpos($dname, '.'));
            $s_key = substr(strrchr($dname, '.'), 1);
            if (isset($responses[$f_key][$s_key])) {
                if (is_array($responses[$f_key][$s_key])) {
                    // Cast: array_merge() rejects non-array arguments.
                    $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], (array) $content);
                } else {
                    $responses[$f_key][$s_key] .= $content;
                }
            } else {
                $responses[$f_key][$s_key] = $content;
            }
        } else {
            // Several items sharing one name are merged/concatenated.
            if (isset($responses[$dname])) {
                if (is_array($responses[$dname])) {
                    $responses[$dname] = array_merge($responses[$dname], (array) $content);
                } else {
                    $responses[$dname] .= $content;
                }
            } else {
                $responses[$dname] = $content;
            }
        }

        // De-duplicate comma-separated multi-match results. The isset() guard
        // matters for dotted names, which are never stored under $dname itself.
        if (isset($responses[$dname]) && !is_array($responses[$dname]) && !empty($data['multi'])) {
            if (strpos($responses[$dname], ',') !== false) {
                $parts = array_map('trim', explode(',', $responses[$dname]));
                $parts = array_unique(array_filter($parts));
                $responses[$dname] = implode(',', $parts);
                unset($parts);
            }
        }
        gc_collect_cycles();
    }

    // An extracted-but-empty title falls back to the job title.
    if (isset($responses['title']) && empty($responses['title'])) {
        $responses['title'] = $responses['__title__'];
    }

    spider::$allHtml = null;
    unset($html);
    gc_collect_cycles();

    if (spider::$dataTest) {
        echo "<pre style='width:99%;word-wrap: break-word;'>";
        print_r(iS::escapeStr($responses));
        echo '<hr />';
        echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
        echo "</pre>";
    }

    // Reset the file-system helper to the site defaults, then apply the
    // rule-specific overrides.
    iFS::$CURLOPT_ENCODING = '';
    iFS::$CURLOPT_REFERER = '';
    iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
    iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
    iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
    iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];

    if (!empty($rule['fs']['encoding'])) {
        iFS::$CURLOPT_ENCODING = $rule['fs']['encoding'];
    }
    if (!empty($rule['fs']['referer'])) {
        iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
    }
    if (!empty($rule['watermark_mode'])) {
        iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
        iFS::$watermark_config['x'] = $rule['watermark']['x'];
        iFS::$watermark_config['y'] = $rule['watermark']['y'];
        // The image is only overridden when the rule provides one.
        if (!empty($rule['watermark']['img'])) {
            iFS::$watermark_config['img'] = $rule['watermark']['img'];
        }
    }

    if (!empty(spider::$callback['data']) && is_callable(spider::$callback['data'])) {
        $responses = call_user_func_array(spider::$callback['data'], array($responses));
    }
    return $responses;
}
/**
 * Download the body of a URL.
 *
 * Uses cURL when the extension is loaded: 301/302 redirects are followed
 * manually and failed/empty responses are retried, both bounded by
 * self::$CURL_COUNT. Without cURL it falls back to fopen()/file_get_contents().
 * Diagnostic messages are echoed on failure.
 *
 * Side effect: when self::$CURLOPT_REFERER is null it is set to the
 * scheme://host of $url and stays set for later calls.
 *
 * @param string $url    Address to fetch.
 * @param int    $_count Attempts/redirects already consumed (internal recursion counter).
 * @return string|false  Response body, or false on failure (404/500, empty
 *                       URL, missing redirect target, retries exhausted).
 */
public static function remote($url, $_count = 0) {
    if (function_exists('curl_init')) {
        if (empty($url)) {
            echo 'remote:(' . $_count . ')' . $url . "\n";
            echo "url:empty\n";
            return false;
        }

        // Parsed unconditionally: the redirect branch needs scheme/host even
        // when a referer has already been configured.
        $uri = parse_url($url);
        $scheme = isset($uri['scheme']) ? $uri['scheme'] : '';
        $host = isset($uri['host']) ? $uri['host'] : '';
        if (self::$CURLOPT_REFERER === null) {
            self::$CURLOPT_REFERER = $scheme . '://' . $host;
        }

        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_REFERER => self::$CURLOPT_REFERER,
            CURLOPT_USERAGENT => self::$CURLOPT_USERAGENT,
            CURLOPT_ENCODING => self::$CURLOPT_ENCODING,
            CURLOPT_TIMEOUT => self::$CURLOPT_TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => self::$CURLOPT_CONNECTTIMEOUT,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FAILONERROR => 0,
            CURLOPT_HEADER => 0,
            CURLOPT_NOSIGNAL => true,
            CURLOPT_DNS_USE_GLOBAL_CACHE => true,
            CURLOPT_DNS_CACHE_TIMEOUT => 86400,
            // NOTE(review): TLS peer/host verification is disabled here.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
        );
        if (self::$CURL_PROXY) {
            $proxy = self::proxy_test();
            $proxy && ($options = self::proxy($options, $proxy));
        }

        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $responses = curl_exec($ch);
        $info = curl_getinfo($ch);
        $errno = curl_errno($ch);
        $http_code = $info['http_code'];

        if ($http_code == 404 || $http_code == 500) {
            curl_close($ch);
            echo $url . "\n";
            echo "http_code:" . $http_code . "\n";
            unset($responses, $info);
            return false;
        }

        if (($http_code == 301 || $http_code == 302) && $_count < self::$CURL_COUNT) {
            $newurl = isset($info['redirect_url']) ? $info['redirect_url'] : '';
            if (empty($newurl)) {
                // cURL did not report the target: repeat the request with
                // headers included and read the Location header ourselves.
                curl_setopt($ch, CURLOPT_HEADER, 1);
                $header = curl_exec($ch);
                $newurl = '';
                if (is_string($header) && preg_match('|Location: (.*)|i', $header, $matches)) {
                    $newurl = ltrim(trim($matches[1]), '/');
                }
                if (empty($newurl)) {
                    // Release the handle before bailing out.
                    curl_close($ch);
                    return false;
                }
                // Anything without an explicit http(s) scheme is treated as a
                // path on the original host (keeping a non-default port).
                if (!preg_match('#^https?://#i', $newurl)) {
                    $port = isset($uri['port']) ? ':' . $uri['port'] : '';
                    $newurl = $scheme . '://' . $host . $port . '/' . $newurl;
                }
            }
            $newurl = trim($newurl);
            curl_close($ch);
            unset($responses, $info);
            $_count++;
            return self::remote($newurl, $_count);
        }

        if ($errno > 0 || empty($responses) || empty($http_code)) {
            if ($_count < self::$CURL_COUNT) {
                // Transient failure or empty body: try again.
                $_count++;
                curl_close($ch);
                unset($responses, $info);
                return self::remote($url, $_count);
            } else {
                $curl_error = curl_error($ch);
                curl_close($ch);
                unset($responses, $info);
                echo $url . " remote:{$_count}\n";
                echo "cURL Error ({$errno}): {$curl_error}\n";
                return false;
            }
        }
        curl_close($ch);
    } elseif (ini_get('allow_url_fopen') && ($handle = fopen($url, 'rb'))) {
        if (function_exists('stream_get_contents')) {
            $responses = stream_get_contents($handle);
        } else {
            // Start from an empty string so the first append is well defined.
            $responses = '';
            while (!feof($handle) && connection_status() == 0) {
                $responses .= fread($handle, 8192);
            }
        }
        fclose($handle);
    } else {
        // NOTE(review): urlencode() on a whole URL escapes "://" and "/", so
        // this last-resort call presumably never reaches the remote host -
        // confirm the intent before changing it.
        $responses = file_get_contents(urlencode($url));
    }
    return $responses;
}
/**
 * Fetch one page and extract every data item configured in its spider rule
 * (instance-based variant).
 *
 * The job (rule id, project id, title, URL) is loaded from the spider_url
 * table when $this->sid is set, otherwise it is taken from
 * $this->rid/pid/title/url.
 *
 * Item names may carry a prefix: "DATA:name" re-processes the stored result
 * of "name" (which is removed first); results stored under "PRE:name" are
 * consumed as input by the later item called "name"; "a.b" stores the result
 * in $responses['a']['b'].
 *
 * Side effects: writes $this->curl_proxy, unsets $this->allHtml, updates the
 * iFS cURL encoding/referer and watermark settings from the rule, and echoes
 * debug output when $this->contTest is set.
 *
 * @return array|false Extracted data keyed by item name, or false when the
 *                     page is empty in shell mode.
 */
function spider_content() {
    ini_get('safe_mode') or set_time_limit(0);

    $sid = $this->sid;
    if ($sid) {
        // NOTE(review): $sid is interpolated into the SQL text; confirm it is
        // always a trusted/integer id.
        $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
        $title = $sRs->title;
        $pid = $sRs->pid;
        $url = $sRs->url;
        $rid = $sRs->rid;
    } else {
        $rid = $this->rid;
        $pid = $this->pid;
        $title = $this->title;
        $url = $this->url;
    }

    // Defined up front so later reads are safe when there is no project.
    $project = null;
    $prule_list_url = null;
    if ($pid) {
        $project = $this->project($pid);
        $prule_list_url = $project['list_url'];
    }

    $ruleA = $this->rule($rid);
    $rule = $ruleA['rule'];
    $dataArray = $rule['data'];
    if ($prule_list_url) {
        // The project's list URL takes precedence over the rule's own.
        $rule['list_url'] = $prule_list_url;
    }

    if ($this->contTest) {
        echo "<pre>";
        print_r(iS::escapeStr($ruleA));
        // Dump the project itself (a variable-variable lookup was used here
        // before, which never resolved to the project array).
        print_r(iS::escapeStr($project));
        echo "</pre><hr />";
    }

    $this->curl_proxy = $rule['proxy'];
    $responses = array();
    $html = $this->remote($url);
    if (empty($html)) {
        if ($this->work == 'shell') {
            echo '错误:001..采集 ' . $url . "文件内容为空!请检查采集规则\n";
            return false;
        } else {
            // NOTE(review): presumably iPHP::alert() ends the request; if it
            // returns, extraction below runs against empty HTML - confirm.
            iPHP::alert('错误:001..采集 ' . $url . ' 文件内容为空!请检查采集规则');
        }
    }

    $this->allHtml = "";
    $responses['reurl'] = $this->url;
    $rule['__url__'] = $this->url;

    foreach ((array) $dataArray as $data) {
        $content_html = $html;
        $dname = $data['name'];

        if (strpos($dname, 'DATA:') !== false) {
            // Re-process the stored result of [name]; the old value is removed
            // so the new result replaces it.
            $dname = str_replace('DATA:', '', $dname);
            $content_html = $responses[$dname];
            unset($responses[$dname]);
        } else {
            // Data collected by an earlier "PRE:name" item becomes the input.
            $url_dkey = 'PRE:' . $dname;
            if (isset($responses[$url_dkey])) {
                $content_html = $responses[$url_dkey];
                unset($responses[$url_dkey]);
            }
        }

        $content = $this->content($content_html, $data, $rule);
        unset($content_html);

        if (strpos($dname, '.') !== false) {
            // "a.b" names: text before the first dot is the outer key, text
            // after the last dot the inner key.
            $f_key = substr($dname, 0, strpos($dname, '.'));
            $s_key = substr(strrchr($dname, '.'), 1);
            if (isset($responses[$f_key][$s_key])) {
                if (is_array($responses[$f_key][$s_key])) {
                    // Cast: array_merge() rejects non-array arguments.
                    $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], (array) $content);
                } else {
                    $responses[$f_key][$s_key] .= $content;
                }
            } else {
                $responses[$f_key][$s_key] = $content;
            }
        } else {
            // Several items sharing one name are merged/concatenated.
            if (isset($responses[$dname])) {
                if (is_array($responses[$dname])) {
                    $responses[$dname] = array_merge($responses[$dname], (array) $content);
                } else {
                    $responses[$dname] .= $content;
                }
            } else {
                $responses[$dname] = $content;
            }
        }
        gc_collect_cycles();
    }

    // Fall back to the job title when no title was extracted; an explicit
    // false is left untouched.
    $extractedTitle = isset($responses['title']) ? $responses['title'] : null;
    if (empty($extractedTitle) && $extractedTitle !== false) {
        $responses['title'] = $title;
    }

    unset($this->allHtml, $html);
    gc_collect_cycles();

    if ($this->contTest) {
        echo "<pre style='width:99%;word-wrap: break-word;'>";
        print_r(iS::escapeStr($responses));
        echo "</pre><hr />";
    }

    // The encoding is always overwritten (null when the rule has none); the
    // referer and watermark image are only overridden when provided.
    iFS::$CURLOPT_ENCODING = isset($rule['fs']['encoding']) ? $rule['fs']['encoding'] : null;
    if (!empty($rule['fs']['referer'])) {
        iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
    }
    if (!empty($rule['watermark_mode'])) {
        iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
        iFS::$watermark_config['x'] = $rule['watermark']['x'];
        iFS::$watermark_config['y'] = $rule['watermark']['y'];
        if (!empty($rule['watermark']['img'])) {
            iFS::$watermark_config['img'] = $rule['watermark']['img'];
        }
    }
    return $responses;
}