Пример #1
0
 /**
  * Fetch one page and extract every data item described by the spider rule.
  *
  * The page is identified either by the stored spider-url row referenced by
  * spider::$sid or, when no sid is set, by the static spider::$rid / $pid /
  * $title / $url values, each of which is overridden by the matching
  * non-NULL argument.
  *
  * Side effects visible in this method: sets spider::$curl_proxy and
  * spider::$allHtml, configures the iFS cURL encoding/referer and watermark
  * settings from the rule, and prints debug output when spider::$dataTest
  * is truthy.
  *
  * @param mixed $_pid   Project id override (only used when spider::$sid is empty).
  * @param mixed $_rid   Rule id override (only used when spider::$sid is empty).
  * @param mixed $_url   Page URL override (only used when spider::$sid is empty).
  * @param mixed $_title Title override (only used when spider::$sid is empty).
  * @return mixed The collected data array (passed through the
  *               spider::$callback['data'] callback when one is callable),
  *               or false in shell mode when the fetched page is empty.
  */
 public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL)
 {
     ini_get('safe_mode') or set_time_limit(0);
     $sid = spider::$sid;
     if ($sid) {
         // NOTE(review): $sid is interpolated into the SQL text as-is; confirm
         // it is always a trusted/validated id before this point.
         $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
         $title = $sRs->title;
         $pid = $sRs->pid;
         $url = $sRs->url;
         $rid = $sRs->rid;
     } else {
         $rid = spider::$rid;
         $pid = spider::$pid;
         $title = spider::$title;
         $url = spider::$url;
         if ($_rid !== NULL) {
             $rid = $_rid;
         }
         if ($_pid !== NULL) {
             $pid = $_pid;
         }
         if ($_title !== NULL) {
             $title = $_title;
         }
         if ($_url !== NULL) {
             $url = $_url;
         }
     }
     // Both are read further down even when there is no project id, so give
     // them a defined value up front.
     $project = null;
     $prule_list_url = null;
     if ($pid) {
         $project = spider::project($pid);
         $prule_list_url = $project['list_url'];
     }
     $ruleA = spider::rule($rid);
     $rule = $ruleA['rule'];
     $dataArray = $rule['data'];
     // A list URL defined on the project takes precedence over the rule's own.
     if ($prule_list_url) {
         $rule['list_url'] = $prule_list_url;
     }
     if (spider::$dataTest) {
         echo "<b>抓取规则信息</b><pre>";
         print_r(iS::escapeStr($ruleA));
         print_r(iS::escapeStr($project));
         echo "</pre><hr />";
     }
     spider::$curl_proxy = $rule['proxy'];
     $responses = array();
     $html = spiderTools::remote($url);
     if (empty($html)) {
         $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
         if (spider::$work == 'shell') {
             echo "{$msg}\n";
             return false;
         } else {
             iPHP::alert($msg);
         }
     }
     spider::$allHtml = "";
     $rule['__url__'] = spider::$url;
     $responses['reurl'] = spider::$url;
     $responses['__title__'] = $title;
     foreach ((array) $dataArray as $data) {
         $content_html = $html;
         $dname = $data['name'];
         /**
          * [UNSET:name]
          * Drop whatever was collected under [name] and move on.
          */
         if (strpos($dname, 'UNSET:') !== false) {
             $_dname = str_replace('UNSET:', '', $dname);
             unset($responses[$_dname]);
             continue;
         }
         /**
          * [DATA:name]
          * Use the value already collected under [name] as the source text
          * for this item (multi-pass processing).
          */
         if (strpos($dname, 'DATA:') !== false) {
             $_dname = str_replace('DATA:', '', $dname);
             $content_html = $responses[$_dname];
             unset($responses[$dname]);
         }
         /**
          * [PRE:name]
          * When an earlier item stored data under "PRE:<name>", that data
          * becomes the source text for this item and the PRE entry is removed.
          */
         $pre_dname = 'PRE:' . $dname;
         if (isset($responses[$pre_dname])) {
             $content_html = $responses[$pre_dname];
             unset($responses[$pre_dname]);
         }
         /**
          * [EMPTY:name]
          * Fallback item: only runs when nothing has been collected under
          * [name] yet, and then stores its result under [name].
          */
         if (strpos($dname, 'EMPTY:') !== false) {
             $_dname = str_replace('EMPTY:', '', $dname);
             if (empty($responses[$_dname])) {
                 $dname = $_dname;
             } else {
                 // A value already exists, so skip this item.
                 continue;
             }
         }
         $content = spiderContent::crawl($content_html, $data, $rule, $responses);
         unset($content_html);
         /**
          * [ARRAY:name]
          * Transpose the two-level result so it is indexed [field][row]
          * instead of [row][field].
          */
         if (strpos($dname, 'ARRAY:') !== false) {
             $dname = str_replace('ARRAY:', '', $dname);
             $cArray = array();
             foreach ((array) $content as $row => $fields) {
                 foreach ((array) $fields as $field => $val) {
                     $cArray[$field][$row] = $val;
                 }
             }
             if ($cArray) {
                 $content = $cArray;
                 unset($cArray);
             }
         }
         /**
          * [name.xxx]
          * Store the content as a nested entry: the part before the first
          * dot is the outer key, the part after the last dot the inner key.
          */
         if (strpos($dname, '.') !== false) {
             $f_key = substr($dname, 0, strpos($dname, "."));
             $s_key = substr(strrchr($dname, "."), 1);
             if (isset($responses[$f_key][$s_key])) {
                 if (is_array($responses[$f_key][$s_key])) {
                     $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], $content);
                 } else {
                     $responses[$f_key][$s_key] .= $content;
                 }
             } else {
                 $responses[$f_key][$s_key] = $content;
             }
         } else {
             // Several items sharing one name are merged (arrays) or
             // concatenated (strings).
             if (isset($responses[$dname])) {
                 if (is_array($responses[$dname])) {
                     $responses[$dname] = array_merge($responses[$dname], $content);
                 } else {
                     $responses[$dname] .= $content;
                 }
             } else {
                 $responses[$dname] = $content;
             }
         }
         /**
          * De-duplicate comma-separated multi-match results.
          * The isset() guard matters for dotted names, which are stored
          * under [$f_key][$s_key] and therefore have no $responses[$dname].
          */
         if (isset($responses[$dname]) && !is_array($responses[$dname]) && !empty($data['multi'])) {
             if (strpos($responses[$dname], ',') !== false) {
                 $unique = array();
                 foreach (explode(',', $responses[$dname]) as $piece) {
                     $piece = trim($piece);
                     $piece && ($unique[] = $piece);
                 }
                 $responses[$dname] = implode(',', array_unique($unique));
                 unset($unique);
             }
         }
         gc_collect_cycles();
     }
     // A title item that ran but produced nothing falls back to the known title.
     if (isset($responses['title']) && empty($responses['title'])) {
         $responses['title'] = $responses['__title__'];
     }
     spider::$allHtml = null;
     unset($html);
     gc_collect_cycles();
     if (spider::$dataTest) {
         echo "<pre style='width:99%;word-wrap: break-word;'>";
         print_r(iS::escapeStr($responses));
         echo '<hr />';
         echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
         echo "</pre>";
     }
     // Reset the file-system helper to the site defaults, then apply the
     // rule-specific overrides.
     iFS::$CURLOPT_ENCODING = '';
     iFS::$CURLOPT_REFERER = '';
     iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
     iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
     iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
     iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];
     if (!empty($rule['fs']['encoding'])) {
         iFS::$CURLOPT_ENCODING = $rule['fs']['encoding'];
     }
     if (!empty($rule['fs']['referer'])) {
         iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
     }
     if (!empty($rule['watermark_mode'])) {
         iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
         iFS::$watermark_config['x'] = $rule['watermark']['x'];
         iFS::$watermark_config['y'] = $rule['watermark']['y'];
         if (!empty($rule['watermark']['img'])) {
             iFS::$watermark_config['img'] = $rule['watermark']['img'];
         }
     }
     if (!empty(spider::$callback['data']) && is_callable(spider::$callback['data'])) {
         $responses = call_user_func_array(spider::$callback['data'], array($responses));
     }
     return $responses;
 }
Пример #2
0
 /**
  * Download the body of a URL.
  *
  * Uses cURL when available; otherwise falls back to fopen() (when
  * allow_url_fopen is enabled) and finally to file_get_contents().
  * In the cURL path, 301/302 responses are followed and failed or empty
  * responses are retried, both bounded by self::$CURL_COUNT. Diagnostics
  * are echoed on failure.
  *
  * @param string $url    URL to fetch.
  * @param int    $_count Number of attempts/redirects already made; callers
  *                       normally leave this at its default.
  * @return mixed The response body, or false when the URL is empty, the
  *               server answers 404/500, a redirect target cannot be
  *               determined, or all retries fail.
  */
 public static function remote($url, $_count = 0)
 {
     // An empty URL cannot be fetched by any of the transports below.
     if (empty($url)) {
         echo 'remote:(' . $_count . ')' . $url . "\n";
         echo "url:empty\n";
         return false;
     }
     if (function_exists('curl_init')) {
         // Always derive the origin: it seeds the default referer and is
         // also needed to resolve relative redirect targets, even when a
         // referer has already been configured.
         $uri = parse_url($url);
         $origin = '';
         if (is_array($uri) && isset($uri['scheme'], $uri['host'])) {
             $origin = $uri['scheme'] . '://' . $uri['host'];
         }
         if (self::$CURLOPT_REFERER === null) {
             self::$CURLOPT_REFERER = $origin;
         }
         $options = array(
             CURLOPT_URL => $url,
             CURLOPT_REFERER => self::$CURLOPT_REFERER,
             CURLOPT_USERAGENT => self::$CURLOPT_USERAGENT,
             CURLOPT_ENCODING => self::$CURLOPT_ENCODING,
             CURLOPT_TIMEOUT => self::$CURLOPT_TIMEOUT,
             CURLOPT_CONNECTTIMEOUT => self::$CURLOPT_CONNECTTIMEOUT,
             CURLOPT_RETURNTRANSFER => 1,
             CURLOPT_FAILONERROR => 0,
             CURLOPT_HEADER => 0,
             CURLOPT_NOSIGNAL => true,
             CURLOPT_DNS_USE_GLOBAL_CACHE => true,
             CURLOPT_DNS_CACHE_TIMEOUT => 86400,
             CURLOPT_SSL_VERIFYPEER => false,
             CURLOPT_SSL_VERIFYHOST => false,
         );
         if (self::$CURL_PROXY) {
             $proxy = self::proxy_test();
             $proxy && ($options = self::proxy($options, $proxy));
         }
         $ch = curl_init();
         curl_setopt_array($ch, $options);
         $responses = curl_exec($ch);
         $info = curl_getinfo($ch);
         $errno = curl_errno($ch);
         if ($info['http_code'] == 404 || $info['http_code'] == 500) {
             curl_close($ch);
             echo $url . "\n";
             echo "http_code:" . $info['http_code'] . "\n";
             unset($responses, $info);
             return false;
         }
         if (($info['http_code'] == 301 || $info['http_code'] == 302) && $_count < self::$CURL_COUNT) {
             $newurl = isset($info['redirect_url']) ? $info['redirect_url'] : '';
             if (empty($newurl)) {
                 // cURL did not report the target, so request the response
                 // again with headers included and read Location ourselves.
                 curl_setopt($ch, CURLOPT_HEADER, 1);
                 $header = curl_exec($ch);
                 $newurl = '';
                 if (is_string($header) && preg_match('|Location: (.*)|i', $header, $matches)) {
                     // trim() first: the capture keeps the trailing "\r".
                     $newurl = ltrim(trim($matches[1]), '/');
                 }
                 if (empty($newurl)) {
                     curl_close($ch);
                     return false;
                 }
                 // Anything that is not an absolute http(s) URL is resolved
                 // against the origin of the current request.
                 if (!preg_match('#^https?://#i', $newurl)) {
                     $newurl = $origin . '/' . $newurl;
                 }
             }
             $newurl = trim($newurl);
             curl_close($ch);
             unset($responses, $info);
             $_count++;
             return self::remote($newurl, $_count);
         }
         if ($errno > 0 || empty($responses) || empty($info['http_code'])) {
             if ($_count < self::$CURL_COUNT) {
                 $_count++;
                 curl_close($ch);
                 unset($responses, $info);
                 return self::remote($url, $_count);
             } else {
                 $curl_error = curl_error($ch);
                 curl_close($ch);
                 unset($responses, $info);
                 echo $url . " remote:{$_count}\n";
                 echo "cURL Error ({$errno}): {$curl_error}\n";
                 return false;
             }
         }
         curl_close($ch);
     } elseif (ini_get('allow_url_fopen') && ($handle = fopen($url, 'rb'))) {
         if (function_exists('stream_get_contents')) {
             $responses = stream_get_contents($handle);
         } else {
             $responses = '';
             while (!feof($handle) && connection_status() == 0) {
                 $responses .= fread($handle, 8192);
             }
         }
         fclose($handle);
     } else {
         // Pass the URL unchanged: urlencode() on a whole URL also encodes
         // "://" and "/", which turns it into a non-existent local filename.
         $responses = file_get_contents($url);
     }
     return $responses;
 }
Пример #3
0
 /**
  * Fetch one page and extract every data item described by the spider rule.
  *
  * The page is identified by the stored spider-url row referenced by
  * $this->sid or, when no sid is set, by $this->rid / pid / title / url.
  *
  * Side effects visible in this method: sets $this->curl_proxy, resets and
  * then unsets $this->allHtml, configures the iFS cURL encoding/referer and
  * watermark settings from the rule, and prints debug output when
  * $this->contTest is truthy.
  *
  * @return mixed The collected data array, or false in shell mode when the
  *               fetched page is empty.
  */
 function spider_content()
 {
     ini_get('safe_mode') or set_time_limit(0);
     $sid = $this->sid;
     if ($sid) {
         // NOTE(review): $sid is interpolated into the SQL text as-is; confirm
         // it is always a trusted/validated id before this point.
         $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
         $title = $sRs->title;
         $pid = $sRs->pid;
         $url = $sRs->url;
         $rid = $sRs->rid;
     } else {
         $rid = $this->rid;
         $pid = $this->pid;
         $title = $this->title;
         $url = $this->url;
     }
     // Both are read further down even when there is no project id, so give
     // them a defined value up front.
     $project = null;
     $prule_list_url = null;
     if ($pid) {
         $project = $this->project($pid);
         $prule_list_url = $project['list_url'];
     }
     $ruleA = $this->rule($rid);
     $rule = $ruleA['rule'];
     $dataArray = $rule['data'];
     // A list URL defined on the project takes precedence over the rule's own.
     if ($prule_list_url) {
         $rule['list_url'] = $prule_list_url;
     }
     if ($this->contTest) {
         echo "<pre>";
         print_r(iS::escapeStr($ruleA));
         // Dump the project itself; the previous `${$project}` was a
         // variable-variable lookup that used the array as a variable name.
         print_r(iS::escapeStr($project));
         echo "</pre><hr />";
     }
     $this->curl_proxy = $rule['proxy'];
     $responses = array();
     $html = $this->remote($url);
     if (empty($html)) {
         if ($this->work == 'shell') {
             echo '错误:001..采集 ' . $url . "文件内容为空!请检查采集规则\n";
             return false;
         } else {
             iPHP::alert('错误:001..采集 ' . $url . ' 文件内容为空!请检查采集规则');
         }
     }
     $this->allHtml = "";
     $responses['reurl'] = $this->url;
     $rule['__url__'] = $this->url;
     foreach ((array) $dataArray as $data) {
         $content_html = $html;
         $dname = $data['name'];
         if (strpos($dname, 'DATA:') !== false) {
             // [DATA:name]: take the value already collected under [name] as
             // the source text, remove it, and store the new result under [name].
             $dname = str_replace('DATA:', '', $dname);
             $content_html = $responses[$dname];
             unset($responses[$dname]);
         } else {
             // [PRE:name]: data stored earlier under "PRE:<name>" becomes the
             // source text for this item and the PRE entry is removed.
             $url_dkey = 'PRE:' . $dname;
             if (isset($responses[$url_dkey])) {
                 $content_html = $responses[$url_dkey];
                 unset($responses[$url_dkey]);
             }
         }
         $content = $this->content($content_html, $data, $rule);
         unset($content_html);
         if (strpos($dname, '.') !== false) {
             // [name.xxx]: store as a nested entry; the part before the first
             // dot is the outer key, the part after the last dot the inner key.
             $f_key = substr($dname, 0, strpos($dname, "."));
             $s_key = substr(strrchr($dname, "."), 1);
             if (isset($responses[$f_key][$s_key])) {
                 if (is_array($responses[$f_key][$s_key])) {
                     $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], $content);
                 } else {
                     $responses[$f_key][$s_key] .= $content;
                 }
             } else {
                 $responses[$f_key][$s_key] = $content;
             }
         } else {
             // Several items sharing one name are merged (arrays) or
             // concatenated (strings).
             if (isset($responses[$dname])) {
                 if (is_array($responses[$dname])) {
                     $responses[$dname] = array_merge($responses[$dname], $content);
                 } else {
                     $responses[$dname] .= $content;
                 }
             } else {
                 $responses[$dname] = $content;
             }
         }
         gc_collect_cycles();
     }
     // Fall back to the known title when none was collected, or when the
     // collected one is empty; an explicit false is left untouched.
     if (!isset($responses['title']) || (empty($responses['title']) && $responses['title'] !== false)) {
         $responses['title'] = $title;
     }
     unset($this->allHtml, $html);
     gc_collect_cycles();
     if ($this->contTest) {
         echo "<pre style='width:99%;word-wrap: break-word;'>";
         print_r(iS::escapeStr($responses));
         echo "</pre><hr />";
     }
     iFS::$CURLOPT_ENCODING = $rule['fs']['encoding'];
     if (!empty($rule['fs']['referer'])) {
         iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
     }
     if (!empty($rule['watermark_mode'])) {
         iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
         iFS::$watermark_config['x'] = $rule['watermark']['x'];
         iFS::$watermark_config['y'] = $rule['watermark']['y'];
         if (!empty($rule['watermark']['img'])) {
             iFS::$watermark_config['img'] = $rule['watermark']['img'];
         }
     }
     return $responses;
 }