Exemple #1
0
 /**
  * Fetch one page and extract every data item defined by a spider rule.
  *
  * The target is taken from the spider_url row identified by spider::$sid
  * when that is set; otherwise from spider::$rid / $pid / $title / $url,
  * each of which is overridden by the matching argument when it is not NULL.
  *
  * Data item names ($data['name']) may carry processing markers:
  *  - "UNSET:name"  remove the value collected so far for name
  *  - "DATA:name"   use the value collected for name as the input HTML
  *  - "PRE:name"    its stored result is consumed as input by a later item "name"
  *  - "EMPTY:name"  fallback item, only crawled while name is still empty
  *  - "ARRAY:name"  transpose the extracted two-level array
  *  - "a.b"         store the result under $responses['a']['b']
  *
  * Side effects: resets spider::$allHtml, sets spider::$curl_proxy and the
  * iFS curl/watermark settings, and echoes debug output when
  * spider::$dataTest is on.
  *
  * @param mixed $_pid   Project id override (ignored when spider::$sid is set).
  * @param mixed $_rid   Rule id override (ignored when spider::$sid is set).
  * @param mixed $_url   URL override (ignored when spider::$sid is set).
  * @param mixed $_title Title override (ignored when spider::$sid is set).
  *
  * @return array|false|mixed Collected data keyed by item name; false in
  *                           shell mode when the fetched page is empty; or
  *                           whatever spider::$callback['data'] returns when
  *                           that callback is set.
  */
 public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL)
 {
     ini_get('safe_mode') or set_time_limit(0);
     $sid = spider::$sid;
     if ($sid) {
         // NOTE(review): $sid is interpolated straight into the SQL text;
         // confirm it is always a trusted/numeric value at every call site.
         $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
         $title = $sRs->title;
         $pid = $sRs->pid;
         $url = $sRs->url;
         $rid = $sRs->rid;
     } else {
         $rid = spider::$rid;
         $pid = spider::$pid;
         $title = spider::$title;
         $url = spider::$url;
         if ($_rid !== NULL) {
             $rid = $_rid;
         }
         if ($_pid !== NULL) {
             $pid = $_pid;
         }
         if ($_title !== NULL) {
             $title = $_title;
         }
         if ($_url !== NULL) {
             $url = $_url;
         }
     }

     // Both are read unconditionally below, so they must exist even when
     // no project id is available.
     $project = NULL;
     $prule_list_url = NULL;
     if ($pid) {
         $project = spider::project($pid);
         $prule_list_url = $project['list_url'];
     }

     $ruleA = spider::rule($rid);
     $rule = $ruleA['rule'];
     $dataArray = $rule['data'];
     // A list URL configured on the project takes precedence over the rule's.
     if ($prule_list_url) {
         $rule['list_url'] = $prule_list_url;
     }
     if (spider::$dataTest) {
         echo "<b>抓取规则信息</b><pre>";
         print_r(iS::escapeStr($ruleA));
         print_r(iS::escapeStr($project));
         echo "</pre><hr />";
     }
     spider::$curl_proxy = $rule['proxy'];
     $responses = array();
     $html = spiderTools::remote($url);
     if (empty($html)) {
         $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
         if (spider::$work == 'shell') {
             echo "{$msg}\n";
             return false;
         } else {
             iPHP::alert($msg);
         }
     }

     spider::$allHtml = "";
     // NOTE(review): these record spider::$url, not the local $url that was
     // actually fetched above; confirm this is intended when the URL comes
     // from the $_url argument or from the spider_url row.
     $rule['__url__'] = spider::$url;
     $responses['reurl'] = spider::$url;
     $responses['__title__'] = $title;

     foreach ((array) $dataArray as $data) {
         $content_html = $html;
         $dname = $data['name'];

         // [UNSET:name] - drop whatever has been collected for [name].
         if (strpos($dname, 'UNSET:') !== false) {
             $_dname = str_replace('UNSET:', '', $dname);
             unset($responses[$_dname]);
             continue;
         }

         // [DATA:name] - feed the already processed value of [name] back in
         // as the raw input, so data can be processed several times.
         // The result is stored under the full "DATA:name" key; any earlier
         // value under that key is discarded first.
         // NOTE(review): the original comment said earlier data accumulates,
         // which does not match the unset() below - confirm the intent.
         if (strpos($dname, 'DATA:') !== false) {
             $_dname = str_replace('DATA:', '', $dname);
             $content_html = isset($responses[$_dname]) ? $responses[$_dname] : NULL;
             unset($responses[$dname]);
         }

         // [PRE:name] - data collected earlier under "PRE:name" becomes the
         // raw input for [name] and is consumed. Typically used for
         // downloaded content.
         $pre_dname = 'PRE:' . $dname;
         if (isset($responses[$pre_dname])) {
             $content_html = $responses[$pre_dname];
             unset($responses[$pre_dname]);
         }

         // [EMPTY:name] - fallback item: only crawled when the value
         // collected so far for [name] is empty, and then stored as [name].
         if (strpos($dname, 'EMPTY:') !== false) {
             $_dname = str_replace('EMPTY:', '', $dname);
             if (!empty($responses[$_dname])) {
                 // A value already exists, skip this item.
                 continue;
             }
             $dname = $_dname;
         }

         $content = spiderContent::crawl($content_html, $data, $rule, $responses);
         unset($content_html);

         // [ARRAY:name] - transpose the two-level result so that
         // $content[$row][$col] becomes $content[$col][$row].
         if (strpos($dname, 'ARRAY:') !== false) {
             $dname = str_replace('ARRAY:', '', $dname);
             $cArray = array();
             foreach ((array) $content as $row => $columns) {
                 foreach ((array) $columns as $col => $val) {
                     $cArray[$col][$row] = $val;
                 }
             }
             // Keep the untransposed content when nothing could be transposed.
             if ($cArray) {
                 $content = $cArray;
                 unset($cArray);
             }
         }

         if (strpos($dname, '.') !== false) {
             // [name.xxx] - store the content as an array element: the part
             // before the first dot is the outer key, the part after the
             // last dot is the inner key.
             $f_key = substr($dname, 0, stripos($dname, "."));
             $s_key = substr(strrchr($dname, "."), 1);
             if (isset($responses[$f_key][$s_key])) {
                 if (is_array($responses[$f_key][$s_key])) {
                     // Cast so a scalar result is appended instead of making
                     // array_merge() fail and lose the collected data.
                     $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], (array) $content);
                 } else {
                     $responses[$f_key][$s_key] .= $content;
                 }
             } else {
                 $responses[$f_key][$s_key] = $content;
             }
         } else {
             // Several items sharing one name: merge their content.
             if (isset($responses[$dname])) {
                 if (is_array($responses[$dname])) {
                     // Same cast as above for scalar results.
                     $responses[$dname] = array_merge($responses[$dname], (array) $content);
                 } else {
                     $responses[$dname] .= $content;
                 }
             } else {
                 $responses[$dname] = $content;
             }
         }

         // De-duplicate comma separated multi-match results. Dotted names
         // are stored under $responses[$f_key][$s_key], so $responses[$dname]
         // does not exist for them and they are skipped here.
         if (!empty($data['multi']) && isset($responses[$dname]) && !is_array($responses[$dname])) {
             if (strpos($responses[$dname], ',') !== false) {
                 $parts = array_map('trim', explode(',', $responses[$dname]));
                 // array_filter() without a callback drops empty/falsy parts.
                 $parts = array_unique(array_filter($parts));
                 $responses[$dname] = implode(',', $parts);
                 unset($parts);
             }
         }
         gc_collect_cycles();
     }

     // A title item that was collected but came back empty falls back to the
     // title supplied for this crawl.
     if (isset($responses['title']) && empty($responses['title'])) {
         $responses['title'] = $responses['__title__'];
     }
     spider::$allHtml = null;
     unset($html);
     gc_collect_cycles();
     if (spider::$dataTest) {
         echo "<pre style='width:99%;word-wrap: break-word;'>";
         print_r(iS::escapeStr($responses));
         echo '<hr />';
         echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
         echo "</pre>";
     }

     // Reset the file-system helper to the site-wide defaults, then apply
     // the overrides defined by this rule.
     iFS::$CURLOPT_ENCODING = '';
     iFS::$CURLOPT_REFERER = '';
     iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
     iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
     iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
     iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];
     if (!empty($rule['fs']['encoding'])) {
         iFS::$CURLOPT_ENCODING = $rule['fs']['encoding'];
     }
     if (!empty($rule['fs']['referer'])) {
         iFS::$CURLOPT_REFERER = $rule['fs']['referer'];
     }
     if (!empty($rule['watermark_mode'])) {
         iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
         iFS::$watermark_config['x'] = $rule['watermark']['x'];
         iFS::$watermark_config['y'] = $rule['watermark']['y'];
         // The rule's watermark image only replaces the default when set.
         if (!empty($rule['watermark']['img'])) {
             iFS::$watermark_config['img'] = $rule['watermark']['img'];
         }
     }

     // Optional post-processing hook; its return value replaces the result.
     if (!empty(spider::$callback['data']) && is_callable(spider::$callback['data'])) {
         $responses = call_user_func_array(spider::$callback['data'], array($responses));
     }
     return $responses;
 }