Example #1
0
 /**
  * Gather (or, in test mode, just list) the URLs found on the list page of a
  * sub-mission that belongs to a collection.
  *
  * The list page is fetched, narrowed to the mission's URL region, split into
  * per-link regions and each region is turned into one URL record. URLs that
  * fail the mission's include/forbid patterns or that already exist in the
  * gurls table are skipped.
  *
  * @param int    $gsid   Id of the gather mission; a falsy id returns an empty array.
  * @param int    $guid   Id stored in the `pid` column of every row that gets inserted.
  * @param string $url0   Candidate source URL, selected when the mission's 'ufrompage' is 0.
  * @param string $url1   Candidate source URL, selected when the mission's 'ufrompage' is 1.
  * @param string $url2   Candidate source URL, selected when the mission's 'ufrompage' is 2.
  * @param int    $istest Truthy: return the URLs found without writing to the database.
  *                       Falsy: insert every new URL (plus list-page field contents).
  *
  * @return array In test mode, a map of gurl => array('utitle', 'gurl', 'gurl1',
  *               'gurl2', 'son'); otherwise always an empty array.
  */
 function fetch_son_gurls($gsid = 0, $guid = 0, $url0 = '', $url1 = '', $url2 = '', $istest = 0)
 {
     global $db, $tblprefix, $sid, $c_upload, $timestamp, $progress;
     $rets = array();
     if (!$gsid) {
         return $rets;
     }
     $ng = new cls_gather();
     $ng->set_mission($gsid);
     $gmission =& $ng->gmission;
     // Source URL of the list page: one of $url0 / $url1 / $url2, chosen by 'ufrompage'.
     $surl = ${'url' . $gmission['ufrompage']};
     // Give up when this is not a sub-mission, there is no source URL, or the
     // source page yields no content.
     if (!$gmission['pid'] || !$surl || !($html = $ng->onepage($surl))) {
         return $rets;
     }
     // Narrow the page down to the configured URL region.
     $html = $ng->fetch_detail($ng->gmission['uregion'], $html);
     // Split the region into one chunk per link using the separator marker.
     $urlregions = explode($ng->gmission['uspilit'], $html);
     // Optionally process the chunks in reverse order.
     if ($ng->gmission['udesc']) {
         krsort($urlregions);
     }
     unset($html);
     // Fields flagged with frompage == 1 are collected from the list page itself.
     $ufields = array();
     empty($ng->fields) && $ng->gather_fields();
     foreach ($ng->fields as $k => $v) {
         $v['frompage'] == 1 && ($ufields[] = $k);
     }
     $linkcount = 0;
     foreach ($urlregions as $urlregion) {
         // One URL chunk per iteration.
         $c_upload->init();
         $ng->clean_blank($urlregion);
         // Extract the URL using the mission's URL pattern.
         if (!($gurl = $ng->fetch_detail($ng->gmission['uurltag'], $urlregion))) {
             continue;
         }
         // Complete the URL against the source URL.
         $gurl = fillurl($gurl, $surl);
         // Include / forbid filters. eregi() was removed in PHP 7.0, so the
         // patterns are run through preg_match() with the "i" modifier instead;
         // "~" is the delimiter and is escaped inside the pattern.
         $include = $ng->gmission['uinclude'];
         if ($include && !preg_match('~' . str_replace('~', '\~', $include) . '~i', $gurl)) {
             continue;
         }
         $forbid = $ng->gmission['uforbid'];
         if ($forbid && preg_match('~' . str_replace('~', '\~', $forbid) . '~i', $gurl)) {
             continue;
         }
         // Skip URLs that are already stored.
         if ($db->result_one("SELECT COUNT(*) FROM {$tblprefix}gurls WHERE gurl='" . addslashes($gurl) . "'")) {
             continue;
         }
         // Title; falls back to a localized placeholder when none is found.
         $utitle = $ng->fetch_detail($ng->gmission['utitletag'], $urlregion);
         $utitle = !$utitle ? lang('titleunknown') : strip_tags($utitle);
         // Trace-back page 1.
         $gurl1 = $ng->fetch_addurl($gurl, $ng->gmission['uurltag1'], $surl);
         // Trace-back page 2.
         $gurl2 = $ng->fetch_addurl($gurl1, $ng->gmission['uurltag2'], $surl);
         $linkcount++;
         if ($istest) {
             // Test mode: only report the URLs that were found.
             $rets[$gurl]['utitle'] = $utitle;
             $rets[$gurl]['gurl'] = $gurl;
             $rets[$gurl]['gurl1'] = $gurl1;
             $rets[$gurl]['gurl2'] = $gurl2;
             $rets[$gurl]['son'] = 1;
             continue;
         }
         // Content that lives on the list page is gathered together with the URL.
         $contents = array();
         foreach ($ufields as $v) {
             $contents[$v] = $ng->common_field($v, $urlregion, $gurl);
         }
         // Store the URL and its contents. The URL and title values come from
         // remote HTML, so they are escaped the same way as in the duplicate
         // check above; a quote in a title would otherwise break the query.
         $db->query("INSERT INTO {$tblprefix}gurls SET\n\t\t\t\tsid='{$sid}',\n\t\t\t\tpid='{$guid}',\n\t\t\t\tgurl='" . addslashes((string) $gurl) . "',\n\t\t\t\tgurl1='" . addslashes((string) $gurl1) . "',\n\t\t\t\tgurl2='" . addslashes((string) $gurl2) . "',\n\t\t\t\tutitle='" . addslashes((string) $utitle) . "',\n\t\t\t\tcontents='" . addslashes(serialize($contents)) . "',\n\t\t\t\tufids='" . implode(',', $c_upload->ufids) . "',\n\t\t\t\tadddate='{$timestamp}',\n\t\t\t\tgsid='" . $ng->gsid . "'");
     }
     // Report the number of accepted links when a progress tracker is present.
     $progress && $progress->linkcount($linkcount);
     return $rets;
 }