/**
 * Gather, or in test mode only list, the URLs found on the list page of a
 * collection's sub-task.
 *
 * The mission identified by $gsid is loaded into a cls_gather instance; its
 * 'ufrompage' setting selects which of $url0/$url1/$url2 is the list page to
 * fetch. The page is cut down to the 'uregion' area, split on 'uspilit' and
 * every resulting block is searched for a URL ('uurltag') and a title
 * ('utitletag'). URLs failing the 'uinclude'/'uforbid' filters, or already
 * present in the {$tblprefix}gurls table, are skipped.
 *
 * @param mixed  $gsid   Mission id passed to cls_gather::set_mission(); a falsy
 *                       value makes the function return immediately.
 * @param mixed  $guid   Stored as `pid` of every inserted gurls row.
 * @param string $url0   Candidate source URL, used when ufrompage is 0.
 * @param string $url1   Candidate source URL, used when ufrompage is 1.
 * @param string $url2   Candidate source URL, used when ufrompage is 2.
 * @param mixed  $istest Truthy: nothing is written, the found URLs are returned.
 *                       Falsy: URLs (plus list-page field contents) are inserted
 *                       into the gurls table and an empty array is returned.
 *
 * @return array In test mode, a map of URL => array('utitle', 'gurl', 'gurl1',
 *               'gurl2', 'son' => 1); otherwise always an empty array.
 */
function fetch_son_gurls($gsid = 0, $guid = 0, $url0 = '', $url1 = '', $url2 = '', $istest = 0) {
    global $db, $tblprefix, $sid, $c_upload, $timestamp, $progress;

    $rets = array();
    if (!$gsid) {
        return $rets;
    }

    $ng = new cls_gather();
    $ng->set_mission($gsid);
    $gmission =& $ng->gmission;

    // Source URL of the list page. Only the selectors "0", "1" and "2" map to
    // a URL; the string cast keeps any other value (false, '', "1.5", ...)
    // from being coerced onto one of the three slots.
    $sources = array($url0, $url1, $url2);
    $frompage = (string)$gmission['ufrompage'];
    $surl = isset($sources[$frompage]) ? $sources[$frompage] : '';

    // Give up when this is not a sub-task or there is no source URL ...
    if (!$gmission['pid'] || !$surl) {
        return $rets;
    }
    // ... or when the source page yields no content.
    $html = $ng->onepage($surl);
    if (!$html) {
        return $rets;
    }

    // Narrow the page down to the configured region, then split it into one
    // block per link using the configured separator.
    $html = $ng->fetch_detail($gmission['uregion'], $html);
    if ((string)$gmission['uspilit'] === '') {
        // explode() rejects an empty separator (warning + false before PHP 8,
        // ValueError since PHP 8). Keep the legacy outcome: no blocks at all.
        $urlregions = array();
    } else {
        $urlregions = explode($gmission['uspilit'], $html);
    }
    if ($gmission['udesc']) {
        krsort($urlregions); // gather in reverse order
    }
    unset($html); // the full page is no longer needed

    // Fields whose content is taken from the list page itself (frompage == 1).
    if (empty($ng->fields)) {
        $ng->gather_fields();
    }
    $ufields = array();
    foreach ($ng->fields as $fieldname => $field) {
        if ($field['frompage'] == 1) {
            $ufields[] = $fieldname;
        }
    }

    $linkcount = 0;
    foreach ($urlregions as $urlregion) { // one block per candidate URL
        $c_upload->init();
        $ng->clean_blank($urlregion);

        // Extract the URL with the URL pattern and complete it against the
        // source URL.
        $gurl = $ng->fetch_detail($gmission['uurltag'], $urlregion);
        if (!$gurl) {
            continue;
        }
        $gurl = fillurl($gurl, $surl);

        // Must-match / must-not-match filters (case-insensitive patterns).
        if ($gmission['uinclude'] && !_fetch_son_gurls_imatch($gmission['uinclude'], $gurl)) {
            continue;
        }
        if ($gmission['uforbid'] && _fetch_son_gurls_imatch($gmission['uforbid'], $gurl)) {
            continue;
        }

        // Skip URLs that are already stored.
        if ($db->result_one("SELECT COUNT(*) FROM {$tblprefix}gurls WHERE gurl='" . addslashes($gurl) . "'")) {
            continue;
        }

        // Title, falling back to the localized "unknown title" text.
        $utitle = $ng->fetch_detail($gmission['utitletag'], $urlregion);
        $utitle = !$utitle ? lang('titleunknown') : strip_tags($utitle);

        $gurl1 = $ng->fetch_addurl($gurl, $gmission['uurltag1'], $surl);  // trace-back page 1
        $gurl2 = $ng->fetch_addurl($gurl1, $gmission['uurltag2'], $surl); // trace-back page 2
        $linkcount++;

        if ($istest) {
            // Test mode: just list the sub-task URLs for the collection.
            $rets[$gurl] = array(
                'utitle' => $utitle,
                'gurl' => $gurl,
                'gurl1' => $gurl1,
                'gurl2' => $gurl2,
                'son' => 1,
            );
            continue;
        }

        // Content that lives on the list page is gathered together with the URL.
        $contents = array();
        foreach ($ufields as $fieldname) {
            $contents[$fieldname] = $ng->common_field($fieldname, $urlregion, $gurl);
        }

        // Store the URL and its content. Every scraped string is escaped the
        // same way as in the duplicate check above; previously the URLs and
        // the title were interpolated raw, so a quote in remote HTML broke
        // (or injected into) the statement.
        // NOTE(review): addslashes() mirrors the escaping this function already
        // used; if $db exposes a connection-aware escape method, prefer it.
        $db->query("INSERT INTO {$tblprefix}gurls SET\n"
            . "\t\t\t\tsid='{$sid}',\n"
            . "\t\t\t\tpid='{$guid}',\n"
            . "\t\t\t\tgurl='" . addslashes($gurl) . "',\n"
            . "\t\t\t\tgurl1='" . addslashes($gurl1) . "',\n"
            . "\t\t\t\tgurl2='" . addslashes($gurl2) . "',\n"
            . "\t\t\t\tutitle='" . addslashes($utitle) . "',\n"
            . "\t\t\t\tcontents='" . addslashes(serialize($contents)) . "',\n"
            . "\t\t\t\tufids='" . implode(',', $c_upload->ufids) . "',\n"
            . "\t\t\t\tadddate='{$timestamp}',\n"
            . "\t\t\t\tgsid='" . $ng->gsid . "'");
    }

    if ($progress) {
        $progress->linkcount($linkcount);
    }
    return $rets;
}

/**
 * Case-insensitive regular-expression test; stands in for the removed eregi().
 *
 * The pattern is wrapped in a control-character delimiter so that patterns
 * written for eregi() (which had no delimiters and routinely contain '/',
 * '#' or '~' when matching URLs) can be used unchanged.
 *
 * @param string $pattern Regular expression without delimiters.
 * @param string $subject Text to test.
 *
 * @return bool True when the pattern matches; false on no match or when the
 *              pattern is invalid (preg_match() then reports a warning).
 */
function _fetch_son_gurls_imatch($pattern, $subject) {
    $delimiter = "\x01";
    return preg_match($delimiter . $pattern . $delimiter . 'i', $subject) === 1;
}