/**
 * Crawl one registered site and print progress as HTML.
 *
 * Looks the site up in ve123_site_find, hands its URL to Updan_zhua(),
 * runs find_sites() starting at layer 1, prints the elapsed time and
 * finally calls del_sites_temp() for the site.
 *
 * @param mixed $site_id Value matched against ve123_site_find.site_id.
 * @return void
 */
function zhua_sites($site_id)
{
    global $db;

    $oldtime = time();

    // Send 4096 spaces and flush so the progress output below starts
    // appearing immediately (presumably to get past client-side buffering).
    print str_repeat(" ", 4096);
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();

    // NOTE(review): addslashes() is a stop-gap and is not charset-aware;
    // prefer the DB wrapper's own escaping or prepared statements if available.
    $site = $db->get_one(
        "select * from ve123_site_find where site_id='" . addslashes((string) $site_id) . "'"
    );

    // Without a site row there is no URL to crawl; stop instead of
    // continuing with an undefined URL.
    if (empty($site) || !isset($site["url"])) {
        echo "<b>找不到该网站</b><br>";
        return;
    }

    $url = $site["url"];
    Updan_zhua($url, $site_id);

    echo "<b>抓网站</b>";
    echo "网址站=" . $url . " - - < id=" . $site_id . ">";

    $ceng = 1;
    find_sites($site_id, $ceng);

    echo "<b>全站抓取完毕</b><br>";

    // Format the elapsed seconds directly. The previous
    // date(..., $elapsed - 28800) was only correct on a UTC+8 server
    // and wrapped around after 24 hours.
    $elapsed = max(0, time() - $oldtime);
    echo " --- <b>用时:</b>";
    echo sprintf(
        "%02d:%02d:%02d",
        floor($elapsed / 3600),
        floor(($elapsed % 3600) / 60),
        $elapsed % 60
    );
    echo "<br>";

    del_sites_temp($site_id);
}
/**
 * Return every url stored in $table for the given site.
 *
 * @param string $table       Table name (trusted, supplied by find_sites()).
 * @param string $site_column Column holding the site identifier.
 * @param string $site_sql    Site identifier, already escaped for SQL.
 * @return array List of url values.
 */
function _find_sites_known_urls($table, $site_column, $site_sql)
{
    global $db;

    $known = array();
    $query = $db->query(
        "select url from " . $table . " where " . $site_column . "='" . $site_sql . "'"
    );
    while ($row = $db->fetch_array($query)) {
        $known[] = $row['url'];
    }

    return $known;
}

/**
 * Crawl a site layer by layer until ve123_sites_temp holds no unvisited URL.
 *
 * Each pass reads the site's rows with no_id='0', fetches them through cmi()
 * in batches, extracts the links of every fetched page and then:
 *  - queues unseen internal links in ve123_sites_temp (no_id = 0),
 *  - stores unseen external links (reduced by GetSiteUrl(), which per the
 *    original comments maps them to their home page) in ve123_sites and
 *    ve123_links,
 *  - marks the processed page as visited (no_id = 1).
 * Progress is echoed as HTML while it runs.
 *
 * @param mixed $site_id Site identifier used in all three tables.
 * @param int   $ceng    Layer number shown in the progress output; defaults
 *                       to 1 so single-argument calls remain valid.
 * @return void
 */
function find_sites($site_id, $ceng = 1)
{
    global $db;

    // Number of URLs passed to cmi() per batch (the original comment calls
    // this the number of threads to start).
    $numm = 20;

    // NOTE(review): addslashes() is a stop-gap and is not charset-aware;
    // prefer the DB wrapper's own escaping or prepared statements if available.
    $site_sql = addslashes((string) $site_id);

    // Iterate instead of recursing so a deep site cannot grow the call stack.
    while (true) {
        echo "<br><b>开始抓取第" . $ceng . "层</b><br>";
        $ceng++;

        // Collect the URLs of this layer that have not been visited yet.
        $new_url = array();
        $query = $db->query(
            "select url from ve123_sites_temp where site_id='" . $site_sql . "' and no_id='0'"
        );
        while ($row = $db->fetch_array($query)) {
            $new_url[] = $row['url'];
        }

        // No new URL left: the crawl is finished.
        if (empty($new_url)) {
            echo " ---------- 没有新链接了<br>";
            return;
        }

        // Iterate over the real chunks only; the former "<= $he_num" loop
        // ran one step past the last chunk and called cmi() with an
        // undefined element.
        foreach (array_chunk($new_url, $numm) as $chunk) {
            // Per the original comments cmi() returns url => page source.
            $fen_url = (array) cmi($chunk);

            foreach ($fen_url as $url => $file) {
                $links = _striplinks($file);            // extract links from the HTML
                $links = _expandlinks($links, $url);    // make the links absolute
                $fen_link = fen_link($links, $url);     // split into internal / external

                $nei_raw = isset($fen_link['nei']) ? (array) $fen_link['nei'] : array();
                $wai_raw = isset($fen_link['wai']) ? (array) $fen_link['wai'] : array();

                $nei_link = array_values(array_unique($nei_raw));
                $wai_link = array_values(array_unique((array) GetSiteUrl($wai_raw)));

                // Queue internal links that are not in ve123_sites_temp yet.
                $cha_temp = array_diff(
                    $nei_link,
                    _find_sites_known_urls('ve123_sites_temp', 'site_id', $site_sql)
                );
                foreach ($cha_temp as $value) {
                    $db->insert(
                        "ve123_sites_temp",
                        array('url' => $value, 'site_id' => $site_id, 'no_id' => 0)
                    );
                }

                // Store external sites that are not in ve123_sites yet.
                $cha_site = array_diff(
                    $wai_link,
                    _find_sites_known_urls('ve123_sites', 'site_no', $site_sql)
                );
                foreach ($cha_site as $value) {
                    $db->insert("ve123_sites", array('url' => $value, 'site_no' => $site_id));
                }

                // Store external links that are not in ve123_links yet and report them.
                $cha_lik = array_diff(
                    $wai_link,
                    _find_sites_known_urls('ve123_links', 'site_id', $site_sql)
                );
                foreach ($cha_lik as $value) {
                    $db->insert("ve123_links", array('url' => $value, 'site_id' => $site_id));

                    // Crawled URLs are untrusted: escape them and quote the
                    // attribute. ISO-8859-1 makes the escaping byte-wise, so
                    // it cannot fail on non-UTF-8 (e.g. GBK) input.
                    $safe = htmlspecialchars((string) $value, ENT_QUOTES, 'ISO-8859-1');
                    echo "<font color=#C60A00><b>抓取到:</b></font>";
                    echo "<a href=\"" . $safe . "\" target=_blank>" . $safe . "</a><br>";
                }

                // Mark this page as visited. The URL is escaped so a quote in
                // it can neither break the statement nor leave the row
                // unvisited forever.
                $db->update(
                    "ve123_sites_temp",
                    array('no_id' => 1),
                    "url='" . addslashes((string) $url) . "'"
                );

                // ob_flush() raises a notice when no output buffer is active.
                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();
            }

            // URLs that cmi() did not return (e.g. failed fetches) would stay
            // at no_id=0 and be queued again on every pass, so the crawl
            // would never end. Mark them as visited as well.
            foreach ($chunk as $pending) {
                if (!array_key_exists((string) $pending, $fen_url)) {
                    $db->update(
                        "ve123_sites_temp",
                        array('no_id' => 1),
                        "url='" . addslashes((string) $pending) . "'"
                    );
                }
            }
        }
    }
}
<link rel="stylesheet" href="xp.css" type="text/css">
<?php
// Entry script: reads the "url" query parameter and starts find_sites() for it.

// The crawl can run for a long time, so remove the execution time limit.
set_time_limit(0);
//error_reporting(0);
require "global.php";

// Read the parameter defensively: it may be missing (undefined-index warning)
// or sent as an array (url[]=...), neither of which is a usable URL.
$url = (isset($_GET["url"]) && is_string($_GET["url"])) ? $_GET["url"] : "";

if (empty($url)) {
    // NOTE(review): this literal looks like GBK text viewed in the wrong
    // encoding (presumably a "URL must not be empty" message); it is kept
    // byte-for-byte here - confirm the file encoding before touching it.
    echo tips("ÍøÖ·²»ÄÜΪ¿Õ!");
    die;
}

// NOTE(review): the find_sites() visible in this file expects a site id and a
// layer number; confirm that passing the raw request URL here is intended.
find_sites($url);