/**
 * Crawl a whole site breadth-first, one "layer" of queued links at a time.
 *
 * Each pass reads every row of ve123_links_temp that belongs to $site_id and
 * still has no_id = 0, hands the URLs to cmi() in batches, stores the page
 * data and the newly discovered links, and marks the processed URLs with
 * no_id = 1. Passes repeat until no unvisited row is left.
 *
 * The original implementation called itself once per layer; this version
 * loops instead, so a deep site cannot exhaust the call stack.
 *
 * @param mixed  $site_id          Site identifier used in ve123_links / ve123_links_temp.
 * @param int    $ceng             Number of the first layer, used only in the progress output.
 * @param string $include_word     Passed through to check_include().
 * @param string $not_include_word Passed through to check_include().
 * @return void
 */
function all_links_duo($site_id, $ceng, $include_word, $not_include_word)
{
    global $db;

    $nei = 1;   // filter mode handed to check_wai(): 1 = internal links only, 2 = external links, empty = all
    $numm = 20; // batch size for cmi() (the original note calls this the number of threads)

    // NOTE(review): addslashes() is a stop-gap; prefer the escaping routine of
    // the $db layer if it has one (not visible from this function).
    $site_sql = addslashes((string) $site_id);

    while (true) {
        echo "<br><b>开始抓取第" . $ceng . "层</b><br>";
        $ceng++;

        // Collect every queued URL of this site that has not been crawled yet.
        $new_url = array();
        $query = $db->query("select * from ve123_links_temp where site_id='" . $site_sql . "' and no_id='0'");
        while ($row = $db->fetch_array($query)) {
            $new_url[] = $row['url'];
        }

        // Nothing left to crawl: this is the only exit of the loop.
        if (empty($new_url)) {
            echo " ---------- 没有新链接了<br>";
            return;
        }

        // Iterating the chunks directly avoids the extra, undefined batch the
        // old "$i <= $he_num" loop passed to cmi().
        foreach (array_chunk($new_url, $numm) as $batch) {
            // cmi() is consumed as a map of url => page source — TODO confirm against its definition.
            $fen_url = (array) cmi($batch);

            foreach ($fen_url as $url => $file) {
                _all_links_duo_store_page($url, $file, $site_id, $nei, $include_word, $not_include_word);

                // ob_flush() raises a notice when no output buffer is active.
                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();
            }

            // A queued URL that cmi() did not hand back would stay at no_id = 0
            // forever and keep the outer loop alive; mark it visited as well.
            foreach ($batch as $queued_url) {
                if (!array_key_exists((string) $queued_url, $fen_url)) {
                    $db->update("ve123_links_temp", array('no_id' => 1), "url='" . addslashes((string) $queued_url) . "'");
                }
            }
        }
    }
}

/**
 * Store one fetched page and queue the links found in it (helper of all_links_duo()).
 *
 * @param string $url              URL the page was fetched from.
 * @param string $file             Raw page source.
 * @param mixed  $site_id          Site identifier written with every new link row.
 * @param mixed  $nei              Filter mode forwarded to check_wai().
 * @param string $include_word     Passed through to check_include().
 * @param string $not_include_word Passed through to check_include().
 * @return void
 */
function _all_links_duo_store_page($url, $file, $site_id, $nei, $include_word, $not_include_word)
{
    global $db;

    $url_sql = addslashes((string) $url);

    // Extract the links from the markup, make them absolute, filter, de-duplicate.
    $links = _striplinks($file);
    $links = _expandlinks($links, $url);
    $links = check_wai($links, $nei, $url);
    $links = array_values(array_unique((array) $links));

    // Detect the page encoding and convert the source (to gb2312, per the original note).
    $bianma = bianma($file);
    $file = Convert_File($file, $bianma);

    $loy = clean_lry($file, $url, "html");
    $title = $loy["title"];
    $pagesize = number_format(strlen($file) / 1024, 0, ".", ""); // size in KB, rounded
    $fulltxt = Html2Text($loy["fulltext"]);
    $description = $loy["description"];
    $keywords = $loy["keywords"];
    $lrymd5 = md5($fulltxt);
    $updatetime = time();
    if ($title == "") {
        // No title found: fall back to the beginning of the page text.
        $title = str_cut($fulltxt, 65);
    }

    // Update the stored record of this URL with the extracted content.
    $page = array(
        'lrymd5' => $lrymd5,
        'title' => $title,
        'fulltxt' => $fulltxt,
        'description' => $description,
        'keywords' => $keywords,
        'pagesize' => $pagesize,
        'updatetime' => $updatetime,
    );
    $db->update("ve123_links", $page, "url='" . $url_sql . "'");

    $all_num = count($links);
    $domain_sql = addslashes((string) getdomain($url));

    // Queue the links that are not yet present in ve123_links_temp.
    $temp_links = array();
    $query = $db->query("select url from ve123_links_temp where url like '%" . $domain_sql . "%'");
    while ($row = $db->fetch_array($query)) {
        $temp_links[] = rtrim($row['url'], "/");
    }
    foreach (array_diff($links, $temp_links) as $value) {
        if (check_include($value, $include_word, $not_include_word)) {
            $db->insert("ve123_links_temp", array('url' => $value, 'site_id' => $site_id));
        }
    }

    // Create the links that are not yet present in ve123_links and count them.
    $new_links = array();
    $query = $db->query("select url from ve123_links where url like '%" . $domain_sql . "%'");
    while ($row = $db->fetch_array($query)) {
        $new_links[] = rtrim($row['url'], "/");
    }
    $cha_num = 0;
    foreach (array_diff($links, $new_links) as $value) {
        if (check_include($value, $include_word, $not_include_word)) {
            $db->insert("ve123_links", array('url' => $value, 'site_id' => $site_id, 'level' => '1'));
            $cha_num++;
        }
    }

    // Passed as a variable so the call is valid whether or not the third
    // parameter of printLinksReport() is taken by reference.
    $cl = 0;
    printLinksReport($cha_num, $all_num, $cl);
    // NOTE(review): $url is echoed into HTML unescaped, as before.
    echo "<a href=" . $url . " target=_blank>" . $url . "</a><br>";

    // Mark this URL as crawled so the next layer skips it.
    $db->update("ve123_links_temp", array('no_id' => 1), "url='" . $url_sql . "'");
}
/**
 * Crawl one configured site, starting from the URL stored in ve123_sites.
 *
 * The crawl mode depends on the site's spider_depth column:
 *   -1  : whole site, via all_links_duo()
 *    1  : only the start page, or a range of pages when fpr = "1" and the
 *         URL contains the {page} placeholder
 *   >1  : up to spider_depth passes over the unvisited rows of ve123_links_temp
 *
 * Progress is echoed as HTML while the crawl runs.
 *
 * @param mixed $site_id Primary key of the row in ve123_sites.
 * @return false|null false when the site is missing or a paged crawl lacks
 *                    the {page} placeholder; otherwise nothing is returned.
 */
function add_in_site_link($site_id)
{
    global $db;

    echo "<b>起始网址:</b>";
    $oldtime = time();

    // Padding so the progress output is sent before the crawl finishes.
    print str_repeat(" ", 4096);
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();

    // NOTE(review): addslashes() is a stop-gap; prefer the escaping routine of
    // the $db layer if it has one (not visible from this function).
    $site = $db->get_one("select * from ve123_sites where site_id='" . addslashes((string) $site_id) . "'");
    if (empty($site)) {
        // Without a site row there is no start URL to crawl.
        echo ":( 抓取失败!站点不存在";
        return false;
    }

    $url = $site["url"];
    $fpr = $site["fpr"];
    $pagestart = $site["pagestart"];
    $pagestop = $site["pagestop"];
    $pageadd = $site["pageadd"];
    $include_word = $site["include_word"];
    $not_include_word = $site["not_include_word"];
    $spider_depth = $site["spider_depth"];

    // Register the start URL (per the original note: in ve123_links and ve123_links_temp).
    Updan_link($url, $site_id);

    // Whole-site crawl.
    if ($spider_depth == -1) {
        echo "<b>收录全站</b>";
        echo "原始页=" . $url . " - - <首层 id=" . $site_id . "> - - <包含字段=" . $include_word . "> - - <不包含字段=" . $not_include_word . ">";
        $ceng = 1;
        // all_links_duo() declares four parameters; the stray fifth argument was dropped.
        all_links_duo($site_id, $ceng, $include_word, $not_include_word);
        echo "<b>全站收录完毕</b><br>";
    }

    // Single-level crawl: the start page only, or a numbered range of pages.
    if ($spider_depth == 1) {
        if ($fpr == "1") {
            if (strpos($url, "{page}") === false) {
                echo ":( 抓取失败!您如果选择的是多页抓取 ,抓取的网址就必须含有{page} {page}作为你翻页的变量";
                return false;
            }

            // A zero or negative increment would never reach $pagestop.
            $step = ($pageadd > 0) ? $pageadd : 1;
            for ($s = $pagestart; $s <= $pagestop; $s = $s + $step) {
                $urlgo = str_replace("{page}", $s, $url);
                echo "<br>";
                echo "-------------------------------------------------------第 ";
                echo $s;
                echo " 页-------------------------------------------------------";
                echo "<br>";
                echo $urlgo;
                echo "<br>";
                all_url_dan($urlgo, $url, 1, 0, $site_id, $include_word, $not_include_word);
            }
        } else {
            all_url_dan($url, $url, 1, 0, $site_id, $include_word, $not_include_word);
        }
    }

    // Multi-level crawl: one pass over the unvisited queue per level.
    if ($spider_depth > 1) {
        $ceng = 1; // layer number shown in the output
        $lry = 1;  // running number of the crawled URL shown in the output
        $site_sql = addslashes((string) $site_id);

        for ($i = $spider_depth; $i > 0; $i--) {
            echo "<br><b>开始抓取第" . $ceng . "层</b>";
            $ceng++;

            $roo = $db->get_one("select * from ve123_links_temp where site_id='" . $site_sql . "' and no_id='0'");
            if (empty($roo)) {
                echo " ---------- 没有新链接了<br>";
                break;
            }

            $query = $db->query("select * from ve123_links_temp where site_id='" . $site_sql . "' and no_id='0'");
            while ($row = $db->fetch_array($query)) {
                $url = $row["url"];
                echo "<br><font color='#aaaaaa'>" . $lry . ".</font>";
                $lry++;
                all_url_dan($url, $url, 1, 1, $site_id, $include_word, $not_include_word);

                // Mark the URL as crawled so the next pass skips it.
                $db->update("ve123_links_temp", array('no_id' => 1), "url='" . addslashes((string) $url) . "'");
            }
        }
    }

    // Drop the temporary link queue of this site.
    del_links_temp($site_id);

    echo "<b>起始网址抓取完成</b>";
    $newtime = time();
    echo " --- <b>用时:</b>";
    // gmdate() formats the elapsed seconds independently of the server
    // timezone; the old "date(...) - 28800" was only right at UTC+8.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";
}