/**
 * Fetch one page, split its links into internal and external ones, queue the
 * unseen ones in the database and then fetch and store every unseen external
 * site.
 *
 * Everything written by this function is tagged with the fixed id -1.
 * Progress and the elapsed time are echoed as HTML.
 *
 * @param string $url Address of the page whose links are harvested.
 * @return void
 */
function find_sites_($url)
{
    global $db;

    $oldtime = time();
    $site_id = -1; // fixed id used for every row created here
    $numm = 10;    // number of URLs handed to cmi() per batch

    // cmi() returns the fetched content indexed by URL.
    $lrp = cmi($url);
    $links = _striplinks($lrp[$url]);    // extract the URLs from the HTML code
    $links = _expandlinks($links, $url); // complete relative URLs
    $fen_link = fen_link($links, $url);  // separate internal ('nei') and external ('wai') links

    // Keys are quoted: a bare word is an undefined constant and is an Error on PHP 8.
    $nei_link = array_values(array_unique($fen_link['nei'])); // de-duplicated internal links
    $wai_link = GetSiteUrl($fen_link['wai']);                 // reduce external links to their home pages
    $wai_link = array_values(array_unique($wai_link));        // de-duplicated external links

    // Internal links already queued in ve123_sites_temp (site_id=-1 and no_id=0).
    $new_temp = array();
    $query = $db->query("select url from ve123_sites_temp where site_id='-1' and no_id='0'");
    while ($row = $db->fetch_array($query)) {
        $new_temp[] = $row['url'];
    }

    // Queue only the internal links that are not there yet.
    $cha_temp = array_diff($nei_link, $new_temp);
    foreach ((array) $cha_temp as $value) {
        $arral = array('url' => $value, 'site_id' => $site_id, 'no_id' => 0);
        $db->insert("ve123_sites_temp", $arral);
    }

    // External sites already registered in ve123_sites with site_no=-1.
    $new_site = array();
    $query = $db->query("select url from ve123_sites where site_no='-1'");
    while ($row = $db->fetch_array($query)) {
        $new_site[] = $row['url'];
    }

    // Register only the external sites that are not there yet.
    $cha_site = array_diff($wai_link, $new_site);
    foreach ((array) $cha_site as $value) {
        $arral = array('url' => $value, 'site_no' => $site_id);
        $db->insert("ve123_sites", $arral);
    }

    // NOTE(review): the original comment says this step reads ve123_links, but
    // the query reads ve123_sites by site_id. Left unchanged because the schema
    // is not visible here - confirm which table was intended.
    $new_lik = array();
    $query = $db->query("select url from ve123_sites where site_id='-1'");
    while ($row = $db->fetch_array($query)) {
        $new_lik[] = $row['url'];
    }
    $cha_lik = array_diff($wai_link, $new_lik);

    // Fetch the remaining external sites in batches of $numm. Iterating the
    // chunks directly avoids reading one index past the end of the chunk list
    // and skips the fetch entirely when there is nothing to do.
    foreach (array_chunk($cha_lik, $numm) as $batch) {
        $fen_url = cmi($batch); // the original comment describes this as a multi-threaded fetch

        foreach ((array) $fen_url as $link_url => $file) {
            $bianma = bianma($file);                     // detect the encoding of the HTML code
            $file = Convert_File($file, $bianma);        // original comment: convert everything to gb2312
            $loy = clean_lry($file, $link_url, "html");  // extract title etc. into an array

            $title = $loy["title"];
            $pagesize = number_format(strlen($file) / 1024, 0, ".", ""); // size in KB, no decimals
            $fulltxt = Html2Text($loy["fulltext"]);
            $description = $loy["description"];
            $keywords = $loy["keywords"];
            $lrymd5 = md5($fulltxt); // fingerprint of the text content
            $updatetime = time();

            // Fall back to the beginning of the text when the page has no title.
            if ($title == "") {
                $title = str_cut($fulltxt, 65);
            }

            // Store the page under its own URL (the key of the fetch result),
            // not under a leftover variable from an earlier loop.
            $array = array(
                'url' => $link_url,
                'lrymd5' => $lrymd5,
                'title' => $title,
                'fulltxt' => $fulltxt,
                'description' => $description,
                'keywords' => $keywords,
                'pagesize' => $pagesize,
                'updatetime' => $updatetime,
            );
            $db->insert("ve123_links", $array);

            // NOTE(review): title and URL come from crawled pages and are echoed
            // unescaped; escaping needs the page encoding - confirm and escape.
            echo "<font color=#C60A00><b>抓取到:</b></font>" . $title;
            echo "<a href=" . $link_url . " target=_blank>" . $link_url . "</a><br>";
        }
    }

    $newtime = time();
    echo " --- <b>用时:</b>";
    // gmdate() formats the elapsed seconds independently of the server
    // timezone; the previous "- 28800" correction was only right for UTC+8.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";

    del_links_temp($site_id);
}
/**
 * Crawl the start URL of one configured site according to its settings.
 *
 * The site row is read from ve123_sites; its spider_depth selects the mode:
 *  - -1 : whole-site crawl through all_links_duo();
 *  -  1 : only the start page, or a {page}-templated series of pages when
 *         fpr is "1";
 *  - >1 : repeated passes over the unprocessed rows (no_id=0) of
 *         ve123_links_temp, one pass per depth level.
 * Progress and the elapsed time are echoed as HTML.
 *
 * @param int|string $site_id Value of ve123_sites.site_id to crawl.
 * @return false|null false when the site row is missing or a paged crawl was
 *                    requested without a {page} placeholder; null otherwise.
 */
function add_in_site_link($site_id)
{
    global $db;

    echo "<b>起始网址:</b>";
    $oldtime = time();

    // Push some padding to the client so the progress output shows up early.
    print str_repeat(" ", 4096);
    if (ob_get_level() > 0) {
        ob_flush(); // only valid while an output buffer is active
    }
    flush();

    // Escaped copy for the hand-built SQL below; $site_id itself is passed on unchanged.
    $site_id_sql = addslashes((string) $site_id);

    $site = $db->get_one("select * from ve123_sites where site_id='" . $site_id_sql . "'");
    if (empty($site)) {
        // Without a site row there is no start URL to crawl.
        echo ":( 抓取失败!找不到对应的站点";
        return false;
    }

    $url = $site["url"];
    $fpr = $site["fpr"];
    $pagestart = $site["pagestart"];
    $pagestop = $site["pagestop"];
    $pageadd = $site["pageadd"];
    $include_word = $site["include_word"];
    $not_include_word = $site["not_include_word"];
    $spider_depth = $site["spider_depth"];

    // Original comment: create the start URL in the ve123_links and
    // ve123_links_temp tables.
    Updan_link($url, $site_id);

    // Whole-site crawl (the original comment describes it as multi-threaded).
    if ($spider_depth == -1) {
        echo "<b>收录全站</b>";
        echo "原始页=" . $url . " - - <首层 id=" . $site_id . "> - - <包含字段=" . $include_word . "> - - <不包含字段=" . $not_include_word . ">";
        $ceng = 1;
        $lry = 1;
        all_links_duo($site_id, $ceng, $include_word, $not_include_word, $lry);
        echo "<b>全站收录完毕</b><br>";
    }

    // Depth 1: analyse only the start page, or each page of a paged series.
    if ($spider_depth == 1) {
        if ($fpr == "1") {
            if (strpos($url, "{page}") === false) {
                echo ":( 抓取失败!您如果选择的是多页抓取 ,抓取的网址就必须含有{page} {page}作为你翻页的变量";
                return false;
            }

            // A step below 1 would never reach $pagestop; fall back to 1.
            $step = (int) $pageadd;
            if ($step < 1) {
                $step = 1;
            }

            for ($s = $pagestart; $s <= $pagestop; $s = $s + $step) {
                $urlgo = str_replace("{page}", $s, $url);
                echo "<br>";
                echo "-------------------------------------------------------第 ";
                echo $s;
                echo " 页-------------------------------------------------------";
                echo "<br>";
                echo $urlgo;
                echo "<br>";
                all_url_dan($urlgo, $url, 1, 0, $site_id, $include_word, $not_include_word);
            }
        } else {
            all_url_dan($url, $url, 1, 0, $site_id, $include_word, $not_include_word);
        }
    }

    // Depth > 1: one pass per level over the links that are still unprocessed.
    if ($spider_depth > 1) {
        $ceng = 1; // level counter shown to the user
        $lry = 1;  // running number of the processed link
        $pending_sql = "select * from ve123_links_temp where site_id='" . $site_id_sql . "' and no_id='0'";

        for ($i = $spider_depth; $i > 0; $i--) {
            echo "<br><b>开始抓取第" . $ceng . "层</b>";
            $ceng++;

            // Stop early when no unprocessed link is left.
            $roo = $db->get_one($pending_sql);
            if (empty($roo)) {
                echo " ---------- 没有新链接了<br>";
                break;
            }

            $query = $db->query($pending_sql);
            while ($row = $db->fetch_array($query)) {
                $url = $row["url"];
                echo "<br><font color='#aaaaaa'>" . $lry . ".</font>";
                $lry++;
                all_url_dan($url, $url, 1, 1, $site_id, $include_word, $not_include_word);

                // Mark the link as processed. The URL comes from crawled pages,
                // so it is escaped before being placed into the WHERE clause.
                // NOTE(review): the condition matches by url only, not by
                // site_id - confirm that this is intended.
                $arral = array('no_id' => 1);
                $db->update("ve123_links_temp", $arral, "url='" . addslashes($url) . "'");
            }
        }
    }

    // Remove this site's temporary link rows.
    del_links_temp($site_id);

    echo "<b>起始网址抓取完成</b>";
    $newtime = time();
    echo " --- <b>用时:</b>";
    // gmdate() formats the elapsed seconds independently of the server
    // timezone; the previous "- 28800" correction was only right for UTC+8.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";
}