Beispiel #1
0
/**
 * Crawls one site end to end and streams an HTML progress report.
 *
 * Looks the site up in ve123_site_find, passes its URL to Updan_zhua(),
 * walks the site through find_sites() starting at layer 1, prints the
 * elapsed wall-clock time and finally calls del_sites_temp() for the site.
 *
 * @param mixed $site_id Value matched against ve123_site_find.site_id.
 * @return void
 */
function zhua_sites($site_id)
{
    $oldtime = time();

    // Emit 4 KB of padding and flush so the progress output below reaches
    // the client right away (presumably to get past browser/proxy buffering).
    print str_repeat(" ", 4096);
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();

    global $db;

    // htmlspecialchars() with a single-byte charset never rejects input and
    // only rewrites & < > " ', so it is safe for any ASCII-compatible page
    // encoding.
    $safe_id = htmlspecialchars((string) $site_id, ENT_QUOTES, 'ISO-8859-1');

    // NOTE(review): addslashes() is a stop-gap; switch to the $db driver's
    // native escaping / prepared statements once its API is confirmed.
    $site = $db->get_one("select * from ve123_site_find where site_id='" . addslashes((string) $site_id) . "'");
    if (empty($site)) {
        // Unknown site: there is no URL to crawl, so stop before touching
        // any crawl state.
        echo "<b>找不到网站</b> id=" . $safe_id . "<br>";
        return;
    }

    $url = $site["url"];
    Updan_zhua($url, $site_id);

    echo "<b>抓网站</b>";
    echo "网址站=" . htmlspecialchars((string) $url, ENT_QUOTES, 'ISO-8859-1') . " - - < id=" . $safe_id . ">";

    // Crawl the site layer by layer, starting with layer 1.
    $ceng = 1;
    find_sites($site_id, $ceng);
    echo "<b>全站抓取完毕</b><br>";

    $newtime = time();
    echo "  --- <b>用时:</b>";
    // Format the duration in UTC. The previous "date(... - 28800)" form was
    // only correct when the server timezone happened to be UTC+8.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";

    del_sites_temp($site_id);
}
/**
 * Returns the "url" column of every row produced by a query.
 *
 * @param object $db  Project database handle (query() / fetch_array()).
 * @param string $sql Complete SELECT statement that yields a "url" column.
 * @return array List of URL strings in result order.
 */
function _find_sites_urls($db, $sql)
{
    $urls = array();
    $query = $db->query($sql);
    while ($row = $db->fetch_array($query)) {
        $urls[] = $row['url'];
    }
    return $urls;
}

/**
 * Crawls a site one layer of pending URLs at a time until none are left.
 *
 * For every layer: the rows of ve123_sites_temp that belong to the site and
 * still have no_id = 0 are fetched in batches through cmi(). Each returned
 * page is scanned for links; internal links not yet known are queued in
 * ve123_sites_temp, external links (after GetSiteUrl()) are recorded in
 * ve123_sites and ve123_links, and the page itself is flagged no_id = 1.
 * Progress is echoed as HTML while the crawl runs.
 *
 * @param mixed $site_id Site whose pending URLs are crawled.
 * @param int   $ceng    Number shown for the first layer processed.
 * @return void
 */
function find_sites($site_id, $ceng)
{
    global $db;

    // Batch size handed to cmi() (original note: number of threads to open).
    $numm = 20;

    // NOTE(review): addslashes() is a stop-gap; switch to the $db driver's
    // native escaping / prepared statements once its API is confirmed.
    $site_sql = addslashes((string) $site_id);

    // One iteration per layer. This replaces the former self-recursion, whose
    // depth (and retained per-layer arrays) grew with every layer crawled.
    while (true) {
        echo "<br><b>开始抓取第" . $ceng . "层</b><br>";
        $ceng++;

        $new_url = _find_sites_urls($db, "select url from ve123_sites_temp where site_id='" . $site_sql . "' and no_id='0'");
        if (empty($new_url)) {
            // No newly added URLs were found, so the crawl is finished.
            echo "  ---------- 没有新链接了<br>";
            return;
        }

        // Split the pending URLs into batches of $numm. Iterating the chunks
        // directly avoids the old "<=" loop bound, which read one chunk past
        // the end and passed null to cmi().
        foreach (array_chunk($new_url, $numm) as $batch) {
            // cmi() is expected to return url => page source for the batch;
            // each entry is analysed and written to the database.
            $fen_url = (array) cmi($batch);

            foreach ($fen_url as $url => $file) {
                // Extract the links from the HTML and expand them relative to
                // the page URL.
                $links = _striplinks($file);
                $links = _expandlinks($links, $url);

                // Separate internal links from external links.
                $fen_link = fen_link($links, $url);

                // Internal links, de-duplicated.
                $nei_link = array_values(array_unique((array) $fen_link['nei']));

                // External links, passed through GetSiteUrl() (original note:
                // converts them to site home pages), then de-duplicated.
                $wai_link = GetSiteUrl($fen_link['wai']);
                $wai_link = array_values(array_unique((array) $wai_link));

                // Queue internal links that are not in ve123_sites_temp yet.
                $new_temp = _find_sites_urls($db, "select url from ve123_sites_temp where site_id='" . $site_sql . "'");
                foreach (array_diff($nei_link, $new_temp) as $value) {
                    $arral = array('url' => $value, 'site_id' => $site_id, 'no_id' => 0);
                    $db->insert("ve123_sites_temp", $arral);
                }

                // Record external links that are not in ve123_sites yet.
                $new_site = _find_sites_urls($db, "select url from ve123_sites where site_no='" . $site_sql . "'");
                foreach (array_diff($wai_link, $new_site) as $value) {
                    $arral = array('url' => $value, 'site_no' => $site_id);
                    $db->insert("ve123_sites", $arral);
                }

                // Record external links that are not in ve123_links yet and
                // report each one.
                $new_lik = _find_sites_urls($db, "select url from ve123_links where site_id='" . $site_sql . "'");
                foreach (array_diff($wai_link, $new_lik) as $value) {
                    $array = array('url' => $value, 'site_id' => $site_id);
                    $db->insert("ve123_links", $array);

                    // Crawled URLs are untrusted: escape them and quote the
                    // attribute. A single-byte charset makes htmlspecialchars()
                    // rewrite only & < > " ' and pass other bytes through.
                    $safe = htmlspecialchars((string) $value, ENT_QUOTES, 'ISO-8859-1');
                    echo "<font color=#C60A00><b>抓取到:</b></font>";
                    echo "<a href=\"" . $safe . "\" target=_blank>" . $safe . "</a><br>";
                }

                // Flag this page as processed.
                $arral = array('no_id' => 1);
                $db->update("ve123_sites_temp", $arral, "url='" . addslashes((string) $url) . "'");

                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();
            }

            // Also flag every URL of the batch that did not come back as a key
            // of the cmi() result (cmi() is not visible here). Left at
            // no_id = 0 it would be selected again on every following layer
            // and the crawl would never terminate.
            foreach (array_diff($batch, array_keys($fen_url)) as $skipped) {
                $db->update("ve123_sites_temp", array('no_id' => 1), "url='" . addslashes((string) $skipped) . "'");
            }
        }
    }
}
<link rel="stylesheet" href="xp.css" type="text/css">
<?php 
// Entry script: reads the "url" query parameter and starts find_sites() on it.

// Remove the execution time limit for this request.
set_time_limit(0);
//error_reporting(0);
require "global.php";

// Read the parameter defensively: a missing key used to raise an
// undefined-index notice, and an array value (?url[]=x) is not a usable URL.
$url = (isset($_GET["url"]) && is_string($_GET["url"])) ? $_GET["url"] : "";
if (empty($url)) {
    // The literal is kept byte-for-byte. NOTE(review): it looks like GBK text
    // decoded with the wrong charset, presumably "URL must not be empty!" —
    // confirm the file's intended encoding before changing it.
    echo tips("ÍøÖ·²»ÄÜΪ¿Õ!");
    die;
}

// NOTE(review): find_sites() is called with a single argument here; verify
// that the definition loaded through global.php accepts that.
find_sites($url);