Esempio n. 1
0
/**
 * Crawl every pending URL of a site, one layer at a time.
 *
 * Each call handles one layer: it reads all rows of ve123_links_temp for
 * $site_id that still have no_id='0', hands them to cmi() in batches, stores
 * the extracted page data in ve123_links, queues newly discovered links in
 * ve123_links_temp / ve123_links, marks each processed URL with no_id=1 and
 * then calls itself for the next layer. The recursion stops as soon as no
 * pending row is left.
 *
 * Progress is echoed as HTML and flushed after every processed page.
 *
 * NOTE(review): a URL that cmi() does not return is never marked no_id=1 here,
 * so it would be selected again by every following layer. Confirm that cmi()
 * returns an entry for every URL it is given.
 *
 * @param mixed  $site_id          site_id used to select/insert link rows
 * @param int    $ceng             layer number shown in the progress output
 * @param string $include_word     forwarded to check_include()
 * @param string $not_include_word forwarded to check_include()
 * @return void
 */
function all_links_duo($site_id, $ceng, $include_word, $not_include_word)
{
    global $db;
    // Passed to check_wai(): 1 = internal links only, 2 = external links, empty = all.
    $nei = 1;
    // Number of URLs handed to cmi() per batch.
    $numm = 20;
    echo "<br><b>开始抓取第" . $ceng . "层</b><br>";
    $ceng++;

    // Collect every URL of this site that has not been processed yet.
    $new_url = array();
    $query = $db->query("select * from ve123_links_temp where site_id='" . $site_id . "' and no_id='0'");
    while ($row = $db->fetch_array($query)) {
        $new_url[] = $row['url'];
    }
    // Nothing pending: this is the end of the crawl.
    if (empty($new_url)) {
        echo "  ---------- 没有新链接了<br>";
        return;
    }

    // Iterate over the chunks directly so that no out-of-range chunk is requested.
    foreach (array_chunk($new_url, $numm) as $batch) {
        // cmi() is expected to return a map of url => fetched HTML source
        // (per the original comments); anything else is treated as empty.
        $fen_url = cmi($batch);
        foreach ((array) $fen_url as $url => $file) {
            // Extract the links from the HTML, make them absolute, filter them
            // through check_wai() and drop duplicates.
            $links = _striplinks($file);
            $links = _expandlinks($links, $url);
            $links = check_wai($links, $nei, $url);
            $links = array_values(array_unique($links));

            // Detect the page encoding and convert the source
            // (to gb2312 according to the original comment).
            $bianma = bianma($file);
            $file = Convert_File($file, $bianma);

            // Pull title, description, keywords and full text out of the page.
            $loy = clean_lry($file, $url, "html");
            $title = $loy["title"];
            // Size of the converted source in KB, rounded, without separators.
            $pagesize = number_format(strlen($file) / 1024, 0, ".", "");
            $fulltxt = Html2Text($loy["fulltext"]);
            $description = $loy["description"];
            $keywords = $loy["keywords"];
            $lrymd5 = md5($fulltxt);
            $updatetime = time();
            // Fall back to the beginning of the text when the page has no title.
            if ($title == "") {
                $title = str_cut($fulltxt, 65);
            }

            // Update the stored page content for this URL.
            // NOTE(review): $url comes from crawled data and is interpolated into
            // the WHERE clause unescaped here and in the queries below; switch to
            // the $db layer's escaping/parameter binding once its API is confirmed.
            $page_fields = array('lrymd5' => $lrymd5, 'title' => $title, 'fulltxt' => $fulltxt, 'description' => $description, 'keywords' => $keywords, 'pagesize' => $pagesize, 'updatetime' => $updatetime);
            $db->update("ve123_links", $page_fields, "url='" . $url . "'");
            $all_num = count($links);
            $domain = getdomain($url);

            // Queue links that are not yet in ve123_links_temp for this domain.
            $temp_links = array();
            $query = $db->query("select url from ve123_links_temp where url like '%" . $domain . "%'");
            while ($row = $db->fetch_array($query)) {
                $temp_links[] = rtrim($row['url'], "/");
            }
            foreach (array_diff($links, $temp_links) as $value) {
                if (check_include($value, $include_word, $not_include_word)) {
                    $db->insert("ve123_links_temp", array('url' => $value, 'site_id' => $site_id));
                }
            }

            // Store links that are not yet in ve123_links for this domain and
            // remember them so the number of newly added links can be reported.
            $new_links = array();
            $query = $db->query("select url from ve123_links where url like '%" . $domain . "%'");
            while ($row = $db->fetch_array($query)) {
                $new_links[] = rtrim($row['url'], "/");
            }
            $cha_links_num = array();
            foreach (array_diff($links, $new_links) as $value) {
                if (check_include($value, $include_word, $not_include_word)) {
                    $db->insert("ve123_links", array('url' => $value, 'site_id' => $site_id, 'level' => '1'));
                    $cha_links_num[] = $value;
                }
            }

            $cha_num = count($cha_links_num);
            // Kept as a variable so the call works whether or not the third
            // parameter of printLinksReport() is taken by reference.
            $cl = 0;
            printLinksReport($cha_num, $all_num, $cl);
            echo "<a href=" . $url . " target=_blank>" . $url . "</a><br>";

            // Mark this URL as processed so the next layer skips it.
            $db->update("ve123_links_temp", array('no_id' => 1), "url='{$url}'");
            ob_flush();
            flush();
        }
    }

    // Process the links discovered in this layer.
    all_links_duo($site_id, $ceng, $include_word, $not_include_word);
}
Esempio n. 2
0
/**
 * Crawl one configured site starting from its start URL.
 *
 * Loads the row of ve123_sites identified by $site_id, registers its URL via
 * Updan_link() and then crawls according to the row's spider_depth:
 *   -1  : whole site, via all_links_duo()
 *    1  : only the start page; when fpr is "1" the URL must contain the
 *         placeholder {page}, which is replaced by every page number from
 *         pagestart to pagestop in steps of pageadd
 *   >1  : that many layers, calling all_url_dan() for every pending URL in
 *         ve123_links_temp
 * Afterwards del_links_temp() is called for the site and the elapsed time is
 * printed. Progress is echoed as HTML.
 *
 * @param mixed $site_id site_id of the row in ve123_sites
 * @return false|null false when the site row is missing or a multi-page URL
 *                    lacks {page}; otherwise no value is returned
 */
function add_in_site_link($site_id)
{
    global $db;
    echo "<b>起始网址:</b>";
    $oldtime = time();
    // Emit padding and flush right away; presumably this gets past output
    // buffering so progress shows up while the crawl is still running.
    print str_repeat(" ", 4096);
    ob_flush();
    flush();

    $site = $db->get_one("select * from ve123_sites where site_id='" . $site_id . "'");
    // Without a site row there is no start URL to crawl.
    if (empty($site)) {
        echo ":( 抓取失败!未找到 site_id=" . $site_id . " 对应的站点<br>";
        return false;
    }
    $url = $site["url"];
    $fpr = $site["fpr"];
    $pagestart = $site["pagestart"];
    $pagestop = $site["pagestop"];
    $pageadd = $site["pageadd"];
    $include_word = $site["include_word"];
    $not_include_word = $site["not_include_word"];
    $spider_depth = (int) $site["spider_depth"];

    // Register the start URL (per the original comment, in ve123_links and ve123_links_temp).
    Updan_link($url, $site_id);

    // Depth -1: crawl the whole site in batches.
    if ($spider_depth === -1) {
        echo "<b>收录全站</b>";
        echo "原始页=" . $url . " - - <首层 id=" . $site_id . "> - - <包含字段=" . $include_word . "> - - <不包含字段=" . $not_include_word . ">";
        all_links_duo($site_id, 1, $include_word, $not_include_word);
        echo "<b>全站收录完毕</b><br>";
    }

    // Depth 1: crawl only the start page, or the numbered pages of a paginated URL.
    if ($spider_depth === 1) {
        if ($fpr == "1") {
            if (strpos($url, "{page}") === false) {
                echo ":( 抓取失败!您如果选择的是多页抓取 ,抓取的网址就必须含有{page} {page}作为你翻页的变量";
                return false;
            }
            // A step below 1 would never reach $pagestop, so fall back to 1.
            $step = (int) $pageadd;
            if ($step < 1) {
                $step = 1;
            }
            for ($s = $pagestart; $s <= $pagestop; $s = $s + $step) {
                $urlgo = str_replace("{page}", $s, $url);
                echo "<br>";
                echo "-------------------------------------------------------第 ";
                echo $s;
                echo " 页-------------------------------------------------------";
                echo "<br>";
                echo $urlgo;
                echo "<br>";
                all_url_dan($urlgo, $url, 1, 0, $site_id, $include_word, $not_include_word);
            }
        } else {
            all_url_dan($url, $url, 1, 0, $site_id, $include_word, $not_include_word);
        }
    }

    // Depth > 1: crawl layer by layer until the depth is used up or nothing is pending.
    if ($spider_depth > 1) {
        $ceng = 1;
        $lry = 1;
        for ($i = $spider_depth; $i > 0; $i--) {
            echo "<br><b>开始抓取第" . $ceng . "层</b>";
            $ceng++;
            // Read the pending URLs completely before crawling, so the result set
            // is no longer being iterated while all_url_dan() and the update
            // below run their own queries.
            $pending = array();
            $query = $db->query("select * from ve123_links_temp where site_id='" . $site_id . "' and no_id='0'");
            while ($row = $db->fetch_array($query)) {
                $pending[] = $row["url"];
            }
            if (empty($pending)) {
                echo "  ---------- 没有新链接了<br>";
                break;
            }
            foreach ($pending as $page_url) {
                echo "<br><font color='#aaaaaa'>" . $lry . ".</font>";
                $lry++;
                all_url_dan($page_url, $page_url, 1, 1, $site_id, $include_word, $not_include_word);
                // Mark the URL as processed so the next layer skips it.
                // NOTE(review): $page_url is interpolated into SQL unescaped; use the
                // $db layer's escaping/parameter binding once its API is confirmed.
                $db->update("ve123_links_temp", array('no_id' => 1), "url='{$page_url}'");
            }
        }
    }

    // Remove this site's temporary link rows.
    del_links_temp($site_id);
    echo "<b>起始网址抓取完成</b>";
    $newtime = time();
    echo "  --- <b>用时:</b>";
    // gmdate() formats the duration independently of the server timezone
    // (the former fixed 28800 s correction was only right for UTC+8).
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";
}