Ejemplo n.º 1
/**
 * Analyse the links of one page and record the ones that are not yet known.
 *
 * Steps, in order:
 *  1. Load $url through cmi(), extract and expand its links, and split them
 *     with fen_link() into internal ('nei') and external ('wai') links.
 *  2. Insert internal links missing from ve123_sites_temp.
 *  3. Insert external links missing from ve123_sites.
 *  4. Load the remaining external links in batches and insert one row per
 *     loaded page into ve123_links.
 *
 * Progress and the elapsed time are echoed as HTML.
 *
 * @param string $url Address of the page whose links are analysed.
 * @return void
 */
function find_sites_($url)
{
    global $db;

    $oldtime = time();
    $site_id = -1; // placeholder id written to every row created here
    $numm = 10;    // number of URLs handed to cmi() per batch

    // cmi() returns an array keyed by URL; presumably the downloaded page
    // bodies -- TODO confirm against cmi()'s definition.
    $lrp = cmi($url);
    $links = _striplinks($lrp[$url]);
    $links = _expandlinks($links, $url);
    $fen_link = fen_link($links, $url);

    // Internal links, duplicates removed.
    $nei_link = array_values(array_unique($fen_link['nei']));
    // External links, duplicates removed.
    $wai_link = array_values(array_unique(GetSiteUrl($fen_link['wai'])));
    echo "<br><br>";

    // Read every url in ve123_sites_temp with site_id=-1 and no_id=0 ...
    $new_temp = array();
    $query = $db->query("select url from ve123_sites_temp where site_id='-1' and no_id='0'");
    while ($row = $db->fetch_array($query)) {
        $new_temp[] = $row['url'];
    }
    // ... and insert the internal links that are not among them.
    foreach (array_diff($nei_link, $new_temp) as $value) {
        $arral = array('url' => $value, 'site_id' => $site_id, 'no_id' => 0);
        $db->insert("ve123_sites_temp", $arral);
    }

    // Read every url in ve123_sites with site_no=-1 ...
    $new_site = array();
    $query = $db->query("select url from ve123_sites where site_no='-1'");
    while ($row = $db->fetch_array($query)) {
        $new_site[] = $row['url'];
    }
    // ... and insert the external links that are not among them.
    foreach (array_diff($wai_link, $new_site) as $value) {
        $arral = array('url' => $value, 'site_no' => $site_id);
        $db->insert("ve123_sites", $arral);
    }

    // NOTE(review): the rows built from this difference go into ve123_links,
    // yet the known URLs are read from ve123_sites (filtered on site_id, not
    // site_no). The query is kept exactly as it was -- confirm which table
    // was intended.
    $new_lik = array();
    $query = $db->query("select url from ve123_sites where site_id='-1'");
    while ($row = $db->fetch_array($query)) {
        $new_lik[] = $row['url'];
    }
    $cha_lik = array_diff($wai_link, $new_lik);

    // Process the remaining links in batches of $numm. Iterating the chunks
    // directly never touches a non-existent chunk, including when the
    // difference is empty.
    foreach (array_chunk($cha_lik, $numm) as $batch) {
        $fen_url = cmi($batch);
        foreach ((array) $fen_url as $page_url => $file) {
            $bianma = bianma($file);
            $file = Convert_File($file, $bianma);
            // Extract title, description, keywords and full text.
            $loy = clean_lry($file, $page_url, "html");
            $title = $loy["title"];
            $pagesize = number_format(strlen($file) / 1024, 0, ".", ""); // size in KiB
            $fulltxt = Html2Text($loy["fulltext"]);
            $description = $loy["description"];
            $keywords = $loy["keywords"];
            $lrymd5 = md5($fulltxt);
            $updatetime = time();
            if ($title == "") {
                // No title: fall back to the start of the page text.
                $title = str_cut($fulltxt, 65);
            }
            // Store the URL of the page being processed, not the leftover
            // loop variable of the preceding insert loop.
            $array = array('url' => $page_url, 'lrymd5' => $lrymd5, 'title' => $title, 'fulltxt' => $fulltxt, 'description' => $description, 'keywords' => $keywords, 'pagesize' => $pagesize, 'updatetime' => $updatetime);
            $db->insert("ve123_links", $array);
            // NOTE(review): title and URL come from crawled pages and are
            // echoed without HTML escaping; the right escaping depends on
            // the output charset, which is not visible here.
            echo "<font color=#C60A00><b>抓取到:</b></font>" . $title;
            echo "<a href=" . $page_url . " target=_blank>" . $page_url . "</a><br>";
        }
    }

    $newtime = time();
    echo "  --- <b>用时:</b>";
    // gmdate() renders the elapsed seconds as H:i:s in any server time zone.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";
}
Ejemplo n.º 2
/**
 * Crawl the start URL of one ve123_sites row according to its settings.
 *
 * The row's spider_depth selects the mode:
 *  - -1: whole site, delegated to all_links_duo();
 *  -  1: the start URL only, or, when fpr is "1", every page obtained by
 *        substituting pagestart..pagestop (step pageadd) for "{page}";
 *  - >1: up to spider_depth layers, each processing the ve123_links_temp
 *        rows of this site that still have no_id=0.
 *
 * Progress and the elapsed time are echoed as HTML.
 *
 * @param int|string $site_id Value of ve123_sites.site_id to crawl.
 * @return false|null false when the site row is missing or a paged crawl is
 *                    requested for a URL without "{page}"; otherwise nothing.
 */
function add_in_site_link($site_id)
{
    global $db;

    echo "<b>起始网址:</b>";
    $oldtime = time();
    // 4 KiB of padding, presumably to push the first output past browser or
    // proxy buffering -- TODO confirm.
    print str_repeat(" ", 4096);

    // Only ever embed the id in SQL as an integer literal.
    $site_id_sql = (int) $site_id;

    $site = $db->get_one("select * from ve123_sites where site_id='" . $site_id_sql . "'");
    if (empty($site)) {
        // Without a site row there is no start URL to crawl.
        echo ":( 抓取失败!未找到 site_id=" . $site_id_sql . " 的站点<br>";
        return false;
    }
    $url = $site["url"];
    $fpr = $site["fpr"];
    $pagestart = $site["pagestart"];
    $pagestop = $site["pagestop"];
    $pageadd = $site["pageadd"];
    $include_word = $site["include_word"];
    $not_include_word = $site["not_include_word"];
    $spider_depth = (int) $site["spider_depth"];

    // Register the start URL itself (see Updan_link()).
    Updan_link($url, $site_id);

    if ($spider_depth == -1) {
        echo "<b>收录全站</b>";
        echo "原始页=" . $url . " - - <首层 id=" . $site_id . "> - - <包含字段=" . $include_word . "> - - <不包含字段=" . $not_include_word . ">";
        $ceng = 1;
        $lry = 1;
        all_links_duo($site_id, $ceng, $include_word, $not_include_word, $lry);
        echo "<b>全站收录完毕</b><br>";
    }

    if ($spider_depth == 1) {
        if ($fpr == "1") {
            if (strpos($url, "{page}") === false) {
                echo ":( 抓取失败!您如果选择的是多页抓取 ,抓取的网址就必须含有{page} {page}作为你翻页的变量";
                return false;
            }
            // A step of zero or less would never reach $pagestop.
            $step = (int) $pageadd;
            if ($step < 1) {
                $step = 1;
            }
            for ($s = $pagestart; $s <= $pagestop; $s = $s + $step) {
                $urlgo = str_replace("{page}", $s, $url);
                echo "<br>";
                echo "-------------------------------------------------------第 ";
                echo $s;
                echo " 页-------------------------------------------------------";
                echo "<br>";
                echo $urlgo;
                echo "<br>";
                all_url_dan($urlgo, $url, 1, 0, $site_id, $include_word, $not_include_word);
            }
        } else {
            all_url_dan($url, $url, 1, 0, $site_id, $include_word, $not_include_word);
        }
    }

    if ($spider_depth > 1) {
        $lry = 1; // running number of the link being processed
        for ($ceng = 1; $ceng <= $spider_depth; $ceng++) {
            echo "<br><b>开始抓取第" . $ceng . "层</b>";
            $roo = $db->get_one("select * from ve123_links_temp where site_id='" . $site_id_sql . "' and no_id='0'");
            if (empty($roo)) {
                echo "  ---------- 没有新链接了<br>";
                // Nothing is pending, so deeper layers have nothing to do.
                break;
            }
            $query = $db->query("select * from ve123_links_temp where site_id='" . $site_id_sql . "' and no_id='0'");
            while ($row = $db->fetch_array($query)) {
                $url = $row["url"];
                echo "<br><font color='#aaaaaa'>" . $lry . ".</font>";
                $lry++;
                all_url_dan($url, $url, 1, 1, $site_id, $include_word, $not_include_word);
                // Mark the link as processed. The URL comes from crawled
                // pages, so it is escaped before being placed in the WHERE
                // clause. NOTE(review): addslashes() assumes a backend with
                // MySQL-style backslash escaping; switch to the $db wrapper's
                // own escaping routine if it has one.
                $arral = array('no_id' => 1);
                $db->update("ve123_links_temp", $arral, "url='" . addslashes($url) . "'");
            }
        }
    }

    echo "<b>起始网址抓取完成</b>";
    $newtime = time();
    echo "  --- <b>用时:</b>";
    // gmdate() renders the elapsed seconds as H:i:s in any server time zone.
    echo gmdate("H:i:s", $newtime - $oldtime);
    echo "<br>";
}