예제 #1
0
 } else {
     $exists_spider_id = 0;
 }
 $url_indexing = $full_url . $temp_path . $temp_file;
 $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
 $url_print = $url . $temp_path . $temp_file;
 $url_print = ereg_replace("[?]\$", "", $url_print);
 //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on.
 if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) {
     //test content-type of this page if not excluded
     $result_test_http = '';
     if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) {
         $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies);
     }
     if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) {
         $tested_url = phpdigRewriteUrl($result_test_http['path']);
         $cookies = $result_test_http['cookies'];
         // update URI if redirect in same host...
         if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) {
             $temp_path = $tested_url['path'];
             $temp_file = $tested_url['file'];
             $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}";
             mysql_query($query, $id_connect);
             $url_indexing = $full_url . $temp_path . $temp_file;
             $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
             $url_print = $url . $temp_path . $temp_file;
             $url_print = ereg_replace("[?]\$", "", $url_print);
         }
         // set user-agent and cookies
         phpdigSetHeaders($cookies, $temp_path);
         $last_modified = $result_test_http['lm_date'];
/**
 * Extract the links referenced by a downloaded page.
 *
 * Scans $tempfile line by line for href attributes, <frame src>, meta
 * refresh targets and window.location / window.open calls, and passes each
 * target through phpdigRewriteUrl(). When the page <head> holds a
 * <base href="...">, its directory replaces $path for relative links.
 *
 * NOTE(review): this still relies on the POSIX ereg*() family because the
 * global $allowed_link_chars is a POSIX bracket expression defined outside
 * this function; porting to preg_*() has to be done together with it.
 *
 * @param string $tempfile Local file that holds the page content.
 * @param string $url      Site root, compared against 'http://host/' and
 *                         'https://host/' to detect links to another host.
 * @param string $path     Directory prepended to relative links.
 * @param string $file     File name of the page; its query string is cut
 *                         off before a "?query"-only link is appended.
 *
 * @return mixed -1 when $tempfile is missing or cannot be read; '' (empty
 *               string) when no link was found; otherwise a 0-based list
 *               of the arrays produced by phpdigRewriteUrl() (presumably
 *               'path' and 'file' keys - confirm against that helper),
 *               each extended with 'the_http_scheme' ('http' or 'https')
 *               and, for links to a different host, 'newhost' ('host/').
 */
function phpdigExplore($tempfile, $url, $path = "", $file = "")
{
    global $allowed_link_chars;

    if (!is_file($tempfile)) {
        return -1;
    }

    // file() gives false for an unreadable file: stop here instead of
    // handing that value to implode() and the regex calls below.
    $file_content = @file($tempfile);
    if (!is_array($file_content)) {
        return -1;
    }

    // A <base href> in the head changes the directory relative links
    // resolve against.
    $my_file_base_content = implode("", $file_content);
    if (eregi("<head>(.*)</head>", $my_file_base_content, $base_regs1)) {
        $head_content = $base_regs1[1];
        if (eregi("<base href[[:space:]]*=[[:space:]]*['\"]*([a-z]{3,5}://[.a-z0-9-]+[^'\"]*)['\"]*[[:space:]]*[/]?>", $head_content, $base_regs2)) {
            $base_parts = parse_url($base_regs2[1]);
            if (!isset($base_parts["path"]) || $base_parts["path"] == "/") {
                $path = "";
            } else {
                $base_path = eregi_replace("^/", "", $base_parts["path"]);
                if (eregi("/\$", $base_path)) {
                    // already a directory
                    $path = $base_path;
                } else {
                    // points at a file: keep its directory only
                    $path = dirname($base_path) . "/";
                }
            }
        }
    }

    // Capture groups used below:
    //   4 = scheme including the separator, e.g. "https://"
    //   5 = host name with optional ":port"
    //   8 = path/file (may start with "/" or "?")
    // Built once, the pattern does not change between lines.
    $link_pattern = "(<frame[^>]*src[[:blank:]]*=|href[[:blank:]]*=|http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;[[:blank:]]*url[[:blank:]]*=|window[.]location[[:blank:]]*=|window[.]open[[:blank:]]*[(])[[:blank:]]*[\\'\"]?((([a-z]{3,5}://)+(([.a-zA-Z0-9-])+(:[0-9]+)*))*({$allowed_link_chars}\\[?{$allowed_link_chars}\\]?{$allowed_link_chars}))(#[.a-zA-Z0-9-]*)?[\\'\" ]?";

    $links = array();
    foreach ($file_content as $eval) {
        //search hrefs and frames src
        while (eregi($link_pattern, $eval, $regs)) {
            // remove the match so the next pass finds the following link
            $eval = str_replace($regs[0], "", $eval);

            // optional groups that did not take part in the match are
            // normalised to an empty string
            $scheme = isset($regs[4]) ? (string) $regs[4] : '';
            $host = isset($regs[5]) ? (string) $regs[5] : '';
            $target = isset($regs[8]) ? (string) $regs[8] : '';

            if ($host != "" && $target == "") {
                // bare "scheme://host" link: root of that host
                $link = array("path" => "", "file" => "");
            } elseif (substr($target, 0, 1) == "/") {
                // absolute path
                $link = phpdigRewriteUrl($target);
            } elseif (substr($target, 0, 1) == "?") {
                // path/file is a query string - cut it from base file
                $link = phpdigRewriteUrl($path . preg_replace('#\\?.*#', '', $file) . $target);
            } else {
                // relative to the current (or <base>) directory
                $link = phpdigRewriteUrl($path . $target);
            }

            // phpdigRewriteUrl() may reject the target; skip it then
            if (!is_array($link)) {
                continue;
            }

            // host differs from the spidered site
            if ($host != "" && $url != 'http://' . $host . '/' && $url != 'https://' . $host . '/') {
                $link['newhost'] = $host . '/';
            }

            // Group 4 carries the "://" separator, so it has to be compared
            // with "https://"; eregi() is case-insensitive, hence strtolower().
            if (strtolower($scheme) == "https://") {
                $http_scheme = "https";
            } else {
                $http_scheme = "http";
            }
            $links[] = array_merge($link, array("the_http_scheme" => $http_scheme));
        }
    }

    // Callers historically receive '' when the page contains no link.
    if (count($links) == 0) {
        return '';
    }
    return $links;
}