} else { $exists_spider_id = 0; } $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print); //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on. if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) { //test content-type of this page if not excluded $result_test_http = ''; if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) { $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies); } if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) { $tested_url = phpdigRewriteUrl($result_test_http['path']); $cookies = $result_test_http['cookies']; // update URI if redirect in same host... if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) { $temp_path = $tested_url['path']; $temp_file = $tested_url['file']; $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}"; mysql_query($query, $id_connect); $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print); } // set user-agent and cookies phpdigSetHeaders($cookies, $temp_path); $last_modified = $result_test_http['lm_date'];
/**
 * Collect the links found in a locally stored copy of a page.
 *
 * The file is scanned line by line for <frame ... src=>, href=, meta refresh
 * "url=", window.location= and window.open( targets. Every target is handed
 * to phpdigRewriteUrl(); when that returns an array, the array is kept and
 * completed with:
 *   - 'the_http_scheme': 'https' when the link itself starts with https://,
 *     'http' otherwise;
 *   - 'newhost': '<host>/' when the link names a host and $url is neither
 *     'http://<host>/' nor 'https://<host>/'.
 *
 * A <base href="..."> found inside <head> replaces $path as the prefix used
 * for relative links.
 *
 * NOTE(review): the ereg*() functions used here were deprecated in PHP 5.3
 * and removed in PHP 7.0. The patterns are deliberately left in POSIX form
 * because $allowed_link_chars is a pattern fragment defined outside this
 * function; porting to preg_*() needs that fragment audited first.
 *
 * @param string $tempfile Local file holding the page content.
 * @param string $url      Site URL the found hosts are compared against.
 * @param string $path     Prefix prepended to relative link targets.
 * @param string $file     Page file name; its query string (if any) is
 *                         stripped before a "?..." target is appended.
 *
 * @return array|string|int List of link arrays (indexed from 0); '' when no
 *                          link was collected (kept for backward
 *                          compatibility); -1 when $tempfile is not a file or
 *                          cannot be read.
 */
function phpdigExplore($tempfile, $url, $path = "", $file = "")
{
    global $allowed_link_chars;

    if (!is_file($tempfile)) {
        return -1;
    }

    // file() yields false on failure; the warning is suppressed on purpose and
    // the failure is reported through the -1 return value instead. Checking
    // here keeps a false value away from implode() below.
    $file_content = @file($tempfile);
    if (!is_array($file_content)) {
        return -1;
    }
    if (count($file_content) == 0) {
        // Empty page: there is nothing to scan.
        return '';
    }

    // A <base href> in the document head overrides the directory used to
    // resolve relative links.
    $my_file_base_content = implode("", $file_content);
    if (eregi("<head>(.*)</head>", $my_file_base_content, $base_regs1)) {
        $head_content = $base_regs1[1];
        if (eregi("<base href[[:space:]]*=[[:space:]]*['\"]*([a-z]{3,5}://[.a-z0-9-]+[^'\"]*)['\"]*[[:space:]]*[/]?>", $head_content, $base_regs2)) {
            $new_base_path = parse_url($base_regs2[1]);
            if (!isset($new_base_path["path"]) || $new_base_path["path"] == "/") {
                $path = "";
            } else {
                $new_base_path = eregi_replace("^/", "", $new_base_path["path"]);
                if (eregi("/\$", $new_base_path)) {
                    // base already designates a directory
                    $path = $new_base_path;
                } else {
                    // base designates a file: keep its directory only
                    $path = dirname($new_base_path) . "/";
                }
            }
        }
    }

    // Sub-matches used below: [4] scheme including "://", [5] host (with
    // optional port), [8] path/file part. The pattern does not change between
    // iterations, so it is built once.
    $link_pattern = "(<frame[^>]*src[[:blank:]]*=|href[[:blank:]]*="
        . "|http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;[[:blank:]]*url[[:blank:]]*="
        . "|window[.]location[[:blank:]]*=|window[.]open[[:blank:]]*[(])"
        . "[[:blank:]]*[\\'\"]?"
        . "((([a-z]{3,5}://)+(([.a-zA-Z0-9-])+(:[0-9]+)*))*"
        . "({$allowed_link_chars}\\[?{$allowed_link_chars}\\]?{$allowed_link_chars}))"
        . "(#[.a-zA-Z0-9-]*)?[\\'\" ]?";

    // Must be a real array: writing $links[$i] = array(...) into '' is a
    // string-offset write since PHP 7.1.
    $links = array();
    $index = 0;

    foreach ($file_content as $eval) {
        //search hrefs and frames src
        while (eregi($link_pattern, $eval, $regs)) {
            // Remove the matched text so the next pass finds the next link
            // of the same line.
            $eval = str_replace($regs[0], "", $eval);

            // Sub-matches that did not take part in the match are normalised
            // to ''.
            $scheme = isset($regs[4]) ? (string) $regs[4] : ''; // the scheme, e.g. "https://"
            $host = isset($regs[5]) ? (string) $regs[5] : '';   // domain name
            $target = isset($regs[8]) ? (string) $regs[8] : ''; // path/file

            if ($host !== '' && $target === '') {
                // bare host: points at the root of that host
                $link = array("path" => "", "file" => "");
            } elseif (substr($target, 0, 1) == "/") {
                $link = phpdigRewriteUrl($target);
            } elseif (substr($target, 0, 1) == "?") {
                // path/file is a query string - cut it from base file
                $link = phpdigRewriteUrl($path . preg_replace('#\\?.*#', '', $file) . $target);
            } else {
                $link = phpdigRewriteUrl($path . $target);
            }

            if (!is_array($link)) {
                // rejected by phpdigRewriteUrl(): nothing is stored
                continue;
            }

            //test no host or same than site
            if ($host !== '' && $url != 'http://' . $host . '/' && $url != 'https://' . $host . '/') {
                $link['newhost'] = $host . '/';
            }

            // The captured scheme carries its "://" suffix and may be in any
            // letter case (the match is case-insensitive), so it is
            // normalised before being compared.
            $is_https = (rtrim(strtolower($scheme), ':/') == 'https');
            $links[$index] = array_merge(
                $link,
                array("the_http_scheme" => $is_https ? "https" : "http")
            );
            $index++;
        }
    }

    // Historical contract: an empty string, not an empty array, when no link
    // was collected.
    return $index > 0 ? $links : '';
}