settype($test_exists['last_modified'], 'string'); $exists_spider_id = $test_exists['spider_id']; $upddate = $test_exists['upddate']; $last_modif_old = $test_exists['last_modified']; } else { $exists_spider_id = 0; } $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print); //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on. if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) { //test content-type of this page if not excluded $result_test_http = ''; if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) { $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies); } if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) { $tested_url = phpdigRewriteUrl($result_test_http['path']); $cookies = $result_test_http['cookies']; // update URI if redirect in same host... if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) { $temp_path = $tested_url['path']; $temp_file = $tested_url['file']; $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}"; mysql_query($query, $id_connect); $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print);
/**
 * Decide whether a link may be followed and whether it designates a directory.
 *
 * The link is rejected outright when phpdigReadRobots() reports it as excluded
 * or when $exclude carries the '@ALL@' key. Otherwise, if the target has no
 * query string and does not end in a dot followed by 1-4 alphanumerics, the
 * URL is probed with a trailing slash; when that probe reports status "HTML"
 * the link is rewritten as a directory (path = path + file + '/', file = '').
 * In every other case the URL is probed as-is and accepted unless the probe
 * status is NOHOST, NOFILE, LOOP or NEWHOST.
 *
 * @param array  $link       Link description; the keys 'url', 'path' and 'file' are read.
 * @param mixed  $exclude    Exclusion data handed to phpdigReadRobots(); the presence
 *                           of a '@ALL@' key rejects every link.
 * @param array  $cookies    Passed through to phpdigTestUrl().
 * @param mixed  $site_id    When numeric and LIMIT_TO_DIRECTORY is truthy, the
 *                           "includes" table is consulted for this site.
 * @param mixed  $id_connect Database handle passed to phpdigMySelect().
 *
 * @return array The $link array with 'ok' set to 1 or 0. When the link is not ok
 *               and a probe was performed, 'status', 'host', 'path' and 'cookies'
 *               are overwritten with the values from the probe result.
 */
function phpdigDetectDir($link, $exclude = '', $cookies = array(), $site_id = '', $id_connect = '')
{
    // Path + file is used by every check below; build it once.
    $target = $link['path'] . $link['file'];
    $test = parse_url($target);

    // test the exclude with robots.txt
    if (phpdigReadRobots($exclude, $target) == 1 || isset($exclude['@ALL@'])) {
        $link['ok'] = 0;
    } elseif (!isset($test['query'])
        // preg_match replaces eregi() (removed in PHP 7). The D modifier keeps
        // POSIX semantics: "$" matches only at the very end of the subject.
        && !preg_match('/[.][a-z0-9]{1,4}$/Di', $target)
        && ($status = phpdigTestUrl($link['url'] . $target . '/', 'date', $cookies))
        && isset($status['status'])
        && $status['status'] == "HTML"
    ) {
        // The trailing-slash probe succeeded: treat the link as a directory and
        // collapse any run of trailing slashes into a single one.
        $link['path'] = preg_replace('#/+$#D', '/', $target . '/');
        if ($link['path'] == '/') {
            $link['path'] = '';
        }
        $link['file'] = "";
        $link['ok'] = 1;
    } else {
        // Probe the URL exactly as given. This also overwrites any $status left
        // behind by a directory probe that did not report "HTML".
        $status = phpdigTestUrl($link['url'] . $target, 'date', $cookies);
        if (!in_array($status['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) {
            $link['ok'] = 1;
        } else {
            $link['ok'] = 0;
        }
    }

    if (is_numeric($site_id) && LIMIT_TO_DIRECTORY) {
        $query = "SELECT DISTINCT in_id, in_path FROM " . PHPDIG_DB_PREFIX . "includes WHERE in_site_id='{$site_id}'";
        if (is_array($list_include = phpdigMySelect($id_connect, $query))) {
            // NOTE(review): the link is rejected as soon as its path differs from
            // ANY include row, so with several rows it must equal all of them.
            // Confirm this is intended rather than "must match at least one".
            foreach ($list_include as $add_include) {
                if ($link['path'] != $add_include['in_path']) {
                    $link['ok'] = 0;
                }
            }
        }
    }

    // On rejection, report what the probe saw (only if a probe actually ran;
    // a robots.txt rejection leaves $status unset).
    if (!$link['ok'] && isset($status)) {
        $link['status'] = $status['status'];
        $link['host'] = $status['host'];
        $link['path'] = $status['path'];
        $link['cookies'] = $status['cookies'];
    }

    return $link;
}