$exists_spider_id = $test_exists['spider_id']; $upddate = $test_exists['upddate']; $last_modif_old = $test_exists['last_modified']; } else { $exists_spider_id = 0; } $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print); //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on. if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) { //test content-type of this page if not excluded $result_test_http = ''; if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) { $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies); } if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) { $tested_url = phpdigRewriteUrl($result_test_http['path']); $cookies = $result_test_http['cookies']; // update URI if redirect in same host... if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) { $temp_path = $tested_url['path']; $temp_file = $tested_url['file']; $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}"; mysql_query($query, $id_connect); $url_indexing = $full_url . $temp_path . $temp_file; $url_indexing = ereg_replace("[?]\$", "", $url_indexing); $url_print = $url . $temp_path . $temp_file; $url_print = ereg_replace("[?]\$", "", $url_print); }
/**
 * Fetch and parse the robots.txt of a site into a set of exclusion rules.
 *
 * The file is only read when phpdigTestUrl() reports 'PLAINTEXT' for
 * "{$site}robots.txt". A leading "https" in $site is rewritten to "http"
 * before any request is made.
 *
 * Rules are collected per user-agent while scanning. The wildcard agent "*"
 * is stored under the literal key "'*'" (with the single quotes).
 *
 * @param string $site Base URL of the site. 'robots.txt' is appended to it
 *                     verbatim, so it must end with a trailing slash.
 *
 * @return array One of:
 *               - the rule set collected for the 'phpdig' agent, if any;
 *               - otherwise the rule set collected for the wildcard agent;
 *               - otherwise an array carrying '@NONE@' => 1.
 *               A rule set maps keys to 1, where a key is '@ALL@'
 *               ("Disallow: /"), '@NONE@' (empty Disallow), or a path
 *               pattern in which '.' and '+' are backslash-escaped and '*'
 *               is turned into '.*'.
 */
function phpdigReadRobotsTxt($site)
{
    global $allowed_link_chars;

    // Start from a defined value instead of relying on auto-vivification.
    $exclude = array();

    $site = eregi_replace("^https", "http", $site);

    // NOTE(review): the ereg*/eregi* family is kept because $allowed_link_chars
    // is defined elsewhere and may not be a PCRE-safe fragment; confirm before
    // migrating these calls to preg_*.
    if (phpdigTestUrl($site . 'robots.txt') == 'PLAINTEXT') {
        @ini_set('auto_detect_line_endings', true); // needs PHP 4.3.0+

        // Reading stays best-effort: a failed fetch is treated as an empty file.
        $robots = @file($site . 'robots.txt');
        if (!is_array($robots)) {
            $robots = array();
        }

        foreach ($robots as $line) {
            $trimmed = trim($line);

            // Skip blank lines and full-line comments.
            if ($trimmed == "" || strpos($trimmed, "#") === 0) {
                continue;
            }

            // A User-agent line switches the agent the following rules apply to.
            if (ereg('^user-agent:[ ]*([a-z0-9*]+)', strtolower($line), $regs)) {
                if ($regs[1] == "*") {
                    $user_agent = "'*'";
                } else {
                    $user_agent = $regs[1];
                }
            }

            // Rules that appear before any User-agent line are ignored.
            if (!isset($user_agent)) {
                continue;
            }

            if (eregi('[[:blank:]]*disallow:[[:blank:]]*([/]?(' . $allowed_link_chars . '))', $line, $regs)) {
                // Loose comparisons are intentional: ereg() stores false, not '',
                // for an empty sub-match.
                if ($regs[1] == '/') {
                    $exclude[$user_agent]['@ALL@'] = 1;
                } elseif ($user_agent == "'*'" && $regs[1] == '') {
                    // NOTE(review): this returns immediately, so a 'phpdig'
                    // section elsewhere in the file is not given precedence
                    // here - confirm this is intended.
                    $exclude['@NONE@'] = 1;
                    return $exclude;
                } else {
                    // Turn the path into a regex-style pattern: escape '.',
                    // then '+', then expand '*' to '.*'.
                    $pattern = str_replace('.', '\\.', $regs[2]);
                    $pattern = str_replace('+', '\\+', $pattern);
                    $pattern = str_replace('*', '.*', $pattern);
                    $exclude[$user_agent][$pattern] = 1;
                }
            } elseif ($user_agent == 'phpdig' && eregi('[[:blank:]]*disallow:[[:blank:]]*', $line)) {
                // A Disallow line for phpdig that the rule pattern above did
                // not match: nothing is excluded for phpdig.
                $exclude[$user_agent]['@NONE@'] = 1;
                return $exclude[$user_agent];
            }
        }

        // Rules addressed to phpdig win over the wildcard rules.
        if (isset($exclude['phpdig']) && is_array($exclude['phpdig'])) {
            return $exclude['phpdig'];
        } elseif (isset($exclude['\'*\'']) && is_array($exclude['\'*\''])) {
            return $exclude['\'*\''];
        }
    }

    // No usable robots.txt, or no rules for phpdig / wildcard: exclude nothing.
    $exclude['@NONE@'] = 1;
    return $exclude;
}