Example #1
0
     $exists_spider_id = $test_exists['spider_id'];
     $upddate = $test_exists['upddate'];
     $last_modif_old = $test_exists['last_modified'];
 } else {
     $exists_spider_id = 0;
 }
 $url_indexing = $full_url . $temp_path . $temp_file;
 $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
 $url_print = $url . $temp_path . $temp_file;
 $url_print = ereg_replace("[?]\$", "", $url_print);
 //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on.
 if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) {
     //test content-type of this page if not excluded
     $result_test_http = '';
     if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) {
         $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies);
     }
     if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) {
         $tested_url = phpdigRewriteUrl($result_test_http['path']);
         $cookies = $result_test_http['cookies'];
         // update URI if redirect in same host...
         if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) {
             $temp_path = $tested_url['path'];
             $temp_file = $tested_url['file'];
             $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}";
             mysql_query($query, $id_connect);
             $url_indexing = $full_url . $temp_path . $temp_file;
             $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
             $url_print = $url . $temp_path . $temp_file;
             $url_print = ereg_replace("[?]\$", "", $url_print);
         }
/**
 * Fetch a site's robots.txt and extract the exclusion rules that apply to
 * this crawler.
 *
 * A group addressed to the "phpdig" user agent takes precedence over the
 * wildcard ("*") group, wherever either group appears in the file.
 *
 * @param string $site Base URL of the site. It must end with a slash, because
 *                     'robots.txt' is appended to it as is. A leading "https"
 *                     is rewritten to "http" before the file is requested.
 *
 * @return array The rule map of the selected group (keys are path patterns,
 *               '@ALL@' or '@NONE@'; values are 1), or an array carrying
 *               '@NONE@' => 1 when nothing is excluded.
 */
function phpdigReadRobotsTxt($site)
{
    global $allowed_link_chars;

    $exclude = array();
    // Set when the wildcard group contains an empty "Disallow:" line.
    $wildcard_allows_all = false;

    // NOTE(review): ereg()/eregi()/eregi_replace() no longer exist in PHP 7+.
    // They are kept because the Disallow pattern below embeds
    // $allowed_link_chars, a regex fragment defined outside this function and
    // consumed with POSIX (ereg) semantics; a move to preg_* has to review
    // that fragment as well.
    $site = eregi_replace("^https", "http", $site);

    if (phpdigTestUrl($site . 'robots.txt') == 'PLAINTEXT') {
        @ini_set('auto_detect_line_endings', true);
        // needs PHP 4.3.0+
        $robots = @file($site . 'robots.txt');
        if (!is_array($robots)) {
            // file() failed: behave as if robots.txt were empty.
            $robots = array();
        }

        foreach ($robots as $line) {
            $trimmed = trim($line);
            // Skip blank lines and whole-line comments.
            if ($trimmed == "" || strpos($trimmed, "#") === 0) {
                continue;
            }

            // A user-agent line switches the group the following rules belong to.
            if (ereg('^user-agent:[ ]*([a-z0-9*]+)', strtolower($line), $regs)) {
                // The wildcard agent is stored under the quoted key '*'.
                $user_agent = ($regs[1] == "*") ? "'*'" : $regs[1];
            }

            // Rules that appear before any user-agent line are ignored.
            if (!isset($user_agent)) {
                continue;
            }

            if (eregi('[[:blank:]]*disallow:[[:blank:]]*([/]?(' . $allowed_link_chars . '))', $line, $regs)) {
                if ($regs[1] == '/') {
                    // "Disallow: /" excludes the whole site for this agent.
                    $exclude[$user_agent]['@ALL@'] = 1;
                } elseif ($user_agent == "'*'" && $regs[1] == '') {
                    // Empty Disallow for the wildcard agent means "allow all".
                    // Only remember it: returning here would skip a
                    // phpdig-specific group elsewhere in the file.
                    $wildcard_allows_all = true;
                } else {
                    // Turn the path into a regex fragment. Order matters:
                    // escape '.' and '+' first, then expand '*' to '.*'.
                    $pattern = str_replace(
                        array('.', '+', '*'),
                        array('\\.', '\\+', '.*'),
                        $regs[2]
                    );
                    $exclude[$user_agent][$pattern] = 1;
                }
            } elseif ($user_agent == 'phpdig' && eregi('[[:blank:]]*disallow:[[:blank:]]*', $line, $regs)) {
                // A Disallow line for phpdig that carries no usable path:
                // nothing is excluded for this crawler.
                $exclude[$user_agent]['@NONE@'] = 1;
                return $exclude[$user_agent];
            }
        }

        // Rules addressed to phpdig win over the wildcard group.
        if (isset($exclude['phpdig']) && is_array($exclude['phpdig'])) {
            return $exclude['phpdig'];
        }
        if (!$wildcard_allows_all && isset($exclude['\'*\'']) && is_array($exclude['\'*\''])) {
            return $exclude['\'*\''];
        }
    }

    // No robots.txt, no applicable group, or the wildcard group allows all.
    $exclude['@NONE@'] = 1;
    return $exclude;
}