Ejemplo n.º 1
0
     settype($test_exists['last_modified'], 'string');
     $exists_spider_id = $test_exists['spider_id'];
     $upddate = $test_exists['upddate'];
     $last_modif_old = $test_exists['last_modified'];
 } else {
     $exists_spider_id = 0;
 }
 $url_indexing = $full_url . $temp_path . $temp_file;
 $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
 $url_print = $url . $temp_path . $temp_file;
 $url_print = ereg_replace("[?]\$", "", $url_print);
 //verify if 'revisit-after' date is expired or if page doesn't exists, or force is on.
 if ($exists_spider_id == 0 || $upddate < $date || $force_first_reindex == 1 && ($level == 0 || $already_indexed == 0)) {
     //test content-type of this page if not excluded
     $result_test_http = '';
     if (!phpdigReadRobots($exclude, $temp_path . $temp_file) && !eregi(FORBIDDEN_EXTENSIONS, $temp_path . $temp_file)) {
         $result_test_http = phpdigTestUrl($url_indexing, 'date', $cookies);
     }
     if (is_array($result_test_http) && !in_array($result_test_http['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) {
         $tested_url = phpdigRewriteUrl($result_test_http['path']);
         $cookies = $result_test_http['cookies'];
         // update URI if redirect in same host...
         if ($tested_url['path'] != $temp_path || $tested_url['file'] != $temp_file) {
             $temp_path = $tested_url['path'];
             $temp_file = $tested_url['file'];
             $query = "UPDATE " . PHPDIG_DB_PREFIX . "tempspider SET path='{$temp_path}', file='{$temp_file}', WHERE id={$tempspider_id}";
             mysql_query($query, $id_connect);
             $url_indexing = $full_url . $temp_path . $temp_file;
             $url_indexing = ereg_replace("[?]\$", "", $url_indexing);
             $url_print = $url . $temp_path . $temp_file;
             $url_print = ereg_replace("[?]\$", "", $url_print);
/**
 * Decide whether a link may be followed and normalise directory-style links.
 *
 * The link is rejected when robots rules exclude it. Otherwise, a link with
 * no query string and no short file extension is first probed with a
 * trailing slash; if that probe reports "HTML" the link is rewritten as a
 * directory (file emptied, path ending in a single slash). Any other link is
 * probed as-is and accepted unless the probe reports NOHOST, NOFILE, LOOP or
 * NEWHOST.
 *
 * @param array  $link       Link parts; the keys 'url', 'path' and 'file' are read.
 * @param mixed  $exclude    Exclusion rules handed to phpdigReadRobots(); an
 *                           '@ALL@' key rejects every link.
 * @param array  $cookies    Cookies handed to phpdigTestUrl().
 * @param mixed  $site_id    Site id; when numeric and LIMIT_TO_DIRECTORY is on,
 *                           the link path is checked against the includes table.
 * @param mixed  $id_connect Database connection handed to phpdigMySelect().
 *
 * @return array The link with 'ok' set to 1 or 0. On rejection after a probe,
 *               'status', 'host', 'path' and 'cookies' are copied from the
 *               probe result.
 */
function phpdigDetectDir($link, $exclude = '', $cookies = array(), $site_id = '', $id_connect = '')
{
    $target = $link['path'] . $link['file'];
    $test = parse_url($target);

    // PCRE replaces the POSIX regex functions (eregi/ereg_replace), which were
    // removed in PHP 7.0. "i" keeps the case-insensitive match and "D" keeps
    // "$" anchored to the very end of the string, as POSIX "$" was.
    $extension_pattern = '/[.][a-z0-9]{1,4}$/iD';

    //test the exclude with robots.txt
    if (phpdigReadRobots($exclude, $target) == 1 || isset($exclude['@ALL@'])) {
        $link['ok'] = 0;
    } elseif (!isset($test['query'])
        && !preg_match($extension_pattern, $target)
        && ($status = phpdigTestUrl($link['url'] . $target . '/', 'date', $cookies))
        && isset($status['status'])
        && $status['status'] == "HTML"
    ) {
        // Directory: collapse any run of trailing slashes into exactly one.
        $link['path'] = rtrim($target, '/') . '/';
        if ($link['path'] == '/') {
            $link['path'] = '';
        }
        $link['file'] = "";
        $link['ok'] = 1;
    } else {
        // Plain file (or the directory probe did not answer "HTML"): probe as-is.
        $status = phpdigTestUrl($link['url'] . $target, 'date', $cookies);
        if (!in_array($status['status'], array('NOHOST', 'NOFILE', 'LOOP', 'NEWHOST'))) {
            $link['ok'] = 1;
        } else {
            $link['ok'] = 0;
        }
    }

    if (is_numeric($site_id) && LIMIT_TO_DIRECTORY) {
        // $site_id is interpolated only after the is_numeric() check above.
        $query = "SELECT DISTINCT in_id, in_path FROM " . PHPDIG_DB_PREFIX . "includes WHERE in_site_id='{$site_id}'";
        if (is_array($list_include = phpdigMySelect($id_connect, $query))) {
            // NOTE(review): any include row whose in_path differs from the link
            // path clears 'ok', so with several rows the link must equal all of
            // them to survive - confirm this is the intended rule.
            foreach ($list_include as $add_include) {
                if ($link['path'] != $add_include['in_path']) {
                    $link['ok'] = 0;
                }
            }
        }
    }

    // $status is unset when the robots exclusion rejected the link before any
    // probe was made; in that case only 'ok' is reported.
    if (!$link['ok'] && isset($status)) {
        $link['status'] = $status['status'];
        $link['host'] = $status['host'];
        $link['path'] = $status['path'];
        $link['cookies'] = $status['cookies'];
    }

    return $link;
}