예제 #1
0
/**
 * Crawl and index one site, level by level, starting at $url.
 *
 * Work performed, in order:
 *  1. load the keyword cache into the global $all_keywords (only if not yet set);
 *  2. normalise the start URL and look the site up in the "sites" table;
 *  3. if authentication is enabled for the site, fetch the start page and compare
 *     its "Sphider-plus" meta tag with the value stored for the site;
 *  4. insert or update the "sites" row and seed the "temp" link queue;
 *  5. resume a suspended run recorded in "pending", or register a new one;
 *  6. walk the queue level by level, handing every admissible link to index_url()
 *     (or link_check()), honouring robots.txt, include/exclude rules, the
 *     max-links limit, the blacklist counter and the interrupt counter;
 *  7. clear "temp"/"pending", optionally build a sitemap and print statistics.
 *
 * When $debug is set, any failing checked SQL statement prints a report and ends the script.
 * The interrupt counter reaching 1 also ends the script (after saving progress).
 *
 * @param string $url          Start URL of the site.
 * @param mixed  $reindex      1 to re-index a site that is already known.
 * @param mixed  $maxlevel     Deepest link level followed when $soption is 'level'.
 * @param string $soption      'level' (depth limited) or 'full'.
 * @param string $url_inc      "Must include" rule, stored with the site and passed to check_include().
 * @param string $url_not_inc  "Must not include" rule, stored with the site and passed to check_include().
 * @param mixed  $can_leave    Whether the crawler may leave the domain; falls back to global $domaincb when empty.
 * @param mixed  $use_robot    '1' to read robots.txt via check_robot_txt().
 * @param mixed  $use_nofollow Passed through to index_url().
 * @param mixed  $cl           Passed through to the print* helpers and index_url().
 * @param mixed  $all          Not read inside this function.
 * @param mixed  $use_pref     Stored with the site and passed through to index_url().
 *
 * @return void
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave, $use_robot, $use_nofollow, $cl, $all, $use_pref)
{
    global $db_con, $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords, $smp, $follow_sitemap;
    global $link_check, $smap_dir, $index_media, $clear, $create_sitemap, $tmp_dir, $domaincb;
    global $max_links, $realnum, $debug, $no_log, $dba_act, $add_auth, $interrupt, $thumb_folder;

    if (!$can_leave) {
        $can_leave = $domaincb;
    }
    $can_leave_domain = $can_leave;
    //  start time to index this site (NOTE(review): not read again inside this function)
    $starttime = getmicrotime();
    //  counter for consecutive hits of the blacklist
    $black = 0;
    $site_id = '';
    $skip = '';
    $smp = '0';
    $omit = array();
    //  defaults for the values that are otherwise only known when a suspended run is resumed
    $num = 0;
    $pend_count = 0;
    $url = $db_con->real_escape_string(stripslashes($url));
    if (strstr($interrupt, "-")) {
        //  indexer should never be interrupted periodically
        $interrupt = '999999';
    }
    //  $int_count is decreased by each processed link until it reaches 1
    $int_count = $interrupt;
    printStandardReport('starting', $command_line, $no_log);

    if (!isset($all_keywords)) {
        mysqltest();
        $sql_query = "SELECT keyword_ID, keyword from " . $mysql_table_prefix . "keywords";
        $result = $db_con->query($sql_query);
        index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
        while ($row = $result->fetch_array(MYSQLI_NUM)) {
            $all_keywords[addslashes($row[1])] = $row[0];
        }
        if ($clear == 1) {
            clean_resource($result, '06');
        }
    }

    $url = convert_url($url);
    $compurl = parse_addr($url);
    if ($compurl['path'] == '') {
        $url = $url . "/";
    }
    //  session id that tags this run's rows in the temp table
    $sessid = md5(microtime() . getenv("REMOTE_ADDR"));

    if ($url != '/') {
        //  ignore dummies
        $urlparts = parse_addr($url);
        $domain = $urlparts['host'];
        //  base URL = URL without its query string
        $query_pos = strpos($url, "?");
        $url_bas = ($query_pos !== false && $query_pos > 0) ? substr($url, 0, $query_pos) : $url;

        mysqltest();
        //  NOTE(review): '%' and '_' inside $url_bas act as LIKE wildcards here
        $sql_query = "SELECT * from " . $mysql_table_prefix . "sites where url like '{$url_bas}%'";
        $result = $db_con->query($sql_query);
        index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
        $row = $result->fetch_array(MYSQLI_NUM);
        //  an unknown site yields no row at all
        $site_id = $row ? $row[0] : '';
        $authent = $row ? $row[2] : '';

        if ($add_auth && $authent) {
            //  for sites with authentication we need to verify the value
            $url_status = url_status($url, $site_id, $sessid);
            $url_parts = parse_all_url($url);
            if ($url_status['state'] == 'ok' && $url_status['content'] == 'text') {
                if ($url_status['relocate']) {
                    //  if relocated, print message and continue with the new URL
                    printRedirected($url_status['relocate'], $url_status['path'], $cl);
                    if (strstr($url_status['path'], "//")) {
                        //  redirected to an absolute URL, use it as is
                        $url = $url_status['path'];
                    } else {
                        $query = isset($url_parts['query']) ? $url_parts['query'] : '';
                        $url = index_site_resolve_relative_redirect($url, $query, $url_status['path']);
                    }
                }
                //  read file
                $file = file_get_contents($url);
                if ($file === false) {
                    //  we know another way to get the content
                    $get_charset = '';
                    $contents = getFileContents($url, $get_charset);
                    $file = $contents['file'];
                }
                //  parse header only; a page without <head> has no header data
                preg_match("@<head[^>]*>(.*?)<\\/head>@si", $file, $regs);
                $headdata = isset($regs[1]) ? $regs[1] : '';
                //  fetch the tag value
                $tag_found = preg_match("/<meta +name *=[\"']?Sphider-plus[\"']? *content=[\"'](.*?)[\"']/i", $headdata, $res);
                if ($tag_found && isset($res[1])) {
                    if ($authent != $res[1]) {
                        //  invalid value in authentication tag
                        $skip = '1';
                        printHeader($omit, $url, $command_line);
                        printStandardReport('Skipped_03', $command_line, $no_log);
                    }
                } else {
                    //  no authentication tag found in header
                    $skip = '1';
                    printHeader($omit, $url, $command_line);
                    printStandardReport('Skipped_02', $command_line, $no_log);
                }
            } else {
                $skip = '1';
                printHeader($omit, $url, $command_line);
                printStandardReport('statError', $command_line, $no_log);
            }
        }

        if (!$skip) {
            if ($site_id == '') {
                //  unknown site: create it and read back its id
                mysqltest();
                $sql_query = "INSERT into " . $mysql_table_prefix . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset) " . "values ('{$url}', now(), '{$maxlevel}', '{$url_inc}', '{$url_not_inc}', '{$can_leave_domain}', '{$use_pref}')";
                $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
                $sql_query = "SELECT site_ID from " . $mysql_table_prefix . "sites where url='{$url}'";
                $result = $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
                $row = $result->fetch_array(MYSQLI_NUM);
                $site_id = $row ? $row[0] : '';
                if ($clear == 1) {
                    clean_resource($result, '09');
                }
            } else {
                if ($reindex == 1) {
                    //  re-index: queue the start URL plus every link already known for this site
                    mysqltest();
                    $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')";
                    $db_con->query($sql_query);
                    index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
                    $sql_query = "SELECT url, level from " . $mysql_table_prefix . "links where site_id = {$site_id}";
                    $result = $db_con->query($sql_query);
                    while ($row = $result->fetch_array(MYSQLI_ASSOC)) {
                        $site_link = $row['url'];
                        if ($site_link != $url) {
                            //  values read from the database are raw, so escape them before re-inserting
                            $safe_link = $db_con->real_escape_string($site_link);
                            $link_level = (int) $row['level'];
                            $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$safe_link}', '{$link_level}', '{$sessid}')";
                            $db_con->query($sql_query);
                        }
                    }
                }
                //  known site: refresh its index date and crawl settings
                mysqltest();
                $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth ='{$maxlevel}', required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain='{$can_leave_domain}', use_prefcharset='{$use_pref}' where site_id='{$site_id}'";
                $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
            }

            //  is there a suspended run for this site?
            mysqltest();
            $sql_query = "SELECT site_id, temp_id, level, count, num from " . $mysql_table_prefix . "pending where site_id='{$site_id}'";
            $result = $db_con->query($sql_query);
            index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
            $row = $result->fetch_array(MYSQLI_NUM);
            $pending = $row ? $row[0] : '';
            $level = '0';
            $count = '0';
            if ($clear == 1) {
                clean_resource($result, '10');
            }
            $domain_arr = get_domains();

            if ($pending == '') {
                //  fresh run: queue the start URL
                mysqltest();
                $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')";
                $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
            } else {
                //  continue the suspended run with its stored session, level and counters
                printStandardReport('continueSuspended', $command_line, $no_log);
                mysqltest();
                $sql_query = "SELECT * from " . $mysql_table_prefix . "pending where site_id='{$site_id}'";
                $result = $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
                $row = $result->fetch_array(MYSQLI_NUM);
                if ($row) {
                    $sessid = $row[1];
                    $level = $row[2];
                    $pend_count = $row[3] + 1;
                    $num = $row[4];
                    $pending = 1;
                    $tmp_urls = get_temp_urls($sessid);
                    if ($clear == 1) {
                        clean_resource($result, '11');
                    }
                }
            }
            if ($pending != 1) {
                //  register this run so that it can be resumed after an interrupt
                mysqltest();
                $sql_query = "INSERT into " . $mysql_table_prefix . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')";
                $db_con->query($sql_query);
                index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
            }

            //  standard name of robots file
            $robots = "robots.txt";
            if ($use_robot == '1') {
                $omit = check_robot_txt($url, $robots);
            }
            printHeader($omit, $url, $command_line);
            if ($link_check == 1) {
                printStandardReport('start_link_check', $command_line, $no_log);
            }
            if ($link_check == 0 && $reindex == 1) {
                printStandardReport('start_reindex', $command_line, $no_log);
            }
            if ($link_check == 0 && $reindex == 0) {
                printStandardReport('starting', $command_line, $no_log);
            }

            $mainurl = $url;
            //  $realnum continues the stored total of a resumed run, $num counts this call only
            $realnum = $num;
            $num = 0;
            while (($level <= $maxlevel && $soption == 'level') || $soption == 'full') {
                if ($pending == 1) {
                    //  resumed run: skip the links of this level that were already processed
                    $count = $pend_count;
                    $pending = 0;
                } else {
                    $count = 0;
                }
                $links = array();
                mysqltest();
                $sql_query = "SELECT distinct link from " . $mysql_table_prefix . "temp where level={$level} && id='{$sessid}' order by link";
                $result = $db_con->query($sql_query);
                $rows = $result->num_rows;
                if ($rows == 0) {
                    //  nothing queued for this level, the crawl is finished
                    break;
                }
                while ($row = $result->fetch_array(MYSQLI_ASSOC)) {
                    $links[] = $row['link'];
                }

                //  now loop through all available links (pages) of this level
                while ($count < count($links)) {
                    $num++;
                    $realnum++;
                    if ($realnum > $max_links) {
                        //  max. links per site reached: drop the queue and stop
                        mysqltest();
                        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'";
                        $db_con->query($sql_query);
                        $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'";
                        $db_con->query($sql_query);
                        printMaxLinks($max_links, $cl);
                        printStandardReport('completed', $command_line, $no_log);
                        return;
                    }
                    $thislink = $db_con->real_escape_string(stripslashes($links[$count]));
                    $urlparts = parse_addr($thislink);
                    $forbidden = 0;

                    if (is_array($omit)) {
                        //  if valid robots.txt was found
                        foreach ($omit as $omiturl) {
                            $omiturl = trim($omiturl);
                            $omiturl_parts = parse_addr($omiturl);
                            if (@$omiturl_parts['scheme'] == '') {
                                //  rule without scheme is relative to the host of the link
                                $check_omit = $urlparts['host'] . $omiturl;
                            } else {
                                $check_omit = $omiturl;
                            }
                            //  compare against false: a rule matching at offset 0 must count as well
                            if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
                                printRobotsReport($num, $thislink, $command_line);
                                $realnum--;
                                check_for_removal($thislink);
                                $forbidden = 1;
                                break;
                            }
                        }
                    }
                    if (!check_include($thislink, $url_inc, $url_not_inc)) {
                        $realnum--;
                        printUrlStringReport($num, $thislink, $command_line);
                        check_for_removal($thislink);
                        $forbidden = 1;
                    }

                    if ($forbidden == 0) {
                        printRetrieving($num, stripslashes(rawurldecode($thislink)), $command_line);
                        mysqltest();
                        $sql_query = "SELECT md5sum, indexdate from " . $mysql_table_prefix . "links where url='{$thislink}'";
                        $result = $db_con->query($sql_query);
                        $rows = $result->num_rows;
                        if ($rows == 0) {
                            //  link is not yet in the database
                            $url_status = index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num);
                            //  check for touching the blacklist and its count against limit
                            if ($url_status['black'] == "1") {
                                $black++;
                                if ($black > 20) {
                                    //  limit until aborting the indexation of this site
                                    $url_status['aborted'] = "1";
                                    $url_status['state'] = "<br /><br />Indexation aborted for this site, as it met too often the blacklist.";
                                }
                            } else {
                                //  reset counter, as it should count only continuous hits
                                $black = 0;
                            }
                            //  check for emergency exit
                            if ($url_status['aborted'] == "1") {
                                //  delete all links from the temp table, which might be left for this site
                                mysqltest();
                                $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'";
                                $db_con->query($sql_query);
                                $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'";
                                $db_con->query($sql_query);
                                $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now() where url = '{$url}'";
                                $db_con->query($sql_query);
                                //  end all loops
                                $forbidden = '1';
                                $omit = '';
                                $reindex = '';
                                $count = '9999999999';
                                $pending = array();
                                if (!stristr($url_status['state'], "NOHOST") && !stristr($url_status['state'], "black")) {
                                    //  NOHOST warning will be printed separately
                                    printWarning($url_status['state'], $command_line, $no_log);
                                }
                            }
                            if (stristr($url_status['state'], "NOHOST")) {
                                //  host unreachable: delete everything left for this site and give up
                                mysqltest();
                                $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'";
                                $db_con->query($sql_query);
                                $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'";
                                $db_con->query($sql_query);
                                $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now() where url = '{$url}'";
                                $db_con->query($sql_query);
                                printWarning($url_status['state'], $command_line, $no_log);
                                return;
                            }
                            if ($url_status['state'] != "ok") {
                                printWarning($url_status['state'], $command_line, $no_log);
                            }
                        } else {
                            if ($reindex == 1) {
                                $row = $result->fetch_array(MYSQLI_ASSOC);
                                $md5sum = $row['md5sum'];
                                $indexdate = $row['indexdate'];
                                if ($link_check == 1) {
                                    link_check($thislink, $level + 1, $sessid, $can_leave_domain, $reindex, $site_id);
                                } else {
                                    $url_status = index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num);
                                    //  check for emergency exit
                                    if ($url_status['aborted']) {
                                        //  delete all links from the temp table, which might be left for this site
                                        mysqltest();
                                        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'";
                                        $db_con->query($sql_query);
                                        //  end all loops
                                        $forbidden = '1';
                                        $omit = '';
                                        $reindex = '';
                                        $count = '9999999999';
                                        $pending = array();
                                        printWarning($url_status['state'], $command_line, $no_log);
                                    }
                                }
                            } else {
                                //  already indexed and no re-index requested
                                printStandardReport('inDatabase', $command_line, $no_log);
                                $realnum--;
                            }
                        }
                        //  remember the progress, so that a suspended run can be resumed here
                        mysqltest();
                        $sql_query = "UPDATE " . $mysql_table_prefix . "pending set level ='{$level}', count='{$count}', num='{$realnum}' where site_id='{$site_id}'";
                        $db_con->query($sql_query);
                        if ($clear == 1) {
                            clean_resource($result, '13');
                        }
                    }

                    //  check for interrupt counter
                    if ($int_count == '1') {
                        //  interrupt the index procedure until interactive resume
                        $sql_query = "UPDATE " . $mysql_table_prefix . "pending set level ='{$level}', count='{$count}', num='{$realnum}' where site_id='{$site_id}'";
                        $db_con->query($sql_query);
                        printInterrupt($interrupt, $url, $cl);
                        die;
                    }
                    $count++;
                    $int_count--;
                }
                $level++;
            }
        }

        //  the run is over: clear the link queue and the resume marker
        mysqltest();
        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'";
        $db_con->query($sql_query);
        index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
        $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'";
        $db_con->query($sql_query);
        index_site_abort_on_sql_error($sql_query, __LINE__ - 1, __FUNCTION__);
        if ($create_sitemap == 1) {
            create_sitemap($site_id, $url);
        }
        printStandardReport('completed', $command_line, $no_log);
        $stats = get_Stats();
        printDatabase($stats, $cl);
    }
    if ($index_media) {
        //  delete all thumbnails in .../admin/tmp/thumbs/ folder
        clear_folder("." . $thumb_folder);
    }
}

/**
 * Print the SQL failure report and end the script, but only when the global
 * $debug flag is set and the last statement on $db_con reported an error.
 *
 * @param string $sql_query Statement that was just executed.
 * @param int    $err_row   Source line of the failing query() call.
 * @param string $function  Name of the calling function ('' outside of a function).
 *
 * @return void
 */
function index_site_abort_on_sql_error($sql_query, $err_row, $function)
{
    global $db_con, $debug;

    if (!$debug || !$db_con->errno) {
        return;
    }
    printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
    //  file, row and function are passed as arguments, so a '%' in the path cannot break the format
    if ($function) {
        printf("<p><span class='red'>&nbsp;Found in script: %s&nbsp;&nbsp;row: %s&nbsp;&nbsp;in function():&nbsp;%s&nbsp;<br /></span></p>", __FILE__, $err_row, $function);
    } else {
        printf("<p><span class='red'>&nbsp;Found in script: %s&nbsp;&nbsp;row: %s&nbsp;<br /></span></p>", __FILE__, $err_row);
    }
    echo "<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>";
    echo "<p><strong>Invalid query string, which caused the SQL error:</strong></p>";
    echo "<p> {$sql_query} </p>";
    exit;
}

/**
 * Resolve a redirect target that starts with "./" or "../" against $url.
 *
 * Any other target (e.g. "page.html" or "/page.html") leaves $url untouched.
 *
 * @param string $url    URL that answered with the redirect.
 * @param string $query  Query string of $url ('' if none); it is ignored while
 *                       searching for the last folder separator.
 * @param string $target Redirect target as delivered by the server.
 *
 * @return string The resolved URL, or $url if $target is not dot-relative.
 */
function index_site_resolve_relative_redirect($url, $query, $target)
{
    if (strpos($target, "./") !== 0 && strpos($target, "../") !== 0) {
        return $url;
    }
    //  folder of the current URL, including the trailing slash
    $bare_url = ($query != '') ? str_replace($query, "", $url) : $url;
    $base = substr($url, 0, strrpos($bare_url, "/") + 1);
    //  offset of the first character behind "scheme://"; never climb above it
    $root_len = strpos($base, "//");
    $root_len = ($root_len === false) ? 0 : $root_len + 2;

    while (true) {
        if (strpos($target, "./") === 0) {
            //  same folder
            $target = (string) substr($target, 2);
        } elseif (strpos($target, "../") === 0) {
            //  one folder up
            $target = (string) substr($target, 3);
            $trimmed = rtrim($base, "/");
            $cut = strrpos($trimmed, "/");
            if ($cut !== false && $cut >= $root_len) {
                $base = substr($trimmed, 0, $cut + 1);
            }
        } else {
            break;
        }
    }
    return $base . $target;
}
예제 #2
0
파일: spider.php 프로젝트: hoelzro/Bifrost
/**
 * Crawl one site level by level and index every page that passes the filters.
 *
 * Registers (or updates) the site row, seeds the temp link queue, resumes a
 * suspended run when a row for the site exists in the pending table, and then
 * walks the queue one level at a time, calling index_url() for each link that
 * is neither excluded by the robots.txt rules nor rejected by check_include().
 * The temp and pending rows of this run are deleted when the crawl finishes.
 *
 * NOTE(review): $url, $url_inc and $url_not_inc are interpolated into SQL as
 * received; this presumably relies on the caller having escaped them — confirm.
 *
 * @param string $url              Start URL of the site; "/" is appended when it has no path.
 * @param mixed  $reindex          1 to re-index: known links are re-queued and stored pages are indexed again.
 * @param mixed  $maxlevel         Highest level crawled when $soption is 'level'; stored as spider_depth.
 * @param string $soption          'level' stops after $maxlevel, 'full' runs until a level has no links.
 * @param string $url_inc          Stored as "required" and passed to check_include().
 * @param string $url_not_inc      Stored as "disallowed" and passed to check_include().
 * @param mixed  $can_leave_domain Stored with the site row and handed on to index_url().
 *
 * @return void
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain)
{
    global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords;

    // Fill the keyword => keyword_ID cache once; later calls reuse the global.
    if (!isset($all_keywords)) {
        $result = mysql_query("select keyword_ID, keyword from " . $mysql_table_prefix . "keywords");
        echo mysql_error();
        while ($row = mysql_fetch_array($result)) {
            $all_keywords[addslashes($row[1])] = $row[0];
        }
    }

    // parse_url() omits the 'path' key for host-only URLs, so test with isset
    // instead of reading a possibly undefined index.
    $compurl = parse_url($url);
    if (!isset($compurl['path']) || $compurl['path'] == '') {
        $url = $url . "/";
    }

    // Identifier that ties the rows in the temp table to this run.
    $sessid = md5(microtime() . getenv("REMOTE_ADDR"));
    $urlparts = parse_url($url);
    $domain = $urlparts['host'];

    $result = mysql_query("select site_id from " . $mysql_table_prefix . "sites where url='{$url}'");
    echo mysql_error();
    $row = mysql_fetch_row($result);
    $site_id = $row[0];

    if ($site_id == '') {
        // Unknown site: create it and read back the generated id.
        mysql_query("insert into " . $mysql_table_prefix . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " . "values ('{$url}', now(), {$maxlevel}, '{$url_inc}', '{$url_not_inc}', {$can_leave_domain})");
        echo mysql_error();
        $result = mysql_query("select site_ID from " . $mysql_table_prefix . "sites where url='{$url}'");
        $row = mysql_fetch_row($result);
        $site_id = $row[0];
    } else {
        if ($reindex == 1) {
            // Re-index: queue the start URL plus every link already stored for the site.
            mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')");
            echo mysql_error();
            $result = mysql_query("select url, level from " . $mysql_table_prefix . "links where site_id = {$site_id}");
            while ($row = mysql_fetch_array($result)) {
                $site_link = $row['url'];
                if ($site_link != $url) {
                    // Values read back from the database are raw and must be
                    // escaped again before they are embedded in a statement.
                    $safe_link = mysql_real_escape_string($site_link);
                    $link_level = (int) $row['level'];
                    mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$safe_link}', {$link_level}, '{$sessid}')");
                }
            }
        }
        mysql_query("update " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}");
        echo mysql_error();
    }

    // A row in the pending table means an earlier run for this site was suspended.
    $result = mysql_query("select site_id, temp_id, level, count, num from " . $mysql_table_prefix . "pending where site_id='{$site_id}'");
    echo mysql_error();
    $pending_row = mysql_fetch_row($result);
    $pending = $pending_row[0];
    $level = 0;
    $domain_arr = get_domains();
    if ($pending == '') {
        mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')");
        echo mysql_error();
    } else {
        // Resume: reuse the stored session id and continue after the last
        // processed link. The row fetched above already carries every column
        // needed, so no second query is issued.
        printStandardReport('continueSuspended', $command_line);
        $sessid = $pending_row[1];
        $level = $pending_row[2];
        $pend_count = $pending_row[3] + 1;
        $pending = 1;
        $tmp_urls = get_temp_urls($sessid);
    }
    if ($reindex != 1) {
        mysql_query("insert into " . $mysql_table_prefix . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')");
        echo mysql_error();
    }

    $omit = check_robot_txt($url);
    printHeader($omit, $url, $command_line);
    $mainurl = $url;
    // Running link number used in the reports; it restarts at 0 even on resume.
    $num = 0;

    while (($level <= $maxlevel && $soption == 'level') || $soption == 'full') {
        // Only the first level of a resumed run starts in the middle of the list.
        if ($pending == 1) {
            $count = $pend_count;
            $pending = 0;
        } else {
            $count = 0;
        }

        $links = array();
        $result = mysql_query("select distinct link from " . $mysql_table_prefix . "temp where level={$level} && id='{$sessid}' order by link");
        echo mysql_error();
        if (mysql_num_rows($result) == 0) {
            // Nothing queued for this level: the crawl is finished.
            break;
        }
        while ($row = mysql_fetch_array($result)) {
            $links[] = $row['link'];
        }

        while ($count < count($links)) {
            $num++;
            $thislink = $links[$count];
            $urlparts = parse_url($thislink);
            $forbidden = 0;

            foreach ($omit as $omiturl) {
                $omiturl = trim($omiturl);
                $omiturl_parts = parse_url($omiturl);
                // Entries without a scheme are paths; prefix them with the link's host.
                if (!isset($omiturl_parts['scheme']) || $omiturl_parts['scheme'] == '') {
                    $check_omit = $urlparts['host'] . $omiturl;
                } else {
                    $check_omit = $omiturl;
                }
                // strpos() returns 0 for a match at the very start of the link
                // (the usual case for entries given as full URLs), so the result
                // has to be compared against false instead of being used as a boolean.
                if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
                    printRobotsReport($num, $thislink, $command_line);
                    check_for_removal($thislink);
                    $forbidden = 1;
                    break;
                }
            }

            if (!check_include($thislink, $url_inc, $url_not_inc)) {
                printUrlStringReport($num, $thislink, $command_line);
                check_for_removal($thislink);
                $forbidden = 1;
            }

            if ($forbidden == 0) {
                printRetrieving($num, $thislink, $command_line);
                // $thislink was read from the temp table, so escape it for the lookup.
                $query = "select md5sum, indexdate from " . $mysql_table_prefix . "links where url='" . mysql_real_escape_string($thislink) . "'";
                $result = mysql_query($query);
                echo mysql_error();
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    // Page not stored yet: index it from scratch.
                    index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);
                    mysql_query("update " . $mysql_table_prefix . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}");
                    echo mysql_error();
                } elseif ($reindex == 1) {
                    // Page already stored: hand over its checksum and date to index_url().
                    $row = mysql_fetch_array($result);
                    $md5sum = $row['md5sum'];
                    $indexdate = $row['indexdate'];
                    index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
                    mysql_query("update " . $mysql_table_prefix . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}");
                    echo mysql_error();
                } else {
                    printStandardReport('inDatabase', $command_line);
                }
            }
            $count++;
        }
        $level++;
    }

    // Crawl finished: drop the queue and the resume marker of this run.
    mysql_query("delete from " . $mysql_table_prefix . "temp where id = '{$sessid}'");
    echo mysql_error();
    mysql_query("delete from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'");
    echo mysql_error();
    printStandardReport('completed', $command_line);
}
예제 #3
0
파일: spider.php 프로젝트: pwh/scrutiny
/**
 * Crawl one site level by level and index (or link-check) every page that passes the filters.
 *
 * Registers (or updates) the site row, seeds the temp link queue, resumes a
 * suspended run when a row for the site exists in the pending table, and then
 * walks the queue one level at a time. Links excluded by the robots.txt rules
 * or rejected by check_include() are skipped; the rest are passed to
 * index_url(), or to link_check() when the 'link_check' setting is 1 during a
 * re-index. The run stops early once more than 'max_links' + 1 links have been
 * counted. On normal completion the temp/pending rows of the run are deleted,
 * the sitemap is created and the database statistics are printed.
 *
 * NOTE(review): $url, $url_inc and $url_not_inc are interpolated into SQL as
 * received; this presumably relies on the caller having escaped them — confirm.
 *
 * @param string $url              Start URL of the site; a URL that ends up as "/" is ignored.
 * @param mixed  $reindex          1 to re-index: known links are re-queued and stored pages are processed again.
 * @param mixed  $maxlevel         Highest level crawled when $soption is 'level'; stored as spider_depth.
 * @param string $soption          'level' stops after $maxlevel, 'full' runs until a level has no links.
 * @param string $url_inc          Stored as "required" and passed to check_include().
 * @param string $url_not_inc      Stored as "disallowed" and passed to check_include().
 * @param mixed  $can_leave_domain Stored with the site row and handed on to index_url()/link_check().
 * @param mixed  $use_robot        '1' reads "robots.txt"; any other value passes "no_robots.txt" to check_robot_txt().
 *
 * @return void
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain, $use_robot)
{
    global $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords, $smp, $realnum;

    printStandardReport('starting', $command_line);
    $smp = '0';

    // Fill the keyword => keyword_ID cache once; later calls reuse the global.
    if (!isset($all_keywords)) {
        $result = mysql_query("select keyword_ID, keyword from " . TABLE_PREFIX . "keywords");
        if (DEBUG > '0') {
            echo mysql_error();
        }
        while ($row = mysql_fetch_array($result)) {
            $all_keywords[addslashes($row[1])] = $row[0];
        }
        clean_resource($result);
    }

    // NOTE(review): this only fires when parse_url() reports a 'path' key that
    // is empty; for host-only URLs the key is normally absent, so no "/" is
    // appended to them here — confirm that this is intended.
    $compurl = parse_url($url);
    if (isset($compurl['path']) && $compurl['path'] == '') {
        $url = $url . "/";
    }

    if ($url == '/') {
        //      ignore dummies
        return;
    }

    // Identifier that ties the rows in the temp table to this run.
    $sessid = md5(microtime() . getenv("REMOTE_ADDR"));
    $urlparts = parse_url($url);
    $domain = $urlparts['host'];

    $result = mysql_query("select site_id from " . TABLE_PREFIX . "sites where url='{$url}'");
    if (DEBUG > '0') {
        echo mysql_error();
    }
    $row = mysql_fetch_row($result);
    $site_id = $row[0];
    clean_resource($result);

    if ($site_id == '') {
        // Unknown site: create it and read back the generated id.
        mysql_query("insert into " . TABLE_PREFIX . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " . "values ('{$url}', now(), {$maxlevel}, '{$url_inc}', '{$url_not_inc}', {$can_leave_domain})");
        if (DEBUG > '0') {
            echo mysql_error();
        }
        $result = mysql_query("select site_ID from " . TABLE_PREFIX . "sites where url='{$url}'");
        $row = mysql_fetch_row($result);
        $site_id = $row[0];
        clean_resource($result);
    } else {
        if ($reindex == 1) {
            // Re-index: queue the start URL plus every link already stored for the site.
            mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')");
            if (DEBUG > '0') {
                echo mysql_error();
            }
            $result = mysql_query("select url, level from " . TABLE_PREFIX . "links where site_id = {$site_id}");
            while ($row = mysql_fetch_array($result)) {
                $site_link = $row['url'];
                if ($site_link != $url) {
                    // Values read back from the database are raw and must be
                    // escaped again before they are embedded in a statement.
                    $safe_link = mysql_real_escape_string($site_link);
                    $link_level = (int) $row['level'];
                    mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$safe_link}', {$link_level}, '{$sessid}')");
                }
            }
            clean_resource($result);
        }
        mysql_query("update " . TABLE_PREFIX . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}");
        if (DEBUG > '0') {
            echo mysql_error();
        }
    }

    // A row in the pending table means an earlier run for this site was suspended.
    $result = mysql_query("select site_id, temp_id, level, count, num from " . TABLE_PREFIX . "pending where site_id='{$site_id}'");
    if (DEBUG > '0') {
        echo mysql_error();
    }
    $pending_row = mysql_fetch_row($result);
    $pending = $pending_row[0];
    $level = 0;
    clean_resource($result);
    $domain_arr = get_domains();
    if ($pending == '') {
        mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')");
        if (DEBUG > '0') {
            echo mysql_error();
        }
    } else {
        // Resume from the row fetched above, whose column order is
        // (site_id, temp_id, level, count, num). The previous code re-queried
        // only (temp_id, level, count) but kept reading indices 1..4, which
        // put the level into $sessid and the count into $level.
        printStandardReport('continueSuspended', $command_line);
        $sessid = $pending_row[1];
        $level = $pending_row[2];
        $pend_count = $pending_row[3] + 1;
        $pending = 1;
        $tmp_urls = get_temp_urls($sessid);
    }
    if ($reindex != 1) {
        mysql_query("insert into " . TABLE_PREFIX . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')");
        if (DEBUG > '0') {
            echo mysql_error();
        }
    }

    // standardname of file
    $robots = "robots.txt";
    if ($use_robot != '1') {
        // Sphider never will find this file and ignore the contents of robots.txt
        $robots = "no_robots.txt";
    }
    $omit = check_robot_txt($url, $robots);
    printHeader($omit, $url, $command_line);

    if (Configure::read('link_check') == 1) {
        printStandardReport('start_link_check', $command_line);
    }
    if (Configure::read('link_check') == 0 && $reindex == 1) {
        printStandardReport('start_reindex', $command_line);
    }
    if (Configure::read('link_check') == 0 && $reindex == 0) {
        printStandardReport('starting', $command_line);
    }

    $mainurl = $url;
    // Number of links actually processed, compared against 'max_links'.
    $realnum = 0;

    while (($level <= $maxlevel && $soption == 'level') || $soption == 'full') {
        // Only the first level of a resumed run starts in the middle of the list.
        if ($pending == 1) {
            $count = $pend_count;
            $pending = 0;
        } else {
            $count = 0;
        }

        $links = array();
        $result = mysql_query("select distinct link from " . TABLE_PREFIX . "temp where level={$level} && id='{$sessid}' order by link");
        if (DEBUG > '0') {
            echo mysql_error();
        }
        if (mysql_num_rows($result) == 0) {
            // Nothing queued for this level: the crawl is finished.
            break;
        }
        while ($row = mysql_fetch_array($result)) {
            $links[] = $row['link'];
        }
        clean_resource($result);

        // Per-level link number used in the reports.
        $num = 0;
        while ($count < count($links)) {
            $num++;
            $realnum++;
            if ($realnum > Configure::read('max_links') + 1) {
                //  if max. links per page reached
                // NOTE(review): both deletes have no WHERE clause, so they also
                // remove the queue and resume markers of every other site —
                // confirm that this is intended.
                mysql_query("delete from " . TABLE_PREFIX . "temp");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
                mysql_query("delete from " . TABLE_PREFIX . "pending");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
                printMaxLinks(Configure::read('max_links'));
                printStandardReport('completed', $command_line);
                return;
            }

            $thislink = $links[$count];
            $urlparts = parse_url($thislink);
            $forbidden = 0;

            foreach ($omit as $omiturl) {
                $omiturl = trim($omiturl);
                $omiturl_parts = parse_url($omiturl);
                // Entries without a scheme are paths; prefix them with the link's host.
                if (!isset($omiturl_parts['scheme']) || $omiturl_parts['scheme'] == '') {
                    $check_omit = $urlparts['host'] . $omiturl;
                } else {
                    $check_omit = $omiturl;
                }
                // strpos() returns 0 for a match at the very start of the link
                // (the usual case for entries given as full URLs), so the result
                // has to be compared against false instead of being used as a boolean.
                if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
                    printRobotsReport($num, $thislink, $command_line);
                    $realnum--;
                    check_for_removal($thislink);
                    $forbidden = 1;
                    break;
                }
            }

            if (!check_include($thislink, $url_inc, $url_not_inc)) {
                printUrlStringReport($num, $thislink, $command_line);
                check_for_removal($thislink);
                $forbidden = 1;
            }

            if ($forbidden == 0) {
                printRetrieving($num, $thislink, $command_line);
                // $thislink was read from the temp table, so escape it for the lookup.
                $query = "select md5sum, indexdate from " . TABLE_PREFIX . "links where url='" . mysql_real_escape_string($thislink) . "'";
                $result = mysql_query($query);
                if (DEBUG > '0') {
                    echo mysql_error();
                }
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    // Page not stored yet: index it from scratch.
                    index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);
                    mysql_query("update " . TABLE_PREFIX . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                } elseif ($reindex == 1) {
                    // Page already stored: either only verify the link or index it again.
                    $row = mysql_fetch_array($result);
                    $md5sum = $row['md5sum'];
                    $indexdate = $row['indexdate'];
                    if (Configure::read('link_check') == 1) {
                        link_check($thislink, $level + 1, $sessid, $can_leave_domain, $reindex);
                    } else {
                        index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
                    }
                    mysql_query("update " . TABLE_PREFIX . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                } else {
                    // Already stored and not re-indexing: does not count against 'max_links'.
                    printStandardReport('inDatabase', $command_line);
                    $realnum--;
                }
                clean_resource($result);
            }
            $count++;
        }
        $level++;
    }

    // Crawl finished: drop the queue and the resume marker of this run.
    mysql_query("delete from " . TABLE_PREFIX . "temp where id = '{$sessid}'");
    if (DEBUG > '0') {
        echo mysql_error();
    }
    mysql_query("delete from " . TABLE_PREFIX . "pending where site_id = '{$site_id}'");
    if (DEBUG > '0') {
        echo mysql_error();
    }

    create_sitemap($site_id, $url);
    printStandardReport('completed', $command_line);
    $stats = get_Stats();
    printDatabase($stats['sites'], $stats['links'], $stats['categories'], $stats['keywords']);
}