function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave, $use_robot, $use_nofollow, $cl, $all, $use_pref) { global $db_con, $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords, $smp, $follow_sitemap; global $link_check, $smap_dir, $index_media, $clear, $create_sitemap, $tmp_dir, $domaincb; global $max_links, $realnum, $debug, $no_log, $dba_act, $add_auth, $interrupt, $index_media, $thumb_folder; if (!$can_leave) { $can_leave = $domaincb; } $can_leave_domain = $can_leave; $starttime = getmicrotime(); // start time to index this site $black = '0'; // will become counter for hits of blacklist $site_id = ''; $skip = ''; $smp = '0'; $omit = array(); $url = $db_con->real_escape_string(stripslashes($url)); if (strstr($interrupt, "-")) { // if indexer should not be interrupted periodically $interrupt = '999999'; // never } $int_count = $interrupt; // $int_count will be decreased by each indexed link until $int_count = 1 printStandardReport('starting', $command_line, $no_log); if (!isset($all_keywords)) { mysqltest(); $sql_query = "SELECT keyword_ID, keyword from " . $mysql_table_prefix . "keywords"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } while ($row = $result->fetch_array(MYSQLI_NUM)) { $all_keywords[addslashes($row[1])] = $row[0]; } if ($clear == 1) { clean_resource($result, '06'); } } $url = convert_url($url); $compurl = parse_addr($url); if ($compurl['path'] == '') { $url = $url . "/"; } $t = microtime(); $a = getenv("REMOTE_ADDR"); $sessid = md5($t . $a); if ($url != '/') { // ignore dummies $urlparts = parse_addr($url); $domain = $urlparts['host']; if (isset($urlparts['port'])) { $port = (int) $urlparts['port']; } else { $port = 80; } if (strpos($url, "?")) { $url_bas = substr($url, 0, strpos($url, "?")); } else { $url_bas = $url; } mysqltest(); $sql_query = "SELECT * from " . $mysql_table_prefix . "sites where url like '{$url_bas}%'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $site_id = $row[0]; $authent = $row[2]; if ($add_auth && $authent) { // for sites with authentication we need to verify the value $url_status = url_status($url, $site_id, $sessid); $url_parts = parse_all_url($url); if ($url_status['state'] == 'ok' && $url_status['content'] == 'text') { if ($url_status['relocate']) { // if relocated, print message and redirect to new URL printRedirected($url_status['relocate'], $url_status['path'], $cl); if (strstr($url_status['path'], "//")) { // if redirected to absolute URL, use this for further usage $url = $url_status['path']; } else { $relo_url = str_replace($url_parts['query'], "", $url); // url without query $relo_url = substr($url, 0, strrpos($relo_url, "/") + 1); // url without file name if (strpos($url_status['path'], "./") === 0) { // if redirected relativ to same folder depth $url_status['path'] = str_replace("./", "", $url_status['path']); $url = "" . $relo_url . "" . $url_status['path'] . ""; } if (strpos($url_status['path'], "../") === 0) { // if redirected relativ and one folder up $url_status['path'] = str_replace("./", "", $url_status['path']); $relo_url = substr($url, 0, strpos($url_parts['path'])); // url without file name $relo_url = substr($url, 0, strrpos($relo_url, "/") + 1); // url without last folder $url = "" . $relo_url . "" . $url_status['path'] . ""; } } } // read file $contents = array(); $file = ''; $file = file_get_contents($url); if ($file === FALSE) { // we know another way to get the content $get_charset = ''; $contents = getFileContents($url, $get_charset); $file = $contents['file']; } // parse header only preg_match("@<head[^>]*>(.*?)<\\/head>@si", $file, $regs); $headdata = $regs[1]; // fetch the tag value preg_match("/<meta +name *=[\"']?Sphider-plus[\"']? *content=[\"'](.*?)[\"']/i", $headdata, $res); if (isset($res)) { if ($authent != $res[1]) { // invalid value in authentication tag $skip = '1'; printHeader($omit, $url, $command_line); printStandardReport('Skipped_03', $command_line, $no_log); } } else { // no authentication tag found in header $skip = '1'; printHeader($omit, $url, $command_line); printStandardReport('Skipped_02', $command_line, $no_log); } } else { $skip = '1'; printHeader($omit, $url, $command_line); printStandardReport('statError', $command_line, $no_log); } } if (!$skip) { if ($site_id != "" && $reindex == 1) { mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $sql_query = "SELECT url, level from " . $mysql_table_prefix . "links where site_id = {$site_id}"; $result = $db_con->query($sql_query); while ($row = $result->fetch_array(MYSQLI_ASSOC)) { $site_link = $row['url']; $link_level = $row['level']; if ($site_link != $url) { $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$site_link}', '{$link_level}', '{$sessid}')"; $db_con->query($sql_query); } } $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth ='{$maxlevel}', required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain='{$can_leave}', use_prefcharset='{$use_pref}' where site_id='{$site_id}'"; mysqltest(); $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } else { if ($site_id == '') { mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset) " . "values ('{$url}', now(), '{$maxlevel}', '{$url_inc}', '{$url_not_inc}', '{$can_leave_domain}', '{$use_pref}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $sql_query = "SELECT site_ID from " . $mysql_table_prefix . "sites where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $site_id = $row[0]; if ($clear == 1) { clean_resource($result, '09'); } } else { mysqltest(); $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth ='{$maxlevel}', required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain='{$can_leave_domain}', use_prefcharset='{$use_pref}' where site_id='{$site_id}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } } $pending = array(); mysqltest(); $sql_query = "SELECT site_id, temp_id, level, count, num from " . $mysql_table_prefix . "pending where site_id='{$site_id}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $pending = $row[0]; $level = '0'; $count = '0'; if ($clear == 1) { clean_resource($result, '10'); } $domain_arr = get_domains(); if ($pending == '') { mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } else { if ($pending != '') { printStandardReport('continueSuspended', $command_line, $no_log); mysqltest(); $pend_count = '0'; //$result = $db_con->query("SELECT temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'"); $sql_query = "SELECT * from " . $mysql_table_prefix . "pending where site_id='{$site_id}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); if ($row) { $sessid = $row[1]; $level = $row[2]; $pend_count = $row[3] + 1; $num = $row[4]; $pending = 1; $tmp_urls = get_temp_urls($sessid); if ($clear == 1) { clean_resource($result, '11'); } } } } if ($pending != 1) { mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } $time = time(); $robots = "robots.txt"; // standardname of robots file if ($use_robot == '1') { $omit = check_robot_txt($url, $robots); } printHeader($omit, $url, $command_line); if ($link_check == 1) { printStandardReport('start_link_check', $command_line, $no_log); } if ($link_check == 0 && $reindex == 1) { printStandardReport('start_reindex', $command_line, $no_log); } if ($link_check == 0 && $reindex == 0) { printStandardReport('starting', $command_line, $no_log); } $mainurl = $url; $realnum = $num; $num = 0; while ($level <= $maxlevel && $soption == 'level' || $soption == 'full') { if ($pending == 1) { $count = $pend_count; $pending = 0; } else { $count = 0; } $links = array(); mysqltest(); $sql_query = "SELECT distinct link from " . $mysql_table_prefix . "temp where level={$level} && id='{$sessid}' order by link"; $result = $db_con->query($sql_query); $rows = $result->num_rows; if ($rows == 0) { break; } while ($row = $result->fetch_array(MYSQLI_ASSOC)) { $links[] = $row['link']; } // now loop through all available links(pages) while ($count < count($links)) { $num++; $realnum++; if ($realnum > $max_links) { // if max. links per page reached mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'"; $db_con->query($sql_query); $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'"; $db_con->query($sql_query); printMaxLinks($max_links, $cl); printStandardReport('completed', $command_line, $no_log); return; } $thislink = $db_con->real_escape_string(stripslashes($links[$count])); $urlparts = parse_addr($thislink); $forbidden = 0; if (is_array($omit)) { // if valid robots.txt was found reset($omit); foreach ($omit as $omiturl) { $omiturl = trim($omiturl); $omiturl_parts = array(); $omiturl_parts = parse_addr($omiturl); if (@$omiturl_parts['scheme'] == '') { $check_omit = $urlparts['host'] . $omiturl; } else { $check_omit = $omiturl; } if (strpos($thislink, $check_omit)) { printRobotsReport($num, $thislink, $command_line); $realnum--; check_for_removal($thislink); $forbidden = 1; break; } } } if (!check_include($thislink, $url_inc, $url_not_inc)) { $realnum--; printUrlStringReport($num, $thislink, $command_line); //printUrlStringReport($realnum, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; } if ($forbidden == 0) { printRetrieving($num, stripslashes(rawurldecode($thislink)), $command_line); //printRetrieving($realnum, $thislink, $command_line); mysqltest(); $sql_query = "SELECT md5sum, indexdate from " . $mysql_table_prefix . "links where url='{$thislink}'"; $result = $db_con->query($sql_query); $rows = $result->num_rows; if ($rows == 0) { $url_status = index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num); // check for touching the blacklist and its count against limit if ($url_status['black'] == "1") { $black++; if ($black > 20) { // limit until aborting the indexation of this site $url_status['aborted'] = "1"; $url_status['state'] = "<br /><br />Indexation aborted for this site, as it met too often the blacklist."; } } else { $black = 0; // reset counter, as should count only on continuous hits } // check for emergency exit if ($url_status['aborted'] == "1") { // delete all links from the temp table, which might be left for this site mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'"; $db_con->query($sql_query); $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'"; $db_con->query($sql_query); $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now() where url = '{$url}'"; $db_con->query($sql_query); // end all loops $forbidden = '1'; $omit = ''; $reindex = ''; $count = '9999999999'; $pending = array(); if (!stristr($url_status['state'], "NOHOST") && !stristr($url_status['state'], "black")) { // NOHOST warning will be printed separately printWarning($url_status['state'], $command_line, $no_log); } } if (stristr($url_status['state'], "NOHOST")) { // delete all links from the temp table, which might be left for this site, etc mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'"; $db_con->query($sql_query); $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'"; $db_con->query($sql_query); $sql_query = "UPDATE " . $mysql_table_prefix . "sites set indexdate=now() where url = '{$url}'"; $db_con->query($sql_query); // end all loops $forbidden = '1'; $omit = ''; $reindex = ''; $count = '9999999999'; $pending = array(); printWarning($url_status['state'], $command_line, $no_log); return; } // check for UFO file or invalid suffix (by redirected URL) if (stristr($url_status['state'], "ufo")) { //printWarning($url_status['state'],$command_line, $no_log); } if ($url_status['state'] != "ok") { printWarning($url_status['state'], $command_line, $no_log); } mysqltest(); $sql_query = "UPDATE " . $mysql_table_prefix . "pending set level ='{$level}', count='{$count}', num='{$realnum}' where site_id='{$site_id}'"; $db_con->query($sql_query); } else { if ($rows != 0 && $reindex == 1) { $row = $result->fetch_array(MYSQLI_ASSOC); $md5sum = $row['md5sum']; $indexdate = $row['indexdate']; if ($link_check == 1 && $reindex == 1) { link_check($thislink, $level + 1, $sessid, $can_leave_domain, $reindex, $site_id); } else { $url_status = index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num); // check for emergency exit if ($url_status['aborted']) { // delete all links from the temp table, which might be left for this site mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'"; $db_con->query($sql_query); // end all loops $forbidden = '1'; $omit = ''; $reindex = ''; $count = '9999999999'; $pending = array(); printWarning($url_status['state'], $command_line, $no_log); } } } else { printStandardReport('inDatabase', $command_line, $no_log); $realnum--; //$num--; } } if ($rows != 0) { mysqltest(); $sql_query = "UPDATE " . $mysql_table_prefix . "pending set level ='{$level}', count='{$count}', num='{$realnum}' where site_id='{$site_id}'"; $db_con->query($sql_query); } if ($clear == 1) { clean_resource($result, '13'); } } // check for interrupt counter if ($int_count == '1') { // interrupt the index procedure until interactive resume $sql_query = "UPDATE " . $mysql_table_prefix . "pending set level ='{$level}', count='{$count}', num='{$realnum}' where site_id='{$site_id}'"; $db_con->query($sql_query); printInterrupt($interrupt, $url, $cl); die; } $count++; $int_count--; } $level++; } } mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where id = '{$sessid}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $sql_query = "DELETE from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } if ($create_sitemap == 1) { create_sitemap($site_id, $url); } printStandardReport('completed', $command_line, $no_log); $stats = get_Stats(); printDatabase($stats, $cl); } if ($index_media) { // delete all thumbnails in .../admin/tmp/thumbs/ folder clear_folder("." . $thumb_folder); } }
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) { global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords; if (!isset($all_keywords)) { $result = mysql_query("select keyword_ID, keyword from " . $mysql_table_prefix . "keywords"); echo mysql_error(); while ($row = mysql_fetch_array($result)) { $all_keywords[addslashes($row[1])] = $row[0]; } } $compurl = parse_url($url); if ($compurl['path'] == '') { $url = $url . "/"; } $t = microtime(); $a = getenv("REMOTE_ADDR"); $sessid = md5($t . $a); $urlparts = parse_url($url); $domain = $urlparts['host']; if (isset($urlparts['port'])) { $port = (int) $urlparts['port']; } else { $port = 80; } $result = mysql_query("select site_id from " . $mysql_table_prefix . "sites where url='{$url}'"); echo mysql_error(); $row = mysql_fetch_row($result); $site_id = $row[0]; if ($site_id != "" && $reindex == 1) { mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"); echo mysql_error(); $result = mysql_query("select url, level from " . $mysql_table_prefix . "links where site_id = {$site_id}"); while ($row = mysql_fetch_array($result)) { $site_link = $row['url']; $link_level = $row['level']; if ($site_link != $url) { mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$site_link}', {$link_level}, '{$sessid}')"); } } $qry = "update " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}"; mysql_query($qry); echo mysql_error(); } else { if ($site_id == '') { mysql_query("insert into " . $mysql_table_prefix . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " . "values ('{$url}', now(), {$maxlevel}, '{$url_inc}', '{$url_not_inc}', {$can_leave_domain})"); echo mysql_error(); $result = mysql_query("select site_ID from " . $mysql_table_prefix . "sites where url='{$url}'"); $row = mysql_fetch_row($result); $site_id = $row[0]; } else { mysql_query("update " . $mysql_table_prefix . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}"); echo mysql_error(); } } $result = mysql_query("select site_id, temp_id, level, count, num from " . $mysql_table_prefix . "pending where site_id='{$site_id}'"); echo mysql_error(); $row = mysql_fetch_row($result); $pending = $row[0]; $level = 0; $domain_arr = get_domains(); if ($pending == '') { mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"); echo mysql_error(); } else { if ($pending != '') { printStandardReport('continueSuspended', $command_line); mysql_query("select temp_id, level, count from " . $mysql_table_prefix . "pending where site_id='{$site_id}'"); echo mysql_error(); $sessid = $row[1]; $level = $row[2]; $pend_count = $row[3] + 1; $num = $row[4]; $pending = 1; $tmp_urls = get_temp_urls($sessid); } } if ($reindex != 1) { mysql_query("insert into " . $mysql_table_prefix . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')"); echo mysql_error(); } $time = time(); $omit = check_robot_txt($url); printHeader($omit, $url, $command_line); $mainurl = $url; $num = 0; while ($level <= $maxlevel && $soption == 'level' || $soption == 'full') { if ($pending == 1) { $count = $pend_count; $pending = 0; } else { $count = 0; } $links = array(); $result = mysql_query("select distinct link from " . $mysql_table_prefix . "temp where level={$level} && id='{$sessid}' order by link"); echo mysql_error(); $rows = mysql_num_rows($result); if ($rows == 0) { break; } $i = 0; while ($row = mysql_fetch_array($result)) { $links[] = $row['link']; } reset($links); while ($count < count($links)) { $num++; $thislink = $links[$count]; $urlparts = parse_url($thislink); reset($omit); $forbidden = 0; foreach ($omit as $omiturl) { $omiturl = trim($omiturl); $omiturl_parts = parse_url($omiturl); if ($omiturl_parts['scheme'] == '') { $check_omit = $urlparts['host'] . $omiturl; } else { $check_omit = $omiturl; } if (strpos($thislink, $check_omit)) { printRobotsReport($num, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; break; } } if (!check_include($thislink, $url_inc, $url_not_inc)) { printUrlStringReport($num, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; } if ($forbidden == 0) { printRetrieving($num, $thislink, $command_line); $query = "select md5sum, indexdate from " . $mysql_table_prefix . "links where url='{$thislink}'"; $result = mysql_query($query); echo mysql_error(); $rows = mysql_num_rows($result); if ($rows == 0) { index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex); mysql_query("update " . $mysql_table_prefix . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}"); echo mysql_error(); } else { if ($rows != 0 && $reindex == 1) { $row = mysql_fetch_array($result); $md5sum = $row['md5sum']; $indexdate = $row['indexdate']; index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex); mysql_query("update " . $mysql_table_prefix . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}"); echo mysql_error(); } else { printStandardReport('inDatabase', $command_line); } } } $count++; } $level++; } mysql_query("delete from " . $mysql_table_prefix . "temp where id = '{$sessid}'"); echo mysql_error(); mysql_query("delete from " . $mysql_table_prefix . "pending where site_id = '{$site_id}'"); echo mysql_error(); printStandardReport('completed', $command_line); }
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain, $use_robot) { global $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords, $smp, $realnum; printStandardReport('starting', $command_line); $smp = '0'; if (!isset($all_keywords)) { $result = mysql_query("select keyword_ID, keyword from " . TABLE_PREFIX . "keywords"); if (DEBUG > '0') { echo mysql_error(); } while ($row = mysql_fetch_array($result)) { $all_keywords[addslashes($row[1])] = $row[0]; } clean_resource($result); } $compurl = parse_url($url); if (isset($compurl['path']) && $compurl['path'] == '') { $url = $url . "/"; } $t = microtime(); $a = getenv("REMOTE_ADDR"); $sessid = md5($t . $a); if ($url != '/') { // ignore dummies $urlparts = parse_url($url); $domain = $urlparts['host']; if (isset($urlparts['port'])) { $port = (int) $urlparts['port']; } else { $port = 80; } $result = mysql_query("select site_id from " . TABLE_PREFIX . "sites where url='{$url}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($result); $site_id = $row[0]; clean_resource($result); if ($site_id != "" && $reindex == 1) { mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"); if (DEBUG > '0') { echo mysql_error(); } $result = mysql_query("select url, level from " . TABLE_PREFIX . "links where site_id = {$site_id}"); while ($row = mysql_fetch_array($result)) { $site_link = $row['url']; $link_level = $row['level']; if ($site_link != $url) { mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$site_link}', {$link_level}, '{$sessid}')"); } } clean_resource($result); $qry = "update " . TABLE_PREFIX . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}"; mysql_query($qry); if (DEBUG > '0') { echo mysql_error(); } } else { if ($site_id == '') { mysql_query("insert into " . TABLE_PREFIX . "sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " . "values ('{$url}', now(), {$maxlevel}, '{$url_inc}', '{$url_not_inc}', {$can_leave_domain})"); if (DEBUG > '0') { echo mysql_error(); } $result = mysql_query("select site_ID from " . TABLE_PREFIX . "sites where url='{$url}'"); $row = mysql_fetch_row($result); $site_id = $row[0]; clean_resource($result); } else { mysql_query("update " . TABLE_PREFIX . "sites set indexdate=now(), spider_depth = {$maxlevel}, required = '{$url_inc}'," . "disallowed = '{$url_not_inc}', can_leave_domain={$can_leave_domain} where site_id={$site_id}"); if (DEBUG > '0') { echo mysql_error(); } } } $result = mysql_query("select site_id, temp_id, level, count, num from " . TABLE_PREFIX . "pending where site_id='{$site_id}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($result); $pending = $row[0]; $level = 0; clean_resource($result); $domain_arr = get_domains(); if ($pending == '') { mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', 0, '{$sessid}')"); if (DEBUG > '0') { echo mysql_error(); } } else { if ($pending != '') { printStandardReport('continueSuspended', $command_line); $result = mysql_query("select temp_id, level, count from " . TABLE_PREFIX . "pending where site_id='{$site_id}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($result); $sessid = $row[1]; $level = $row[2]; $pend_count = $row[3] + 1; $num = $row[4]; $pending = 1; $tmp_urls = get_temp_urls($sessid); clean_resource($result); } } if ($reindex != 1) { mysql_query("insert into " . TABLE_PREFIX . "pending (site_id, temp_id, level, count) values ('{$site_id}', '{$sessid}', '0', '0')"); if (DEBUG > '0') { echo mysql_error(); } } $time = time(); $robots = "robots.txt"; // standardname of file if ($use_robot != '1') { $robots = "no_robots.txt"; // Sphider never will find this file and ignore the contents of robots.txt } $omit = check_robot_txt($url, $robots); printHeader($omit, $url, $command_line); if (Configure::read('link_check') == 1) { printStandardReport('start_link_check', $command_line); } if (Configure::read('link_check') == 0 && $reindex == 1) { printStandardReport('start_reindex', $command_line); } if (Configure::read('link_check') == 0 && $reindex == 0) { printStandardReport('starting', $command_line); } $mainurl = $url; $realnum = 0; while ($level <= $maxlevel && $soption == 'level' || $soption == 'full') { if ($pending == 1) { $count = $pend_count; $pending = 0; } else { $count = 0; } $links = array(); $result = mysql_query("select distinct link from " . TABLE_PREFIX . "temp where level={$level} && id='{$sessid}' order by link"); if (DEBUG > '0') { echo mysql_error(); } $rows = mysql_num_rows($result); if ($rows == 0) { break; } $i = 0; while ($row = mysql_fetch_array($result)) { $links[] = $row['link']; } clean_resource($result); reset($links); $num = 0; while ($count < count($links)) { $num++; $realnum++; if ($realnum > Configure::read('max_links') + 1) { // if max. links per page reached mysql_query("delete from " . TABLE_PREFIX . "temp"); if (DEBUG > '0') { echo mysql_error(); } mysql_query("delete from " . TABLE_PREFIX . "pending"); if (DEBUG > '0') { echo mysql_error(); } printMaxLinks(Configure::read('max_links')); printStandardReport('completed', $command_line); return; } $thislink = $links[$count]; $urlparts = parse_url($thislink); reset($omit); $forbidden = 0; foreach ($omit as $omiturl) { $omiturl = trim($omiturl); $omiturl_parts = parse_url($omiturl); if ($omiturl_parts['scheme'] == '') { $check_omit = $urlparts['host'] . $omiturl; } else { $check_omit = $omiturl; } if (strpos($thislink, $check_omit)) { printRobotsReport($num, $thislink, $command_line); $realnum--; check_for_removal($thislink); $forbidden = 1; break; } } if (!check_include($thislink, $url_inc, $url_not_inc)) { printUrlStringReport($num, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; } if ($forbidden == 0) { printRetrieving($num, $thislink, $command_line); $query = "select md5sum, indexdate from " . TABLE_PREFIX . "links where url='{$thislink}'"; $result = mysql_query($query); if (DEBUG > '0') { echo mysql_error(); } $rows = mysql_num_rows($result); if ($rows == 0) { index_url($thislink, $level + 1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex); mysql_query("update " . TABLE_PREFIX . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}"); if (DEBUG > '0') { echo mysql_error(); } } else { if ($rows != 0 && $reindex == 1) { $row = mysql_fetch_array($result); $md5sum = $row['md5sum']; $indexdate = $row['indexdate']; if (Configure::read('link_check') == 1 && $reindex == 1) { link_check($thislink, $level + 1, $sessid, $can_leave_domain, $reindex); } else { index_url($thislink, $level + 1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex); } mysql_query("update " . TABLE_PREFIX . "pending set level = {$level}, count={$count}, num={$num} where site_id={$site_id}"); if (DEBUG > '0') { echo mysql_error(); } } else { printStandardReport('inDatabase', $command_line); $realnum--; } } clean_resource($result); } $count++; } $level++; } mysql_query("delete from " . TABLE_PREFIX . "temp where id = '{$sessid}'"); if (DEBUG > '0') { echo mysql_error(); } mysql_query("delete from " . TABLE_PREFIX . "pending where site_id = '{$site_id}'"); if (DEBUG > '0') { echo mysql_error(); } create_sitemap($site_id, $url); printStandardReport('completed', $command_line); $stats = get_Stats(); $stats_sites = $stats['sites']; $stats_links = $stats['links']; $stats_categories = $stats['categories']; $stats_keywords = $stats['keywords']; printDatabase($stats_sites, $stats_links, $stats_categories, $stats_keywords); } }