/**
 * Render the subtree of the current user's sitemap list below $parent and
 * append it, as nested <ul>/<li> markup, to the global $html buffer.
 *
 * The list is looked up by $_SESSION['lname'] and $_SESSION['uid']; the script
 * is terminated with die() when a query fails or when no matching list exists.
 * Nothing is appended when $parent has no children.
 *
 * @param mixed  $parent Id (lipid) of the item whose children are rendered;
 *                       compared loosely (==) against each row's lipid.
 * @param string $lname  Unused: the list name is read from $_SESSION['lname'].
 *                       Kept so existing callers keep working.
 *
 * @return void Output is accumulated in the global $html.
 */
function get_sitemap( $parent, $lname ) {
	// Pre-set so the "not found" check below never reads an undefined variable.
	$lid = null;

	// Session values are escaped / cast before being placed into the SQL text.
	$sql = ' SELECT lid FROM lists WHERE lists.lname = "' . mysql_real_escape_string( $_SESSION['lname'] ) . '" AND lists.luid = ' . (int) $_SESSION['uid'] . ' ';
	$result = mysql_query( $sql ) or die( mysql_error() );

	if( mysql_num_rows( $result ) > 0 ) {
		$lid = (int) mysql_result( $result, 0 );
	}

	if( !$lid ) {
		die( '<h2>De sitemap is niet gevonden</h2>' );
	}

	$sql = ' SELECT * FROM listitems WHERE listitems.lilid = ' . $lid . ' ORDER BY listitems.liorder ASC ';
	$result = mysql_query( $sql ) or die( mysql_error() );

	// Load every item once, keyed by its id, in liorder order.
	$sitemap_arr = array();
	while( $row = mysql_fetch_assoc( $result ) ) {
		$sitemap_arr[ $row['liid'] ] = array(
			'name'   => $row['livalue'],
			'parent' => $row['lipid'],
			'liid'   => $row['liid']
		);
	}

	// Recurse over the in-memory rows instead of re-running both queries
	// for every item of the tree.
	_get_sitemap_render( $sitemap_arr, $parent );
}

/**
 * Helper for get_sitemap(): append the <ul> holding all direct children of
 * $parent (and, recursively, their descendants) to the global $html.
 *
 * @param array $items  Rows as built by get_sitemap() ('name', 'parent', 'liid').
 * @param mixed $parent Parent id whose children are rendered.
 *
 * @return void
 */
function _get_sitemap_render( $items, $parent ) {
	global $html;

	$has_children = false;

	foreach( $items as $item ) {
		// Loose comparison on purpose: ids arrive as strings from the database.
		if( $item['parent'] != $parent ) {
			continue;
		}

		// Open the list lazily so a parent without children emits no empty <ul>.
		if( $has_children === false ) {
			$has_children = true;
			$html .= '<ul>';
		}

		// NOTE(review): 'name' (livalue) is emitted without HTML escaping, as before;
		// confirm whether stored values may contain markup before adding escaping.
		$html .= ' <li id="item_' . $item['liid'] . '" style="list-style: none;">'
			. ' <div class="page_container">'
			. ' <div class="page">'
			. ' <img class="page_icon" src="../img/page_small.png" alt="Page icon" style="display: block; margin-top: 20px; height: 39px; width: 30px;" />'
			. ' <p class="page_name" style="margin-left: 100px; top: 20px;">' . $item['name'] . '</p>'
			. ' </div>'
			. ' </div> ';

		_get_sitemap_render( $items, $item['liid'] );

		$html .= '</li>';
	}

	if( $has_children === true ) {
		$html .= '</ul>';
	}
}
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num) { global $db_con, $entities, $min_delay, $link_check, $command_line, $min_words_per_page, $dup_content, $dup_url, $quotes, $plus_nr, $use_prefcharset; global $min_words_per_page, $supdomain, $smp, $follow_sitemap, $max_links, $realnum, $local, $tmp_dir, $auto_add, $admin_email, $idna, $conv_puny; global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr, $home_charset, $charSet, $url_status, $redir_count; global $debug, $common, $use_white1, $use_white2, $use_black, $whitelist, $blacklist, $clear, $abslinks, $utf8_verify, $webshot; global $index_media, $index_image, $suppress_suffix, $imagelist, $min_image_x, $min_image_y, $dup_media, $index_alt, $no_log, $index_rss; global $index_audio, $audiolist, $index_video, $videolist, $index_embeded, $rss_template, $index_csv, $delim, $ext, $index_id3, $dba_act; global $converter_dir, $dict_dir, $cn_seg, $jp_seg, $index_framesets, $index_iframes, $cdata, $dc, $preferred, $index_rar, $index_zip, $curl; global $docs, $only_docs, $only_links, $case_sensitive, $vowels, $noacc_el, $include_dir, $thumb_folder, $js_reloc, $server_char; global $latin_ligatures, $phon_trans, $liga; // Currently (2013.01.11) the variable $use_prefcharset as defined in Admin Settings 'Obligatory use preferred charset' is used. 
// and not the variable $use_pref as defined in Admin Settings as a varaiable used for addsite() in .../admin/admin.php error_reporting(E_ALL & ~E_DEPRECATED & ~E_WARNING & ~E_NOTICE & ~E_STRICT); $data = array(); $cn_data = array(); $url_parts = array(); $url_status = array(); $url_status['black'] = ''; $contents = array(); $links = array(); $wordarray = array(); $topic = ''; $url_reloc = ''; $js_link = ''; $document = ''; $file = ''; $file0 = ''; $raw_file = ''; $seg_data = ''; $index_url = $url; $comment = $db_con->real_escape_string("Automatically added during index procedure, as this domain is not yet available in 'Sites' menu."); $admin_email = $db_con->real_escape_string($admin_email); if ($debug == '0') { if (function_exists("ini_set")) { ini_set("display_errors", "0"); } error_reporting(0); } else { error_reporting(E_ERROR); // otherwise a non existing siemap.xml would always cause a warning message } $needsReindex = 1; $deletable = 0; $nohost = 1; $i = 0; $nohost_count = 5; // defines count of attempts to get in contact with the server // check URL status while ($i < $nohost_count && $nohost) { $url_status = url_status($url, $site_id, $sessid); if (!stristr($url_status['state'], "NOHOST")) { $nohost = ''; // reset for successfull attempt } $i++; } // check for emergency exit if ($url_status['aborted'] == '1' || stristr($url_status['state'], "NOHOST")) { return $url_status; } // check for UFO file or invalid suffix if (stristr($url_status['state'], "ufo")) { return $url_status; } // JFIELD here is right before we try to retrieve the URL and get the error // echo "<h3>F****E: $url</h3>\n"; // check for 'unreachable' links and if it is a known URL, delete all keyword relationships, former indexed from the meanwhile unreachable link if (stristr($url_status['state'], "unreachable")) { printStandardReport('unreachable', $command_line, $no_log); $sql_query = "SELECT link_id from " . $mysql_table_prefix . 
"links where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $link_id = $row[0]; if ($link_id) { $sql_query = "DELETE from " . $mysql_table_prefix . "link_keyword where link_id={$link_id}"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } // here we should delete the keywords associated only to the unreachable link // but this takes too much time during index procedure // the admin is asked toc do it manually by using the regarding option in 'Clean' menue // // delete the meanwhile unreachable link from db $sql_query = "DELETE from " . $mysql_table_prefix . 
"links where link_id = {$link_id}"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } return $url_status; } // check for overwritten URL, forced by the header, sending content PLUS any redirected URL if ($url_status['url_over'] && !$url_status['relocate']) { $url = $url_status['url_over']; } $url_parts = parse_all_url($url); $thislevel = $level - 1; // redirected URL ? if ($url_status['relocate']) { // if relocated, print message, verify the new URL, and redirect to new URL // check for redirection on an already indexed link $known_link = ''; $sql_query = "SELECT * from " . $mysql_table_prefix . "links where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $known_link = $result->num_rows; if ($known_link) { $urlo_status['state'] = "URL was redirected to an already indexed page.<br />In order to prevent infinite indexation, this is not supported by Sphider-plus.<br />Indexation aborted for this URL"; $url_status['aborted'] = 1; return $url_status; } // remove the original URL from temp table. The relocated URL will be added later on. mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $new_url = $url_status['path']; // URL of first redirection // remove the redirected URL, which eventually is already stored in db // before finally storing in db, we need to check for correct redirection. $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$new_url}' AND id = '{$sessid}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . 
" row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } // now special processing for relative links if (!strpos(substr($new_url, 0, 5), "ttp")) { $new_url = make_abs($new_url, $index_url); } if ($url == $new_url && $url_status['file']) { $url_status['relocate'] = ''; // remove this redirection, as it is 'in it selves' $url_status['state'] = "ok"; // try to index the conteent } $care_excl = '1'; // care file suffixed to be excluded $relocated = '1'; // URL is relocated if ($debug) { printRedirected($url_status['relocate'], $url_status['path'], $cl); } $count = "1"; while ($count <= $redir_count && $url_status['relocate'] && !$url_status['aborted']) { // check this redirection $url_status = url_status($new_url, $site_id, $sessid); if ($url_status['path']) { $new_url = $url_status['path']; // URL of another redirections // now special processing for relative links if (!strpos(substr($new_url, 0, 5), "ttp")) { $new_url = make_abs($new_url, $index_url); } } if ($debug) { printRedirected($url_status['relocate'], $url_status['path'], $cl); } $count++; } if ($url_status['relocate']) { $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted because of too many redirections.<br />"; return $url_status; } if ($url_status['state'] != "ok") { $code = $url_status['state']; // check for most common client errors if (!preg_match("/401|402|403|404/", $code)) { $url_status['aborted'] = 1; // end indexing for cmplete site } else { $url_status['aborted'] = ''; // abort only for this page } if (strstr($code, "401")) { $code = "401 (Authentication required)"; } if (strstr($code, "403")) { $code = "403 (Forbidden)"; } if (strstr($code, "404")) { $code = "404 (Not found)"; } $url_status['state'] = "<br />Indexation aborted because of code: {$code}.<br />"; } // check final URL (which might 
be the 3. redirection) // and puriify final redirected URL $url = $db_con->real_escape_string(url_purify($new_url, $index_url, $can_leave_domain, $care_excl, $relocated, $local_redir)); // valid file suffix for the redirection?? if ($url) { if ($care_excl == '1') { // care about non-accepted suffixes reset($ext); while (list($id, $excl) = each($ext)) { if (preg_match("/\\.{$excl}(\$|\\?)/i", $url)) { // if suffix is at the end of the link, or followd by a question mark $url_status['state'] = 'Found: Not supported suffix'; // error message return $url_status; } } } } if (!$url) { $link_parts = parse_all_url($url); $host = $link_parts['host']; $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$index_url}' AND id = '{$sessid}' OR relo_link like '{$url}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted because of undefined redirection error.<br />"; return $url_status; } // abort indexation, if the redirected URL is equal to calling URL if ($url == 'self') { $link_parts = parse_all_url($url); $host = $link_parts['host']; $sql_query = "DELETE from " . $mysql_table_prefix . 
"temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted for this page, because the redirection was a link in it selves.<br />Blocked by Sphider-plus, because this could end in an infinite indexation loop.<br />"; return $url_status; } // abort indexation, if the redirected URL contains invalid file suffix if ($url == 'excl') { $link_parts = parse_all_url($url); $host = $link_parts['host']; $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted because the redirected link does not meet the URL suffix conditions.<br />"; return $url_status; } // abort indexation, because purifing the redirected URL failed if (!strstr($url, "//")) { $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted because {$url} is not supported.<br />"; return $url_status; } // abort indexation, if redirected URL met 'must/must not include' string rule if (!check_include($url, $url_inc, $url_not_inc)) { $link_parts = parse_all_url($url); $host = $link_parts['host']; $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . 
" <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $url_status['aborted'] = 1; $url_status['state'] = "<br />Indexation aborted because the redirected link does not meet<br />the URL 'must include' or 'must not include' conditions.<br />"; return $url_status; } // if redirected URL is already known and in database: abort $rows0 = ''; $rows1 = ''; mysqltest(); $sql_query = "SELECT url from " . $mysql_table_prefix . "sites where url like '{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $rows0 = $result->num_rows; $sql_query = "SELECT * from " . $mysql_table_prefix . "links where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $known_link = $result->fetch_array(MYSQLI_NUM); $md5 = $known_link[8]; if ($clear == 1) { clean_resource($result, '02'); } if ($rows0) { $url_status['state'] = "<br />URL already in database (as a site URL). Index aborted.<br />"; $url_status['aborted'] = 1; return $url_status; } // if known link, which is already indexed (because containing the md5 checksum), enter here if ($known_link[8]) { $count = $known_link[15]; $count++; if ($count > $redir_count) { // abort indexation $url_status['state'] = "<br />{$count}. attempt to redirect in the same (already indexed) URL, <br />which is no longer accepted by Sphider-plus. Indexation aborted for this site.<br />"; $url_status['aborted'] = 1; return $url_status; } else { $sql_query = "UPDATE " . $mysql_table_prefix . "links set relo_count='{$count}' where url='{$url}'"; $db_con->query($sql_query); } } // add redirected URL to temp table, if not yet known $sql_query = "SELECT link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $rows = $result->num_rows; if ($rows == 0) { $sql_query = "INSERT into " . $mysql_table_prefix . 
"temp (link, level, id, relo_count) values ('{$url}', '{$level}', '{$sessid}', '1')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } if ($clear == 1) { clean_resource($result, '02'); } // at the end of redirect, rebuild the url parts from the redirected URL. // This is the final URL, which will be indexed $url_parts = parse_all_url($url); } // end check any redirection/relocation // if a JavaScript file is currently indexed? $suffix = substr($url, strrpos($url, ".") + 1); $suffix = str_replace("/", "", $suffix); if (strlen($suffix) < "5") { if (preg_match("/js\$/", $suffix)) { $js_link = 1; // activate JS switch } } if ($smp != 1 && $follow_sitemap == 1) { // enter here if we don't already know a valid sitemap and if admin settings allowed us to do so $tmp_urls = get_temp_urls($sessid); // reload previous temp $url2 = remove_sessid(convert_url($url)); // get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder $host = parse_addr($url2); $hostname = $host[host]; $more_sitemaps = array(); if ($hostname == 'localhost') { $host1 = str_replace($local, '', $url2); } $pos = strpos($host1, "/"); // on local server delete all behind the / if ($pos) { $host1 = substr($host1, 0, $pos); } // build full adress again, now only the host if ($hostname == 'localhost') { $url2 = "" . $local . "" . $host1 . 
""; } else { $url2 = "{$host['scheme']}://{$hostname}"; } $sitemap_name = "sitemap"; // standard name for sitemap file $input_file = "{$url2}/{$sitemap_name}"; // create path to sitemap $log_file = './sitemaps/current_sitemap.xml'; // destination for sitemap log-file $smap_found = ''; $indexed_map = ''; $map_cont = ''; // try to fetch individual sitemap url from database mysqltest(); $sql_query = "SELECT smap_url from " . $mysql_table_prefix . "sites where site_id='{$site_id}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); if (preg_match("/http:\\/\\//", $row[0])) { // use the individual sitemap $input_file = preg_replace("/.xml.gz|.xml/i", "", $row[0]); } $file = "" . $input_file . ".xml"; if ($fd = @fopen($file, "r")) { // uncompressed ? //if ($zd = @gzopen("".$input_file.".xml", "r")) { // uncompressed ? $map_cont = @stream_get_contents($fd); if ($map_cont && strpos($map_cont, "schemas/sitemap")) { // if we were able to read it $smap_found = '1'; } fclose($fd); } $gz_file = "" . $input_file . ".xml.gz"; if (!$smap_found && ($zd = @fopen("compress.zlib://{$gz_file}", "r"))) { // compressed ? //if (!$smap_found && $zd = @gzopen("".$input_file.".xml.gz", "r")) { // compressed ? $map_cont = @gzread($zd, 10485760); // max. 
10 MB (might be too large for some server) gzclose($zd); if ($map_cont && strpos($map_cont, "schemas/sitemap")) { $smap_found = '1'; } } //echo "\r\n\r\n<br>map_cont Array:<br><pre>";print_r($map_cont);echo "</pre>\r\n"; if ($smap_found) { if ($debug != '0') { // create a log-file of current sitemap.xml file_put_contents($log_file, $map_cont); } //$del = $db_con->query("DELETE from ".$mysql_table_prefix."temp"); // function get_sitemap and store_links will build a new temp table if (stristr($map_cont, "<sitemapindex")) { // if current sitemap file is an index file printStandardReport('validSitemapInd', $command_line, $no_log); $get_maps = simplexml_load_string($map_cont); if ($get_maps) { reset($get_maps); foreach ($get_maps as $map_x) { $new_links[] = $map_x->loc; // get all links to further sitemap files } if (is_array($new_links)) { // if we found more sitemap files $new_links = explode(",", implode(",", $new_links)); // destroy SimpleXMLElement Object and get the link array $new_links = array_slice($new_links, 0, $max_links); $indexed_map = '1'; $i = '0'; //echo "\r\n\r\n<br>new_links Array:<br><pre>";print_r($new_links);echo "</pre>\r\n"; foreach ($new_links as $input_file) { $these_links = get_sitemap($input_file, $indexed_map, $mysql_table_prefix); // now extract page links from this sitemap file //echo "\r\n\r\n<br>these_links Array:<br><pre>";print_r($these_links);echo "</pre>\r\n"; if ($these_links) { reset($these_links); store_newLinks($these_links, $level, $sessid); $smp = '1'; // there were valid sitemap files and we stored the new links $i++; } else { printStandardReport('invalidSecSitemap', $command_line, $no_log); // unable to extract links from secondary sitemap file } } printValidSecSmap($i, $cl); unset($input_file, $map_cont, $new_links); } else { printStandardReport('invalidSecSitemap', $command_line, $no_log); // unable to extract links from secondary sitemap file } } else { printStandardReport('invalidSitemapInd', $command_line, $no_log); // 
unable to extract links from sitemap INDEX file } } else { $links = get_sitemap($map_cont, $indexed_map, $mysql_table_prefix); // extract links from sitemap.xml (there was only one sitemap file) if ($links != '') { reset($links); //echo "\r\n\r\n<br>sitemmap links Array:<br><pre>";print_r($links);echo "</pre>\r\n"; store_newLinks($links, $level, $sessid); $smp = '1'; // there was one valid sitemap and we stored the new links printStandardReport('validSitemap', $command_line, $no_log); } else { printStandardReport('invalidSitemap', $command_line, $no_log); } unset($links); } } } if ($debug == '0') { if (function_exists("ini_set")) { ini_set("display_errors", "0"); } error_reporting(0); } else { error_reporting(E_ALL & ~E_DEPRECATED & ~E_WARNING & ~E_NOTICE & ~E_STRICT); } if ($url_status['state'] == 'ok') { $OKtoIndex = 1; $file_read_error = 0; if (time() - $delay_time < $min_delay) { sleep($min_delay - (time() - $delay_time)); } if ($url_status['file']) { $file = $url_status['file']; } else { $url_status['state'] = "Unable to read the content of the file.<br />{$url} does not deliver any content."; $realnum--; } } if ($url_status['state'] == 'ok') { // first attempt to define a charset $chrSet = ''; if ($use_prefcharset == '1') { // use preferred charset as defined in Admin settings $chrSet = $home_charset; //echo "<h1>USING PREFERRED CHARSET</h1>"; } else { if ($server_char && $url_status['charset']) { //echo "<h1>USING SERVER CHARSET</h1>"; $chrSet = $url_status['charset']; // use charset as supplied by the remote server } else { // try to extract the charset of this file //echo "<h1>USING CONTENT CHARSET</h1>"; //echo "<h1>" . substr($file, 0, 500) . 
"</h1>"; if (preg_match("'encoding=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) { //echo "<h1>1</h1>"; $chrSet = trim(strtoupper($regs[1])); // get encoding of current XML or XHTML file and use it furtheron } if (!$chrSet) { //echo "<h1>2</h1>"; if (preg_match("'charset=(.*?)[ \\/\\;\\'\"]'si", substr($file, 0, 3000), $regs)) { //echo "<h1>3</h1>"; $chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron } } if (!$chrSet) { //echo "<h1>4</h1>"; if (preg_match("'charset=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) { //echo "<h1>5</h1>"; $chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron } } // in assistance for all lazy webmasters $chrSet = preg_replace("/win-/si", "windows-", $chrSet); if ($chrSet == "1251") { //echo "<h1>6</h1>"; $chrSet = "windows-1251"; } if ($chrSet == '') { //echo "<h1>7</h1>"; $chrSet = $home_charset; // no charset found, we need to use default charset like for DOCs, PDFs, etc } } } //echo "<h1>CHRSET: $chrSet</h1>"; // if required, uncompress ZIP archives and make content of each file => text if ($url_status['content'] == 'zip' && $index_zip == '1' && $file) { file_put_contents("" . $tmp_dir . "/archiv.temp", $file); $zip = zip_open("" . $tmp_dir . "/archiv.temp"); if ($zip) { $url_status['content'] = "text"; // preventiv, if not another status will be detected for individual archiv files $file = ''; // starting with a blank file for all archive files $topic = 'zip'; if ($debug == '2') { printStandardReport('archivFiles', $command_line, $no_log); } while ($zip_entry = zip_read($zip)) { if (zip_entry_open($zip, $zip_entry, "r")) { $buf = zip_entry_read($zip_entry, zip_entry_filesize($zip_entry)); //uncompress the content of recent archiv file $name = zip_entry_name($zip_entry); // get filename of recent archive file if ($debug == '2') { // $report = "<strong> " . $name . 
"</strong>"; printThis($report, $cl); $size = (int) (zip_entry_filesize($zip_entry) / 1024); if ($size == 0) { $size = '1'; } $report = " - Unpacked size: " . $size . " kByte<br />"; printThis($report, $cl); } $buf = get_arch_content($buf, $name, $url, $chrSet); // if necessary, convert PDF, extract feed etc. for the recent file zip_entry_close($zip_entry); // done for this file in archiv $file .= "" . $buf . "<br /><br />"; // add all uncompressed and converted files together } } zip_close($zip); } unlink("" . $tmp_dir . "/archiv.temp"); } // if required, uncompress RAR archives and make content of each file => text if ($url_status['content'] == 'rar' && $index_rar == '1') { file_put_contents("" . $tmp_dir . "/archiv.temp", $file); $rar = rar_open("" . $tmp_dir . "/archiv.temp"); if ($rar) { $url_status['content'] = "text"; // preventiv, all individual archiv files willl be converted to 'text' $file = ''; // starting with a blank file for all archive files $topic = 'rar'; $entries = rar_list($rar); if ($rar) { if ($debug == '2') { printStandardReport('archivFiles', $command_line, $no_log); } foreach ($entries as $entry) { $name = $entry->getName(); if ($debug == '2') { $report = "<strong> " . $name . "</strong>"; printThis($report, $cl); $size = (int) ($entry->getPackedSize() / 1024); if ($size == 0) { $size = '1'; } $report = " - Packed size: " . $size . " kByte"; printThis($report, $cl); $size = (int) ($entry->getUnpackedSize() / 1024); if ($size == 0) { $size = '1'; } $report = " - Unpacked size: " . $size . " kByte<br />"; printThis($report, $cl); } $entry->extract('', "./" . $tmp_dir . "/" . $name . ""); // extract single file of archiv into temporary folder $buf = file_get_contents("./" . $tmp_dir . "/" . $name . ""); // read content of this intermediate file unlink("./" . $tmp_dir . "/" . $name . ""); // destroy this file if ($buf) { $buf = get_arch_content($buf, $name, $url, $chrSet); // if necessary, convert PDF, extract feed etc. 
for the recent file $file .= "" . $buf . "<br /><br />"; // add all uncompressed and converted files together } } } rar_close($rar); } unlink("" . $tmp_dir . "/archiv.temp"); } $file0 = $file; // rememberr the original (e.g. for doc2txt converter) // remove useless part of the content $file = purify_content($file); $valid_utf8 = '1'; $raw_file = $file; // kill eventually duplicate coding info in dynamic links if (stristr(substr($file, '0', '4000'), "encoding") && strstr(substr($file, '0', '4000'), "charset")) { $file = substr($file, strrpos($file, "<!DOCTYPE")); // subsstring starting at last found <!DOCTYPE } // we need to do it again for eventually new charset in archive $chrSet = ''; if ($use_prefcharset == '1') { // use preferred charset as defined in Admin settings $chrSet = $home_charset; } else { if ($server_char && $url_status['charset']) { $chrSet = $url_status['charset']; // use charset as supplied by the remote server } else { // try to extract the charset of this file if (preg_match("'encoding=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) { $chrSet = trim(strtoupper($regs[1])); // get encoding of current XML or XHTML file and use it furtheron } if (!$chrSet) { if (preg_match("'charset=(.*?)[ \\/\\;\\'\"]'si", substr($file, 0, 3000), $regs)) { $chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron } } if (!$chrSet) { if (preg_match("'charset=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) { $chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron } } // in assistance for all lazy webmasters $chrSet = preg_replace("/win-/si", "windows-", $chrSet); if ($chrSet == "1251") { $chrSet = "windows-1251"; } if ($chrSet == '') { $chrSet = $home_charset; // no charset found, we need to use default charset like for DOCs, PDFs, etc } } } if (strpos($chrSet, " ")) { // in the wild we have aloready seen a lot of variants $chrSet = substr($chrSet, 0, strpos($chrSet, " ")); 
} // some webmaster still use 'UNICODE' as name if (stristr($chrSet, "UNICODE")) { $chrSet = "UTF-8"; } // obsolete since 1990, but some (Italian) server still send it as charset . . . . if (stristr($chrSet, "8858")) { $chrSet = str_replace("8858", "8859", $chrSet); } // required coaching for some webmasters if (stristr($chrSet, "cp-")) { $chrSet = str_ireplace("CP-", "CP", $chrSet); } $contents['charset'] = $chrSet; if ($index_framesets == '1') { if (preg_match("@<frameset[^>]*>(.*?)<\\/frameset>@si", $file, $regs)) { printStandardReport('newFrameset', $command_line, $no_log); // separate the <frameset> ....</frameset> part of this file $frame = $regs[1]; $replace = get_frames($frame, $url, $can_leave_domain); $replace = "<body>" . $replace . "</body>"; // create the body tags for $file $contents['charset'] = $chrSet; // rebuild charset // include all replacements instead of the frameset tag into the actual file. This will become the body $file = preg_replace("@<frameset.*?</frameset>@si", "{$replace}", $file); } } if ($index_iframes == '1') { $links = array(); $regs = array(); $replace = ''; $get_charset = ''; $real_url = $url; if (preg_match_all("/(iframe[^>]*src[[:blank:]]*)=[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", $file, $regs, PREG_SET_ORDER)) { printStandardReport('newIframe', $command_line, $no_log); // find all frames of the iframe; $care_excl = ''; // don't care file suffixed to be excluded $relocated = ''; // URL is not relocated foreach ($regs as $val) { if (($a = url_purify($val[2], $url, $can_leave_domain, $care_exel, $relocated, $local_redir)) != '') { $links[] = $a; // collect all iframe links } } if ($links) { foreach ($links as $url) { printNewLinks($url, $cl); if (preg_match("/.html|.htm|.xhtml|.xml|.php/i", $url)) { $frame = file_get_contents($url); // get content of this frame // separate the body part of this frame 
preg_match("@<body[^>]*>(.*?)<\\/body>@si", $frame, $regs); $body = $regs[1]; if ($abslinks == '1') { $body = make_abslinks($body, $url); // if required, correct links relative to found iframe } $replace = "" . $replace . "<br />" . $body . ""; } else { // might be an image $replace = "" . $replace . "<br /><img src=\"" . $url . "\">"; } } } // include all replacements instead of the iframe tag into the actual file $file = preg_replace("@<iframe.*?</iframe>@si", "{$replace}", $file); $contents['charset'] = $chrSet; // rebuild charset } $url = $real_url; } // in order to index RDF, RSD, RSS and ATOM feeds enter here if ($url_status['content'] == 'xml' && $index_rss == '1') { if (!preg_match("/<rss|atom|<feed|<rdf|<rsd/si", substr($file, 0, 400))) { printStandardReport('notRSS', $command_line, $no_log); // no valid feed detected $OKtoIndex = 0; $file_read_error = 1; $realnum--; } else { $html = ''; $xml = XML_IsWellFormed($file); // check for well-formed XML if ($xml != '1') { if ($debug > 0) { printNotWellFormedXML($xml, $cl); } $OKtoIndex = 0; $file_read_error = 1; $realnum--; } else { $rss = new feedParser(); // define options for feed parser $rss->limit = $max_links; // save time by limiting the items/entries to be processed $rss->in_cp = strtoupper($contents['charset']); // charset of actual file $rss->out_cp = 'UTF-8'; // convert all into this charset $rss->cache_dir = ''; // currently unused $rss->dc = $dc; // treat Dublin Core tags in RDF feeds $rss->pro = $preferred; // obey the PREFERRED directive in RSD feeds $rss->file = '1'; // use $file as feed (as a string, not URL) if ($cdata != 1) { $rss->CDATA = 'content'; // get it all (naughty) } else { $rss->CDATA = 'nochange'; // well educated crawler } // get feed as array if ($feed = $rss->get($url, $file)) { // if you want to see the feed during index procedure, uncomment the following row // echo "<br>FEED array:<br><pre>";print_r($feed);echo "</pre>"; $link = ''; $textinput_link = ''; $image_url = ''; 
$image_link = ''; $docs = ''; $subjects = ''; $count = ''; $type = $feed[type]; $count = $feed[sub_count]; $cached = $feed[cached]; // kill all no longer required values $feed[type] = ''; $feed[sub_count] = ''; $feed[encoding_in] = ''; $feed[encoding_out] = ''; $feed[items_count] = ''; $feed[cached] = ''; if (!$count) { $count = '0'; } if ($type == 'RSD') { // prepare all RSD APIs for ($i = 0; $i < $count; $i++) { $subjects .= '' . $feed['api'][$i]['name'] . '<br /> ' . $feed['api'][$i]['apiLink'] . '<br /> ' . $feed['api'][$i]['blogID'] . '<br /> ' . $feed['api'][$i]['settings_docs'] . '<br /> ' . $feed['api'][$i]['settings_notes'] . '<br />'; } } if ($type == 'Atom') { // prepare all Atom entries for ($i = 0; $i < $count; $i++) { $subjects .= '' . $feed['entries'][$i]['link'] . '<br /> ' . $feed['entries'][$i]['title'] . '<br /> ' . $feed['entries'][$i]['id'] . '<br /> ' . $feed['entries'][$i]['published'] . '<br /> ' . $feed['entries'][$i]['updated'] . '<br /> ' . $feed['entries'][$i]['summary'] . '<br /> ' . $feed['entries'][$i]['rights'] . '<br /> ' . $feed['entries'][$i]['author_name'] . ' ' . $feed['entries'][$i]['author_email'] . ' ' . $feed['entries'][$i]['author_uri'] . '<br /> ' . $feed['entries'][$i]['category_term'] . ' ' . $feed['entries'][$i]['category_label'] . ' ' . $feed['entries'][$i]['category_scheme'] . '<br /> ' . $feed['entries'][$i]['contributor_name'] . ' ' . $feed['entries'][$i]['contributor_email'] . ' ' . $feed['entries'][$i]['contributor_uri'] . '<br /> '; } } if ($type == 'RDF' | $type == 'RSS v.0.91/0.92' | $type == 'RSS v.2.0') { // For RDF and RSS feeds enter here // prepare channel image $image_url = $feed[image_url]; if ($image_url) { $width = $feed[image_width]; if (!$width || $width > '144') { $width = '88'; //set to default value } $height = $feed[image_height]; if (!$height || $height > '400') { $height = '31'; //set to default value } $feed[image_url] = "<img id=\"rss_007\" src=\"" . $image_url . "\" alt=\"" . 
$feed[image_title] . "\" width=\"" . $width . "\" height=\"" . $height . "\">"; } $image_link = $feed[image_link]; if ($image_link) { $feed[image_link] = "<a href=\"" . $image_link . "\">" . $image_link . "</a>"; } // prepare all RDF or RSS items for ($i = 0; $i < $count; $i++) { $subjects .= '' . $feed['items'][$i]['link'] . '<br /> ' . $feed['items'][$i]['title'] . '<br /> ' . $feed['items'][$i]['description'] . '<br /> ' . $feed['items'][$i]['author'] . '<br /> ' . $feed['items'][$i]['category'] . '<br /> ' . $feed['items'][$i]['guid'] . '<br /> ' . $feed['items'][$i]['comments'] . '<br /> ' . $feed['items'][$i]['pubDate'] . '<br /> ' . $feed['items'][$i]['source'] . '<br /> ' . $feed['items'][$i]['enclosure'] . '<br /> ' . $feed['items'][$i]['country'] . '<br /> ' . $feed['items'][$i]['coverage'] . '<br /> ' . $feed['items'][$i]['contributor'] . '<br /> ' . $feed['items'][$i]['date'] . '<br /> ' . $feed['items'][$i]['industry'] . '<br /> ' . $feed['items'][$i]['language'] . '<br /> ' . $feed['items'][$i]['publisher'] . '<br /> ' . $feed['items'][$i]['state'] . '<br /> ' . $feed['items'][$i]['subject'] . '<br /> '; } } // convert the channel/feed part into a string $feed_common = implode(" ", $feed); // build something that could be indexed $html .= "<html>\r\n<head>\r\n<title>" . $feed['title'] . "</title>\r\n<meta name=\"description\" content=\"" . $feed['description'] . " \">\r\n</head>\r\n"; $html .= "<body>\r\n" . $feed_common . "\r\n" . $subjects . 
"\r\n</body>\r\n</html>\r\n"; } if (strlen($html) < "130") { // can't be a valid feed if ($type == "unknown") { printInvalidFeedType($type, $cl); } else { printStandardReport('invalidRSS', $command_line, $no_log); } $OKtoIndex = 0; $file_read_error = 1; $realnum--; } else { $contents['charset'] = 'UTF-8'; // the feed reader converts all to utf-8 $file = $html; // use feed reader output if ($debug > 0) { printValidFeed($type, $count, $cl); } } } } } // duplicate here, but frames, iframes, or RSS might have added nonsense content $file = purify_content($file); // prepare CVS files if ($url_status['content'] == 'csv' && $index_csv == '1') { $file = str_replace(",", " ", $file); $file = str_replace(";", " ", $file); } //echo "\r\n\r\n<br>url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n"; // for DOCs, PDFs, etc we need special text converter if ($url_status['content'] != 'text' && $url_status['content'] != 'xml' && $url_status['content'] != 'xhtml' && $url_status['content'] != 'csv') { $document = 1; $file = extract_text($file, $file0, $url_status['content'], $url, $chrSet); // because the converter already transferred the documents to UTF-8, we need to adjust it here $contents['charset'] = 'UTF-8'; $charSet = 'UTF-8'; if ($file == 'ERROR') { // if error, suppress further indexing $OKtoIndex = 0; $file_read_error = 1; $realnum--; } // reduce Pashtu and Urdu to the main Farsi letters if (strtolower($charSet) == 'windows-1256' && $url_status['content'] == 'pdf') { $f_letter0 = array("ﺎ", "�"); $f_letter1 = array("�", "�", "ﺑ", "ﺒ"); $f_letter2 = array("ï–", "ïÂÂÂâ€â€Â", "ïÂÂÂËœ", "ïÂÂÂâ„¢"); $f_letter3 = array("ﺕ", "ﺖ", "ïºâ€â€Â", "ﺘ"); $f_letter4 = array("ﺙ", "ﺚ", "ﺛ", "ﺜ"); $f_letter5 = array("�", "ﺞ", "ﺟ", "ﺠ"); $f_letter6 = array("ïº", "ï»", "ï¼", "ï½"); $f_letter7 = array("ﺡ", "ﺢ", "ﺣ", "ﺤ"); $f_letter8 = array("ﮋ", "ﮊ"); $f_letter9 = array("ﺥ", "ﺦ", "ﺧ", "ﺨ"); $f_letter10 = array("ﺩ", "ﺪ"); $f_letter11 = array("ﺫ", "ﺬ"); $f_letter12 = array("ïºÂÂÂ", 
"ﺮ"); $f_letter13 = array("ﺯ", "ﺰ"); $f_letter14 = array("ﺱ", "ﺲ", "ﺳ", "ﺴ"); $f_letter15 = array("ﺵ", "ﺶ", "ﺷ", "ﺸ"); $f_letter16 = array("ﺹ", "ﺺ", "ﺻ", "ﺼ"); $f_letter17 = array("ﺽ", "ﺾ", "ﺿ", "ﻀ"); $f_letter18 = array("�", "ﻂ", "ﻃ", "ﻄ"); $f_letter19 = array("ï»…", "ﻆ", "ﻇ", "ﻈ"); $f_letter20 = array("ﻉ", "ﻊ", "ﻋ", "ﻌ"); $f_letter21 = array("�", "ﻎ", "�", "�"); $f_letter22 = array("ﻑ", "ï»’", "ﻓ", "ï»â€ÂÂ"); $f_letter23 = array("ﻕ", "ï»–", "ï»â€â€Â", "ﻘ"); $f_letter24 = array("ï»™", "ﻚ", "ï»›", "ﻜ", "ﮎ", "�", "�", "ﮑ"); $f_letter25 = array("ï®’", "ﮓ", "ï®â€ÂÂ", "ﮕ"); $f_letter26 = array("�", "ﻞ", "ﻟ", "ï» "); $f_letter27 = array("ﻡ", "ﻢ", "ﻣ", "ﻤ"); $f_letter28 = array("ﻧ", "ﻨ", "ﻦ", "ﻥ"); $f_letter29 = array("ï»ÂÂÂ", "ï»®"); $f_letter30 = array("ﻩ", "ﻪ", "ﻫ", "ﻬ"); $f_letter31 = array("ﻯ", "ï»°", "ï»±", "ﻲ", "ﻳ", "ï»´"); $file = str_replace($f_letter0, "ا", $file); $file = str_replace($f_letter1, "ب", $file); $file = str_replace($f_letter2, "Ù¾", $file); $file = str_replace($f_letter3, "ت", $file); $file = str_replace($f_letter4, "Ø«", $file); $file = str_replace($f_letter5, "ج", $file); $file = str_replace($f_letter6, "Ú†", $file); $file = str_replace($f_letter7, "ØÂÂÂ", $file); $file = str_replace($f_letter8, "Ú˜", $file); $file = str_replace($f_letter9, "Ø®", $file); $file = str_replace($f_letter10, "د", $file); $file = str_replace($f_letter11, "Ø°", $file); $file = str_replace($f_letter12, "ر", $file); $file = str_replace($f_letter13, "ز", $file); $file = str_replace($f_letter14, "س", $file); $file = str_replace($f_letter15, "Ø´", $file); $file = str_replace($f_letter16, "ص", $file); $file = str_replace($f_letter17, "ض", $file); $file = str_replace($f_letter18, "Ø·", $file); $file = str_replace($f_letter19, "ظ", $file); $file = str_replace($f_letter20, "ع", $file); $file = str_replace($f_letter21, "غ", $file); $file = str_replace($f_letter22, "Ù�", $file); $file = str_replace($f_letter23, "Ù‚", $file); $file = str_replace($f_letter24, "Ú©", $file); $file = 
str_replace($f_letter25, "Ú¯", $file); $file = str_replace($f_letter26, "Ù„", $file); $file = str_replace($f_letter27, "Ù…", $file); $file = str_replace($f_letter28, "Ù†", $file); $file = str_replace($f_letter29, "Ùˆ", $file); $file = str_replace($f_letter30, "Ù‡", $file); $file = str_replace($f_letter31, "ÙŠ", $file); } } if ($OKtoIndex == 1) { $pageSize = number_format(strlen($file) / 1024, 2, ".", ""); printPageSizeReport($pageSize, $topic); } $charSet = strtoupper(trim($contents['charset'])); // final charset for UTF-8 converter if (stristr($charSet, "encoding") || strlen($charSet) < '3') { // must be invalid encountered charset $charSet = 'UTF-8'; } //echo "\r\n\r\n<br /> final charSet: '$charSet'<br />\r\n"; if ($charSet == "UTF-16") { $charSet = "UTF-8"; // content will be converted in function clean_file() } $dic = ''; // if Chinese or Korean text should be segmented enter here if ($cn_seg == '1' && $file && !$js_link && !stristr($charSet, "8859")) { if ($charSet == 'GB2312' || $charSet == 'GB18030' || $charSet == 'GBK') { $dic = "" . $dict_dir . "/cn_gb18030.dic"; // simplified Chinese } if ($charSet == 'BIG5') { $dic = "" . $dict_dir . "/cn_big5.dic"; // traditional Chinese } if ($charSet == 'ISO10646-1933') { $dic = "" . $dict_dir . "/kr_iso10646-1933.dic"; // Korean } if ($charSet == 'EUC-KR') { $dic = "" . $dict_dir . "/kr_euc-kr.dic"; // Korean } if ($charSet == 'UTF-8') { $dic = "" . $dict_dir . "/cn_utf-8.dic"; // Unicode } if ($dic) { // if dictionary is available for page charset, perform a segmentation $Segmentation = new Segmentation(); $Segmentation->load($dic); $Segmentation->setLowercase(FALSE); $cn_result = $Segmentation->segmentString($file); if ($cn_result && $charSet != 'UTF-8') { $iconv_file = @iconv($charSet, "UTF-8//IGNORE", $cn_result); if (trim($iconv_file) == "") { // iconv is not installed or input charSet not available. 
We need to use class ConvertCharset $NewEncoding = new ConvertCharset($charSet, "utf-8"); $NewFileOutput = $NewEncoding->Convert($cn_result); $cn_result = $NewFileOutput; } else { $cn_result = $iconv_file; } unset($iconv_file, $NewEncoding, $NewFileOutput); } $seg_data = clean_file($cn_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain); } else { printNoDictionary($charSet, $cl); // no dictionary found for this charset } } // if Japanese text should be segmented enter here. But not if a Chinese dictonary was already found if ($jp_seg == '1' && $file && !$js_link && !stristr($charSet, "ISO") && !$dic) { $dic = ''; if ($charSet == 'UTF-8' || $charSet == 'EUC-JP') { $file = @iconv($charSet, "SHIFT_JIS//IGNORE", $file); $charSet = "SHIFT_JIS"; } if ($charSet == 'SHIFT_JIS') { $dic = "" . $dict_dir . "/jp_shiftJIS.dic"; } if ($dic) { // if dictionary is available for page charset, perform a segmentation $Segmentation = new Segmentation(); $Segmentation->load($dic); $Segmentation->setLowercase(FALSE); $jp_result = $Segmentation->segmentString($file); //echo "\r\n\r\n<br /> jp_result: $jp_result<br />\r\n"; if ($jp_result && $charSet != 'UTF-8') { $iconv_file = @iconv($charSet, "UTF-8//IGNORE", $jp_result); if (trim($iconv_file) == "") { // iconv is not installed or input charSet not available. 
We need to use class ConvertCharset $NewEncoding = new ConvertCharset($charSet, "utf-8"); $NewFileOutput = $NewEncoding->Convert($jp_result); $jp_result = $NewFileOutput; } else { $jp_result = $iconv_file; } unset($iconv_file, $NewEncoding, $NewFileOutput); } $seg_data = clean_file($jp_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain); } else { printNoDictionary($charSet, $cl); // no dictionary found for this charset } } // enter here only, if site / file is not yet UTF-8 coded or had already been converted to UTF-8 if ($charSet != "UTF-8" && $file) { $file = convertToUTF8($file, $charSet, $char_Set, $converter_dir); } // if activated in Admin backend, check for correct converting of $file into UTF-8 if ($utf8_verify) { $valid_utf8 = @iconv('UTF-8', 'UTF-8', $file) === $file; } if (!$valid_utf8) { $url_status['state'] = "<br />Invalid charset definition placed in meta tags of HTML header. Unable to convert the text into UTF-8<br />Indexing aborted for {$url}"; if ($server_char) { $url_status['state'] = "<br />Invalid charset definition supplied via HTTP by the client server. 
Unable to convert the text into UTF-8<br />Indexing aborted for {$url}"; } if ($use_prefcharset) { $url_status['state'] = "<br />Invalid charset definition placed Admin Settings.<br />Site was created with another charset<br />Indexing aborted for {$url}"; } printUrlStatus($url_status['state'], $command_line, $no_log); $file = ''; $deletable = 1; } else { if ($index_media == '1') { $newmd5sum = md5($file); // get md5 including links and title of media files } $data = clean_file($file, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain); //echo "\r\n\r\n<br>data Array:<br><pre>";print_r($data);echo "</pre>\r\n"; // index only links and their titles if ($only_links) { $media_links = '0'; $my_links = get_link_details($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir); $data['content'] = $my_links[0][0]; // define new content $data['fulltext'] = $my_links[0][0]; // define new content also for 'full text'; } // combine raw words plus segmented words if ($cn_seg == 1 || $jp_seg == 1 && $dic && !$js_link) { if ($debug != '0') { $seg_add = $seg_data[count] - $data[count]; // calculate segmentation result if ($seg_add > '0') { if ($charSet == 'EUC-KR' || $charSet == 'ISO10646-1933') { printSegKR($seg_add, $cl); } if ($charSet == 'SHIFT_JIS') { printSegJA($seg_add, $cl); } else { printSegCN($seg_add, $cl); } } /* echo "<br /><pre>Results of word segmentation:</pre>"; echo "<br />Unsegmented title :<br><pre>";print_r($data[title]);echo "</pre>"; echo "<br />Segmented title :<br><pre>";print_r($seg_data[title]);echo "</pre>"; echo "<br />Unsegmented full text:<br />$data[fulltext]<br />"; echo "<br />Segmented full text:<br />$seg_data[fulltext]"; */ } $data[content] = "" . $data[content] . "" . $seg_data[content] . ""; //$data[title] ="".$data[title]."".$seg_data[title].""; $data[description] = "" . $data[description] . "" . $seg_data[description] . ""; $data[keywords] = "" . $data[keywords] . "" . 
$seg_data[keywords] . ""; } // check if canonical redirection was found in page head $cano_link = '0'; if ($data['cano_link']) { //echo "\r\n\r\n<br /> url: '$url'<br />\r\n"; $cano_link = $db_con->real_escape_string($data['cano_link']); //echo "\r\n\r\n<br /> cano_link: '$cano_link'<br />\r\n"; if ($url != $cano_link) { // only new cano links are accepted $OKtoIndex = 0; $deletable = 1; $realnum--; if ($cano_link == "1") { printNoCanonical($cano_link, $cl); // if unable to extract redirection link } else { if ($data['refresh'] == '1') { printRefreshed($cano_link, $data['wait'], $cl); // if refresh meta tag was found in HTML head } else { printCanonical($cano_link, $cl); // if canonical link was found in HTML head } // do we already know this link in link-table $sql_query = "SELECT /* jfield 2 */ url from " . $mysql_table_prefix . "links where url like '{$cano_link}'"; $res = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $rows = $res->num_rows; if ($rows == 0) { // if not known in link-table, check if already known in temp-table $sql_query = "SELECT /* jfield 1 */ link from " . $mysql_table_prefix . "temp where link like '{$cano_link}'"; $res = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . 
__FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $rows = $res->num_rows; if ($rows == 0) { // not known in link-table, add new link if ($numoflinks <= $max_links) { $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$cano_link}', '{$level}', '{$sessid}')"; $db_con->query($sql_query); } if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } } } } $cano_link = '0'; // reset the cano flag } else { if ($index_media == '0') { $newmd5sum = md5($data['content']); // get md5 from cleaned full text only } if ($md5sum == $newmd5sum) { printStandardReport('md5notChanged', $command_line, $no_log); $OKtoIndex = 0; $realnum--; } else { mysqltest(); // check for duplicate page content $sql_query = "SELECT * from " . $mysql_table_prefix . "links where md5sum='{$newmd5sum}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . 
" row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } if ($num_rows = $result->num_rows) { // display warning message and urls with duplicate content printStandardReport('duplicate', $command_line, $no_log); while ($row = $result->fetch_array(MYSQLI_ASSOC)) { $dups[] = $row['link_id']; } for ($i = 0; $i < $num_rows; $i++) { $link_id = $dups[$i]; //$num = $i+1; $sql_query = "SELECT * from " . $mysql_table_prefix . "links where link_id like '{$link_id}'"; $res1 = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $res1->fetch_array(MYSQLI_NUM); $dup_url = urldecode($row[2]); $dup_url = $dup_url; $dup_url = @iconv($charSet, "UTF-8//IGNORE", $dup_url); if ($idna) { // Initialize the converter class $IDN = new idna_convert(array('idn_version' => 2008)); if ($conv_puny && strstr($dup_url, "xn--") && $idna) { $dup_url = $IDN->decode($dup_url); } } if ($clear == 1) { clean_resource($res, '03'); } printDupReport($dup_url, $command_line); } if ($dup_content == '0') { // enter here, if pages with duplicate content should not be indexed/re-indexed $OKtoIndex = 0; $realnum--; } else { $OKtoIndex = 1; } } } } //echo "\r\n\r\n<br>data array1:<br><pre>";print_r($data);echo "</pre>\r\n"; if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) { $urlparts = parse_addr($url); $newdomain = $urlparts['host']; $type = 0; if ($data['noindex'] == 1) { // remember this URlL, so it might not become another time a new link // check without scheme and www. $check_link = substr($check_link, stripos($url, "//") + 2); if (stristr($check_link, "www.")) { $check_link = substr($check_link, stripos($check_link, "www") + 4); } $sql_query = "SELECT url from " . $mysql_table_prefix . "links where url like '%{$check_link}'"; $res = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $known_link = $res->num_rows; if ($known_link != '1') { $sql_query = "INSERT into " . $mysql_table_prefix . "links (site_id, url, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', curdate(), '{$pageSize}', '{$newmd5sum}', '{$thislevel}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } $OKtoIndex = 0; $deletable = 1; $realnum--; printStandardReport('metaNoindex', $command_line, $no_log); } if (!$js_link) { // JavaScript will not deliver keywords, only links are parsed $content = explode(" ", addslashes($data['content'])); //echo "\r\n\r\n<br>content array0:<br><pre>";print_r($content);echo "</pre>\r\n"; $acc_words[] = array(); $type = ''; // if Greek accents should be removed from Greek vowels if ($noacc_el) { foreach ($content as &$thisword) { $no_acc = remove_acc_el($thisword); if ($no_acc != $thisword) { $acc_words[] = $no_acc; } } } // if the other (Latin) accents should be removed from their vowels if ($vowels) { foreach ($content as $thisword) { $no_acc = remove_acc($thisword, ''); if ($no_acc != $thisword) { $acc_words[] = $no_acc; } } } // now add the words without accents to the total text content $content = array_merge($content, $acc_words); //echo "\r\n\r\n<br>content array0:<br><pre>";print_r($content);echo "</pre>\r\n"; 
// if ligatures should be equalized if ($liga) { $liga_words = array(); // will contain converted ligatures $phon_words = array(); // will contain converted phonetics // first: convert letters into latin ligatures foreach ($content as $thisword) { if ($thisword) { $liga_words[] = html_entity_decode($thisword, ENT_QUOTES, "UTF-8"); $thisword1 = $thisword; reset($latin_ligatures); while ($char = each($latin_ligatures)) { $thisword2 = preg_replace("/" . $char[0] . "/s", $char[1], $thisword1); // convert ligatures if ($thisword1 != $thisword2) { // break on first ligature $liga_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8"); // collect new words with ligatures $thisword1 = $thisword2; // continue with the word, containing the ligatures //break; } } } } // second: convert all letters into phonetic transcriptions reset($liga_words); foreach ($liga_words as $thisword) { $thisword1 = $thisword; reset($phon_trans); while ($char = each($phon_trans)) { $thisword2 = preg_replace("/" . $char[0] . "/s", $char[1], $thisword1); // convert into phonetics if ($thisword1 != $thisword2) { // break on first ligature $phon_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8"); // collect new words with phonetics $thisword1 = $thisword2; // continue with the word, containing the ligatures //break; } } } $liga_words = array_merge($liga_words, $phon_words); // add all phoneticss to the liga array // now vice versa: convert latin ligatures and phonetic transcriptions into standard letters reset($content); $not_liga_words = array(); foreach ($content as $thisword) { if ($thisword) { // first: convert latin ligatures into standard letters $thisword1 = superentities($thisword, ENT_QUOTES, "UTF-8"); reset($latin_ligatures); while ($char = each($latin_ligatures)) { $thisword2 = preg_replace("/" . $char[1] . 
"/s", $char[0], $thisword1); // re-convert ligatures if ($thisword1 != $thisword2) { $not_liga_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8"); // collect new words without ligatures $thisword1 = $thisword2; // continue with the word, containing the ligature } } } //echo "\r\n\r\n<br>not_liga_words Array:<br><pre>";print_r($not_liga_words);echo "</pre>\r\n"; // second: convert phonetic transcriptions into standard letters reset($not_liga_words); $not_phon_words = array(); foreach ($not_liga_words as $thisword) { $thisword1 = superentities($thisword, ENT_QUOTES, "UTF-8"); reset($phon_trans); while ($char = each($phon_trans)) { $thisword2 = preg_replace("/" . $char[1] . "/s", $char[0], $thisword1); // re-convert sphonetic if ($thisword1 != $thisword2) { $not_phon_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8"); // collect new words without phonetics $thisword1 = $thisword2; // continue with the word, containing the phonetic trans. } } } } $not_words = array_merge($not_liga_words, $not_phon_words); // add all together $content = array_merge($liga_words, $not_words); // add all ligatures and re-converted letters to the content array } $wordarray = unique_array($content); } //echo "\r\n\r\n<br>wordarray0:<br><pre>";print_r($wordarray);echo "</pre>\r\n"; if ($smp != 1) { if ($data['nofollow'] != 1 && $cano_link == '0') { $media_links = '0'; $links = array(); if (!$document) { // don't try to find links in PDFs and other pure documents $links = get_links($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir, $url_reloc, $charSet); } if ($links[0]) { $links = distinct_array($links); $all_links = count($links); if ($all_links > $max_links) { $all_links = $max_links; } $links = array_slice($links, 0, $max_links); if ($realnum < $max_links) { $numoflinks = 0; //if there are any new links, add to the temp table, but only if there isn't such url already if ($links[0]) { reset($links); $tmp_urls = 
get_temp_urls($sessid); // reload previous temp // echo "\r\n\r\n<br>tmp_urls array:<br><pre>";print_r($tmp_urls);echo "</pre>\r\n"; if ($debug == '2') { // if debug mode, show details printStandardReport('newLinks', $command_line, $no_log); } while ($thislink = each($links)) { // echo "\r\n\r\n<br>thislink array:<br><pre>";print_r($thislink);echo "</pre>\r\n"; // ignore error (message) links and self linking if (strstr($thislink[1], "//") && $thislink[1] != $url) { // find new domains for _addurl table if ($auto_add && $can_leave_domain) { $all_link = parse_all_url($thislink[1]); // only the domain will be stored as new URL into addurl table $dom_link = $all_link['host']; // reduce to domain name and tld $new_link = str_replace("www.", "", $dom_link); // use the complete URL //$dom_link = $thislink[1]; // use only the domain $dom_link = $all_link['scheme'] . "://" . $dom_link; $banned = ''; mysqltest(); // check whether URL is already known in sites table $sql_query = "SELECT url from " . $mysql_table_prefix . "sites where url like '%{$new_link}%'"; $res1 = $db_con->query($sql_query); // check whether URL is already known in addurl table $sql_query = "SELECT url from " . $mysql_table_prefix . "addurl where url like '%{$new_link}%'"; $res2 = $db_con->query($sql_query); // check whether URL is banned $sql_query = "SELECT domain from " . $mysql_table_prefix . "banned where domain like '%{$new_link}%'"; $res3 = $db_con->query($sql_query); if ($res3->num_rows) { $banned = "1"; } if ($res1->num_rows == 0 && $res2->num_rows == 0 && $res3->num_rows == 0) { // add new domain into _addurl table $sql_query = "INSERT into " . $mysql_table_prefix . 
"addurl (url, description, account) values ('{$dom_link}', '{$comment}', '{$admin_email}')"; $db_con->query($sql_query); } } // check whether thislink is already known as a link ( might happen by means of relocated URLs) $res4 = ''; $res5 = ''; $known_link = ''; $known_temp = ''; $check_link = $thislink[1]; // i don't believe the "like" is necessary here and it slows down indexing // // check without scheme and www. // $check_link = substr($check_link, stripos($check_link, "//")+2); // if (stristr($check_link, "www.")) { // $check_link = substr($check_link, stripos($check_link, "www")+4); // } // // $sql_query = "SELECT /* jfield 3 */ url from ".$mysql_table_prefix."links where url like '%$check_link'"; // $res4 = $db_con->query($sql_query); // // $known_link = $res4->num_rows;; // // $sql_query = "SELECT /* jfield 4 */ link from ".$mysql_table_prefix."temp where link like '%$check_link'"; // $res5 = $db_con->query($sql_query); // if ($debug > 0 && $db_con->errno) { // printf("MySQL failure: %s\n", $db_con->error); // echo "<br />Script aborted."; // exit; // } // $known_temp = $res5->num_rows;; $sql_query = "SELECT /* jfield 3 */ url from " . $mysql_table_prefix . "links where url = '{$check_link}'"; $res4 = $db_con->query($sql_query); $known_link = $res4->num_rows; $sql_query = "SELECT /* jfield 4 */ link from " . $mysql_table_prefix . "temp where link = '{$check_link}'"; $res5 = $db_con->query($sql_query); if ($debug > 0 && $db_con->errno) { printf("MySQL failure: %s\n", $db_con->error); echo "<br />Script aborted."; exit; } $known_temp = $res5->num_rows; // if this is a new link not yet known or banned, add this new link to the temp table if ($tmp_urls[$thislink[1]] != 1 && !$res1 && !$known_link && !$known_temp && !$banned) { $tmp_urls[$thislink[1]] = 1; $numoflinks++; if ($debug == '2') { $act_link = rawurldecode($thislink[1]); // make it readable $act_link = stripslashes($act_link); printNewLinks($act_link, $cl); } mysqltest(); $sql_query = "INSERT into " . 
$mysql_table_prefix . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')"; if ($numoflinks <= $max_links) { $db_con->query($sql_query); } } } } } } } } else { printStandardReport('noFollow', $command_line, $no_log); } unset($file); } // JFIELD at this point, the URL in the DB is good // echo "<h1>DONE</h1>"; // exit; // if we should index only the files as defined in docs list if ($only_docs) { $OKtoIndex = ''; foreach ($docs as $thisdoc) { if (strstr($urlparts['path'], $thisdoc)) { $OKtoIndex = "1"; } } if (!$OKtoIndex) { printStandardReport('noDoclist', $command_line, $no_log); } } if ($OKtoIndex == 1) { if ($link_check == 0) { $title = $data['title']; $host = $data['host']; $path = $data['path']; $fulltxt = $data['fulltext']; $desc = substr($data['description'], 0, 1024); // extract domain $url_parts = parse_all_url($url); $hostname = $url_parts[host]; // rebuild domain for localhost applications if ($hostname == 'localhost') { $host1 = str_replace($local, '', $url); } $pos = strpos($host1, "/"); // on local server delete all behind the / // will work for localhost URLs like http://localhost/publizieren/japan1/index.htm // will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm if ($pos) { $host1 = substr($host1, 0, $pos); // build full adress again, now only local domain } if ($hostname == 'localhost') { $domain_for_db = "" . $local . "" . $host1 . "/"; // complete URL $domain_for_db = str_replace("http://", "", $domain_for_db); //$domain_for_db = $host1; } else { //$domain_for_db = ("$url_parts[scheme]://".$hostname."/"); // complete URL $domain_for_db = $hostname; } if (isset($domain_arr[$domain_for_db])) { $dom_id = $domain_arr[$domain_for_db]; } else { mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . 
"domains (domain) values ('{$domain_for_db}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $dom_id = $db_con->insert_id; $domain_arr[$domain_for_db] = $dom_id; } if (!$js_link) { // JavaScript will not deliver keywords, only links are parsed reset($wordarray); if ($case_sensitive == '0') { foreach ($wordarray as &$value) { $value[1] = lower_ent($value[1]); $value[1] = lower_case($value[1]); // convert keywords to lower case } } $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords'], $url_parts); } else { $wordarray = ''; } //if there are words to index, add the link to the database, get its id, and add the word + their relation if (is_array($wordarray) && count($wordarray) >= $min_words_per_page) { $OKtoSave = 1; if ($use_white1 == '1') { // check if content of page matches ANY word in whitelist $found = '0'; foreach ($whitelist as $key => $val1) { reset($wordarray); while ($thisword = each($wordarray)) { $word = trim($thisword[1][1]); if (strcasecmp($val1, $word) == 0) { $found = '1'; } } } if ($found == '0') { printStandardReport('noWhitelist', $command_line, $no_log); $OKtoSave = 0; $realnum--; } } if ($use_white2 == '1') { // check if content of page matches ALL words in whitelist $all = count($whitelist); $found = '0'; $found_this = '0'; foreach ($whitelist as $key => $val2) { reset($wordarray); while ($thisword = each($wordarray)) { $word = 
trim($thisword[1][1]); if (strcasecmp($val2, $word) == 0) { $found_this = '1'; } } if ($found_this != '0') { $found++; $found_this = '0'; } } if ($found != $all) { printStandardReport('noWhitelist', $command_line, $no_log); $OKtoSave = 0; $realnum--; } } if ($use_black == '1') { $found = '0'; // check if content of page matches ANY string in blacklist foreach ($blacklist as $key => $val3) { $met = stripos($data[fulltext], $val3); if ($met) { $found = '1'; } } if ($found == '1') { printStandardReport('matchBlacklist', $command_line, $no_log); $OKtoSave = 0; $realnum--; $url_status['black'] = 1; return $url_status; } } // if activated in Admin backend, create a thumbnail of this URL if ($OKtoSave && $hostname != 'localhost' && $webshot) { $shot = ''; // will contain the png webshot $img = new webshots(); $shot = $img->url_to_image($url); if ($debug && stristr($shot, "error: #")) { $shot_warn = "<br />Unable to create the webshot because of " . $shot; printWarning($shot_warn, $command_line, $no_log); } else { $shot = $db_con->real_escape_string($shot); } } if ($md5sum == '' || $md5sum == '' && $url_status['relocate']) { // enter here for new page (unknown link) OR for new relocated URL(so it will become a new link) // title, description and fulltxt are already escaped in function clean_file(); $url = $db_con->real_escape_string($url); // jfield says: messy char decoding earlier // leaves crap here that fudges up the works $title_enc = mb_detect_encoding($title); if (mb_detect_encoding($title) != "UTF-8") { $title = iconv($title_enc, "UTF-8", $title); } $fulltxt = substr($fulltxt, 0, 100000); // we've got to stop somewhere $fulltxt_enc = mb_detect_encoding($fulltxt); if (mb_detect_encoding($title) != "UTF-8") { $fulltxt = iconv($fulltxt_enc, "UTF-8", $fulltxt); } mysqltest(); $sql_query = "INSERT into " . $mysql_table_prefix . 
"links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level, webshot) values ('{$site_id}', '{$url}', '{$title}', left('{$desc}', 255), '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', '{$thislevel}', '{$shot}')"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; //exit; // jfield: let's keep going return; } $sql_query = "SELECT link_id from " . $mysql_table_prefix . "links where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $link_id = $row[0]; if ($OKtoSave) { // store link details, if not yet known (during reindex) if ($only_links) { // extract domain of current page delivering the new links $url_parts = parse_all_url($url); $hostname = $url_parts[host]; if ($hostname == 'localhost') { // rebuild domain for localhost applications $host1 = str_replace($local, '', $url); } $pos = strpos($host1, "/"); // on local server delete all behind the / // will work for localhost URLs like http://localhost/publizieren/japan1/index.htm // will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm if ($pos) { $host1 = substr($host1, 0, $pos); // build full adress again, now only local domain } if ($hostname == 'localhost') { $domain_db = "" . $local . "" . $host1 . "/"; // complete URL $domain_db = str_replace("http://", "", $domain_db); //$domain_db = $host1; } else { //$domain_db = ("$url_parts[scheme]://".$hostname."/"); // complete URL $domain_db = $hostname; } // now store all link details into db foreach ($my_links as $found_link) { // but only if we have found a title if ($found_link[3]) { mysqltest(); // check whether URL is already known in sites table $sql_query = "SELECT title from " . $mysql_table_prefix . "link_details where link_id like '{$link_id}' and url like '%{$found_link['2']}%'"; $res1 = $db_con->query($sql_query); if ($res1->num_rows == 0) { // must be new link $sql_query = "INSERT into " . $mysql_table_prefix . 
"link_details (link_id, url, title, indexdate, domain) values ('{$link_id}', '{$found_link['2']}', '{$found_link['3']}', now(), '{$domain_db}')"; $db_con->query($sql_query); } } } } if ($debug == '2') { // if debug mode, show details printStandardReport('newKeywords', $command_line, $no_log); } save_keywords($wordarray, $link_id, $dom_id); } mysqltest(); if ($index_media == '1' && $OKtoSave) { // find media content only if there was no conflict with text (white and/or blacklist) include "index_media.php"; // try to find media files } mysqltest(); if ($debug == '2') { printStandardReport('indexed1', $command_line, $no_log); } else { printStandardReport('indexed', $command_line, $no_log); } } else { if ($md5sum != '' && $md5sum != $newmd5sum && $OKtoSave) { //if page has changed, start updating mysqltest(); $sql_query = "SELECT link_id from " . $mysql_table_prefix . "links where url='{$url}'"; $result = $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } $row = $result->fetch_array(MYSQLI_NUM); $link_id = $row[0]; $sql_query = "DELETE from " . $mysql_table_prefix . "link_keyword where link_id={$link_id}"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . 
__FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } if ($debug == '2') { // if debug mode, show details printStandardReport('newKeywords', $command_line, $no_log); } save_keywords($wordarray, $link_id, $dom_id); $sql_query = "UPDATE " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level='{$thislevel}', webshot='{$shot}' where link_id='{$link_id}'"; mysqltest(); $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } if ($index_media == '1') { include "index_media.php"; // try to find media files } if ($debug == '2') { printStandardReport('re-indexed1', $command_line, $no_log); } } } } else { if ($js_link) { printStandardReport('js_content', $command_line, $no_log); } else { printStandardReport('minWords', $command_line, $no_log); } $realnum--; } } else { printStandardReport('link_okay', $command_line, $no_log); } unset($file, $title, $fulltxt, $desc); $wordarray = array(); $data = array(); $seg_data = array(); } } } } else { $deletable = 1; //printUrlStatus($url_status['state'], $command_line, $no_log); } mysqltest(); if ($url_status['relocate']) { // remove this relocated URL from temp table, because it is indexed now $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. 
<br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } } if ($reindex == 1 && $deletable == 1) { check_for_removal($url); } else { if ($reindex == 1) { } } if (!isset($all_links)) { $all_links = 0; } if (!isset($numoflinks)) { $numoflinks = 0; } // if valid sitemap found, or canonical link, or something else, no LinkReport if ($smp != 1 && $OKtoIndex == 1 && $url_status['state'] == 'ok') { printLinksReport($numoflinks, $all_links, $command_line); } // remove the URL, which haas been idexed now from temp table. mysqltest(); $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'"; $db_con->query($sql_query); if ($debug && $db_con->errno) { $err_row = __LINE__ - 2; printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $db_con->error); if (__FUNCTION__) { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} in function(): " . __FUNCTION__ . " <br /></span></p>"); } else { printf("<p><span class='red'> Found in script: " . __FILE__ . " row: {$err_row} <br /></span></p>"); } printf("<p><span class='red'> Script execution aborted. <br /></span>"); printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>"); echo "<p> {$sql_query} </p>"; exit; } return $url_status; }
/**
 * Echoes the sitemap of the current session's list as nested <ul>/<li> markup.
 *
 * The list is looked up by $_SESSION['lname'] and $_SESSION['uid']; all of its
 * items are then loaded once and the subtree below $parent is printed
 * recursively. Execution is aborted with die() when a query fails or when the
 * list cannot be found.
 *
 * @param mixed  $parent Parent item id (lipid) whose children are rendered.
 *                       Compared loosely, because ids come back from the
 *                       database as strings.
 * @param string $lname  Unused; kept for backward compatibility. The list name
 *                       is always taken from $_SESSION['lname'].
 *
 * @return void Output is written directly with echo.
 */
function get_sitemap( $parent, $lname ) {
    // Escape / cast the session values before they are embedded in SQL.
    $list_name = mysql_real_escape_string( (string) $_SESSION['lname'] );
    $user_id   = (int) $_SESSION['uid'];

    $sql = ' SELECT lid FROM lists'
         . ' WHERE lists.lname = "' . $list_name . '"'
         . ' AND lists.luid = ' . $user_id . ' ';
    $result = mysql_query( $sql ) or die( mysql_error() );

    $lid = 0;
    if ( mysql_num_rows( $result ) > 0 ) {
        $lid = (int) mysql_result( $result, 0 );
    }
    if ( !$lid ) {
        die( '<h2>De sitemap is niet gevonden</h2>' );
    }

    $sql = ' SELECT * FROM listitems'
         . ' WHERE listitems.lilid = ' . $lid
         . ' ORDER BY listitems.liorder ASC ';
    $result = mysql_query( $sql ) or die( mysql_error() );

    // Load every item of the list once; the recursion below works on this
    // array instead of querying the database again for each node.
    $sitemap_arr = array();
    while ( $row = mysql_fetch_assoc( $result ) ) {
        $sitemap_arr[ $row['liid'] ] = array(
            'name'   => $row['livalue'],
            'parent' => $row['lipid'],
            'liid'   => $row['liid'],
            'info'   => $row['linote'],
        );
    }

    _get_sitemap_render_children( $sitemap_arr, $parent );
}

/**
 * Echoes one <ul> level containing every item of $items whose 'parent' equals
 * $parent, descending into each item's own children. Prints nothing when
 * $parent has no children.
 *
 * @param array $items  Items keyed by liid, each with 'name', 'parent',
 *                      'liid' and 'info' entries.
 * @param mixed $parent Parent item id to match (loose comparison).
 *
 * @return void
 */
function _get_sitemap_render_children( array $items, $parent ) {
    $has_children = false;

    foreach ( $items as $item ) {
        if ( $item['parent'] != $parent ) {
            continue;
        }

        if ( $has_children === false ) {
            // Open the list lazily so that leaf nodes get no empty <ul>.
            $has_children = true;
            echo '<ul>';
        }

        // Values come from user-editable database rows: escape them for HTML
        // text and attribute context before printing.
        $liid      = htmlspecialchars( (string) $item['liid'], ENT_QUOTES );
        $name      = htmlspecialchars( (string) $item['name'], ENT_QUOTES );
        $parent_id = htmlspecialchars( (string) $item['parent'], ENT_QUOTES );
        $info      = htmlspecialchars( (string) $item['info'], ENT_QUOTES );

        echo ' <li id="item_' . $liid . '">'
           . ' <div class="page_container">'
           . ' <div class="page">'
           . ' <img class="page_icon" src="' . PROJECT_URL . 'inc/img/page.png" alt="Page icon" />'
           . ' <p class="add_notes"><a href="/sitemap2/inc/php/add_notes.php?liid=' . $liid . '">INFO</a></p>'
           . ' <p class="add_page">Add</p>'
           . ' <p class="delete_page" id="' . $name . '" title="' . $parent_id . '">Delete</p>'
           . ' <p class="page_name">' . $name . '</p>'
           . ' <p class="info">' . $info . '</p>'
           . ' </div>'
           . ' </div> ';

        _get_sitemap_render_children( $items, $item['liid'] );

        echo '</li>';
    }

    if ( $has_children === true ) {
        echo '</ul>';
    }
}
/**
 * Determines the link for a section.
 *
 * The section defaults to the current one (global $_section) unless a
 * specific section is passed in. Its sub item is taken from $section_sub or,
 * when that is empty, looked up in the sitemap under the section's key.
 *
 * - Sub item is an array: the link is the section's slug name plus '.php'
 *   when has_index_pages() is true, otherwise the result of
 *   get_first_unnested_item() for that array.
 * - Sub item is a string: that string is the link.
 * - Anything else (including a section missing from the sitemap): ''.
 *
 * @param string       $section     Section to resolve; '' presumably selects
 *                                  the current section via use_default()
 *                                  (defined elsewhere, confirm its rules).
 * @param array|string $section_sub Sub item(s) of the section; looked up in
 *                                  the sitemap when empty.
 *
 * @return string The link, or '' when none can be determined.
 */
function get_section_link($section = '', $section_sub = '')
{
    global $_section;

    # Use the current section unless a specific section is given as a parameter
    $section = use_default($section, $_section);

    # If $section_sub is undefined, get it from the sitemap
    if (!$section_sub) {
        $sitemap = get_sitemap();
        # Guard the lookup: an unknown section must not raise an
        # undefined-index notice; it simply yields no link.
        $section_sub = (is_array($sitemap) && isset($sitemap[$section]))
            ? $sitemap[$section]
            : null;
    }

    if (is_string($section_sub)) {
        return $section_sub;
    }

    if (!is_array($section_sub)) {
        return '';
    }

    if (has_index_pages()) {
        return slug_name($section) . '.php';
    }

    # Link to the first sub item that isn't a nested array
    return '' . get_first_unnested_item($section_sub);
}
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) { global $tmp_urls, $delay_time, $domain_arr, $charSet, $url_status, $whitelist, $blacklist, $supdomain, $smp, $realnum, $dup_url, $entities, $command_line; if (DEBUG == '0') { error_reporting(0); } else { error_reporting(E_ERROR); // otherwise a non existing siemap.xml would always cause a warning message } $needsReindex = 1; $deletable = 0; $url_status = url_status($url); $thislevel = $level - 1; if ($smp != 1 && Configure::read('follow_sitemap') == 1) { // enter here if we don't already know a valid sitemap and if admin settings allowed us to do so $tmp_urls = get_temp_urls($sessid); // reload previous temp $url2 = remove_sessid(convert_url($url)); // get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder // Configure::read('local') = "http://localhost/publizieren/"; // your base adress for your local server $sitemap_name = "sitemap.xml"; // could be individualized $host = parse_url($url2); $hostname = $host[host]; if ($hostname == 'localhost') { $host1 = str_replace(Configure::read('local'), '', $url2); } $pos = strpos($host1, "/"); // on local server delete all behind the / if ($pos) { $host1 = substr($host1, 0, $pos); } // build full adress again, now only until host if ($hostname == 'localhost') { $url2 = Configure::read('local') . $host1; } else { $url2 = "{$host['scheme']}://{$hostname}"; } $input_file = "{$url2}/{$sitemap_name}"; // create path to sitemap if ($handle = fopen($input_file, "r")) { // happy times, we found a new sitemap $links = get_sitemap($input_file, TABLE_PREFIX); // now extract links from sitemap.xml if ($links != '') { // if links were extracted from sitemap.xml reset($links); while ($thislink = each($links)) { // check if we already know this link as a site url $result = mysql_query("select url from " . TABLE_PREFIX . 
"sites where url like '{$thislink['1']}%'"); if (DEBUG > '0') { echo mysql_error(); } $rows = mysql_num_rows($result); if ($rows == '0') { // for all new links: save in temp table mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')"); if (DEBUG > '0') { echo mysql_error(); } } } clean_resource($result); $smp = '1'; // there was a valid sitemap and we stored the new links } unset($links, $input_file); fclose($handle); } } if (strstr($url_status['state'], "Relocation")) { $url = eregi_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain)); if ($url != '') { $result = mysql_query("select link from " . TABLE_PREFIX . "temp where link='{$url}' && id = '{$sessid}'"); if (DEBUG > '0') { echo mysql_error(); } $rows = mysql_num_rows($result); if ($rows == 0) { mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')"); if (DEBUG > '0') { echo mysql_error(); } } clean_resource($result); } $url_status['state'] == "redirected"; } ini_set("user_agent", Configure::read('user_agent')); if ($url_status['state'] == 'ok') { $OKtoIndex = 1; $file_read_error = 0; if (time() - $delay_time < Configure::read('min_delay')) { sleep(Configure::read('min_delay') - (time() - $delay_time)); } $delay_time = time(); if (!fst_lt_snd(phpversion(), "4.3.0")) { $file = file_get_contents($url); if ($file === FALSE) { $file_read_error = 1; } } else { $fl = @fopen($url, "r"); if ($fl) { while ($buffer = @fgets($fl, 4096)) { $file .= $buffer; } unset($buffer); } else { $file_read_error = 1; } fclose($fl); } if ($file_read_error || Configure::read('utf8') == 1) { unset($file); $contents = getFileContents($url); // parse_url to get charset $file = $contents['file']; } $pageSize = number_format(strlen($file) / 1024, 2, ".", ""); printPageSizeReport($pageSize); if ($url_status['content'] != 'text') { $file = extract_text($file, $url_status['content']); //for DOCs, 
PDFs etc we need special converter if ($file == 'ERROR') { // if error, suppress further indexing $OKtoIndex = 0; $file_read_error = 1; } } if (Configure::read('utf8') == 1) { // enter here if file should be translated into utf-8 $charSet = $contents['charset']; if ($charSet == '') { // if we did not find any charset, we will use our own $charSet = Configure::read('home_charset'); } $charSet = strtoupper(trim($charSet)); if (strpos($charSet, '8859')) { $conv_file = html_entity_decode($file); } else { $conv_file = $file; // pure code } if ($charSet != "UTF-8") { // enter here only, if site / file is not jet UTF-8 coded $iconv_file = iconv($charSet, "UTF-8", $conv_file); // if installed, first try to use PHP function iconv if (trim($iconv_file) == "") { // iconv is not installed or input charSet not available. We need to use class ConvertCharset $charSet = str_ireplace('iso-', '', $charSet); $charSet = str_ireplace('iso', '', $charSet); $NewEncoding = new ConvertCharset($charSet, "utf-8"); $NewFileOutput = $NewEncoding->Convert($conv_file); $file = $NewFileOutput; } else { $file = $iconv_file; } unset($conv_file, $iconv_file, $NewEncoding, $NewFileOutput); } } $data = clean_file($file, $url, $url_status['content']); $newmd5sum = md5($data['content']); if ($md5sum == $newmd5sum) { printStandardReport('md5notChanged', $command_line); $OKtoIndex = 0; $realnum--; } else { if (Configure::read('use_white') == '1') { $found = '0'; // check if content of page matches any word in whitelist foreach ($whitelist as $key => $value) { $met = stripos($file, $value); if ($met) { $found = '1'; } } if ($found == '0') { printStandardReport('noWhitelist', $command_line); $OKtoIndex = 0; $realnum--; } } if (Configure::read('use_black') == '1') { $found = '0'; // check if content of page matches any word in blacklist foreach ($blacklist as $key => $value) { $met = stripos($file, $value); if ($met) { $found = '1'; } } if ($found == '1') { printStandardReport('matchBlacklist', 
$command_line); $OKtoIndex = 0; $realnum--; } } // check for duplicate page content $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where md5sum='{$newmd5sum}'"); if (DEBUG > '0') { echo mysql_error(); } if (mysql_num_rows($result) > 0) { // display warning message and urls with duplicate content printStandardReport('duplicate', $command_line); $num_rows = mysql_num_rows($result); for ($i = 0; $i < $num_rows; $i++) { $link_id = mysql_result($result, $i, "link_id"); $num = $i + 1; $res = mysql_query("select url from " . TABLE_PREFIX . "links where link_id like '{$link_id}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($res); $dup_url = $row[0]; clean_resource($res); printDupReport($dup_url, $command_line); } if (Configure::read('dup_content') == '0') { // enter here, if pages with duplicate content should not be indexed/re-indexed $OKtoIndex = 0; $realnum--; } else { $OKtoIndex = 1; } } } if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) { $urlparts = parse_url($url); $newdomain = $urlparts['host']; $type = 0; if ($data['noindex'] == 1) { $OKtoIndex = 0; $deletable = 1; printStandardReport('metaNoindex', $command_line); } if (Configure::read('use_white') == '1') { $found = '0'; // check if content of page matches any word in whitelist foreach ($whitelist as $key => $value) { $met = stripos($data[fulltext], $value); if ($met) { $found = '1'; } } if ($found == '0') { printStandardReport('noWhitelist', $command_line); $OKtoIndex = 0; $realnum--; } } if (Configure::read('use_black') == '1') { $found = '0'; // check if content of page matches any word in blacklist foreach ($blacklist as $key => $value) { $met = stripos($data[fulltext], $value); if ($met) { $found = '1'; } } if ($found == '1') { printStandardReport('matchBlacklist', $command_line); $OKtoIndex = 0; $realnum--; } } $wordarray = unique_array(explode(" ", $data['content'])); if ($smp != 1) { if ($data['nofollow'] != 1) { $links = get_links($file, $url, 
$can_leave_domain, $data['base']); $links = distinct_array($links); $all_links = count($links); if ($all_links > Configure::read('max_links')) { $all_links = Configure::read('max_links'); } $links = array_slice($links, 0, Configure::read('max_links')); if ($realnum < Configure::read('max_links')) { $numoflinks = 0; //if there are any, add to the temp table, but only if there isnt such url already if (is_array($links)) { reset($links); if (DEBUG == '2') { // if debug mode, show details printStandardReport('newLinks', $command_line); } while ($thislink = each($links)) { if ($tmp_urls[$thislink[1]] != 1) { $tmp_urls[$thislink[1]] = 1; $numoflinks++; if (DEBUG == '2') { $act_link = $thislink[1]; printNewLinks($act_link); } if ($numoflinks <= Configure::read('max_links')) { mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')"); } if (DEBUG > '0') { echo mysql_error(); } } } } } } else { printStandardReport('noFollow', $command_line); } unset($file); } if ($OKtoIndex == 1) { if (Configure::read('link_check') == 0) { $title = $data['title']; $host = $data['host']; $path = $data['path']; $fulltxt = $data['fulltext']; $desc = substr($data['description'], 0, 254); $url_parts = parse_url($url); $domain_for_db = $url_parts['host']; if (isset($domain_arr[$domain_for_db])) { $dom_id = $domain_arr[$domain_for_db]; } else { mysql_query("insert into " . TABLE_PREFIX . "domains (domain) values ('{$domain_for_db}')"); $dom_id = mysql_insert_id(); $domain_arr[$domain_for_db] = $dom_id; } $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords'], $url_parts); //if there are words to index, add the link to the database, get its id, and add the word + their relation if (is_array($wordarray) && count($wordarray) > Configure::read('min_words_per_page')) { if ($md5sum == '') { mysql_query("insert into " . TABLE_PREFIX . 
"links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})"); if (DEBUG > '0') { echo mysql_error(); } $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($result); $link_id = $row[0]; clean_resource($result); if (DEBUG == '2') { // if debug mode, show details printStandardReport('newKeywords', $command_line); } save_keywords($wordarray, $link_id, $dom_id); if (DEBUG == '2') { printStandardReport('indexed1', $command_line); } else { printStandardReport('indexed', $command_line); } } else { if ($md5sum != '' && $md5sum != $newmd5sum) { //if page has changed, start updating $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'"); if (DEBUG > '0') { echo mysql_error(); } $row = mysql_fetch_row($result); $link_id = $row[0]; for ($i = 0; $i <= 15; $i++) { $char = dechex($i); mysql_query("delete from " . TABLE_PREFIX . "link_keyword{$char} where link_id={$link_id}"); if (DEBUG > '0') { echo mysql_error(); } } clean_resource($result); if (DEBUG == '2') { // if debug mode, show details printStandardReport('newKeywords', $command_line); } save_keywords($wordarray, $link_id, $dom_id); $query = "update " . TABLE_PREFIX . 
"links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}"; mysql_query($query); if (DEBUG > '0') { echo mysql_error(); } if (DEBUG == '2') { printStandardReport('re-indexed1', $command_line); } else { printStandardReport('re-indexed', $command_line); } } } } else { printStandardReport('minWords', $command_line); $realnum--; } } else { printStandardReport('link_okay', $command_line); } unset($wordarray, $title, $fulltxt, $desc); } } } else { $deletable = 1; printUrlStatus($url_status['state'], $command_line); } if ($reindex == 1 && $deletable == 1) { check_for_removal($url); } else { if ($reindex == 1) { } } if (!isset($all_links)) { $all_links = 0; } if (!isset($numoflinks)) { $numoflinks = 0; } if ($smp != 1) { // if valid sitemap found, no LinkReport printLinksReport($numoflinks, $all_links, $command_line); } }