Пример #1
0
/**
 * Fetch a page, extract its links and register every link that is not yet
 * present in the ve123_links table, printing progress as HTML.
 *
 * @param string     $url           Page to download and scan for links.
 * @param bool       $is_index_page When true, every extracted link is first
 *                                  passed through GetSiteUrl(); otherwise the
 *                                  links are used as returned by get_links().
 * @param int|string $num           Optional limit. When non-empty, the whole
 *                                  script is terminated (exit) as soon as the
 *                                  internal counter - which starts at 1 and is
 *                                  incremented after each successful add -
 *                                  reaches this value. '' disables the limit.
 * @return void
 */
function add_links($url, $is_index_page = true, $num = '')
{
    global $db;
    $new_links = array();
    // Starts at 1, so with a limit of N the script stops after N - 1 additions.
    $j = 1;
    $url_htmlcode = get_url_content($url);
    $url_htmlcode = get_encoding($url_htmlcode, "GB2312");
    $links = get_links($url_htmlcode, $url, 1, $url);
    echo "<br><b>url=";
    print_r($url);
    echo "<br></b>";
    if ($is_index_page) {
        foreach ($links as $value) {
            $new_links[] = GetSiteUrl($value);
        }
    } else {
        $new_links = $links;
    }
    $new_links = distinct_array($new_links);
    foreach ($new_links as $value) {
        // NOTE(review): $value comes from a remote page and is interpolated
        // into SQL and HTML unescaped. The escaping API of $db is not visible
        // here - confirm and switch to an escaped/parameterised query.
        $query = $db->query("select * from ve123_links where url='{$value}'");
        // Keep the row count in its own variable. It used to be written into
        // $num, which overwrote the caller's limit and made the limit check
        // below unreachable.
        $existing = $db->num_rows($query);
        if ($existing == 0) {
            echo "<font color=#C60A00><b>抓取到:</b></font>" . $value . "<br>";
            if (!add_update_link($value, "", "", "add")) {
                continue;
            }
            $j++;
            if (!empty($num) && $j >= $num) {
                exit;
            }
        } else {
            echo "<b>已存在了:</b>";
            echo "<a href=" . $value . " target=_blank>" . $value . "</a>";
            echo "<br>";
        }
        // ob_flush() raises a notice when no output buffer is active.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }
}
Пример #2
0
/**
 * Fetch one URL, queue the links found on it and (re)index its content.
 *
 * Flow, as visible in this function:
 *  1. url_status() is consulted; a "Relocation" state queues the purified
 *     target URL in the temp table and marks the status as "redirected".
 *  2. For state 'ok' the page is downloaded (honouring $min_delay between
 *     requests), converted to text if needed and hashed with md5.
 *  3. Unless the hash is unchanged or a duplicate, links are queued in the
 *     temp table and the page is inserted into / updated in the links table
 *     together with its keywords.
 *  4. Any other state marks the URL as deletable; on re-index it is handed
 *     to check_for_removal().
 *
 * NOTE(review): all SQL is built by string interpolation. Whether
 * url_purify()/clean_file() already escape their output is not visible here;
 * confirm before adding escaping to avoid double-escaping.
 *
 * @param string $url              URL to index.
 * @param int    $level            Stored with queued links; the page itself is
 *                                 stored with $level - 1.
 * @param mixed  $site_id          Written to links.site_id.
 * @param string $md5sum           Previously stored md5 of the page; '' means
 *                                 the page is inserted as a new row.
 * @param mixed  $domain           Not used by the active code of this function.
 * @param mixed  $indexdate        Not used by the active code of this function.
 * @param mixed  $sessid           Written to / matched against temp.id.
 * @param mixed  $can_leave_domain Forwarded to url_purify() and get_links().
 * @param int    $reindex          1 forces processing even when the md5 values
 *                                 compare equal and enables check_for_removal().
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $entities, $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $supdomain;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;
    $deletable = 0;
    // Defaults for the final report; overwritten only when links are extracted.
    $all_links = 0;
    $numoflinks = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;
    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = mysql_query("select link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'");
            echo mysql_error();
            $rows = mysql_num_rows($result);
            if ($rows == 0) {
                mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                echo mysql_error();
            }
        }
        // Was "==", a comparison whose result was discarded; assignment was intended.
        $url_status['state'] = "redirected";
    }
    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        $file = '';
        // Read the clock once so the sleep duration cannot become negative
        // when the second ticks over between two time() calls.
        $elapsed = time() - $delay_time;
        if ($elapsed < $min_delay) {
            sleep($min_delay - $elapsed);
        }
        $delay_time = time();
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false explicitly: a chunk equal to "0" is
                // falsy and would otherwise end the read early.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            // Fall back to the project's own fetch routine.
            $contents = getFileContents($url);
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);
        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
        } elseif (isDuplicateMD5($newmd5sum)) {
            $OKtoIndex = 0;
            printStandardReport('duplicate', $command_line);
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($data['nofollow'] != 1) {
                // Get all links from the file.
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                // If there are any, add them to the temp table, but only
                // links not already seen in this run ($tmp_urls).
                if (is_array($links)) {
                    $all_links = count($links);
                    foreach ($links as $thislink) {
                        if (!isset($tmp_urls[$thislink]) || $tmp_urls[$thislink] != 1) {
                            $tmp_urls[$thislink] = 1;
                            $numoflinks++;
                            mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$thislink}', '{$level}', '{$sessid}')");
                            echo mysql_error();
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }
            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = $data['fulltext'];
                $desc = substr($data['description'], 0, 254);
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                // Resolve the domain id from the in-memory cache, creating
                // the domain row on first sight.
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    mysql_query("insert into " . $mysql_table_prefix . "domains (domain) values ('{$domain_for_db}')");
                    $dom_id = mysql_insert_id();
                    $domain_arr[$domain_for_db] = $dom_id;
                }
                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                // If there are enough words to index, store the link, get its
                // id, and store the words and their relation to the link.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    if ($md5sum == '') {
                        mysql_query("insert into " . $mysql_table_prefix . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                        echo mysql_error();
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        save_keywords($wordarray, $link_id, $dom_id);
                        printStandardReport('indexed', $command_line);
                    } elseif ($md5sum != $newmd5sum) {
                        // The page has changed: drop its old keyword relations
                        // and store the new data.
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        // Keyword relations live in 16 tables suffixed 0..f.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            mysql_query("delete from " . $mysql_table_prefix . "link_keyword{$char} where link_id={$link_id}");
                            echo mysql_error();
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $query = "update " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                        mysql_query($query);
                        echo mysql_error();
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}
Пример #3
0
 /**
  * Build the de-duplicated list of site URLs for this object's links.
  *
  * Each entry returned by $this->links() is mapped through GetSiteUrl();
  * the collected values are then passed to distinct_array().
  *
  * @return mixed The result of distinct_array() for the mapped list.
  */
 function sites()
 {
     $siteUrls = array();
     foreach ($this->links() as $link) {
         $siteUrls[] = GetSiteUrl($link);
     }
     return distinct_array($siteUrls);
 }
Пример #4
0
/**
 * Fetch a page, extract its links and register every link that is not yet
 * present in the ve123_links table, echoing each candidate link.
 *
 * @param string     $url           Page to download and scan for links.
 * @param bool       $is_index_page When true, every extracted link is first
 *                                  passed through GetSiteUrl(); otherwise the
 *                                  links are used as returned by get_links().
 * @param int|string $num           Optional limit. When non-empty, the whole
 *                                  script is terminated (exit) as soon as the
 *                                  internal counter - which starts at 1 and is
 *                                  incremented after each successful add -
 *                                  reaches this value. '' disables the limit.
 * @return void
 */
function add_links($url, $is_index_page = true, $num = '')
{
    global $db;
    $new_links = array();
    // Starts at 1, so with a limit of N the script stops after N - 1 additions.
    $j = 1;
    $url_htmlcode = get_url_content($url);
    $url_htmlcode = get_encoding($url_htmlcode, "GB2312");
    $links = get_links($url_htmlcode, $url, 1, $url);
    if ($is_index_page) {
        foreach ($links as $value) {
            $new_links[] = GetSiteUrl($value);
        }
    } else {
        $new_links = $links;
    }
    $new_links = distinct_array($new_links);
    foreach ($new_links as $value) {
        echo $value . "<br>";
        flush();
        // NOTE(review): $value comes from a remote page and is interpolated
        // into SQL and HTML unescaped. The escaping API of $db is not visible
        // here - confirm and switch to an escaped/parameterised query.
        $query = $db->query("select * from ve123_links where url='{$value}'");
        // Keep the row count in its own variable. It used to be written into
        // $num, which overwrote the caller's limit and made the limit check
        // below unreachable.
        $existing = $db->num_rows($query);
        if ($existing != 0) {
            // Already stored - nothing to add.
            continue;
        }
        if (!add_update_link($value, "", "", "add")) {
            continue;
        }
        $j++;
        if (!empty($num) && $j >= $num) {
            exit;
        }
    }
}
Пример #5
0
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref, $url_inc, $url_not_inc, $num)
{
    global $db_con, $entities, $min_delay, $link_check, $command_line, $min_words_per_page, $dup_content, $dup_url, $quotes, $plus_nr, $use_prefcharset;
    global $min_words_per_page, $supdomain, $smp, $follow_sitemap, $max_links, $realnum, $local, $tmp_dir, $auto_add, $admin_email, $idna, $conv_puny;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr, $home_charset, $charSet, $url_status, $redir_count;
    global $debug, $common, $use_white1, $use_white2, $use_black, $whitelist, $blacklist, $clear, $abslinks, $utf8_verify, $webshot;
    global $index_media, $index_image, $suppress_suffix, $imagelist, $min_image_x, $min_image_y, $dup_media, $index_alt, $no_log, $index_rss;
    global $index_audio, $audiolist, $index_video, $videolist, $index_embeded, $rss_template, $index_csv, $delim, $ext, $index_id3, $dba_act;
    global $converter_dir, $dict_dir, $cn_seg, $jp_seg, $index_framesets, $index_iframes, $cdata, $dc, $preferred, $index_rar, $index_zip, $curl;
    global $docs, $only_docs, $only_links, $case_sensitive, $vowels, $noacc_el, $include_dir, $thumb_folder, $js_reloc, $server_char;
    global $latin_ligatures, $phon_trans, $liga;
    //  Currently (2013.01.11)  the variable $use_prefcharset as defined in Admin Settings 'Obligatory use preferred charset' is used.
    //  and not the variable $use_pref as defined in Admin Settings as a varaiable used for addsite() in .../admin/admin.php
    error_reporting(E_ALL & ~E_DEPRECATED & ~E_WARNING & ~E_NOTICE & ~E_STRICT);
    $data = array();
    $cn_data = array();
    $url_parts = array();
    $url_status = array();
    $url_status['black'] = '';
    $contents = array();
    $links = array();
    $wordarray = array();
    $topic = '';
    $url_reloc = '';
    $js_link = '';
    $document = '';
    $file = '';
    $file0 = '';
    $raw_file = '';
    $seg_data = '';
    $index_url = $url;
    $comment = $db_con->real_escape_string("Automatically added during index procedure, as this domain is not yet available in 'Sites' menu.");
    $admin_email = $db_con->real_escape_string($admin_email);
    if ($debug == '0') {
        if (function_exists("ini_set")) {
            ini_set("display_errors", "0");
        }
        error_reporting(0);
    } else {
        error_reporting(E_ERROR);
        //  otherwise  a non existing siemap.xml  would always cause a warning message
    }
    $needsReindex = 1;
    $deletable = 0;
    $nohost = 1;
    $i = 0;
    $nohost_count = 5;
    //  defines count of attempts to get in contact with the server
    //  check URL status
    while ($i < $nohost_count && $nohost) {
        $url_status = url_status($url, $site_id, $sessid);
        if (!stristr($url_status['state'], "NOHOST")) {
            $nohost = '';
            //  reset for successfull attempt
        }
        $i++;
    }
    //  check for emergency exit
    if ($url_status['aborted'] == '1' || stristr($url_status['state'], "NOHOST")) {
        return $url_status;
    }
    //  check for UFO file or invalid suffix
    if (stristr($url_status['state'], "ufo")) {
        return $url_status;
    }
    // JFIELD here is right before we try to retrieve the URL and get the error
    // echo "<h3>F****E: $url</h3>\n";
    //  check for 'unreachable' links and if it is a known URL, delete all keyword relationships, former indexed from the meanwhile unreachable link
    if (stristr($url_status['state'], "unreachable")) {
        printStandardReport('unreachable', $command_line, $no_log);
        $sql_query = "SELECT link_id from " . $mysql_table_prefix . "links where url='{$url}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $row = $result->fetch_array(MYSQLI_NUM);
        $link_id = $row[0];
        if ($link_id) {
            $sql_query = "DELETE from " . $mysql_table_prefix . "link_keyword where link_id={$link_id}";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            //  here we should delete the keywords associated only to the unreachable link
            //  but this takes too much time during index procedure
            //  the admin is asked toc do it manually by using the regarding option in 'Clean' menue
            //
            //  delete the meanwhile unreachable link from db
            $sql_query = "DELETE from " . $mysql_table_prefix . "links where link_id = {$link_id}";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
        }
        return $url_status;
    }
    //  check for overwritten URL, forced by the header, sending content PLUS any redirected URL
    if ($url_status['url_over'] && !$url_status['relocate']) {
        $url = $url_status['url_over'];
    }
    $url_parts = parse_all_url($url);
    $thislevel = $level - 1;
    //  redirected URL ?
    if ($url_status['relocate']) {
        //  if relocated,  print message, verify the new URL, and redirect to new URL
        //  check for redirection on an already indexed link
        $known_link = '';
        $sql_query = "SELECT * from " . $mysql_table_prefix . "links where url='{$url}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $known_link = $result->num_rows;
        if ($known_link) {
            $urlo_status['state'] = "URL was redirected to an already indexed page.<br />In order to prevent infinite indexation, this is not supported by Sphider-plus.<br />Indexation aborted for this URL";
            $url_status['aborted'] = 1;
            return $url_status;
        }
        //  remove the original URL from temp table. The relocated URL will be added later on.
        mysqltest();
        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'";
        $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $new_url = $url_status['path'];
        //  URL of first redirection
        //  remove the redirected URL, which eventually is  already stored in db
        //  before finally storing in db, we need to check for correct redirection.
        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$new_url}' AND id = '{$sessid}'";
        $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        //  now special processing for relative links
        if (!strpos(substr($new_url, 0, 5), "ttp")) {
            $new_url = make_abs($new_url, $index_url);
        }
        if ($url == $new_url && $url_status['file']) {
            $url_status['relocate'] = '';
            //  remove this redirection, as it is 'in it selves'
            $url_status['state'] = "ok";
            //  try to index the conteent
        }
        $care_excl = '1';
        //  care file suffixed to be excluded
        $relocated = '1';
        //  URL is relocated
        if ($debug) {
            printRedirected($url_status['relocate'], $url_status['path'], $cl);
        }
        $count = "1";
        while ($count <= $redir_count && $url_status['relocate'] && !$url_status['aborted']) {
            //  check this redirection
            $url_status = url_status($new_url, $site_id, $sessid);
            if ($url_status['path']) {
                $new_url = $url_status['path'];
                //  URL of another redirections
                //  now special processing for relative links
                if (!strpos(substr($new_url, 0, 5), "ttp")) {
                    $new_url = make_abs($new_url, $index_url);
                }
            }
            if ($debug) {
                printRedirected($url_status['relocate'], $url_status['path'], $cl);
            }
            $count++;
        }
        if ($url_status['relocate']) {
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted because of too many redirections.<br />";
            return $url_status;
        }
        if ($url_status['state'] != "ok") {
            $code = $url_status['state'];
            //  check for most common client errors
            if (!preg_match("/401|402|403|404/", $code)) {
                $url_status['aborted'] = 1;
                //  end indexing for cmplete site
            } else {
                $url_status['aborted'] = '';
                //  abort only for this page
            }
            if (strstr($code, "401")) {
                $code = "401 (Authentication required)";
            }
            if (strstr($code, "403")) {
                $code = "403 (Forbidden)";
            }
            if (strstr($code, "404")) {
                $code = "404 (Not found)";
            }
            $url_status['state'] = "<br />Indexation aborted because of code: {$code}.<br />";
        }
        //  check final URL (which might be the 3. redirection)
        //  and puriify final redirected URL
        $url = $db_con->real_escape_string(url_purify($new_url, $index_url, $can_leave_domain, $care_excl, $relocated, $local_redir));
        // valid file suffix for the redirection??
        if ($url) {
            if ($care_excl == '1') {
                //  care about non-accepted suffixes
                reset($ext);
                while (list($id, $excl) = each($ext)) {
                    if (preg_match("/\\.{$excl}(\$|\\?)/i", $url)) {
                        //  if suffix is at the end of the link, or followd by a question mark
                        $url_status['state'] = 'Found: Not supported suffix';
                        //  error message
                        return $url_status;
                    }
                }
            }
        }
        if (!$url) {
            $link_parts = parse_all_url($url);
            $host = $link_parts['host'];
            $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$index_url}' AND id = '{$sessid}' OR relo_link like '{$url}'";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted because of undefined redirection error.<br />";
            return $url_status;
        }
        //  abort indexation, if the redirected URL is equal to calling URL
        if ($url == 'self') {
            $link_parts = parse_all_url($url);
            $host = $link_parts['host'];
            $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted for this page, because the redirection was a link in it selves.<br />Blocked by Sphider-plus, because this could end in an infinite indexation loop.<br />";
            return $url_status;
        }
        //  abort indexation, if the redirected URL contains invalid file suffix
        if ($url == 'excl') {
            $link_parts = parse_all_url($url);
            $host = $link_parts['host'];
            $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted because the redirected link does not meet the URL suffix conditions.<br />";
            return $url_status;
        }
        //  abort indexation, because purifing the redirected URL failed
        if (!strstr($url, "//")) {
            $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted because {$url} is not supported.<br />";
            return $url_status;
        }
        //  abort indexation, if redirected URL met 'must/must not include' string rule
        if (!check_include($url, $url_inc, $url_not_inc)) {
            $link_parts = parse_all_url($url);
            $host = $link_parts['host'];
            $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link like '{$url}' AND id = '{$sessid}' OR relo_link like '{$url}'";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $url_status['aborted'] = 1;
            $url_status['state'] = "<br />Indexation aborted because the redirected link does not meet<br />the URL 'must include' or 'must not include' conditions.<br />";
            return $url_status;
        }
        //  if redirected URL is already known and in database: abort
        $rows0 = '';
        $rows1 = '';
        mysqltest();
        $sql_query = "SELECT url from " . $mysql_table_prefix . "sites where url like '{$url}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $rows0 = $result->num_rows;
        $sql_query = "SELECT * from " . $mysql_table_prefix . "links where url='{$url}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $known_link = $result->fetch_array(MYSQLI_NUM);
        $md5 = $known_link[8];
        if ($clear == 1) {
            clean_resource($result, '02');
        }
        if ($rows0) {
            $url_status['state'] = "<br />URL already in database (as a site URL). Index aborted.<br />";
            $url_status['aborted'] = 1;
            return $url_status;
        }
        // if known link, which is already indexed (because containing the md5 checksum), enter here
        if ($known_link[8]) {
            $count = $known_link[15];
            $count++;
            if ($count > $redir_count) {
                //  abort indexation
                $url_status['state'] = "<br />{$count}. attempt to redirect in the same (already indexed) URL, <br />which is no longer accepted by Sphider-plus. Indexation aborted for this site.<br />";
                $url_status['aborted'] = 1;
                return $url_status;
            } else {
                $sql_query = "UPDATE " . $mysql_table_prefix . "links set relo_count='{$count}' where url='{$url}'";
                $db_con->query($sql_query);
            }
        }
        //  add redirected URL to temp table, if not yet known
        $sql_query = "SELECT link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $rows = $result->num_rows;
        if ($rows == 0) {
            $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id, relo_count) values ('{$url}', '{$level}', '{$sessid}', '1')";
            $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
        }
        if ($clear == 1) {
            clean_resource($result, '02');
        }
        //  at the end of redirect, rebuild the url parts from the redirected URL.
        //  This is the final URL, which will be indexed
        $url_parts = parse_all_url($url);
    }
    //  end check any redirection/relocation
    //  if a JavaScript file is currently indexed?
    $suffix = substr($url, strrpos($url, ".") + 1);
    $suffix = str_replace("/", "", $suffix);
    if (strlen($suffix) < "5") {
        if (preg_match("/js\$/", $suffix)) {
            $js_link = 1;
            //  activate JS switch
        }
    }
    if ($smp != 1 && $follow_sitemap == 1) {
        //  enter here if we don't already know a valid sitemap and if admin settings allowed us to do so
        $tmp_urls = get_temp_urls($sessid);
        //  reload previous temp
        $url2 = remove_sessid(convert_url($url));
        // get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder
        $host = parse_addr($url2);
        $hostname = $host[host];
        $more_sitemaps = array();
        if ($hostname == 'localhost') {
            $host1 = str_replace($local, '', $url2);
        }
        $pos = strpos($host1, "/");
        //      on local server delete all behind the /
        if ($pos) {
            $host1 = substr($host1, 0, $pos);
        }
        //      build the full address again, now only the host
        if ($hostname == 'localhost') {
            $url2 = "" . $local . "" . $host1 . "";
        } else {
            $url2 = "{$host['scheme']}://{$hostname}";
        }
        $sitemap_name = "sitemap";
        //      standard name for sitemap file
        $input_file = "{$url2}/{$sitemap_name}";
        //      create path to sitemap
        $log_file = './sitemaps/current_sitemap.xml';
        //      destination for sitemap log-file
        $smap_found = '';
        $indexed_map = '';
        $map_cont = '';
        //  try to fetch individual sitemap url from database
        mysqltest();
        $sql_query = "SELECT smap_url from " . $mysql_table_prefix . "sites where site_id='{$site_id}'";
        $result = $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
        $row = $result->fetch_array(MYSQLI_NUM);
        if (preg_match("/http:\\/\\//", $row[0])) {
            //   use the individual sitemap
            $input_file = preg_replace("/.xml.gz|.xml/i", "", $row[0]);
        }
        $file = "" . $input_file . ".xml";
        if ($fd = @fopen($file, "r")) {
            //  uncompressed ?
            //if ($zd = @gzopen("".$input_file.".xml", "r")) {    //  uncompressed ?
            $map_cont = @stream_get_contents($fd);
            if ($map_cont && strpos($map_cont, "schemas/sitemap")) {
                //  if we were able to read it
                $smap_found = '1';
            }
            fclose($fd);
        }
        $gz_file = "" . $input_file . ".xml.gz";
        if (!$smap_found && ($zd = @fopen("compress.zlib://{$gz_file}", "r"))) {
            // compressed  ?
            //if (!$smap_found && $zd = @gzopen("".$input_file.".xml.gz", "r")) {  // compressed  ?
            $map_cont = @gzread($zd, 10485760);
            //  max. 10 MB (might be too large for some server)
            gzclose($zd);
            if ($map_cont && strpos($map_cont, "schemas/sitemap")) {
                $smap_found = '1';
            }
        }
        //echo "\r\n\r\n<br>map_cont Array:<br><pre>";print_r($map_cont);echo "</pre>\r\n";
        if ($smap_found) {
            if ($debug != '0') {
                //      create a log-file of current sitemap.xml
                file_put_contents($log_file, $map_cont);
            }
            //$del = $db_con->query("DELETE from ".$mysql_table_prefix."temp"); // function get_sitemap and store_links will build a new temp table
            if (stristr($map_cont, "<sitemapindex")) {
                //      if current sitemap file is an index file
                printStandardReport('validSitemapInd', $command_line, $no_log);
                $get_maps = simplexml_load_string($map_cont);
                if ($get_maps) {
                    reset($get_maps);
                    foreach ($get_maps as $map_x) {
                        $new_links[] = $map_x->loc;
                        //   get all links to further sitemap files
                    }
                    if (is_array($new_links)) {
                        //      if we found more sitemap files
                        $new_links = explode(",", implode(",", $new_links));
                        // destroy SimpleXMLElement Object and get the link array
                        $new_links = array_slice($new_links, 0, $max_links);
                        $indexed_map = '1';
                        $i = '0';
                        //echo "\r\n\r\n<br>new_links Array:<br><pre>";print_r($new_links);echo "</pre>\r\n";
                        foreach ($new_links as $input_file) {
                            $these_links = get_sitemap($input_file, $indexed_map, $mysql_table_prefix);
                            // now extract page links from this sitemap file
                            //echo "\r\n\r\n<br>these_links Array:<br><pre>";print_r($these_links);echo "</pre>\r\n";
                            if ($these_links) {
                                reset($these_links);
                                store_newLinks($these_links, $level, $sessid);
                                $smp = '1';
                                //     there were valid sitemap files and we stored the new links
                                $i++;
                            } else {
                                printStandardReport('invalidSecSitemap', $command_line, $no_log);
                                //  unable to extract links from secondary sitemap file
                            }
                        }
                        printValidSecSmap($i, $cl);
                        unset($input_file, $map_cont, $new_links);
                    } else {
                        printStandardReport('invalidSecSitemap', $command_line, $no_log);
                        //  unable to extract links from secondary sitemap file
                    }
                } else {
                    printStandardReport('invalidSitemapInd', $command_line, $no_log);
                    //  unable to extract links from sitemap INDEX  file
                }
            } else {
                $links = get_sitemap($map_cont, $indexed_map, $mysql_table_prefix);
                // extract links from sitemap.xml  (there was only one sitemap file)
                if ($links != '') {
                    reset($links);
                    //echo "\r\n\r\n<br>sitemmap links Array:<br><pre>";print_r($links);echo "</pre>\r\n";
                    store_newLinks($links, $level, $sessid);
                    $smp = '1';
                    //     there was one valid sitemap and we stored the new links
                    printStandardReport('validSitemap', $command_line, $no_log);
                } else {
                    printStandardReport('invalidSitemap', $command_line, $no_log);
                }
                unset($links);
            }
        }
    }
    if ($debug == '0') {
        if (function_exists("ini_set")) {
            ini_set("display_errors", "0");
        }
        error_reporting(0);
    } else {
        error_reporting(E_ALL & ~E_DEPRECATED & ~E_WARNING & ~E_NOTICE & ~E_STRICT);
    }
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        if ($url_status['file']) {
            $file = $url_status['file'];
        } else {
            $url_status['state'] = "Unable to read the content of the file.<br />{$url} does not deliver any content.";
            $realnum--;
        }
    }
    if ($url_status['state'] == 'ok') {
        //  first attempt to define a charset
        $chrSet = '';
        if ($use_prefcharset == '1') {
            //  use preferred charset as defined in Admin settings
            $chrSet = $home_charset;
            //echo "<h1>USING PREFERRED CHARSET</h1>";
        } else {
            if ($server_char && $url_status['charset']) {
                //echo "<h1>USING SERVER CHARSET</h1>";
                $chrSet = $url_status['charset'];
                //  use charset as supplied by the remote server
            } else {
                //  try to extract the charset of this file
                //echo "<h1>USING CONTENT CHARSET</h1>";
                //echo "<h1>" . substr($file, 0, 500) . "</h1>";
                if (preg_match("'encoding=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) {
                    //echo "<h1>1</h1>";
                    $chrSet = trim(strtoupper($regs[1]));
                    //      get encoding of current XML or XHTML file     and use it furtheron
                }
                if (!$chrSet) {
                    //echo "<h1>2</h1>";
                    if (preg_match("'charset=(.*?)[ \\/\\;\\'\"]'si", substr($file, 0, 3000), $regs)) {
                        //echo "<h1>3</h1>";
                        $chrSet = trim(strtoupper($regs[1]));
                        //      get charset of current HTML file     and use it furtheron
                    }
                }
                if (!$chrSet) {
                    //echo "<h1>4</h1>";
                    if (preg_match("'charset=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) {
                        //echo "<h1>5</h1>";
                        $chrSet = trim(strtoupper($regs[1]));
                        //      get charset of current HTML file     and use it furtheron
                    }
                }
                //  in assistance for all lazy webmasters
                $chrSet = preg_replace("/win-/si", "windows-", $chrSet);
                if ($chrSet == "1251") {
                    //echo "<h1>6</h1>";
                    $chrSet = "windows-1251";
                }
                if ($chrSet == '') {
                    //echo "<h1>7</h1>";
                    $chrSet = $home_charset;
                    //  no charset found, we need to use default charset like for DOCs, PDFs, etc
                }
            }
        }
        //echo "<h1>CHRSET: $chrSet</h1>";
        //  if required, uncompress ZIP archives and make content of each file => text
        if ($url_status['content'] == 'zip' && $index_zip == '1' && $file) {
            file_put_contents("" . $tmp_dir . "/archiv.temp", $file);
            $zip = zip_open("" . $tmp_dir . "/archiv.temp");
            if ($zip) {
                $url_status['content'] = "text";
                //  as a precaution, unless another status is detected for individual archive files
                $file = '';
                //  starting with a blank file for all archive files
                $topic = 'zip';
                if ($debug == '2') {
                    printStandardReport('archivFiles', $command_line, $no_log);
                }
                while ($zip_entry = zip_read($zip)) {
                    if (zip_entry_open($zip, $zip_entry, "r")) {
                        $buf = zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
                        //uncompress the content of recent archiv file
                        $name = zip_entry_name($zip_entry);
                        //  get filename of recent archive file
                        if ($debug == '2') {
                            //
                            $report = "<strong>&nbsp;&nbsp;" . $name . "</strong>";
                            printThis($report, $cl);
                            $size = (int) (zip_entry_filesize($zip_entry) / 1024);
                            if ($size == 0) {
                                $size = '1';
                            }
                            $report = "&nbsp;&nbsp;&nbsp;-&nbsp;Unpacked size:&nbsp;" . $size . " kByte<br />";
                            printThis($report, $cl);
                        }
                        $buf = get_arch_content($buf, $name, $url, $chrSet);
                        //  if necessary, convert PDF, extract feed etc. for the recent file
                        zip_entry_close($zip_entry);
                        //  done for this file in archiv
                        $file .= "" . $buf . "<br /><br />";
                        //  add all uncompressed and converted files together
                    }
                }
                zip_close($zip);
            }
            unlink("" . $tmp_dir . "/archiv.temp");
        }
        //  if required, uncompress RAR archives and make content of each file => text
        if ($url_status['content'] == 'rar' && $index_rar == '1') {
            file_put_contents("" . $tmp_dir . "/archiv.temp", $file);
            $rar = rar_open("" . $tmp_dir . "/archiv.temp");
            if ($rar) {
                $url_status['content'] = "text";
                //  as a precaution; all individual archive files will be converted to 'text'
                $file = '';
                //  starting with a blank file for all archive files
                $topic = 'rar';
                $entries = rar_list($rar);
                if ($rar) {
                    if ($debug == '2') {
                        printStandardReport('archivFiles', $command_line, $no_log);
                    }
                    foreach ($entries as $entry) {
                        $name = $entry->getName();
                        if ($debug == '2') {
                            $report = "<strong>&nbsp;&nbsp;" . $name . "</strong>";
                            printThis($report, $cl);
                            $size = (int) ($entry->getPackedSize() / 1024);
                            if ($size == 0) {
                                $size = '1';
                            }
                            $report = "&nbsp;&nbsp;&nbsp;-&nbsp;Packed size:&nbsp;&nbsp;" . $size . " kByte";
                            printThis($report, $cl);
                            $size = (int) ($entry->getUnpackedSize() / 1024);
                            if ($size == 0) {
                                $size = '1';
                            }
                            $report = "&nbsp;&nbsp;&nbsp;-&nbsp;Unpacked size:&nbsp;" . $size . " kByte<br />";
                            printThis($report, $cl);
                        }
                        $entry->extract('', "./" . $tmp_dir . "/" . $name . "");
                        //  extract single file of archiv into temporary folder
                        $buf = file_get_contents("./" . $tmp_dir . "/" . $name . "");
                        //  read content of this intermediate file
                        unlink("./" . $tmp_dir . "/" . $name . "");
                        //  destroy this file
                        if ($buf) {
                            $buf = get_arch_content($buf, $name, $url, $chrSet);
                            //  if necessary, convert PDF, extract feed etc. for the recent file
                            $file .= "" . $buf . "<br /><br />";
                            //  add all uncompressed and converted files together
                        }
                    }
                }
                rar_close($rar);
            }
            unlink("" . $tmp_dir . "/archiv.temp");
        }
        $file0 = $file;
        //  remember the original (e.g. for doc2txt converter)
        //  remove useless part of the content
        $file = purify_content($file);
        $valid_utf8 = '1';
        $raw_file = $file;
        //  remove possibly duplicated encoding info in dynamic links
        if (stristr(substr($file, '0', '4000'), "encoding") && strstr(substr($file, '0', '4000'), "charset")) {
            $file = substr($file, strrpos($file, "<!DOCTYPE"));
            //  substring starting at the last <!DOCTYPE found
        }
        //  we need to do it again, because an archive may have introduced a new charset
        $chrSet = '';
        if ($use_prefcharset == '1') {
            //  use preferred charset as defined in Admin settings
            $chrSet = $home_charset;
        } else {
            if ($server_char && $url_status['charset']) {
                $chrSet = $url_status['charset'];
                //  use charset as supplied by the remote server
            } else {
                //  try to extract the charset of this file
                if (preg_match("'encoding=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) {
                    $chrSet = trim(strtoupper($regs[1]));
                    //      get encoding of current XML or XHTML file     and use it furtheron
                }
                if (!$chrSet) {
                    if (preg_match("'charset=(.*?)[ \\/\\;\\'\"]'si", substr($file, 0, 3000), $regs)) {
                        $chrSet = trim(strtoupper($regs[1]));
                        //      get charset of current HTML file     and use it furtheron
                    }
                }
                if (!$chrSet) {
                    if (preg_match("'charset=[\\'\"](.*?)[\\'\"]'si", substr($file, 0, 3000), $regs)) {
                        $chrSet = trim(strtoupper($regs[1]));
                        //      get charset of current HTML file     and use it furtheron
                    }
                }
                //  in assistance for all lazy webmasters
                $chrSet = preg_replace("/win-/si", "windows-", $chrSet);
                if ($chrSet == "1251") {
                    $chrSet = "windows-1251";
                }
                if ($chrSet == '') {
                    $chrSet = $home_charset;
                    //  no charset found, we need to use default charset like for DOCs, PDFs, etc
                }
            }
        }
        if (strpos($chrSet, " ")) {
            // in the wild we have already seen a lot of variants
            $chrSet = substr($chrSet, 0, strpos($chrSet, " "));
        }
        //  some webmaster still use 'UNICODE' as name
        if (stristr($chrSet, "UNICODE")) {
            $chrSet = "UTF-8";
        }
        //  obsolete since 1990, but some (Italian) server still send it as charset . . . .
        if (stristr($chrSet, "8858")) {
            $chrSet = str_replace("8858", "8859", $chrSet);
        }
        //  required coaching for some webmasters
        if (stristr($chrSet, "cp-")) {
            $chrSet = str_ireplace("CP-", "CP", $chrSet);
        }
        $contents['charset'] = $chrSet;
        if ($index_framesets == '1') {
            if (preg_match("@<frameset[^>]*>(.*?)<\\/frameset>@si", $file, $regs)) {
                printStandardReport('newFrameset', $command_line, $no_log);
                //  separate the <frameset> ....</frameset> part of this file
                $frame = $regs[1];
                $replace = get_frames($frame, $url, $can_leave_domain);
                $replace = "<body>" . $replace . "</body>";
                //  create the body tags for $file
                $contents['charset'] = $chrSet;
                // rebuild charset
                //  include all replacements instead of the frameset tag into the actual file. This will become the body
                $file = preg_replace("@<frameset.*?</frameset>@si", "{$replace}", $file);
            }
        }
        if ($index_iframes == '1') {
            $links = array();
            $regs = array();
            $replace = '';
            $get_charset = '';
            $real_url = $url;
            if (preg_match_all("/(iframe[^>]*src[[:blank:]]*)=[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", $file, $regs, PREG_SET_ORDER)) {
                printStandardReport('newIframe', $command_line, $no_log);
                //  find all frames of the iframe;
                $care_excl = '';
                //  don't care file suffixed to be excluded
                $relocated = '';
                //  URL is not relocated
                foreach ($regs as $val) {
                    if (($a = url_purify($val[2], $url, $can_leave_domain, $care_exel, $relocated, $local_redir)) != '') {
                        $links[] = $a;
                        // collect  all iframe links
                    }
                }
                if ($links) {
                    foreach ($links as $url) {
                        printNewLinks($url, $cl);
                        if (preg_match("/.html|.htm|.xhtml|.xml|.php/i", $url)) {
                            $frame = file_get_contents($url);
                            //      get content of this frame
                            //  separate the body part of this frame
                            preg_match("@<body[^>]*>(.*?)<\\/body>@si", $frame, $regs);
                            $body = $regs[1];
                            if ($abslinks == '1') {
                                $body = make_abslinks($body, $url);
                                //  if required, correct links relative to found iframe
                            }
                            $replace = "" . $replace . "<br />" . $body . "";
                        } else {
                            //  might be an image
                            $replace = "" . $replace . "<br /><img src=\"" . $url . "\">";
                        }
                    }
                }
                //  include all replacements instead of the iframe tag into the actual file
                $file = preg_replace("@<iframe.*?</iframe>@si", "{$replace}", $file);
                $contents['charset'] = $chrSet;
                // rebuild charset
            }
            $url = $real_url;
        }
        //      in order to index RDF, RSD, RSS and ATOM feeds enter here
        if ($url_status['content'] == 'xml' && $index_rss == '1') {
            if (!preg_match("/<rss|atom|<feed|<rdf|<rsd/si", substr($file, 0, 400))) {
                printStandardReport('notRSS', $command_line, $no_log);
                //  no valid feed detected
                $OKtoIndex = 0;
                $file_read_error = 1;
                $realnum--;
            } else {
                $html = '';
                $xml = XML_IsWellFormed($file);
                //      check for well-formed XML
                if ($xml != '1') {
                    if ($debug > 0) {
                        printNotWellFormedXML($xml, $cl);
                    }
                    $OKtoIndex = 0;
                    $file_read_error = 1;
                    $realnum--;
                } else {
                    $rss = new feedParser();
                    // define options for feed parser
                    $rss->limit = $max_links;
                    //   save time by limiting the items/entries to be processed
                    $rss->in_cp = strtoupper($contents['charset']);
                    //  charset of actual file
                    $rss->out_cp = 'UTF-8';
                    //  convert all into this charset
                    $rss->cache_dir = '';
                    //  currently unused
                    $rss->dc = $dc;
                    //  treat Dublin Core tags in RDF feeds
                    $rss->pro = $preferred;
                    //  obey the PREFERRED directive in RSD feeds
                    $rss->file = '1';
                    //  use $file as feed (as a string, not URL)
                    if ($cdata != 1) {
                        $rss->CDATA = 'content';
                        //  get it all  (naughty)
                    } else {
                        $rss->CDATA = 'nochange';
                        //  well educated crawler
                    }
                    //  get feed as array
                    if ($feed = $rss->get($url, $file)) {
                        //  if you want to see the feed during index procedure, uncomment the following row
                        //  echo "<br>FEED array:<br><pre>";print_r($feed);echo "</pre>";
                        $link = '';
                        $textinput_link = '';
                        $image_url = '';
                        $image_link = '';
                        $docs = '';
                        $subjects = '';
                        $count = '';
                        $type = $feed[type];
                        $count = $feed[sub_count];
                        $cached = $feed[cached];
                        //  kill all no longer required values
                        $feed[type] = '';
                        $feed[sub_count] = '';
                        $feed[encoding_in] = '';
                        $feed[encoding_out] = '';
                        $feed[items_count] = '';
                        $feed[cached] = '';
                        if (!$count) {
                            $count = '0';
                        }
                        if ($type == 'RSD') {
                            //      prepare all RSD APIs
                            for ($i = 0; $i < $count; $i++) {
                                $subjects .= '' . $feed['api'][$i]['name'] . '<br />
                                            ' . $feed['api'][$i]['apiLink'] . '<br />
                                            ' . $feed['api'][$i]['blogID'] . '<br />
                                            ' . $feed['api'][$i]['settings_docs'] . '<br />
                                            ' . $feed['api'][$i]['settings_notes'] . '<br />';
                            }
                        }
                        if ($type == 'Atom') {
                            //      prepare all Atom entries
                            for ($i = 0; $i < $count; $i++) {
                                $subjects .= '' . $feed['entries'][$i]['link'] . '<br />
                                            ' . $feed['entries'][$i]['title'] . '<br />
                                            ' . $feed['entries'][$i]['id'] . '<br />
                                            ' . $feed['entries'][$i]['published'] . '<br />
                                            ' . $feed['entries'][$i]['updated'] . '<br />
                                            ' . $feed['entries'][$i]['summary'] . '<br />
                                            ' . $feed['entries'][$i]['rights'] . '<br />
                                            ' . $feed['entries'][$i]['author_name'] . ' ' . $feed['entries'][$i]['author_email'] . ' ' . $feed['entries'][$i]['author_uri'] . '<br />
                                            ' . $feed['entries'][$i]['category_term'] . ' ' . $feed['entries'][$i]['category_label'] . ' ' . $feed['entries'][$i]['category_scheme'] . '<br />
                                            ' . $feed['entries'][$i]['contributor_name'] . ' ' . $feed['entries'][$i]['contributor_email'] . ' ' . $feed['entries'][$i]['contributor_uri'] . '<br />
                                        ';
                            }
                        }
                        if ($type == 'RDF' | $type == 'RSS v.0.91/0.92' | $type == 'RSS v.2.0') {
                            //  For RDF and RSS feeds enter here
                            //  prepare channel image
                            $image_url = $feed[image_url];
                            if ($image_url) {
                                $width = $feed[image_width];
                                if (!$width || $width > '144') {
                                    $width = '88';
                                    //set to default value
                                }
                                $height = $feed[image_height];
                                if (!$height || $height > '400') {
                                    $height = '31';
                                    //set to default value
                                }
                                $feed[image_url] = "<img id=\"rss_007\" src=\"" . $image_url . "\" alt=\"" . $feed[image_title] . "\" width=\"" . $width . "\" height=\"" . $height . "\">";
                            }
                            $image_link = $feed[image_link];
                            if ($image_link) {
                                $feed[image_link] = "<a href=\"" . $image_link . "\">" . $image_link . "</a>";
                            }
                            //      prepare all RDF or RSS items
                            for ($i = 0; $i < $count; $i++) {
                                $subjects .= '' . $feed['items'][$i]['link'] . '<br />
                                            ' . $feed['items'][$i]['title'] . '<br />
                                            ' . $feed['items'][$i]['description'] . '<br />
                                            ' . $feed['items'][$i]['author'] . '<br />
                                            ' . $feed['items'][$i]['category'] . '<br />
                                            ' . $feed['items'][$i]['guid'] . '<br />
                                            ' . $feed['items'][$i]['comments'] . '<br />
                                            ' . $feed['items'][$i]['pubDate'] . '<br />
                                            ' . $feed['items'][$i]['source'] . '<br />
                                            ' . $feed['items'][$i]['enclosure'] . '<br />
                                            ' . $feed['items'][$i]['country'] . '<br />
                                            ' . $feed['items'][$i]['coverage'] . '<br />
                                            ' . $feed['items'][$i]['contributor'] . '<br />
                                            ' . $feed['items'][$i]['date'] . '<br />
                                            ' . $feed['items'][$i]['industry'] . '<br />
                                            ' . $feed['items'][$i]['language'] . '<br />
                                            ' . $feed['items'][$i]['publisher'] . '<br />
                                            ' . $feed['items'][$i]['state'] . '<br />
                                            ' . $feed['items'][$i]['subject'] . '<br />
                                        ';
                            }
                        }
                        //  convert  the channel/feed part  into a string
                        $feed_common = implode(" ", $feed);
                        //  build something that could be indexed
                        $html .= "<html>\r\n<head>\r\n<title>" . $feed['title'] . "</title>\r\n<meta name=\"description\" content=\"" . $feed['description'] . " \">\r\n</head>\r\n";
                        $html .= "<body>\r\n" . $feed_common . "\r\n" . $subjects . "\r\n</body>\r\n</html>\r\n";
                    }
                    if (strlen($html) < "130") {
                        //  can't be a valid feed
                        if ($type == "unknown") {
                            printInvalidFeedType($type, $cl);
                        } else {
                            printStandardReport('invalidRSS', $command_line, $no_log);
                        }
                        $OKtoIndex = 0;
                        $file_read_error = 1;
                        $realnum--;
                    } else {
                        $contents['charset'] = 'UTF-8';
                        //      the feed reader converts all to utf-8
                        $file = $html;
                        //     use feed reader output
                        if ($debug > 0) {
                            printValidFeed($type, $count, $cl);
                        }
                    }
                }
            }
        }
        //  duplicate here, but frames, iframes, or RSS might have added nonsense content
        $file = purify_content($file);
        //  prepare CVS files
        if ($url_status['content'] == 'csv' && $index_csv == '1') {
            $file = str_replace(",", " ", $file);
            $file = str_replace(";", " ", $file);
        }
        //echo "\r\n\r\n<br>url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n";
        // for DOCs, PDFs, etc we need special text converter
        if ($url_status['content'] != 'text' && $url_status['content'] != 'xml' && $url_status['content'] != 'xhtml' && $url_status['content'] != 'csv') {
            $document = 1;
            $file = extract_text($file, $file0, $url_status['content'], $url, $chrSet);
            //  because the converter already transferred the documents to UTF-8, we need to adjust it here
            $contents['charset'] = 'UTF-8';
            $charSet = 'UTF-8';
            if ($file == 'ERROR') {
                //      if error, suppress further indexing
                $OKtoIndex = 0;
                $file_read_error = 1;
                $realnum--;
            }
            //  reduce Pashtu and Urdu to the main Farsi letters
            if (strtolower($charSet) == 'windows-1256' && $url_status['content'] == 'pdf') {
                $f_letter0 = array("ﺎ", "�");
                $f_letter1 = array("�", "�", "ﺑ", "ﺒ");
                $f_letter2 = array("ï­–", "ï­—", "ï­˜", "ï­™");
                $f_letter3 = array("ﺕ", "ﺖ", "ﺗ", "ﺘ");
                $f_letter4 = array("ﺙ", "ﺚ", "ﺛ", "ﺜ");
                $f_letter5 = array("�", "ﺞ", "ﺟ", "ﺠ");
                $f_letter6 = array("ï­º", "ï­»", "ï­¼", "ï­½");
                $f_letter7 = array("ﺡ", "ﺢ", "ﺣ", "ﺤ");
                $f_letter8 = array("ﮋ", "ﮊ");
                $f_letter9 = array("ﺥ", "ﺦ", "ﺧ", "ﺨ");
                $f_letter10 = array("ﺩ", "ﺪ");
                $f_letter11 = array("ﺫ", "ﺬ");
                $f_letter12 = array("ﺭ", "ﺮ");
                $f_letter13 = array("ﺯ", "ﺰ");
                $f_letter14 = array("ﺱ", "ﺲ", "ﺳ", "ﺴ");
                $f_letter15 = array("ﺵ", "ﺶ", "ﺷ", "ﺸ");
                $f_letter16 = array("ﺹ", "ﺺ", "ﺻ", "ﺼ");
                $f_letter17 = array("ﺽ", "ﺾ", "ﺿ", "ﻀ");
                $f_letter18 = array("�", "ﻂ", "ﻃ", "ﻄ");
                $f_letter19 = array("ﻅ", "ﻆ", "ﻇ", "ﻈ");
                $f_letter20 = array("ﻉ", "ﻊ", "ﻋ", "ﻌ");
                $f_letter21 = array("�", "ﻎ", "�", "�");
                $f_letter22 = array("ﻑ", "ﻒ", "ﻓ", "ﻔ");
                $f_letter23 = array("ﻕ", "ﻖ", "ﻗ", "ﻘ");
                $f_letter24 = array("ﻙ", "ﻚ", "ﻛ", "ﻜ", "ﮎ", "�", "�", "ﮑ");
                $f_letter25 = array("ﮒ", "ﮓ", "ﮔ", "ﮕ");
                $f_letter26 = array("�", "ﻞ", "ﻟ", "ﻠ");
                $f_letter27 = array("ﻡ", "ﻢ", "ﻣ", "ﻤ");
                $f_letter28 = array("ﻧ", "ﻨ", "ﻦ", "ﻥ");
                $f_letter29 = array("ï»­", "ï»®");
                $f_letter30 = array("ﻩ", "ﻪ", "ﻫ", "ﻬ");
                $f_letter31 = array("ﻯ", "ﻰ", "ﻱ", "ﻲ", "ﻳ", "ﻴ");
                $file = str_replace($f_letter0, "ا", $file);
                $file = str_replace($f_letter1, "ب", $file);
                $file = str_replace($f_letter2, "Ù¾", $file);
                $file = str_replace($f_letter3, "ت", $file);
                $file = str_replace($f_letter4, "Ø«", $file);
                $file = str_replace($f_letter5, "ج", $file);
                $file = str_replace($f_letter6, "Ú†", $file);
                $file = str_replace($f_letter7, "Ø­", $file);
                $file = str_replace($f_letter8, "Ú˜", $file);
                $file = str_replace($f_letter9, "Ø®", $file);
                $file = str_replace($f_letter10, "د", $file);
                $file = str_replace($f_letter11, "Ø°", $file);
                $file = str_replace($f_letter12, "ر", $file);
                $file = str_replace($f_letter13, "ز", $file);
                $file = str_replace($f_letter14, "س", $file);
                $file = str_replace($f_letter15, "Ø´", $file);
                $file = str_replace($f_letter16, "ص", $file);
                $file = str_replace($f_letter17, "ض", $file);
                $file = str_replace($f_letter18, "Ø·", $file);
                $file = str_replace($f_letter19, "ظ", $file);
                $file = str_replace($f_letter20, "ع", $file);
                $file = str_replace($f_letter21, "غ", $file);
                $file = str_replace($f_letter22, "Ù�", $file);
                $file = str_replace($f_letter23, "Ù‚", $file);
                $file = str_replace($f_letter24, "Ú©", $file);
                $file = str_replace($f_letter25, "Ú¯", $file);
                $file = str_replace($f_letter26, "Ù„", $file);
                $file = str_replace($f_letter27, "Ù…", $file);
                $file = str_replace($f_letter28, "Ù†", $file);
                $file = str_replace($f_letter29, "Ùˆ", $file);
                $file = str_replace($f_letter30, "Ù‡", $file);
                $file = str_replace($f_letter31, "ÙŠ", $file);
            }
        }
        if ($OKtoIndex == 1) {
            $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
            printPageSizeReport($pageSize, $topic);
        }
        $charSet = strtoupper(trim($contents['charset']));
        //      final charset for UTF-8 converter
        if (stristr($charSet, "encoding") || strlen($charSet) < '3') {
            //  must be invalid encountered charset
            $charSet = 'UTF-8';
        }
        //echo "\r\n\r\n<br /> final charSet: '$charSet'<br />\r\n";
        if ($charSet == "UTF-16") {
            $charSet = "UTF-8";
            //  content will be converted in function clean_file()
        }
        $dic = '';
        //  if Chinese or Korean text should be segmented enter here
        if ($cn_seg == '1' && $file && !$js_link && !stristr($charSet, "8859")) {
            if ($charSet == 'GB2312' || $charSet == 'GB18030' || $charSet == 'GBK') {
                $dic = "" . $dict_dir . "/cn_gb18030.dic";
                //  simplified Chinese
            }
            if ($charSet == 'BIG5') {
                $dic = "" . $dict_dir . "/cn_big5.dic";
                //  traditional Chinese
            }
            if ($charSet == 'ISO10646-1933') {
                $dic = "" . $dict_dir . "/kr_iso10646-1933.dic";
                // Korean
            }
            if ($charSet == 'EUC-KR') {
                $dic = "" . $dict_dir . "/kr_euc-kr.dic";
                //  Korean
            }
            if ($charSet == 'UTF-8') {
                $dic = "" . $dict_dir . "/cn_utf-8.dic";
                //  Unicode
            }
            if ($dic) {
                //  if dictionary is available for page charset, perform a segmentation
                $Segmentation = new Segmentation();
                $Segmentation->load($dic);
                $Segmentation->setLowercase(FALSE);
                $cn_result = $Segmentation->segmentString($file);
                if ($cn_result && $charSet != 'UTF-8') {
                    $iconv_file = @iconv($charSet, "UTF-8//IGNORE", $cn_result);
                    if (trim($iconv_file) == "") {
                        // iconv is not installed or input charSet not available. We need to use class ConvertCharset
                        $NewEncoding = new ConvertCharset($charSet, "utf-8");
                        $NewFileOutput = $NewEncoding->Convert($cn_result);
                        $cn_result = $NewFileOutput;
                    } else {
                        $cn_result = $iconv_file;
                    }
                    unset($iconv_file, $NewEncoding, $NewFileOutput);
                }
                $seg_data = clean_file($cn_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
            } else {
                printNoDictionary($charSet, $cl);
                //  no dictionary found for this charset
            }
        }
        //  if Japanese text should be segmented enter here. But not if a Chinese dictonary was already found
        if ($jp_seg == '1' && $file && !$js_link && !stristr($charSet, "ISO") && !$dic) {
            $dic = '';
            if ($charSet == 'UTF-8' || $charSet == 'EUC-JP') {
                $file = @iconv($charSet, "SHIFT_JIS//IGNORE", $file);
                $charSet = "SHIFT_JIS";
            }
            if ($charSet == 'SHIFT_JIS') {
                $dic = "" . $dict_dir . "/jp_shiftJIS.dic";
            }
            if ($dic) {
                //  if dictionary is available for page charset, perform a segmentation
                $Segmentation = new Segmentation();
                $Segmentation->load($dic);
                $Segmentation->setLowercase(FALSE);
                $jp_result = $Segmentation->segmentString($file);
                //echo "\r\n\r\n<br /> jp_result: $jp_result<br />\r\n";
                if ($jp_result && $charSet != 'UTF-8') {
                    $iconv_file = @iconv($charSet, "UTF-8//IGNORE", $jp_result);
                    if (trim($iconv_file) == "") {
                        // iconv is not installed or input charSet not available. We need to use class ConvertCharset
                        $NewEncoding = new ConvertCharset($charSet, "utf-8");
                        $NewFileOutput = $NewEncoding->Convert($jp_result);
                        $jp_result = $NewFileOutput;
                    } else {
                        $jp_result = $iconv_file;
                    }
                    unset($iconv_file, $NewEncoding, $NewFileOutput);
                }
                $seg_data = clean_file($jp_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
            } else {
                printNoDictionary($charSet, $cl);
                //  no dictionary found for this charset
            }
        }
        //  enter here only, if site / file is not yet UTF-8 coded or had already been converted to UTF-8
        if ($charSet != "UTF-8" && $file) {
            $file = convertToUTF8($file, $charSet, $char_Set, $converter_dir);
        }
        //  if activated in Admin backend, check for correct converting of $file into UTF-8
        if ($utf8_verify) {
            $valid_utf8 = @iconv('UTF-8', 'UTF-8', $file) === $file;
        }
        if (!$valid_utf8) {
            $url_status['state'] = "<br />Invalid charset definition placed in meta tags of HTML header. Unable to convert the text into UTF-8<br />Indexing aborted for {$url}";
            if ($server_char) {
                $url_status['state'] = "<br />Invalid charset definition supplied via HTTP by the client server. Unable to convert the text into UTF-8<br />Indexing aborted for {$url}";
            }
            if ($use_prefcharset) {
                $url_status['state'] = "<br />Invalid charset definition placed Admin Settings.<br />Site was created with another charset<br />Indexing aborted for {$url}";
            }
            printUrlStatus($url_status['state'], $command_line, $no_log);
            $file = '';
            $deletable = 1;
        } else {
            if ($index_media == '1') {
                $newmd5sum = md5($file);
                //  get md5 including links and title of media files
            }
            $data = clean_file($file, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
            //echo "\r\n\r\n<br>data Array:<br><pre>";print_r($data);echo "</pre>\r\n";
            //  index only links and their titles
            if ($only_links) {
                $media_links = '0';
                $my_links = get_link_details($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir);
                $data['content'] = $my_links[0][0];
                //  define new content
                $data['fulltext'] = $my_links[0][0];
                //  define new content also for 'full text';
            }
            //  combine raw words plus segmented  words
            if ($cn_seg == 1 || $jp_seg == 1 && $dic && !$js_link) {
                if ($debug != '0') {
                    $seg_add = $seg_data[count] - $data[count];
                    //      calculate segmentation result
                    if ($seg_add > '0') {
                        if ($charSet == 'EUC-KR' || $charSet == 'ISO10646-1933') {
                            printSegKR($seg_add, $cl);
                        }
                        if ($charSet == 'SHIFT_JIS') {
                            printSegJA($seg_add, $cl);
                        } else {
                            printSegCN($seg_add, $cl);
                        }
                    }
                    /*
                    echo "<br /><pre>Results of word segmentation:</pre>";
                    echo "<br />Unsegmented title :<br><pre>";print_r($data[title]);echo "</pre>";
                    echo "<br />Segmented title :<br><pre>";print_r($seg_data[title]);echo "</pre>";
                    echo "<br />Unsegmented full text:<br />$data[fulltext]<br />";
                    echo "<br />Segmented full text:<br />$seg_data[fulltext]";
                    */
                }
                $data[content] = "" . $data[content] . "" . $seg_data[content] . "";
                //$data[title]        ="".$data[title]."".$seg_data[title]."";
                $data[description] = "" . $data[description] . "" . $seg_data[description] . "";
                $data[keywords] = "" . $data[keywords] . "" . $seg_data[keywords] . "";
            }
            //      check if canonical redirection was found in page head
            $cano_link = '0';
            if ($data['cano_link']) {
                //echo "\r\n\r\n<br /> url: '$url'<br />\r\n";
                $cano_link = $db_con->real_escape_string($data['cano_link']);
                //echo "\r\n\r\n<br /> cano_link: '$cano_link'<br />\r\n";
                if ($url != $cano_link) {
                    //  only new cano links are accepted
                    $OKtoIndex = 0;
                    $deletable = 1;
                    $realnum--;
                    if ($cano_link == "1") {
                        printNoCanonical($cano_link, $cl);
                        //  if unable to extract redirection link
                    } else {
                        if ($data['refresh'] == '1') {
                            printRefreshed($cano_link, $data['wait'], $cl);
                            //  if refresh meta tag was found in HTML head
                        } else {
                            printCanonical($cano_link, $cl);
                            //  if canonical link was found in HTML head
                        }
                        //      do we already know this link in link-table
                        $sql_query = "SELECT /* jfield 2 */ url from " . $mysql_table_prefix . "links where url like '{$cano_link}'";
                        $res = $db_con->query($sql_query);
                        if ($debug && $db_con->errno) {
                            $err_row = __LINE__ - 2;
                            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                            if (__FUNCTION__) {
                                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                            } else {
                                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                            }
                            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                            echo "<p> {$sql_query} </p>";
                            exit;
                        }
                        $rows = $res->num_rows;
                        if ($rows == 0) {
                            // if not known in link-table, check if already known in temp-table
                            $sql_query = "SELECT /* jfield 1 */ link from " . $mysql_table_prefix . "temp where link like '{$cano_link}'";
                            $res = $db_con->query($sql_query);
                            if ($debug && $db_con->errno) {
                                $err_row = __LINE__ - 2;
                                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                if (__FUNCTION__) {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                } else {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                }
                                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                echo "<p> {$sql_query} </p>";
                                exit;
                            }
                            $rows = $res->num_rows;
                            if ($rows == 0) {
                                // not known in link-table, add new link
                                if ($numoflinks <= $max_links) {
                                    $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$cano_link}', '{$level}', '{$sessid}')";
                                    $db_con->query($sql_query);
                                }
                                if ($debug && $db_con->errno) {
                                    $err_row = __LINE__ - 2;
                                    printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                    if (__FUNCTION__) {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                    } else {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                    }
                                    printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                    printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                    echo "<p> {$sql_query} </p>";
                                    exit;
                                }
                            }
                        }
                    }
                }
                $cano_link = '0';
                //  reset the cano flag
            } else {
                if ($index_media == '0') {
                    $newmd5sum = md5($data['content']);
                    // get md5 from cleaned full text only
                }
                if ($md5sum == $newmd5sum) {
                    printStandardReport('md5notChanged', $command_line, $no_log);
                    $OKtoIndex = 0;
                    $realnum--;
                } else {
                    mysqltest();
                    //     check for duplicate page content
                    $sql_query = "SELECT * from " . $mysql_table_prefix . "links where md5sum='{$newmd5sum}'";
                    $result = $db_con->query($sql_query);
                    if ($debug && $db_con->errno) {
                        $err_row = __LINE__ - 2;
                        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                        if (__FUNCTION__) {
                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                        } else {
                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                        }
                        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                        echo "<p> {$sql_query} </p>";
                        exit;
                    }
                    if ($num_rows = $result->num_rows) {
                        //  display warning message and urls with duplicate content
                        printStandardReport('duplicate', $command_line, $no_log);
                        while ($row = $result->fetch_array(MYSQLI_ASSOC)) {
                            $dups[] = $row['link_id'];
                        }
                        for ($i = 0; $i < $num_rows; $i++) {
                            $link_id = $dups[$i];
                            //$num = $i+1;
                            $sql_query = "SELECT * from " . $mysql_table_prefix . "links where link_id like '{$link_id}'";
                            $res1 = $db_con->query($sql_query);
                            if ($debug && $db_con->errno) {
                                $err_row = __LINE__ - 2;
                                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                if (__FUNCTION__) {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                } else {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                }
                                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                echo "<p> {$sql_query} </p>";
                                exit;
                            }
                            $row = $res1->fetch_array(MYSQLI_NUM);
                            $dup_url = urldecode($row[2]);
                            $dup_url = $dup_url;
                            $dup_url = @iconv($charSet, "UTF-8//IGNORE", $dup_url);
                            if ($idna) {
                                // Initialize the converter class
                                $IDN = new idna_convert(array('idn_version' => 2008));
                                if ($conv_puny && strstr($dup_url, "xn--") && $idna) {
                                    $dup_url = $IDN->decode($dup_url);
                                }
                            }
                            if ($clear == 1) {
                                clean_resource($res, '03');
                            }
                            printDupReport($dup_url, $command_line);
                        }
                        if ($dup_content == '0') {
                            //  enter here, if pages with duplicate content should not be indexed/re-indexed
                            $OKtoIndex = 0;
                            $realnum--;
                        } else {
                            $OKtoIndex = 1;
                        }
                    }
                }
            }
            //echo "\r\n\r\n<br>data array1:<br><pre>";print_r($data);echo "</pre>\r\n";
            if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
                $urlparts = parse_addr($url);
                $newdomain = $urlparts['host'];
                $type = 0;
                if ($data['noindex'] == 1) {
                    //  remember this URlL, so it might not become another time a new link
                    //  check without scheme and www.
                    $check_link = substr($check_link, stripos($url, "//") + 2);
                    if (stristr($check_link, "www.")) {
                        $check_link = substr($check_link, stripos($check_link, "www") + 4);
                    }
                    $sql_query = "SELECT url from " . $mysql_table_prefix . "links where url like '%{$check_link}'";
                    $res = $db_con->query($sql_query);
                    if ($debug && $db_con->errno) {
                        $err_row = __LINE__ - 2;
                        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                        if (__FUNCTION__) {
                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                        } else {
                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                        }
                        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                        echo "<p> {$sql_query} </p>";
                        exit;
                    }
                    $known_link = $res->num_rows;
                    if ($known_link != '1') {
                        $sql_query = "INSERT into " . $mysql_table_prefix . "links (site_id, url, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', curdate(), '{$pageSize}', '{$newmd5sum}', '{$thislevel}')";
                        $db_con->query($sql_query);
                        if ($debug && $db_con->errno) {
                            $err_row = __LINE__ - 2;
                            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                            if (__FUNCTION__) {
                                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                            } else {
                                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                            }
                            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                            echo "<p> {$sql_query} </p>";
                            exit;
                        }
                    }
                    $OKtoIndex = 0;
                    $deletable = 1;
                    $realnum--;
                    printStandardReport('metaNoindex', $command_line, $no_log);
                }
                if (!$js_link) {
                    //  JavaScript will not deliver keywords, only links are parsed
                    $content = explode(" ", addslashes($data['content']));
                    //echo "\r\n\r\n<br>content array0:<br><pre>";print_r($content);echo "</pre>\r\n";
                    $acc_words[] = array();
                    $type = '';
                    //  if Greek accents should be removed from Greek vowels
                    if ($noacc_el) {
                        foreach ($content as &$thisword) {
                            $no_acc = remove_acc_el($thisword);
                            if ($no_acc != $thisword) {
                                $acc_words[] = $no_acc;
                            }
                        }
                    }
                    //  if the other (Latin)  accents should be removed from their vowels
                    if ($vowels) {
                        foreach ($content as $thisword) {
                            $no_acc = remove_acc($thisword, '');
                            if ($no_acc != $thisword) {
                                $acc_words[] = $no_acc;
                            }
                        }
                    }
                    //  now add the words without accents to the total text content
                    $content = array_merge($content, $acc_words);
                    //echo "\r\n\r\n<br>content array0:<br><pre>";print_r($content);echo "</pre>\r\n";
                    //  if ligatures should be equalized
                    if ($liga) {
                        $liga_words = array();
                        //  will contain converted ligatures
                        $phon_words = array();
                        //  will contain converted phonetics
                        //  first: convert letters into latin ligatures
                        foreach ($content as $thisword) {
                            if ($thisword) {
                                $liga_words[] = html_entity_decode($thisword, ENT_QUOTES, "UTF-8");
                                $thisword1 = $thisword;
                                reset($latin_ligatures);
                                while ($char = each($latin_ligatures)) {
                                    $thisword2 = preg_replace("/" . $char[0] . "/s", $char[1], $thisword1);
                                    //  convert ligatures
                                    if ($thisword1 != $thisword2) {
                                        //  break on first ligature
                                        $liga_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8");
                                        //  collect new words with ligatures
                                        $thisword1 = $thisword2;
                                        //  continue with the word, containing the ligatures
                                        //break;
                                    }
                                }
                            }
                        }
                        // second: convert all letters into phonetic transcriptions
                        reset($liga_words);
                        foreach ($liga_words as $thisword) {
                            $thisword1 = $thisword;
                            reset($phon_trans);
                            while ($char = each($phon_trans)) {
                                $thisword2 = preg_replace("/" . $char[0] . "/s", $char[1], $thisword1);
                                //  convert into phonetics
                                if ($thisword1 != $thisword2) {
                                    //  break on first ligature
                                    $phon_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8");
                                    //  collect new words with phonetics
                                    $thisword1 = $thisword2;
                                    //  continue with the word, containing the ligatures
                                    //break;
                                }
                            }
                        }
                        $liga_words = array_merge($liga_words, $phon_words);
                        //  add all phonetics to the liga array
                        //  now vice versa: convert latin ligatures and phonetic transcriptions into standard letters
                        reset($content);
                        $not_liga_words = array();
                        foreach ($content as $thisword) {
                            if ($thisword) {
                                //  first: convert latin ligatures into standard letters
                                $thisword1 = superentities($thisword, ENT_QUOTES, "UTF-8");
                                reset($latin_ligatures);
                                while ($char = each($latin_ligatures)) {
                                    $thisword2 = preg_replace("/" . $char[1] . "/s", $char[0], $thisword1);
                                    //  re-convert ligatures
                                    if ($thisword1 != $thisword2) {
                                        $not_liga_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8");
                                        //  collect new words without ligatures
                                        $thisword1 = $thisword2;
                                        //  continue with the word, containing the ligature
                                    }
                                }
                            }
                            //echo "\r\n\r\n<br>not_liga_words Array:<br><pre>";print_r($not_liga_words);echo "</pre>\r\n";
                            // second: convert phonetic transcriptions into standard letters
                            reset($not_liga_words);
                            $not_phon_words = array();
                            foreach ($not_liga_words as $thisword) {
                                $thisword1 = superentities($thisword, ENT_QUOTES, "UTF-8");
                                reset($phon_trans);
                                while ($char = each($phon_trans)) {
                                    $thisword2 = preg_replace("/" . $char[1] . "/s", $char[0], $thisword1);
                                    //  re-convert phonetics
                                    if ($thisword1 != $thisword2) {
                                        $not_phon_words[] = html_entity_decode($thisword2, ENT_QUOTES, "UTF-8");
                                        //  collect new words without phonetics
                                        $thisword1 = $thisword2;
                                        //  continue with the word, containing the phonetic trans.
                                    }
                                }
                            }
                        }
                        $not_words = array_merge($not_liga_words, $not_phon_words);
                        //  add all together
                        $content = array_merge($liga_words, $not_words);
                        //  add all ligatures and re-converted letters to the content array
                    }
                    $wordarray = unique_array($content);
                }
                //echo "\r\n\r\n<br>wordarray0:<br><pre>";print_r($wordarray);echo "</pre>\r\n";
                if ($smp != 1) {
                    if ($data['nofollow'] != 1 && $cano_link == '0') {
                        $media_links = '0';
                        $links = array();
                        if (!$document) {
                            //  don't try to find links in PDFs and other pure documents
                            $links = get_links($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir, $url_reloc, $charSet);
                        }
                        if ($links[0]) {
                            $links = distinct_array($links);
                            $all_links = count($links);
                            if ($all_links > $max_links) {
                                $all_links = $max_links;
                            }
                            $links = array_slice($links, 0, $max_links);
                            if ($realnum < $max_links) {
                                $numoflinks = 0;
                                //if there are any new links, add to the temp table, but only if there isn't such url already
                                if ($links[0]) {
                                    reset($links);
                                    $tmp_urls = get_temp_urls($sessid);
                                    //  reload previous temp
                                    // echo "\r\n\r\n<br>tmp_urls array:<br><pre>";print_r($tmp_urls);echo "</pre>\r\n";
                                    if ($debug == '2') {
                                        //  if debug mode, show details
                                        printStandardReport('newLinks', $command_line, $no_log);
                                    }
                                    while ($thislink = each($links)) {
                                        // echo "\r\n\r\n<br>thislink array:<br><pre>";print_r($thislink);echo "</pre>\r\n";
                                        //  ignore error (message) links and self linking
                                        if (strstr($thislink[1], "//") && $thislink[1] != $url) {
                                            //  find new domains for _addurl table
                                            if ($auto_add && $can_leave_domain) {
                                                $all_link = parse_all_url($thislink[1]);
                                                //  only the domain will be stored as new URL into addurl table
                                                $dom_link = $all_link['host'];
                                                //  reduce to domain name and tld
                                                $new_link = str_replace("www.", "", $dom_link);
                                                // use the complete URL
                                                //$dom_link = $thislink[1];
                                                //  use only the domain
                                                $dom_link = $all_link['scheme'] . "://" . $dom_link;
                                                $banned = '';
                                                mysqltest();
                                                //     check whether URL is already known in sites table
                                                $sql_query = "SELECT url from " . $mysql_table_prefix . "sites where url like '%{$new_link}%'";
                                                $res1 = $db_con->query($sql_query);
                                                //     check whether URL is already known in addurl table
                                                $sql_query = "SELECT url from " . $mysql_table_prefix . "addurl where url like '%{$new_link}%'";
                                                $res2 = $db_con->query($sql_query);
                                                //     check whether URL is banned
                                                $sql_query = "SELECT domain from " . $mysql_table_prefix . "banned where domain like '%{$new_link}%'";
                                                $res3 = $db_con->query($sql_query);
                                                if ($res3->num_rows) {
                                                    $banned = "1";
                                                }
                                                if ($res1->num_rows == 0 && $res2->num_rows == 0 && $res3->num_rows == 0) {
                                                    //  add new domain into _addurl table
                                                    $sql_query = "INSERT into " . $mysql_table_prefix . "addurl (url, description, account) values ('{$dom_link}', '{$comment}', '{$admin_email}')";
                                                    $db_con->query($sql_query);
                                                }
                                            }
                                            //      check whether thislink is already known as a link ( might happen by means of relocated URLs)
                                            $res4 = '';
                                            $res5 = '';
                                            $known_link = '';
                                            $known_temp = '';
                                            $check_link = $thislink[1];
                                            // i don't believe the "like" is necessary here and it slows down indexing
                                            //                                                //  check without scheme and www.
                                            //                                                $check_link = substr($check_link, stripos($check_link, "//")+2);
                                            //                                                if (stristr($check_link, "www.")) {
                                            //                                                    $check_link = substr($check_link, stripos($check_link, "www")+4);
                                            //                                                }
                                            //
                                            //                                                $sql_query = "SELECT /* jfield 3 */ url from ".$mysql_table_prefix."links where url like '%$check_link'";
                                            //                                                $res4 = $db_con->query($sql_query);
                                            //
                                            //                                                $known_link = $res4->num_rows;;
                                            //
                                            //                                                $sql_query = "SELECT /* jfield 4 */ link from ".$mysql_table_prefix."temp where link like '%$check_link'";
                                            //                                                $res5 = $db_con->query($sql_query);
                                            //                                                if ($debug > 0 && $db_con->errno) {
                                            //                                                    printf("MySQL failure: %s\n", $db_con->error);
                                            //                                                    echo "<br />Script aborted.";
                                            //                                                    exit;
                                            //                                                }
                                            //                                                $known_temp = $res5->num_rows;;
                                            $sql_query = "SELECT /* jfield 3 */ url from " . $mysql_table_prefix . "links where url = '{$check_link}'";
                                            $res4 = $db_con->query($sql_query);
                                            $known_link = $res4->num_rows;
                                            $sql_query = "SELECT /* jfield 4 */ link from " . $mysql_table_prefix . "temp where link = '{$check_link}'";
                                            $res5 = $db_con->query($sql_query);
                                            if ($debug > 0 && $db_con->errno) {
                                                printf("MySQL failure: %s\n", $db_con->error);
                                                echo "<br />Script aborted.";
                                                exit;
                                            }
                                            $known_temp = $res5->num_rows;
                                            //      if this is a new link not yet known or banned, add this new link to the temp table
                                            if ($tmp_urls[$thislink[1]] != 1 && !$res1 && !$known_link && !$known_temp && !$banned) {
                                                $tmp_urls[$thislink[1]] = 1;
                                                $numoflinks++;
                                                if ($debug == '2') {
                                                    $act_link = rawurldecode($thislink[1]);
                                                    //  make it readable
                                                    $act_link = stripslashes($act_link);
                                                    printNewLinks($act_link, $cl);
                                                }
                                                mysqltest();
                                                $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')";
                                                if ($numoflinks <= $max_links) {
                                                    $db_con->query($sql_query);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    } else {
                        printStandardReport('noFollow', $command_line, $no_log);
                    }
                    unset($file);
                }
                // JFIELD at this point, the URL in the DB is good
                // echo "<h1>DONE</h1>";
                // exit;
                //  if we should index only the files as defined in docs list
                if ($only_docs) {
                    $OKtoIndex = '';
                    foreach ($docs as $thisdoc) {
                        if (strstr($urlparts['path'], $thisdoc)) {
                            $OKtoIndex = "1";
                        }
                    }
                    if (!$OKtoIndex) {
                        printStandardReport('noDoclist', $command_line, $no_log);
                    }
                }
                if ($OKtoIndex == 1) {
                    if ($link_check == 0) {
                        $title = $data['title'];
                        $host = $data['host'];
                        $path = $data['path'];
                        $fulltxt = $data['fulltext'];
                        $desc = substr($data['description'], 0, 1024);
                        //  extract domain
                        $url_parts = parse_all_url($url);
                        $hostname = $url_parts[host];
                        //  rebuild domain for localhost applications
                        if ($hostname == 'localhost') {
                            $host1 = str_replace($local, '', $url);
                        }
                        $pos = strpos($host1, "/");
                        //      on local server delete all behind the /
                        //      will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
                        //       will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
                        if ($pos) {
                            $host1 = substr($host1, 0, $pos);
                            //      build full address again, now only local domain
                        }
                        if ($hostname == 'localhost') {
                            $domain_for_db = "" . $local . "" . $host1 . "/";
                            // complete URL
                            $domain_for_db = str_replace("http://", "", $domain_for_db);
                            //$domain_for_db = $host1;
                        } else {
                            //$domain_for_db = ("$url_parts[scheme]://".$hostname."/");  // complete URL
                            $domain_for_db = $hostname;
                        }
                        if (isset($domain_arr[$domain_for_db])) {
                            $dom_id = $domain_arr[$domain_for_db];
                        } else {
                            mysqltest();
                            $sql_query = "INSERT into " . $mysql_table_prefix . "domains (domain) values ('{$domain_for_db}')";
                            $db_con->query($sql_query);
                            if ($debug && $db_con->errno) {
                                $err_row = __LINE__ - 2;
                                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                if (__FUNCTION__) {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                } else {
                                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                }
                                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                echo "<p> {$sql_query} </p>";
                                exit;
                            }
                            $dom_id = $db_con->insert_id;
                            $domain_arr[$domain_for_db] = $dom_id;
                        }
                        if (!$js_link) {
                            //  JavaScript will not deliver keywords, only links are parsed
                            reset($wordarray);
                            if ($case_sensitive == '0') {
                                foreach ($wordarray as &$value) {
                                    $value[1] = lower_ent($value[1]);
                                    $value[1] = lower_case($value[1]);
                                    //  convert keywords to lower case
                                }
                            }
                            $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords'], $url_parts);
                        } else {
                            $wordarray = '';
                        }
                        //if there are words to index, add the link to the database, get its id, and add the word + their relation
                        if (is_array($wordarray) && count($wordarray) >= $min_words_per_page) {
                            $OKtoSave = 1;
                            if ($use_white1 == '1') {
                                //  check if content of page matches ANY word in whitelist
                                $found = '0';
                                foreach ($whitelist as $key => $val1) {
                                    reset($wordarray);
                                    while ($thisword = each($wordarray)) {
                                        $word = trim($thisword[1][1]);
                                        if (strcasecmp($val1, $word) == 0) {
                                            $found = '1';
                                        }
                                    }
                                }
                                if ($found == '0') {
                                    printStandardReport('noWhitelist', $command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum--;
                                }
                            }
                            if ($use_white2 == '1') {
                                //  check if content of page matches ALL words in whitelist
                                $all = count($whitelist);
                                $found = '0';
                                $found_this = '0';
                                foreach ($whitelist as $key => $val2) {
                                    reset($wordarray);
                                    while ($thisword = each($wordarray)) {
                                        $word = trim($thisword[1][1]);
                                        if (strcasecmp($val2, $word) == 0) {
                                            $found_this = '1';
                                        }
                                    }
                                    if ($found_this != '0') {
                                        $found++;
                                        $found_this = '0';
                                    }
                                }
                                if ($found != $all) {
                                    printStandardReport('noWhitelist', $command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum--;
                                }
                            }
                            if ($use_black == '1') {
                                $found = '0';
                                //  check if content of page matches ANY string in blacklist
                                foreach ($blacklist as $key => $val3) {
                                    $met = stripos($data[fulltext], $val3);
                                    if ($met) {
                                        $found = '1';
                                    }
                                }
                                if ($found == '1') {
                                    printStandardReport('matchBlacklist', $command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum--;
                                    $url_status['black'] = 1;
                                    return $url_status;
                                }
                            }
                            //  if activated in Admin backend, create a thumbnail of this URL
                            if ($OKtoSave && $hostname != 'localhost' && $webshot) {
                                $shot = '';
                                //  will contain the png webshot
                                $img = new webshots();
                                $shot = $img->url_to_image($url);
                                if ($debug && stristr($shot, "error: #")) {
                                    $shot_warn = "<br />Unable to create the webshot because of " . $shot;
                                    printWarning($shot_warn, $command_line, $no_log);
                                } else {
                                    $shot = $db_con->real_escape_string($shot);
                                }
                            }
                            if ($md5sum == '' || $md5sum == '' && $url_status['relocate']) {
                                //  enter here for new page (unknown link) OR for new relocated URL(so it will become a new link)
                                //  title, description and fulltxt are already escaped in function clean_file();
                                $url = $db_con->real_escape_string($url);
                                // jfield says: messy char decoding earlier
                                // leaves crap here that fudges up the works
                                $title_enc = mb_detect_encoding($title);
                                if (mb_detect_encoding($title) != "UTF-8") {
                                    $title = iconv($title_enc, "UTF-8", $title);
                                }
                                $fulltxt = substr($fulltxt, 0, 100000);
                                // we've got to stop somewhere
                                $fulltxt_enc = mb_detect_encoding($fulltxt);
                                if (mb_detect_encoding($title) != "UTF-8") {
                                    $fulltxt = iconv($fulltxt_enc, "UTF-8", $fulltxt);
                                }
                                mysqltest();
                                $sql_query = "INSERT into " . $mysql_table_prefix . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level, webshot) values ('{$site_id}', '{$url}', '{$title}', left('{$desc}', 255), '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', '{$thislevel}', '{$shot}')";
                                $db_con->query($sql_query);
                                if ($debug && $db_con->errno) {
                                    $err_row = __LINE__ - 2;
                                    printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                    if (__FUNCTION__) {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                    } else {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                    }
                                    printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                    printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                    echo "<p> {$sql_query} </p>";
                                    //exit;
                                    // jfield: let's keep going
                                    return;
                                }
                                $sql_query = "SELECT link_id from " . $mysql_table_prefix . "links where url='{$url}'";
                                $result = $db_con->query($sql_query);
                                if ($debug && $db_con->errno) {
                                    $err_row = __LINE__ - 2;
                                    printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                    if (__FUNCTION__) {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                    } else {
                                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                    }
                                    printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                    printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                    echo "<p> {$sql_query} </p>";
                                    exit;
                                }
                                $row = $result->fetch_array(MYSQLI_NUM);
                                $link_id = $row[0];
                                if ($OKtoSave) {
                                    //  store link details, if not yet known (during reindex)
                                    if ($only_links) {
                                        //  extract domain of current page delivering the new links
                                        $url_parts = parse_all_url($url);
                                        $hostname = $url_parts[host];
                                        if ($hostname == 'localhost') {
                                            //  rebuild domain for localhost applications
                                            $host1 = str_replace($local, '', $url);
                                        }
                                        $pos = strpos($host1, "/");
                                        //      on local server delete all behind the /
                                        //      will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
                                        //       will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
                                        if ($pos) {
                                            $host1 = substr($host1, 0, $pos);
                                            //      build full adress again, now only local domain
                                        }
                                        if ($hostname == 'localhost') {
                                            $domain_db = "" . $local . "" . $host1 . "/";
                                            // complete URL
                                            $domain_db = str_replace("http://", "", $domain_db);
                                            //$domain_db = $host1;
                                        } else {
                                            //$domain_db = ("$url_parts[scheme]://".$hostname."/");  // complete URL
                                            $domain_db = $hostname;
                                        }
                                        //    now store all link details into db
                                        foreach ($my_links as $found_link) {
                                            //  but only if we have found a title
                                            if ($found_link[3]) {
                                                mysqltest();
                                                //     check whether URL is already known in sites table
                                                $sql_query = "SELECT title from " . $mysql_table_prefix . "link_details where link_id like '{$link_id}' and url like '%{$found_link['2']}%'";
                                                $res1 = $db_con->query($sql_query);
                                                if ($res1->num_rows == 0) {
                                                    //  must be new link
                                                    $sql_query = "INSERT into " . $mysql_table_prefix . "link_details (link_id, url, title, indexdate, domain) values ('{$link_id}', '{$found_link['2']}', '{$found_link['3']}', now(), '{$domain_db}')";
                                                    $db_con->query($sql_query);
                                                }
                                            }
                                        }
                                    }
                                    if ($debug == '2') {
                                        //  if debug mode, show details
                                        printStandardReport('newKeywords', $command_line, $no_log);
                                    }
                                    save_keywords($wordarray, $link_id, $dom_id);
                                }
                                mysqltest();
                                if ($index_media == '1' && $OKtoSave) {
                                    //   find media content only if there was no conflict with text (white and/or blacklist)
                                    include "index_media.php";
                                    //  try to find media files
                                }
                                mysqltest();
                                if ($debug == '2') {
                                    printStandardReport('indexed1', $command_line, $no_log);
                                } else {
                                    printStandardReport('indexed', $command_line, $no_log);
                                }
                            } else {
                                if ($md5sum != '' && $md5sum != $newmd5sum && $OKtoSave) {
                                    //if page has changed, start updating
                                    mysqltest();
                                    $sql_query = "SELECT link_id from " . $mysql_table_prefix . "links where url='{$url}'";
                                    $result = $db_con->query($sql_query);
                                    if ($debug && $db_con->errno) {
                                        $err_row = __LINE__ - 2;
                                        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                        if (__FUNCTION__) {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                        } else {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                        }
                                        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                        echo "<p> {$sql_query} </p>";
                                        exit;
                                    }
                                    $row = $result->fetch_array(MYSQLI_NUM);
                                    $link_id = $row[0];
                                    $sql_query = "DELETE from " . $mysql_table_prefix . "link_keyword where link_id={$link_id}";
                                    $db_con->query($sql_query);
                                    if ($debug && $db_con->errno) {
                                        $err_row = __LINE__ - 2;
                                        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                        if (__FUNCTION__) {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                        } else {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                        }
                                        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                        echo "<p> {$sql_query} </p>";
                                        exit;
                                    }
                                    if ($debug == '2') {
                                        //  if debug mode, show details
                                        printStandardReport('newKeywords', $command_line, $no_log);
                                    }
                                    save_keywords($wordarray, $link_id, $dom_id);
                                    $sql_query = "UPDATE " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level='{$thislevel}', webshot='{$shot}' where link_id='{$link_id}'";
                                    mysqltest();
                                    $db_con->query($sql_query);
                                    if ($debug && $db_con->errno) {
                                        $err_row = __LINE__ - 2;
                                        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                                        if (__FUNCTION__) {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                                        } else {
                                            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                                        }
                                        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                                        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                                        echo "<p> {$sql_query} </p>";
                                        exit;
                                    }
                                    if ($index_media == '1') {
                                        include "index_media.php";
                                        //  try to find media files
                                    }
                                    if ($debug == '2') {
                                        printStandardReport('re-indexed1', $command_line, $no_log);
                                    }
                                }
                            }
                        } else {
                            if ($js_link) {
                                printStandardReport('js_content', $command_line, $no_log);
                            } else {
                                printStandardReport('minWords', $command_line, $no_log);
                            }
                            $realnum--;
                        }
                    } else {
                        printStandardReport('link_okay', $command_line, $no_log);
                    }
                    unset($file, $title, $fulltxt, $desc);
                    $wordarray = array();
                    $data = array();
                    $seg_data = array();
                }
            }
        }
    } else {
        $deletable = 1;
        //printUrlStatus($url_status['state'], $command_line, $no_log);
    }
    mysqltest();
    if ($url_status['relocate']) {
        //  remove this relocated URL from temp table, because it is indexed now
        $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'";
        $db_con->query($sql_query);
        if ($debug && $db_con->errno) {
            $err_row = __LINE__ - 2;
            printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
            if (__FUNCTION__) {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
            } else {
                printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
            }
            printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
            printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
            echo "<p> {$sql_query} </p>";
            exit;
        }
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    //      if valid sitemap found, or canonical link, or something else, no LinkReport
    if ($smp != 1 && $OKtoIndex == 1 && $url_status['state'] == 'ok') {
        printLinksReport($numoflinks, $all_links, $command_line);
    }
    //  remove the URL, which has been indexed now, from the temp table.
    mysqltest();
    $sql_query = "DELETE from " . $mysql_table_prefix . "temp where link = '{$url}' AND id = '{$sessid}'";
    $db_con->query($sql_query);
    if ($debug && $db_con->errno) {
        $err_row = __LINE__ - 2;
        printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
        if (__FUNCTION__) {
            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
        } else {
            printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
        }
        printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
        printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
        echo "<p> {$sql_query} </p>";
        exit;
    }
    return $url_status;
}
Пример #6
0
/**
 * Fetch one URL, queue the links found on it and store (or refresh) its
 * entry and keywords in the database.
 *
 * Steps, in order:
 *  1. Ask url_status() for the state of the URL. For a relocation, the target
 *     is queued in the temp table (if not queued yet) and the state becomes
 *     "redirected", so the page itself is not indexed here.
 *  2. Unless $index_vpaths is set, URLs whose last path segment contains no
 *     dot are not indexed.
 *  3. The page is downloaded (honouring $min_delay between requests), hashed
 *     with md5 and compared with $md5sum.
 *  4. Links are written to the temp table; title/description/full text and
 *     weighted keywords are written to the links table.
 *
 * @param string     $url              URL to index.
 * @param int        $level            Level value stored with newly queued links; the page itself is stored with $level - 1.
 * @param int|string $site_id          Value written to links.site_id.
 * @param string     $md5sum           Previously stored md5 of the page, '' for an unknown page.
 * @param mixed      $domain           Not used by this function.
 * @param mixed      $indexdate        Not used by this function.
 * @param string     $sessid           Value of the temp table's id column for this run.
 * @param mixed      $can_leave_domain Passed through to url_purify() and get_links().
 * @param int        $reindex          1 when re-indexing: the md5 shortcut is skipped and unusable pages are passed to check_for_removal().
 *
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $supdomain, $index_vpaths;
    global $user_agent, $tmp_urls, $delay_time, $domain_arr;
    global $db;
    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;
    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = $db->query("SELECT link FROM " . TABLE_PREFIX . "temp WHERE link=" . $db->quote($url) . " AND id=" . $db->quote($sessid));
            echo sql_errorstring(__FILE__, __LINE__);
            if ($result) {
                $already_queued = $result->fetch() !== false;
                // Release the cursor on both paths before issuing the next statement.
                $result->closeCursor();
                // Queue the redirect target only when it is NOT in the temp table yet
                // (the previous check was inverted and re-inserted existing rows only).
                if (!$already_queued) {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($url) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                    echo sql_errorstring(__FILE__, __LINE__);
                }
            }
        }
        // Was "==" (a comparison without effect); the state is meant to be overwritten.
        $url_status['state'] = "redirected";
    }
    if (!$index_vpaths && $url_status['state'] == 'ok') {
        $url_parts = parse_url($url);
        // parse_url() omits 'path' for URLs such as "http://host".
        $base = basename(isset($url_parts['path']) ? $url_parts['path'] : '');
        if (strstr($base, '.') == false) {
            $url_status['state'] = "directory listing or default redirect";
        }
    }
    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        // Throttle: wait until at least $min_delay seconds have passed since the last fetch.
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();
        $file = '';
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false so a final line consisting of "0" is not dropped.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            // Fall back to the project's own downloader.
            $contents = getFileContents($url);
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);
        if ($reindex == 0) {
            if ($md5sum == $newmd5sum) {
                printStandardReport('md5notChanged', $command_line);
                $OKtoIndex = 0;
            } else {
                if (isDuplicateMD5($newmd5sum)) {
                    $OKtoIndex = 0;
                    printStandardReport('duplicate', $command_line);
                }
            }
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($data['nofollow'] != 1) {
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                $all_links = is_array($links) ? count($links) : 0;
                $numoflinks = 0;
                // Add every link to the temp table, unless it was already seen in this run.
                if (is_array($links)) {
                    // foreach replaces each(), which was removed in PHP 8.
                    foreach ($links as $thislink) {
                        if (!isset($tmp_urls[$thislink]) || $tmp_urls[$thislink] != 1) {
                            $tmp_urls[$thislink] = 1;
                            $numoflinks++;
                            $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($thislink) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }
            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = str_replace("\\'", "&quot;", $data['fulltext']);
                $desc = substr($data['description'], 0, 254);
                $language = substr($data['language'], 0, 2);
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                // Look up the domain id in the cache, creating the domain row on first sight.
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "domains (domain) VALUES (" . $db->quote($domain_for_db) . ")");
                    $dom_id = $db->lastInsertId();
                    $domain_arr[$domain_for_db] = $dom_id;
                }
                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                $tstamp = "'" . date("Y-m-d") . "'";
                // If there are enough words, store the link, fetch its id and save the keyword relations.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    // Quoted copies are kept in separate variables so the raw values stay intact.
                    $q_site_id = $db->quote($site_id);
                    $q_url = $db->quote($url);
                    $q_title = $db->quote($title);
                    $q_desc = $db->quote($desc);
                    $q_language = $db->quote($language);
                    $q_fulltxt = $db->quote($fulltxt);
                    $q_pageSize = $db->quote($pageSize);
                    $Qmd5sum = $db->quote($newmd5sum);
                    if ($md5sum == '') {
                        // Unknown page: create a new links row.
                        $db->exec("INSERT INTO " . TABLE_PREFIX . "links (site_id, url, title, description, language, fulltxt, indexdate, size, md5sum, level) VALUES ({$q_site_id}, {$q_url}, {$q_title}, {$q_desc}, {$q_language}, {$q_fulltxt}, {$tstamp}, {$q_pageSize}, {$Qmd5sum}, {$thislevel})");
                        $error = sql_errorstring(__FILE__, __LINE__);
                        if ($error) {
                            echo $error;
                            printStandardReport('skipped', $command_line);
                        } else {
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            save_keywords($wordarray, $link_id, $dom_id);
                            printStandardReport('indexed', $command_line);
                        }
                    } else {
                        if ($md5sum != '' && $md5sum != $newmd5sum) {
                            // Page has changed: replace its keywords and refresh the links row.
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            // Keyword relations are spread over 16 tables suffixed 0..f.
                            for ($i = 0; $i <= 15; $i++) {
                                $char = dechex($i);
                                $db->exec("DELETE FROM " . TABLE_PREFIX . "link_keyword{$char} WHERE link_id={$link_id}");
                                echo sql_errorstring(__FILE__, __LINE__);
                            }
                            save_keywords($wordarray, $link_id, $dom_id);
                            $db->exec("UPDATE " . TABLE_PREFIX . "links SET title={$q_title}, description={$q_desc}, language={$q_language}, fulltxt={$q_fulltxt}, indexdate={$tstamp}, size={$q_pageSize}, md5sum={$Qmd5sum}, level={$thislevel} WHERE link_id={$link_id}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            printStandardReport('re-indexed', $command_line);
                        }
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}
Пример #7
0
    static $aDigits = array();
    $time_start = date('H:i:s', time());
    // 10:00:00
    for ($i = 3; $i < $a; $i++) {
        for ($j = 3; $j < $b; $j++) {
            $aDigits[] = pow($i, $j);
        }
    }
    $aDigits = array_unique($aDigits);
    ini_set('date.timezone', 'Europe/Minsk');
    $time_end = date('H:i:s', time());
    // 10:00:00
    return array(count($aDigits), $time_start, $time_end);
}
if (isset($_POST["ii"])) {
    $uu = distinct_array($_POST["ii"]);
    $data = ["result" => $uu[0], "ti_start" => $uu[1], "ti_end" => $uu[2]];
    echo json_encode($data);
} else {
    ?>
	<!DOCTYPE html>
	<html>
		<head>
			<title>
			</title>
				<link rel="stylesheet" type="text/css" href="/css/1.css">
				<script type="text/javascript" src="/js/jquery-2.1.4.min.js" ></script>
				<script type="text/javascript" src="/js/func.js" ></script>
		</head>
		<body>
			Введите значение  <input id="14" style="width: 60px;"></input>
Пример #8
0
{
    static $aDigits = array();
    echo $time6 = date('H:i:s', time()) . "<br>";
    // 10:00:00
    for ($i = 3; $i < $a; $i++) {
        for ($j = 3; $j < $b; $j++) {
            $aDigits[] = pow($i, $j);
        }
    }
    $aDigits = array_unique($aDigits);
    ini_set('date.timezone', 'Europe/Minsk');
    echo $time6 = date('H:i:s', time()) . "<br>";
    // 10:00:00
    return count($aDigits);
}
// Call distinct_array() with the arguments 100 and 136 and print its return value.
// NOTE(review): presumably this is the function whose body ends just above (it
// returns count($aDigits)); its header is not visible here - confirm.
echo distinct_array(100, 136);
/*Задача 5

Если мы возьмем 47, перевернем его и сложим, получится число 121 — палиндром.

Если взять 349 и проделать над ним эту операцию три раза, то тоже получится палиндром:
349 + 943 = 1292
1292 + 2921 = 4213
4213 + 3124 = 7337

Найдите количество положительных натуральных чисел меньших 13332 таких,
что из них нельзя получить палиндром за 50 или менее применений описанной операции.*/
function Count_poly()
{
    for ($jj = 10; $jj < 13332; $jj++) {
        $ss = [$jj];
Пример #9
-6
/**
 * Fetches one URL, decides whether it may be indexed, and stores the page,
 * its keywords and its outgoing links.
 *
 * Steps, in order: optional sitemap.xml lookup, relocation handling, download
 * (honouring the configured minimum delay), optional conversion to UTF-8,
 * md5 / whitelist / blacklist / duplicate-content checks, collection of new
 * links into the temp table, and finally insert or update of the links row.
 *
 * Reads and writes several globals (see the `global` statement); in particular
 * $url_status, $charSet, $tmp_urls, $delay_time, $domain_arr, $smp, $realnum
 * and $dup_url are assigned here.
 *
 * NOTE(review): every SQL statement below is built by string interpolation of
 * crawled values (URLs, title, full text). Whether url_purify(), get_links(),
 * get_sitemap() and clean_file() already escape their output is not visible
 * from this function — confirm, or escape before interpolating.
 *
 * @param string $url              URL to fetch and index.
 * @param int    $level            Level stored with newly found links in the temp table;
 *                                 the page itself is stored with $level - 1.
 * @param mixed  $site_id          Value written to the site_id column of a new links row.
 * @param string $md5sum           Checksum to compare the fetched content against; an empty
 *                                 string makes the page be inserted as a new row.
 * @param mixed  $domain           Not used by this function.
 * @param mixed  $indexdate        Not used by this function.
 * @param mixed  $sessid           Value written to the id column of the temp table.
 * @param mixed  $can_leave_domain Passed through to url_purify() and get_links().
 * @param int    $reindex          1 when the page is processed as part of a re-index.
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $tmp_urls, $delay_time, $domain_arr, $charSet, $url_status, $whitelist, $blacklist, $supdomain, $smp, $realnum, $dup_url, $entities, $command_line;
    if (DEBUG == '0') {
        error_reporting(0);
    } else {
        // Fatal errors only; otherwise a non-existing sitemap.xml would always cause a warning message.
        error_reporting(E_ERROR);
    }
    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if ($smp != 1 && Configure::read('follow_sitemap') == 1) {
        // No valid sitemap is known yet and the admin settings allow looking for one.
        $tmp_urls = get_temp_urls($sessid);
        // Reduce the URL to the folder in which sitemap.xml is expected.
        $url2 = remove_sessid(convert_url($url));
        $sitemap_name = "sitemap.xml";
        $host = parse_url($url2);
        $hostname = $host['host'];
        if ($hostname == 'localhost') {
            // On the local server keep only the first path segment below the configured base address.
            $host1 = str_replace(Configure::read('local'), '', $url2);
            $pos = strpos($host1, "/");
            if ($pos) {
                $host1 = substr($host1, 0, $pos);
            }
            $url2 = Configure::read('local') . $host1;
        } else {
            $url2 = "{$host['scheme']}://{$hostname}";
        }
        $input_file = "{$url2}/{$sitemap_name}";
        // The handle is only used to find out whether the sitemap can be opened.
        if ($handle = fopen($input_file, "r")) {
            $links = get_sitemap($input_file, TABLE_PREFIX);
            if ($links != '') {
                foreach ($links as $sitemap_link) {
                    // Skip links that are already known as a site url.
                    $result = mysql_query("select url from " . TABLE_PREFIX . "sites where url like '{$sitemap_link}%'");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                    $rows = mysql_num_rows($result);
                    if ($rows == '0') {
                        // New link: queue it in the temp table.
                        mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$sitemap_link}', '{$level}', '{$sessid}')");
                        if (DEBUG > '0') {
                            echo mysql_error();
                        }
                    }
                    // Release every result set, not just the last one.
                    clean_resource($result);
                }
                // A valid sitemap was found and its new links are stored.
                $smp = '1';
            }
            unset($links, $input_file);
            fclose($handle);
        }
    }

    if (strstr($url_status['state'], "Relocation")) {
        $url = str_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = mysql_query("select link from " . TABLE_PREFIX . "temp where link='{$url}' && id = '{$sessid}'");
            if (DEBUG > '0') {
                echo mysql_error();
            }
            $rows = mysql_num_rows($result);
            if ($rows == 0) {
                mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
            }
            clean_resource($result);
        }
        // NOTE(review): a no-op comparison `$url_status['state'] == "redirected";` used to
        // stand here. It was dropped instead of being turned into an assignment so that the
        // relocation state is still what printUrlStatus() receives below.
    }

    ini_set("user_agent", Configure::read('user_agent'));
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        // Wait until at least min_delay seconds have passed since the previous download.
        if (time() - $delay_time < Configure::read('min_delay')) {
            sleep(Configure::read('min_delay') - (time() - $delay_time));
        }
        $delay_time = time();
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $file = '';
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare with false so that a falsy last line (e.g. "0") does not end the loop early.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                unset($buffer);
                // Close only a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error || Configure::read('utf8') == 1) {
            // Fetch again through getFileContents(), which also reports the charset.
            unset($file);
            $contents = getFileContents($url);
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            // Non-text content is passed through the converter first.
            $file = extract_text($file, $url_status['content']);
            if ($file == 'ERROR') {
                // Conversion failed: suppress further indexing.
                $OKtoIndex = 0;
                $file_read_error = 1;
            }
        }
        if (Configure::read('utf8') == 1) {
            // Translate the file into UTF-8.
            $charSet = $contents['charset'];
            if ($charSet == '') {
                // No charset reported: fall back to the configured one.
                $charSet = Configure::read('home_charset');
            }
            $charSet = strtoupper(trim($charSet));
            // Explicit false check: a charset that starts with "8859" matches at offset 0.
            if (strpos($charSet, '8859') !== false) {
                $conv_file = html_entity_decode($file);
            } else {
                $conv_file = $file;
            }
            if ($charSet != "UTF-8") {
                // First try iconv; an empty result makes us fall back to class ConvertCharset.
                $iconv_file = iconv($charSet, "UTF-8", $conv_file);
                if (trim($iconv_file) == "") {
                    $charSet = str_ireplace('iso-', '', $charSet);
                    $charSet = str_ireplace('iso', '', $charSet);
                    $NewEncoding = new ConvertCharset($charSet, "utf-8");
                    $NewFileOutput = $NewEncoding->Convert($conv_file);
                    $file = $NewFileOutput;
                } else {
                    $file = $iconv_file;
                }
                unset($conv_file, $iconv_file, $NewEncoding, $NewFileOutput);
            }
        }
        $data = clean_file($file, $url, $url_status['content']);
        $newmd5sum = md5($data['content']);
        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
            $realnum--;
        } else {
            if (Configure::read('use_white') == '1') {
                // The raw page must contain at least one whitelist word.
                if (!index_url_contains_any($file, $whitelist)) {
                    printStandardReport('noWhitelist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            if (Configure::read('use_black') == '1') {
                // The raw page must not contain any blacklist word.
                if (index_url_contains_any($file, $blacklist)) {
                    printStandardReport('matchBlacklist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            // Check for duplicate page content.
            $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where md5sum='{$newmd5sum}'");
            if (DEBUG > '0') {
                echo mysql_error();
            }
            if (mysql_num_rows($result) > 0) {
                // Display a warning and the urls that carry the same content.
                printStandardReport('duplicate', $command_line);
                $num_rows = mysql_num_rows($result);
                for ($i = 0; $i < $num_rows; $i++) {
                    $link_id = mysql_result($result, $i, "link_id");
                    $res = mysql_query("select url from " . TABLE_PREFIX . "links where link_id like '{$link_id}'");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                    $row = mysql_fetch_row($res);
                    $dup_url = $row[0];
                    clean_resource($res);
                    printDupReport($dup_url, $command_line);
                }
                if (Configure::read('dup_content') == '0') {
                    // Pages with duplicate content are not to be indexed / re-indexed.
                    $OKtoIndex = 0;
                    $realnum--;
                } else {
                    // Re-enables indexing even if a list check above cleared the flag;
                    // both lists are checked again against the cleaned full text below.
                    $OKtoIndex = 1;
                }
            }
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            if (Configure::read('use_white') == '1') {
                // Same whitelist check, this time against the cleaned full text.
                if (!index_url_contains_any($data['fulltext'], $whitelist)) {
                    printStandardReport('noWhitelist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            if (Configure::read('use_black') == '1') {
                // Same blacklist check, this time against the cleaned full text.
                if (index_url_contains_any($data['fulltext'], $blacklist)) {
                    printStandardReport('matchBlacklist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($smp != 1) {
                // Links are only harvested from the page when no valid sitemap is in use.
                if ($data['nofollow'] != 1) {
                    $links = get_links($file, $url, $can_leave_domain, $data['base']);
                    $links = distinct_array($links);
                    $all_links = count($links);
                    if ($all_links > Configure::read('max_links')) {
                        $all_links = Configure::read('max_links');
                    }
                    $links = array_slice($links, 0, Configure::read('max_links'));
                    if ($realnum < Configure::read('max_links')) {
                        $numoflinks = 0;
                        // Add links to the temp table, but only those not seen before.
                        if (is_array($links)) {
                            if (DEBUG == '2') {
                                printStandardReport('newLinks', $command_line);
                            }
                            foreach ($links as $link) {
                                if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                                    $tmp_urls[$link] = 1;
                                    $numoflinks++;
                                    if (DEBUG == '2') {
                                        printNewLinks($link);
                                    }
                                    if ($numoflinks <= Configure::read('max_links')) {
                                        mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$link}', '{$level}', '{$sessid}')");
                                    }
                                    if (DEBUG > '0') {
                                        echo mysql_error();
                                    }
                                }
                            }
                        }
                    }
                } else {
                    printStandardReport('noFollow', $command_line);
                }
                unset($file);
            }
            if ($OKtoIndex == 1) {
                if (Configure::read('link_check') == 0) {
                    $title = $data['title'];
                    $host = $data['host'];
                    $path = $data['path'];
                    $fulltxt = $data['fulltext'];
                    $desc = substr($data['description'], 0, 254);
                    $url_parts = parse_url($url);
                    $domain_for_db = $url_parts['host'];
                    // Look the domain id up in the in-memory cache, creating the row on first sight.
                    if (isset($domain_arr[$domain_for_db])) {
                        $dom_id = $domain_arr[$domain_for_db];
                    } else {
                        mysql_query("insert into " . TABLE_PREFIX . "domains (domain) values ('{$domain_for_db}')");
                        $dom_id = mysql_insert_id();
                        $domain_arr[$domain_for_db] = $dom_id;
                    }
                    $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords'], $url_parts);
                    // Only pages with enough words are stored together with their keywords.
                    if (is_array($wordarray) && count($wordarray) > Configure::read('min_words_per_page')) {
                        if ($md5sum == '') {
                            // No previous checksum: insert a new row and read its id back.
                            mysql_query("insert into " . TABLE_PREFIX . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                            if (DEBUG > '0') {
                                echo mysql_error();
                            }
                            $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'");
                            if (DEBUG > '0') {
                                echo mysql_error();
                            }
                            $row = mysql_fetch_row($result);
                            $link_id = $row[0];
                            clean_resource($result);
                            if (DEBUG == '2') {
                                printStandardReport('newKeywords', $command_line);
                            }
                            save_keywords($wordarray, $link_id, $dom_id);
                            if (DEBUG == '2') {
                                printStandardReport('indexed1', $command_line);
                            } else {
                                printStandardReport('indexed', $command_line);
                            }
                        } else {
                            if ($md5sum != '' && $md5sum != $newmd5sum) {
                                // The page has changed: replace its keywords and update the row.
                                $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'");
                                if (DEBUG > '0') {
                                    echo mysql_error();
                                }
                                $row = mysql_fetch_row($result);
                                $link_id = $row[0];
                                // Keyword relations live in 16 tables suffixed with the hex digits 0..f.
                                for ($i = 0; $i <= 15; $i++) {
                                    $char = dechex($i);
                                    mysql_query("delete from " . TABLE_PREFIX . "link_keyword{$char} where link_id={$link_id}");
                                    if (DEBUG > '0') {
                                        echo mysql_error();
                                    }
                                }
                                clean_resource($result);
                                if (DEBUG == '2') {
                                    printStandardReport('newKeywords', $command_line);
                                }
                                save_keywords($wordarray, $link_id, $dom_id);
                                $query = "update " . TABLE_PREFIX . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                                mysql_query($query);
                                if (DEBUG > '0') {
                                    echo mysql_error();
                                }
                                if (DEBUG == '2') {
                                    printStandardReport('re-indexed1', $command_line);
                                } else {
                                    printStandardReport('re-indexed', $command_line);
                                }
                            }
                        }
                    } else {
                        printStandardReport('minWords', $command_line);
                        $realnum--;
                    }
                } else {
                    printStandardReport('link_okay', $command_line);
                }
                unset($wordarray, $title, $fulltxt, $desc);
            }
        }
    } else {
        // Anything but state 'ok' (including relocations) makes the url a removal candidate.
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    if ($smp != 1) {
        // With a valid sitemap there is no link report.
        printLinksReport($numoflinks, $all_links, $command_line);
    }
}

/**
 * Tells whether $text contains at least one entry of $words, ignoring case.
 *
 * @param string $text  Text to search in.
 * @param array  $words Words to look for.
 * @return bool True as soon as one word is found, false otherwise.
 */
function index_url_contains_any($text, $words)
{
    foreach ($words as $word) {
        // An empty entry can never be a meaningful match.
        if ((string) $word === '') {
            continue;
        }
        // Compare with false explicitly: a hit at offset 0 is still a hit.
        if (stripos($text, $word) !== false) {
            return true;
        }
    }
    return false;
}