Ejemplo n.º 1
0
/**
 * Fetches a single URL, queues the links found on it and stores or updates
 * the page in the search index.
 *
 * Processing order:
 *  1. url_status() is asked for the state of $url. A state containing
 *     "Relocation" queues the purified redirect target in the temp table
 *     (unless this session already has it) and marks the URL as redirected.
 *  2. For state 'ok' the page is downloaded (respecting $min_delay), its
 *     MD5 is compared with $md5sum and checked with isDuplicateMD5().
 *  3. Unless the page asks for nofollow, every link not yet present in
 *     $tmp_urls is inserted into the temp table.
 *  4. Unless the page asks for noindex, and if it has more than
 *     $min_words_per_page weighted words, a links row is inserted
 *     ($md5sum == '') or updated ($md5sum differs from the new MD5) and
 *     its keywords are saved through save_keywords().
 *  5. On a re-index run, pages that are not 'ok' or carry noindex are
 *     handed to check_for_removal().
 *
 * Progress is reported through the print*Report() helpers; MySQL errors are
 * echoed, not thrown.
 *
 * @param string $url              URL to fetch and index.
 * @param int    $level            Value written to the temp.level column for newly
 *                                 discovered links; the page itself is stored with
 *                                 level $level - 1.
 * @param mixed  $site_id          Value written to links.site_id.
 * @param string $md5sum           MD5 stored by a previous run, or '' when the URL
 *                                 has not been indexed yet.
 * @param mixed  $domain           Not used by this function.
 * @param mixed  $indexdate        Not used by this function.
 * @param mixed  $sessid           Session id written to / matched against temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify() and get_links().
 * @param int    $reindex          1 on a re-index run, which enables the removal check.
 *
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $command_line, $min_delay, $min_words_per_page;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;

    $deletable = 0;
    $all_links = 0;
    $numoflinks = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    // URLs, links and the session id originate from crawled pages / callers,
    // so they are escaped before being placed in SQL text.
    $sqlLevel = mysql_real_escape_string($level);
    $sqlSessid = mysql_real_escape_string($sessid);
    $tempTable = $mysql_table_prefix . "temp";
    $linksTable = $mysql_table_prefix . "links";

    if (strstr($url_status['state'], "Relocation")) {
        // From here on $url is the redirect target with all spaces removed.
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $sqlTarget = mysql_real_escape_string($url);
            $result = mysql_query("select link from " . $tempTable . " where link='{$sqlTarget}' and id = '{$sqlSessid}'");
            echo mysql_error();
            $rows = $result ? mysql_num_rows($result) : 0;
            if ($rows == 0) {
                mysql_query("insert into " . $tempTable . " (link, level, id) values ('{$sqlTarget}', '{$sqlLevel}', '{$sqlSessid}')");
                echo mysql_error();
            }
        }
        // The original used '==' here, which compared and discarded the result.
        $url_status['state'] = "redirected";
    }

    ini_set("user_agent", $user_agent);

    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Throttle: wait until at least $min_delay seconds have passed since
        // the previous fetch. The elapsed time is sampled once so that the
        // sleep() argument cannot drift between two time() calls.
        $elapsed = time() - $delay_time;
        if ($elapsed < $min_delay) {
            sleep($min_delay - $elapsed);
        }
        $delay_time = time();

        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            // Fallback branch taken when the version check above reports a
            // PHP older than 4.3.0: read the stream line by line.
            $file = '';
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false so a line consisting of "0" does not end the read.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        // Size in kilobytes with two decimals, as stored in links.size.
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);

        $newmd5sum = md5($file);
        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
        } elseif (isDuplicateMD5($newmd5sum)) {
            $OKtoIndex = 0;
            printStandardReport('duplicate', $command_line);
        }

        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = distinct_array(get_links($file, $url, $can_leave_domain, $data['base']));
                // Queue every link this crawl has not seen yet ($tmp_urls is the seen-set).
                if (is_array($links)) {
                    $all_links = count($links);
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            $sqlLink = mysql_real_escape_string($link);
                            mysql_query("insert into " . $tempTable . " (link, level, id) values ('{$sqlLink}', '{$sqlLevel}', '{$sqlSessid}')");
                            echo mysql_error();
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                // NOTE(review): $title, $desc and $fulltxt are interpolated exactly as
                // clean_file() returned them; whether clean_file() already escapes them
                // is not visible here - confirm before adding escaping, to avoid
                // double-escaping stored text.
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = $data['fulltext'];
                $desc = substr($data['description'], 0, 254);

                // Resolve (or create and cache) the id of the page's domain.
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $sqlDomain = mysql_real_escape_string($domain_for_db);
                    mysql_query("insert into " . $mysql_table_prefix . "domains (domain) values ('{$sqlDomain}')");
                    $dom_id = mysql_insert_id();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);

                // Only pages with enough words are written to the index.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    $sqlUrl = mysql_real_escape_string($url);
                    if ($md5sum == '') {
                        // First time this URL is indexed: create the links row.
                        $sqlSiteId = mysql_real_escape_string($site_id);
                        mysql_query("insert into " . $linksTable . " (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$sqlSiteId}', '{$sqlUrl}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                        echo mysql_error();
                        $result = mysql_query("select link_id from " . $linksTable . " where url='{$sqlUrl}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        save_keywords($wordarray, $link_id, $dom_id);
                        printStandardReport('indexed', $command_line);
                    } elseif ($md5sum != $newmd5sum) {
                        // Page content changed: drop old keyword relations, then refresh the row.
                        $result = mysql_query("select link_id from " . $linksTable . " where url='{$sqlUrl}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        // Keyword relations are spread over 16 tables: link_keyword0 .. link_keywordf.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            mysql_query("delete from " . $mysql_table_prefix . "link_keyword{$char} where link_id={$link_id}");
                            echo mysql_error();
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        mysql_query("update " . $linksTable . " set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}");
                        echo mysql_error();
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        // Anything that is not 'ok' (errors, redirects, ...) is a removal candidate.
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    printLinksReport($numoflinks, $all_links, $command_line);
}
Ejemplo n.º 2
0
/**
 * Fetches a single URL, queues the links found on it and stores or updates
 * the page in the search index, using the global $db connection object.
 *
 * Processing order:
 *  1. url_status() is asked for the state of $url. A state containing
 *     "Relocation" queues the purified redirect target in the temp table
 *     (unless this session already has it) and marks the URL as redirected.
 *  2. When $index_vpaths is off, an 'ok' URL whose last path segment has no
 *     dot is re-labelled "directory listing or default redirect" and is
 *     therefore not indexed.
 *  3. For state 'ok' the page is downloaded (respecting $min_delay). Unless
 *     this is a re-index run, an unchanged MD5 or a duplicate MD5 stops
 *     further processing.
 *  4. Unless the page asks for nofollow, every link not yet present in
 *     $tmp_urls is inserted into the temp table.
 *  5. Unless the page asks for noindex, and if it has more than
 *     $min_words_per_page weighted words, a links row is inserted
 *     ($md5sum == '') or updated ($md5sum differs from the new MD5) and
 *     its keywords are saved through save_keywords().
 *  6. On a re-index run, pages that are not 'ok' or carry noindex are
 *     handed to check_for_removal().
 *
 * Progress is reported through the print*Report() helpers; database errors
 * are echoed via sql_errorstring().
 *
 * @param string $url              URL to fetch and index.
 * @param int    $level            Value written to the temp.level column for newly
 *                                 discovered links; the page itself is stored with
 *                                 level $level - 1.
 * @param mixed  $site_id          Value written to links.site_id.
 * @param string $md5sum           MD5 stored by a previous run, or '' when the URL
 *                                 has not been indexed yet.
 * @param mixed  $domain           Not used by this function.
 * @param mixed  $indexdate        Not used by this function.
 * @param mixed  $sessid           Session id written to / matched against temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify() and get_links().
 * @param int    $reindex          1 on a re-index run: skips the MD5 short-circuits
 *                                 and enables the removal check.
 *
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $command_line, $min_delay, $min_words_per_page, $index_vpaths;
    global $user_agent, $tmp_urls, $delay_time, $domain_arr;
    global $db;

    $deletable = 0;
    $all_links = 0;
    $numoflinks = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if (strstr($url_status['state'], "Relocation")) {
        // From here on $url is the redirect target with all spaces removed.
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = $db->query("SELECT link FROM " . TABLE_PREFIX . "temp WHERE link=" . $db->quote($url) . " AND id=" . $db->quote($sessid));
            echo sql_errorstring(__FILE__, __LINE__);
            $alreadyQueued = false;
            if ($result) {
                $alreadyQueued = ($result->fetch() !== false);
                // Release the cursor on both the hit and the miss path.
                $result->closeCursor();
            }
            // Queue the target only when this session does NOT have it yet.
            // The original inserted when a row was found, i.e. it never queued
            // a new redirect target and duplicated existing ones.
            if (!$alreadyQueued) {
                $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($url) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                echo sql_errorstring(__FILE__, __LINE__);
            }
        }
        // The original used '==' here, which compared and discarded the result.
        $url_status['state'] = "redirected";
    }

    if (!$index_vpaths && $url_status['state'] == 'ok') {
        // Without virtual-path indexing, a last path segment lacking a dot is
        // not treated as a document.
        $url_parts = parse_url($url);
        $base = basename(isset($url_parts['path']) ? $url_parts['path'] : '');
        if (strpos($base, '.') === false) {
            $url_status['state'] = "directory listing or default redirect";
        }
    }

    ini_set("user_agent", $user_agent);

    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Throttle: wait until at least $min_delay seconds have passed since
        // the previous fetch. The elapsed time is sampled once so that the
        // sleep() argument cannot drift between two time() calls.
        $elapsed = time() - $delay_time;
        if ($elapsed < $min_delay) {
            sleep($min_delay - $elapsed);
        }
        $delay_time = time();

        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            // Fallback branch taken when the version check above reports a
            // PHP older than 4.3.0: read the stream line by line.
            $file = '';
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false so a line consisting of "0" does not end the read.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        // Size in kilobytes with two decimals, as stored in links.size.
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);

        $newmd5sum = md5($file);
        // The unchanged/duplicate short-circuits apply to normal runs only.
        if ($reindex == 0) {
            if ($md5sum == $newmd5sum) {
                printStandardReport('md5notChanged', $command_line);
                $OKtoIndex = 0;
            } elseif (isDuplicateMD5($newmd5sum)) {
                $OKtoIndex = 0;
                printStandardReport('duplicate', $command_line);
            }
        }

        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = distinct_array(get_links($file, $url, $can_leave_domain, $data['base']));
                // Queue every link this crawl has not seen yet ($tmp_urls is the seen-set).
                if (is_array($links)) {
                    $all_links = count($links);
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($link) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                // Backslash-escaped single quotes in the full text are stored as &quot;.
                $fulltxt = str_replace("\\'", "&quot;", $data['fulltext']);
                $desc = substr($data['description'], 0, 254);
                $language = substr($data['language'], 0, 2);

                // Resolve (or create and cache) the id of the page's domain.
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "domains (domain) VALUES (" . $db->quote($domain_for_db) . ")");
                    $dom_id = $db->lastInsertId();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                // Index date as a ready-to-embed SQL literal, e.g. '2024-01-31'.
                $tstamp = "'" . date("Y-m-d") . "'";

                // Only pages with enough words are written to the index.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    // Quoted copies live in their own variables so the function
                    // parameters keep their raw values.
                    $qSiteId = $db->quote($site_id);
                    $qUrl = $db->quote($url);
                    $qTitle = $db->quote($title);
                    $qDesc = $db->quote($desc);
                    $qLanguage = $db->quote($language);
                    $qFulltxt = $db->quote($fulltxt);
                    $qPageSize = $db->quote($pageSize);
                    $qMd5sum = $db->quote($newmd5sum);

                    if ($md5sum == '') {
                        // First time this URL is indexed: create the links row.
                        $db->exec("INSERT INTO " . TABLE_PREFIX . "links (site_id, url, title, description, language, fulltxt, indexdate, size, md5sum, level) VALUES ({$qSiteId}, {$qUrl}, {$qTitle}, {$qDesc}, {$qLanguage}, {$qFulltxt}, {$tstamp}, {$qPageSize}, {$qMd5sum}, {$thislevel})");
                        $error = sql_errorstring(__FILE__, __LINE__);
                        if ($error) {
                            echo $error;
                            printStandardReport('skipped', $command_line);
                        } else {
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$qUrl}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            save_keywords($wordarray, $link_id, $dom_id);
                            printStandardReport('indexed', $command_line);
                        }
                    } elseif ($md5sum != $newmd5sum) {
                        // Page content changed: drop old keyword relations, then refresh the row.
                        $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$qUrl}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        $row = $result->fetch();
                        $link_id = $row[0];
                        $result->closeCursor();
                        // Keyword relations are spread over 16 tables: link_keyword0 .. link_keywordf.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            $db->exec("DELETE FROM " . TABLE_PREFIX . "link_keyword{$char} WHERE link_id={$link_id}");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $db->exec("UPDATE " . TABLE_PREFIX . "links SET title={$qTitle}, description={$qDesc}, language={$qLanguage}, fulltxt={$qFulltxt}, indexdate={$tstamp}, size={$qPageSize}, md5sum={$qMd5sum}, level={$thislevel} WHERE link_id={$link_id}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        // Anything that is not 'ok' (errors, redirects, directory URLs, ...) is a removal candidate.
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    printLinksReport($numoflinks, $all_links, $command_line);
}
Ejemplo n.º 3
-6
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $tmp_urls, $delay_time, $domain_arr, $charSet, $url_status, $whitelist, $blacklist, $supdomain, $smp, $realnum, $dup_url, $entities, $command_line;
    if (DEBUG == '0') {
        error_reporting(0);
    } else {
        error_reporting(E_ERROR);
        //  otherwise  a non existing siemap.xml  would always cause a warning message
    }
    $needsReindex = 1;
    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;
    if ($smp != 1 && Configure::read('follow_sitemap') == 1) {
        //  enter here if we don't already know a valid sitemap and if admin settings allowed us to do so
        $tmp_urls = get_temp_urls($sessid);
        //  reload previous temp
        $url2 = remove_sessid(convert_url($url));
        // get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder
        //                Configure::read('local') = "http://localhost/publizieren/";   //  your base adress for your local server
        $sitemap_name = "sitemap.xml";
        //  could be individualized
        $host = parse_url($url2);
        $hostname = $host[host];
        if ($hostname == 'localhost') {
            $host1 = str_replace(Configure::read('local'), '', $url2);
        }
        $pos = strpos($host1, "/");
        //      on local server delete all behind the /
        if ($pos) {
            $host1 = substr($host1, 0, $pos);
        }
        //      build full adress again, now only until host
        if ($hostname == 'localhost') {
            $url2 = Configure::read('local') . $host1;
        } else {
            $url2 = "{$host['scheme']}://{$hostname}";
        }
        $input_file = "{$url2}/{$sitemap_name}";
        // create path to sitemap
        if ($handle = fopen($input_file, "r")) {
            // happy times, we found a new sitemap
            $links = get_sitemap($input_file, TABLE_PREFIX);
            // now extract links from sitemap.xml
            if ($links != '') {
                //  if links were extracted from sitemap.xml
                reset($links);
                while ($thislink = each($links)) {
                    //  check if we already know this link as a site url
                    $result = mysql_query("select url from " . TABLE_PREFIX . "sites where url like '{$thislink['1']}%'");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                    $rows = mysql_num_rows($result);
                    if ($rows == '0') {
                        // for all new links: save in temp table
                        mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')");
                        if (DEBUG > '0') {
                            echo mysql_error();
                        }
                    }
                }
                clean_resource($result);
                $smp = '1';
                //     there was a valid sitemap and we stored the new links
            }
            unset($links, $input_file);
            fclose($handle);
        }
    }
    if (strstr($url_status['state'], "Relocation")) {
        $url = eregi_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = mysql_query("select link from " . TABLE_PREFIX . "temp where link='{$url}' && id = '{$sessid}'");
            if (DEBUG > '0') {
                echo mysql_error();
            }
            $rows = mysql_num_rows($result);
            if ($rows == 0) {
                mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
            }
            clean_resource($result);
        }
        $url_status['state'] == "redirected";
    }
    ini_set("user_agent", Configure::read('user_agent'));
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        if (time() - $delay_time < Configure::read('min_delay')) {
            sleep(Configure::read('min_delay') - (time() - $delay_time));
        }
        $delay_time = time();
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                while ($buffer = @fgets($fl, 4096)) {
                    $file .= $buffer;
                }
                unset($buffer);
            } else {
                $file_read_error = 1;
            }
            fclose($fl);
        }
        if ($file_read_error || Configure::read('utf8') == 1) {
            unset($file);
            $contents = getFileContents($url);
            // parse_url to get charset
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
            //for DOCs, PDFs etc we need special converter
            if ($file == 'ERROR') {
                //      if error, suppress further indexing
                $OKtoIndex = 0;
                $file_read_error = 1;
            }
        }
        if (Configure::read('utf8') == 1) {
            //   enter here if file should be translated into utf-8
            $charSet = $contents['charset'];
            if ($charSet == '') {
                // if we did not find any charset, we will use our own
                $charSet = Configure::read('home_charset');
            }
            $charSet = strtoupper(trim($charSet));
            if (strpos($charSet, '8859')) {
                $conv_file = html_entity_decode($file);
            } else {
                $conv_file = $file;
                //  pure code
            }
            if ($charSet != "UTF-8") {
                //  enter here only, if site / file is not jet UTF-8 coded
                $iconv_file = iconv($charSet, "UTF-8", $conv_file);
                //      if installed, first try to use PHP function iconv
                if (trim($iconv_file) == "") {
                    // iconv is not installed or input charSet not available. We need to use class ConvertCharset
                    $charSet = str_ireplace('iso-', '', $charSet);
                    $charSet = str_ireplace('iso', '', $charSet);
                    $NewEncoding = new ConvertCharset($charSet, "utf-8");
                    $NewFileOutput = $NewEncoding->Convert($conv_file);
                    $file = $NewFileOutput;
                } else {
                    $file = $iconv_file;
                }
                unset($conv_file, $iconv_file, $NewEncoding, $NewFileOutput);
            }
        }
        $data = clean_file($file, $url, $url_status['content']);
        $newmd5sum = md5($data['content']);
        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
            $realnum--;
        } else {
            if (Configure::read('use_white') == '1') {
                $found = '0';
                //  check if content of page matches any word in whitelist
                foreach ($whitelist as $key => $value) {
                    $met = stripos($file, $value);
                    if ($met) {
                        $found = '1';
                    }
                }
                if ($found == '0') {
                    printStandardReport('noWhitelist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            if (Configure::read('use_black') == '1') {
                $found = '0';
                //  check if content of page matches any word in blacklist
                foreach ($blacklist as $key => $value) {
                    $met = stripos($file, $value);
                    if ($met) {
                        $found = '1';
                    }
                }
                if ($found == '1') {
                    printStandardReport('matchBlacklist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            //     check for duplicate page content
            $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where md5sum='{$newmd5sum}'");
            if (DEBUG > '0') {
                echo mysql_error();
            }
            if (mysql_num_rows($result) > 0) {
                //  display warning message and urls with duplicate content
                printStandardReport('duplicate', $command_line);
                $num_rows = mysql_num_rows($result);
                for ($i = 0; $i < $num_rows; $i++) {
                    $link_id = mysql_result($result, $i, "link_id");
                    $num = $i + 1;
                    $res = mysql_query("select url from " . TABLE_PREFIX . "links where link_id like '{$link_id}'");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                    $row = mysql_fetch_row($res);
                    $dup_url = $row[0];
                    clean_resource($res);
                    printDupReport($dup_url, $command_line);
                }
                if (Configure::read('dup_content') == '0') {
                    //  enter here, if pages with duplicate content should not be indexed/re-indexed
                    $OKtoIndex = 0;
                    $realnum--;
                } else {
                    $OKtoIndex = 1;
                }
            }
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $urlparts = parse_url($url);
            $newdomain = $urlparts['host'];
            $type = 0;
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            if (Configure::read('use_white') == '1') {
                $found = '0';
                //  check if content of page matches any word in whitelist
                foreach ($whitelist as $key => $value) {
                    $met = stripos($data[fulltext], $value);
                    if ($met) {
                        $found = '1';
                    }
                }
                if ($found == '0') {
                    printStandardReport('noWhitelist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            if (Configure::read('use_black') == '1') {
                $found = '0';
                //  check if content of page matches any word in blacklist
                foreach ($blacklist as $key => $value) {
                    $met = stripos($data[fulltext], $value);
                    if ($met) {
                        $found = '1';
                    }
                }
                if ($found == '1') {
                    printStandardReport('matchBlacklist', $command_line);
                    $OKtoIndex = 0;
                    $realnum--;
                }
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($smp != 1) {
                if ($data['nofollow'] != 1) {
                    $links = get_links($file, $url, $can_leave_domain, $data['base']);
                    $links = distinct_array($links);
                    $all_links = count($links);
                    if ($all_links > Configure::read('max_links')) {
                        $all_links = Configure::read('max_links');
                    }
                    $links = array_slice($links, 0, Configure::read('max_links'));
                    if ($realnum < Configure::read('max_links')) {
                        $numoflinks = 0;
                        //if there are any, add to the temp table, but only if there isnt such url already
                        if (is_array($links)) {
                            reset($links);
                            if (DEBUG == '2') {
                                //  if debug mode, show details
                                printStandardReport('newLinks', $command_line);
                            }
                            while ($thislink = each($links)) {
                                if ($tmp_urls[$thislink[1]] != 1) {
                                    $tmp_urls[$thislink[1]] = 1;
                                    $numoflinks++;
                                    if (DEBUG == '2') {
                                        $act_link = $thislink[1];
                                        printNewLinks($act_link);
                                    }
                                    if ($numoflinks <= Configure::read('max_links')) {
                                        mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')");
                                    }
                                    if (DEBUG > '0') {
                                        echo mysql_error();
                                    }
                                }
                            }
                        }
                    }
                } else {
                    printStandardReport('noFollow', $command_line);
                }
                unset($file);
            }
            if ($OKtoIndex == 1) {
                if (Configure::read('link_check') == 0) {
                    $title = $data['title'];
                    $host = $data['host'];
                    $path = $data['path'];
                    $fulltxt = $data['fulltext'];
                    $desc = substr($data['description'], 0, 254);
                    $url_parts = parse_url($url);
                    $domain_for_db = $url_parts['host'];
                    if (isset($domain_arr[$domain_for_db])) {
                        $dom_id = $domain_arr[$domain_for_db];
                    } else {
                        mysql_query("insert into " . TABLE_PREFIX . "domains (domain) values ('{$domain_for_db}')");
                        $dom_id = mysql_insert_id();
                        $domain_arr[$domain_for_db] = $dom_id;
                    }
                    $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords'], $url_parts);
                    // if there are words to index, add the link to the database, get its id, and add the words and their relations
                    if (is_array($wordarray) && count($wordarray) > Configure::read('min_words_per_page')) {
                        if ($md5sum == '') {
                            mysql_query("insert into " . TABLE_PREFIX . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                            if (DEBUG > '0') {
                                echo mysql_error();
                            }
                            $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'");
                            if (DEBUG > '0') {
                                echo mysql_error();
                            }
                            $row = mysql_fetch_row($result);
                            $link_id = $row[0];
                            clean_resource($result);
                            if (DEBUG == '2') {
                                //  if debug mode, show details
                                printStandardReport('newKeywords', $command_line);
                            }
                            save_keywords($wordarray, $link_id, $dom_id);
                            if (DEBUG == '2') {
                                printStandardReport('indexed1', $command_line);
                            } else {
                                printStandardReport('indexed', $command_line);
                            }
                        } else {
                            if ($md5sum != '' && $md5sum != $newmd5sum) {
                                //if page has changed, start updating
                                $result = mysql_query("select link_id from " . TABLE_PREFIX . "links where url='{$url}'");
                                if (DEBUG > '0') {
                                    echo mysql_error();
                                }
                                $row = mysql_fetch_row($result);
                                $link_id = $row[0];
                                for ($i = 0; $i <= 15; $i++) {
                                    $char = dechex($i);
                                    mysql_query("delete from " . TABLE_PREFIX . "link_keyword{$char} where link_id={$link_id}");
                                    if (DEBUG > '0') {
                                        echo mysql_error();
                                    }
                                }
                                clean_resource($result);
                                if (DEBUG == '2') {
                                    //  if debug mode, show details
                                    printStandardReport('newKeywords', $command_line);
                                }
                                save_keywords($wordarray, $link_id, $dom_id);
                                $query = "update " . TABLE_PREFIX . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                                mysql_query($query);
                                if (DEBUG > '0') {
                                    echo mysql_error();
                                }
                                if (DEBUG == '2') {
                                    printStandardReport('re-indexed1', $command_line);
                                } else {
                                    printStandardReport('re-indexed', $command_line);
                                }
                            }
                        }
                    } else {
                        printStandardReport('minWords', $command_line);
                        $realnum--;
                    }
                } else {
                    printStandardReport('link_okay', $command_line);
                }
                unset($wordarray, $title, $fulltxt, $desc);
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    if ($smp != 1) {
        //      if valid sitemap found, no LinkReport
        printLinksReport($numoflinks, $all_links, $command_line);
    }
}