Example #1
0
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $entities, $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $supdomain;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;
    $needsReindex = 1;
    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;
    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = mysql_query("select link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'");
            echo mysql_error();
            $rows = mysql_numrows($result);
            if ($rows == 0) {
                mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                echo mysql_error();
            }
        }
        $url_status['state'] == "redirected";
    }
    /*
    		if ($indexdate <> '' && $url_status['date'] <> '') {
    			if ($indexdate > $url_status['date']) {
    				$url_status['state'] = "Date checked. Page contents not changed";
    				$needsReindex = 0;
    			}
    		}*/
    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                while ($buffer = @fgets($fl, 4096)) {
                    $file .= $buffer;
                }
            } else {
                $file_read_error = 1;
            }
            fclose($fl);
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);
        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
        } else {
            if (isDuplicateMD5($newmd5sum)) {
                $OKtoIndex = 0;
                printStandardReport('duplicate', $command_line);
            }
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $urlparts = parse_url($url);
            $newdomain = $urlparts['host'];
            $type = 0;
            /*		if ($newdomain <> $domain)
            					$domainChanged = 1;
            
            				if ($domaincb==1) {
            					$start = strlen($newdomain) - strlen($supdomain);
            					if (substr($newdomain, $start) == $supdomain) {
            						$domainChanged = 0;
            					}
            				}*/
            // remove link to css file
            //get all links from file
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($data['nofollow'] != 1) {
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                $all_links = count($links);
                $numoflinks = 0;
                //if there are any, add to the temp table, but only if there isnt such url already
                if (is_array($links)) {
                    reset($links);
                    while ($thislink = each($links)) {
                        if ($tmp_urls[$thislink[1]] != 1) {
                            $tmp_urls[$thislink[1]] = 1;
                            $numoflinks++;
                            mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$thislink['1']}', '{$level}', '{$sessid}')");
                            echo mysql_error();
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }
            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = $data['fulltext'];
                $desc = substr($data['description'], 0, 254);
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    mysql_query("insert into " . $mysql_table_prefix . "domains (domain) values ('{$domain_for_db}')");
                    $dom_id = mysql_insert_id();
                    $domain_arr[$domain_for_db] = $dom_id;
                }
                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                //if there are words to index, add the link to the database, get its id, and add the word + their relation
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    if ($md5sum == '') {
                        mysql_query("insert into " . $mysql_table_prefix . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                        echo mysql_error();
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        save_keywords($wordarray, $link_id, $dom_id);
                        printStandardReport('indexed', $command_line);
                    } else {
                        if ($md5sum != '' && $md5sum != $newmd5sum) {
                            //if page has changed, start updating
                            $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                            echo mysql_error();
                            $row = mysql_fetch_row($result);
                            $link_id = $row[0];
                            for ($i = 0; $i <= 15; $i++) {
                                $char = dechex($i);
                                mysql_query("delete from " . $mysql_table_prefix . "link_keyword{$char} where link_id={$link_id}");
                                echo mysql_error();
                            }
                            save_keywords($wordarray, $link_id, $dom_id);
                            $query = "update " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                            mysql_query($query);
                            echo mysql_error();
                            printStandardReport('re-indexed', $command_line);
                        }
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}
Example #2
0
function check_robot_txt($url)
{
    global $user_agent;
    $urlparts = parse_url($url);
    $url = 'http://' . $urlparts['host'] . "/robots.txt";
    $url_status = url_status($url);
    $omit = array();
    if ($url_status['state'] == "ok") {
        $robot = file($url);
        if (!$robot) {
            $contents = getFileContents($url);
            $file = $contents['file'];
            $robot = explode("\n", $file);
        }
        $regs = array();
        $this_agent = "";
        while (list($id, $line) = each($robot)) {
            if (eregi("^user-agent: *([^#]+) *", $line, $regs)) {
                $this_agent = trim($regs[1]);
                if ($this_agent == '*' || $this_agent == $user_agent) {
                    $check = 1;
                } else {
                    $check = 0;
                }
            }
            if (eregi("disallow: *([^#]+)", $line, $regs) && $check == 1) {
                $disallow_str = eregi_replace("[\n ]+", "", $regs[1]);
                if (trim($disallow_str) != "") {
                    $omit[] = $disallow_str;
                } else {
                    if ($this_agent == '*' || $this_agent == $user_agent) {
                        return null;
                    }
                }
            }
        }
    }
    return $omit;
}
Example #3
0
function link_check($url, $level, $sessid, $can_leave_domain, $reindex)
{
    global $command_line;
    $needsReindex = 1;
    $deletable = 0;
    $local_url = 0;
    $local_url = strpos($url, 'localhost');
    if ($local_url != '7') {
        $url_status = url_status($url);
        $thislevel = $level - 1;
        if (strstr($url_status['state'], "Relocation")) {
            $url = eregi_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));
            if ($url != '') {
                $result = mysql_query("select link from " . TABLE_PREFIX . "temp where link='{$url}' && id = '{$sessid}'");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                }
            }
            $url_status['state'] == "redirected";
            clean_resource($result);
        }
        ini_set("user_agent", Configure::read('user_agent'));
        if ($url_status['state'] == 'ok') {
            printStandardReport('link_okay', $command_line);
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line);
        }
    }
    if ($local_url == '7') {
        printStandardReport('link_local', $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
}
function link_check($url, $level, $sessid, $can_leave_domain, $reindex, $site_id)
{
    global $db_con, $debug, $command_line, $mysql_table_prefix, $user_agent, $index_media, $no_log, $clear;
    $needsReindex = 1;
    $deletable = 0;
    $local_url = 0;
    $local_url = strpos($url, 'localhost');
    if ($local_url != '/') {
        $url_status = url_status($url, $site_id, $sessid);
        $thislevel = $level - 1;
        if (strstr($url_status['state'], "Relocation")) {
            $care_excl = '1';
            //  care file suffixed to be excluded
            $relocated = '1';
            //  URL is relocated
            $local_redir = '';
            $url = $db_con->real_escape_string(preg_replace("/ /i", "", url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)));
            if (!$url) {
                $url_status['aborted'] = 1;
                $url_status['state'] = "Indexation aborted because of undefined redirection error.";
                return $url_status;
            }
            //  abort indexation, if the redirected URL is equal to calling URL
            if ($url == 'self') {
                $url_status['aborted'] = 1;
                $url_status['state'] = "Indexation aborted for this page, because the redirection was a link in it selves.<br />Blocked by Sphide-plus, because this could end in an infinite indexation loop.";
                return $url_status;
            }
            //  abort indexation, if the redirected URL contains invalid file suffix
            if ($url == 'excl') {
                $url_status['aborted'] = 1;
                $url_status['state'] = "Indexation aborted because the redirected link does not meet the URL suffix conditions.";
                return $url_status;
            }
            //  abort indexation, because purifing the redirected URL failed
            if (!strstr($url, "//")) {
                $url_status['aborted'] = 1;
                $url_status['state'] = "Indexation aborted because: {$url}";
                return $url_status;
            }
            mysqltest();
            $sql_query = "SELECT link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'";
            $result = $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                $err_row = __LINE__ - 2;
                printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                if (__FUNCTION__) {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                } else {
                    printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                }
                printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                echo "<p> {$sql_query} </p>";
                exit;
            }
            $rows = $result->num_rows;
            if ($rows == 0) {
                $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')";
                $db_con->query($sql_query);
                if ($debug && $db_con->errno) {
                    $err_row = __LINE__ - 2;
                    printf("<p><span class='red'>&nbsp;MySQL failure: %s&nbsp;\n<br /></span></p>", $db_con->error);
                    if (__FUNCTION__) {
                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;&nbsp;in function():&nbsp;" . __FUNCTION__ . "&nbsp;<br /></span></p>");
                    } else {
                        printf("<p><span class='red'>&nbsp;Found in script: " . __FILE__ . "&nbsp;&nbsp;row: {$err_row}&nbsp;<br /></span></p>");
                    }
                    printf("<p><span class='red'>&nbsp;Script execution aborted.&nbsp;<br /></span>");
                    printf("<p><strong>Invalid query string, which caused the SQL error:</strong></p>");
                    echo "<p> {$sql_query} </p>";
                    exit;
                }
            }
            $url_status['state'] == "redirected";
            if ($clear == 1) {
                clean_resource($result, '17');
            }
        }
        ini_set("user_agent", $user_agent);
        if ($url_status['state'] == 'ok') {
            printStandardReport('link_okay', $command_line, $no_log);
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line);
        }
    }
    if ($local_url == '7') {
        printStandardReport('link_local', $command_line, $no_log);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
}
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $supdomain, $index_vpaths;
    global $user_agent, $tmp_urls, $delay_time, $domain_arr;
    global $db;
    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;
    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = $db->query("SELECT link FROM " . TABLE_PREFIX . "temp WHERE link=" . $db->quote($url) . " AND id=" . $db->quote($sessid));
            echo sql_errorstring(__FILE__, __LINE__);
            if ($result->fetch()) {
                $result->closeCursor();
                $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($url) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                echo sql_errorstring(__FILE__, __LINE__);
            }
        }
        $url_status['state'] == "redirected";
    }
    if (!$index_vpaths && $url_status['state'] == 'ok') {
        $url_parts = parse_url($url);
        $base = basename($url_parts['path']);
        if (strstr($base, '.') == false) {
            $url_status['state'] = "directory listing or default redirect";
        }
    }
    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === FALSE) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                while ($buffer = @fgets($fl, 4096)) {
                    $file .= $buffer;
                }
            } else {
                $file_read_error = 1;
            }
            fclose($fl);
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }
        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);
        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }
        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);
        if ($reindex == 0) {
            if ($md5sum == $newmd5sum) {
                printStandardReport('md5notChanged', $command_line);
                $OKtoIndex = 0;
            } else {
                if (isDuplicateMD5($newmd5sum)) {
                    $OKtoIndex = 0;
                    printStandardReport('duplicate', $command_line);
                }
            }
        }
        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $urlparts = parse_url($url);
            $newdomain = $urlparts['host'];
            $type = 0;
            // remove link to css file
            //get all links from file
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }
            $wordarray = unique_array(explode(" ", $data['content']));
            if ($data['nofollow'] != 1) {
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                $all_links = count($links);
                $numoflinks = 0;
                //if there are any, add to the temp table, but only if there isnt such url already
                if (is_array($links)) {
                    reset($links);
                    while ($thislink = each($links)) {
                        if (!isset($tmp_urls[$thislink[1]]) || $tmp_urls[$thislink[1]] != 1) {
                            $tmp_urls[$thislink[1]] = 1;
                            $numoflinks++;
                            $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($thislink[1]) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }
            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = str_replace("\\'", "&quot;", $data['fulltext']);
                $desc = substr($data['description'], 0, 254);
                $language = substr($data['language'], 0, 2);
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "domains (domain) VALUES (" . $db->quote($domain_for_db) . ")");
                    $dom_id = $db->lastInsertId();
                    $domain_arr[$domain_for_db] = $dom_id;
                }
                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                $tstamp = "'" . date("Y-m-d") . "'";
                //if there are words to index, add the link to the database, get its id, and add the word + their relation
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    $site_id = $db->quote($site_id);
                    $url = $db->quote($url);
                    $title = $db->quote($title);
                    $desc = $db->quote($desc);
                    $language = $db->quote($language);
                    $fulltxt = $db->quote($fulltxt);
                    $pageSize = $db->quote($pageSize);
                    $Qmd5sum = $db->quote($newmd5sum);
                    if ($md5sum == '') {
                        $db->exec("INSERT INTO " . TABLE_PREFIX . "links (site_id, url, title, description, language, fulltxt, indexdate, size, md5sum, level) VALUES ({$site_id}, {$url}, {$title}, {$desc}, {$language}, {$fulltxt}, {$tstamp}, {$pageSize}, {$Qmd5sum}, {$thislevel})");
                        $error = sql_errorstring(__FILE__, __LINE__);
                        if ($error) {
                            echo $error;
                            printStandardReport('skipped', $command_line);
                        } else {
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            save_keywords($wordarray, $link_id, $dom_id);
                            printStandardReport('indexed', $command_line);
                        }
                    } else {
                        if ($md5sum != '' && $md5sum != $newmd5sum) {
                            //if page has changed, start updating
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            for ($i = 0; $i <= 15; $i++) {
                                $char = dechex($i);
                                $db->exec("DELETE FROM " . TABLE_PREFIX . "link_keyword{$char} WHERE link_id={$link_id}");
                                echo sql_errorstring(__FILE__, __LINE__);
                            }
                            save_keywords($wordarray, $link_id, $dom_id);
                            $db->exec("UPDATE " . TABLE_PREFIX . "links SET title={$title}, description={$desc}, language={$language}, fulltxt={$fulltxt}, indexdate={$tstamp}, size={$pageSize}, md5sum={$Qmd5sum}, level={$thislevel} WHERE link_id={$link_id}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            printStandardReport('re-indexed', $command_line);
                        }
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }
    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    } else {
        if ($reindex == 1) {
            //???
        }
    }
    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}