Example #1
0
/**
 * Fetch one URL, queue the links found on it and store or refresh its entry in the index.
 *
 * Flow, as implemented below:
 *  - url_status() is consulted first. When its state contains "Relocation", the purified
 *    redirect target is queued in the temp table (once per session id) and the state is
 *    set to "redirected", so the page itself is not indexed.
 *  - A page whose state is 'ok' is downloaded (waiting so that at least $min_delay seconds
 *    separate two fetches), passed through extract_text() when the reported content type
 *    is not 'text', and its md5 is compared with $md5sum.
 *  - New or changed content goes through clean_file(); its links are queued in the temp
 *    table and, unless clean_file() reports noindex or the page has too few words, the
 *    page is inserted into / updated in the links table and its keywords are saved.
 *  - Any other state marks the page deletable; with $reindex == 1 it is handed to
 *    check_for_removal().
 *
 * NOTE(review): every SQL statement here is built by string interpolation. Whether
 * url_purify(), get_links() and clean_file() already escape their output is not visible
 * from this function, so no escaping was added (it could double-escape stored data).
 * Confirm against those helpers and escape at the source that does not.
 *
 * NOTE(review): after a relocation $url holds the redirect target, so check_for_removal()
 * receives the target rather than the URL originally requested - confirm this is intended.
 *
 * @param string $url              URL to fetch; replaced by the redirect target on relocation.
 * @param int    $level            Stored with every queued link; $level - 1 is stored for the page itself.
 * @param mixed  $site_id          Written to the site_id column when a new page is inserted.
 * @param string $md5sum           Checksum to compare against; '' selects INSERT, a different value UPDATE.
 * @param mixed  $domain           Unused by this function.
 * @param mixed  $indexdate        Unused by this function.
 * @param mixed  $sessid           Stored in the id column of every temp-table row.
 * @param mixed  $can_leave_domain Forwarded to url_purify() and get_links().
 * @param int    $reindex          When 1, a deletable page is passed to check_for_removal().
 *
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;

    $deletable = 0;
    // Link counters reported at the end; they stay 0 when no links are extracted.
    $all_links = 0;
    $numoflinks = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = mysql_query("select link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'");
            echo mysql_error();
            // A failed query returns false; treat it as "not queued yet" instead of
            // passing false to the row counter.
            $rows = $result ? mysql_num_rows($result) : 0;
            if ($rows == 0) {
                mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')");
                echo mysql_error();
            }
        }
        // This used to be a bare comparison (==) with no effect; the state is now really set.
        $url_status['state'] = "redirected";
    }

    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Keep at least $min_delay seconds between two consecutive fetches.
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();

        $file = '';
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false so a line consisting of "0" does not end the read early.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }

        // Fall back to the project's own fetcher when the built-in read failed.
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);

        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }

        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);

        if ($md5sum == $newmd5sum) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
        } elseif (isDuplicateMD5($newmd5sum)) {
            $OKtoIndex = 0;
            printStandardReport('duplicate', $command_line);
        }

        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }

            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = distinct_array(get_links($file, $url, $can_leave_domain, $data['base']));
                // Queue every link that has not been seen yet during this run.
                if (is_array($links)) {
                    $all_links = count($links);
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$link}', '{$level}', '{$sessid}')");
                            echo mysql_error();
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = $data['fulltext'];
                $desc = substr($data['description'], 0, 254);

                // Resolve the domain id, creating the domain row on first sight.
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    mysql_query("insert into " . $mysql_table_prefix . "domains (domain) values ('{$domain_for_db}')");
                    $dom_id = mysql_insert_id();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);

                // Only pages with more than $min_words_per_page words are stored.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    if ($md5sum == '') {
                        // No previous checksum: store a new link row and its keywords.
                        mysql_query("insert into " . $mysql_table_prefix . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                        echo mysql_error();
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        save_keywords($wordarray, $link_id, $dom_id);
                        printStandardReport('indexed', $command_line);
                    } elseif ($md5sum != $newmd5sum) {
                        // Page has changed: drop its old keyword relations, then refresh the row.
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        // Keyword relations are spread over 16 tables suffixed 0-9, a-f.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            mysql_query("delete from " . $mysql_table_prefix . "link_keyword{$char} where link_id={$link_id}");
                            echo mysql_error();
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $query = "update " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                        mysql_query($query);
                        echo mysql_error();
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    printLinksReport($numoflinks, $all_links, $command_line);
}
/**
 * Fetch one URL, queue the links found on it and store or refresh its entry in the index.
 *
 * Flow, as implemented below:
 *  - url_status() is consulted first. When its state contains "Relocation", the purified
 *    redirect target is queued in the temp table unless a row for it already exists for
 *    this session id, and the state is set to "redirected" so the page is not indexed.
 *  - Unless $index_vpaths is set, an 'ok' URL whose last path segment contains no dot is
 *    re-labelled "directory listing or default redirect" and is not indexed either.
 *  - A page whose state is 'ok' is downloaded (waiting so that at least $min_delay seconds
 *    separate two fetches), passed through extract_text() when the reported content type
 *    is not 'text', and - only when $reindex == 0 - checked against $md5sum and against
 *    isDuplicateMD5().
 *  - The page then goes through clean_file(); its links are queued in the temp table and,
 *    unless clean_file() reports noindex or the page has too few words, the page is
 *    inserted into / updated in the links table and its keywords are saved.
 *  - Any other state marks the page deletable; with $reindex == 1 it is handed to
 *    check_for_removal().
 *
 * $db is used through query(), exec(), quote() and lastInsertId(), i.e. a PDO-style
 * handle (presumably a PDO instance - confirm where the global is created).
 *
 * NOTE(review): after a relocation $url holds the redirect target, so check_for_removal()
 * receives the target rather than the URL originally requested - confirm this is intended.
 *
 * @param string $url              URL to fetch; replaced by the redirect target on relocation.
 * @param int    $level            Stored with every queued link; $level - 1 is stored for the page itself.
 * @param mixed  $site_id          Written to the site_id column when a new page is inserted.
 * @param string $md5sum           Checksum to compare against; '' selects INSERT, a different value UPDATE.
 * @param mixed  $domain           Unused by this function.
 * @param mixed  $indexdate        Unused by this function.
 * @param mixed  $sessid           Stored in the id column of every temp-table row.
 * @param mixed  $can_leave_domain Forwarded to url_purify() and get_links().
 * @param int    $reindex          0 enables the checksum/duplicate checks; 1 skips them and lets a
 *                                 deletable page be passed to check_for_removal().
 *
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay;
    global $command_line;
    global $min_words_per_page;
    global $index_vpaths;
    global $user_agent, $tmp_urls, $delay_time, $domain_arr;
    global $db;

    $deletable = 0;
    // Link counters reported at the end; they stay 0 when no links are extracted.
    $all_links = 0;
    $numoflinks = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = $db->query("SELECT link FROM " . TABLE_PREFIX . "temp WHERE link=" . $db->quote($url) . " AND id=" . $db->quote($sessid));
            echo sql_errorstring(__FILE__, __LINE__);
            // A failed query returns false; treat it as "not queued yet".
            $already_queued = false;
            if ($result) {
                $already_queued = ($result->fetch() !== false);
                $result->closeCursor();
            }
            // Queue the redirect target only when it is NOT in the temp table yet. The
            // previous check was inverted and inserted only when a row already existed.
            if (!$already_queued) {
                $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($url) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                echo sql_errorstring(__FILE__, __LINE__);
            }
        }
        // This used to be a bare comparison (==) with no effect; the state is now really set.
        $url_status['state'] = "redirected";
    }

    if (!$index_vpaths && $url_status['state'] == 'ok') {
        $url_parts = parse_url($url);
        // URLs such as "http://host" have no path component at all.
        $base = isset($url_parts['path']) ? basename($url_parts['path']) : '';
        if (strstr($base, '.') === false) {
            $url_status['state'] = "directory listing or default redirect";
        }
    }

    ini_set("user_agent", $user_agent);
    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Keep at least $min_delay seconds between two consecutive fetches.
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();

        $file = '';
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                // Compare against false so a line consisting of "0" does not end the read early.
                while (($buffer = @fgets($fl, 4096)) !== false) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }

        // Fall back to the project's own fetcher when the built-in read failed.
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);

        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }

        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);

        // The unchanged/duplicate checks are skipped entirely when re-indexing.
        if ($reindex == 0) {
            if ($md5sum == $newmd5sum) {
                printStandardReport('md5notChanged', $command_line);
                $OKtoIndex = 0;
            } elseif (isDuplicateMD5($newmd5sum)) {
                $OKtoIndex = 0;
                printStandardReport('duplicate', $command_line);
            }
        }

        if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }

            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = distinct_array(get_links($file, $url, $can_leave_domain, $data['base']));
                // Queue every link that has not been seen yet during this run.
                // foreach replaces each(), which no longer exists in PHP 8.
                if (is_array($links)) {
                    $all_links = count($links);
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($link) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = str_replace("\\'", "&quot;", $data['fulltext']);
                $desc = substr($data['description'], 0, 254);
                $language = substr($data['language'], 0, 2);

                // Resolve the domain id, creating the domain row on first sight.
                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "domains (domain) VALUES (" . $db->quote($domain_for_db) . ")");
                    $dom_id = $db->lastInsertId();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                $tstamp = "'" . date("Y-m-d") . "'";

                // Only pages with more than $min_words_per_page words are stored.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    // Quoted SQL literals live in their own variables so the raw
                    // parameters ($url, $site_id, ...) keep their original values.
                    $q_site_id = $db->quote($site_id);
                    $q_url = $db->quote($url);
                    $q_title = $db->quote($title);
                    $q_desc = $db->quote($desc);
                    $q_language = $db->quote($language);
                    $q_fulltxt = $db->quote($fulltxt);
                    $q_size = $db->quote($pageSize);
                    $q_md5sum = $db->quote($newmd5sum);

                    if ($md5sum == '') {
                        // No previous checksum: store a new link row and its keywords.
                        $db->exec("INSERT INTO " . TABLE_PREFIX . "links (site_id, url, title, description, language, fulltxt, indexdate, size, md5sum, level) VALUES ({$q_site_id}, {$q_url}, {$q_title}, {$q_desc}, {$q_language}, {$q_fulltxt}, {$tstamp}, {$q_size}, {$q_md5sum}, {$thislevel})");
                        $error = sql_errorstring(__FILE__, __LINE__);
                        if ($error) {
                            echo $error;
                            printStandardReport('skipped', $command_line);
                        } else {
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            save_keywords($wordarray, $link_id, $dom_id);
                            printStandardReport('indexed', $command_line);
                        }
                    } elseif ($md5sum != $newmd5sum) {
                        // Page has changed: drop its old keyword relations, then refresh the row.
                        $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        $row = $result->fetch();
                        $link_id = $row[0];
                        $result->closeCursor();
                        // Keyword relations are spread over 16 tables suffixed 0-9, a-f.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            $db->exec("DELETE FROM " . TABLE_PREFIX . "link_keyword{$char} WHERE link_id={$link_id}");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $db->exec("UPDATE " . TABLE_PREFIX . "links SET title={$q_title}, description={$q_desc}, language={$q_language}, fulltxt={$q_fulltxt}, indexdate={$tstamp}, size={$q_size}, md5sum={$q_md5sum}, level={$thislevel} WHERE link_id={$link_id}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    printLinksReport($numoflinks, $all_links, $command_line);
}