/**
 * Fetches one URL, queues the links found in it and stores or refreshes its index entry
 * (legacy mysql_* implementation).
 *
 * Flow, as implemented below:
 *  - url_status() is asked for the state of the URL. For a "Relocation" state the redirect
 *    target is queued in the temp table (unless already queued for this session); the page
 *    itself then takes the "not ok" path, is reported via printUrlStatus() and is flagged
 *    as deletable.
 *  - For state 'ok' the page is downloaded (waiting so that at least $min_delay seconds
 *    lie between two downloads), converted with extract_text() when the content type is
 *    not 'text', and its MD5 is compared with $md5sum.
 *  - Unchanged pages and pages whose MD5 is already known (isDuplicateMD5) are skipped.
 *  - Otherwise links are extracted and queued, and the page is inserted (when $md5sum is
 *    empty) or updated (when the MD5 changed), provided it has more than
 *    $min_words_per_page weighted words.
 *
 * @param string $url              URL to index.
 * @param int    $level            Crawl level written to the temp table for queued links;
 *                                 $level - 1 is stored with the page itself.
 * @param mixed  $site_id          Site id stored with a newly inserted link row.
 * @param string $md5sum           MD5 stored for this URL by a previous run, '' if none.
 * @param mixed  $domain           Not used by this implementation.
 * @param mixed  $indexdate        Not used by this implementation.
 * @param mixed  $sessid           Session id of the crawl, used as temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify() / get_links().
 * @param int    $reindex          1 when re-indexing; enables check_for_removal().
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay, $command_line, $min_words_per_page;
    global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;

    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            // The redirect target comes from the remote server: escape it before it reaches SQL.
            $safe_url = mysql_real_escape_string($url);
            $result = mysql_query("select link from " . $mysql_table_prefix . "temp where link='{$safe_url}' && id = '{$sessid}'");
            echo mysql_error();
            // A failed query is treated like "not queued yet", as before.
            $rows = $result ? mysql_num_rows($result) : 0;
            if ($rows == 0) {
                mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$safe_url}', '{$level}', '{$sessid}')");
                echo mysql_error();
            }
        }
        // NOTE(review): the original had the no-effect expression
        // `$url_status['state'] == "redirected";` here. It is dropped; the state keeps its
        // "Relocation" text exactly as before. Turning it into an assignment would change
        // what printUrlStatus() reports, so decide that deliberately.
    }

    ini_set("user_agent", $user_agent);

    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Throttle: keep at least $min_delay seconds between two downloads.
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();

        $file = '';
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                while ($buffer = @fgets($fl, 4096)) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);

        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }

        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);

        // Strict string comparison: loose == treats two different "0e<digits>" hashes as equal.
        $unchanged = ((string) $md5sum === $newmd5sum);
        if ($unchanged) {
            printStandardReport('md5notChanged', $command_line);
            $OKtoIndex = 0;
        } elseif (isDuplicateMD5($newmd5sum)) {
            $OKtoIndex = 0;
            printStandardReport('duplicate', $command_line);
        }

        if ((!$unchanged || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }

            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                $all_links = is_array($links) ? count($links) : 0;
                $numoflinks = 0;
                // Queue every link that has not been seen yet during this run.
                if (is_array($links)) {
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            $safe_link = mysql_real_escape_string($link);
                            mysql_query("insert into " . $mysql_table_prefix . "temp (link, level, id) values ('{$safe_link}', '{$level}', '{$sessid}')");
                            echo mysql_error();
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                // NOTE(review): title/description/fulltext are interpolated as delivered by
                // clean_file(); presumably they are already escaped there - verify before
                // adding another escaping layer.
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                $fulltxt = $data['fulltext'];
                $desc = substr($data['description'], 0, 254);

                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $safe_domain = mysql_real_escape_string($domain_for_db);
                    mysql_query("insert into " . $mysql_table_prefix . "domains (domain) values ('{$safe_domain}')");
                    $dom_id = mysql_insert_id();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);

                // Only pages with enough words get a link row and keyword relations.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    $safe_url = mysql_real_escape_string($url);
                    if ($md5sum == '') {
                        // First time this URL is indexed.
                        mysql_query("insert into " . $mysql_table_prefix . "links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('{$site_id}', '{$safe_url}', '{$title}', '{$desc}', '{$fulltxt}', curdate(), '{$pageSize}', '{$newmd5sum}', {$thislevel})");
                        echo mysql_error();
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$safe_url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        save_keywords($wordarray, $link_id, $dom_id);
                        printStandardReport('indexed', $command_line);
                    } elseif (!$unchanged) {
                        // Page content changed: replace its keyword relations and metadata.
                        $result = mysql_query("select link_id from " . $mysql_table_prefix . "links where url='{$safe_url}'");
                        echo mysql_error();
                        $row = mysql_fetch_row($result);
                        $link_id = $row[0];
                        // Keyword relations are spread over 16 tables link_keyword0 .. link_keywordf.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            mysql_query("delete from " . $mysql_table_prefix . "link_keyword{$char} where link_id={$link_id}");
                            echo mysql_error();
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $query = "update " . $mysql_table_prefix . "links set title='{$title}', description ='{$desc}', fulltxt = '{$fulltxt}', indexdate=now(), size = '{$pageSize}', md5sum='{$newmd5sum}', level={$thislevel} where link_id={$link_id}";
                        mysql_query($query);
                        echo mysql_error();
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}
/**
 * Extracts link targets from a page and normalises them through url_purify().
 *
 * Five kinds of references are recognised: href attributes, frame src attributes,
 * "window.location = ..." assignments, meta refresh targets and window.open(...) calls.
 * href links followed by rel=nofollow are skipped. Each distinct raw target is passed
 * to url_purify() at most once; targets for which url_purify() returns '' are dropped.
 *
 * @param string $file             Page source to scan.
 * @param string $url              URL of the page, used as base for url_purify().
 * @param mixed  $can_leave_domain Passed through to url_purify().
 * @param string $base             Base URL override; when not empty it replaces $url.
 * @return array List of purified links, in order of discovery (may contain duplicates
 *               when different raw targets purify to the same URL).
 */
function get_links($file, $url, $can_leave_domain, $base)
{
    // The base URL comes from either the meta tag or the current URL.
    if (!empty($base)) {
        $url = $base;
    }

    // Each entry: pattern, index of the capture group holding the URL, and whether
    // capture group 4 of that pattern is the rel=nofollow marker.
    // Only the href pattern has the URL in group 1; in the other patterns group 1 is
    // the matched prefix ("frame ... src", "window.location", ...) and the URL is
    // group 2, while group 4 is part of the host and says nothing about nofollow.
    $extractors = array(
        array("/href\\s*=\\s*[\\'\"]?([+:%\\/\\?~=&;\\\\(\\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\\'\" ]?(\\s*rel\\s*=\\s*[\\'\"]?(nofollow)[\\'\"]?)?/i", 1, true),
        array("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", 2, false),
        array("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", 2, false),
        array("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", 2, false),
        array("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\\'\"]?(([[a-z]{3,5}:\\/\\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\\/?=&;\\\\(\\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\\'\" ]?/i", 2, false),
    );

    $links = array();
    $checked_urls = array();

    foreach ($extractors as $extractor) {
        list($pattern, $url_group, $honours_nofollow) = $extractor;

        $regs = array();
        preg_match_all($pattern, $file, $regs, PREG_SET_ORDER);

        foreach ($regs as $val) {
            $raw = isset($val[$url_group]) ? $val[$url_group] : '';

            if (isset($checked_urls[$raw])) {
                continue;
            }
            // With PREG_SET_ORDER trailing unmatched groups are absent, so group 4 only
            // exists for an href match when rel=nofollow was actually found.
            if ($honours_nofollow && isset($val[4])) {
                continue;
            }

            $purified = url_purify($raw, $url, $can_leave_domain);
            if ($purified != '') {
                $links[] = $purified;
            }
            $checked_urls[$raw] = 1;
        }
    }

    return $links;
}
/**
 * Checks whether a single link is reachable and reports the result
 * (mysql_* / TABLE_PREFIX variant).
 *
 * URLs in which "localhost" starts at offset 7 (i.e. directly after "http://") are not
 * requested at all; they are only reported as 'link_local'. For every other URL
 * url_status() is consulted: a "Relocation" queues the redirect target in the temp table
 * (unless already queued for this session), state 'ok' is reported as 'link_okay', and
 * any other state is printed and marks the URL as deletable.
 *
 * @param string $url              URL to check.
 * @param int    $level            Level stored with a queued redirect target.
 * @param mixed  $sessid           Session id of the crawl, used as temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify().
 * @param int    $reindex          1 when re-indexing; enables check_for_removal().
 * @return void
 */
function link_check($url, $level, $sessid, $can_leave_domain, $reindex)
{
    global $command_line;

    $deletable = 0;

    // strpos() returns int|false; offset 7 is where the host starts in "http://...".
    // NOTE(review): "https://localhost" (offset 8) is not treated as local - confirm intent.
    $is_local = (strpos($url, 'localhost') === 7);

    if (!$is_local) {
        $url_status = url_status($url);

        if (strstr($url_status['state'], "Relocation")) {
            $result = null;
            // eregi_replace() is gone since PHP 7; removing blanks needs no regex anyway.
            $url = str_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));
            if ($url != '') {
                // The redirect target comes from the remote server: escape it for SQL.
                $safe_url = mysql_real_escape_string($url);
                $result = mysql_query("select link from " . TABLE_PREFIX . "temp where link='{$safe_url}' && id = '{$sessid}'");
                if (DEBUG > '0') {
                    echo mysql_error();
                }
                // A failed query is treated like "not queued yet", as before.
                $rows = $result ? mysql_num_rows($result) : 0;
                if ($rows == 0) {
                    mysql_query("insert into " . TABLE_PREFIX . "temp (link, level, id) values ('{$safe_url}', '{$level}', '{$sessid}')");
                    if (DEBUG > '0') {
                        echo mysql_error();
                    }
                }
            }
            // NOTE(review): the original had the no-effect expression
            // `$url_status['state'] == "redirected";` here; the state is left untouched,
            // exactly as before.
            // Called for every relocation, with null when no query was run.
            clean_resource($result);
        }

        ini_set("user_agent", Configure::read('user_agent'));

        if ($url_status['state'] == 'ok') {
            printStandardReport('link_okay', $command_line);
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line);
        }
    }

    if ($is_local) {
        printStandardReport('link_local', $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }
}
/**
 * Prints the debug report for a failed SQL statement and terminates the script.
 *
 * @param string $error     Error text reported by the database connection.
 * @param string $sql_query The statement that failed.
 * @param int    $err_row   Source line of the failing query call.
 * @param string $function  Name of the calling function, '' if none.
 * @return void Never returns; the script is terminated with exit.
 */
function link_check_sql_abort($error, $sql_query, $err_row, $function)
{
    // File, row and function are passed as printf arguments so that a '%' in the
    // script path cannot be misread as a format directive.
    printf("<p><span class='red'> MySQL failure: %s \n<br /></span></p>", $error);
    if ($function) {
        printf("<p><span class='red'> Found in script: %s row: %s in function(): %s <br /></span></p>", __FILE__, $err_row, $function);
    } else {
        printf("<p><span class='red'> Found in script: %s row: %s <br /></span></p>", __FILE__, $err_row);
    }
    echo "<p><span class='red'> Script execution aborted. <br /></span>";
    echo "<p><strong>Invalid query string, which caused the SQL error:</strong></p>";
    echo "<p> {$sql_query} </p>";
    exit;
}

/**
 * Checks whether a single link is reachable and reports the result (mysqli variant).
 *
 * url_status() is consulted for the URL. On a "Relocation" the redirect target is
 * purified, escaped and queued in the temp table (unless already queued for this
 * session). If the purified target is empty, 'self', 'excl' or contains no "//", the
 * check is aborted and the status array is returned with 'aborted' = 1 and an
 * explanatory 'state'. Otherwise state 'ok' is reported as 'link_okay' and any other
 * state is printed and marks the URL as deletable.
 *
 * With $debug set, an SQL error prints a report and terminates the script.
 *
 * @param string $url              URL to check.
 * @param int    $level            Level stored with a queued redirect target.
 * @param mixed  $sessid           Session id of the crawl, used as temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify().
 * @param int    $reindex          1 when re-indexing; enables check_for_removal().
 * @param mixed  $site_id          Passed through to url_status().
 * @return array|null The status array when the check was aborted, otherwise nothing.
 */
function link_check($url, $level, $sessid, $can_leave_domain, $reindex, $site_id)
{
    global $db_con, $debug, $command_line, $mysql_table_prefix, $user_agent, $no_log, $clear;

    $deletable = 0;
    $local_url = strpos($url, 'localhost');

    // NOTE(review): this compares the strpos() result with '/', whereas the report
    // further down tests for '7'. The comparison is kept as it was because its intent
    // cannot be determined from here; note that int-vs-string comparison semantics
    // changed in PHP 8, so its result for offset 0 differs between PHP versions.
    if ($local_url != '/') {
        $url_status = url_status($url, $site_id, $sessid);

        if (strstr($url_status['state'], "Relocation")) {
            $care_excl = '1';   // care file suffixed to be excluded
            $relocated = '1';   // URL is relocated
            $local_redir = '';
            $purified = url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir);
            $url = $db_con->real_escape_string(str_replace(" ", "", $purified));

            // url_purify() signals problems through sentinel values; translate them
            // into an abort reason.
            $abort_reason = null;
            if (!$url) {
                $abort_reason = "Indexation aborted because of undefined redirection error.";
            } elseif ($url === 'self') {
                // redirected URL is equal to the calling URL
                $abort_reason = "Indexation aborted for this page, because the redirection was a link in it selves.<br />Blocked by Sphide-plus, because this could end in an infinite indexation loop.";
            } elseif ($url === 'excl') {
                // redirected URL carries an excluded file suffix
                $abort_reason = "Indexation aborted because the redirected link does not meet the URL suffix conditions.";
            } elseif (!strstr($url, "//")) {
                // purifying failed; $url then holds the failure text
                $abort_reason = "Indexation aborted because: {$url}";
            }
            if ($abort_reason !== null) {
                $url_status['aborted'] = 1;
                $url_status['state'] = $abort_reason;
                return $url_status;
            }

            mysqltest();

            $sql_query = "SELECT link from " . $mysql_table_prefix . "temp where link='{$url}' && id = '{$sessid}'";
            $result = $db_con->query($sql_query);
            if ($debug && $db_con->errno) {
                link_check_sql_abort($db_con->error, $sql_query, __LINE__ - 2, __FUNCTION__);
            }

            // Without $debug a failed query reaches this point; treat it like
            // "not queued yet" instead of reading a property of false.
            $rows = $result ? $result->num_rows : 0;
            if ($rows == 0) {
                $sql_query = "INSERT into " . $mysql_table_prefix . "temp (link, level, id) values ('{$url}', '{$level}', '{$sessid}')";
                $db_con->query($sql_query);
                if ($debug && $db_con->errno) {
                    link_check_sql_abort($db_con->error, $sql_query, __LINE__ - 2, __FUNCTION__);
                }
            }

            // NOTE(review): the original had the no-effect expression
            // `$url_status['state'] == "redirected";` here; the state is left untouched,
            // exactly as before.
            if ($clear == 1) {
                clean_resource($result, '17');
            }
        }

        ini_set("user_agent", $user_agent);

        if ($url_status['state'] == 'ok') {
            printStandardReport('link_okay', $command_line, $no_log);
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line);
        }
    }

    if ($local_url == '7') {
        printStandardReport('link_local', $command_line, $no_log);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }
}
/**
 * Fetches one URL, queues the links found in it and stores or refreshes its index entry
 * (PDO implementation).
 *
 * Flow, as implemented below:
 *  - url_status() is asked for the state of the URL. For a "Relocation" state the redirect
 *    target is queued in the temp table unless it is already queued for this session; the
 *    page itself then takes the "not ok" path and is flagged as deletable.
 *  - Unless $index_vpaths is set, URLs whose last path segment contains no '.' are
 *    rejected as "directory listing or default redirect".
 *  - For state 'ok' the page is downloaded (waiting so that at least $min_delay seconds
 *    lie between two downloads), converted with extract_text() when the content type is
 *    not 'text', and its MD5 is compared with $md5sum.
 *  - When not re-indexing, unchanged pages and pages with an already known MD5 are skipped.
 *  - Otherwise links are extracted and queued, and the page is inserted (when $md5sum is
 *    empty) or updated (when the MD5 changed), provided it has more than
 *    $min_words_per_page weighted words.
 *
 * @param string $url              URL to index.
 * @param int    $level            Crawl level written to the temp table for queued links;
 *                                 $level - 1 is stored with the page itself.
 * @param mixed  $site_id          Site id stored with a newly inserted link row.
 * @param string $md5sum           MD5 stored for this URL by a previous run, '' if none.
 * @param mixed  $domain           Not used by this implementation.
 * @param mixed  $indexdate        Not used by this implementation.
 * @param mixed  $sessid           Session id of the crawl, used as temp.id.
 * @param mixed  $can_leave_domain Passed through to url_purify() / get_links().
 * @param int    $reindex          1 when re-indexing; enables check_for_removal().
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex)
{
    global $min_delay, $command_line, $min_words_per_page, $index_vpaths;
    global $user_agent, $tmp_urls, $delay_time, $domain_arr;
    global $db;

    $deletable = 0;
    $url_status = url_status($url);
    $thislevel = $level - 1;

    if (strstr($url_status['state'], "Relocation")) {
        $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
        if ($url != '') {
            $result = $db->query("SELECT link FROM " . TABLE_PREFIX . "temp WHERE link=" . $db->quote($url) . " AND id=" . $db->quote($sessid));
            echo sql_errorstring(__FILE__, __LINE__);

            // Queue the redirect target only when it is NOT in the temp table yet.
            // (The previous code inserted when a row was found, i.e. the wrong way round.)
            $already_queued = false;
            if ($result) {
                $already_queued = ($result->fetch() !== false);
                $result->closeCursor();
            }
            if (!$already_queued) {
                $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($url) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                echo sql_errorstring(__FILE__, __LINE__);
            }
        }
        // NOTE(review): the original had the no-effect expression
        // `$url_status['state'] == "redirected";` here; the state is left untouched,
        // exactly as before.
    }

    if (!$index_vpaths && $url_status['state'] == 'ok') {
        $url_parts = parse_url($url);
        $base = basename(isset($url_parts['path']) ? $url_parts['path'] : '');
        if (strstr($base, '.') == false) {
            $url_status['state'] = "directory listing or default redirect";
        }
    }

    ini_set("user_agent", $user_agent);

    if ($url_status['state'] == 'ok') {
        $OKtoIndex = 1;
        $file_read_error = 0;

        // Throttle: keep at least $min_delay seconds between two downloads.
        if (time() - $delay_time < $min_delay) {
            sleep($min_delay - (time() - $delay_time));
        }
        $delay_time = time();

        $file = '';
        if (!fst_lt_snd(phpversion(), "4.3.0")) {
            $file = file_get_contents($url);
            if ($file === false) {
                $file_read_error = 1;
            }
        } else {
            $fl = @fopen($url, "r");
            if ($fl) {
                while ($buffer = @fgets($fl, 4096)) {
                    $file .= $buffer;
                }
                // Only close a handle that was actually opened.
                fclose($fl);
            } else {
                $file_read_error = 1;
            }
        }
        if ($file_read_error) {
            $contents = getFileContents($url);
            $file = $contents['file'];
        }

        $pageSize = number_format(strlen($file) / 1024, 2, ".", "");
        printPageSizeReport($pageSize);

        if ($url_status['content'] != 'text') {
            $file = extract_text($file, $url_status['content']);
        }

        printStandardReport('starting', $command_line);
        $newmd5sum = md5($file);

        // Strict string comparison: loose == treats two different "0e<digits>" hashes as equal.
        $unchanged = ((string) $md5sum === $newmd5sum);
        if ($reindex == 0) {
            if ($unchanged) {
                printStandardReport('md5notChanged', $command_line);
                $OKtoIndex = 0;
            } elseif (isDuplicateMD5($newmd5sum)) {
                $OKtoIndex = 0;
                printStandardReport('duplicate', $command_line);
            }
        }

        if ((!$unchanged || $reindex == 1) && $OKtoIndex == 1) {
            $data = clean_file($file, $url, $url_status['content']);
            if ($data['noindex'] == 1) {
                $OKtoIndex = 0;
                $deletable = 1;
                printStandardReport('metaNoindex', $command_line);
            }

            $wordarray = unique_array(explode(" ", $data['content']));

            if ($data['nofollow'] != 1) {
                $links = get_links($file, $url, $can_leave_domain, $data['base']);
                $links = distinct_array($links);
                $all_links = is_array($links) ? count($links) : 0;
                $numoflinks = 0;
                // Queue every link that has not been seen yet during this run.
                if (is_array($links)) {
                    foreach ($links as $link) {
                        if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                            $tmp_urls[$link] = 1;
                            $numoflinks++;
                            $db->exec("INSERT INTO " . TABLE_PREFIX . "temp (link, level, id) VALUES (" . $db->quote($link) . ", " . $db->quote($level) . ", " . $db->quote($sessid) . ")");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                    }
                }
            } else {
                printStandardReport('noFollow', $command_line);
            }

            if ($OKtoIndex == 1) {
                $title = $data['title'];
                $host = $data['host'];
                $path = $data['path'];
                // Backslash-escaped apostrophes in the full text become &quot;.
                // NOTE(review): the replacement literal was unparsable (""") in the
                // source, which looks like an entity-decoded "&quot;" - confirm against
                // the upstream file.
                $fulltxt = str_replace("\\'", "&quot;", $data['fulltext']);
                $desc = substr($data['description'], 0, 254);
                $language = substr($data['language'], 0, 2);

                $url_parts = parse_url($url);
                $domain_for_db = $url_parts['host'];
                if (isset($domain_arr[$domain_for_db])) {
                    $dom_id = $domain_arr[$domain_for_db];
                } else {
                    $db->exec("INSERT INTO " . TABLE_PREFIX . "domains (domain) VALUES (" . $db->quote($domain_for_db) . ")");
                    $dom_id = $db->lastInsertId();
                    $domain_arr[$domain_for_db] = $dom_id;
                }

                $wordarray = calc_weights($wordarray, $title, $host, $path, $data['keywords']);
                $tstamp = "'" . date("Y-m-d") . "'";

                // Only pages with enough words get a link row and keyword relations.
                if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                    // Quoted copies are kept in separate variables so that the
                    // parameters keep their raw values.
                    $q_site_id = $db->quote($site_id);
                    $q_url = $db->quote($url);
                    $q_title = $db->quote($title);
                    $q_desc = $db->quote($desc);
                    $q_language = $db->quote($language);
                    $q_fulltxt = $db->quote($fulltxt);
                    $q_size = $db->quote($pageSize);
                    $q_md5sum = $db->quote($newmd5sum);

                    if ($md5sum == '') {
                        // First time this URL is indexed.
                        $db->exec("INSERT INTO " . TABLE_PREFIX . "links (site_id, url, title, description, language, fulltxt, indexdate, size, md5sum, level) VALUES ({$q_site_id}, {$q_url}, {$q_title}, {$q_desc}, {$q_language}, {$q_fulltxt}, {$tstamp}, {$q_size}, {$q_md5sum}, {$thislevel})");
                        $error = sql_errorstring(__FILE__, __LINE__);
                        if ($error) {
                            echo $error;
                            printStandardReport('skipped', $command_line);
                        } else {
                            $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                            echo sql_errorstring(__FILE__, __LINE__);
                            $row = $result->fetch();
                            $link_id = $row[0];
                            $result->closeCursor();
                            save_keywords($wordarray, $link_id, $dom_id);
                            printStandardReport('indexed', $command_line);
                        }
                    } elseif (!$unchanged) {
                        // Page content changed: replace its keyword relations and metadata.
                        $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links WHERE url={$q_url}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        $row = $result->fetch();
                        $link_id = $row[0];
                        $result->closeCursor();
                        // Keyword relations are spread over 16 tables link_keyword0 .. link_keywordf.
                        for ($i = 0; $i <= 15; $i++) {
                            $char = dechex($i);
                            $db->exec("DELETE FROM " . TABLE_PREFIX . "link_keyword{$char} WHERE link_id={$link_id}");
                            echo sql_errorstring(__FILE__, __LINE__);
                        }
                        save_keywords($wordarray, $link_id, $dom_id);
                        $db->exec("UPDATE " . TABLE_PREFIX . "links SET title={$q_title}, description={$q_desc}, language={$q_language}, fulltxt={$q_fulltxt}, indexdate={$tstamp}, size={$q_size}, md5sum={$q_md5sum}, level={$thislevel} WHERE link_id={$link_id}");
                        echo sql_errorstring(__FILE__, __LINE__);
                        printStandardReport('re-indexed', $command_line);
                    }
                } else {
                    printStandardReport('minWords', $command_line);
                }
            }
        }
    } else {
        $deletable = 1;
        printUrlStatus($url_status['state'], $command_line);
    }

    if ($reindex == 1 && $deletable == 1) {
        check_for_removal($url);
    }

    if (!isset($all_links)) {
        $all_links = 0;
    }
    if (!isset($numoflinks)) {
        $numoflinks = 0;
    }
    printLinksReport($numoflinks, $all_links, $command_line);
}