// Don't let PHP timeout db_connect(); $stop = false; if ($first_run == true) { //Before starting, check the domains fields of the database and fill in any missing entries //Also fill in missing $strSQL = "SELECT * FROM tblPages WHERE strDomain IS NULL OR strDomain='' OR strCleanURL IS NULL OR strCleanURL=''"; $statement = $GLOBALS["db"]->prepare($strSQL); $result = $statement->execute(); while (null != ($row = $statement->fetch(PDO::FETCH_ASSOC))) { $url = $row['strURL']; if ($row['strDomain'] != null && $row['strDomain'] != '') { $domain = false; //$row['strDomain']; } else { $domain = get_domain_part($url, $SAME_DOMAIN_FETCH_LEVEL); } if ($row['strCleanURL'] != null && $row['strCleanURL'] != '') { $cleanURL = false; //$row['strCleanURL']; } else { $cleanURL = clean_url($url); } $pageID = $row["iPageID"]; $strSQL = "UPDATE tblPages SET "; if ($domain === false && $cleanURL === false) { //Should never get here. If we do something strange with db config die("Assert failes: neither domain nor clean url in need of updating"); } if ($domain !== false) { $strSQL .= "strDomain='{$domain}' ";
/**
 * Decide whether a harvested link must be skipped by the crawler.
 *
 * A link is excluded as soon as one of these rules applies:
 *  - it contains the text "javascript" (case-insensitive, anywhere in the link);
 *  - it is shorter than 9 characters;
 *  - it matches one of the regular expressions in the global $exclusion_array;
 *  - domain whitelisting is enabled ($whitelistdomain) and the link's domain is
 *    not covered by the whitelist;
 *  - URL whitelisting is enabled ($whitelisturl) and the link contains none of
 *    the fragments in $whitelisturllist_arr.
 *
 * Configuration is read from globals. A list global that is unset or not an
 * array is treated as an empty list instead of being passed to count(), which
 * throws a TypeError on PHP 8.
 *
 * Off-site filtering driven by $ALLOW_OFFSITE / $ONLY_OFFSITE is not performed
 * here; the domain whitelist below is the active mechanism.
 *
 * @param string $link Link to test.
 * @return bool true when the link must be excluded, false when it may be kept.
 */
function exclude_link($link)
{
    global $exclusion_array;
    global $whitelistdomain, $whitelistdomainlevel, $whitelistdomainlist_arr, $whitelistdomainlist_part;
    global $whitelisturl, $whitelisturllist_arr;

    // Substring match on purpose: "javascript" anywhere in the link excludes it,
    // not only a leading "javascript:" scheme.
    if (stristr($link, "javascript")) {
        return true;
    }

    // Links shorter than 9 characters are rejected.
    if (strlen($link) < 9) {
        return true;
    }

    // Each entry of $exclusion_array is a complete PCRE pattern (delimiters included).
    $patterns = is_array($exclusion_array) ? $exclusion_array : array();
    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $link) > 0) {
            return true;
        }
    }

    if ($whitelistdomain) {
        if ($whitelistdomainlevel == -1) {
            // Level -1: accept when any whitelist entry occurs anywhere in the domain.
            $domain = get_domain($link);
            $allowedDomains = is_array($whitelistdomainlist_arr) ? $whitelistdomainlist_arr : array();
            $found = false;
            foreach ($allowedDomains as $allowedDomain) {
                if (strpos($domain, $allowedDomain) !== false) {
                    $found = true;
                    break;
                }
            }
            if (!$found) {
                return true;
            }
        } else {
            // Other levels: the domain part must appear as ":domain:" inside the
            // colon-delimited $whitelistdomainlist_part string.
            $domain = get_domain_part($link, $whitelistdomainlevel);
            if (strpos($whitelistdomainlist_part, ":{$domain}:") === false) {
                return true;
            }
        }
    }

    if ($whitelisturl) {
        // The link must contain at least one whitelisted URL fragment.
        $allowedFragments = is_array($whitelisturllist_arr) ? $whitelisturllist_arr : array();
        $found = false;
        foreach ($allowedFragments as $fragment) {
            if (strpos($link, $fragment) !== false) {
                $found = true;
                break;
            }
        }
        if (!$found) {
            return true;
        }
    }

    return false;
}
/**
 * Look up (or create) the tblPages row for a harvested link and return the
 * parent/child tuple describing the link from the seed page to it.
 *
 * Steps:
 *  1. HTML-entity-decode the link and derive its clean URL and domain part.
 *  2. Resolve the page id for the clean URL, first via checkCache(), then via a
 *     SELECT on tblPages; the SELECT result is stored with addToCache().
 *  3. If the page is unknown and $MAX_PENETRATION is 0, give up (only pages
 *     already in the database are crawled).
 *  4. If the page is unknown otherwise, INSERT it one level below the seed and
 *     cache the new id.
 *
 * @param array  $seed Row of the page the link was found on; the keys
 *                     "iPageID", "fkQueryID" and "iLevel" are read.
 * @param string $link Link as harvested from the seed page.
 * @return string|null "(<seed iPageID>,<page id>,<seed fkQueryID>,1)", or NULL
 *                     when the page is not stored. NOTE(review): the tuple looks
 *                     like a VALUES fragment for tblLinks - confirm with caller.
 */
function db_store_link($seed, $link)
{
    global $SAME_DOMAIN_FETCH_LEVEL, $MAX_PENETRATION;

    $decodedLink = html_entity_decode($link);
    $normalizedUrl = clean_url($decodedLink);
    $linkDomain = get_domain_part($decodedLink, $SAME_DOMAIN_FETCH_LEVEL);

    // Cache first, database second.
    $childId = checkCache($normalizedUrl);
    if ($childId == null) {
        $lookupSql = "SELECT iPageID FROM tblPages WHERE strCleanURL=?";
        $childId = db_run_select($lookupSql, array($normalizedUrl), true);
        addToCache($normalizedUrl, $childId);
    }

    if ($childId == NULL) {
        if ($MAX_PENETRATION == 0) {
            // Configured to crawl only pages that are already in the database.
            return NULL;
        }

        // Unknown page: store it one level deeper than the seed.
        $insertSql = "INSERT INTO tblPages SET fkQueryID=?,strURL=?,strCleanURL=?,iLevel=?,strDomain=?";
        $insertParams = array(
            $seed["fkQueryID"],
            $decodedLink,
            $normalizedUrl,
            $seed["iLevel"] + 1,
            $linkDomain
        );
        db_run_query($insertSql, $insertParams);

        $childId = $GLOBALS["db"]->lastInsertId();
        addToCache($normalizedUrl, $childId);
    }
    // A page that already exists keeps its stored level.

    return "({$seed["iPageID"]},{$childId},{$seed["fkQueryID"]},1)";
}