コード例 #1
0
// Startup maintenance: connect to the database and, on the first run, backfill
// tblPages rows whose strDomain or strCleanURL column is NULL or empty.
// NOTE(review): the original comment here read "Don't let PHP timeout", but no
// timeout-related call is visible in this excerpt - confirm it exists upstream.
db_connect();
$stop = false;
if ($first_run == true) {
    // Before starting, select every page whose domain or clean-URL column is
    // NULL or empty so that the missing value(s) can be filled in below.
    $strSQL = "SELECT * FROM tblPages WHERE strDomain IS NULL OR strDomain='' OR strCleanURL IS NULL  OR strCleanURL=''";
    $statement = $GLOBALS["db"]->prepare($strSQL);
    $result = $statement->execute();
    // PDOStatement::fetch() returns false once the result set is exhausted; the
    // loose "null != false" comparison is false, which terminates the loop.
    while (null != ($row = $statement->fetch(PDO::FETCH_ASSOC))) {
        $url = $row['strURL'];
        // In both branches below, false is a sentinel meaning "this column is
        // already populated, do not include it in the UPDATE".
        if ($row['strDomain'] != null && $row['strDomain'] != '') {
            $domain = false;
        } else {
            // Column is missing: derive the domain from the stored URL.
            $domain = get_domain_part($url, $SAME_DOMAIN_FETCH_LEVEL);
        }
        if ($row['strCleanURL'] != null && $row['strCleanURL'] != '') {
            $cleanURL = false;
        } else {
            // Column is missing: derive the clean URL from the stored URL.
            $cleanURL = clean_url($url);
        }
        $pageID = $row["iPageID"];
        // The UPDATE statement is assembled piecewise, one SET clause per missing column.
        $strSQL = "UPDATE tblPages SET ";
        if ($domain === false && $cleanURL === false) {
            // Should never get here: the SELECT above only returns rows where at
            // least one of the two columns is NULL or empty. Reaching this point
            // suggests something unexpected in the database configuration.
            die("Assert failes: neither domain nor clean url in need of updating");
        }
        if ($domain !== false) {
            // NOTE(review): $domain is interpolated directly into the SQL text.
            // It is derived from a stored URL, so a quote character in it would
            // break (or inject into) the statement - consider a bound parameter.
            $strSQL .= "strDomain='{$domain}' ";
コード例 #2
0
/**
 * Decide whether a discovered link must be skipped by the crawler.
 *
 * A link is excluded as soon as any one of the following holds:
 *   1. it contains the text "javascript" (case-insensitive, anywhere in the
 *      link - so a URL with that word in its path is excluded as well);
 *   2. it is shorter than 9 bytes;
 *   3. it matches one of the regular expressions in $exclusion_array;
 *   4. domain whitelisting is enabled ($whitelistdomain) and the link's domain
 *      is not whitelisted;
 *   5. URL whitelisting is enabled ($whitelisturl) and the link contains none
 *      of the substrings in $whitelisturllist_arr.
 *
 * The checks run cheapest-first and return on the first hit, so the domain
 * helpers are only invoked for links that survived the simpler filters.
 * List globals that are unset or not arrays are treated as empty lists
 * instead of being passed to count(), which raises a TypeError on PHP 8.
 *
 * @param string $link Absolute link to test.
 * @return bool true when the link should be ignored, false when it may be crawled.
 */
function exclude_link($link)
{
    global $exclusion_array;
    global $whitelistdomain, $whitelistdomainlevel, $whitelistdomainlist_arr, $whitelistdomainlist_part;
    global $whitelisturl, $whitelisturllist_arr;

    // Exclude links that are JavaScript commands.
    if (stripos($link, 'javascript') !== false) {
        return true;
    }

    // Exclude links too short to be a usable URL.
    if (strlen($link) < 9) {
        return true;
    }

    // Exclude links matching any configured exclusion pattern (PCRE).
    if (is_array($exclusion_array)) {
        foreach ($exclusion_array as $pattern) {
            if (preg_match($pattern, $link) > 0) {
                return true;
            }
        }
    }

    // Domain whitelist.
    if ($whitelistdomain) {
        // Loose comparison on purpose: the level may arrive as the string "-1".
        if ($whitelistdomainlevel == -1) {
            // Level -1: a whitelist entry may match any part of the domain.
            $domain = get_domain($link);
            $found = false;
            if (is_array($whitelistdomainlist_arr)) {
                foreach ($whitelistdomainlist_arr as $allowedDomain) {
                    if (strpos($domain, $allowedDomain) !== false) {
                        $found = true;
                        break;
                    }
                }
            }
            if (!$found) {
                return true;
            }
        } else {
            // Otherwise the domain part at the configured level must appear,
            // wrapped in colons, inside the $whitelistdomainlist_part string.
            $domain = get_domain_part($link, $whitelistdomainlevel);
            if (strpos($whitelistdomainlist_part, ":{$domain}:") === false) {
                return true;
            }
        }
    }

    // URL whitelist: the link must contain at least one whitelisted substring.
    if ($whitelisturl) {
        $found = false;
        if (is_array($whitelisturllist_arr)) {
            foreach ($whitelisturllist_arr as $allowedFragment) {
                if (strpos($link, $allowedFragment) !== false) {
                    $found = true;
                    break;
                }
            }
        }
        if (!$found) {
            return true;
        }
    }

    return false;
}
コード例 #3
0
/**
 * Resolve a link found on a crawled page to its tblPages id, inserting a new
 * page row when the link is not known yet.
 *
 * Steps:
 *   - decode HTML entities in the link and derive its clean URL and domain part;
 *   - look the clean URL up via checkCache(), falling back to a SELECT on
 *     tblPages.strCleanURL (the lookup result is then passed to addToCache());
 *   - if the page is unknown and $MAX_PENETRATION is 0, give up and return null
 *     (only pages already in the database are to be crawled);
 *   - if the page is unknown otherwise, INSERT it one level below the seed and
 *     cache the id reported by PDO::lastInsertId().
 *
 * NOTE(review): "INSERT INTO ... SET" is MySQL-specific syntax.
 *
 * @param array  $seed Row of the parent page; the keys iPageID, fkQueryID and
 *                     iLevel are read here.
 * @param string $link Link as found in the page, possibly HTML-entity encoded.
 * @return string|null "(parentId,childId,queryId,1)" - presumably a VALUES tuple
 *                     for a bulk insert into the links table (confirm against
 *                     the caller) - or null when the link was not stored.
 */
function db_store_link($seed, $link)
{
    global $SAME_DOMAIN_FETCH_LEVEL, $MAX_PENETRATION;

    $link = html_entity_decode($link);
    $normalizedUrl = clean_url($link);
    $domainPart = get_domain_part($link, $SAME_DOMAIN_FETCH_LEVEL);

    // Cache first, database second.
    $childId = checkCache($normalizedUrl);
    if ($childId == null) {
        $childId = db_run_select(
            "SELECT iPageID FROM tblPages WHERE strCleanURL=?",
            array($normalizedUrl),
            true
        );
        addToCache($normalizedUrl, $childId);
    }

    if ($childId == null) {
        if ($MAX_PENETRATION == 0) {
            // Configured to crawl only pages that are already in the database.
            return null;
        }

        // Unknown page: register it one level deeper than the seed page.
        $insertParams = array(
            $seed["fkQueryID"],
            $link,
            $normalizedUrl,
            $seed["iLevel"] + 1,
            $domainPart,
        );
        db_run_query(
            "INSERT INTO tblPages SET fkQueryID=?,strURL=?,strCleanURL=?,iLevel=?,strDomain=?",
            $insertParams
        );
        $childId = $GLOBALS["db"]->lastInsertId();
        addToCache($normalizedUrl, $childId);
    }

    return "({$seed["iPageID"]},{$childId},{$seed["fkQueryID"]},1)";
}