Пример #1
0
 // Walk every anchor found in $strHTML, resolve its href against $page_base and,
 // for each link that exclude_link() does not reject, append the fragment
 // returned by db_store_link() to one multi-row INSERT statement for tblLinks.
 // NOTE(review): this excerpt begins and ends mid-scope; the enclosing function
 // and the code that finally executes $sqlQuery are not visible here.
 echo "Parsing....\n";
 # presumably yields the text between each "<a " and "</a>" pair - TODO confirm against parse_array() and EXCL
 $anchor_tags = parse_array($strHTML, "<a ", "</a>", EXCL);
 # Prefix of the INSERT statement; one VALUES fragment per stored link is appended inside the loop
 $sqlQuery = "INSERT INTO tblLinks(fkParentID,fkChildID,fkQueryID,iNumberTimes) VALUES ";
 //print "1 sqlQuery is $sqlQuery\n";
 # Flipped to true as soon as at least one fragment has been appended to $sqlQuery
 $outputExists = false;
 for ($xx = 0; $xx < count($anchor_tags); $xx++) {
     //print "tags : ". $anchor_tags[$xx]. "\n";
     $href = get_attribute($anchor_tags[$xx], "href");
     //print "href = $href , page_base = $page_base \n";
     // get_attribute() signalled failure with a strict false: skip this anchor
     if ($href === false) {
         continue;
     }
     $resolved_address = resolve_address($href, $page_base);
     //echo "have address: $resolved_address\n";
     // Only links that exclude_link() does not reject are stored
     if (!exclude_link($resolved_address)) {
         try {
             $out = "";
             // NOTE(review): the returned fragment is concatenated straight into the SQL text;
             // confirm that db_store_link() escapes or validates the values it emits.
             $out = db_store_link($seed, $resolved_address);
             // Loose comparison: a null or empty result means there is nothing to append for this link
             if ($out != NULL && $out != "") {
                 $outputExists = true;
                 // Every fragment is followed by a comma, so $sqlQuery carries a trailing comma
                 // after the loop; it has to be removed before execution (not shown in this excerpt).
                 $sqlQuery = $sqlQuery . $out . ",";
                 //print "2 sqlQuery is $sqlQuery\n";
             }
         } catch (Exception $e) {
             echo "***ERROR***\n";
             echo "Couldn't store: {$resolved_address}\n";
             // NOTE(review): the message reads $SEED_URL while the call above passes $seed;
             // confirm that $SEED_URL is defined in this scope.
             echo "While harvesting: {$SEED_URL}\n";
             break;
             //ignore any further links (to prevent multiple error messages for one page)
         }
Пример #2
0
<?php

/**
 * Stand-alone check of exclude_link() with the domain whitelist switched on.
 *
 * The $whitelist* variables are defined as globals before the spider library
 * is loaded (presumably exclude_link() reads them - verify against
 * LIB_simple_spider.php). The script is silent on success; on failure it
 * prints one line per mismatch and exits with status 1.
 *
 * Explicit comparisons are used instead of assert() because assert() is a
 * no-op when assertions are disabled in php.ini, which would let the script
 * "pass" without checking anything.
 */

//Restrict crawling to a whitelist? true | false
$whitelistdomain = true;
//What level of domain to match (-1 any part, 1=tld, 2=sld, etc. e.g. 1=uk, 2=gov.uk, 3=direct.gov.uk)
$whitelistdomainlevel = -1;
//list of domains separated with : (no starting / ending :)
$whitelistdomainlist = "example.com:another.co.uk";

// Anchor the path on this file's directory so the script works from any
// working directory, and use require_once so a missing library stops the run
// immediately instead of failing later on an undefined function.
require_once __DIR__ . "/../LIB_simple_spider.php";

// URL => value exclude_link() is expected to return (true means "excluded").
$expectations = [
    "http://example.com" => false,
    "http://www.example.com" => false,
    "http://abcde.example.com" => false,
    "http://www.example.com/hij/klmno" => false,
    "http://abc.def.example.com/hij/klmno" => false,
    "http://abc.def.should-fail.com/hij/klmno" => true,
];

$failures = 0;
foreach ($expectations as $url => $expected) {
    $actual = exclude_link($url);
    // Strict comparison, matching the === checks of the original assertions.
    if ($actual !== $expected) {
        $failures++;
        echo "FAIL: exclude_link(\"{$url}\") returned " . var_export($actual, true)
            . ", expected " . var_export($expected, true) . "\n";
    }
}

if ($failures > 0) {
    exit(1);
}