echo "Parsing....\n"; $anchor_tags = parse_array($strHTML, "<a ", "</a>", EXCL); # Put http attributes for each tag into an array $sqlQuery = "INSERT INTO tblLinks(fkParentID,fkChildID,fkQueryID,iNumberTimes) VALUES "; //print "1 sqlQuery is $sqlQuery\n"; $outputExists = false; for ($xx = 0; $xx < count($anchor_tags); $xx++) { //print "tags : ". $anchor_tags[$xx]. "\n"; $href = get_attribute($anchor_tags[$xx], "href"); //print "href = $href , page_base = $page_base \n"; if ($href === false) { continue; } $resolved_address = resolve_address($href, $page_base); //echo "have address: $resolved_address\n"; if (!exclude_link($resolved_address)) { try { $out = ""; $out = db_store_link($seed, $resolved_address); if ($out != NULL && $out != "") { $outputExists = true; $sqlQuery = $sqlQuery . $out . ","; //print "2 sqlQuery is $sqlQuery\n"; } } catch (Exception $e) { echo "***ERROR***\n"; echo "Couldn't store: {$resolved_address}\n"; echo "While harvesting: {$SEED_URL}\n"; break; //ignore any further links (to prevent multiple error messages for one page) }
<?php
/**
 * Smoke test for exclude_link() with a domain whitelist configured.
 *
 * The three $whitelist* variables are defined before the spider library is
 * loaded (presumably exclude_link() reads them as globals -- confirm against
 * LIB_simple_spider.php). Each URL below is then passed to exclude_link() and
 * the result is compared strictly against the expected boolean.
 *
 * The checks are explicit comparisons rather than assert() calls: assert() is
 * compiled out when zend.assertions is not 1, which would make this script
 * pass without verifying anything.
 *
 * Output: nothing on success. On failure, one "FAIL:" line per mismatching
 * URL plus a summary line, and the script exits with status 1.
 */

//Restrict crawling to a whitelist? true | false
$whitelistdomain = true;

//What level of domain to match (-1 any part, 1=tld, 2=sld, etc. e.g. 1=uk, 2=gov.uk, 3=direct.gov.uk)
$whitelistdomainlevel = -1;

//list of domains separated with : (no starting / ending :)
$whitelistdomainlist = "example.com:another.co.uk";

// Resolve the library relative to this file instead of the current working
// directory, so the test can be launched from anywhere.
include_once dirname(__FILE__) . "/../LIB_simple_spider.php";

// URL => expected return value of exclude_link().
$expectations = array(
    "http://example.com" => false,
    "http://www.example.com" => false,
    "http://abcde.example.com" => false,
    "http://www.example.com/hij/klmno" => false,
    "http://abc.def.example.com/hij/klmno" => false,
    "http://abc.def.should-fail.com/hij/klmno" => true,
);

$failures = 0;
foreach ($expectations as $url => $expected) {
    $actual = exclude_link($url);

    // Strict comparison: the original checks used === against true/false.
    if ($actual !== $expected) {
        $failures++;
        echo "FAIL: exclude_link(\"{$url}\") returned " . var_export($actual, true)
            . ", expected " . var_export($expected, true) . "\n";
    }
}

if ($failures > 0) {
    echo "{$failures} of " . count($expectations) . " exclude_link() checks failed\n";
    exit(1);
}