/**
 * Collect the resolved address of every anchor tag on a page.
 *
 * Sleeps for the number of seconds held in the global $DELAY, downloads
 * $url, extracts each "<a" ... "</a>" element, reads its href attribute and
 * resolves it against the page base of $url. Every resolved address is
 * echoed as it is collected.
 *
 * @param string $url Address of the page whose links are harvested.
 * @return array Resolved addresses, one per anchor tag, in the order the
 *               tags were returned by parse_array().
 */
function harvest_links($url)
{
    global $DELAY;

    $harvested = array();

    # Base address against which each href is resolved
    $base_address = get_base_page_address($url);

    # Pause, then fetch the page and pull out its anchor tags
    sleep($DELAY);
    $page = http_get($url, "");
    $anchors = parse_array($page['FILE'], "<a", "</a>", EXCL);

    # Resolve the href of each anchor tag and record the result
    $anchor_total = count($anchors);
    $idx = 0;
    while ($idx < $anchor_total) {
        $resolved = resolve_address(get_attribute($anchors[$idx], "href"), $base_address);
        $harvested[] = $resolved;
        echo "Harvested: " . $resolved . " \n";
        $idx++;
    }

    return $harvested;
}
} $strSQL .= " WHERE iPageID={$pageID}"; db_run_query($strSQL); } //die("Preparation done.\n"); } $seed = db_get_next_to_harvest(); while ($seed != NULL) { $SEED_URL = $seed["strURL"]; // First URL spider downloads #$START_PENETRATION = seed['iLevel']; # Get links from $SEED_URL echo "Harvesting Seed " . $seed["iPageID"] . "; URL " . $seed["strURL"] . "\n"; #$link_array = array(); # Get page base for $url $page_base = get_base_page_address($seed["strURL"]); # Download webpage //global $DELAY; //sleep($DELAY); $strHTML = ""; if ($seed["strHTML"] == NULL) { //die("No page: " . $seed["iPageID"]); echo "Downloading....\n"; try { $strURL = $seed["strURL"]; if (exclude_link($seed["strURL"])) { throw new Exception("Page in excluded list: {$strURL}\n"); } $downloaded_page = http_get_withheader($seed["strURL"], ""); # Catch fetch errors, oversize files, non-text extensions etc. if ($downloaded_page['ERROR'] !== '') {
/**
 * Download the images referenced by a web page and save them locally.
 *
 * Fetches $target, follows any redirection reported in the fetch status,
 * and walks every <img> tag on the page. Images whose base domain matches
 * the page's base domain are downloaded; those whose URL contains ".jpg",
 * ".gif" or ".png" (case-insensitive) are written beneath a
 * "saved_images_<page base>" directory, mirroring the path given in the
 * tag's src attribute. Off-domain images are skipped. Progress is echoed.
 *
 * When the page contains no <img> tags the function reports that and
 * returns; it does not terminate the calling script.
 *
 * @param string $target Address of the page to scan for images.
 * @return void
 */
function download_images_for_page($target)
{
    echo "target = {$target}\n";

    # Download the web page
    $web_page = http_get($target, "");

    # Update the target in case there was a redirection
    $target = $web_page['STATUS']['url'];

    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);

    # Identify the directory where images are to be saved
    $save_image_directory = "saved_images_" . str_replace("http://", "", $page_base);

    # Parse the image tags
    $img_tag_array = parse_array($web_page['FILE'], "<img", ">");
    $image_total = count($img_tag_array);
    if ($image_total == 0) {
        echo "No images found at {$target}\n";
        # Return rather than exit, so a caller processing many pages is not
        # killed by the first page that has no images.
        return;
    }

    # Echo the image source attribute from each image tag
    for ($xx = 0; $xx < $image_total; $xx++) {
        $image_path = get_attribute($img_tag_array[$xx], "src");
        echo " image: " . $image_path;
        $image_url = resolve_address($image_path, $page_base);

        # Only images hosted on the page's own domain are downloaded
        if (get_base_domain_address($page_base) != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }

        # Make image storage directory for image, if one doesn't exist.
        # Colons are replaced so the src value can be used as a file path.
        $directory = substr($image_path, 0, strrpos($image_path, "/"));
        $directory = str_replace(":", "-", $directory);
        $image_path = str_replace(":", "-", $image_path);

        clearstatcache(); // clear cache to get accurate directory status
        if (!is_dir($save_image_directory . "/" . $directory)) {
            mkpath($save_image_directory . "/" . $directory);
        }

        # Download the image, report image size
        $this_image_file = download_binary_file($image_url, "");
        echo " size: " . strlen($this_image_file);

        # Save the image
        if (stristr($image_url, ".jpg") || stristr($image_url, ".gif") || stristr($image_url, ".png")) {
            $local_file = $save_image_directory . "/" . $image_path;
            $fp = fopen($local_file, "w");
            if ($fp === false) {
                # fopen() returns false on failure; writing to it would
                # raise further errors, so report and move to the next image.
                echo "\nCould not open {$local_file} for writing; image not saved.\n";
                continue;
            }
            fputs($fp, $this_image_file);
            fclose($fp);
            echo "\n";
        }
    }
}
/* Tests for the functions in LIB_resolve_addresses.php */
include "../util/LIB_resolve_addresses.php";

# Sample URLs covering a bare domain, a directory, a file, a scheme-only
# string, and a URL without a scheme.
$url_1 = "http://www.bruceyoo.com/";
$url_2 = "http://www.bruceyoo.com/image/";
$url_3 = "http://www.bruceyoo.com/image/index.html";
$url_4 = "http://";
$url_5 = "http://www.burceyoo.com/index.html";
$url_6 = "www.burceyoo.com/index.html";

# Print the page base of every sample URL
foreach (array($url_1, $url_2, $url_3, $url_4, $url_5, $url_6) as $page_sample) {
    echo "{$page_sample} base_page: " . get_base_page_address($page_sample) . "\n";
}
echo "--------------------------------------------------------\n";

/* The base page address should really be passed here; this tests passing the original URL directly. */
// $url_4 is left out on purpose: the call reports an error, although the
// script can keep running and the result comes back empty.
foreach (array($url_1, $url_2, $url_3, $url_5, $url_6) as $domain_sample) {
    echo "{$domain_sample} base_domain: " . get_base_domain_address($domain_sample) . "\n";
}
echo "--------------------------------------------------------\n";

# Sample relative links: parent-relative, current-directory, root-relative, bare
$link_1 = "../image/book1.jpg";
$link_2 = "./image/book2.jpg";
$link_3 = "/image/book3.jpg";
$link_4 = "image/book4.jpg";