/**
 * Downloads the on-domain images referenced by the web page at $target.
 *
 * The page is fetched, its <img ...> tags are parsed, and each image whose
 * base domain matches the page's base domain is downloaded. Images whose URL
 * contains ".jpg", ".gif" or ".png" are written beneath a directory named
 * "saved_images_<page base>"; other on-domain images are downloaded (so their
 * size can be reported) but not saved. Progress is echoed as the function runs.
 *
 * Relies on helpers defined elsewhere: http_get(), get_base_page_address(),
 * parse_array(), get_attribute(), resolve_address(),
 * get_base_domain_address(), mkpath() and download_binary_file().
 *
 * @param string $target Address of the web page to scan for images.
 * @return void
 */
function download_images_for_page($target)
{
    echo "target = {$target}\n";

    # Download the web page
    $web_page = http_get($target, "");

    # Update the target in case there was a redirection
    $target = $web_page['STATUS']['url'];

    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);

    # Identify the directory where images are to be saved
    $save_image_directory = "saved_images_" . str_replace("http://", "", $page_base);

    # Parse the image tags
    $img_tag_array = parse_array($web_page['FILE'], "<img", ">");
    if (count($img_tag_array) == 0) {
        echo "No images found at {$target}\n";
        # Return instead of exit: a caller that walks several pages must not
        # be terminated just because one page has no images.
        return;
    }

    # The page's own domain does not change between images, so resolve it once
    $page_domain = get_base_domain_address($page_base);

    # Echo the image source attribute from each image tag
    foreach ($img_tag_array as $img_tag) {
        $image_path = get_attribute($img_tag, "src");
        echo " image: " . $image_path;
        $image_url = resolve_address($image_path, $page_base);

        if ($page_domain != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }

        # Make image storage directory for image, if one doesn't exist.
        # A path without any "/" has no directory part.
        # NOTE(review): $image_path comes straight from the page markup; a src
        # containing "../" can point outside $save_image_directory - confirm
        # whether that needs to be restricted.
        $last_slash = strrpos($image_path, "/");
        $directory = ($last_slash === false) ? "" : substr($image_path, 0, $last_slash);
        $directory = str_replace(":", "-", $directory);
        $image_path = str_replace(":", "-", $image_path);

        clearstatcache(); // clear cache to get accurate directory status
        if (!is_dir($save_image_directory . "/" . $directory)) {
            mkpath($save_image_directory . "/" . $directory);
        }

        # Download the image, report image size
        $this_image_file = download_binary_file($image_url, "");
        echo " size: " . strlen($this_image_file);

        # Save the image
        if (stristr($image_url, ".jpg") || stristr($image_url, ".gif") || stristr($image_url, ".png")) {
            $save_path = $save_image_directory . "/" . $image_path;
            $fp = fopen($save_path, "wb");
            if ($fp === false) {
                # Report the failure and carry on with the remaining images
                echo " (could not open {$save_path} for writing)";
            } else {
                fputs($fp, $this_image_file);
                fclose($fp);
            }
        }

        # Always finish the report line, whether or not the image was saved
        echo "\n";
    }
}
/**
 * Converts a link found on a web page into a fully resolved address.
 *
 * Five kinds of link are recognised:
 *   1. Protocol-relative links ("//host/path"): the page's scheme is prepended.
 *   2. Links starting with "/": appended to the base domain of $page_base.
 *   3. Links starting with "../": the page base is walked back one level
 *      for each leading "../".
 *   4. Links that already start with "http://" or "https://": used as is.
 *   5. Anything else: treated as relative to $page_base.
 *
 * Semicolons and single/double quotes are removed from the link first, and
 * "http://" is prepended when the result carries no http/https protocol.
 *
 * Relies on get_base_domain_address() and move_address_back_one_level(),
 * which are defined elsewhere.
 *
 * @param string $link      Link as it appears in the page markup.
 * @param string $page_base Address of the directory containing the page.
 * @return string The resolved address.
 */
function resolve_address($link, $page_base)
{
    # Case-insensitive test for an explicit http/https protocol identifier
    $protocol_pattern = '#^https?://#i';

    #----------------------------------------------------------
    # CONDITION INCOMING LINK ADDRESS
    #
    $link = trim($link);
    $page_base = trim($page_base);

    # if there isn't one, put a "/" at the end of the $page_base
    if (substr($page_base, -1) !== "/") {
        $page_base = $page_base . "/";
    }

    # remove unwanted characters (; " ') from $link
    $link = str_replace(array(";", "\"", "'"), "", $link);

    if (substr($link, 0, 2) == "//") {
        #------------------------------------------------------
        # PROTOCOL-RELATIVE LINK: reuse the scheme of the page base
        #
        $scheme = preg_match('#^(https?):#i', $page_base, $matches) ? strtolower($matches[1]) : "http";
        $abs_address = $scheme . ":" . $link;
    } elseif (substr($link, 0, 1) == "/") {
        #------------------------------------------------------
        # REFERENCE TO THE BASE DOMAIN ADDRESS
        #
        $abs_address = get_base_domain_address($page_base) . $link;
    } elseif (substr($link, 0, 3) == "../") {
        #------------------------------------------------------
        # REFERENCE TO HIGHER DIRECTORIES
        #
        # $page_base always ends in "/" here; drop it before walking back
        $page_base = substr($page_base, 0, -1);
        $right_most_slash = strrpos($page_base, "/");

        # When the only slashes left belong to the protocol identifier (or
        # there are none), the base is kept as it is and not walked back.
        if ($right_most_slash < 8) {
            $unadjusted_base_address = $page_base;
        }

        # Bring the page base back one level per leading "../".
        # presumably move_address_back_one_level() strips one "../" from the
        # link each time - verify against its definition
        do {
            list($page_base, $link) = move_address_back_one_level($page_base, $link);
        } while (substr($link, 0, 3) == "../");

        if (isset($unadjusted_base_address)) {
            $abs_address = $unadjusted_base_address . "/" . $link;
        } else {
            $abs_address = $page_base . "/" . $link;
        }
    } elseif (preg_match($protocol_pattern, $link)) {
        #------------------------------------------------------
        # REFERENCE THAT IS ALREADY ABSOLUTE
        #
        # A full "http://" or "https://" prefix is required, so relative
        # links that merely begin with the letters "http" stay relative.
        $abs_address = $link;
    } else {
        #------------------------------------------------------
        # REFERENCE RELATIVE TO THE PAGE BASE
        #
        $abs_address = str_replace("/./", "/", $page_base . $link);
    }

    #----------------------------------------------------------
    # ADD PROTOCOL IDENTIFIER IF NEEDED
    #
    if (!preg_match($protocol_pattern, $abs_address)) {
        $abs_address = "http://" . $abs_address;
    }

    return $abs_address;
}
# Remaining sample addresses ($url_1 .. $url_3 are defined earlier in the script)
$url_4 = "http://";
$url_5 = "http://www.burceyoo.com/index.html";
$url_6 = "www.burceyoo.com/index.html";

# Print the page base derived from every sample address
foreach (array($url_1, $url_2, $url_3, $url_4, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_page: " . get_base_page_address($sample_url) . "\n";
}
echo "--------------------------------------------------------\n";

# Print the base domain of each sample address. The base page address should
# really be passed in; this run feeds the original URL directly as a test.
# $url_4 is left out on purpose: per the original author's note it reports an
# error here, although execution apparently continues with an empty result.
foreach (array($url_1, $url_2, $url_3, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_domain: " . get_base_domain_address($sample_url) . "\n";
}
echo "--------------------------------------------------------\n";

# Sample links, one for each link form, resolved against a common base
$link_1 = "../image/book1.jpg";
$link_2 = "./image/book2.jpg";
$link_3 = "/image/book3.jpg";
$link_4 = "image/book4.jpg";
$base_url = "http://bruce.com.cn/book/sale/";

echo "base_url:" . $base_url . "\n";
foreach (array($link_1, $link_2, $link_3, $link_4) as $sample_link) {
    echo "{$sample_link} resolve_address:" . resolve_address($sample_link, $base_url) . "\n";
}
echo "--------------------------------------------------------\n";