/**
 * Fetches a web page and returns the resolved target of every anchor tag on it.
 *
 * Waits for the number of seconds held in the global $DELAY before issuing
 * the request, then prints one "Harvested: ..." line per anchor found.
 *
 * @param string $url Address of the page whose links are collected.
 * @return array List of addresses, one per anchor tag, in document order.
 */
function harvest_links($url)
{
    global $DELAY;

    # Address that relative hrefs are resolved against
    $base_address = get_base_page_address($url);

    # Pause before hitting the server, then fetch the page
    sleep($DELAY);
    $response = http_get($url, "");

    # Pull out every <a ...>...</a> section of the downloaded document
    $anchors = parse_array($response['FILE'], "<a", "</a>", EXCL);

    # Resolve the href of each anchor and collect the result
    $harvested = array();
    for ($index = 0, $total = count($anchors); $index < $total; $index++) {
        $target = resolve_address(get_attribute($anchors[$index], "href"), $base_address);
        $harvested[] = $target;
        echo "Harvested: {$target} \n";
    }

    return $harvested;
}
Пример #2
0
        }
        $strSQL .= " WHERE iPageID={$pageID}";
        db_run_query($strSQL);
    }
    //die("Preparation done.\n");
}
$seed = db_get_next_to_harvest();
while ($seed != NULL) {
    $SEED_URL = $seed["strURL"];
    // First URL spider downloads
    #$START_PENETRATION = seed['iLevel'];
    # Get links from $SEED_URL
    echo "Harvesting Seed " . $seed["iPageID"] . "; URL   " . $seed["strURL"] . "\n";
    #$link_array = array();
    # Get page base for $url
    $page_base = get_base_page_address($seed["strURL"]);
    # Download webpage
    //global $DELAY;
    //sleep($DELAY);
    $strHTML = "";
    if ($seed["strHTML"] == NULL) {
        //die("No page: "  . $seed["iPageID"]);
        echo "Downloading....\n";
        try {
            $strURL = $seed["strURL"];
            if (exclude_link($seed["strURL"])) {
                throw new Exception("Page in excluded list: {$strURL}\n");
            }
            $downloaded_page = http_get_withheader($seed["strURL"], "");
            # Catch fetch errors, oversize files, non-text extensions etc.
            if ($downloaded_page['ERROR'] !== '') {
Пример #3
0
/**
 * Downloads the images referenced by the <img> tags of a web page and saves
 * them below a local "saved_images_<page base>" directory.
 *
 * Only images whose base domain equals the page's base domain are fetched;
 * others are reported and skipped. Only URLs containing ".jpg", ".gif" or
 * ".png" (case-insensitive) are written to disk. Progress is echoed.
 *
 * The function returns (instead of terminating the script) when the page
 * contains no image tags, so it can be called for several pages in a row.
 *
 * @param string $target Address of the page to scan for images.
 * @return void
 */
function download_images_for_page($target)
{
    echo "target = {$target}\n";
    # Download the web page
    $web_page = http_get($target, "");
    # Update the target in case there was a redirection
    $target = $web_page['STATUS']['url'];
    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);
    # Identify the directory where images are to be saved; the scheme
    # (http or https) is removed so the name contains no "://"
    $save_image_directory = "saved_images_" . preg_replace('#^https?://#i', '', $page_base);
    # Parse the image tags
    $img_tag_array = parse_array($web_page['FILE'], "<img", ">");
    $image_count = count($img_tag_array);
    if ($image_count == 0) {
        echo "No images found at {$target}\n";
        # Return rather than exit: a page without images must not kill the caller
        return;
    }
    # Echo the image source attribute from each image tag
    for ($xx = 0; $xx < $image_count; $xx++) {
        $image_path = get_attribute($img_tag_array[$xx], "src");
        echo " image: " . $image_path;
        $image_url = resolve_address($image_path, $page_base);
        if (get_base_domain_address($page_base) != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }
        # Build the local path from the src attribute. The attribute comes from
        # the remote page, so "." / ".." / empty segments are dropped to keep
        # the file inside $save_image_directory; ":" is replaced because it is
        # not usable in file names on every platform.
        $segments = array();
        foreach (explode("/", str_replace(":", "-", $image_path)) as $segment) {
            if ($segment !== "" && $segment !== "." && $segment !== "..") {
                $segments[] = $segment;
            }
        }
        $local_path = implode("/", $segments);
        # Everything except the last segment is the storage sub-directory
        array_pop($segments);
        $directory = implode("/", $segments);
        # Make image storage directory for image, if one doesn't exist
        clearstatcache();
        // clear cache to get accurate directory status
        if (!is_dir($save_image_directory . "/" . $directory)) {
            mkpath($save_image_directory . "/" . $directory);
        }
        # Download the image, report image size
        $this_image_file = download_binary_file($image_url, "");
        echo " size: " . strlen($this_image_file);
        # Save the image
        $is_known_type = stristr($image_url, ".jpg") || stristr($image_url, ".gif") || stristr($image_url, ".png");
        if ($is_known_type && $local_path !== "") {
            $fp = fopen($save_image_directory . "/" . $local_path, "w");
            if ($fp === false) {
                # fopen() already raised a warning; keep going with the next image
                echo " (could not open file for writing)";
            } else {
                fputs($fp, $this_image_file);
                fclose($fp);
            }
        }
        # Always terminate the report line, saved or not
        echo "\n";
    }
}
/*
	Function tests for LIB_resolve_addresses.php.
	Prints the result of each library call for a set of sample URLs.
*/
include "../util/LIB_resolve_addresses.php";
// Sample inputs: site root, directory, file inside a directory,
// scheme only ($url_4), root-level file, and an address without a scheme ($url_6).
$url_1 = "http://www.bruceyoo.com/";
$url_2 = "http://www.bruceyoo.com/image/";
$url_3 = "http://www.bruceyoo.com/image/index.html";
$url_4 = "http://";
$url_5 = "http://www.burceyoo.com/index.html";
$url_6 = "www.burceyoo.com/index.html";
// Print what get_base_page_address() returns for every sample URL.
echo "{$url_1} base_page: " . get_base_page_address($url_1) . "\n";
echo "{$url_2} base_page: " . get_base_page_address($url_2) . "\n";
echo "{$url_3} base_page: " . get_base_page_address($url_3) . "\n";
echo "{$url_4} base_page: " . get_base_page_address($url_4) . "\n";
echo "{$url_5} base_page: " . get_base_page_address($url_5) . "\n";
echo "{$url_6} base_page: " . get_base_page_address($url_6) . "\n";
echo "--------------------------------------------------------\n";
/*
	get_base_domain_address() should normally be given a base page address;
	this section tests passing the original URLs to it directly.
*/
echo "{$url_1} base_domain: " . get_base_domain_address($url_1) . "\n";
echo "{$url_2} base_domain: " . get_base_domain_address($url_2) . "\n";
echo "{$url_3} base_domain: " . get_base_domain_address($url_3) . "\n";
// echo "url_4 base_domain: ".get_base_domain_address($url_4) ."\n"; # Disabled: reports an error for $url_4; per the original note, execution can continue and the result is empty
echo "{$url_5} base_domain: " . get_base_domain_address($url_5) . "\n";
echo "{$url_6} base_domain: " . get_base_domain_address($url_6) . "\n";
echo "--------------------------------------------------------\n";
// Sample link forms: parent-relative, current-directory-relative,
// root-relative, and a bare relative path.
$link_1 = "../image/book1.jpg";
$link_2 = "./image/book2.jpg";
$link_3 = "/image/book3.jpg";
$link_4 = "image/book4.jpg";