/**
 * Fetch a page and collect the resolved address of every anchor tag on it.
 *
 * Sleeps for the number of seconds held in the global $DELAY before the
 * download, and echoes one "Harvested: ..." line per link as it is collected.
 *
 * @param string $url Address of the page whose links are harvested.
 * @return array Resolved link addresses, in the order the anchor tags were parsed.
 */
function harvest_links($url)
{
    global $DELAY;

    # Base address that each href is resolved against
    $base_address = get_base_page_address($url);

    # Pause, then download the page
    sleep($DELAY);
    $response = http_get($url, "");

    # Resolve the href of every anchor tag found in the downloaded file
    $harvested = array();
    foreach (parse_array($response['FILE'], "<a", "</a>", EXCL) as $anchor) {
        $resolved = resolve_address(get_attribute($anchor, "href"), $base_address);
        $harvested[] = $resolved;
        echo "Harvested: " . $resolved . " \n";
    }

    return $harvested;
}
Example #2
0
# Script fragment: downloads $target, then fills $books with one entry per
# book found on the page (image URL, title, detail-page URL, summary, author).
# NOTE(review): $page_base, $target, $referer and $downloaded_page are not
# assigned anywhere in this fragment; they must be set before it runs.

# Directory name for saved images: "saved_images_" followed by the page base
# with any "http://" or "https://" removed (case-insensitive)
$save_image_directory = "saved_images_" . preg_replace("/(http:\\/\\/|https:\\/\\/)/i", "", $page_base);
# Download the listing page
$web_page = http_get($target, $referer);
$books = array();
$bookCount = 0;
# Parse the downloaded file with simple_html_dom
$html = new simple_html_dom();
$html->load($web_page['FILE']);
# Pass 1: image URL of each book, taken from the src of the first <img> in each
# image div. Every "www" in the src is replaced with "ftp".
# NOTE(review): the reason for the www -> ftp swap is not visible here - confirm.
foreach ($html->find('div.views-field-field-image-cache-fid') as $image) {
    $books[$bookCount]['imageUrl'] = str_replace("www", "ftp", get_attribute($image->find('img', 0), 'src'));
    $bookCount++;
}
# Restart the index so the titles are written into the same $books slots as the
# images. NOTE(review): this assumes each pass finds the same number of divs in
# the same order - verify against the target page.
$bookCount = 0;
# Pass 2: title, detail-page URL and summary of each book
foreach ($html->find('div.views-field-title') as $title) {
    $books[$bookCount]['title'] = trim($title->plaintext);
    $links = $title->find('a');
    # Each anchor overwrites pageUrl, so the last anchor in the title div wins
    foreach ($links as $link) {
        $books[$bookCount]['pageUrl'] = resolve_address($link->href, $page_base);
    }
    # Fetch the book's own page, passing the listing page address as the second
    # argument. NOTE(review): pageUrl is unset here if the title div had no anchor.
    $bookPage = http_get($books[$bookCount]['pageUrl'], $target);
    $bookHTML = new simple_html_dom();
    $bookHTML->load($bookPage['FILE']);
    # Each product-body div overwrites summary, so the last one wins.
    # remove() presumably strips the <img ... /> spans from the markup - TODO confirm.
    foreach ($bookHTML->find('div.product-body') as $summary) {
        $books[$bookCount]['summary'] = remove($summary->outertext, '<img', '/>');
    }
    $bookCount++;
}
# Pass 3: author of each book, again indexed by position
$bookCount = 0;
foreach ($html->find('div.views-field-field-author-value') as $author) {
    $books[$bookCount]['author'] = trim($author->plaintext);
    $bookCount++;
}
$bookCount = 0;
# Parse the links
# NOTE(review): this reads $downloaded_page, while the page fetched in this
# fragment is stored in $web_page - confirm which one is intended.
# The call also assigns $beg_tag and $close_tag as a side effect.
$link_array = parse_array($downloaded_page['FILE'], $beg_tag = "<a", $close_tag = ">");
# Verify the links
?>
<table border="1" cellpadding="1" cellspacing="0">
    <tr bgcolor="#e0e0e0">
        <th>URL</th>
        <th>HTTP CODE</th>
        <th>DOWNLOAD TIME (seconds)</th>
    </tr>
<?php 
for ($xx = 0; $xx < count($link_array); $xx++) {
    // Parse the http attribute from link
    $link = get_attribute($tag = $link_array[$xx], $attribute = "href");
    // Create a fully resolved address
    $resloved_link_address = resolve_address($link, $page_base);
    $downloaded_link = http_get($resloved_link_address, $target);
    ?>
    <tr>
        <td align="left"><?php 
    echo $downloaded_link['STATUS']['url'];
    ?>
</td>
        <td align="right"><?php 
    echo $downloaded_link['STATUS']['http_code'];
    ?>
</td>
        <td align="right"><?php 
    echo $downloaded_link['STATUS']['total_time'];
    ?>
</td>
/**
 * Download a web page and save every image hosted on the page's own domain.
 *
 * Images are written below a "saved_images_<page base>" directory, mirroring
 * the path given in each tag's src attribute (with ":" replaced by "-").
 * Only files whose URL contains .jpg, .gif or .png are written to disk.
 * Progress is echoed as the page is processed.
 *
 * When the page contains no image tags the function reports it and returns;
 * it does not terminate the calling script.
 *
 * NOTE(review): src values come from the remote page and are used to build
 * local paths; a src containing "../" segments can point outside the save
 * directory. Consider rejecting such paths if the targets are untrusted.
 *
 * @param string $target Address of the page to process.
 * @return void
 */
function download_images_for_page($target)
{
    echo "target = {$target}\n";

    # Download the web page
    $web_page = http_get($target, "");

    # Update the target in case there was a redirection
    $target = $web_page['STATUS']['url'];

    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);

    # Identify the directory where images are to be saved. Both http:// and
    # https:// are removed so the scheme never ends up in the directory name.
    $save_image_directory = "saved_images_" . preg_replace('#https?://#i', "", $page_base);

    # Parse the image tags
    $img_tag_array = parse_array($web_page['FILE'], "<img", ">");
    if (count($img_tag_array) == 0) {
        echo "No images found at {$target}\n";
        # Return rather than exit, so a caller processing several pages can go on
        return;
    }

    # Echo the image source attribute from each image tag
    foreach ($img_tag_array as $img_tag) {
        $image_path = get_attribute($img_tag, "src");
        echo " image: " . $image_path;
        $image_url = resolve_address($image_path, $page_base);

        if (get_base_domain_address($page_base) != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }

        # Directory part of the src path; empty when the path has no "/"
        $last_slash = strrpos($image_path, "/");
        $directory = ($last_slash === false) ? "" : substr($image_path, 0, $last_slash);

        # ":" (e.g. from an absolute URL) is replaced in local file names
        $directory = str_replace(":", "-", $directory);
        $image_path = str_replace(":", "-", $image_path);

        # Make image storage directory for image, if one doesn't exist.
        # Clear the stat cache first to get accurate directory status.
        clearstatcache();
        if (!is_dir($save_image_directory . "/" . $directory)) {
            mkpath($save_image_directory . "/" . $directory);
        }

        # Download the image, report image size
        $this_image_file = download_binary_file($image_url, "");
        echo " size: " . strlen($this_image_file);

        # Save the image
        if (stristr($image_url, ".jpg") || stristr($image_url, ".gif") || stristr($image_url, ".png")) {
            $destination = $save_image_directory . "/" . $image_path;
            # Report a failed write instead of passing an invalid handle on
            if (file_put_contents($destination, $this_image_file) === false) {
                echo "\nCould not save image to {$destination}\n";
            } else {
                echo "\n";
            }
        }
    }
}
Example #5
0
 		}*/
 /*End insert*/
 echo "Parsing....\n";
 $anchor_tags = parse_array($strHTML, "<a ", "</a>", EXCL);
 # Put http attributes for each tag into an array
 $sqlQuery = "INSERT INTO tblLinks(fkParentID,fkChildID,fkQueryID,iNumberTimes) VALUES ";
 //print "1 sqlQuery is $sqlQuery\n";
 $outputExists = false;
 for ($xx = 0; $xx < count($anchor_tags); $xx++) {
     //print "tags : ". $anchor_tags[$xx]. "\n";
     $href = get_attribute($anchor_tags[$xx], "href");
     //print "href = $href , page_base = $page_base \n";
     if ($href === false) {
         continue;
     }
     $resolved_address = resolve_address($href, $page_base);
     //echo "have address: $resolved_address\n";
     if (!exclude_link($resolved_address)) {
         try {
             $out = "";
             $out = db_store_link($seed, $resolved_address);
             if ($out != NULL && $out != "") {
                 $outputExists = true;
                 $sqlQuery = $sqlQuery . $out . ",";
                 //print "2 sqlQuery is $sqlQuery\n";
             }
         } catch (Exception $e) {
             echo "***ERROR***\n";
             echo "Couldn't store: {$resolved_address}\n";
             echo "While harvesting: {$SEED_URL}\n";
             break;
Example #6
0
     if (!is_dir($save_image_directory)) {
         mkpath($save_image_directory);
     }
     $this_image_file = download_binary_file($books[$bookCount]['imageUrl'], $ref = "");
     if (stristr($books[$bookCount]['imageUrl'], ".jpg") || stristr($books[$bookCount]['imageUrl'], ".gif") || stristr($books[$bookCount]['imageUrl'], ".png")) {
         file_put_contents($save_image_directory . basename($books[$bookCount]['imageUrl']), $this_image_file);
     }
 }
 $divClass = 'views-field-title';
 if (stristr($div, $divClass)) {
     $books[$bookCount]['title'] = trim(strip_tags($div));
     $aTag = parse_array($div, '<a', '</a>');
     if ($cloudflare == 1) {
         $books[$bookCount]['bookUrl'] = resolve_address(str_replace("www", "ftp", get_attribute($aTag[0], $attribute = "href")), $page_base);
     } else {
         $books[$bookCount]['bookUrl'] = resolve_address(get_attribute($aTag[0], $attribute = "href"), $page_base);
     }
     $bookPage[$bookCount] = http_get($books[$bookCount]['bookUrl'], $target);
     $bookDivs[$bookCount] = parse_array($bookPage[$bookCount]['FILE'], "<div class=\"product-body\"", "</div>");
     $books[$bookCount]['summary'] = $bookDivs[$bookCount][0];
 }
 $divClass = 'views-field-field-author-value';
 if (stristr($div, $divClass)) {
     $books[$bookCount]['author'] = trim(strip_tags($div));
 }
 $divClass = 'views-field-field-isbn13-value';
 if (stristr($div, $divClass)) {
     $books[$bookCount]['ISBN13'] = trim(strip_tags($div));
 }
 $divClass = 'views-field-field-released-value';
 if (stristr($div, $divClass)) {
# Demo script: prints what get_base_page_address(), get_base_domain_address()
# and resolve_address() produce for a handful of sample inputs.
$url_4 = "http://";
$url_5 = "http://www.burceyoo.com/index.html";
$url_6 = "www.burceyoo.com/index.html";
$divider = "--------------------------------------------------------\n";

# Page base of every sample URL
foreach (array($url_1, $url_2, $url_3, $url_4, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_page: " . get_base_page_address($sample_url) . "\n";
}
echo $divider;

/*
	The base page address should be used here; this tests passing the
	original URL directly.
*/
# $url_4 is left out of this pass. Original note on the disabled line: it
# raises an error, though execution can apparently continue and the result is empty.
// echo "url_4 base_domain: ".get_base_domain_address($url_4) ."\n";
foreach (array($url_1, $url_2, $url_3, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_domain: " . get_base_domain_address($sample_url) . "\n";
}
echo $divider;

# Relative links resolved against a fixed base URL
$link_1 = "../image/book1.jpg";
$link_2 = "./image/book2.jpg";
$link_3 = "/image/book3.jpg";
$link_4 = "image/book4.jpg";
$base_url = "http://bruce.com.cn/book/sale/";
echo "base_url:" . $base_url . "\n";
foreach (array($link_1, $link_2, $link_3, $link_4) as $sample_link) {
    echo "{$sample_link} resolve_address:" . resolve_address($sample_link, $base_url) . "\n";
}
echo $divider;