/**
 * Collect the resolved address of every anchor tag on a web page.
 *
 * Sleeps for the number of seconds held in the global $DELAY, downloads
 * $url with http_get(), extracts each <a>...</a> element with parse_array(),
 * and resolves each href against the page base address. Every resolved
 * address is echoed as it is collected.
 *
 * @param mixed $url Address of the page to harvest; passed to
 *                   get_base_page_address() and http_get().
 * @return array Resolved addresses, in the order parse_array() returned the tags.
 */
function harvest_links($url) {
    global $DELAY;

    // Base address that relative hrefs are resolved against.
    $base_address = get_base_page_address($url);

    // Pause before the request, then fetch the page.
    sleep($DELAY);
    $response = http_get($url, "");

    $harvested = array();
    foreach (parse_array($response['FILE'], "<a", "</a>", EXCL) as $anchor) {
        $address = resolve_address(get_attribute($anchor, "href"), $base_address);
        $harvested[] = $address;
        echo "Harvested: " . $address . " \n";
    }

    return $harvested;
}
# Directory name for saved images: the page base with every http:// or
# https:// occurrence removed (case-insensitive).
$save_image_directory = "saved_images_" . preg_replace("/(http:\\/\\/|https:\\/\\/)/i", "", $page_base);

# Download the listing page and load it into a DOM.
$web_page = http_get($target, $referer);
$books = array();
$bookCount = 0;
$html = new simple_html_dom();
$html->load($web_page['FILE']);

# Cover images: one entry per image container. "www" is rewritten to "ftp"
# in the src value (NOTE(review): presumably to reach an alternate host --
# confirm; every occurrence of "www" in the URL is replaced).
foreach ($html->find('div.views-field-field-image-cache-fid') as $image) {
    $books[$bookCount]['imageUrl'] = str_replace("www", "ftp", get_attribute($image->find('img', 0), 'src'));
    $bookCount++;
}

# Titles, detail-page URLs and summaries.
$bookCount = 0;
foreach ($html->find('div.views-field-title') as $title) {
    $books[$bookCount]['title'] = trim($title->plaintext);

    # If the title holds several anchors, the last one wins.
    $links = $title->find('a');
    foreach ($links as $link) {
        $books[$bookCount]['pageUrl'] = resolve_address($link->href, $page_base);
    }

    # Only fetch the detail page when the title actually carried a link;
    # otherwise 'pageUrl' is undefined and there is nothing to request.
    if (isset($books[$bookCount]['pageUrl'])) {
        $bookPage = http_get($books[$bookCount]['pageUrl'], $target);
        $bookHTML = new simple_html_dom();
        $bookHTML->load($bookPage['FILE']);

        # Summary is the product body markup with <img ... /> tags removed;
        # if several product-body divs exist, the last one wins.
        foreach ($bookHTML->find('div.product-body') as $summary) {
            $books[$bookCount]['summary'] = remove($summary->outertext, '<img', '/>');
        }
    }

    $bookCount++;
}

# Authors.
$bookCount = 0;
foreach ($html->find('div.views-field-field-author-value') as $author) {
    $books[$bookCount]['author'] = trim($author->plaintext);
    $bookCount++;
}

$bookCount = 0;
# Parse the links $link_array = parse_array($downloaded_page['FILE'], $beg_tag = "<a", $close_tag = ">"); # Verify the links ?> <table border="1" cellpadding="1" cellspacing="0"> <tr bgcolor="#e0e0e0"> <th>URL</th> <th>HTTP CODE</th> <th>DOWNLOAD TIME (seconds)</th> </tr> <?php for ($xx = 0; $xx < count($link_array); $xx++) { // Parse the http attribute from link $link = get_attribute($tag = $link_array[$xx], $attribute = "href"); // Create a fully resolved address $resloved_link_address = resolve_address($link, $page_base); $downloaded_link = http_get($resloved_link_address, $target); ?> <tr> <td align="left"><?php echo $downloaded_link['STATUS']['url']; ?> </td> <td align="right"><?php echo $downloaded_link['STATUS']['http_code']; ?> </td> <td align="right"><?php echo $downloaded_link['STATUS']['total_time']; ?> </td>
/**
 * Download the on-domain images referenced by a web page.
 *
 * Fetches $target, follows any redirect reported in the download status,
 * and walks every <img> tag on the page. Images whose base domain matches
 * the page's are downloaded; those whose URL contains .jpg, .gif or .png
 * are written beneath a "saved_images_<page base>" directory, mirroring
 * the path given in the tag's src attribute. Progress is echoed.
 *
 * NOTE(review): the src value is taken from the downloaded page and used
 * in the destination path after only ":" is replaced, so a src containing
 * "../" segments can point outside the save directory -- consider
 * sanitising before running against untrusted sites.
 *
 * @param mixed $target Address of the page whose images are wanted;
 *                      passed to http_get().
 * @return void
 */
function download_images_for_page($target) {
    echo "target = {$target}\n";

    # Download the web page
    $web_page = http_get($target, "");

    # Update the target in case there was a redirection
    $target = $web_page['STATUS']['url'];

    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);

    # Identify the directory where images are to be saved. Both http:// and
    # https:// are stripped so the scheme never ends up in the directory name.
    $save_image_directory = "saved_images_" . preg_replace('#https?://#i', "", $page_base);

    # Parse the image tags
    $img_tag_array = parse_array($web_page['FILE'], "<img", ">");
    if (count($img_tag_array) == 0) {
        echo "No images found at {$target}\n";
        # Return instead of exit so a caller looping over many pages keeps running.
        return;
    }

    # The page's domain does not change between images; look it up once.
    $page_domain = get_base_domain_address($page_base);

    foreach ($img_tag_array as $img_tag) {
        # Echo the image source attribute from the image tag
        $image_path = get_attribute($img_tag, "src");
        echo " image: " . $image_path;
        $image_url = resolve_address($image_path, $page_base);

        if ($page_domain != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }

        # Directory part of the src path (empty when the path has no "/").
        $last_slash = strrpos($image_path, "/");
        $directory = ($last_slash === false) ? "" : substr($image_path, 0, $last_slash);

        # ":" is replaced with "-" in the local directory and file paths.
        $directory = str_replace(":", "-", $directory);
        $image_path = str_replace(":", "-", $image_path);

        # Make image storage directory for image, if one doesn't exist
        clearstatcache(); // clear cache to get accurate directory status
        if (!is_dir($save_image_directory . "/" . $directory)) {
            mkpath($save_image_directory . "/" . $directory);
        }

        # Download the image, report image size
        $this_image_file = download_binary_file($image_url, "");
        echo " size: " . strlen($this_image_file);

        # Save the image when its URL carries a recognised extension
        if (stristr($image_url, ".jpg") || stristr($image_url, ".gif") || stristr($image_url, ".png")) {
            $destination = $save_image_directory . "/" . $image_path;
            if (file_put_contents($destination, $this_image_file) === false) {
                # Report the failure instead of writing through an invalid handle.
                echo "\nCould not write image to {$destination}\n";
            } else {
                echo "\n";
            }
        }
    }
}
}*/ /*End insert*/ echo "Parsing....\n"; $anchor_tags = parse_array($strHTML, "<a ", "</a>", EXCL); # Put http attributes for each tag into an array $sqlQuery = "INSERT INTO tblLinks(fkParentID,fkChildID,fkQueryID,iNumberTimes) VALUES "; //print "1 sqlQuery is $sqlQuery\n"; $outputExists = false; for ($xx = 0; $xx < count($anchor_tags); $xx++) { //print "tags : ". $anchor_tags[$xx]. "\n"; $href = get_attribute($anchor_tags[$xx], "href"); //print "href = $href , page_base = $page_base \n"; if ($href === false) { continue; } $resolved_address = resolve_address($href, $page_base); //echo "have address: $resolved_address\n"; if (!exclude_link($resolved_address)) { try { $out = ""; $out = db_store_link($seed, $resolved_address); if ($out != NULL && $out != "") { $outputExists = true; $sqlQuery = $sqlQuery . $out . ","; //print "2 sqlQuery is $sqlQuery\n"; } } catch (Exception $e) { echo "***ERROR***\n"; echo "Couldn't store: {$resolved_address}\n"; echo "While harvesting: {$SEED_URL}\n"; break;
if (!is_dir($save_image_directory)) { mkpath($save_image_directory); } $this_image_file = download_binary_file($books[$bookCount]['imageUrl'], $ref = ""); if (stristr($books[$bookCount]['imageUrl'], ".jpg") || stristr($books[$bookCount]['imageUrl'], ".gif") || stristr($books[$bookCount]['imageUrl'], ".png")) { file_put_contents($save_image_directory . basename($books[$bookCount]['imageUrl']), $this_image_file); } } $divClass = 'views-field-title'; if (stristr($div, $divClass)) { $books[$bookCount]['title'] = trim(strip_tags($div)); $aTag = parse_array($div, '<a', '</a>'); if ($cloudflare == 1) { $books[$bookCount]['bookUrl'] = resolve_address(str_replace("www", "ftp", get_attribute($aTag[0], $attribute = "href")), $page_base); } else { $books[$bookCount]['bookUrl'] = resolve_address(get_attribute($aTag[0], $attribute = "href"), $page_base); } $bookPage[$bookCount] = http_get($books[$bookCount]['bookUrl'], $target); $bookDivs[$bookCount] = parse_array($bookPage[$bookCount]['FILE'], "<div class=\"product-body\"", "</div>"); $books[$bookCount]['summary'] = $bookDivs[$bookCount][0]; } $divClass = 'views-field-field-author-value'; if (stristr($div, $divClass)) { $books[$bookCount]['author'] = trim(strip_tags($div)); } $divClass = 'views-field-field-isbn13-value'; if (stristr($div, $divClass)) { $books[$bookCount]['ISBN13'] = trim(strip_tags($div)); } $divClass = 'views-field-field-released-value'; if (stristr($div, $divClass)) {
# Additional sample URLs for the address-helper demonstration.
$url_4 = "http://";
$url_5 = "http://www.burceyoo.com/index.html";
$url_6 = "www.burceyoo.com/index.html";

# Base page address of every sample URL.
foreach (array($url_1, $url_2, $url_3, $url_4, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_page: " . get_base_page_address($sample_url) . "\n";
}
echo "--------------------------------------------------------\n";

# Base domain of the sample URLs. Callers should normally pass the base page
# address; this test passes the original URL directly.
# $url_4 is left out on purpose: per the original note, calling
# get_base_domain_address() on it raises an error (execution can continue,
# and the result is empty).
foreach (array($url_1, $url_2, $url_3, $url_5, $url_6) as $sample_url) {
    echo "{$sample_url} base_domain: " . get_base_domain_address($sample_url) . "\n";
}
echo "--------------------------------------------------------\n";

# Resolve several forms of relative link against one base URL.
$link_1 = "../image/book1.jpg";
$link_2 = "./image/book2.jpg";
$link_3 = "/image/book3.jpg";
$link_4 = "image/book4.jpg";
$base_url = "http://bruce.com.cn/book/sale/";

echo "base_url:" . $base_url . "\n";
foreach (array($link_1, $link_2, $link_3, $link_4) as $sample_link) {
    echo "{$sample_link} resolve_address:" . resolve_address($sample_link, $base_url) . "\n";
}
echo "--------------------------------------------------------\n";