<?php
/*
 * Demo script for LIB_http.
 *
 * Uses http_get_withheader() to download a web page together with its
 * header information. The call returns an array; this script dumps the
 * three entries it reads from that array:
 *   'FILE'   - dumped under the "FILE CONTENTS" heading
 *   'ERROR'  - dumped under the "ERROR" heading
 *   'STATUS' - dumped under the "STATUS" heading
 */

// Anchor the library path to this script's own directory. A path beginning
// with "../" is otherwise resolved against the current working directory, so
// the script would only work when launched from its own folder. require_once
// (rather than include) stops immediately with a clear message if the library
// is missing, instead of warning and then failing on an undefined function.
require_once __DIR__ . '/../util/LIB_http.php';

// Page to download and the referer value passed along with the request.
$target = "http://www.schrenk.com/publications.php";
$ref = "http://www.schrenk.com";

$return_array = http_get_withheader($target, $ref);

echo "FILE CONTENTS \n";
var_dump($return_array['FILE']);

echo "ERROR \n";
var_dump($return_array['ERROR']);

echo "STATUS \n";
var_dump($return_array['STATUS']);
#$link_array = array(); # Get page base for $url $page_base = get_base_page_address($seed["strURL"]); # Download webpage //global $DELAY; //sleep($DELAY); $strHTML = ""; if ($seed["strHTML"] == NULL) { //die("No page: " . $seed["iPageID"]); echo "Downloading....\n"; try { $strURL = $seed["strURL"]; if (exclude_link($seed["strURL"])) { throw new Exception("Page in excluded list: {$strURL}\n"); } $downloaded_page = http_get_withheader($seed["strURL"], ""); # Catch fetch errors, oversize files, non-text extensions etc. if ($downloaded_page['ERROR'] !== '') { throw new Exception("Error fetching page: {$downloaded_page['ERROR']}"); } $content_type = $downloaded_page['STATUS']['content_type']; $strStatus = $downloaded_page['STATUS']; $code = $strStatus["http_code"]; if ($code != 200 && $code != 206 || strpos(strtolower($content_type), "text") === false) { //200 is OK, 206 is partial content print "Skipping....http_code is {$code} content_type is {$content_type}\n"; db_marked_harvested($seed); $seed = db_get_next_to_harvest(); continue; } $strHTML = $downloaded_page['FILE'];