<?php

/*
 * Example: download a web page through LIB_http and print each part of the
 * result.
 *
 * http_get_withheader() fetches a web page together with its header
 * information and returns an array. This script reads three keys from it:
 *   'FILE'   - printed under the "FILE CONTENTS" heading
 *   'ERROR'  - printed under the "ERROR" heading
 *   'STATUS' - printed under the "STATUS" heading
 */

// require_once (rather than include) stops right away with a clear error when
// the library is missing, instead of emitting a warning and then failing on
// the undefined http_get_withheader() call. Anchoring the path on __DIR__
// makes it independent of the directory the script is launched from.
require_once __DIR__ . '/../util/LIB_http.php';

$target = "http://www.schrenk.com/publications.php";
// NOTE(review): presumably sent as the HTTP referer — confirm against LIB_http.
$ref = "http://www.schrenk.com";
$return_array = http_get_withheader($target, $ref);

// Dump every section of the result under its own heading, in the same order
// and with the same heading text as before.
$sections = [
    'FILE'   => "FILE CONTENTS \n",
    'ERROR'  => "ERROR \n",
    'STATUS' => "STATUS \n",
];
foreach ($sections as $key => $heading) {
    echo $heading;
    var_dump($return_array[$key]);
}
// Example #2
// 0
 #$link_array = array();
 # Get page base for $url
 $page_base = get_base_page_address($seed["strURL"]);
 # Download webpage
 //global $DELAY;
 //sleep($DELAY);
 $strHTML = "";
 if ($seed["strHTML"] == NULL) {
     //die("No page: "  . $seed["iPageID"]);
     echo "Downloading....\n";
     try {
         $strURL = $seed["strURL"];
         if (exclude_link($seed["strURL"])) {
             throw new Exception("Page in excluded list: {$strURL}\n");
         }
         $downloaded_page = http_get_withheader($seed["strURL"], "");
         # Catch fetch errors, oversize files, non-text extensions etc.
         if ($downloaded_page['ERROR'] !== '') {
             throw new Exception("Error fetching page: {$downloaded_page['ERROR']}");
         }
         $content_type = $downloaded_page['STATUS']['content_type'];
         $strStatus = $downloaded_page['STATUS'];
         $code = $strStatus["http_code"];
         if ($code != 200 && $code != 206 || strpos(strtolower($content_type), "text") === false) {
             //200 is OK, 206 is partial content
             print "Skipping....http_code is {$code} content_type is {$content_type}\n";
             db_marked_harvested($seed);
             $seed = db_get_next_to_harvest();
             continue;
         }
         $strHTML = $downloaded_page['FILE'];