Example #1
0
 /**
  * @brief Processes a resource at a specified URL.  This will first check and see if the
  * resource has already been loaded and just return the hash to that resource.  If not
  * it will perform an HTTP request to load the resource, save it and return the
  * resulting hash.
  *
  * Resources whose content type contains "css" are run through processCss() before
  * they are saved.
  *
  * @param string $url The absolute URL of the resource to process.
  * @param boolean $followRedirects Whether to follow HTTP redirects if the resource has moved.
  * @param array $locations Locations that have already been followed, keyed by URL with
  *	boolean true values.  This is used to detect redirect loops.
  * @return string MD5 hash of the URL the resource was processed under.  When a redirect
  *	was followed this is the hash of the redirect target, not of the original $url.
  *	This can be used to look up the resource later with loadResource().
  * @throws Exception if a redirect is received while $followRedirects is false, if a
  *	redirect response carries no Location header, if a redirect loop is detected, or
  *	if any other non-200 status code is received.
  */
 public function processResource($url, $followRedirects = false, $locations = array())
 {
     // Nothing to do if this URL has already been loaded.
     $hash = md5($url);
     if ($this->loadResource($hash)) {
         return $hash;
     }

     require_once 'inc/ProxyClient.php';
     $client = new ProxyClient();
     $client->clear();
     $client->URL = $url;
     // The body is requested because the content itself is what gets saved.
     $client->noBody = false;
     $client->process();

     // Read the status once, as an int, so the comparisons below can be strict.
     $httpCode = (int) $client->status['http_code'];

     if ($httpCode === 200) {
         $content = $client->content;
         if (preg_match('#css#', $client->contentType)) {
             $content = $this->processCss($content, $client->URL);
         }
         $this->saveResource($url, $content, $client->contentType);
         return $hash;
     }

     // Anything that is neither a success nor a redirect is a hard failure.
     if ($httpCode < 300 || $httpCode >= 400) {
         throw new Exception("Failed to process page {$url}.  Received HTTP response code: " . $httpCode);
     }

     if (!$followRedirects) {
         throw new Exception("Failed to process page " . $url . " because the page has moved and followRedirects is set to false in this invocation.");
     }

     // Redirect: take the target from the first Location header.
     $location = null;
     foreach ($client->headers as $header) {
         if (preg_match('/^Location:(.*)$/i', $header, $matches)) {
             $location = trim($matches[1]);
             break;
         }
     }
     // Explicit null/empty check so that a literal "0" is not mistaken for "missing".
     if ($location === null || $location === '') {
         throw new Exception("Received an http status code of " . $httpCode . " but no location header was found.");
     }

     // Record the current URL before checking the target so that a redirect
     // pointing back at itself is also reported as a loop.
     $locations[$url] = true;
     if (isset($locations[$location])) {
         throw new Exception("Redirect loop found: " . $location);
     }

     // NOTE(review): the Location value is passed on as-is; a relative
     // reference is not resolved against $url here - confirm whether
     // ProxyClient copes with that.
     return $this->processResource($location, $followRedirects, $locations);
 }
Example #2
0
 /**
  * @brief Loads a webpage and processes it.   This will glean the content-type,
  * response code, and possibly the page content of the webpage and attach
  * this information to a node and add the node to the tree.
  *
  * The page is first requested without its body to obtain the status code and
  * content type; a node carrying those two values is added for $url regardless of
  * the outcome.  Only when the status is 200 and the content type looks like
  * XML/HTML is the page requested again with its body and handed to processHtml().
  *
  * @param string $url The URL of the webpage to process.
  * @param boolean $followRedirects If true then this will follow redirects if the server
  * 	returns a redirect response code (e.g. 302)
  * @param array $locations An array of locations that have already been visited.  The
  * 	keys are URLs and the values are booleans.  This just makes it easier to look
  * 	up existence.  This array is used to prevent infinite loops.
  * @return boolean True if the page was processed successfully and the node added.
  * @throws Exception if there was a problem processing the page. This will include
  * if a 400 error is returned or some other common occurrence so this exception MUST
  * be caught and handled or you'll regularly get uncaught exception errors.  Specific
  * causes: unparsable content type, a non-200 response to either request, a redirect
  * while $followRedirects is false, a redirect without a Location header, or a
  * redirect loop.
  */
 public function processPage($url, $followRedirects = false, $locations = array())
 {
     error_log('Processing ' . $url);

     require_once 'inc/ProxyClient.php';
     $client = new ProxyClient();
     $client->clear();
     $client->URL = $url;
     // First pass without the body: only the status and content type are needed.
     $client->noBody = true;
     $client->process();

     // Read the status once, as an int, so the comparisons below can be strict.
     $httpCode = (int) $client->status['http_code'];

     // The node is recorded for every outcome, including redirects and errors.
     $this->addNodeAtAbsoluteUrl($url, array('httpStatus' => $client->status['http_code'], 'contentType' => $client->contentType));

     if ($httpCode === 200) {
         if (!preg_match('#xml|html|xhtml#', $client->contentType)) {
             throw new Exception("Failed to process page {$url} because it is not a parsable content type: " . $client->contentType);
         }

         // Second pass: fetch the actual page content for parsing.
         $client->noBody = false;
         $client->process();

         // The body request can fail even though the first request succeeded;
         // do not feed an error response to the HTML processor.
         $bodyCode = (int) $client->status['http_code'];
         if ($bodyCode !== 200) {
             throw new Exception("Failed to process page {$url}.  Received HTTP response code: " . $bodyCode);
         }

         $this->processHtml($client->content, $client->URL);
         return true;
     }

     // Anything that is neither a success nor a redirect is a hard failure.
     if ($httpCode < 300 || $httpCode >= 400) {
         throw new Exception("Failed to process page {$url}.  Received HTTP response code: " . $httpCode);
     }

     if (!$followRedirects) {
         throw new Exception("Failed to process page " . $url . " because the page has moved and followRedirects is set to false in this invocation.");
     }

     // Redirect: take the target from the first Location header.
     $location = null;
     foreach ($client->headers as $header) {
         if (preg_match('/^Location:(.*)$/i', $header, $matches)) {
             $location = trim($matches[1]);
             break;
         }
     }
     // Explicit null/empty check so that a literal "0" is not mistaken for "missing".
     if ($location === null || $location === '') {
         throw new Exception("Received an http status code of " . $httpCode . " but no location header was found.");
     }

     // Record the current URL before checking the target so that a redirect
     // pointing back at itself is also reported as a loop.
     $locations[$url] = true;
     if (isset($locations[$location])) {
         throw new Exception("Redirect loop found: " . $location);
     }

     // NOTE(review): the Location value is passed on as-is; a relative
     // reference is not resolved against $url here - confirm whether
     // ProxyClient copes with that.
     return $this->processPage($location, $followRedirects, $locations);
 }