/**
 * @brief Processes a resource at a specified URL.  This first checks whether the
 * resource has already been loaded and, if so, just returns the hash of that resource.
 * Otherwise it performs an HTTP request to load the resource, saves it with
 * saveResource(), and returns the resulting hash.
 *
 * Responses whose content type contains "css" are passed through processCss()
 * before they are saved.
 *
 * @param string $url The absolute URL of the resource to process.
 * @param boolean $followRedirects Whether to follow HTTP redirects if the resource has moved.
 * @param array $locations Locations that have already been followed in this redirect
 *      chain.  The keys are URLs and the values are booleans, which makes existence
 *      checks cheap.  This is used to prevent redirect loops.
 * @return string MD5 hash of the URL that was actually fetched.  When a redirect is
 *      followed this is the hash of the redirect target, not of the URL originally
 *      passed in.  It can be used to look up the resource later with loadResource().
 * @throws Exception If the response is a redirect and $followRedirects is false, if a
 *      redirect response carries no Location header, if a redirect loop is detected,
 *      or if any other non-200 status code is received.
 */
public function processResource($url, $followRedirects = false, $locations = array())
{
    // Short-circuit when the resource has already been loaded.
    $hash = md5($url);
    if ($this->loadResource($hash)) {
        return $hash;
    }

    require_once 'inc/ProxyClient.php';
    $client = new ProxyClient();
    $client->clear();
    $client->URL = $url;
    $client->noBody = false; // the body is required here: it is what gets saved
    $client->process();

    // The comparisons below stay loose on purpose: ProxyClient is not visible from
    // here, so the status code may be an int or a numeric string.
    $status = $client->status['http_code'];

    if ($status == 200) {
        if (preg_match('#css#', $client->contentType)) {
            $client->content = $this->processCss($client->content, $client->URL);
        }
        $this->saveResource($url, $client->content, $client->contentType);
        return $hash;
    }

    $isRedirect = ($status >= 300 && $status < 400);
    if (!$isRedirect) {
        throw new Exception("Failed to process page {$url}. Received HTTP response code: " . $status);
    }

    if (!$followRedirects) {
        throw new Exception("Failed to process page " . $url
            . " because the page has moved and followRedirects is set to false in this invocation.");
    }

    // Pull the redirect target out of the first Location header.
    $location = null;
    foreach ($client->headers as $header) {
        if (preg_match('/^Location:(.*)$/i', $header, $matches)) {
            $location = trim($matches[1]);
            break;
        }
    }
    if (!$location) {
        throw new Exception("Received an http status code of " . $status
            . " but no location header was found.");
    }

    // Remember the URL just requested so a chain that comes back to it is rejected.
    $locations[$url] = true;
    if (isset($locations[$location])) {
        throw new Exception("Redirect loop found: " . $location);
    }

    // NOTE(review): the Location value is used verbatim as the next URL; a relative
    // Location is not resolved against $url here -- confirm ProxyClient copes with that.
    return $this->processResource($location, $followRedirects, $locations);
}
/**
 * @brief Loads a webpage and processes it.  A first, body-less request gleans the
 * response code and content-type, which are recorded for the URL through
 * addNodeAtAbsoluteUrl().  If the page is a 200 response with an xml/html content
 * type, it is requested again with its body and handed to processHtml().
 *
 * Note that the node is recorded for every URL that is requested, including URLs
 * that answer with a redirect or an error status.
 *
 * @param string $url The URL of the webpage to process.
 * @param boolean $followRedirects If true then this will follow redirects if the server
 *      returns a redirect response code (e.g. 302).
 * @param array $locations An array of locations that have already been visited.  The
 *      keys are URLs and the values are booleans.  This just makes it easier to look
 *      up existence.  This array is used to prevent infinite loops.
 * @return boolean True if the page (or the page it redirected to) was processed
 *      successfully.
 * @throws Exception If there was a problem processing the page: the content type is
 *      not parsable, a redirect was received while $followRedirects is false, a
 *      redirect had no Location header, a redirect loop was detected, or any other
 *      non-200 status (e.g. a 400 error) was returned.  These are common occurrences,
 *      so this exception MUST be caught and handled or you'll regularly get uncaught
 *      exception errors.
 */
public function processPage($url, $followRedirects = false, $locations = array())
{
    error_log('Processing ' . $url);

    require_once 'inc/ProxyClient.php';
    $client = new ProxyClient();
    $client->clear();
    $client->URL = $url;
    $client->noBody = true; // first pass: only find out what type of content it is
    $client->process();

    // The comparisons below stay loose on purpose: ProxyClient is not visible from
    // here, so the status code may be an int or a numeric string.
    $status = $client->status['http_code'];
    $this->addNodeAtAbsoluteUrl($url, array('httpStatus' => $status, 'contentType' => $client->contentType));

    if ($status == 200) {
        if (!preg_match('#xml|html|xhtml#', $client->contentType)) {
            throw new Exception("Failed to process page {$url} because it is not a parsable content type: "
                . $client->contentType);
        }

        // Second pass: the content type is parsable, so fetch the body this time.
        $client->noBody = false;
        $client->process();
        $this->processHtml($client->content, $client->URL);
        return true;
    }

    $isRedirect = ($status >= 300 && $status < 400);
    if (!$isRedirect) {
        throw new Exception("Failed to process page {$url}. Received HTTP response code: " . $status);
    }

    if (!$followRedirects) {
        throw new Exception("Failed to process page " . $url
            . " because the page has moved and followRedirects is set to false in this invocation.");
    }

    // Pull the redirect target out of the first Location header.
    $location = null;
    foreach ($client->headers as $header) {
        if (preg_match('/^Location:(.*)$/i', $header, $matches)) {
            $location = trim($matches[1]);
            break;
        }
    }
    if (!$location) {
        throw new Exception("Received an http status code of " . $status
            . " but no location header was found.");
    }

    // Remember the URL just requested so a chain that comes back to it is rejected.
    $locations[$url] = true;
    if (isset($locations[$location])) {
        throw new Exception("Redirect loop found: " . $location);
    }

    // NOTE(review): the Location value is used verbatim as the next URL; a relative
    // Location is not resolved against $url here -- confirm ProxyClient copes with that.
    return $this->processPage($location, $followRedirects, $locations);
}