Example #1
0
 /**
  * Checks that a ProxyClient with $outputFile set writes the response to that
  * file instead of keeping it in $client->content.
  *
  * The test assumes the output file starts with header lines terminated by a
  * blank line, followed by the body.  Both the expected fixture and the body
  * read back from the file are round-tripped through DOMDocument so that the
  * comparison is made on normalized HTML.
  */
 function testRequestToOutputFile()
 {
     $client = new ProxyClient();
     $client->URL = df_absolute_url('tests/test_ProxyClient/test1.html');
     $outputFile = tempnam(sys_get_temp_dir(), 'test_ProxyClient');
     $client->outputFile = $outputFile;
     $client->process();

     // Read the body back from the output file.
     $fh = fopen($outputFile, 'r');
     $this->assertTrue($fh !== false, 'Could not open the output file for reading.');
     while (!feof($fh) and trim($line = fgets($fh, 1024))) {
         // We skip the headers: stop at the first blank line.
     }
     // Everything after the header section is the body.
     $actual = stream_get_contents($fh);
     fclose($fh);
     // Remove the temp file before asserting so a failed assertion does not leak it.
     unlink($outputFile);

     $this->assertEquals(null, $client->content, 'Content should be written to output file, not saved to variable.');

     // Normalize the expected fixture through DOMDocument.
     $expected = file_get_contents('tests/test_ProxyClient/test1.html');
     $doc = new DOMDocument();
     // '@' because loadHTML emits warnings on imperfect markup.
     @$doc->loadHtml($expected);
     $expected = $doc->saveHtml();
     unset($doc);

     // Normalize the actual body the same way.
     $doc = new DOMDocument();
     @$doc->loadHtml($actual);
     $actual = $doc->saveHtml();
     unset($doc);

     $this->assertEquals($expected, $actual);
 }
Example #2
0
    if (!isset($images[$key])) {
        die("Could not find image");
    }
    $url = $images[$key];
} else {
    if ($_GET['page']) {
        $key = $_GET['page'];
        if (!isset($pages[$key])) {
            die("Could not find page");
        }
        $url = $pages[$key];
    }
}
if ($url) {
    //$url = 'http://ca2.php.net/images/php.gif';
    $client = new ProxyClient();
    $client->URL = $url;
    $client->flushableContentTypeRegex = '#html|css#';
    $client->afterFlushCallback = 'flushCallback';
    $client->process();
    foreach ($client->headers as $h) {
        header($h);
    }
    echo $client->content;
} else {
    $html = '<div style="overflow:scroll">';
    foreach ($images as $img => $imgurl) {
        $html .= '<img src="?image=' . urlencode($img) . '"/>';
    }
    $html .= '</div>';
    foreach ($pages as $pg => $pgurl) {
Example #3
0
 /**
  * @brief Processes a resource at a specified URL.  This will first check and see if the 
  * resource has already been loaded and just return the hash to that resource.  If not
  * it will perform an HTTP request to load the resource, save it with saveResource(),
  * and return the resulting hash.  Resources whose content type matches "css" are
  * passed through processCss() before being saved.
  *
  * @param string $url The absolute URL of the resource to process.
  * @param boolean $followRedirects Whether to follow HTTP redirects if the resource has moved.
  * @param string[] $locations Array of locations that have already been followed.  The keys
  *	are URLs and the values are true.  This helps prevent redirect loops.
  * @return string MD5 hash of the absolute URL.  This can be used to look up the resource later
  *	with loadResource().  When a redirect is followed, the hash returned is that of the
  *	redirect target, because the resource is saved under the target URL.
  * @throws Exception If the response is a redirect and $followRedirects is false, if a
  *	redirect has no Location header, if a redirect loop is detected, or if any other
  *	non-200 status code is received.
  */
 public function processResource($url, $followRedirects = false, $locations = array())
 {
     // First check to see if it is already loaded.
     $hash = md5($url);
     if ($this->loadResource($hash)) {
         return $hash;
     }

     require_once 'inc/ProxyClient.php';
     $client = new ProxyClient();
     $client->clear();
     $client->URL = $url;
     // The body is required here because it is what gets saved.
     $client->noBody = false;
     $client->process();

     $httpCode = $client->status['http_code'];

     if ($httpCode == 200) {
         // We have success
         if (preg_match('#css#', $client->contentType)) {
             $client->content = $this->processCss($client->content, $client->URL);
         }
         $this->saveResource($url, $client->content, $client->contentType);
         return $hash;
     }

     if ($httpCode < 300 or $httpCode >= 400) {
         // Neither success nor a redirect.
         throw new Exception("Failed to process page {$url}.  Received HTTP response code: " . $httpCode);
     }

     // We got a redirect status code
     if (!$followRedirects) {
         throw new Exception("Failed to process page " . $url . " because the page has moved and followRedirects is set to false in this invocation.");
     }

     $location = null;
     foreach ($client->headers as $h) {
         if (preg_match('/^Location:(.*)$/i', $h, $matches)) {
             $location = trim($matches[1]);
             break;
         }
     }
     if (!$location) {
         throw new Exception("Received an http status code of " . $httpCode . " but no location header was found.");
     }

     // Remember the URL we just visited so a redirect back to it is detected.
     $locations[$url] = true;
     if (isset($locations[$location])) {
         throw new Exception("Redirect loop found: " . $location);
     }

     // NOTE(review): the Location value is used as-is; a relative Location
     // header would not be resolved against $url -- confirm this cannot occur.
     return $this->processResource($location, $followRedirects, $locations);
 }
Example #4
0
 /**
  * @brief Loads a webpage and processes it.   This will glean the content-type,
  * response code, and possibly the page content of the webpage and attach
  * this information to a node and add the node to the tree.
  *
  * The page is requested twice when it turns out to be parsable: first with
  * noBody enabled to learn the status code and content type, then again with
  * noBody disabled to obtain the content that is handed to processHtml().
  *
  * @param string $url The URL of the webpage to process.
  * @param boolean $followRedirects If true then this will follow redirects if the server
  * 	returns a redirect response code (e.g. 302)
  * @param array $locations An array of locations that have already been visited.  The 
  * 	keys are URLs and the values are booleans.  This just makes it easier to look
  * 	up existence.  This array is used to prevent infinite loops.
  * @return boolean True if the page was processed successfully and the node added.
  * @throws Exception if there was a problem processing the page. This will include
  * if a 400 error is returned or some other common occurrence so this exception MUST
  * be caught and handled or you'll regularly get uncaught exception errors.
  */
 public function processPage($url, $followRedirects = false, $locations = array())
 {
     error_log('Processing ' . $url);
     require_once 'inc/ProxyClient.php';
     $client = new ProxyClient();
     $client->clear();
     $client->URL = $url;
     $client->noBody = true;
     // only find out what type of content it is
     $client->process();

     $httpCode = $client->status['http_code'];
     // The node is recorded for every response, including redirects and errors.
     $this->addNodeAtAbsoluteUrl($url, array('httpStatus' => $httpCode, 'contentType' => $client->contentType));

     if ($httpCode == 200) {
         // We have success
         if (!preg_match('#xml|html|xhtml#', $client->contentType)) {
             throw new Exception("Failed to process page {$url} because it is not a parsable content type: " . $client->contentType);
         }
         // Request the page again, this time with the body, so it can be parsed.
         $client->noBody = false;
         $client->process();
         $this->processHtml($client->content, $client->URL);
         return true;
     }

     if ($httpCode < 300 or $httpCode >= 400) {
         // Neither success nor a redirect.
         throw new Exception("Failed to process page {$url}.  Received HTTP response code: " . $httpCode);
     }

     // We got a redirect status code
     if (!$followRedirects) {
         throw new Exception("Failed to process page " . $url . " because the page has moved and followRedirects is set to false in this invocation.");
     }

     $location = null;
     foreach ($client->headers as $h) {
         if (preg_match('/^Location:(.*)$/i', $h, $matches)) {
             $location = trim($matches[1]);
             break;
         }
     }
     if (!$location) {
         throw new Exception("Received an http status code of " . $httpCode . " but no location header was found.");
     }

     // Remember the URL we just visited so a redirect back to it is detected.
     $locations[$url] = true;
     if (isset($locations[$location])) {
         throw new Exception("Redirect loop found: " . $location);
     }

     return $this->processPage($location, $followRedirects, $locations);
 }
Example #5
0
 /**
  * @brief Builds a @ref ProxyClient object to load a page from the source site.
  * The @ref ProxyClient object is returned.
  *
  * The client is given this object's SERVER, REQUEST, GET, POST and COOKIE
  * arrays, an X-Forwarded-For header extended with the remote address, a
  * Referer translated back through the proxy writer's unproxifyUrl(), and
  * language headers set to the site's destination language.  Details of the
  * request and of the response are copied onto $this->logger.
  *
  * @returns ProxyClient The proxy client that has loaded the source page
  * @throws Exception If REQUEST_METHOD is missing from the SERVER array.
  */
 public function getSourcePage()
 {
     require_once 'inc/ProxyClient.php';
     $client = new ProxyClient();
     $proxyWriter = $this->site->getProxyWriter();

     // Append the remote address to any existing X-Forwarded-For chain.
     if (!empty($client->REQUEST_HEADERS['X-Forwarded-For'])) {
         $forwardedFor = $client->REQUEST_HEADERS['X-Forwarded-For'] . ', ' . $_SERVER['REMOTE_ADDR'];
     } else {
         $forwardedFor = $_SERVER['REMOTE_ADDR'];
     }
     $client->REQUEST_HEADERS['X-Forwarded-For'] = $forwardedFor;

     // The incoming Referer is a proxified URL; translate it back before forwarding.
     if (!empty($client->REQUEST_HEADERS['Referer'])) {
         $client->REQUEST_HEADERS['Referer'] = $proxyWriter->unproxifyUrl($client->REQUEST_HEADERS['Referer']);
     }

     $destinationLanguage = $this->site->getDestinationLanguage();
     $client->REQUEST_HEADERS['X-SWeTE-Language'] = $destinationLanguage;
     $client->REQUEST_HEADERS['Accept-Language'] = $destinationLanguage;

     $client->SERVER = $this->SERVER;
     $client->REQUEST = $this->REQUEST;
     $client->GET = $this->GET;
     $client->POST = $this->POST;
     $client->COOKIE = $this->COOKIE;
     $client->URL = $proxyWriter->unproxifyUrl($this->URL);

     // Fail loudly rather than dumping the server variables into the response.
     if (!isset($client->SERVER['REQUEST_METHOD'])) {
         throw new Exception("Cannot load the source page because REQUEST_METHOD is not set in the SERVER array.");
     }

     $logger = $this->logger;
     $logger->requestMethod = $client->SERVER['REQUEST_METHOD'];
     $logger->requestUrl = $client->URL;
     $logger->requestPostVars = serialize($client->POST);

     $this->mark('About to process page request.');
     $client->process();
     $this->mark('Page request processed');

     $logger->responseHeaders = serialize($client->headers);
     $logger->responseBody = $client->content;
     $logger->responseContentType = $client->contentType;
     $logger->responseStatusCode = $client->status['http_code'];
     return $client;
 }