Esempio n. 1
0
 /**
  * Gets the cached version of a web page from the machine on which it was
  * fetched.
  *
  * Complete cached versions of web pages typically only live on a fetcher
  * machine. The queue server machine typically only maintains summaries.
  * This method makes a REST request of a fetcher machine for a cached page
  * and gets the results back. The response is expected to be a base64
  * encoded, serialized PHP array.
  *
  * @param string $machine the ip address or domain name of the machine the
  *     cached page lives on
  * @param string $machine_uri the path from document root on $machine where
  *     the yioop scripts live
  * @param int $partition the partition in the WebArchiveBundle the page is
  *      in
  * @param int $offset the offset in bytes into the WebArchive partition in
  *     the WebArchiveBundle at which the cached page lives.
  * @param string $crawl_time the timestamp of the crawl the cache page is
  *     from
  * @param int|false $instance_num which fetcher instance for the particular
  *     fetcher crawled the page (if more than one), false otherwise
  * @return array page data of the cached page, with the url that was
  *     requested stored under the key 'REQUEST'. If the response could not
  *     be fetched or decoded into an array, the returned array contains
  *     only the 'REQUEST' entry.
  */
 function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false)
 {
     $time = time();
     // token sent with the request: md5 of the request time and AUTH_KEY
     $session = md5($time . AUTH_KEY);
     if ($machine == '::1') {
         //IPv6 :(
         $machine = "[::1]";
         //used if the fetching and queue serving were on the same machine
     }
     // we assume all machines use the same scheme & port of the name server
     $port = UrlParser::getPort(NAME_SERVER);
     $scheme = UrlParser::getScheme(NAME_SERVER);
     $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" . "time={$time}&session={$session}&partition={$partition}&offset={$offset}" . "&crawl_time={$crawl_time}";
     if ($instance_num !== false) {
         $request .= "&instance_num={$instance_num}";
     }
     $tmp = FetchUrl::getPage($request);
     $page = false;
     if (is_string($tmp) && $tmp !== '') {
         // NOTE(review): unserialize() on data received from another machine
         // can instantiate arbitrary classes; if page data never contains
         // objects, consider passing ['allowed_classes' => false].
         // @ is kept deliberately: a malformed payload is treated as
         // "no page data" rather than surfacing a notice to the caller.
         $page = @unserialize(base64_decode($tmp));
     }
     if (!is_array($page)) {
         // failed fetch or undecodable/non-array payload: fall back to an
         // empty array so adding 'REQUEST' below never writes an offset on
         // false, a scalar, or an object
         $page = [];
     }
     $page['REQUEST'] = $request;
     return $page;
 }