Example #1
File: fetcher.php Project: yakar/yioop
 /**
  * Sends the crawl, robot, and index data to the current queue server.
  * If this data is more than post_max_size, it splits it into chunks
  * which are then reassembled by the queue server web app before being
  * put into the appropriate schedule sub-directory.
  *
  * @param string $queue_server url of the current queue server
  * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
  *     INDEX. These give the number of bytes overall for the
  *     'data' field of $post_data and for each of these components.
  * @param array $post_data data to be uploaded to the queue server web app
  */
 function uploadCrawlData($queue_server, $byte_counts, &$post_data)
 {
     $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
     $post_data['byte_counts'] = webencode(serialize($byte_counts));
     $len = strlen($post_data['data']);
     $max_len = $this->post_max_size - 10 * 1024; // non-data post vars < 10K
     $post_data['num_parts'] = ceil($len / $max_len);
     $num_parts = $post_data['num_parts'];
     $data =& $post_data['data'];
     unset($post_data['data']);
     $post_data['hash_data'] = crawlHash($data);
     $offset = 0;
     for ($i = 1; $i <= $num_parts; $i++) {
         $time = time();
         $session = md5($time . AUTH_KEY);
         $post_data['time'] = $time;
         $post_data['session'] = $session;
         $post_data['part'] = substr($data, $offset, $max_len);
         $post_data['hash_part'] = crawlHash($post_data['part']);
         $post_data['current_part'] = $i;
         $offset += $max_len;
         $part_len = strlen($post_data['part']);
         crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
         crawlLog("...sending about {$part_len} bytes.");
         $sleep = false;
         do {
             if ($sleep == true) {
                 crawlLog("Trouble sending to the scheduler at url:");
                 crawlLog($queue_server);
                 crawlLog("Response was:");
                 crawlLog("{$info_string}");
                 $info = @unserialize($info_string);
                 $time = time();
                 $session = md5($time . AUTH_KEY);
                 $post_data['time'] = $time;
                 $post_data['session'] = $session;
                 if (isset($info[self::STATUS]) && $info[self::STATUS] == self::REDO_STATE) {
                     crawlLog("Server requested last item to be re-sent...");
                     if (isset($info[self::SUMMARY])) {
                         crawlLog($info[self::SUMMARY]);
                     }
                     crawlLog("Trying again in 5 seconds...");
                 } else {
                     crawlLog("Trying again in 5 seconds. You might want");
                     crawlLog("to check the queue server url and server");
                     crawlLog("key. Queue Server post_max_size is:" . $this->post_max_size);
                 }
                 if ($i == 1 && !defined('FORCE_SMALL') && $this->post_max_size > 1000000) {
                     /* maybe the server has limited memory and
                        too high a post_max_size
                      */
                     crawlLog("Using smaller post size to see if it helps");
                     define('FORCE_SMALL', true);
                     $this->post_max_size = 1000000;
                     $info[self::POST_MAX_SIZE] = 1000001;
                     // set to a small value before trying again
                 }
                 sleep(5);
             }
             $sleep = true;
             $info_string = FetchUrl::getPage($queue_server, $post_data, true);
             $info = unserialize(trim($info_string));
             if (isset($info[self::LOGGING])) {
                 crawlLog("Messages from Fetch Controller:");
                 crawlLog($info[self::LOGGING]);
             }
             if (isset($info[self::POST_MAX_SIZE]) && $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                 if (!defined('FORCE_SMALL')) {
                     crawlLog("post_max_size has changed was " . "{$this->post_max_size}. Now is " . $info[self::POST_MAX_SIZE] . ".");
                     $this->post_max_size = $info[self::POST_MAX_SIZE];
                 } else {
                     crawlLog("...Using Force Small Rule on Server Posting");
                 }
                 if ($max_len > $this->post_max_size) {
                     crawlLog("Restarting upload...");
                     if (isset($post_data["resized_once"])) {
                         crawlLog("Restart failed");
                         return;
                     }
                     $post_data['data'] = $data;
                     $post_data["resized_once"] = true;
                     return $this->uploadCrawlData($queue_server, $byte_counts, $post_data);
                 }
             }
         } while (!isset($info[self::STATUS]) || $info[self::STATUS] != self::CONTINUE_STATE);
         crawlLog("Queue Server info response code: " . $info[self::STATUS]);
         crawlLog("Queue Server's crawl time is: " . $info[self::CRAWL_TIME]);
         crawlLog("Web Server peak memory usage: " . $info[self::MEMORY_USAGE]);
         crawlLog("This fetcher peak memory usage: " . memory_get_peak_usage());
     }
     crawlLog("Updated Queue Server, sent approximately" . " {$byte_counts['TOTAL']} bytes:");
 }
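A minimal usage sketch, as called from inside a fetcher object. Everything concrete here (the queue server URL, the payload, the component byte counts) is illustrative; only the field names mirror what uploadCrawlData() actually reads, namely $post_data['data'] and the TOTAL/ROBOT/SCHEDULE/INDEX counts.

 $crawl_data = str_repeat("x", 3000000); // stand-in for serialized crawl data
 $byte_counts = array(
     "TOTAL" => strlen($crawl_data),
     "ROBOT" => 1000000,    // illustrative component byte counts
     "SCHEDULE" => 1000000,
     "INDEX" => 1000000,
 );
 $post_data = array('data' => $crawl_data);
 // with post_max_size near 2MB this payload would go up in two parts
 $this->uploadCrawlData("http://somehost/yioop/", $byte_counts, $post_data);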
Example #2
File: mirror.php Project: yakar/yioop
 /**
  * Downloads the next file from the schedule of files to download received
  * from the web app.
  */
 function copyNextSyncFile()
 {
     $dir = $this->sync_dir;
     $name_server = $this->name_server;
     $time = time();
     $session = md5($time . AUTH_KEY);
     if (count($this->sync_schedule) <= 0) {
         return;
     }
     $file = array_pop($this->sync_schedule);
     crawlLog("Start syncing {$file['name']}..");
     if ($file['is_dir']) {
         if (!file_exists("{$dir}/{$file['name']}")) {
             mkdir("{$dir}/{$file['name']}");
             crawlLog(".. {$file['name']} directory created.");
         } else {
             crawlLog(".. {$file['name']} directory exists.");
         }
     } else {
         $request = "{$name_server}?c=resource&a=get&time={$time}&session={$session}" . "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync . "&f=cache&n=" . urlencode($file["name"]);
         if ($file["size"] < self::DOWNLOAD_RANGE) {
             $data = FetchUrl::getPage($request, NULL, true);
             if ($file["size"] != strlen($data)) {
                 array_push($this->sync_schedule, $file);
                 crawlLog(".. {$file['name']} error downloading, retrying.");
                 return;
             }
             file_put_contents("{$dir}/{$file['name']}", $data);
             crawlLog(".. {$file['name']} file copied.");
         } else {
             $offset = 0;
             $fh = fopen("{$dir}/{$file['name']}", "wb");
             $request .= "&l=" . self::DOWNLOAD_RANGE;
             while ($offset < $file['size']) {
                 $data = FetchUrl::getPage($request . "&o={$offset}", NULL, true);
                 $old_offset = $offset;
                 $offset += self::DOWNLOAD_RANGE;
                 $end_point = min($offset, $file["size"]);
                 //crude check if we need to redownload segment
                 if (strlen($data) != $end_point - $old_offset) {
                     $offset = $old_offset;
                     crawlLog(".. Download error re-requesting segment");
                     continue;
                 }
                 fwrite($fh, $data);
                 crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " . "to {$end_point}..");
             }
             crawlLog(".. {$file['name']} file copied.");
             fclose($fh);
         }
     }
 }
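The ranged branch above is easier to follow in isolation. This sketch reproduces just its offset arithmetic with made-up numbers (a 2.5 MB file and a 1 MB range standing in for self::DOWNLOAD_RANGE), showing which byte span each o=/l= request covers and how many bytes the length check expects back:

 $size = 2500000;
 $range = 1000000;
 for ($offset = 0; $offset < $size; $offset += $range) {
     $end_point = min($offset + $range, $size);
     // the last segment is short, so expect $end_point - $offset bytes,
     // not a full $range; a length mismatch means re-request this segment
     echo "o={$offset}&l={$range} expects " . ($end_point - $offset) . " bytes\n";
 }
 // prints spans 0-1000000, 1000000-2000000, 2000000-2500000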
Example #3
 /**
  * Gets the cached version of a web page from the machine on which it was
  * fetched.
  *
  * Complete cached versions of web pages typically only live on a fetcher
  * machine. The queue server machine typically only maintains summaries.
  * This method makes a REST request of a fetcher machine for a cached page
  * and gets the results back.
  *
  * @param string $machine the IP address or domain name of the machine the
  *     cached page lives on
  * @param string $machine_uri the path from document root on $machine where
  *     the yioop scripts live
  * @param int $partition the partition in the WebArchiveBundle the page is
  *      in
  * @param int $offset the offset in bytes into the WebArchive partition in
  *     the WebArchiveBundle at which the cached page lives.
  * @param string $crawl_time the timestamp of the crawl the cache page is
  *     from
  * @param int $instance_num which fetcher instance on the machine crawled
  *     the page (if more than one), false otherwise
  * @return array page data of the cached page
  */
 function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false)
 {
     $time = time();
     $session = md5($time . AUTH_KEY);
     if ($machine == '::1') {
         //IPv6 :(
         $machine = "[::1]";
         //used if the fetching and queue serving were on the same machine
     }
     // we assume all machines use the same scheme & port of the name server
     $port = UrlParser::getPort(NAME_SERVER);
     $scheme = UrlParser::getScheme(NAME_SERVER);
     $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" . "time={$time}&session={$session}&partition={$partition}&offset={$offset}" . "&crawl_time={$crawl_time}";
     if ($instance_num !== false) {
         $request .= "&instance_num={$instance_num}";
     }
     $tmp = FetchUrl::getPage($request);
     $page = @unserialize(base64_decode($tmp));
     $page['REQUEST'] = $request;
     return $page;
 }
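A hypothetical call, with every concrete value (host, path, partition, offset, crawl timestamp, instance number) invented for illustration. The 'REQUEST' field is the one the method itself adds to the returned page data:

 $page = $this->getCacheFile("10.0.0.5", "/yioop/", 2, 123456, "1334992555", 0);
 crawlLog("Cache page was fetched via: " . $page['REQUEST']);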
Example #4
 /**
  * Used to start or stop a queue_server, fetcher, or mirror instance on
  * a machine managed by the current one
  *
  * @param string $machine_name name of machine
  * @param string $action "start" or "stop"
  * @param int $fetcher_num if the action is for a fetcher, this value is not
  *      NULL and indicates which fetcher.
  * @param bool $is_mirror whether the requested machine is a mirror of
  *      another machine. (If $fetcher_num is NULL and this is false,
  *      then message is for a queue server)
  *
  */
 function update($machine_name, $action, $fetcher_num = NULL, $is_mirror = false)
 {
     $db = $this->db;
     $value = $action == "start" ? "true" : "false";
     $time = time();
     $session = md5($time . AUTH_KEY);
     $sql = "SELECT URL FROM MACHINE WHERE NAME=?";
     $result = $db->execute($sql, array($machine_name));
     $row = $db->fetchArray($result);
     if ($row) {
         $url = $row["URL"] . "?c=machine&a=update&time={$time}" . "&session={$session}";
         if ($fetcher_num !== NULL) {
             $url .= "&fetcher[{$fetcher_num}]={$value}";
             $sql = "DELETE FROM ACTIVE_FETCHER WHERE NAME=? AND\n                    FETCHER_ID=?";
             $db->execute($sql, array($machine_name, $fetcher_num));
             if ($action == "start") {
                 $sql = "INSERT INTO ACTIVE_FETCHER VALUES (?, ?)";
             }
             $db->execute($sql, array($machine_name, $fetcher_num));
         } else {
             if ($is_mirror) {
                 $url .= "&mirror={$value}";
             } else {
                 $url .= "&queue_server={$value}";
             }
         }
         echo FetchUrl::getPage($url);
     }
 }
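Sample calls, assuming machines named "worker1" and "backup1" have already been registered in the MACHINE table (both names are invented):

 $this->update("worker1", "start", 2);          // start fetcher #2 on worker1
 $this->update("worker1", "stop");              // stop worker1's queue server
 $this->update("backup1", "start", NULL, true); // start backup1 as a mirror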
Example #5
 /**
  * Used to check if there are any mirrors of the current server.
  * If so, it tries to distribute the query requests randomly amongst
  * the mirrors.
  * @return bool whether or not a mirror of the current site handled it
  */
 function mirrorHandle()
 {
     $mirror_table_name = CRAWL_DIR . "/" . self::mirror_table_name;
     $handled = false;
     if (file_exists($mirror_table_name)) {
         $mirror_table = unserialize(file_get_contents($mirror_table_name));
         $mirrors = array();
         $time = time();
         foreach ($mirror_table['machines'] as $entry) {
             if ($time - $entry[3] < 2 * MIRROR_NOTIFY_FREQUENCY) {
                 if ($entry[0] == "::1") {
                     $entry[0] = "[::1]";
                 }
                 $request = "http://" . $entry[0] . $entry[1];
                 $mirrors[] = $request;
             }
         }
         $count = count($mirrors);
         if ($count > 0) {
             mt_srand();
             $rand = mt_rand(0, $count);
             // if ==$count, we'll let the current machine handle it
             if ($rand < $count) {
                 $request = $mirrors[$rand] . "?" . $_SERVER["QUERY_STRING"];
                 echo FetchUrl::getPage($request);
                 $handled = true;
             }
         }
     }
     return $handled;
 }
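A typical gate at the top of a request handler; processLocally() is a hypothetical stand-in for whatever the current machine would otherwise do with the query:

 if (!$this->mirrorHandle()) {
     $this->processLocally(); // no mirror took the query; answer it here
 }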