/**
 * Sends the crawl, robot, and index data to the current queue server.
 * If this data is more than post_max_size, it splits it into chunks
 * which are then reassembled by the queue server web app before being
 * put into the appropriate schedule sub-directory.
 *
 * @param string $queue_server url of the current queue server
 * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
 *     INDEX. These give the number of bytes overall for the
 *     'data' field of $post_data and for each of these components.
 * @param array $post_data data to be uploaded to the queue server web app
 */
function uploadCrawlData($queue_server, $byte_counts, &$post_data)
{
    $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
    $post_data['byte_counts'] = webencode(serialize($byte_counts));
    $len = strlen($post_data['data']);
    $max_len = $this->post_max_size - 10 * 1024; // non-data post vars < 10K
    $post_data['num_parts'] = ceil($len / $max_len);
    $num_parts = $post_data['num_parts'];
    $data =& $post_data['data'];
    unset($post_data['data']);
    $post_data['hash_data'] = crawlHash($data);
    $offset = 0;
    for ($i = 1; $i <= $num_parts; $i++) {
        $time = time();
        $session = md5($time . AUTH_KEY);
        $post_data['time'] = $time;
        $post_data['session'] = $session;
        $post_data['part'] = substr($data, $offset, $max_len);
        $post_data['hash_part'] = crawlHash($post_data['part']);
        $post_data['current_part'] = $i;
        $offset += $max_len;
        $part_len = strlen($post_data['part']);
        crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
        crawlLog("...sending about {$part_len} bytes.");
        $sleep = false;
        do {
            if ($sleep == true) {
                crawlLog("Trouble sending to the scheduler at url:");
                crawlLog($queue_server);
                crawlLog("Response was:");
                crawlLog("{$info_string}");
                $info = @unserialize($info_string);
                $time = time();
                $session = md5($time . AUTH_KEY);
                $post_data['time'] = $time;
                $post_data['session'] = $session;
                if (isset($info[self::STATUS]) &&
                    $info[self::STATUS] == self::REDO_STATE) {
                    crawlLog("Server requested last item to be re-sent...");
                    if (isset($info[self::SUMMARY])) {
                        crawlLog($info[self::SUMMARY]);
                    }
                    crawlLog("Trying again in 5 seconds...");
                } else {
                    crawlLog("Trying again in 5 seconds. You might want");
                    crawlLog("to check the queue server url and server");
                    crawlLog("key. Queue Server post_max_size is: " .
                        $this->post_max_size);
                }
                if ($i == 1 && !defined('FORCE_SMALL') &&
                    $this->post_max_size > 1000000) {
                    /* maybe the server has limited memory and too high a
                       post_max_size */
                    crawlLog("Using smaller post size to see if it helps");
                    define('FORCE_SMALL', true);
                    $this->post_max_size = 1000000;
                    $info[self::POST_MAX_SIZE] = 1000001;
                    /* set to a small value before trying again */
                }
                sleep(5);
            }
            $sleep = true;
            $info_string = FetchUrl::getPage($queue_server, $post_data, true);
            $info = unserialize(trim($info_string));
            if (isset($info[self::LOGGING])) {
                crawlLog("Messages from Fetch Controller:");
                crawlLog($info[self::LOGGING]);
            }
            if (isset($info[self::POST_MAX_SIZE]) &&
                $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                if (!defined('FORCE_SMALL')) {
                    crawlLog("post_max_size has changed. Was " .
                        "{$this->post_max_size}. Now is " .
                        $info[self::POST_MAX_SIZE] . ".");
                    $this->post_max_size = $info[self::POST_MAX_SIZE];
                } else {
                    crawlLog("...Using Force Small Rule on Server Posting");
                }
                if ($max_len > $this->post_max_size) {
                    crawlLog("Restarting upload...");
                    if (isset($post_data["resized_once"])) {
                        crawlLog("Restart failed");
                        return;
                    }
                    $post_data['data'] = $data;
                    $post_data["resized_once"] = true;
                    return $this->uploadCrawlData($queue_server, $byte_counts,
                        $post_data);
                }
            }
        } while (!isset($info[self::STATUS]) ||
            $info[self::STATUS] != self::CONTINUE_STATE);
        crawlLog("Queue Server info response code: " . $info[self::STATUS]);
        crawlLog("Queue Server's crawl time is: " . $info[self::CRAWL_TIME]);
        crawlLog("Web Server peak memory usage: " .
            $info[self::MEMORY_USAGE]);
        crawlLog("This fetcher peak memory usage: " .
            memory_get_peak_usage());
    }
    crawlLog("Updated Queue Server, sent approximately" .
        " {$byte_counts['TOTAL']} bytes:");
}
/**
 * Downloads the next file from the schedule of files to download received
 * from the web app.
 */
function copyNextSyncFile()
{
    $dir = $this->sync_dir;
    $name_server = $this->name_server;
    $time = time();
    $session = md5($time . AUTH_KEY);
    if (count($this->sync_schedule) <= 0) {
        return;
    }
    $file = array_pop($this->sync_schedule);
    crawlLog("Start syncing {$file['name']}..");
    if ($file['is_dir']) {
        if (!file_exists("{$dir}/{$file['name']}")) {
            mkdir("{$dir}/{$file['name']}");
            crawlLog(".. {$file['name']} directory created.");
        } else {
            crawlLog(".. {$file['name']} directory exists.");
        }
    } else {
        $request = "{$name_server}?c=resource&a=get&time={$time}" .
            "&session={$session}&robot_instance=" . ROBOT_INSTANCE .
            "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync .
            "&f=cache&n=" . urlencode($file["name"]);
        if ($file["size"] < self::DOWNLOAD_RANGE) {
            $data = FetchUrl::getPage($request, NULL, true);
            if ($file["size"] != strlen($data)) {
                array_push($this->sync_schedule, $file);
                crawlLog(".. {$file['name']} error downloading, retrying.");
                return;
            }
            file_put_contents("{$dir}/{$file['name']}", $data);
            crawlLog(".. {$file['name']} file copied.");
        } else {
            $offset = 0;
            $fh = fopen("{$dir}/{$file['name']}", "wb");
            $request .= "&l=" . self::DOWNLOAD_RANGE;
            while ($offset < $file['size']) {
                $data = FetchUrl::getPage($request . "&o={$offset}",
                    NULL, true);
                $old_offset = $offset;
                $offset += self::DOWNLOAD_RANGE;
                $end_point = min($offset, $file["size"]);
                // crude check of whether we need to re-download the segment
                if (strlen($data) != $end_point - $old_offset) {
                    $offset = $old_offset;
                    crawlLog(".. Download error re-requesting segment");
                    continue;
                }
                fwrite($fh, $data);
                crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " .
                    "to {$end_point}..");
            }
            crawlLog(".. {$file['name']} file copied.");
            fclose($fh);
        }
    }
}
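/* Illustration only: the ranged-download pattern copyNextSyncFile() uses for
 * files larger than DOWNLOAD_RANGE, pulled out into a standalone sketch. The
 * $fetch callback stands in for FetchUrl::getPage(), and the 100000 byte
 * default range is an arbitrary example value, not Yioop's constant.
 */
function rangedDownload($fetch, $dest, $size, $range = 100000)
{
    $fh = fopen($dest, "wb");
    $offset = 0;
    while ($offset < $size) {
        $data = $fetch($offset, $range); // bytes [$offset, $offset + $range)
        $expected = min($offset + $range, $size) - $offset;
        if (strlen($data) != $expected) {
            continue; // length mismatch: re-request this segment, as above
        }
        fwrite($fh, $data);
        $offset += $range;
    }
    fclose($fh);
}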
/**
 * Gets the cached version of a web page from the machine on which it was
 * fetched.
 *
 * Complete cached versions of web pages typically only live on a fetcher
 * machine. The queue server machine typically only maintains summaries.
 * This method makes a REST request of a fetcher machine for a cached page
 * and gets the results back.
 *
 * @param string $machine the ip address or domain name of the machine the
 *     cached page lives on
 * @param string $machine_uri the path from document root on $machine where
 *     the yioop scripts live
 * @param int $partition the partition in the WebArchiveBundle the page is
 *     in
 * @param int $offset the offset in bytes into the WebArchive partition in
 *     the WebArchiveBundle at which the cached page lives
 * @param string $crawl_time the timestamp of the crawl the cached page is
 *     from
 * @param int $instance_num which fetcher instance for the particular
 *     fetcher crawled the page (if more than one), false otherwise
 * @return array page data of the cached page
 */
function getCacheFile($machine, $machine_uri, $partition, $offset,
    $crawl_time, $instance_num = false)
{
    $time = time();
    $session = md5($time . AUTH_KEY);
    if ($machine == '::1') { // IPv6 :(
        $machine = "[::1]";
        // used if the fetching and queue serving were on the same machine
    }
    // we assume all machines use the same scheme & port as the name server
    $port = UrlParser::getPort(NAME_SERVER);
    $scheme = UrlParser::getScheme(NAME_SERVER);
    $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive" .
        "&a=cache&time={$time}&session={$session}&partition={$partition}" .
        "&offset={$offset}&crawl_time={$crawl_time}";
    if ($instance_num !== false) {
        $request .= "&instance_num={$instance_num}";
    }
    $tmp = FetchUrl::getPage($request);
    $page = @unserialize(base64_decode($tmp));
    $page['REQUEST'] = $request;
    return $page;
}
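/* Illustration only: the time-based session token that getCacheFile() and the
 * other machine-to-machine requests in this file attach to their URLs. The
 * checkSession() helper and its 600 second acceptance window are assumptions
 * made for this sketch; Yioop's actual check lives in the receiving
 * controller.
 */
function checkSession($time, $session, $auth_key, $max_age = 600)
{
    // the sender computed md5($time . AUTH_KEY); both sides share the key
    return abs(time() - $time) < $max_age &&
        $session == md5($time . $auth_key);
}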
/**
 * Used to start or stop a queue_server, fetcher, or mirror instance on
 * a machine managed by the current one
 *
 * @param string $machine_name name of machine
 * @param string $action "start" or "stop"
 * @param int $fetcher_num if the action is for a fetcher, this value is not
 *     NULL and indicates which fetcher
 * @param bool $is_mirror whether the requested machine is a mirror of
 *     another machine. (If $fetcher_num is NULL and this is false,
 *     then the message is for a queue server)
 */
function update($machine_name, $action, $fetcher_num = NULL,
    $is_mirror = false)
{
    $db = $this->db;
    $value = ($action == "start") ? "true" : "false";
    $time = time();
    $session = md5($time . AUTH_KEY);
    $sql = "SELECT URL FROM MACHINE WHERE NAME=?";
    $result = $db->execute($sql, array($machine_name));
    $row = $db->fetchArray($result);
    if ($row) {
        $url = $row["URL"] . "?c=machine&a=update&time={$time}" .
            "&session={$session}";
        if ($fetcher_num !== NULL) {
            $url .= "&fetcher[{$fetcher_num}]={$value}";
            // remove any stale record for this fetcher, then re-add it
            // only if we are starting it
            $sql = "DELETE FROM ACTIVE_FETCHER WHERE NAME=? AND " .
                "FETCHER_ID=?";
            $db->execute($sql, array($machine_name, $fetcher_num));
            if ($action == "start") {
                $sql = "INSERT INTO ACTIVE_FETCHER VALUES (?, ?)";
                $db->execute($sql, array($machine_name, $fetcher_num));
            }
        } else if ($is_mirror) {
            $url .= "&mirror={$value}";
        } else {
            $url .= "&queue_server={$value}";
        }
        echo FetchUrl::getPage($url);
    }
}
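/* Illustration only: how update() above might be invoked. The $machine_model
 * variable and the machine names are made up for the example.
 */
$machine_model->update("crawler-1", "start", 0);        // start fetcher 0
$machine_model->update("crawler-1", "start");           // start its queue server
$machine_model->update("mirror-1", "stop", NULL, true); // stop a mirror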
/**
 * Used to check if there are any mirrors of the current server.
 * If so, it tries to distribute the query requests randomly amongst
 * the mirrors
 *
 * @return bool whether or not a mirror of the current site handled it
 */
function mirrorHandle()
{
    $mirror_table_name = CRAWL_DIR . "/" . self::mirror_table_name;
    $handled = false;
    if (file_exists($mirror_table_name)) {
        $mirror_table = unserialize(file_get_contents($mirror_table_name));
        $mirrors = array();
        $time = time();
        foreach ($mirror_table['machines'] as $entry) {
            if ($time - $entry[3] < 2 * MIRROR_NOTIFY_FREQUENCY) {
                if ($entry[0] == "::1") {
                    $entry[0] = "[::1]";
                }
                $request = "http://" . $entry[0] . $entry[1];
                $mirrors[] = $request;
            }
        }
        $count = count($mirrors);
        if ($count > 0) {
            mt_srand();
            $rand = mt_rand(0, $count);
            // if == $count, we'll let the current machine handle it
            if ($rand < $count) {
                $request = $mirrors[$rand] . "?" . $_SERVER["QUERY_STRING"];
                echo FetchUrl::getPage($request);
                $handled = true;
            }
        }
    }
    return $handled;
}
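/* Illustration only: how a front-end script might use mirrorHandle() to
 * offload a query before doing any local work. The $controller variable and
 * processQuery() method are stand-ins for this sketch, not Yioop's actual
 * control flow.
 */
if ($controller->mirrorHandle()) {
    return; // a chosen mirror already echoed the response for this query
}
$controller->processQuery($_REQUEST);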