/**
 * Checks for the crawl time according either to crawl_status.txt or to
 * network_status.txt, and presents it to the requesting fetcher, along
 * with a list of available queue servers.
 *
 * Side work done on each request:
 *  - if more than self::CRON_INTERVAL seconds have passed since the
 *    "fetcher_restart" cron time, the cron time is updated and
 *    doCronTasks() is run;
 *  - if name_server_messages.txt exists and says STOP_CRAWL, the
 *    fetcher is told to stop (status STOP_CRAWL, crawl time 0);
 *  - otherwise, if the crawl time differs from the one the fetcher sent
 *    in $_REQUEST['crawl_time'], the crawl parameters (and any active
 *    classifier/ranker data) are added to the response.
 *
 * The response is the serialized $info array, passed to the "fetch" view
 * as $data['MESSAGE'].
 */
function crawlTime()
{
    $info = array();
    $info[self::STATUS] = self::CONTINUE_STATE;
    $view = "fetch";
    $cron_model = $this->model("cron");

    // Crawl time the fetcher currently believes is active (0 if none sent)
    if (isset($_REQUEST['crawl_time'])) {
        $prev_crawl_time = substr(
            $this->clean($_REQUEST['crawl_time'], 'int'),
            0,
            TIMESTAMP_LEN
        );
    } else {
        $prev_crawl_time = 0;
    }

    $cron_time = $cron_model->getCronTime("fetcher_restart");
    $delta = time() - $cron_time;
    if ($delta > self::CRON_INTERVAL) {
        $cron_model->updateCronTime("fetcher_restart");
        $this->doCronTasks();
    } elseif ($delta == 0) {
        $cron_model->updateCronTime("fetcher_restart");
    }

    // The local crawl status file takes precedence over the network one
    $local_filename = CRAWL_DIR . "/schedules/crawl_status.txt";
    $network_filename = CRAWL_DIR . "/schedules/network_status.txt";
    if (file_exists($local_filename)) {
        $crawl_status = unserialize(file_get_contents($local_filename));
        $crawl_time = isset($crawl_status["CRAWL_TIME"])
            ? $crawl_status["CRAWL_TIME"]
            : 0;
    } elseif (file_exists($network_filename)) {
        $crawl_time = unserialize(file_get_contents($network_filename));
    } else {
        $crawl_time = 0;
    }
    $info[self::CRAWL_TIME] = $crawl_time;

    $status_filename = CRAWL_DIR . "/schedules/name_server_messages.txt";
    if ($crawl_time != 0 && file_exists($status_filename)) {
        $status = unserialize(file_get_contents($status_filename));
        /* unserialize() returns false on a corrupt/partially written file
           and the STATUS key might be absent; treat both as "no status"
           rather than indexing into a non-array.
         */
        $status_state = (is_array($status) && isset($status[self::STATUS]))
            ? $status[self::STATUS]
            : null;

        if ($status_state == 'STOP_CRAWL') {
            /* This was previously written with == (a no-op comparison),
               so the stop status was never actually sent to the fetcher.
             */
            $info[self::STATUS] = 'STOP_CRAWL';
            $info[self::CRAWL_TIME] = 0;
        }

        if ($status_state != 'STOP_CRAWL' &&
            $crawl_time != $prev_crawl_time) {
            $to_copy_fields = array(
                self::ALLOWED_SITES,
                self::ARC_DIR,
                self::ARC_TYPE,
                self::CRAWL_INDEX,
                self::CRAWL_TYPE,
                self::DISALLOWED_SITES,
                self::INDEXED_FILE_TYPES,
                self::PROXY_SERVERS,
                self::RESTRICT_SITES_BY_URL,
                self::SUMMARIZER_OPTION,
                self::TOR_PROXY,
            );
            foreach ($to_copy_fields as $field) {
                if (isset($status[$field])) {
                    $info[$field] = $status[$field];
                }
            }
            /*
               When initiating a new crawl AND there are active
               classifiers (an array of class labels), then augment the
               info with compressed, serialized versions of each active
               classifier so that each fetcher can reconstruct the same
               classifiers.
             */
            $classifier_array = array();
            if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                $classifier_array =
                    array_merge($status[self::ACTIVE_CLASSIFIERS]);
                $info[self::ACTIVE_CLASSIFIERS] =
                    $status[self::ACTIVE_CLASSIFIERS];
            }
            if (isset($status[self::ACTIVE_RANKERS])) {
                $classifier_array = array_merge(
                    $classifier_array,
                    $status[self::ACTIVE_RANKERS]
                );
                $info[self::ACTIVE_RANKERS] = $status[self::ACTIVE_RANKERS];
            }
            if ($classifier_array != array()) {
                $classifiers_data =
                    Classifier::loadClassifiersData($classifier_array);
                $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
            }
        }
    }

    $info[self::QUEUE_SERVERS] =
        $this->model("machine")->getQueueServerUrls();
    $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
    $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
    // With no queue servers configured, fall back to the name server
    if (count($info[self::QUEUE_SERVERS]) == 0) {
        $info[self::QUEUE_SERVERS] = array(NAME_SERVER);
    }

    $data = array();
    $data['MESSAGE'] = serialize($info);
    $this->displayView($view, $data);
}