Example no. 1
 /**
  * Checks for the crawl time according either to crawl_status.txt or to
  * network_status.txt, and presents it to the requesting fetcher, along
  * with a list of available queue servers.
  *
  * Output is a serialized $info array placed in $data['MESSAGE'] and
  * rendered through the "fetch" view. When a new crawl is detected
  * (crawl time differs from the fetcher-supplied one and the crawl is not
  * being stopped), crawl parameters and any active classifier/ranker data
  * are copied into $info so the fetcher can configure itself.
  */
 function crawlTime()
 {
     $info = array();
     $info[self::STATUS] = self::CONTINUE_STATE;
     $view = "fetch";
     $cron_model = $this->model("cron");
     // Fetcher reports the crawl time it is currently working on; clamp to
     // TIMESTAMP_LEN digits after integer-cleaning the request value.
     if (isset($_REQUEST['crawl_time'])) {
         $prev_crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'), 0, TIMESTAMP_LEN);
     } else {
         $prev_crawl_time = 0;
     }
     // Run periodic housekeeping at most once per CRON_INTERVAL. A delta of
     // exactly 0 means the timestamp was just initialized, so record it
     // without doing the tasks.
     $cron_time = $cron_model->getCronTime("fetcher_restart");
     $delta = time() - $cron_time;
     if ($delta > self::CRON_INTERVAL) {
         $cron_model->updateCronTime("fetcher_restart");
         $this->doCronTasks();
     } else {
         if ($delta == 0) {
             $cron_model->updateCronTime("fetcher_restart");
         }
     }
     // Prefer the local crawl_status.txt; fall back to network_status.txt
     // (which stores the crawl time directly rather than in an array).
     $local_filename = CRAWL_DIR . "/schedules/crawl_status.txt";
     $network_filename = CRAWL_DIR . "/schedules/network_status.txt";
     if (file_exists($local_filename)) {
         $crawl_status = unserialize(file_get_contents($local_filename));
         $crawl_time = isset($crawl_status["CRAWL_TIME"]) ? $crawl_status["CRAWL_TIME"] : 0;
     } else {
         if (file_exists($network_filename)) {
             $crawl_time = unserialize(file_get_contents($network_filename));
         } else {
             $crawl_time = 0;
         }
     }
     $info[self::CRAWL_TIME] = $crawl_time;
     $status_filename = CRAWL_DIR . "/schedules/name_server_messages.txt";
     if ($crawl_time != 0 && file_exists($status_filename)) {
         /* NOTE(review): unserialize() of a file the web app itself wrote;
            returns false on a corrupt/truncated file, hence the is_array
            guards below. */
         $status = unserialize(file_get_contents($status_filename));
         $stop_requested = is_array($status) &&
             isset($status[self::STATUS]) &&
             $status[self::STATUS] == 'STOP_CRAWL';
         if ($stop_requested) {
             /* Bug fix: the original used == (a no-op comparison statement)
                instead of =, so the STOP_CRAWL status was never actually
                communicated to the fetcher. */
             $info[self::STATUS] = 'STOP_CRAWL';
             $info[self::CRAWL_TIME] = 0;
         }
         if (!$stop_requested && is_array($status) &&
             $crawl_time != $prev_crawl_time) {
             // New crawl detected: pass along the crawl configuration the
             // fetcher needs (site restrictions, archive info, proxies, etc.)
             $to_copy_fields = array(self::ALLOWED_SITES, self::ARC_DIR, self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE, self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES, self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL, self::SUMMARIZER_OPTION, self::TOR_PROXY);
             foreach ($to_copy_fields as $field) {
                 if (isset($status[$field])) {
                     $info[$field] = $status[$field];
                 }
             }
             /*
               When initiating a new crawl AND there are active
               classifiers (an array of class labels), then augment the
               info with compressed, serialized versions of each active
               classifier so that each fetcher can reconstruct the same
               classifiers.
             */
             $classifier_array = array();
             if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                 $classifier_array = array_merge($status[self::ACTIVE_CLASSIFIERS]);
                 $info[self::ACTIVE_CLASSIFIERS] = $status[self::ACTIVE_CLASSIFIERS];
             }
             if (isset($status[self::ACTIVE_RANKERS])) {
                 $classifier_array = array_merge($classifier_array, $status[self::ACTIVE_RANKERS]);
                 $info[self::ACTIVE_RANKERS] = $status[self::ACTIVE_RANKERS];
             }
             if ($classifier_array != array()) {
                 $classifiers_data = Classifier::loadClassifiersData($classifier_array);
                 $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
             }
         }
     }
     $info[self::QUEUE_SERVERS] = $this->model("machine")->getQueueServerUrls();
     $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
     $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
     // If no queue servers are configured, fall back to the name server.
     if (count($info[self::QUEUE_SERVERS]) == 0) {
         $info[self::QUEUE_SERVERS] = array(NAME_SERVER);
     }
     $data = array();
     $data['MESSAGE'] = serialize($info);
     $this->displayView($view, $data);
 }