Example #1
 /**
  * Gets data from the machine model concerning the on/off states
  * of the machines managed by this Yioop instance and then passes
  * this data to the machinestatus view.
  * @return array $data MACHINES field has information about each
  *     machine managed by this Yioop instance, as well as the on/off
  *     status of its queue_servers and fetchers.
  *     The REFRESH field is used to tell the controller that the
  *     view shouldn't have its own sidemenu.
  */
 function machineStatus()
 {
     $data = array();
     $data['REFRESH'] = "machinestatus";
     $this->pagingLogic($data, $this->model("machine"), 'MACHINES', DEFAULT_ADMIN_PAGING_NUM);
     $profile = $this->model("profile")->getProfile(WORK_DIRECTORY);
     $data['NEWS_MODE'] = isset($profile['NEWS_MODE']) ? $profile['NEWS_MODE'] : "";
     if ($data['NEWS_MODE'] == "news_process" && $data['MACHINES']['NAME_SERVER']["news_updater"] == 0) {
         // try to restart news server if dead
         CrawlDaemon::start("news_updater", 'none', "", -1);
     }
     return $data;
 }
Example #2
 /**
  * Handles admin requests related to managing the machines which perform
  * crawls
  *
  * With this activity an admin can add/delete machines to manage. For each
  * managed machine, the admin can stop and start fetchers/queue_servers
  * as well as look at their log files
  *
  * @return array $data MACHINES, their MACHINE_NAMES, and data for
  *     the FETCHER_NUMBERS drop-down
  */
 function manageMachines()
 {
     $parent = $this->parent;
     $machine_model = $parent->model("machine");
     $profile_model = $parent->model("profile");
     $data = array();
     $data["ELEMENT"] = "managemachines";
     $possible_arguments = array("addmachine", "deletemachine", "newsmode", "log", "update");
     $data['SCRIPT'] = "doUpdate();";
     $data["leftorright"] = getLocaleDirection() == 'ltr' ? "right" : "left";
     $data['MACHINE_NAMES'] = array();
     $data['FETCHER_NUMBERS'] = array(0 => 0, 1 => 1, 2 => 2, 3 => 3, 4 => 4, 5 => 5, 6 => 6, 7 => 7, 8 => 8, 16 => 16);
     $tmp = tl('system_component_select_machine');
     if (!isset($_REQUEST["has_queue_server"]) || isset($_REQUEST['is_replica'])) {
         $_REQUEST["has_queue_server"] = false;
     }
     if (isset($_REQUEST['is_replica'])) {
         $_REQUEST['num_fetchers'] = 0;
     } else {
         $_REQUEST['parent'] = "";
     }
     $request_fields = array("name" => "string", "url" => "string", "has_queue_server" => "bool", "num_fetchers" => "int", "parent" => "string");
     $r = array();
     $allset = true;
     foreach ($request_fields as $field => $type) {
         if (isset($_REQUEST[$field])) {
             $r[$field] = $parent->clean($_REQUEST[$field], $type);
             if ($type == "string") {
                 $r[$field] = trim($r[$field]);
                 if ($r[$field] == "" && $field != "parent") {
                     $allset = false;
                 }
             }
             if ($field == "url") {
                 if (isset($r[$field][strlen($r[$field]) - 1]) && $r[$field][strlen($r[$field]) - 1] != "/") {
                     $r[$field] .= "/";
                 }
                 $r[$field] = UrlParser::canonicalLink($r[$field], NAME_SERVER);
                 if (!$r[$field]) {
                     $allset = false;
                 }
             }
         } else {
             $allset = false;
         }
     }
     if (isset($r["num_fetchers"]) && in_array($r["num_fetchers"], $data['FETCHER_NUMBERS'])) {
         $data['FETCHER_NUMBER'] = $r["num_fetchers"];
     } else {
         $data['FETCHER_NUMBER'] = 0;
         if (isset($r["num_fetchers"])) {
             $r["num_fetchers"] = 0;
         }
     }
     $machine_exists = isset($r["name"]) && $machine_model->checkMachineExists("NAME", $r["name"]) || isset($r["url"]) && $machine_model->checkMachineExists("URL", $r["url"]);
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addmachine":
                 if ($allset == true && !$machine_exists) {
                     $machine_model->addMachine($r["name"], $r["url"], $r["has_queue_server"], $r["num_fetchers"], $r["parent"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_added') . "</h1>');";
                     $data['MACHINE_NAMES'][] = $r["name"];
                     $data['DELETABLE_MACHINES'][$r["name"]] = $r["name"];
                     sort($data['MACHINE_NAMES']);
                 } else {
                     if ($allset && $machine_exists) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_exists') . "</h1>');";
                     } else {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_incomplete') . "</h1>');";
                     }
                 }
                 break;
             case "deletemachine":
                 if (!$machine_exists) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_doesnt_exists') . "</h1>');";
                 } else {
                     $machines = $machine_model->getRows(0, 1, $total_rows, array(array("name", "=", $r["name"], "")));
                     $service_in_use = false;
                     foreach ($machines as $machine) {
                         if ($machine['NAME'] == $r["name"]) {
                             if (isset($machine['STATUSES']) && is_array($machine['STATUSES']) && $machine['STATUSES'] != array()) {
                                 $service_in_use = true;
                                 break;
                             } else {
                                 break;
                             }
                         }
                     }
                     if ($service_in_use) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_stop_service_first') . "</h1>');";
                         break;
                     }
                     $machine_model->deleteMachine($r["name"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_deleted') . "</h1>');";
                 }
                 break;
             case "newsmode":
                 $profile = $profile_model->getProfile(WORK_DIRECTORY);
                 $news_modes = array("news_off", "news_web", "news_process");
                 if (isset($_REQUEST['news_mode']) && in_array($_REQUEST['news_mode'], $news_modes)) {
                     $profile["NEWS_MODE"] = $_REQUEST['news_mode'];
                     if ($profile["NEWS_MODE"] != "news_process") {
                         CrawlDaemon::stop("news_updater", "", false);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     } else {
                         CrawlDaemon::start("news_updater", 'none', "", -1);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     }
                     $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_update_failed') . "</h1>');";
                 }
                 break;
             case "log":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 }
                 if (isset($_REQUEST["mirror_name"])) {
                     $r["mirror_name"] = $parent->clean($_REQUEST["mirror_name"], "string");
                 }
                 if (isset($_REQUEST["time"])) {
                     $data["time"] = $parent->clean($_REQUEST["time"], "int") + 30;
                 } else {
                     $data["time"] = 30;
                 }
                 if (isset($_REQUEST["NO_REFRESH"])) {
                     $data["NO_REFRESH"] = $parent->clean($_REQUEST["NO_REFRESH"], "bool");
                 } else {
                     $data["NO_REFRESH"] = false;
                 }
                 $data["ELEMENT"] = "machinelog";
                 $filter = "";
                 if (isset($_REQUEST['f'])) {
                     $filter = $parent->clean($_REQUEST['f'], "string");
                 }
                 $data['filter'] = $filter;
                 $data["REFRESH_LOG"] = "&time=" . $data["time"];
                 $data["LOG_TYPE"] = "";
                 if (isset($r['fetcher_num']) && isset($r['name'])) {
                     $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], $r["fetcher_num"], $filter);
                     $data["LOG_TYPE"] = $r['name'] . " fetcher " . $r["fetcher_num"];
                     $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'] . "&fetcher_num=" . $r['fetcher_num'];
                 } else {
                     if (isset($r["mirror_name"])) {
                         $data["LOG_TYPE"] = $r['mirror_name'] . " mirror";
                         $data["LOG_FILE_DATA"] = $machine_model->getLog($r["mirror_name"], NULL, $filter, true);
                     } else {
                         if (isset($r['name'])) {
                             $data["LOG_TYPE"] = $r['name'] . " queue_server";
                             if ($r['name'] == "news") {
                                 $data["LOG_TYPE"] = "Name Server News Updater";
                             }
                             $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], NULL, $filter);
                             $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'];
                         }
                     }
                 }
                 if ($data["time"] >= ONE_HOUR / 3) {
                     $data["REFRESH_LOG"] = "";
                 }
                 if (!isset($data["LOG_FILE_DATA"]) || $data["LOG_FILE_DATA"] == "") {
                     $data["LOG_FILE_DATA"] = tl('system_component_no_machine_log');
                 }
                 $lines = array_reverse(explode("\n", $data["LOG_FILE_DATA"]));
                 $data["LOG_FILE_DATA"] = implode("\n", $lines);
                 break;
             case "update":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 } else {
                     $r["fetcher_num"] = NULL;
                 }
                 $available_actions = array("start", "stop", "mirror_start", "mirror_stop");
                 if (isset($r["name"]) && isset($_REQUEST["action"]) && in_array($_REQUEST["action"], $available_actions)) {
                     $action = $_REQUEST["action"];
                     $is_mirror = false;
                     if ($action == "mirror_start") {
                         $action = "start";
                         $is_mirror = true;
                     } else {
                         if ($action == "mirror_stop") {
                             $action = "stop";
                             $is_mirror = true;
                         }
                     }
                     $machine_model->update($r["name"], $action, $r["fetcher_num"], $is_mirror);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_servers_updated') . "</h1>');";
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_no_action') . "</h1>');";
                 }
                 break;
         }
     }
     $parent->pagingLogic($data, $machine_model, "MACHINE", DEFAULT_ADMIN_PAGING_NUM);
     if (!isset($_REQUEST['arg']) || $_REQUEST['arg'] != 'log') {
         $data['SCRIPT'] .= "toggleReplica(false);";
     }
     return $data;
 }
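The field-cleaning loop near the top of manageMachines() is worth noting as a pattern: every request parameter the activity accepts is listed together with an expected type and run through $parent->clean() before it is used. A minimal stand-alone sketch of the same idea, where clean_value() is a hypothetical substitute for the controller's clean() method:
 // Whitelist-and-clean pattern: ignore request fields that are not listed,
 // and coerce the listed ones to their declared types before use.
 function clean_value($value, $type)
 {
     switch ($type) {
         case "int":
             return (int)$value;
         case "bool":
             return (bool)$value;
         default: // "string"
             return trim((string)$value);
     }
 }
 $request_fields = array("name" => "string", "url" => "string",
     "has_queue_server" => "bool", "num_fetchers" => "int");
 $r = array();
 foreach ($request_fields as $field => $type) {
     if (isset($_REQUEST[$field])) {
         $r[$field] = clean_value($_REQUEST[$field], $type);
     }
 }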
Example #3
 /**
  * Main runtime loop of the queue_server.
  *
  * Loops until a stop message is received; checks for start, stop, and
  * resume crawl messages; deletes any WebQueueBundle for which an
  * IndexArchiveBundle does not exist; and processes incoming crawl data.
  */
 function loop()
 {
     $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
     crawlLog("In queue loop!! {$this->server_name}", "queue_server");
     if ($this->isAIndexer()) {
         $this->deleteOrphanedBundles();
     }
     while (CrawlDaemon::processHandler()) {
         crawlLog("{$this->server_name} peak memory usage so far: " . memory_get_peak_usage() . "!!");
         $info = $this->handleAdminMessages($info);
         if ($info[self::STATUS] == self::WAITING_START_MESSAGE_STATE) {
             crawlLog("{$this->server_name} is waiting for start message\n");
             sleep(QUEUE_SLEEP_TIME);
             continue;
         }
         if ($info[self::STATUS] == self::STOP_STATE) {
             continue;
         }
         crawlLog("{$this->server_name} active crawl is " . "{$this->crawl_time}.");
         if ($this->isAScheduler()) {
             crawlLog("Current queue size is:" . $this->web_queue->to_crawl_queue->count);
         }
         $start_loop_time = time();
         //check and update if necessary the crawl params of current crawl
         $this->checkUpdateCrawlParameters();
         $this->updateMostRecentFetcher();
         $this->processCrawlData();
         $time_diff = time() - $start_loop_time;
         if ($time_diff < QUEUE_SLEEP_TIME) {
             crawlLog("Sleeping...");
             sleep(QUEUE_SLEEP_TIME - $time_diff);
         }
     }
     crawlLog("{$this->server_name} shutting down!!");
 }
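The end of the loop body above is a small pacing idiom: measure how long the pass took and sleep for the remainder of QUEUE_SLEEP_TIME so the server never iterates faster than once per interval. A minimal sketch of the same idiom outside Yioop, where LOOP_INTERVAL and $do_work are hypothetical stand-ins:
 // Pace a daemon-style loop so each pass takes at least LOOP_INTERVAL seconds.
 define('LOOP_INTERVAL', 5);              // stand-in for QUEUE_SLEEP_TIME
 $do_work = function () {
     // one unit of work per pass goes here
 };
 while (true) {
     $start_loop_time = time();
     $do_work();
     $time_diff = time() - $start_loop_time;
     if ($time_diff < LOOP_INTERVAL) {
         sleep(LOOP_INTERVAL - $time_diff);
     }
 }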
Example #4
 /**
  * Handles admin requests for creating, editing, and deleting classifiers.
  *
  * This activity implements the logic for the page that lists existing
  * classifiers, including the actions that can be performed on them.
  */
 function manageClassifiers()
 {
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $possible_arguments = array('createclassifier', 'editclassifier', 'finalizeclassifier', 'deleteclassifier', 'search');
     $data['ELEMENT'] = 'manageclassifiers';
     $data['SCRIPT'] = '';
     $data['FORM_TYPE'] = '';
     $search_array = array();
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $num_machines = count($machine_urls);
     if ($num_machines < 1 || $num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0])) {
         $machine_urls = NULL;
     }
     $data['leftorright'] = getLocaleDirection() == 'ltr' ? 'right' : 'left';
     $classifiers = Classifier::getClassifierList();
     $start_finalizing = false;
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         if (isset($_REQUEST['name'])) {
             $name = substr($parent->clean($_REQUEST['name'], 'string'), 0, NAME_LEN);
             $name = Classifier::cleanLabel($name);
         } else {
             if (isset($_REQUEST['class_label'])) {
                 $name = substr($parent->clean($_REQUEST['class_label'], 'string'), 0, NAME_LEN);
                 $name = Classifier::cleanLabel($name);
             } else {
                 $name = "";
             }
         }
         switch ($_REQUEST['arg']) {
             case 'createclassifier':
                 if (!isset($classifiers[$name])) {
                     $classifier = new Classifier($name);
                     Classifier::setClassifier($classifier);
                     $classifiers[$name] = $classifier;
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_new_classifier') . '</h1>\');';
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_classifier_exists') . '</h1>\');';
                 }
                 break;
             case 'deleteclassifier':
                 /*
                   In addition to deleting the classifier, we also want to
                   delete the associated crawl mix (if one exists) used to
                   iterate over existing indexes in search of new training
                   examples.
                 */
                 if (isset($classifiers[$name])) {
                     unset($classifiers[$name]);
                     Classifier::deleteClassifier($name);
                     $mix_name = Classifier::getCrawlMixName($name);
                     $mix_time = $crawl_model->getCrawlMixTimestamp($mix_name);
                     if ($mix_time) {
                         $crawl_model->deleteCrawlMixIteratorState($mix_time);
                         $crawl_model->deleteCrawlMix($mix_time);
                     }
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_classifier_deleted') . '</h1>\');';
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_no_classifier') . '</h1>\');';
                 }
                 break;
             case 'editclassifier':
                 if (isset($classifiers[$name])) {
                     $data['class_label'] = $name;
                     $this->editClassifier($data, $classifiers, $machine_urls);
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_no_classifier') . '</h1>\');';
                 }
                 break;
             case 'finalizeclassifier':
                 /*
                   Finalizing is too expensive to be done directly in the
                   controller that responds to the web request. Instead, a
                   daemon is launched to finalize the classifier
                   asynchronously and save it back to disk when it's done.
                   In the meantime, a flag is set to indicate the current
                   finalizing state.
                 */
                 CrawlDaemon::start("classifier_trainer", $name, '', -1);
                 $classifier = $classifiers[$name];
                 $classifier->finalized = Classifier::FINALIZING;
                 $start_finalizing = true;
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_finalizing_classifier') . '</h1>\');';
                 break;
             case 'search':
                 $search_array = $parent->tableSearchRequestHandler($data, array('name'));
                 break;
         }
     }
     $data['classifiers'] = $classifiers;
     if ($search_array == array()) {
         $search_array[] = array("name", "", "", "ASC");
     }
     $parent->pagingLogic($data, 'classifiers', 'classifiers', DEFAULT_ADMIN_PAGING_NUM, $search_array, "", array('name' => 'class_label'));
     $data['reload'] = false;
     foreach ($classifiers as $label => $classifier) {
         if ($classifier->finalized == Classifier::FINALIZING) {
             $data['reload'] = true;
             break;
         }
     }
     if ($data['reload'] && !$start_finalizing) {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_finalizing_classifier') . '</h1>\');';
     }
     return $data;
 }
Example #5
 /**
  * Main loop for the fetcher.
  *
  * Checks for stop message, checks queue server if crawl has changed and
  * for new pages to crawl. Loop gets a group of next pages to crawl if
  * there are pages left to crawl (otherwise sleep 5 seconds). It downloads
  * these pages, deduplicates them, and updates the found site info with the
  * result before looping again.
  */
 function loop()
 {
     crawlLog("In Fetch Loop");
     $prefix = $this->fetcher_num . "-";
     if (!file_exists(CRAWL_DIR . "/{$prefix}temp")) {
         mkdir(CRAWL_DIR . "/{$prefix}temp");
     }
     $info[self::STATUS] = self::CONTINUE_STATE;
     $local_archives = array("");
     while (CrawlDaemon::processHandler()) {
         $start_time = microtime();
         $fetcher_message_file = CRAWL_DIR . "/schedules/{$prefix}fetcher_messages.txt";
         if (file_exists($fetcher_message_file)) {
             $info = unserialize(file_get_contents($fetcher_message_file));
             unlink($fetcher_message_file);
             if (isset($info[self::STATUS]) && $info[self::STATUS] == self::STOP_STATE) {
                 continue;
             }
         }
         $switch_fetch_or_no_current = $this->checkCrawlTime();
         if ($switch_fetch_or_no_current) {
             /* case(1) */
             crawlLog("MAIN LOOP CASE 1 --" . " SWITCH CRAWL OR NO CURRENT CRAWL");
             $info[self::CRAWL_TIME] = $this->crawl_time;
             if ($info[self::CRAWL_TIME] == 0) {
                 $info[self::STATUS] = self::NO_DATA_STATE;
                 $this->to_crawl = array();
             }
         } else {
             if ($this->crawl_type == self::ARCHIVE_CRAWL && $this->arc_type != "WebArchiveBundle" && $this->arc_type != "") {
                 /* case(2) */
                 // An archive crawl with data coming from the name server.
                 crawlLog("MAIN LOOP CASE 2 -- ARCHIVE SCHEDULER (NOT RECRAWL)");
                 $info = $this->checkArchiveScheduler();
                 if ($info === false) {
                     crawlLog("No Archive Schedule Data..." . " will try again in " . FETCH_SLEEP_TIME . " seconds.");
                     sleep(FETCH_SLEEP_TIME);
                     continue;
                 }
             } else {
                 if ($this->crawl_time > 0) {
                     /* case(3) */
                     // Either a web crawl or a recrawl of a previous web crawl.
                     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
                         crawlLog("MAIN LOOP CASE 3 -- RECRAWL SCHEDULER");
                     } else {
                         crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER");
                     }
                     $info = $this->checkScheduler();
                     if ($info === false) {
                         crawlLog("Cannot connect to name server..." . " will try again in " . FETCH_SLEEP_TIME . " seconds.");
                         sleep(FETCH_SLEEP_TIME);
                         continue;
                     }
                 } else {
                     crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL");
                     $info[self::STATUS] = self::NO_DATA_STATE;
                 }
             }
         }
         /* case(2), case(3) might have set info without
              $info[self::STATUS] being set
            */
         if (!isset($info[self::STATUS])) {
             if ($info === true) {
                 $info = array();
             }
             $info[self::STATUS] = self::CONTINUE_STATE;
         }
         if ($info[self::STATUS] == self::NO_DATA_STATE) {
             crawlLog("No data. Sleeping...");
             sleep(FETCH_SLEEP_TIME);
             continue;
         }
         $tmp_base_name = isset($info[self::CRAWL_TIME]) ? CRAWL_DIR . "/cache/{$prefix}" . self::archive_base_name . $info[self::CRAWL_TIME] : "";
         if (isset($info[self::CRAWL_TIME]) && ($this->web_archive == NULL || $this->web_archive->dir_name != $tmp_base_name)) {
             if (isset($this->web_archive->dir_name)) {
                 crawlLog("Old name: " . $this->web_archive->dir_name);
             }
             if (is_object($this->web_archive)) {
                 $this->web_archive = NULL;
             }
             $this->to_crawl_again = array();
             $this->found_sites = array();
             gc_collect_cycles();
             $this->web_archive = new WebArchiveBundle($tmp_base_name, false);
             $this->crawl_time = $info[self::CRAWL_TIME];
             $this->sum_seen_title_length = 0;
             $this->sum_seen_description_length = 0;
             $this->sum_seen_site_link_length = 0;
             $this->num_seen_sites = 0;
             crawlLog("New name: " . $this->web_archive->dir_name);
             crawlLog("Switching archive...");
             if (!isset($info[self::ARC_DATA])) {
                 continue;
             }
         }
         switch ($this->crawl_type) {
             case self::WEB_CRAWL:
                 $downloaded_pages = $this->downloadPagesWebCrawl();
                 break;
             case self::ARCHIVE_CRAWL:
                 if (isset($info[self::ARC_DATA])) {
                     $downloaded_pages = $info[self::ARC_DATA];
                 } else {
                     $downloaded_pages = $this->downloadPagesArchiveCrawl();
                 }
                 break;
         }
         if (isset($downloaded_pages["NO_PROCESS"])) {
             unset($downloaded_pages["NO_PROCESS"]);
             $summarized_site_pages = array_values($downloaded_pages);
             $this->no_process_links = true;
         } else {
             $summarized_site_pages = $this->processFetchPages($downloaded_pages);
             $this->no_process_links = false;
         }
         crawlLog("Number of summarized pages " . count($summarized_site_pages));
         $force_send = isset($info[self::END_ITERATOR]) && $info[self::END_ITERATOR] ? true : false;
         $this->updateFoundSites($summarized_site_pages, $force_send);
         $sleep_time = max(0, ceil(MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)));
         if ($sleep_time > 0) {
             crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
             sleep($sleep_time);
         }
     }
     //end while
     crawlLog("Fetcher shutting down!!");
 }
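The top of the loop polls CRAWL_DIR . "/schedules/{$prefix}fetcher_messages.txt" for a serialized status array, so an external script can ask a running fetcher to stop at its next pass by writing that file. A minimal sketch, assuming Yioop's configuration (which defines CRAWL_DIR) is loaded and that the STATUS and STOP_STATE keys used above live on the CrawlConstants class; fetcher number 3 is purely illustrative:
 // Ask (hypothetical) fetcher #3 to stop by writing the message file that
 // its loop() already checks for at the start of every pass.
 $fetcher_num = 3;
 $prefix = $fetcher_num . "-";
 $message = array(CrawlConstants::STATUS => CrawlConstants::STOP_STATE);
 file_put_contents(CRAWL_DIR . "/schedules/{$prefix}fetcher_messages.txt",
     serialize($message));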
Example #6
 /**
  * Used to stop a daemon that is running in the background
  *
  * @param string $name the main name of this daemon such as queue_server
  *     or fetcher.
  * @param string $subname the instance name if it is possible for more
  *     than one copy of the daemon to be running at the same time
  * @param bool $exit whether this method should just return (false) or
  *      call exit() (true)
  */
 static function stop($name, $subname = "", $exit = true)
 {
     $name_string = CrawlDaemon::getNameString($name, $subname);
     $lock_file = CrawlDaemon::getLockFileName($name, $subname);
     $not_web_setting = php_sapi_name() == 'cli';
     if (file_exists($lock_file)) {
         unlink($lock_file);
         if ($not_web_setting) {
             crawlLog("Sending stop signal to {$name_string}...");
         }
     } else {
         if ($not_web_setting) {
             crawlLog("{$name_string} does not appear to running...");
         }
     }
     if ($exit) {
         exit;
     }
 }
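For reference, two usage sketches of this method based on the calls that appear in the other examples here (the fetcher instance name "2" is only illustrative):
 // Stop the queue_server of this instance and let stop() call exit().
 CrawlDaemon::stop("queue_server");
 // Stop fetcher instance "2" but keep the calling script running.
 CrawlDaemon::stop("fetcher", "2", false);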
Example #7
 /**
  * Main loop for the mirror script.
  */
 function loop()
 {
     crawlLog("In Sync Loop");
     $info[self::STATUS] = self::CONTINUE_STATE;
     while (CrawlDaemon::processHandler()) {
         $syncer_message_file = CRAWL_DIR . "/schedules/mirror_messages.txt";
         if (file_exists($syncer_message_file)) {
             $info = unserialize(file_get_contents($syncer_message_file));
             unlink($syncer_message_file);
             if (isset($info[self::STATUS]) && $info[self::STATUS] == self::STOP_STATE) {
                 continue;
             }
         }
         $info = $this->checkScheduler();
         if ($info === false) {
             crawlLog("Cannot connect to queue server..." . " will try again in " . MIRROR_NOTIFY_FREQUENCY . " seconds.");
             sleep(MIRROR_NOTIFY_FREQUENCY);
             continue;
         }
         if ($info[self::STATUS] == self::NO_DATA_STATE) {
             crawlLog("No data from queue server. Sleeping...");
             sleep(MIRROR_SYNC_FREQUENCY);
             continue;
         }
         $this->copyNextSyncFile();
     }
     //end while
     crawlLog("Mirror shutting down!!");
 }
Example #8
 /**
  * Returns the statuses of the fetchers and queue_servers of the machines
  * in the machine table, as well as the names and URLs of these machines
  *
  * @param array $machines an array of machines to check the status for
  * @return array  a list of machines, together with all their properties
  * and the statuses of their fetchers and queue_servers
  */
 function getMachineStatuses($machines = array())
 {
     $num_machines = count($machines);
     $time = time();
     $session = md5($time . AUTH_KEY);
     for ($i = 0; $i < $num_machines; $i++) {
         $hash_url = crawlHash($machines[$i]["URL"]);
         $machines[$i][CrawlConstants::URL] = $machines[$i]["URL"] . "?c=machine&a=statuses&time={$time}" . "&session={$session}&arg={$hash_url}";
     }
     $statuses = FetchUrl::getPages($machines);
     for ($i = 0; $i < $num_machines; $i++) {
         foreach ($statuses as $status) {
             if ($machines[$i][CrawlConstants::URL] == $status[CrawlConstants::URL]) {
                 $pre_status = json_decode($status[CrawlConstants::PAGE], true);
                 if (is_array($pre_status)) {
                     $machines[$i]["STATUSES"] = $pre_status;
                 } else {
                     $machines[$i]["STATUSES"] = "NOT_CONFIGURED_ERROR";
                 }
             }
         }
     }
     $sql = "SELECT * FROM ACTIVE_FETCHER";
     $result = $this->db->execute($sql);
     if (!$result) {
         return $machines;
     }
     $active_fetchers = array();
     while ($row = $this->db->fetchArray($result)) {
         for ($i = 0; $i < $num_machines; $i++) {
             if ($machines[$i]['NAME'] == $row['NAME']) {
                 if (!isset($machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']])) {
                     $machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']] = 0;
                 }
             }
         }
     }
     stringROrderCallback("", "", "NAME");
     if ($machines != array()) {
         usort($machines, "stringROrderCallback");
     }
     $name_server_statuses = CrawlDaemon::statuses();
     $machines['NAME_SERVER']['news_updater'] = 0;
     if (isset($name_server_statuses['news_updater'])) {
         $machines['NAME_SERVER']['news_updater'] = 1;
     }
     return $machines;
 }
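The status request built above authenticates itself with session = md5(time . AUTH_KEY); the examples here do not show the receiving controller, but presumably it recomputes the same digest before answering. A minimal sketch of such a check, where verify_machine_request() is a hypothetical name and AUTH_KEY is assumed to be defined on the receiving machine as well:
 // Hypothetical check a receiving machine might apply to the time/session
 // pair that getMachineStatuses() appends to its status URLs.
 function verify_machine_request($request)
 {
     if (!isset($request['time'], $request['session'])) {
         return false;
     }
     $expected = md5($request['time'] . AUTH_KEY);
     // hash_equals (PHP >= 5.6) avoids leaking timing information
     return hash_equals($expected, $request['session']);
 }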
Example #9
 /**
  * This is the function that should be called to get the
  * classifier_trainer to start training a logistic regression instance for
  * a particular classifier. The class label corresponding to the
  * classifier to be finalized should be passed as the second command-line
  * argument.
  */
 function start()
 {
     global $argv;
     CrawlDaemon::init($argv, "classifier_trainer");
     $label = $argv[2];
     crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true);
     $classifier = Classifier::getClassifier($label);
     $classifier->prepareToFinalize();
     $classifier->finalize();
     Classifier::setClassifier($classifier);
     crawlLog("Training complete.\n");
     CrawlDaemon::stop('classifier_trainer', $label);
 }
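As Example #4 shows, this daemon is normally launched from manageClassifiers() rather than run by hand. A minimal sketch of that launch, where "spam" is only a stand-in class label; the label is passed as the daemon's subname, which start() above reads back as $argv[2]:
 // Launch the classifier_trainer daemon to finalize a hypothetical "spam"
 // classifier in the background (mirrors the call in manageClassifiers()).
 CrawlDaemon::start("classifier_trainer", "spam", '', -1);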
Example #10
 /**
  * Used to start/stop a queue_server, mirror, or fetcher of the current
  * Yioop instance based on the queue_server, mirror, and fetcher fields
  * of the current $_REQUEST
  */
 function update()
 {
     $statuses = CrawlDaemon::statuses();
     if (isset($_REQUEST['queue_server'])) {
         if ($_REQUEST['queue_server'] == "true" && !isset($statuses["queue_server"][-1])) {
             CrawlDaemon::start("queue_server", 'none', self::INDEXER, 0);
             CrawlDaemon::start("queue_server", 'none', self::SCHEDULER, 2);
         } else {
             if ($_REQUEST['queue_server'] == "false" && isset($statuses["queue_server"][-1])) {
                 CrawlDaemon::stop("queue_server");
             }
         }
     }
     if (isset($_REQUEST['mirror'])) {
         if ($_REQUEST['mirror'] == "true" && !isset($statuses["mirror"][-1])) {
             CrawlDaemon::start("mirror");
         } else {
             if ($_REQUEST['mirror'] == "false" && isset($statuses["mirror"][-1])) {
                 CrawlDaemon::stop("mirror");
             }
         }
     }
     if (isset($_REQUEST['fetcher']) && is_array($_REQUEST['fetcher'])) {
         foreach ($_REQUEST['fetcher'] as $index => $value) {
             if ($value == "true" && !isset($statuses["fetcher"][$index])) {
                 CrawlDaemon::start("fetcher", "{$index}");
             } else {
                 if ($value == "false" && isset($statuses["fetcher"][$index])) {
                     CrawlDaemon::stop("fetcher", "{$index}");
                 }
             }
         }
     }
 }
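Since the method only inspects $_REQUEST, its effect is easy to see with a simulated request. A minimal sketch, where $machine_controller is a hypothetical handle on the object that defines update():
 // Simulate a request that turns the queue_server on, the mirror off,
 // and toggles two fetcher instances, then apply it.
 $_REQUEST['queue_server'] = "true";
 $_REQUEST['mirror'] = "false";
 $_REQUEST['fetcher'] = array(0 => "true", 1 => "false");
 $machine_controller->update();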
Example #11
 /**
  * Main loop for the news updater.
  */
 function loop()
 {
     crawlLog("In News Update Loop");
     $info[self::STATUS] = self::CONTINUE_STATE;
     $local_archives = array("");
     while (CrawlDaemon::processHandler()) {
         $start_time = microtime();
         crawlLog("Checking if news feeds should be updated...");
         $this->newsUpdate();
         $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
         if ($sleep_time > 0) {
             crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
             sleep($sleep_time);
         }
     }
     //end while
     crawlLog("News Updater shutting down!!");
 }