/**
 * Gets data from the machine model concerning the on/off states
 * of the machines managed by this Yioop instance and then passes
 * this data to the machinestatus view.
 *
 * @return array $data MACHINES field has information about each
 *     machine managed by this Yioop instance as well the on off
 *     status of its queue_servers and fetchers.
 *     The REFRESH field is used to tell the controller that the
 *     view shouldn't have its own sidemenu.
 */
function machineStatus()
{
    $data = array();
    $data['REFRESH'] = "machinestatus";
    // fills $data['MACHINES'] with a page of machine status rows
    $this->pagingLogic($data, $this->model("machine"), 'MACHINES',
        DEFAULT_ADMIN_PAGING_NUM);
    $profile = $this->model("profile")->getProfile(WORK_DIRECTORY);
    $data['NEWS_MODE'] = isset($profile['NEWS_MODE']) ?
        $profile['NEWS_MODE'] : "";
    /* isset() guard added: previously a missing
       MACHINES/NAME_SERVER/news_updater entry raised an undefined index
       notice and, because null == 0 in PHP, caused a spurious restart of
       the news updater */
    if ($data['NEWS_MODE'] == "news_process" &&
        isset($data['MACHINES']['NAME_SERVER']["news_updater"]) &&
        $data['MACHINES']['NAME_SERVER']["news_updater"] == 0) {
        // try to restart news server if dead
        CrawlDaemon::start("news_updater", 'none', "", -1);
    }
    return $data;
}
/**
 * Handles admin requests related to managing the machines which perform
 * crawls
 *
 * With this activity an admin can add/delete machines to manage. For each
 * managed machine, the admin can stop and start fetchers/queue_servers
 * as well as look at their log files
 *
 * @return array $data MACHINES, their MACHINE_NAMES, data for
 *     FETCHER_NUMBERS drop-down
 */
function manageMachines()
{
    $parent = $this->parent;
    $machine_model = $parent->model("machine");
    $profile_model = $parent->model("profile");
    $data = array();
    $data["ELEMENT"] = "managemachines";
    // only these $_REQUEST['arg'] values are acted upon below
    $possible_arguments = array("addmachine", "deletemachine", "newsmode",
        "log", "update");
    $data['SCRIPT'] = "doUpdate();";
    $data["leftorright"] = getLocaleDirection() == 'ltr' ? "right" : "left";
    $data['MACHINE_NAMES'] = array();
    // choices shown in the num_fetchers drop-down of the add machine form
    $data['FETCHER_NUMBERS'] = array(0 => 0, 1 => 1, 2 => 2, 3 => 3,
        4 => 4, 5 => 5, 6 => 6, 7 => 7, 8 => 8, 16 => 16);
    // NOTE(review): $tmp appears unused after this assignment --
    // presumably kept so the locale string gets registered; confirm
    $tmp = tl('system_component_select_machine');
    /* normalize request fields before cleaning: a replica (mirror)
       machine never has its own queue server or fetchers */
    if (!isset($_REQUEST["has_queue_server"]) ||
        isset($_REQUEST['is_replica'])) {
        $_REQUEST["has_queue_server"] = false;
    }
    if (isset($_REQUEST['is_replica'])) {
        $_REQUEST['num_fetchers'] = 0;
    } else {
        $_REQUEST['parent'] = "";
    }
    // fields describing a machine and the type each is cleaned as
    $request_fields = array("name" => "string", "url" => "string",
        "has_queue_server" => "bool", "num_fetchers" => "int",
        "parent" => "string");
    $r = array();
    // $allset tracks whether a complete, valid machine description
    // was submitted (used by the addmachine case)
    $allset = true;
    foreach ($request_fields as $field => $type) {
        if (isset($_REQUEST[$field])) {
            $r[$field] = $parent->clean($_REQUEST[$field], $type);
            if ($type == "string") {
                $r[$field] = trim($r[$field]);
                // only "parent" may legitimately be empty
                if ($r[$field] == "" && $field != "parent") {
                    $allset = false;
                }
            }
            if ($field == "url") {
                // ensure the machine url ends in a slash, then
                // canonicalize it relative to the name server
                if (isset($r[$field][strlen($r[$field]) - 1]) &&
                    $r[$field][strlen($r[$field]) - 1] != "/") {
                    $r[$field] .= "/";
                }
                $r[$field] = UrlParser::canonicalLink($r[$field],
                    NAME_SERVER);
                if (!$r[$field]) {
                    $allset = false;
                }
            }
        } else {
            $allset = false;
        }
    }
    // echo back a recognized fetcher count to the view, else default to 0
    if (isset($r["num_fetchers"]) &&
        in_array($r["num_fetchers"], $data['FETCHER_NUMBERS'])) {
        $data['FETCHER_NUMBER'] = $r["num_fetchers"];
    } else {
        $data['FETCHER_NUMBER'] = 0;
        if (isset($r["num_fetchers"])) {
            $r["num_fetchers"] = 0;
        }
    }
    // a machine "exists" if either its name or its url is already known
    $machine_exists = isset($r["name"]) &&
        $machine_model->checkMachineExists("NAME", $r["name"]) ||
        isset($r["url"]) &&
        $machine_model->checkMachineExists("URL", $r["url"]);
    if (isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        switch ($_REQUEST['arg']) {
            case "addmachine":
                // add only when all fields validated and not a duplicate
                if ($allset == true && !$machine_exists) {
                    $machine_model->addMachine($r["name"], $r["url"],
                        $r["has_queue_server"], $r["num_fetchers"],
                        $r["parent"]);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_added') . "</h1>');";
                    $data['MACHINE_NAMES'][] = $r["name"];
                    $data['DELETABLE_MACHINES'][$r["name"]] = $r["name"];
                    sort($data['MACHINE_NAMES']);
                } else {
                    if ($allset && $machine_exists) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_machine_exists') .
                            "</h1>');";
                    } else {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_machine_incomplete') .
                            "</h1>');";
                    }
                }
                break;
            case "deletemachine":
                if (!$machine_exists) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_doesnt_exists') .
                        "</h1>');";
                } else {
                    /* refuse to delete a machine whose queue_server or
                       fetchers are still running */
                    // NOTE(review): $total_rows is presumably a
                    // by-reference out parameter of getRows -- confirm
                    $machines = $machine_model->getRows(0, 1,
                        $total_rows,
                        array(array("name", "=", $r["name"], "")));
                    $service_in_use = false;
                    foreach ($machines as $machine) {
                        if ($machine['NAME'] == $r["name"]) {
                            // a non-empty STATUSES array means some
                            // daemon of this machine is still running
                            if (isset($machine['STATUSES']) &&
                                is_array($machine['STATUSES']) &&
                                $machine['STATUSES'] != array()) {
                                $service_in_use = true;
                                break;
                            } else {
                                break;
                            }
                        }
                    }
                    if ($service_in_use) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_stop_service_first') .
                            "</h1>');";
                        break;
                    }
                    $machine_model->deleteMachine($r["name"]);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_deleted') .
                        "</h1>');";
                }
                break;
            case "newsmode":
                // switch how news feeds are updated; starting/stopping
                // the news_updater daemon as needed
                $profile = $profile_model->getProfile(WORK_DIRECTORY);
                $news_modes = array("news_off", "news_web",
                    "news_process");
                if (isset($_REQUEST['news_mode']) &&
                    in_array($_REQUEST['news_mode'], $news_modes)) {
                    $profile["NEWS_MODE"] = $_REQUEST['news_mode'];
                    if ($profile["NEWS_MODE"] != "news_process") {
                        // updater daemon only runs in news_process mode
                        CrawlDaemon::stop("news_updater", "", false);
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_news_mode_updated') .
                            "</h1>');";
                    } else {
                        CrawlDaemon::start("news_updater", 'none', "",
                            -1);
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_news_mode_updated') .
                            "</h1>');";
                    }
                    $profile_model->updateProfile(WORK_DIRECTORY,
                        array(), $profile);
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_news_update_failed') .
                        "</h1>');";
                }
                break;
            case "log":
                // show a (filtered) log file for a fetcher, mirror,
                // queue_server, or the news updater
                if (isset($_REQUEST["fetcher_num"])) {
                    $r["fetcher_num"] =
                        $parent->clean($_REQUEST["fetcher_num"], "int");
                }
                if (isset($_REQUEST["mirror_name"])) {
                    $r["mirror_name"] =
                        $parent->clean($_REQUEST["mirror_name"],
                        "string");
                }
                // the view auto-refreshes; each refresh adds 30 to time
                if (isset($_REQUEST["time"])) {
                    $data["time"] =
                        $parent->clean($_REQUEST["time"], "int") + 30;
                } else {
                    $data["time"] = 30;
                }
                if (isset($_REQUEST["NO_REFRESH"])) {
                    $data["NO_REFRESH"] =
                        $parent->clean($_REQUEST["NO_REFRESH"], "bool");
                } else {
                    $data["NO_REFRESH"] = false;
                }
                $data["ELEMENT"] = "machinelog";
                $filter = "";
                if (isset($_REQUEST['f'])) {
                    $filter = $parent->clean($_REQUEST['f'], "string");
                }
                $data['filter'] = $filter;
                $data["REFRESH_LOG"] = "&time=" . $data["time"];
                $data["LOG_TYPE"] = "";
                if (isset($r['fetcher_num']) && isset($r['name'])) {
                    // a fetcher log of the named machine
                    $data["LOG_FILE_DATA"] =
                        $machine_model->getLog($r["name"],
                        $r["fetcher_num"], $filter);
                    $data["LOG_TYPE"] = $r['name'] . " fetcher " .
                        $r["fetcher_num"];
                    $data["REFRESH_LOG"] .= "&arg=log&name=" .
                        $r['name'] . "&fetcher_num=" . $r['fetcher_num'];
                } else {
                    if (isset($r["mirror_name"])) {
                        // a mirror log
                        $data["LOG_TYPE"] = $r['mirror_name'] .
                            " mirror";
                        $data["LOG_FILE_DATA"] =
                            $machine_model->getLog($r["mirror_name"],
                            NULL, $filter, true);
                    } else {
                        if (isset($r['name'])) {
                            // a queue_server (or news updater) log
                            $data["LOG_TYPE"] = $r['name'] .
                                " queue_server";
                            if ($r['name'] == "news") {
                                $data["LOG_TYPE"] =
                                    "Name Server News Updater";
                            }
                            $data["LOG_FILE_DATA"] =
                                $machine_model->getLog($r["name"], NULL,
                                $filter);
                            $data["REFRESH_LOG"] .= "&arg=log&name=" .
                                $r['name'];
                        }
                    }
                }
                // stop auto-refreshing after the view has been open a
                // third of an hour
                if ($data["time"] >= ONE_HOUR / 3) {
                    $data["REFRESH_LOG"] = "";
                }
                if (!isset($data["LOG_FILE_DATA"]) ||
                    $data["LOG_FILE_DATA"] == "") {
                    $data["LOG_FILE_DATA"] =
                        tl('system_component_no_machine_log');
                }
                // display newest log lines first
                $lines =
                    array_reverse(explode("\n", $data["LOG_FILE_DATA"]));
                $data["LOG_FILE_DATA"] = implode("\n", $lines);
                break;
            case "update":
                // start/stop a queue_server, fetcher, or mirror on the
                // named machine
                if (isset($_REQUEST["fetcher_num"])) {
                    $r["fetcher_num"] =
                        $parent->clean($_REQUEST["fetcher_num"], "int");
                } else {
                    $r["fetcher_num"] = NULL;
                }
                $available_actions = array("start", "stop",
                    "mirror_start", "mirror_stop");
                if (isset($r["name"]) && isset($_REQUEST["action"]) &&
                    in_array($_REQUEST["action"], $available_actions)) {
                    $action = $_REQUEST["action"];
                    // fold mirror_* actions to start/stop + mirror flag
                    $is_mirror = false;
                    if ($action == "mirror_start") {
                        $action = "start";
                        $is_mirror = true;
                    } else {
                        if ($action == "mirror_stop") {
                            $action = "stop";
                            $is_mirror = true;
                        }
                    }
                    $machine_model->update($r["name"], $action,
                        $r["fetcher_num"], $is_mirror);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_servers_updated') .
                        "</h1>');";
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_no_action') .
                        "</h1>');";
                }
                break;
        }
    }
    $parent->pagingLogic($data, $machine_model, "MACHINE",
        DEFAULT_ADMIN_PAGING_NUM);
    if (!isset($_REQUEST['arg']) || $_REQUEST['arg'] != 'log') {
        $data['SCRIPT'] .= "toggleReplica(false);";
    }
    return $data;
}
/**
 * Main runtime loop of the queue_server.
 *
 * Loops until a stop message received, check for start, stop, resume
 * crawl messages, deletes any WebQueueBundle for which an
 * IndexArchiveBundle does not exist. Processes
 */
function loop()
{
    $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
    crawlLog("In queue loop!! {$this->server_name}", "queue_server");
    // an indexer instance first prunes bundles whose index is gone
    if ($this->isAIndexer()) {
        $this->deleteOrphanedBundles();
    }
    while (CrawlDaemon::processHandler()) {
        crawlLog("{$this->server_name} peak memory usage so far: " .
            memory_get_peak_usage() . "!!");
        $info = $this->handleAdminMessages($info);
        $status = $info[self::STATUS];
        // no crawl yet: idle until a start message arrives
        if ($status == self::WAITING_START_MESSAGE_STATE) {
            crawlLog("{$this->server_name} is waiting for start message\n");
            sleep(QUEUE_SLEEP_TIME);
            continue;
        }
        // crawl stopped: skip the rest of this iteration
        if ($status == self::STOP_STATE) {
            continue;
        }
        crawlLog("{$this->server_name} active crawl is " .
            "{$this->crawl_time}.");
        if ($this->isAScheduler()) {
            crawlLog("Current queue size is:" .
                $this->web_queue->to_crawl_queue->count);
        }
        $iteration_start = time();
        //check and update if necessary the crawl params of current crawl
        $this->checkUpdateCrawlParameters();
        $this->updateMostRecentFetcher();
        $this->processCrawlData();
        // pad short iterations out to at least QUEUE_SLEEP_TIME seconds
        $elapsed = time() - $iteration_start;
        if ($elapsed < QUEUE_SLEEP_TIME) {
            crawlLog("Sleeping...");
            sleep(QUEUE_SLEEP_TIME - $elapsed);
        }
    }
    crawlLog("{$this->server_name} shutting down!!");
}
/**
 * Handles admin requests for creating, editing, and deleting classifiers.
 *
 * This activity implements the logic for the page that lists existing
 * classifiers, including the actions that can be performed on them.
 *
 * @return array $data used to render the manageclassifiers element
 */
function manageClassifiers()
{
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    // only these $_REQUEST['arg'] values are acted upon
    $possible_arguments = array('createclassifier', 'editclassifier',
        'finalizeclassifier', 'deleteclassifier', 'search');
    $data = array(); // fix: $data was previously used uninitialized
    $data['ELEMENT'] = 'manageclassifiers';
    $data['SCRIPT'] = '';
    $data['FORM_TYPE'] = '';
    $search_array = array();
    // a lone localhost queue server means run everything locally
    $machine_urls = $parent->model("machine")->getQueueServerUrls();
    $num_machines = count($machine_urls);
    if ($num_machines < 1 || $num_machines == 1 &&
        UrlParser::isLocalhostUrl($machine_urls[0])) {
        $machine_urls = NULL;
    }
    $data['leftorright'] = getLocaleDirection() == 'ltr' ?
        'right' : 'left';
    $classifiers = Classifier::getClassifierList();
    $start_finalizing = false;
    if (isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        // the class label may arrive as either 'name' or 'class_label'
        if (isset($_REQUEST['name'])) {
            $name = substr($parent->clean($_REQUEST['name'], 'string'),
                0, NAME_LEN);
            $name = Classifier::cleanLabel($name);
        } else {
            if (isset($_REQUEST['class_label'])) {
                $name = substr($parent->clean($_REQUEST['class_label'],
                    'string'), 0, NAME_LEN);
                $name = Classifier::cleanLabel($name);
            } else {
                $name = "";
            }
        }
        switch ($_REQUEST['arg']) {
            case 'createclassifier':
                if (!isset($classifiers[$name])) {
                    $classifier = new Classifier($name);
                    Classifier::setClassifier($classifier);
                    $classifiers[$name] = $classifier;
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_new_classifier') .
                        '</h1>\');';
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_classifier_exists') .
                        '</h1>\');';
                }
                break;
            case 'deleteclassifier':
                /* In addition to deleting the classifier, we also want
                   to delete the associated crawl mix (if one exists)
                   used to iterate over existing indexes in search of
                   new training examples. */
                if (isset($classifiers[$name])) {
                    unset($classifiers[$name]);
                    Classifier::deleteClassifier($name);
                    $mix_name = Classifier::getCrawlMixName($name);
                    $mix_time =
                        $crawl_model->getCrawlMixTimestamp($mix_name);
                    if ($mix_time) {
                        $crawl_model->deleteCrawlMixIteratorState(
                            $mix_time);
                        $crawl_model->deleteCrawlMix($mix_time);
                    }
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_classifier_deleted') .
                        '</h1>\');';
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_no_classifier') .
                        '</h1>\');';
                }
                break;
            case 'editclassifier':
                if (isset($classifiers[$name])) {
                    $data['class_label'] = $name;
                    $this->editClassifier($data, $classifiers,
                        $machine_urls);
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_no_classifier') .
                        '</h1>\');';
                }
                break;
            case 'finalizeclassifier':
                /* Finalizing is too expensive to be done directly in
                   the controller that responds to the web request.
                   Instead, a daemon is launched to finalize the
                   classifier asynchronously and save it back to disk
                   when it's done. In the meantime, a flag is set to
                   indicate the current finalizing state. */
                // fix: guard on a known label as the other cases do;
                // previously an unknown $name caused a fatal error
                if (isset($classifiers[$name])) {
                    CrawlDaemon::start("classifier_trainer", $name, '',
                        -1);
                    $classifier = $classifiers[$name];
                    $classifier->finalized = Classifier::FINALIZING;
                    $start_finalizing = true;
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_finalizing_classifier') .
                        '</h1>\');';
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                        tl('crawl_component_no_classifier') .
                        '</h1>\');';
                }
                break;
            case 'search':
                $search_array =
                    $parent->tableSearchRequestHandler($data,
                    array('name'));
                break;
        }
    }
    $data['classifiers'] = $classifiers;
    // default ordering when no search was requested
    if ($search_array == array()) {
        $search_array[] = array("name", "", "", "ASC");
    }
    $parent->pagingLogic($data, 'classifiers', 'classifiers',
        DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
        array('name' => 'class_label'));
    // reload the page while any classifier is still finalizing
    $data['reload'] = false;
    foreach ($classifiers as $label => $classifier) {
        if ($classifier->finalized == Classifier::FINALIZING) {
            $data['reload'] = true;
            break;
        }
    }
    if ($data['reload'] && !$start_finalizing) {
        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
            tl('crawl_component_finalizing_classifier') . '</h1>\');';
    }
    return $data;
}
/**
 * Main loop for the fetcher.
 *
 * Checks for stop message, checks queue server if crawl has changed and
 * for new pages to crawl. Loop gets a group of next pages to crawl if
 * there are pages left to crawl (otherwise sleep 5 seconds). It downloads
 * these pages, deduplicates them, and updates the found site info with the
 * result before looping again.
 */
function loop()
{
    crawlLog("In Fetch Loop");
    // fetcher-number prefix keeps this fetcher's files separate from
    // those of other fetchers on the same machine
    $prefix = $this->fetcher_num . "-";
    if (!file_exists(CRAWL_DIR . "/{$prefix}temp")) {
        mkdir(CRAWL_DIR . "/{$prefix}temp");
    }
    $info[self::STATUS] = self::CONTINUE_STATE;
    // NOTE(review): $local_archives appears unused in this method
    $local_archives = array("");
    while (CrawlDaemon::processHandler()) {
        $start_time = microtime();
        // pick up and consume any control message left for this fetcher
        $fetcher_message_file = CRAWL_DIR .
            "/schedules/{$prefix}fetcher_messages.txt";
        if (file_exists($fetcher_message_file)) {
            $info = unserialize(file_get_contents($fetcher_message_file));
            unlink($fetcher_message_file);
            if (isset($info[self::STATUS]) &&
                $info[self::STATUS] == self::STOP_STATE) {
                continue;
            }
        }
        $switch_fetch_or_no_current = $this->checkCrawlTime();
        if ($switch_fetch_or_no_current) { /* case(1) */
            crawlLog("MAIN LOOP CASE 1 --" .
                " SWITCH CRAWL OR NO CURRENT CRAWL");
            $info[self::CRAWL_TIME] = $this->crawl_time;
            // crawl_time of 0 means there is no crawl at all right now
            if ($info[self::CRAWL_TIME] == 0) {
                $info[self::STATUS] = self::NO_DATA_STATE;
                $this->to_crawl = array();
            }
        } else {
            if ($this->crawl_type == self::ARCHIVE_CRAWL &&
                $this->arc_type != "WebArchiveBundle" &&
                $this->arc_type != "") { /* case(2) */
                // An archive crawl with data coming from the name server.
                crawlLog("MAIN LOOP CASE 2 -- " .
                    "ARCHIVE SCHEDULER (NOT RECRAWL)");
                $info = $this->checkArchiveScheduler();
                if ($info === false) {
                    crawlLog("No Archive Schedule Data..." .
                        " will try again in " . FETCH_SLEEP_TIME .
                        " seconds.");
                    sleep(FETCH_SLEEP_TIME);
                    continue;
                }
            } else {
                if ($this->crawl_time > 0) { /* case(3) */
                    // Either a web crawl or a recrawl of a previous web
                    // crawl.
                    if ($this->crawl_type == self::ARCHIVE_CRAWL) {
                        crawlLog("MAIN LOOP CASE 3 -- RECRAWL SCHEDULER");
                    } else {
                        crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER");
                    }
                    $info = $this->checkScheduler();
                    if ($info === false) {
                        crawlLog("Cannot connect to name server..." .
                            " will try again in " . FETCH_SLEEP_TIME .
                            " seconds.");
                        sleep(FETCH_SLEEP_TIME);
                        continue;
                    }
                } else {
                    crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL");
                    $info[self::STATUS] = self::NO_DATA_STATE;
                }
            }
        }
        /* case(2), case(3) might have set info without
           $info[self::STATUS] being set */
        if (!isset($info[self::STATUS])) {
            if ($info === true) {
                $info = array();
            }
            $info[self::STATUS] = self::CONTINUE_STATE;
        }
        if ($info[self::STATUS] == self::NO_DATA_STATE) {
            crawlLog("No data. Sleeping...");
            sleep(FETCH_SLEEP_TIME);
            continue;
        }
        // the web archive bundle this crawl's pages should be stored in
        $tmp_base_name = isset($info[self::CRAWL_TIME]) ?
            CRAWL_DIR . "/cache/{$prefix}" . self::archive_base_name .
            $info[self::CRAWL_TIME] : "";
        // if the active crawl changed, switch over to its archive and
        // reset per-crawl statistics
        if (isset($info[self::CRAWL_TIME]) &&
            ($this->web_archive == NULL ||
            $this->web_archive->dir_name != $tmp_base_name)) {
            if (isset($this->web_archive->dir_name)) {
                crawlLog("Old name: " . $this->web_archive->dir_name);
            }
            if (is_object($this->web_archive)) {
                $this->web_archive = NULL;
            }
            $this->to_crawl_again = array();
            $this->found_sites = array();
            // release memory held by the old archive before opening new
            gc_collect_cycles();
            $this->web_archive = new WebArchiveBundle($tmp_base_name,
                false);
            $this->crawl_time = $info[self::CRAWL_TIME];
            $this->sum_seen_title_length = 0;
            $this->sum_seen_description_length = 0;
            $this->sum_seen_site_link_length = 0;
            $this->num_seen_sites = 0;
            crawlLog("New name: " . $this->web_archive->dir_name);
            crawlLog("Switching archive...");
            if (!isset($info[self::ARC_DATA])) {
                continue;
            }
        }
        // obtain the next batch of pages for this iteration
        switch ($this->crawl_type) {
            case self::WEB_CRAWL:
                $downloaded_pages = $this->downloadPagesWebCrawl();
                break;
            case self::ARCHIVE_CRAWL:
                if (isset($info[self::ARC_DATA])) {
                    $downloaded_pages = $info[self::ARC_DATA];
                } else {
                    $downloaded_pages =
                        $this->downloadPagesArchiveCrawl();
                }
                break;
        }
        // NO_PROCESS batches are passed through without summarization
        if (isset($downloaded_pages["NO_PROCESS"])) {
            unset($downloaded_pages["NO_PROCESS"]);
            $summarized_site_pages = array_values($downloaded_pages);
            $this->no_process_links = true;
        } else {
            $summarized_site_pages =
                $this->processFetchPages($downloaded_pages);
            $this->no_process_links = false;
        }
        crawlLog("Number of summarized pages " .
            count($summarized_site_pages));
        // an iterator that just ended forces sending partial results
        $force_send = isset($info[self::END_ITERATOR]) &&
            $info[self::END_ITERATOR] ? true : false;
        $this->updateFoundSites($summarized_site_pages, $force_send);
        // pad short iterations out to MINIMUM_FETCH_LOOP_TIME seconds
        $sleep_time = max(0, ceil(MINIMUM_FETCH_LOOP_TIME -
            changeInMicrotime($start_time)));
        if ($sleep_time > 0) {
            crawlLog("Ensure minimum loop time by sleeping..." .
                $sleep_time);
            sleep($sleep_time);
        }
    } //end while
    crawlLog("Fetcher shutting down!!");
}
/**
 * Used to stop a daemon that is running in the background
 *
 * @param string $name the main name of this daemon such as queue_server
 *     or fetcher.
 * @param string $subname the instance name if it is possible for more
 *     than one copy of the daemon to be running at the same time
 * @param bool $exit whether this method should just return (false) or
 *     call exit() (true)
 */
static function stop($name, $subname = "", $exit = true)
{
    $name_string = CrawlDaemon::getNameString($name, $subname);
    $lock_file = CrawlDaemon::getLockFileName($name, $subname);
    // log messages only make sense when run from the command line
    $not_web_setting = php_sapi_name() == 'cli';
    if (file_exists($lock_file)) {
        // removing the lock file is the stop signal the daemon watches
        unlink($lock_file);
        if ($not_web_setting) {
            crawlLog("Sending stop signal to {$name_string}...");
        }
    } elseif ($not_web_setting) {
        // fix: message previously read "does not appear to running"
        crawlLog("{$name_string} does not appear to be running...");
    }
    if ($exit) {
        exit;
    }
}
/**
 * Main loop for the mirror script.
 */
function loop()
{
    crawlLog("In Sync Loop");
    $info[self::STATUS] = self::CONTINUE_STATE;
    while (CrawlDaemon::processHandler()) {
        // pick up and consume any control message left for this mirror
        $message_path = CRAWL_DIR . "/schedules/mirror_messages.txt";
        if (file_exists($message_path)) {
            $info = unserialize(file_get_contents($message_path));
            unlink($message_path);
            $stopping = isset($info[self::STATUS]) &&
                $info[self::STATUS] == self::STOP_STATE;
            if ($stopping) {
                continue;
            }
        }
        $info = $this->checkScheduler();
        // false means the queue server could not be reached
        if ($info === false) {
            crawlLog("Cannot connect to queue server..." .
                " will try again in " . MIRROR_NOTIFY_FREQUENCY .
                " seconds.");
            sleep(MIRROR_NOTIFY_FREQUENCY);
            continue;
        }
        // nothing new to sync yet; wait a sync period and re-check
        if ($info[self::STATUS] == self::NO_DATA_STATE) {
            crawlLog("No data from queue server. Sleeping...");
            sleep(MIRROR_SYNC_FREQUENCY);
            continue;
        }
        $this->copyNextSyncFile();
    } //end while
    crawlLog("Mirror shutting down!!");
}
/**
 * Returns the statuses of machines in the machine table of their
 * fetchers and queue_server as well as the name and url's of these machines
 *
 * @param array $machines an array of machines to check the status for
 * @return array a list of machines, together with all their properties
 *     and the statuses of their fetchers and queue_servers
 */
function getMachineStatuses($machines = array())
{
    $num_machines = count($machines);
    $time = time();
    // session token so the remote statuses request is authenticated
    $session = md5($time . AUTH_KEY);
    // build the status-query url for each machine
    for ($i = 0; $i < $num_machines; $i++) {
        $hash_url = crawlHash($machines[$i]["URL"]);
        $machines[$i][CrawlConstants::URL] = $machines[$i]["URL"] .
            "?c=machine&a=statuses&time={$time}" .
            "&session={$session}&arg={$hash_url}";
    }
    // fetch all machines' status pages in one multi-request
    $statuses = FetchUrl::getPages($machines);
    // match each response back to its machine by url
    for ($i = 0; $i < $num_machines; $i++) {
        foreach ($statuses as $status) {
            if ($machines[$i][CrawlConstants::URL] ==
                $status[CrawlConstants::URL]) {
                $pre_status =
                    json_decode($status[CrawlConstants::PAGE], true);
                if (is_array($pre_status)) {
                    $machines[$i]["STATUSES"] = $pre_status;
                } else {
                    // non-JSON response: machine not set up correctly
                    $machines[$i]["STATUSES"] = "NOT_CONFIGURED_ERROR";
                }
            }
        }
    }
    // mark fetchers known to the db but absent from the live statuses
    // as off (0)
    $sql = "SELECT * FROM ACTIVE_FETCHER";
    $result = $this->db->execute($sql);
    if (!$result) {
        return $machines;
    }
    // NOTE(review): $active_fetchers appears unused below
    $active_fetchers = array();
    while ($row = $this->db->fetchArray($result)) {
        for ($i = 0; $i < $num_machines; $i++) {
            if ($machines[$i]['NAME'] == $row['NAME']) {
                if (!isset($machines[$i]["STATUSES"]["fetcher"][
                    $row['FETCHER_ID']])) {
                    $machines[$i]["STATUSES"]["fetcher"][
                        $row['FETCHER_ID']] = 0;
                }
            }
        }
    }
    /* priming call: presumably sets the field stringROrderCallback
       compares on to 'NAME' before it is used by usort -- confirm */
    stringROrderCallback("", "", "NAME");
    if ($machines != array()) {
        usort($machines, "stringROrderCallback");
    }
    // the name server's news updater status rides along under a
    // string key appended after the numeric machine entries
    $name_server_statuses = CrawlDaemon::statuses();
    $machines['NAME_SERVER']['news_updater'] = 0;
    if (isset($name_server_statuses['news_updater'])) {
        $machines['NAME_SERVER']['news_updater'] = 1;
    }
    return $machines;
}
/**
 * This is the function that should be called to get the
 * classifier_trainer to start training a logistic regression instance for
 * a particular classifier. The class label corresponding to the
 * classifier to be finalized should be passed as the second command-line
 * argument.
 */
function start()
{
    global $argv;
    CrawlDaemon::init($argv, "classifier_trainer");
    // fix: guard against a missing command-line label argument
    $label = isset($argv[2]) ? $argv[2] : "";
    crawlLog("Initializing classifier trainer log..",
        $label . '-classifier_trainer', true);
    $classifier = Classifier::getClassifier($label);
    // fix: previously a missing/corrupt classifier for this label caused
    // a fatal error on the prepareToFinalize() call below
    if (!is_object($classifier)) {
        crawlLog("No classifier found for label {$label}; exiting.");
        CrawlDaemon::stop('classifier_trainer', $label);
        return;
    }
    $classifier->prepareToFinalize();
    $classifier->finalize();
    // persist the finalized classifier back to disk
    Classifier::setClassifier($classifier);
    crawlLog("Training complete.\n");
    CrawlDaemon::stop('classifier_trainer', $label);
}
/**
 * Used to start/stop a queue_server/fetcher of the current Yioop instance
 * based on the queue_server and fetcher fields of the current $_REQUEST
 */
function update()
{
    $statuses = CrawlDaemon::statuses();
    if (isset($_REQUEST['queue_server'])) {
        $queue_running = isset($statuses["queue_server"][-1]);
        if ($_REQUEST['queue_server'] == "true" && !$queue_running) {
            // a queue server is a pair of cooperating daemons
            CrawlDaemon::start("queue_server", 'none', self::INDEXER, 0);
            CrawlDaemon::start("queue_server", 'none', self::SCHEDULER, 2);
        } elseif ($_REQUEST['queue_server'] == "false" && $queue_running) {
            CrawlDaemon::stop("queue_server");
        }
    }
    if (isset($_REQUEST['mirror'])) {
        $mirror_running = isset($statuses["mirror"][-1]);
        if ($_REQUEST['mirror'] == "true" && !$mirror_running) {
            CrawlDaemon::start("mirror");
        } elseif ($_REQUEST['mirror'] == "false" && $mirror_running) {
            CrawlDaemon::stop("mirror");
        }
    }
    if (isset($_REQUEST['fetcher']) && is_array($_REQUEST['fetcher'])) {
        // each entry maps a fetcher number to a desired on/off state
        foreach ($_REQUEST['fetcher'] as $index => $value) {
            $fetcher_running = isset($statuses["fetcher"][$index]);
            if ($value == "true" && !$fetcher_running) {
                CrawlDaemon::start("fetcher", "{$index}");
            } elseif ($value == "false" && $fetcher_running) {
                CrawlDaemon::stop("fetcher", "{$index}");
            }
        }
    }
}
/**
 * Main loop for the news updater.
 *
 * Repeatedly invokes newsUpdate() (which decides internally whether the
 * feeds are due for a refresh), padding each iteration out to at least
 * MINIMUM_UPDATE_LOOP_TIME seconds, until a stop is signalled via
 * CrawlDaemon::processHandler().
 */
function loop()
{
    crawlLog("In News Update Loop");
    $info[self::STATUS] = self::CONTINUE_STATE;
    // fix: removed unused $local_archives local (dead code)
    while (CrawlDaemon::processHandler()) {
        $start_time = microtime();
        crawlLog("Checking if news feeds should be updated...");
        $this->newsUpdate();
        // pad short iterations out to MINIMUM_UPDATE_LOOP_TIME seconds
        $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME -
            changeInMicrotime($start_time)));
        if ($sleep_time > 0) {
            crawlLog("Ensure minimum loop time by sleeping..." .
                $sleep_time);
            sleep($sleep_time);
        }
    } //end while
    crawlLog("News Updater shutting down!!");
}