/**
 * Creates the job and verifies that a leader node is configured.
 *
 * @param mixed $nodeName  Passed through unchanged to the parent constructor.
 * @param array $jobConfig Job configuration; must contain a non-empty 'leader'
 *                         entry naming the administrator-appointed leader node.
 *
 * @throws Scalr_System_Cronjob_Exception When 'leader' is missing or empty.
 */
function __construct($nodeName, $jobConfig)
{
    parent::__construct($nodeName, $jobConfig);

    // empty() has the same truthiness as the negation operator but does not
    // raise an "undefined array key" diagnostic when the key is absent, so the
    // caller always gets the intended configuration exception.
    if (empty($jobConfig["leader"])) {
        throw new Scalr_System_Cronjob_Exception("Configuration array must have a key for 'leader' "
                . "that names the leader node apointed by administrator");
    }

    $this->logger = Logger::getLogger(__CLASS__);
}
/**
 * Forks the coordinator process.
 *
 * In the parent the child PID is stored in $this->coordinatorPid and the
 * method returns. The child never returns: it runs the coordinator loop
 * (leader tracking, zomby cleanup, heartbeat, work queue polling) and then
 * calls exit.
 *
 * @return void
 *
 * @throws Scalr_System_Cronjob_Exception When pcntl_fork() fails.
 */
private function forkCoordinator()
{
    $this->logger->info("Forking coordinator process");

    $pid = pcntl_fork();

    if ($pid == -1) {
        throw new Scalr_System_Cronjob_Exception("Cannot fork coordinator process");
    }

    if ($pid > 0) {
        // Parent process: remember the coordinator and carry on.
        $this->coordinatorPid = $pid;
        return;
    }

    // ---- Child (coordinator) process from here on ----
    $this->coordinatorLoop = true;
    $this->coordinatorPid = posix_getpid();
    $ppid = posix_getppid();
    $this->nodeRegistry->set(self::REGKEY_COORDINATOR_PROCESS_PID, posix_getpid());

    $leaderPath = "{$this->jobZPath}/leader";
    $leaderTimeout = new Scalr_Util_Timeout($this->leaderTimeout);
    $zombyTimeout = new Scalr_Util_Timeout((int) $this->config["tickTime"] * 10);
    $heartbeatTimeout = new Scalr_Util_Timeout((int) $this->config["tickTime"]);

    // Track mtime from self node
    $lastMtime = $this->zookeeper->get("{$this->nodeRegistry->path}/{$this->nodeRegistry->node}")->mtime;

    while ($this->coordinatorLoop) {
        $leaderTimeout->reset();

        try {
            $exceptionCounter = 0;

            // NOTE(review): reached() without arguments presumably throws
            // Scalr_Util_TimeoutException once the timeout expires (handled by
            // the outer catch below) - confirm against Scalr_Util_Timeout.
            while (!$leaderTimeout->reached() && $this->coordinatorLoop) {
                try {
                    // Terminate myself if parent was killed
                    if (!posix_kill($ppid, 0)) {
                        $this->coordinatorLoop = false;
                        break 2;
                    }

                    // Leader election maybe initiated
                    if ($this->leaderElection->isInitiated()) {
                        $this->logger->info("[coordinator] Someone has initiated leader election");
                        $this->doLeaderElection();
                    }

                    // Leader may changed
                    $leaderNodeName = $this->zookeeper->getData($leaderPath);
                    $oldIsLeader = $this->isLeader;
                    $this->isLeader = $leaderNodeName == $this->nodeName;
                    if (!$this->isLeader && $oldIsLeader) {
                        $this->logger->info("[coordinator] I am not longer a leader ('{$this->nodeName}'). "
                                . "Leader is '{$leaderNodeName}'");
                    }

                    // Check leader znode mtime
                    $leaderStat = $this->zookeeper->get($leaderPath);
                    if ($leaderStat->mtime != $this->leaderMtime) {
                        // Leader had updated it's state
                        $leaderTimeout->reset();
                        $this->logger->info("[coordinator] Leader is the same");
                        $this->leaderMtime = $leaderStat->mtime;
                    }

                    if ($this->isLeader) {
                        // Process returned nodes.
                        // Administrator's configured leader may be here
                        // NOTE(review): the loop below terminates only if peek()
                        // eventually returns a falsy value, i.e. presumably it
                        // consumes the item - confirm against the queue class.
                        if ($c = $this->returnedNodesQueue->capacity()) {
                            $this->logger->info(sprintf("%d node(s) have returned back online", $c));
                            $votes = array($this->elector->getElectionData());
                            while ($vote = $this->returnedNodesQueue->peek()) {
                                $votes[] = $vote;
                            }
                            $this->checkElectionResults($votes, false);
                        }

                        // Check zomby nodes
                        if ($zombyTimeout->reached(false)) {
                            $childData = $this->zookeeper->getChildren($this->nodeRegistry->path);
                            foreach ($childData->children as $childName) {
                                $childStat = $this->zookeeper->get("{$this->nodeRegistry->path}/{$childName}");
                                // A node not modified since our own mtime sampled
                                // at the previous check is treated as dead.
                                if ($childStat->mtime < $lastMtime) {
                                    // Zomby detected
                                    $this->logger->info(sprintf("[coordinator] Cleanup zomby node '%s'", $childName));
                                    $this->zookeeper->deleteRecursive("{$this->nodeRegistry->path}/{$childName}");
                                }
                            }
                            $zombyTimeout->reset();
                            $lastMtime = $this->zookeeper->get("{$this->nodeRegistry->path}/{$this->nodeRegistry->node}")->mtime;
                        }
                    }

                    // Node heart beat
                    if ($heartbeatTimeout->reached(false)) {
                        $this->logger->debug(sprintf("[coordinator] '%s' heartbeat", $this->nodeName));
                        $this->nodeRegistry->touchNode();
                        $heartbeatTimeout->reset();
                    }

                    // Poll work queue
                    while ($message = $this->globalWorkQueue->peek()) {
                        $this->logger->info("[coordinator] Put received message into local queue");
                        $this->processPool->workQueue->put($message);
                    }

                    Scalr_Util_Timeout::sleep(1000);

                    // The iteration completed without an error, so the streak of
                    // consecutive failures is over. Without this reset, sporadic
                    // unrelated errors would add up until the slippage limit
                    // terminated an otherwise healthy job.
                    $exceptionCounter = 0;
                } catch (Exception $e) {
                    $this->logger->error(sprintf("[coordinator] Caught in message loop <%s> %s",
                            get_class($e), $e->getMessage()));

                    if (++$exceptionCounter > $this->coordinatorSlippageLimit) {
                        $this->logger->fatal("[coordinator] Got too many consistent exceptions in main loop. "
                                . "Slippage limit: {$this->coordinatorSlippageLimit} exceed");
                        // Signal the parent PID captured right after the fork: if
                        // the parent has died meanwhile and this process was
                        // re-parented, posix_getppid() would name an unrelated
                        // process.
                        posix_kill($ppid, SIGTERM);
                        exit;
                    }
                }
            }
        } catch (Scalr_Util_TimeoutException $e) {
            $this->logger->warn("[coordinator] Caught leader timeout exception ({$leaderTimeout->format()})");
            $this->logger->info("[coordinator] Start new leader election procedure");
            try {
                $this->leaderElection->initiate($this->nodeRegistry->nodesCapacity());
            } catch (Exception $e) {
                $this->logger->error(sprintf("[coordinator] Caught in leader election <%s> %s",
                        get_class($e), $e->getMessage()));
            }
        }
    }

    $this->logger->info("[coordinator] Done");
    exit;
}