"""This module implements health status reporting for watchdog operation. Module receive important health metrics and exports its status of overall health assessment. This health assessment can be used by external watchdog scripts to initiate agent restart. Process is considered "healthy" if: * it is being shut down and shutdown timeout has not elapsed -> HEALTHY * it is not registered -> HEALTHY * process was started more than 6 hours ago and no data was sent to server within last 6 hours -> FAULTY * process was started more than 18 hours ago and no data was received from server within last 18 hours -> FAULTY Otherwise process is considered HEALTHY. As agent exports this information through RPC interface there is an additional implicit "health" requirement that: * it responds to RPC requests. This implicit requirement considered valid because UI fully depends on RPC so it does not make health assessment any worse than it should.""" import collections HealthStatus = collections.namedtuple("HealthStatus", ["healthy", "why"]) class HealthSensor: """HealthSensor receives events about agent operation and provides information about overall status. Initially, new HealthSensor object assumes: * process was started long ago; * process is not being shut down; * data from server has been received long ago; * data to server was sent long ago; * agent is registered (license is valid). So, initial health status is False (faulty).""" RECEIVE_WINDOW = 18 * 3600 SEND_WINDOW = 6 * 3600 SHUTDOWN_TIMEOUT = 600 def __init__(self): self._started_at = 0.0 self._shutdown_at = 0.0 self._last_received = 0.0 self._last_sent = 0.0 self._is_registered = True def starting(self, when: float) -> None: """Records a moment of agent startup""" self._started_at = when def shutting_down(self, when: float) -> None: """Records a moment of externally initiated agent shutdown""" self._shutdown_at = when def server_data_received(self, when: float) -> None: """Records a moment when data was received from server""" self._last_received = when def server_data_sent(self, when: float) -> None: """Records a moment when data was sent to server""" self._last_sent = when def registered(self) -> None: """Marks agent as being registered""" self._is_registered = True def unregistered(self) -> None: """Marks agent as being not registered""" self._is_registered = False def status(self, now: float) -> HealthStatus: if self._shutdown_at > 0: if now - self._shutdown_at >= self.SHUTDOWN_TIMEOUT: return HealthStatus(False, "stuck at shutdown") return HealthStatus(True, "shutdown is in progress") if not self._is_registered: return HealthStatus(True, "not registered") if ( now - self._started_at >= self.RECEIVE_WINDOW and now - self._last_received >= self.RECEIVE_WINDOW ): return HealthStatus(False, "no data received from server") if ( now - self._started_at >= self.SEND_WINDOW and now - self._last_sent >= self.SEND_WINDOW ): return HealthStatus(False, "no data sent to server") return HealthStatus(True, "all is ok") sensor = HealthSensor()