11"""Health models and status definitions for service health checks."""
22
33from enum import StrEnum
4- from typing import ClassVar , Self
4+ from typing import Any , ClassVar , Self
55
66from pydantic import BaseModel , Field , model_validator
77
88
9- class _HealthStatus (StrEnum ):
9+ class HealthStatus (StrEnum ):
10+ """Health status enumeration for service health checks.
11+
12+ Values:
13+ UP: Service is operating normally
14+ DEGRADED: Service is operational but with reduced functionality
15+ DOWN: Service is not operational
16+ """
17+
1018 UP = "UP"
19+ DEGRADED = "DEGRADED"
1120 DOWN = "DOWN"
1221
1322
1423class Health (BaseModel ):
1524 """Represents the health status of a service with optional components and failure reasons.
1625
1726 - A health object can have child components, i.e. health forms a tree.
18- - Any node in the tree can set itself to DOWN. In this case the node is required
19- to set the reason attribute. If reason is not set when DOWN,
20- automatic model validation of the tree will fail.
21- - DOWN'ness is propagated to parent health objects. I.e. the health of a parent
22- node is automatically set to DOWN if any of its child components are DOWN. The
23- child components leading to this will be listed in the reason.
24- - The root of the health tree is computed in the system module. The health of other
25- modules is automatically picked up by the system module.
27+ - Any node in the tree can set itself to DOWN or DEGRADED. If DOWN or DEGRADED, the node
28+ is required to set the reason attribute. If reason is not set when DOWN or DEGRADED,
29+ automatic model validation fails.
30+ - DOWN trumps DEGRADED, DEGRADED trumps UP. If any child is DOWN, parent is DOWN.
31+ If none are DOWN but any are DEGRADED, parent is DEGRADED.
32+ - The root of the health tree is computed in the system module.
33+ The health of other modules is automatically picked up by the system module.
2634 """
2735
28- Code : ClassVar [type [_HealthStatus ]] = _HealthStatus
29- status : _HealthStatus
36+ Code : ClassVar [type [HealthStatus ]] = HealthStatus
37+ status : HealthStatus
3038 reason : str | None = None
3139 components : dict [str , "Health" ] = Field (default_factory = dict )
40+ uptime_statistics : dict [str , dict [str , Any ]] | None = None # Optional uptime stats
3241
def compute_health_from_components(self) -> Self:
    """Recursively derive this node's health status from its child components.

    - A node that is already DOWN is returned unchanged and keeps its own
      reason; its components are not visited.
    - A node without components keeps its existing status and reason.
    - Otherwise every component is recomputed first. If any component is DOWN,
      this node becomes DOWN. If none are DOWN but any are DEGRADED, this node
      becomes DEGRADED. In both cases the node's reason is replaced by a
      summary naming the affected components together with their reasons.
    - If no component is DOWN or DEGRADED, status and reason are left untouched.

    Returns:
        Self: This instance, updated in place.
    """
    # DOWN is the most severe state; nothing a component reports can change it.
    if self.status == HealthStatus.DOWN:
        return self

    # No components means we keep the existing status.
    if not self.components:
        return self

    # Collect (name, reason) pairs of unhealthy components, grouped by status.
    affected_by_status: dict[HealthStatus, list[tuple[str, str | None]]] = {
        HealthStatus.DOWN: [],
        HealthStatus.DEGRADED: [],
    }
    for component_name, component in self.components.items():
        # Recurse first so the component's own status reflects its subtree.
        component.compute_health_from_components()
        if component.status in affected_by_status:
            affected_by_status[component.status].append((component_name, component.reason))

    # Severity order matters: DOWN trumps DEGRADED, so the first non-empty group wins.
    for severity in (HealthStatus.DOWN, HealthStatus.DEGRADED):
        affected = affected_by_status[severity]
        if not affected:
            continue

        self.status = severity
        if len(affected) == 1:
            name, reason = affected[0]
            # Omit the parenthetical when the component carries no reason,
            # rather than rendering a literal "(None)".
            detail = f" ({reason})" if reason else ""
            self.reason = f"Component '{name}' is {severity.value}{detail}"
        else:
            component_list = ", ".join(
                f"'{name}' ({reason})" if reason else f"'{name}'" for name, reason in affected
            )
            self.reason = f"Components {component_list} are {severity.value}"
        break

    return self
6992
@@ -73,7 +96,7 @@ def validate_health_state(self) -> Self:
7396
7497 - Compute overall health based on component health
7598 - Ensure UP status has no associated reason
76- - Ensure DOWN status always has a reason
99+ - Ensure DOWN and DEGRADED status always have a reason
77100
78101 Returns:
79102 Self: The validated model instance with correct health status.
@@ -85,31 +108,31 @@ def validate_health_state(self) -> Self:
85108 self .compute_health_from_components ()
86109
87110 # Validate that UP status has no reason
88- if (self .status == _HealthStatus .UP ) and self .reason :
111+ if (self .status == HealthStatus .UP ) and self .reason :
89112 msg = f"Health { self .status } must not have reason"
90113 raise ValueError (msg )
91114
92- # Validate that DOWN status always has a reason
93- if (self .status == _HealthStatus .DOWN ) and not self .reason :
94- msg = "Health DOWN must have a reason"
115+ # Validate that DOWN and DEGRADED status always have a reason
116+ if (self .status in { HealthStatus .DOWN , HealthStatus . DEGRADED } ) and not self .reason :
117+ msg = f "Health { self . status } must have a reason"
95118 raise ValueError (msg )
96119
97120 return self
98121
def __str__(self) -> str:
    """Return the status as text, followed by the reason for DOWN/DEGRADED states.

    Returns:
        str: ``"<STATUS>: <reason>"`` when the status is DOWN or DEGRADED and a
            reason is set; otherwise just the status value.
    """
    if self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED} and self.reason:
        return f"{self.status.value}: {self.reason}"
    return self.status.value
108131
def __bool__(self) -> bool:
    """Convert health status to a boolean value.

    A DEGRADED service is still operational, so it is treated as truthy;
    only DOWN evaluates to False.

    Returns:
        bool: True if the status is UP or DEGRADED, False otherwise.
    """
    return self.status in {HealthStatus.UP, HealthStatus.DEGRADED}
0 commit comments