Skip to content

Commit 21e95ba

Browse files
olivermeyerclaude
andcommitted
feat(utils, platform): add DEGRADED state to Health model
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e523bce commit 21e95ba

9 files changed

Lines changed: 304 additions & 86 deletions

File tree

specifications/SPEC-UTILS-SERVICE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,12 @@ health:
151151
properties:
152152
status:
153153
type: string
154-
enum: [UP, DOWN]
154+
enum: [UP, DEGRADED, DOWN]
155155
description: Service health status
156156
reason:
157157
type: string
158158
nullable: true
159-
description: Optional reason for status
159+
description: Required reason for DOWN or DEGRADED status; must be null for UP
160160
components:
161161
type: object
162162
description: Hierarchical component health

specifications/SPEC_SYSTEM_SERVICE.md

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -107,27 +107,6 @@ _Note: For detailed implementation, refer to the source code in the `src/aignost
107107

108108
### 3.3 Data Schemas
109109

110-
**Health Status Schema:**
111-
112-
```yaml
113-
Health:
114-
type: object
115-
properties:
116-
status:
117-
type: string
118-
enum: [UP, DOWN]
119-
description: "Overall system health status"
120-
components:
121-
type: object
122-
description: "Health status of individual components"
123-
additionalProperties:
124-
$ref: "#/definitions/Health"
125-
reason:
126-
type: string
127-
description: "Reason for DOWN status, null for UP"
128-
required: [status]
129-
```
130-
131110
**System Info Schema:**
132111

133112
```yaml

src/aignostics/platform/_service.py

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,47 @@ def info(self, mask_secrets: bool = True) -> dict[str, Any]:
192192
else None,
193193
}
194194

195+
@staticmethod
def _health_from_response(response: urllib3.BaseHTTPResponse) -> Health:
    """Map a PAPI health response to a Health status.

    Mapping rules, applied in order:

    - Non-200 HTTP status: ``DOWN`` (logged as error).
    - Body that is not valid JSON, or valid JSON that is not an object: ``DOWN``.
    - ``status == "UP"``: ``UP``.
    - ``status == "DEGRADED"``: ``DEGRADED`` with the ``reason`` from the body, or a
      generic fallback if the body carries none (logged as warning).
    - ``status == "DOWN"``: ``DOWN`` with the ``reason`` from the body, or a generic
      fallback if the body carries none (logged as error).
    - Any other or missing ``status``: ``DOWN`` with an "unknown status" reason.

    Args:
        response: urllib3 response from the ``/health`` endpoint.

    Returns:
        Health: ``UP``, ``DEGRADED``, or ``DOWN`` derived from the response.
    """
    if response.status != HTTPStatus.OK:
        logger.error("Aignostics Platform API returned '{}'", response.status)
        return Health(
            status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
        )

    try:
        body = json.loads(response.data)
    except Exception:
        return Health(status=Health.Code.DOWN, reason="Aignostics Platform API returned unparseable response")

    # json.loads also accepts lists, strings, numbers and null; none of them carries a
    # ``status`` key, so treat them like an unparseable body instead of failing on ``.get``.
    if not isinstance(body, dict):
        return Health(status=Health.Code.DOWN, reason="Aignostics Platform API returned unparseable response")

    api_status = body.get("status")
    if api_status == "UP":
        return Health(status=Health.Code.UP)
    if api_status == "DEGRADED":
        # str() guards against a non-string ``reason`` in the body, which Health would reject.
        reason = str(body.get("reason") or "Aignostics Platform API is DEGRADED")
        logger.warning("Aignostics Platform API is DEGRADED: {}", reason)
        return Health(status=Health.Code.DEGRADED, reason=reason)
    if api_status == "DOWN":
        # An explicit DOWN is a recognised status: keep the upstream reason rather than
        # reporting it as "unknown status 'DOWN'".
        reason = str(body.get("reason") or "Aignostics Platform API is DOWN")
        logger.error("Aignostics Platform API is DOWN: {}", reason)
        return Health(status=Health.Code.DOWN, reason=reason)
    return Health(
        status=Health.Code.DOWN,
        reason=f"Aignostics Platform API returned unknown status '{api_status}'",
    )
230+
195231
def _determine_api_public_health(self) -> Health:
196232
"""Determine healthiness and reachability of Aignostics Platform API.
197233
198234
- Checks if health endpoint is reachable and returns 200 OK
235+
- Parses the response body to detect DEGRADED status
199236
- Uses urllib3 for a direct connection check without authentication
200237
201238
Returns:
@@ -209,23 +246,17 @@ def _determine_api_public_health(self) -> Health:
209246
headers={"User-Agent": user_agent()},
210247
timeout=urllib3.Timeout(total=self._settings.health_timeout),
211248
)
212-
213-
if response.status != HTTPStatus.OK:
214-
logger.error("Aignostics Platform API (public) returned '{}'", response.status)
215-
return Health(
216-
status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
217-
)
249+
return self._health_from_response(response)
218250
except Exception as e:
219251
logger.exception("Issue with Aignostics Platform API")
220252
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")
221253

222-
return Health(status=Health.Code.UP)
223-
224254
def _determine_api_authenticated_health(self) -> Health:
225255
"""Determine healthiness and reachability of Aignostics Platform API via authenticated request.
226256
227257
Uses a dedicated HTTP pool (separate from the API client's connection pool) to prevent
228258
connection-level cross-contamination between health checks and API calls.
259+
Parses the response body to detect DEGRADED status.
229260
230261
Returns:
231262
Health: The healthiness of the Aignostics Platform API when trying to reach via authenticated request.
@@ -242,14 +273,10 @@ def _determine_api_authenticated_health(self) -> Health:
242273
},
243274
timeout=urllib3.Timeout(total=self._settings.health_timeout),
244275
)
245-
246-
if response.status != HTTPStatus.OK:
247-
logger.error("Aignostics Platform API (authenticated) returned '{}'", response.status)
248-
return Health(status=Health.Code.DOWN, reason=f"Aignostics Platform API returned '{response.status}'")
276+
return self._health_from_response(response)
249277
except Exception as e:
250278
logger.exception("Issue with Aignostics Platform API")
251279
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")
252-
return Health(status=Health.Code.UP)
253280

254281
def health(self) -> Health:
255282
"""Determine health of this service.

src/aignostics/system/CLAUDE.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,9 @@ def health(self) -> Health:
123123
)
124124

125125
# Determine overall status based on ALL modules
126-
overall = Health.Code.UP if all(
127-
c.status == Health.Code.UP for c in components.values()
128-
) else Health.Code.DOWN
129-
130-
return Health(status=overall, components=components)
126+
# Priority: DOWN > DEGRADED > UP
127+
# compute_health_from_components() handles this automatically
128+
return Health(status=Health.Code.UP, components=components)
131129
```
132130

133131
### Exception Hierarchy (`_exceptions.py`)
@@ -275,7 +273,7 @@ print(f"System status: {health.status}")
275273

276274
# Check specific component
277275
platform_health = health.components.get("platform")
278-
if platform_health.status != Health.Code.UP:
276+
if not platform_health:  # A Health object is falsy only when DOWN (UP and DEGRADED are both truthy)
279277
print(f"Platform issue: {platform_health.reason}")
280278
```
281279

@@ -453,7 +451,7 @@ def test_health_aggregation():
453451
service = Service()
454452
health = service.health()
455453

456-
assert health.status in [Health.Code.UP, Health.Code.DOWN]
454+
assert health.status in [Health.Code.UP, Health.Code.DEGRADED, Health.Code.DOWN]
457455
assert "platform" in health.components
458456
assert isinstance(health.components, dict)
459457

src/aignostics/utils/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
)
2525
from ._di import discover_plugin_packages, load_modules, locate_implementations, locate_subclasses
2626
from ._fs import get_user_data_directory, open_user_data_directory, sanitize_path, sanitize_path_component
27-
from ._health import Health
27+
from ._health import Health, HealthStatus
2828
from ._log import LogSettings
2929
from ._mcp import MCP_SERVER_NAME, MCP_TRANSPORT, mcp_create_server, mcp_discover_servers, mcp_list_tools, mcp_run
3030
from ._nav import BaseNavBuilder, NavGroup, NavItem, gui_get_nav_groups
@@ -42,6 +42,7 @@
4242
"BaseNavBuilder",
4343
"BaseService",
4444
"Health",
45+
"HealthStatus",
4546
"LogSettings",
4647
"NavGroup",
4748
"NavItem",

src/aignostics/utils/_health.py

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,92 @@
11
"""Health models and status definitions for service health checks."""
22

33
from enum import StrEnum
4-
from typing import ClassVar, Self
4+
from typing import Any, ClassVar, Self
55

66
from pydantic import BaseModel, Field, model_validator
77

88

9-
class _HealthStatus(StrEnum):
9+
class HealthStatus(StrEnum):
10+
"""Health status enumeration for service health checks.
11+
12+
Values:
13+
UP: Service is operating normally
14+
DEGRADED: Service is operational but with reduced functionality
15+
DOWN: Service is not operational
16+
"""
17+
1018
UP = "UP"
19+
DEGRADED = "DEGRADED"
1120
DOWN = "DOWN"
1221

1322

1423
class Health(BaseModel):
1524
"""Represents the health status of a service with optional components and failure reasons.
1625
1726
- A health object can have child components, i.e. health forms a tree.
18-
- Any node in the tree can set itself to DOWN. In this case the node is required
19-
to set the reason attribute. If reason is not set when DOWN,
20-
automatic model validation of the tree will fail.
21-
- DOWN'ness is propagated to parent health objects. I.e. the health of a parent
22-
node is automatically set to DOWN if any of its child components are DOWN. The
23-
child components leading to this will be listed in the reason.
24-
- The root of the health tree is computed in the system module. The health of other
25-
modules is automatically picked up by the system module.
27+
- Any node in the tree can set itself to DOWN or DEGRADED. If DOWN or DEGRADED, the node
28+
is required to set the reason attribute. If reason is not set when DOWN or DEGRADED,
29+
automatic model validation fails.
30+
- DOWN trumps DEGRADED, DEGRADED trumps UP. If any child is DOWN, parent is DOWN.
31+
If none are DOWN but any are DEGRADED, parent is DEGRADED.
32+
- The root of the health tree is computed in the system module.
33+
The health of other modules is automatically picked up by the system module.
2634
"""
2735

28-
Code: ClassVar[type[_HealthStatus]] = _HealthStatus
29-
status: _HealthStatus
36+
Code: ClassVar[type[HealthStatus]] = HealthStatus
37+
status: HealthStatus
3038
reason: str | None = None
3139
components: dict[str, "Health"] = Field(default_factory=dict)
40+
uptime_statistics: dict[str, dict[str, Any]] | None = None # Optional uptime stats
3241

3342
def compute_health_from_components(self) -> Self:
    """Recursively derive this node's status from its components.

    - A node that is already DOWN is returned unchanged: it keeps its original
      reason and its components are not visited.
    - A node without components is returned unchanged.
    - Otherwise every component is recomputed first. If any component is DOWN,
      this node becomes DOWN; else if any component is DEGRADED, this node becomes
      DEGRADED. In both cases ``reason`` is replaced by a summary naming the
      affected components together with their reasons.
    - If no component is DOWN or DEGRADED, status and reason are left untouched.

    Returns:
        Self: This instance, updated in place.
    """
    # Nothing to derive: DOWN is final, and a leaf keeps whatever it was given.
    if self.status == HealthStatus.DOWN or not self.components:
        return self

    def _describe(name: str, reason: str | None) -> str:
        # A component normally carries a reason when DOWN/DEGRADED; if it was mutated
        # without one, omit the parenthetical instead of rendering "(None)".
        return f"'{name}' ({reason})" if reason else f"'{name}'"

    # Affected components grouped by severity; insertion order is the precedence order.
    affected: dict[HealthStatus, list[tuple[str, str | None]]] = {
        HealthStatus.DOWN: [],
        HealthStatus.DEGRADED: [],
    }
    for component_name, component in self.components.items():
        # Children first, so their status already reflects their own subtree.
        component.compute_health_from_components()
        if component.status in affected:
            affected[component.status].append((component_name, component.reason))

    # DOWN trumps DEGRADED: take the first severity that has any affected component.
    for severity, entries in affected.items():
        if not entries:
            continue
        self.status = severity
        if len(entries) == 1:
            name, reason = entries[0]
            self.reason = f"Component {_describe(name, reason)} is {severity.value}"
        else:
            component_list = ", ".join(_describe(name, reason) for name, reason in entries)
            self.reason = f"Components {component_list} are {severity.value}"
        break

    return self
6992

@@ -73,7 +96,7 @@ def validate_health_state(self) -> Self:
7396
7497
- Compute overall health based on component health
7598
- Ensure UP status has no associated reason
76-
- Ensure DOWN status always has a reason
99+
- Ensure DOWN and DEGRADED status always have a reason
77100
78101
Returns:
79102
Self: The validated model instance with correct health status.
@@ -85,31 +108,31 @@ def validate_health_state(self) -> Self:
85108
self.compute_health_from_components()
86109

87110
# Validate that UP status has no reason
88-
if (self.status == _HealthStatus.UP) and self.reason:
111+
if (self.status == HealthStatus.UP) and self.reason:
89112
msg = f"Health {self.status} must not have reason"
90113
raise ValueError(msg)
91114

92-
# Validate that DOWN status always has a reason
93-
if (self.status == _HealthStatus.DOWN) and not self.reason:
94-
msg = "Health DOWN must have a reason"
115+
# Validate that DOWN and DEGRADED status always have a reason
116+
if (self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED}) and not self.reason:
117+
msg = f"Health {self.status} must have a reason"
95118
raise ValueError(msg)
96119

97120
return self
98121

99122
def __str__(self) -> str:
    """Render the health as text.

    Returns:
        str: ``"<STATUS>: <reason>"`` when the status is DOWN or DEGRADED and a
            reason is set, otherwise just the status value.
    """
    label = self.status.value
    has_explained_problem = self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED} and bool(self.reason)
    if not has_explained_problem:
        return label
    return f"{label}: {self.reason}"
108131

109132
def __bool__(self) -> bool:
    """Truthiness of a health object.

    Note that DEGRADED counts as truthy, so ``if health:`` does not distinguish a
    fully healthy service from a degraded one.

    Returns:
        bool: True if the status is UP or DEGRADED, False otherwise.
    """
    is_up = self.status == HealthStatus.UP
    is_degraded = self.status == HealthStatus.DEGRADED
    return is_up or is_degraded

0 commit comments

Comments
 (0)