Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/cmd/server/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.124
0.1.125
6 changes: 5 additions & 1 deletion backend/internal/service/ops_dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
}

overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
thresholds, err := s.GetMetricThresholds(ctx)
if err != nil {
return nil, err
}
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview, thresholds)

return overview, nil
}
Expand Down
86 changes: 65 additions & 21 deletions backend/internal/service/ops_health_score.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,21 @@ import (
"time"
)

// dashboardHealthScoreThresholds carries the concrete (non-pointer) bounds
// used when scoring business metrics.
//
// Each metric has two bounds: a "full" bound, at or below which the metric
// scores 100, and a "zero" bound, at or above which it scores 0.
type dashboardHealthScoreThresholds struct {
	// Bounds for the combined error rate, expressed in percent.
	errorRateFullPercent, errorRateZeroPercent float64
	// Bounds for the TTFT P99 latency, expressed in milliseconds.
	ttftP99FullMs, ttftP99ZeroMs float64
}

// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) int {
if overview == nil {
return 0
}
Expand All @@ -23,7 +30,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
return 100
}

businessHealth := computeBusinessHealth(overview)
businessHealth := computeBusinessHealth(overview, thresholds)
infraHealth := computeInfraHealth(now, overview)

// Weighted combination: 70% business + 30% infrastructure
Expand All @@ -33,39 +40,76 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)

// computeBusinessHealth calculates business health score (0-100)
// Components: Error Rate (50%) + TTFT (50%)
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
// Error rate score: 1% → 100, 10% → 0 (linear)
func computeBusinessHealth(overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) float64 {
scoreThresholds := resolveDashboardHealthScoreThresholds(thresholds)

// Error rate score defaults to 1% → 100, 10% → 0 (linear)
// Combines request errors and upstream errors
errorScore := 100.0
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
if combinedErrorPct > 1.0 {
if combinedErrorPct <= 10.0 {
errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
} else {
errorScore = 0
}
}
errorScore := scoreDescendingRange(
combinedErrorPct,
scoreThresholds.errorRateFullPercent,
scoreThresholds.errorRateZeroPercent,
)

// TTFT score: 1s → 100, 3s → 0 (linear)
// TTFT score defaults to 1s → 100, 3s → 0 (linear)
// Time to first token is critical for user experience
ttftScore := 100.0
if overview.TTFT.P99 != nil {
p99 := float64(*overview.TTFT.P99)
if p99 > 1000 {
if p99 <= 3000 {
ttftScore = (3000 - p99) / 2000 * 100
} else {
ttftScore = 0
}
}
ttftScore = scoreDescendingRange(
float64(*overview.TTFT.P99),
scoreThresholds.ttftP99FullMs,
scoreThresholds.ttftP99ZeroMs,
)
}

// Weighted combination: 50% error rate + 50% TTFT
return errorScore*0.5 + ttftScore*0.5
}

// resolveDashboardHealthScoreThresholds flattens the optional, pointer-based
// threshold configuration into concrete values for scoring.
//
// Each field is resolved independently through metricThresholdValue: a
// non-nil value in thresholds takes precedence, otherwise the matching value
// from defaultOpsMetricThresholds() is used. thresholds itself may be nil, in
// which case every field comes from the defaults.
func resolveDashboardHealthScoreThresholds(thresholds *OpsMetricThresholds) dashboardHealthScoreThresholds {
	fallback := defaultOpsMetricThresholds()

	// resolve binds the two configuration sources so each field below only
	// has to say which setting it reads.
	resolve := func(field func(*OpsMetricThresholds) *float64) float64 {
		return metricThresholdValue(thresholds, fallback, field)
	}

	var resolved dashboardHealthScoreThresholds
	resolved.errorRateFullPercent = resolve(func(cfg *OpsMetricThresholds) *float64 {
		return cfg.HealthScoreErrorRateFullPercent
	})
	resolved.errorRateZeroPercent = resolve(func(cfg *OpsMetricThresholds) *float64 {
		return cfg.HealthScoreErrorRateZeroPercent
	})
	resolved.ttftP99FullMs = resolve(func(cfg *OpsMetricThresholds) *float64 {
		return cfg.HealthScoreTTFTP99FullMs
	})
	resolved.ttftP99ZeroMs = resolve(func(cfg *OpsMetricThresholds) *float64 {
		return cfg.HealthScoreTTFTP99ZeroMs
	})
	return resolved
}

// metricThresholdValue returns one threshold setting, preferring the
// caller-supplied configuration over the defaults.
//
// selectValue picks the setting of interest out of a threshold set. It is
// applied to thresholds first (skipped when thresholds is nil); if that yields
// nil, it is applied to defaults instead.
//
// The value selected from defaults is dereferenced unconditionally, so
// defaults must be non-nil and must populate every setting that is looked up
// through this helper; otherwise this function panics.
func metricThresholdValue(
	thresholds *OpsMetricThresholds,
	defaults *OpsMetricThresholds,
	selectValue func(*OpsMetricThresholds) *float64,
) float64 {
	var override *float64
	if thresholds != nil {
		override = selectValue(thresholds)
	}
	if override == nil {
		return *selectValue(defaults)
	}
	return *override
}

// scoreDescendingRange maps value onto a 0-100 score where lower is better.
//
// Values at or below fullScoreAt score 100, values at or above zeroScoreAt
// score 0, and values strictly in between are interpolated linearly.
//
// The full-score check runs first, so for non-NaN inputs with coinciding or
// inverted bounds the result is a step at fullScoreAt and the interpolation
// (and its division) is never reached.
func scoreDescendingRange(value float64, fullScoreAt float64, zeroScoreAt float64) float64 {
	switch {
	case value <= fullScoreAt:
		return 100
	case value >= zeroScoreAt:
		return 0
	}

	span := zeroScoreAt - fullScoreAt
	headroom := zeroScoreAt - value
	return headroom / span * 100
}

// computeInfraHealth calculates infrastructure health score (0-100)
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
Expand Down
102 changes: 98 additions & 4 deletions backend/internal/service/ops_health_score_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()

score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}, nil)
require.Equal(t, 100, score)
}

Expand Down Expand Up @@ -50,7 +50,7 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
},
}

score := computeDashboardHealthScore(time.Now().UTC(), ov)
score := computeDashboardHealthScore(time.Now().UTC(), ov, nil)
require.Less(t, score, 80)
require.GreaterOrEqual(t, score, 0)
}
Expand Down Expand Up @@ -229,7 +229,7 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview, nil)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
Expand Down Expand Up @@ -328,7 +328,7 @@ func TestComputeBusinessHealth(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeBusinessHealth(tt.overview)
score := computeBusinessHealth(tt.overview, nil)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
Expand All @@ -337,6 +337,100 @@ func TestComputeBusinessHealth(t *testing.T) {
}
}

// TestComputeBusinessHealth_ConfigurableThresholds verifies that a custom TTFT
// zero bound changes the business health score for the same overview.
func TestComputeBusinessHealth_ConfigurableThresholds(t *testing.T) {
	t.Parallel()

	// No errors at all, so only the TTFT component can lower the score.
	ov := &OpsDashboardOverview{
		ErrorRate:         0,
		UpstreamErrorRate: 0,
		TTFT:              OpsPercentiles{P99: intPtr(4594)},
	}

	// Without overrides the TTFT component is expected to bottom out:
	// 0.5*100 (errors) + 0.5*0 (TTFT) = 50.
	baseline := computeBusinessHealth(ov, nil)
	require.Equal(t, 50.0, baseline)

	// Stretching the TTFT window to 1000-6000ms puts 4594ms inside it:
	// (6000-4594)/5000*100 = 28.12, so 0.5*100 + 0.5*28.12 = 64.06.
	widened := &OpsMetricThresholds{
		HealthScoreErrorRateFullPercent: float64Ptr(1),
		HealthScoreErrorRateZeroPercent: float64Ptr(10),
		HealthScoreTTFTP99FullMs:        float64Ptr(1000),
		HealthScoreTTFTP99ZeroMs:        float64Ptr(6000),
	}
	tuned := computeBusinessHealth(ov, widened)
	require.InDelta(t, 64.1, tuned, 0.1)
}

// TestComputeBusinessHealth_ConfigurableErrorThresholds verifies that custom
// error-rate bounds are honoured while the unset TTFT bounds fall back to
// their defaults.
func TestComputeBusinessHealth_ConfigurableErrorThresholds(t *testing.T) {
	t.Parallel()

	// The worse of the two error rates (5%) is the one that gets scored.
	ov := &OpsDashboardOverview{
		ErrorRate:         0.05,
		UpstreamErrorRate: 0.02,
		TTFT:              OpsPercentiles{P99: intPtr(500)},
	}

	// Only the error-rate window is overridden; TTFT fields stay nil.
	errorOnly := &OpsMetricThresholds{
		HealthScoreErrorRateFullPercent: float64Ptr(2),
		HealthScoreErrorRateZeroPercent: float64Ptr(8),
	}

	// Error component: (8-5)/(8-2)*100 = 50. The TTFT component is expected to
	// be 100 for a 500ms P99, giving 0.5*50 + 0.5*100 = 75.
	got := computeBusinessHealth(ov, errorOnly)
	require.InDelta(t, 75.0, got, 0.1)
}

// TestValidateOpsMetricThresholds_HealthScoreRanges exercises
// validateOpsMetricThresholds on the health-score threshold fields: one
// configuration that must be accepted and three that must be rejected with an
// error naming the offending setting.
func TestValidateOpsMetricThresholds_HealthScoreRanges(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name    string
		cfg     *OpsMetricThresholds
		wantErr string // substring required in the error; empty means "no error"
	}{
		{
			name: "valid full zero ranges",
			cfg: &OpsMetricThresholds{
				HealthScoreErrorRateFullPercent: float64Ptr(1),
				HealthScoreErrorRateZeroPercent: float64Ptr(10),
				HealthScoreTTFTP99FullMs:        float64Ptr(1000),
				HealthScoreTTFTP99ZeroMs:        float64Ptr(5000),
			},
		},
		{
			// Equal bounds leave no range to interpolate over.
			name: "error full must be below zero",
			cfg: &OpsMetricThresholds{
				HealthScoreErrorRateFullPercent: float64Ptr(10),
				HealthScoreErrorRateZeroPercent: float64Ptr(10),
			},
			wantErr: "health_score_error_rate_full_percent",
		},
		{
			// Full bound above the zero bound (inverted range).
			name: "ttft full must be below zero",
			cfg: &OpsMetricThresholds{
				HealthScoreTTFTP99FullMs: float64Ptr(3000),
				HealthScoreTTFTP99ZeroMs: float64Ptr(1000),
			},
			wantErr: "health_score_ttft_p99_full_ms",
		},
		{
			// Negative percentage.
			name: "error rate must be within percent range",
			cfg: &OpsMetricThresholds{
				HealthScoreErrorRateFullPercent: float64Ptr(-1),
			},
			wantErr: "health_score_error_rate_full_percent",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := validateOpsMetricThresholds(tc.cfg)
			if tc.wantErr != "" {
				require.Error(t, err)
				require.Contains(t, err.Error(), tc.wantErr)
				return
			}
			require.NoError(t, err)
		})
	}
}

func TestComputeInfraHealth(t *testing.T) {
t.Parallel()

Expand Down
Loading
Loading