Wei-Shaw · Clarence12138 · May 5, 2026 · May 5, 2026 · May 6, 2026 · May 8, 2026
diff --git a/backend/cmd/server/VERSION b/backend/cmd/server/VERSION
@@ -1 +1 @@
-0.1.124
+0.1.125
diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go
@@ -65,7 +65,11 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
 		log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
 	}
 
-	overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
+	thresholds, err := s.GetMetricThresholds(ctx)
+	if err != nil {
+		return nil, err
+	}
+	overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview, thresholds)
 
 	return overview, nil
 }

diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go
@@ -5,14 +5,21 @@ import (
 	"time"
 )
 
+type dashboardHealthScoreThresholds struct {
+	errorRateFullPercent float64 // combined error rate (%) at or below which the error component scores 100
+	errorRateZeroPercent float64 // combined error rate (%) at or above which the error component scores 0
+	ttftP99FullMs        float64 // P99 TTFT (ms) at or below which the TTFT component scores 100
+	ttftP99ZeroMs        float64 // P99 TTFT (ms) at or above which the TTFT component scores 0
+}
+
 // computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
 //
 // Design goals:
 // - Backend-owned scoring (UI only displays).
 // - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
 // - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
 // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
-func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
+func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) int {
 	if overview == nil {
 		return 0
 	}
@@ -23,7 +30,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 		return 100
 	}
 
-	businessHealth := computeBusinessHealth(overview)
+	businessHealth := computeBusinessHealth(overview, thresholds)
 	infraHealth := computeInfraHealth(now, overview)
 
 	// Weighted combination: 70% business + 30% infrastructure
@@ -33,39 +40,76 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 
 // computeBusinessHealth calculates business health score (0-100)
 // Components: Error Rate (50%) + TTFT (50%)
-func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
-	// Error rate score: 1% → 100, 10% → 0 (linear)
+func computeBusinessHealth(overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) float64 {
+	scoreThresholds := resolveDashboardHealthScoreThresholds(thresholds)
+
+	// Error rate score defaults to 1% → 100, 10% → 0 (linear)
 	// Combines request errors and upstream errors
-	errorScore := 100.0
 	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
 	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
 	combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
-	if combinedErrorPct > 1.0 {
-		if combinedErrorPct <= 10.0 {
-			errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
-		} else {
-			errorScore = 0
-		}
-	}
+	errorScore := scoreDescendingRange(
+		combinedErrorPct,
+		scoreThresholds.errorRateFullPercent,
+		scoreThresholds.errorRateZeroPercent,
+	)
 
-	// TTFT score: 1s → 100, 3s → 0 (linear)
+	// TTFT score defaults to 1s → 100, 3s → 0 (linear)
 	// Time to first token is critical for user experience
 	ttftScore := 100.0
 	if overview.TTFT.P99 != nil {
-		p99 := float64(*overview.TTFT.P99)
-		if p99 > 1000 {
-			if p99 <= 3000 {
-				ttftScore = (3000 - p99) / 2000 * 100
-			} else {
-				ttftScore = 0
-			}
-		}
+		ttftScore = scoreDescendingRange(
+			float64(*overview.TTFT.P99),
+			scoreThresholds.ttftP99FullMs,
+			scoreThresholds.ttftP99ZeroMs,
+		)
 	}
 
 	// Weighted combination: 50% error rate + 50% TTFT
 	return errorScore*0.5 + ttftScore*0.5
 }
 
+// resolveDashboardHealthScoreThresholds flattens the four health-score thresholds into plain
+// float64 values. Each one is read from thresholds when it is set there and from
+// defaultOpsMetricThresholds() otherwise; a nil thresholds yields the defaults throughout.
+func resolveDashboardHealthScoreThresholds(thresholds *OpsMetricThresholds) dashboardHealthScoreThresholds {
+	fallback := defaultOpsMetricThresholds()
+	pick := func(field func(*OpsMetricThresholds) *float64) float64 {
+		return metricThresholdValue(thresholds, fallback, field)
+	}
+
+	var resolved dashboardHealthScoreThresholds
+	resolved.errorRateFullPercent = pick(func(v *OpsMetricThresholds) *float64 { return v.HealthScoreErrorRateFullPercent })
+	resolved.errorRateZeroPercent = pick(func(v *OpsMetricThresholds) *float64 { return v.HealthScoreErrorRateZeroPercent })
+	resolved.ttftP99FullMs = pick(func(v *OpsMetricThresholds) *float64 { return v.HealthScoreTTFTP99FullMs })
+	resolved.ttftP99ZeroMs = pick(func(v *OpsMetricThresholds) *float64 { return v.HealthScoreTTFTP99ZeroMs })
+
+	return resolved
+}
+
+// metricThresholdValue returns the value selectValue picks from thresholds, falling back to the
+// one it picks from defaults when thresholds is nil or leaves that field unset.
+func metricThresholdValue(thresholds, defaults *OpsMetricThresholds, selectValue func(*OpsMetricThresholds) *float64) float64 {
+	var chosen *float64
+	if thresholds != nil {
+		chosen = selectValue(thresholds)
+	}
+	if chosen == nil {
+		chosen = selectValue(defaults)
+	}
+	return *chosen // the field selected from defaults is expected to be non-nil
+}
+
+func scoreDescendingRange(value float64, fullScoreAt float64, zeroScoreAt float64) float64 {
+	switch {
+	case value <= fullScoreAt: // at or better than the full-score threshold
+		return 100
+	case value >= zeroScoreAt: // at or worse than the zero-score threshold
+		return 0
+	}
+	return (zeroScoreAt - value) / (zeroScoreAt - fullScoreAt) * 100 // linear in between
+}
+
 // computeInfraHealth calculates infrastructure health score (0-100)
 // Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
 func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {

diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go
@@ -12,7 +12,7 @@ import (
 func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
 	t.Parallel()
 
-	score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
+	score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}, nil)
 	require.Equal(t, 100, score)
 }
 
@@ -50,7 +50,7 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
 		},
 	}
 
-	score := computeDashboardHealthScore(time.Now().UTC(), ov)
+	score := computeDashboardHealthScore(time.Now().UTC(), ov, nil)
 	require.Less(t, score, 80)
 	require.GreaterOrEqual(t, score, 0)
 }
@@ -229,7 +229,7 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
+			score := computeDashboardHealthScore(time.Now().UTC(), tt.overview, nil)
 			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
 			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
 			require.GreaterOrEqual(t, score, 0, "score must be >= 0")
@@ -328,7 +328,7 @@ func TestComputeBusinessHealth(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			score := computeBusinessHealth(tt.overview)
+			score := computeBusinessHealth(tt.overview, nil)
 			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
 			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
 			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
@@ -337,6 +337,100 @@ func TestComputeBusinessHealth(t *testing.T) {
 	}
 }
 
+// TestComputeBusinessHealth_ConfigurableThresholds checks that moving the TTFT zero-score
+// threshold further out raises the business score of an overview whose P99 TTFT lies
+// beyond the default range.
+func TestComputeBusinessHealth_ConfigurableThresholds(t *testing.T) {
+	t.Parallel()
+
+	// Error rates stay at their zero values, so only the TTFT component can lose points.
+	ov := &OpsDashboardOverview{TTFT: OpsPercentiles{P99: intPtr(4594)}}
+
+	// Default thresholds: the TTFT half contributes nothing, the error half contributes 50.
+	require.Equal(t, 50.0, computeBusinessHealth(ov, nil))
+
+	// Zero-score point at 6000ms: TTFT scores (6000-4594)/(6000-1000)*100 = 28.12.
+	widened := &OpsMetricThresholds{
+		HealthScoreErrorRateFullPercent: float64Ptr(1),
+		HealthScoreErrorRateZeroPercent: float64Ptr(10),
+		HealthScoreTTFTP99FullMs:        float64Ptr(1000),
+		HealthScoreTTFTP99ZeroMs:        float64Ptr(6000),
+	}
+	require.InDelta(t, 64.1, computeBusinessHealth(ov, widened), 0.1)
+}
+
+// TestComputeBusinessHealth_ConfigurableErrorThresholds checks that custom error-rate
+// thresholds are honoured while the unset TTFT thresholds fall back to their defaults.
+func TestComputeBusinessHealth_ConfigurableErrorThresholds(t *testing.T) {
+	t.Parallel()
+
+	// Only the error-rate range is overridden: full score up to 2%, zero from 8%.
+	errorOnly := &OpsMetricThresholds{
+		HealthScoreErrorRateFullPercent: float64Ptr(2),
+		HealthScoreErrorRateZeroPercent: float64Ptr(8),
+	}
+
+	ov := &OpsDashboardOverview{ErrorRate: 0.05, UpstreamErrorRate: 0.02, TTFT: OpsPercentiles{P99: intPtr(500)}}
+
+	// Worst rate is 5% → error score (8-5)/(8-2)*100 = 50; 500ms TTFT keeps the full 100.
+	require.InDelta(t, 75.0, computeBusinessHealth(ov, errorOnly), 0.1)
+}
+
+// TestValidateOpsMetricThresholds_HealthScoreRanges checks that validateOpsMetricThresholds
+// accepts well-formed health-score ranges and names the offending field when it rejects one.
+func TestValidateOpsMetricThresholds_HealthScoreRanges(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name        string
+		thresholds  *OpsMetricThresholds
+		errContains string
+	}{
+		{
+			name: "valid full zero ranges",
+			thresholds: &OpsMetricThresholds{
+				HealthScoreErrorRateFullPercent: float64Ptr(1),
+				HealthScoreErrorRateZeroPercent: float64Ptr(10),
+				HealthScoreTTFTP99FullMs:        float64Ptr(1000),
+				HealthScoreTTFTP99ZeroMs:        float64Ptr(5000),
+			},
+		},
+		{
+			name: "error full must be below zero",
+			thresholds: &OpsMetricThresholds{
+				HealthScoreErrorRateFullPercent: float64Ptr(10),
+				HealthScoreErrorRateZeroPercent: float64Ptr(10),
+			},
+			errContains: "health_score_error_rate_full_percent",
+		},
+		{
+			name: "ttft full must be below zero",
+			thresholds: &OpsMetricThresholds{
+				HealthScoreTTFTP99FullMs: float64Ptr(3000),
+				HealthScoreTTFTP99ZeroMs: float64Ptr(1000),
+			},
+			errContains: "health_score_ttft_p99_full_ms",
+		},
+		{
+			name:        "error rate must be within percent range",
+			thresholds:  &OpsMetricThresholds{HealthScoreErrorRateFullPercent: float64Ptr(-1)},
+			errContains: "health_score_error_rate_full_percent",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateOpsMetricThresholds(tc.thresholds)
+			if tc.errContains != "" {
+				require.Error(t, err)
+				require.Contains(t, err.Error(), tc.errContains)
+				return
+			}
+			require.NoError(t, err)
+		})
+	}
+}
+
 func TestComputeInfraHealth(t *testing.T) {
 	t.Parallel()