diff --git a/backend/cmd/server/VERSION b/backend/cmd/server/VERSION index 98eb271542a..5076ee8063a 100644 --- a/backend/cmd/server/VERSION +++ b/backend/cmd/server/VERSION @@ -1 +1 @@ -0.1.124 +0.1.125 diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go index 6f70c75ce7d..ca08259703b 100644 --- a/backend/internal/service/ops_dashboard.go +++ b/backend/internal/service/ops_dashboard.go @@ -65,7 +65,11 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo log.Printf("[Ops] ListJobHeartbeats failed: %v", err) } - overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview) + thresholds, err := s.GetMetricThresholds(ctx) + if err != nil { + return nil, err + } + overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview, thresholds) return overview, nil } diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go index 5efae870071..ba3baafc92c 100644 --- a/backend/internal/service/ops_health_score.go +++ b/backend/internal/service/ops_health_score.go @@ -5,6 +5,13 @@ import ( "time" ) +type dashboardHealthScoreThresholds struct { + errorRateFullPercent float64 + errorRateZeroPercent float64 + ttftP99FullMs float64 + ttftP99ZeroMs float64 +} + // computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview. // // Design goals: @@ -12,7 +19,7 @@ import ( // - Layered scoring: Business Health (70%) + Infrastructure Health (30%) // - Avoids double-counting (e.g., DB failure affects both infra and business metrics) // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. -func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { +func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) int { if overview == nil { return 0 } @@ -23,7 +30,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) return 100 } - businessHealth := computeBusinessHealth(overview) + businessHealth := computeBusinessHealth(overview, thresholds) infraHealth := computeInfraHealth(now, overview) // Weighted combination: 70% business + 30% infrastructure @@ -33,39 +40,76 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) // computeBusinessHealth calculates business health score (0-100) // Components: Error Rate (50%) + TTFT (50%) -func computeBusinessHealth(overview *OpsDashboardOverview) float64 { - // Error rate score: 1% → 100, 10% → 0 (linear) +func computeBusinessHealth(overview *OpsDashboardOverview, thresholds *OpsMetricThresholds) float64 { + scoreThresholds := resolveDashboardHealthScoreThresholds(thresholds) + + // Error rate score defaults to 1% → 100, 10% → 0 (linear) // Combines request errors and upstream errors - errorScore := 100.0 errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case - if combinedErrorPct > 1.0 { - if combinedErrorPct <= 10.0 { - errorScore = (10.0 - combinedErrorPct) / 9.0 * 100 - } else { - errorScore = 0 - } - } + errorScore := scoreDescendingRange( + combinedErrorPct, + scoreThresholds.errorRateFullPercent, + scoreThresholds.errorRateZeroPercent, + ) - // TTFT score: 1s → 100, 3s → 0 (linear) + // TTFT score defaults to 1s → 100, 3s → 0 (linear) // Time to first token is critical for user experience ttftScore := 100.0 if overview.TTFT.P99 != nil { - p99 := float64(*overview.TTFT.P99) - if p99 > 1000 { - if p99 <= 3000 { - ttftScore = (3000 - p99) / 2000 * 100 - } else { - ttftScore = 0 - } - } + ttftScore = scoreDescendingRange( + float64(*overview.TTFT.P99), + scoreThresholds.ttftP99FullMs, + scoreThresholds.ttftP99ZeroMs, + ) } // Weighted combination: 50% error rate + 50% TTFT return errorScore*0.5 + ttftScore*0.5 } +func resolveDashboardHealthScoreThresholds(thresholds *OpsMetricThresholds) dashboardHealthScoreThresholds { + defaults := defaultOpsMetricThresholds() + return dashboardHealthScoreThresholds{ + errorRateFullPercent: metricThresholdValue(thresholds, defaults, func(v *OpsMetricThresholds) *float64 { + return v.HealthScoreErrorRateFullPercent + }), + errorRateZeroPercent: metricThresholdValue(thresholds, defaults, func(v *OpsMetricThresholds) *float64 { + return v.HealthScoreErrorRateZeroPercent + }), + ttftP99FullMs: metricThresholdValue(thresholds, defaults, func(v *OpsMetricThresholds) *float64 { + return v.HealthScoreTTFTP99FullMs + }), + ttftP99ZeroMs: metricThresholdValue(thresholds, defaults, func(v *OpsMetricThresholds) *float64 { + return v.HealthScoreTTFTP99ZeroMs + }), + } +} + +func metricThresholdValue( + thresholds *OpsMetricThresholds, + defaults *OpsMetricThresholds, + selectValue func(*OpsMetricThresholds) *float64, +) float64 { + if thresholds != nil { + if value := selectValue(thresholds); value != nil { + return *value + } + } + return *selectValue(defaults) +} + +func scoreDescendingRange(value float64, fullScoreAt float64, zeroScoreAt float64) float64 { + if value <= fullScoreAt { + return 100 + } + if value >= zeroScoreAt { + return 0 + } + return (zeroScoreAt - value) / (zeroScoreAt - fullScoreAt) * 100 +} + // computeInfraHealth calculates infrastructure health score (0-100) // Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%) func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 { diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go index 25bfb43d77d..b8e387b253a 100644 --- a/backend/internal/service/ops_health_score_test.go +++ b/backend/internal/service/ops_health_score_test.go @@ -12,7 +12,7 @@ import ( func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) { t.Parallel() - score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}) + score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}, nil) require.Equal(t, 100, score) } @@ -50,7 +50,7 @@ func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { }, } - score := computeDashboardHealthScore(time.Now().UTC(), ov) + score := computeDashboardHealthScore(time.Now().UTC(), ov, nil) require.Less(t, score, 80) require.GreaterOrEqual(t, score, 0) } @@ -229,7 +229,7 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - score := computeDashboardHealthScore(time.Now().UTC(), tt.overview) + score := computeDashboardHealthScore(time.Now().UTC(), tt.overview, nil) require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin) require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax) require.GreaterOrEqual(t, score, 0, "score must be >= 0") @@ -328,7 +328,7 @@ func TestComputeBusinessHealth(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - score := computeBusinessHealth(tt.overview) + score := computeBusinessHealth(tt.overview, nil) require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") @@ -337,6 +337,100 @@ func TestComputeBusinessHealth(t *testing.T) { } } +func TestComputeBusinessHealth_ConfigurableThresholds(t *testing.T) { + t.Parallel() + + overview := &OpsDashboardOverview{ + ErrorRate: 0, + UpstreamErrorRate: 0, + TTFT: OpsPercentiles{P99: intPtr(4594)}, + } + + defaultScore := computeBusinessHealth(overview, nil) + require.Equal(t, 50.0, defaultScore) + + custom := &OpsMetricThresholds{ + HealthScoreErrorRateFullPercent: float64Ptr(1), + HealthScoreErrorRateZeroPercent: float64Ptr(10), + HealthScoreTTFTP99FullMs: float64Ptr(1000), + HealthScoreTTFTP99ZeroMs: float64Ptr(6000), + } + customScore := computeBusinessHealth(overview, custom) + require.InDelta(t, 64.1, customScore, 0.1) +} + +func TestComputeBusinessHealth_ConfigurableErrorThresholds(t *testing.T) { + t.Parallel() + + overview := &OpsDashboardOverview{ + ErrorRate: 0.05, + UpstreamErrorRate: 0.02, + TTFT: OpsPercentiles{P99: intPtr(500)}, + } + + custom := &OpsMetricThresholds{ + HealthScoreErrorRateFullPercent: float64Ptr(2), + HealthScoreErrorRateZeroPercent: float64Ptr(8), + } + score := computeBusinessHealth(overview, custom) + require.InDelta(t, 75.0, score, 0.1) +} + +func TestValidateOpsMetricThresholds_HealthScoreRanges(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + cfg *OpsMetricThresholds + wantErr string + }{ + { + name: "valid full zero ranges", + cfg: &OpsMetricThresholds{ + HealthScoreErrorRateFullPercent: float64Ptr(1), + HealthScoreErrorRateZeroPercent: float64Ptr(10), + HealthScoreTTFTP99FullMs: float64Ptr(1000), + HealthScoreTTFTP99ZeroMs: float64Ptr(5000), + }, + }, + { + name: "error full must be below zero", + cfg: &OpsMetricThresholds{ + HealthScoreErrorRateFullPercent: float64Ptr(10), + HealthScoreErrorRateZeroPercent: float64Ptr(10), + }, + wantErr: "health_score_error_rate_full_percent", + }, + { + name: "ttft full must be below zero", + cfg: &OpsMetricThresholds{ + HealthScoreTTFTP99FullMs: float64Ptr(3000), + HealthScoreTTFTP99ZeroMs: float64Ptr(1000), + }, + wantErr: "health_score_ttft_p99_full_ms", + }, + { + name: "error rate must be within percent range", + cfg: &OpsMetricThresholds{ + HealthScoreErrorRateFullPercent: float64Ptr(-1), + }, + wantErr: "health_score_error_rate_full_percent", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateOpsMetricThresholds(tt.cfg) + if tt.wantErr == "" { + require.NoError(t, err) + return + } + require.Error(t, err) + require.Contains(t, err.Error(), tt.wantErr) + }) + } +} + func TestComputeInfraHealth(t *testing.T) { t.Parallel() diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go index 68c1d9ddea4..d6b845d7807 100644 --- a/backend/internal/service/ops_settings.go +++ b/backend/internal/service/ops_settings.go @@ -210,6 +210,7 @@ func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings { GlobalReason: "", Entries: []OpsAlertSilenceEntry{}, }, + Thresholds: *defaultOpsMetricThresholds(), } } @@ -306,6 +307,7 @@ func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertR } normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) normalizeOpsAlertSilencingSettings(&cfg.Silencing) + normalizeOpsMetricThresholds(&cfg.Thresholds) return cfg, nil } @@ -334,10 +336,13 @@ func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *Ops return nil, err } } - defaultCfg := defaultOpsAlertRuntimeSettings() normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) normalizeOpsAlertSilencingSettings(&cfg.Silencing) + normalizeOpsMetricThresholds(&cfg.Thresholds) + if err := validateOpsMetricThresholds(&cfg.Thresholds); err != nil { + return nil, err + } raw, err := json.Marshal(cfg) if err != nil { @@ -502,12 +507,119 @@ func defaultOpsMetricThresholds() *OpsMetricThresholds { ttftMax := 500.0 reqErrMax := 5.0 upstreamErrMax := 5.0 + healthErrFull := 1.0 + healthErrZero := 10.0 + healthTTFTFull := 1000.0 + healthTTFTZero := 3000.0 return &OpsMetricThresholds{ - SLAPercentMin: &slaMin, - TTFTp99MsMax: &ttftMax, - RequestErrorRatePercentMax: &reqErrMax, - UpstreamErrorRatePercentMax: &upstreamErrMax, + SLAPercentMin: &slaMin, + TTFTp99MsMax: &ttftMax, + RequestErrorRatePercentMax: &reqErrMax, + UpstreamErrorRatePercentMax: &upstreamErrMax, + HealthScoreErrorRateFullPercent: &healthErrFull, + HealthScoreErrorRateZeroPercent: &healthErrZero, + HealthScoreTTFTP99FullMs: &healthTTFTFull, + HealthScoreTTFTP99ZeroMs: &healthTTFTZero, + } +} + +func normalizeOpsMetricThresholds(cfg *OpsMetricThresholds) *OpsMetricThresholds { + defaultCfg := defaultOpsMetricThresholds() + if cfg == nil { + return defaultCfg + } + if cfg.SLAPercentMin == nil { + cfg.SLAPercentMin = defaultCfg.SLAPercentMin + } + if cfg.TTFTp99MsMax == nil { + cfg.TTFTp99MsMax = defaultCfg.TTFTp99MsMax + } + if cfg.RequestErrorRatePercentMax == nil { + cfg.RequestErrorRatePercentMax = defaultCfg.RequestErrorRatePercentMax + } + if cfg.UpstreamErrorRatePercentMax == nil { + cfg.UpstreamErrorRatePercentMax = defaultCfg.UpstreamErrorRatePercentMax + } + if cfg.HealthScoreErrorRateFullPercent == nil { + cfg.HealthScoreErrorRateFullPercent = defaultCfg.HealthScoreErrorRateFullPercent + } + if cfg.HealthScoreErrorRateZeroPercent == nil { + cfg.HealthScoreErrorRateZeroPercent = defaultCfg.HealthScoreErrorRateZeroPercent + } + if cfg.HealthScoreTTFTP99FullMs == nil { + cfg.HealthScoreTTFTP99FullMs = defaultCfg.HealthScoreTTFTP99FullMs + } + if cfg.HealthScoreTTFTP99ZeroMs == nil { + cfg.HealthScoreTTFTP99ZeroMs = defaultCfg.HealthScoreTTFTP99ZeroMs + } + return cfg +} + +func validateOpsMetricThresholds(cfg *OpsMetricThresholds) error { + if cfg == nil { + return errors.New("invalid config") + } + if err := validateOpsDisplayMetricThresholds(cfg); err != nil { + return err + } + if err := validateOpsHealthScoreThresholds(cfg); err != nil { + return err + } + return nil +} + +func validateOpsDisplayMetricThresholds(cfg *OpsMetricThresholds) error { + if err := validateOptionalPercent(cfg.SLAPercentMin, "sla_percent_min"); err != nil { + return err + } + if err := validateOptionalNonNegative(cfg.TTFTp99MsMax, "ttft_p99_ms_max"); err != nil { + return err } + if err := validateOptionalPercent(cfg.RequestErrorRatePercentMax, "request_error_rate_percent_max"); err != nil { + return err + } + if err := validateOptionalPercent(cfg.UpstreamErrorRatePercentMax, "upstream_error_rate_percent_max"); err != nil { + return err + } + return nil +} + +func validateOpsHealthScoreThresholds(cfg *OpsMetricThresholds) error { + if err := validateOptionalPercent(cfg.HealthScoreErrorRateFullPercent, "health_score_error_rate_full_percent"); err != nil { + return err + } + if err := validateOptionalPercent(cfg.HealthScoreErrorRateZeroPercent, "health_score_error_rate_zero_percent"); err != nil { + return err + } + if err := validateOptionalNonNegative(cfg.HealthScoreTTFTP99FullMs, "health_score_ttft_p99_full_ms"); err != nil { + return err + } + if err := validateOptionalNonNegative(cfg.HealthScoreTTFTP99ZeroMs, "health_score_ttft_p99_zero_ms"); err != nil { + return err + } + if cfg.HealthScoreErrorRateFullPercent != nil && cfg.HealthScoreErrorRateZeroPercent != nil && + *cfg.HealthScoreErrorRateFullPercent >= *cfg.HealthScoreErrorRateZeroPercent { + return errors.New("health_score_error_rate_full_percent must be less than health_score_error_rate_zero_percent") + } + if cfg.HealthScoreTTFTP99FullMs != nil && cfg.HealthScoreTTFTP99ZeroMs != nil && + *cfg.HealthScoreTTFTP99FullMs >= *cfg.HealthScoreTTFTP99ZeroMs { + return errors.New("health_score_ttft_p99_full_ms must be less than health_score_ttft_p99_zero_ms") + } + return nil +} + +func validateOptionalPercent(value *float64, field string) error { + if value != nil && (*value < 0 || *value > 100) { + return errors.New(field + " must be between 0 and 100") + } + return nil +} + +func validateOptionalNonNegative(value *float64, field string) error { + if value != nil && *value < 0 { + return errors.New(field + " must be >= 0") + } + return nil } func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) { @@ -530,11 +642,15 @@ func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresho return nil, err } - cfg := &OpsMetricThresholds{} + cfg := defaultCfg if err := json.Unmarshal([]byte(raw), cfg); err != nil { return defaultCfg, nil } + normalizeOpsMetricThresholds(cfg) + if err := validateOpsMetricThresholds(cfg); err != nil { + return nil, err + } return cfg, nil } @@ -549,18 +665,9 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT return nil, errors.New("invalid config") } - // Validate thresholds - if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) { - return nil, errors.New("sla_percent_min must be between 0 and 100") - } - if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 { - return nil, errors.New("ttft_p99_ms_max must be >= 0") - } - if cfg.RequestErrorRatePercentMax != nil && (*cfg.RequestErrorRatePercentMax < 0 || *cfg.RequestErrorRatePercentMax > 100) { - return nil, errors.New("request_error_rate_percent_max must be between 0 and 100") - } - if cfg.UpstreamErrorRatePercentMax != nil && (*cfg.UpstreamErrorRatePercentMax < 0 || *cfg.UpstreamErrorRatePercentMax > 100) { - return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100") + normalizeOpsMetricThresholds(cfg) + if err := validateOpsMetricThresholds(cfg); err != nil { + return nil, err } raw, err := json.Marshal(cfg) diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go index fa18b05fb7c..f1e0572e8d9 100644 --- a/backend/internal/service/ops_settings_models.go +++ b/backend/internal/service/ops_settings_models.go @@ -66,6 +66,11 @@ type OpsMetricThresholds struct { TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红 RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红 UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红 + + HealthScoreErrorRateFullPercent *float64 `json:"health_score_error_rate_full_percent,omitempty"` // 业务健康评分错误率满分点 + HealthScoreErrorRateZeroPercent *float64 `json:"health_score_error_rate_zero_percent,omitempty"` // 业务健康评分错误率零分点 + HealthScoreTTFTP99FullMs *float64 `json:"health_score_ttft_p99_full_ms,omitempty"` // 业务健康评分 TTFT P99 满分点 + HealthScoreTTFTP99ZeroMs *float64 `json:"health_score_ttft_p99_zero_ms,omitempty"` // 业务健康评分 TTFT P99 零分点 } type OpsRuntimeLogConfig struct { diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index ac58eff40e9..21dab53b502 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -809,6 +809,10 @@ export interface OpsMetricThresholds { ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红 + health_score_error_rate_full_percent?: number | null // 业务健康评分错误率满分点 + health_score_error_rate_zero_percent?: number | null // 业务健康评分错误率零分点 + health_score_ttft_p99_full_ms?: number | null // 业务健康评分 TTFT P99 满分点 + health_score_ttft_p99_zero_ms?: number | null // 业务健康评分 TTFT P99 零分点 } export interface OpsDistributedLockSettings { diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 90bf23f730b..35a4b7cb097 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -4880,8 +4880,22 @@ export default { slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', - upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' - } + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100', + healthScoreErrorRateRange: 'Health score error rate thresholds must be between 0 and 100', + healthScoreErrorRateOrder: 'Health score error rate full-score threshold must be lower than zero-score threshold', + healthScoreTTFTRange: 'Health score TTFT P99 thresholds must be a number ≥ 0', + healthScoreTTFTOrder: 'Health score TTFT P99 full-score threshold must be lower than zero-score threshold' + }, + healthScoreThresholds: 'Business Health Score Thresholds', + healthScoreThresholdsHint: 'Only affects the overall health score, not card colors or alert thresholds.', + healthScoreErrorRateFullPercent: 'Error rate full score (%)', + healthScoreErrorRateFullPercentHint: 'At or below this value, the error-rate part of business health gets full score (default: 1%).', + healthScoreErrorRateZeroPercent: 'Error rate zero score (%)', + healthScoreErrorRateZeroPercentHint: 'At or above this value, the error-rate part of business health gets zero score (default: 10%).', + healthScoreTTFTP99FullMs: 'TTFT P99 full score (ms)', + healthScoreTTFTP99FullMsHint: 'At or below this value, the TTFT part of business health gets full score (default: 1000ms).', + healthScoreTTFTP99ZeroMs: 'TTFT P99 zero score (ms)', + healthScoreTTFTP99ZeroMsHint: 'At or above this value, the TTFT part of business health gets zero score (default: 3000ms).' }, email: { title: 'Email Notification', @@ -4942,7 +4956,7 @@ export default { dailySummary: 'Daily Summary', weeklySummary: 'Weekly Summary', metricThresholds: 'Metric Thresholds', - metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red', + metricThresholdsHint: 'Configure display/alert thresholds and independent overall health score thresholds', slaMinPercent: 'SLA Minimum Percentage', slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)', ttftP99MaxMs: 'TTFT P99 Maximum (ms)', @@ -4951,6 +4965,16 @@ export default { requestErrorRateMaxPercentHint: 'Request error rate above this value will be displayed in red (default: 5%)', upstreamErrorRateMaxPercent: 'Upstream Error Rate Maximum (%)', upstreamErrorRateMaxPercentHint: 'Upstream error rate above this value will be displayed in red (default: 5%)', + healthScoreThresholds: 'Business Health Score Thresholds', + healthScoreThresholdsHint: 'Only affects the overall health score, not the display/alert thresholds above.', + healthScoreErrorRateFullPercent: 'Error rate full score (%)', + healthScoreErrorRateFullPercentHint: 'At or below this value, the error-rate part of business health gets full score (default: 1%)', + healthScoreErrorRateZeroPercent: 'Error rate zero score (%)', + healthScoreErrorRateZeroPercentHint: 'At or above this value, the error-rate part of business health gets zero score (default: 10%)', + healthScoreTTFTP99FullMs: 'TTFT P99 full score (ms)', + healthScoreTTFTP99FullMsHint: 'At or below this value, the TTFT part of business health gets full score (default: 1000ms)', + healthScoreTTFTP99ZeroMs: 'TTFT P99 zero score (ms)', + healthScoreTTFTP99ZeroMsHint: 'At or above this value, the TTFT part of business health gets zero score (default: 3000ms)', advancedSettings: 'Advanced Settings', dataRetention: 'Data Retention Policy', enableCleanup: 'Enable Data Cleanup', @@ -4993,7 +5017,11 @@ export default { slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', - upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100', + healthScoreErrorRateRange: 'Health score error rate thresholds must be between 0 and 100', + healthScoreErrorRateOrder: 'Health score error rate full-score threshold must be lower than zero-score threshold', + healthScoreTTFTRange: 'Health score TTFT P99 thresholds must be a number ≥ 0', + healthScoreTTFTOrder: 'Health score TTFT P99 full-score threshold must be lower than zero-score threshold' } }, concurrency: { diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 87482f9d329..d51a9fcb1b2 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -883,7 +883,7 @@ export default { monitorCommon: { status: { operational: '正常', - degraded: '降级', + degraded: '响应慢', failed: '失败', error: '错误', unknown: '-' @@ -5042,8 +5042,22 @@ export default { slaMinPercentRange: 'SLA 最低值必须在 0-100 之间', ttftP99MaxRange: 'TTFT P99 最大值必须大于或等于 0', requestErrorRateMaxRange: '请求错误率最大值必须在 0-100 之间', - upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间' - } + upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间', + healthScoreErrorRateRange: '健康评分错误率阈值必须在 0-100 之间', + healthScoreErrorRateOrder: '健康评分错误率满分点必须小于零分点', + healthScoreTTFTRange: '健康评分 TTFT P99 阈值必须大于或等于 0', + healthScoreTTFTOrder: '健康评分 TTFT P99 满分点必须小于零分点' + }, + healthScoreThresholds: '业务健康评分阈值', + healthScoreThresholdsHint: '仅影响综合健康分,不影响卡片红黄展示和告警阈值。', + healthScoreErrorRateFullPercent: '错误率满分点(%)', + healthScoreErrorRateFullPercentHint: '错误率低于或等于此值时,业务健康中的错误率部分为满分(默认:1%)。', + healthScoreErrorRateZeroPercent: '错误率零分点(%)', + healthScoreErrorRateZeroPercentHint: '错误率高于或等于此值时,业务健康中的错误率部分为零分(默认:10%)。', + healthScoreTTFTP99FullMs: 'TTFT P99 满分点(毫秒)', + healthScoreTTFTP99FullMsHint: 'TTFT P99 低于或等于此值时,业务健康中的 TTFT 部分为满分(默认:1000ms)。', + healthScoreTTFTP99ZeroMs: 'TTFT P99 零分点(毫秒)', + healthScoreTTFTP99ZeroMsHint: 'TTFT P99 高于或等于此值时,业务健康中的 TTFT 部分为零分(默认:3000ms)。' }, email: { title: '邮件通知配置', @@ -5104,7 +5118,7 @@ export default { dailySummary: '每日摘要', weeklySummary: '每周摘要', metricThresholds: '指标阈值配置', - metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示', + metricThresholdsHint: '配置展示/告警阈值,以及独立的综合健康评分阈值', slaMinPercent: 'SLA最低百分比', slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)', ttftP99MaxMs: 'TTFT P99最大值(毫秒)', @@ -5113,6 +5127,16 @@ export default { requestErrorRateMaxPercentHint: '请求错误率高于此值时显示为红色(默认:5%)', upstreamErrorRateMaxPercent: '上游错误率最大值(%)', upstreamErrorRateMaxPercentHint: '上游错误率高于此值时显示为红色(默认:5%)', + healthScoreThresholds: '业务健康评分阈值', + healthScoreThresholdsHint: '仅影响综合健康分,不影响上方展示/告警阈值。', + healthScoreErrorRateFullPercent: '错误率满分点(%)', + healthScoreErrorRateFullPercentHint: '错误率低于或等于此值时,业务健康中的错误率部分为满分(默认:1%)', + healthScoreErrorRateZeroPercent: '错误率零分点(%)', + healthScoreErrorRateZeroPercentHint: '错误率高于或等于此值时,业务健康中的错误率部分为零分(默认:10%)', + healthScoreTTFTP99FullMs: 'TTFT P99 满分点(毫秒)', + healthScoreTTFTP99FullMsHint: 'TTFT P99 低于或等于此值时,业务健康中的 TTFT 部分为满分(默认:1000ms)', + healthScoreTTFTP99ZeroMs: 'TTFT P99 零分点(毫秒)', + healthScoreTTFTP99ZeroMsHint: 'TTFT P99 高于或等于此值时,业务健康中的 TTFT 部分为零分(默认:3000ms)', advancedSettings: '高级设置', dataRetention: '数据保留策略', enableCleanup: '启用数据清理', @@ -5156,7 +5180,11 @@ export default { slaMinPercentRange: 'SLA最低百分比必须在0-100之间', ttftP99MaxRange: 'TTFT P99最大值必须大于等于0', requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间', - upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间' + upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间', + healthScoreErrorRateRange: '健康评分错误率阈值必须在0-100之间', + healthScoreErrorRateOrder: '健康评分错误率满分点必须小于零分点', + healthScoreTTFTRange: '健康评分 TTFT P99 阈值必须大于等于0', + healthScoreTTFTOrder: '健康评分 TTFT P99 满分点必须小于零分点' } }, concurrency: { diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue index 82c19f4ffa6..21808388c47 100644 --- a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue @@ -5,6 +5,7 @@ import { useAppStore } from '@/stores/app' import { opsAPI } from '@/api/admin/ops' import type { OpsAlertRuntimeSettings } from '../types' import BaseDialog from '@/components/common/BaseDialog.vue' +import { collectOpsMetricThresholdErrors, normalizeOpsMetricThresholds } from '../metricThresholds' const { t } = useI18n() const appStore = useAppStore() @@ -45,30 +46,7 @@ function validateRuntimeSettings(settings: OpsAlertRuntimeSettings): ValidationR errors.push(t('admin.ops.runtime.validation.evalIntervalRange')) } - // Thresholds validation - const thresholds = settings.thresholds - if (thresholds) { - if (thresholds.sla_percent_min != null) { - if (!Number.isFinite(thresholds.sla_percent_min) || thresholds.sla_percent_min < 0 || thresholds.sla_percent_min > 100) { - errors.push(t('admin.ops.runtime.validation.slaMinPercentRange')) - } - } - if (thresholds.ttft_p99_ms_max != null) { - if (!Number.isFinite(thresholds.ttft_p99_ms_max) || thresholds.ttft_p99_ms_max < 0) { - errors.push(t('admin.ops.runtime.validation.ttftP99MaxRange')) - } - } - if (thresholds.request_error_rate_percent_max != null) { - if (!Number.isFinite(thresholds.request_error_rate_percent_max) || thresholds.request_error_rate_percent_max < 0 || thresholds.request_error_rate_percent_max > 100) { - errors.push(t('admin.ops.runtime.validation.requestErrorRateMaxRange')) - } - } - if (thresholds.upstream_error_rate_percent_max != null) { - if (!Number.isFinite(thresholds.upstream_error_rate_percent_max) || thresholds.upstream_error_rate_percent_max < 0 || thresholds.upstream_error_rate_percent_max > 100) { - errors.push(t('admin.ops.runtime.validation.upstreamErrorRateMaxRange')) - } - } - } + errors.push(...collectOpsMetricThresholdErrors(settings.thresholds, (key) => t(`admin.ops.runtime.${key}`))) const lock = settings.distributed_lock if (lock?.enabled) { @@ -155,14 +133,7 @@ function openAlertEditor() { if (!Array.isArray(draftAlert.value.silencing.entries)) { draftAlert.value.silencing.entries = [] } - if (!draftAlert.value.thresholds) { - draftAlert.value.thresholds = { - sla_percent_min: 99.5, - ttft_p99_ms_max: 500, - request_error_rate_percent_max: 5, - upstream_error_rate_percent_max: 5 - } - } + draftAlert.value.thresholds = normalizeOpsMetricThresholds(draftAlert.value.thresholds) } showAlertEditor.value = true @@ -389,6 +360,65 @@ onMounted(() => { />
{{ t('admin.ops.runtime.upstreamErrorRateMaxPercentHint') }}
+ +{{ t('admin.ops.runtime.healthScoreThresholdsHint') }}
+{{ t('admin.ops.runtime.healthScoreErrorRateFullPercentHint') }}
+{{ t('admin.ops.runtime.healthScoreErrorRateZeroPercentHint') }}
+{{ t('admin.ops.runtime.healthScoreTTFTP99FullMsHint') }}
+{{ t('admin.ops.runtime.healthScoreTTFTP99ZeroMsHint') }}
+{{ t('admin.ops.settings.upstreamErrorRateMaxPercentHint') }}
+ +{{ t('admin.ops.settings.healthScoreThresholdsHint') }}
+{{ t('admin.ops.settings.healthScoreErrorRateFullPercentHint') }}
+{{ t('admin.ops.settings.healthScoreErrorRateZeroPercentHint') }}
+{{ t('admin.ops.settings.healthScoreTTFTP99FullMsHint') }}
+{{ t('admin.ops.settings.healthScoreTTFTP99ZeroMsHint') }}
+