From 196db810d387b195c69d120cc1b388b9951ec9f2 Mon Sep 17 00:00:00 2001 From: Nimm0ny Date: Wed, 6 May 2026 10:08:36 +0800 Subject: [PATCH] fix(openai): trigger account failover on passthrough 403 forbidden_error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAI passthrough mode (OAuth-bridged Codex / ChatGPT subscription accounts) previously only failed over on 429/529, returning upstream 403 unchanged to the client. This meant that a Codex usage-limit-reached response (403 with {"error": {"type": "forbidden_error"}}) would terminate the user's request even though the same group had other healthy accounts available. Add http.StatusForbidden to shouldFailoverOpenAIPassthroughResponse so the existing handler.FailoverState loop kicks in. Test coverage extended: the new oauth_403_temp_unschedulable case verifies that the account is marked temporarily unschedulable with an "OpenAI 403 temporary cooldown" reason (10min), with the IncrementOpenAI403Count counter (threshold: 3 strikes before hard-disable) stubbed at a count of 1 — same shape as the 429/529 paths, no new state machine. Verified locally with go test; ready to send upstream as a separate PR. 
--- .../service/openai_gateway_service.go | 2 +- .../service/openai_oauth_passthrough_test.go | 54 +++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index a5fe707d9fc..8f78e87ff3e 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -3199,7 +3199,7 @@ func (s *OpenAIGatewayService) buildUpstreamRequestOpenAIPassthrough( func shouldFailoverOpenAIPassthroughResponse(statusCode int) bool { switch statusCode { - case http.StatusTooManyRequests, 529: + case http.StatusForbidden, http.StatusTooManyRequests, 529: return true default: return false diff --git a/backend/internal/service/openai_oauth_passthrough_test.go b/backend/internal/service/openai_oauth_passthrough_test.go index 398cbb850b4..dbfb64029d4 100644 --- a/backend/internal/service/openai_oauth_passthrough_test.go +++ b/backend/internal/service/openai_oauth_passthrough_test.go @@ -147,8 +147,12 @@ func TestOpenAIGatewayService_OAuthMessagesBridgeDoesNotInjectDefaultInstruction type openAIPassthroughFailoverRepo struct { stubOpenAIAccountRepo - rateLimitCalls []time.Time - overloadCalls []time.Time + rateLimitCalls []time.Time + overloadCalls []time.Time + tempUnschedulableIDs []int64 + tempUnschedulableAt []time.Time + tempUnschedulableWhy []string + setErrorCalls []string } func (r *openAIPassthroughFailoverRepo) SetRateLimited(_ context.Context, _ int64, resetAt time.Time) error { @@ -161,6 +165,35 @@ func (r *openAIPassthroughFailoverRepo) SetOverloaded(_ context.Context, _ int64 return nil } +func (r *openAIPassthroughFailoverRepo) SetTempUnschedulable(_ context.Context, id int64, until time.Time, reason string) error { + r.tempUnschedulableIDs = append(r.tempUnschedulableIDs, id) + r.tempUnschedulableAt = append(r.tempUnschedulableAt, until) + r.tempUnschedulableWhy = append(r.tempUnschedulableWhy, 
reason) + return nil +} + +func (r *openAIPassthroughFailoverRepo) SetError(_ context.Context, _ int64, errorMsg string) error { + r.setErrorCalls = append(r.setErrorCalls, errorMsg) + return nil +} + +type openAIPassthrough403CounterStub struct { + counts []int64 +} + +func (s *openAIPassthrough403CounterStub) IncrementOpenAI403Count(context.Context, int64, int) (int64, error) { + if len(s.counts) == 0 { + return 1, nil + } + count := s.counts[0] + s.counts = s.counts[1:] + return count, nil +} + +func (s *openAIPassthrough403CounterStub) ResetOpenAI403Count(context.Context, int64) error { + return nil +} + var structuredLogCaptureMu sync.Mutex type inMemoryLogSink struct { @@ -745,7 +778,7 @@ func TestOpenAIGatewayService_OAuthPassthrough_UpstreamErrorIncludesPassthroughF require.Equal(t, "http_error", arr[len(arr)-1].Kind) } -func TestOpenAIGatewayService_OpenAIPassthrough_429And529TriggerFailover(t *testing.T) { +func TestOpenAIGatewayService_OpenAIPassthrough_FailoverStatusesTriggerAccountSwitch(t *testing.T) { gin.SetMode(gin.TestMode) originalBody := []byte(`{"model":"gpt-5.2","stream":false,"instructions":"local-test-instructions","input":[{"type":"text","text":"hi"}]}`) @@ -777,6 +810,20 @@ func TestOpenAIGatewayService_OpenAIPassthrough_429And529TriggerFailover(t *test body string assertRepo func(t *testing.T, repo *openAIPassthroughFailoverRepo, start time.Time) }{ + { + name: "oauth_403_temp_unschedulable", + accountType: AccountTypeOAuth, + statusCode: http.StatusForbidden, + body: `{"error":{"message":"usage limit reached","type":"forbidden_error"}}`, + assertRepo: func(t *testing.T, repo *openAIPassthroughFailoverRepo, _ time.Time) { + require.Empty(t, repo.rateLimitCalls) + require.Empty(t, repo.overloadCalls) + require.Equal(t, []int64{123}, repo.tempUnschedulableIDs) + require.Len(t, repo.tempUnschedulableAt, 1) + require.True(t, time.Until(repo.tempUnschedulableAt[0]) > 9*time.Minute) + require.Contains(t, repo.tempUnschedulableWhy[0], "OpenAI 
403 temporary cooldown") + }, + }, { name: "oauth_429_rate_limit", accountType: AccountTypeOAuth, @@ -852,6 +899,7 @@ func TestOpenAIGatewayService_OpenAIPassthrough_429And529TriggerFailover(t *test RateLimit: config.RateLimitConfig{OverloadCooldownMinutes: 10}, }, } + rateSvc.SetOpenAI403CounterCache(&openAIPassthrough403CounterStub{counts: []int64{1}}) svc := &OpenAIGatewayService{ cfg: &config.Config{Gateway: config.GatewayConfig{ForceCodexCLI: false}},