From 6748ed863932c3cf67491aaadeb6fc2bb91468c3 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Wed, 11 Feb 2026 11:37:15 -0800 Subject: [PATCH 01/12] feat: db and schema updates for new retry logic chose to name it failedSendAttemptCount as the nextSendRetryAt can only be set when a send attempt fails. nextSendRetryAt controls the retry logic. sendAttemptErrors is named as such because it tracks all of the send errors not just retry errors. We also put in api schema updates and type updates. --- .../migration.sql | 25 +++++++++++++++++++ apps/backend/prisma/schema.prisma | 14 ++++++++--- .../src/interface/crud/email-outbox.ts | 13 ++++++++++ .../apps/implementations/admin-app-impl.ts | 18 +++++++++++++ .../template/src/lib/stack-app/email/index.ts | 14 +++++++++++ packages/template/src/lib/stack-app/index.ts | 1 + 6 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql diff --git a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql new file mode 100644 index 0000000000..b8f3adc729 --- /dev/null +++ b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql @@ -0,0 +1,25 @@ +-- Add deferred retry fields for email sending +-- These fields allow the email queue to schedule retries for later iterations +-- instead of blocking the current iteration with inline retries. + +ALTER TABLE "EmailOutbox" + ADD COLUMN "failedSendAttemptCount" INTEGER NOT NULL DEFAULT 0, + ADD COLUMN "nextSendRetryAt" TIMESTAMP(3), + ADD COLUMN "sendAttemptErrors" JSONB; + +-- Constraint: nextSendRetryAt can only be set after at least one failed attempt +-- (if failedSendAttemptCount is 0, no attempt has failed, so there's nothing to retry) +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure" + CHECK ("nextSendRetryAt" IS NULL OR "failedSendAttemptCount" > 0); + +-- Constraint: sendAttemptErrors can only be set after at least one failed attempt +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure" + CHECK ("sendAttemptErrors" IS NULL OR "failedSendAttemptCount" > 0); + +-- Constraint: nextSendRetryAt must be null when email has finished sending +-- (if finishedSendingAt is set, there's nothing more to retry) +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_no_retry_after_finished" + CHECK ("finishedSendingAt" IS NULL OR "nextSendRetryAt" IS NULL); diff --git a/apps/backend/prisma/schema.prisma b/apps/backend/prisma/schema.prisma index 21b781e81a..7712ed8713 100644 --- a/apps/backend/prisma/schema.prisma +++ b/apps/backend/prisma/schema.prisma @@ -99,8 +99,8 @@ model ExternalDbSyncMetadata { singleton BooleanTrue @unique @default(TRUE) - sequencerEnabled Boolean @default(true) - pollerEnabled Boolean @default(true) + sequencerEnabled Boolean @default(true) + pollerEnabled Boolean @default(true) createdAt DateTime @default(now()) updatedAt DateTime @updatedAt @@ -844,6 +844,14 @@ model EmailOutbox { // if startedSendingAt is not set, then finishedSendingAt is also not set finishedSendingAt DateTime? + // Deferred retry fields for email sending + // Number of failed send attempts (starts at 0, incremented on each failure). Reset when email content is edited. + failedSendAttemptCount Int @default(0) + // When to retry sending (null = not waiting for retry). Set when a retryable error occurs. Reset when email content is edited. Must be null if failedSendAttemptCount is 0 (enforced by EmailOutbox_nextSendRetryAt_requires_failure). Must be null when finishedSendingAt is set (enforced by EmailOutbox_no_retry_after_finished). + nextSendRetryAt DateTime? + // JSON array of errors from each failed send attempt. Each entry has: { attemptNumber, timestamp, externalMessage, externalDetails, internalMessage, internalDetails }. Reset when email content is edited. Must be null if failedSendAttemptCount is 0 (enforced by EmailOutbox_sendAttemptErrors_requires_failure). + sendAttemptErrors Json? + // A generated column that is equal to finishedSendingAt if canHaveDeliveryInfo is false, otherwise deliveredAt. sentAt DateTime? @default(dbgenerated("\nCASE\n WHEN (\"canHaveDeliveryInfo\" IS TRUE) THEN \"deliveredAt\"\n WHEN (\"canHaveDeliveryInfo\" IS FALSE) THEN \"finishedSendingAt\"\n ELSE NULL::timestamp without time zone\nEND")) @@ -1088,7 +1096,7 @@ model OutgoingRequest { qstashOptions Json startedFulfillingAt DateTime? - deduplicationKey String? + deduplicationKey String? // Partial unique index on deduplicationKey WHERE startedFulfillingAt IS NULL // is created in a custom migration (not expressible in Prisma schema) diff --git a/packages/stack-shared/src/interface/crud/email-outbox.ts b/packages/stack-shared/src/interface/crud/email-outbox.ts index 3629f81b7e..ddc785bd83 100644 --- a/packages/stack-shared/src/interface/crud/email-outbox.ts +++ b/packages/stack-shared/src/interface/crud/email-outbox.ts @@ -40,6 +40,19 @@ const emailOutboxBaseSchema = fieldSchema.yupObject({ skip_deliverability_check: fieldSchema.yupBoolean().defined(), scheduled_at_millis: fieldSchema.yupNumber().defined(), + // Retry-related fields (for debugging/testing deferred retry logic) + failed_send_attempt_count: fieldSchema.yupNumber().defined(), + next_send_retry_at_millis: fieldSchema.yupNumber().nullable().defined(), + // Array of errors from each failed send attempt, each with internal/external messages + send_attempt_errors: fieldSchema.yupArray(fieldSchema.yupObject({ + attempt_number: fieldSchema.yupNumber().defined(), + timestamp: fieldSchema.yupString().defined(), + external_message: fieldSchema.yupString().defined(), + external_details: fieldSchema.yupRecord(fieldSchema.yupString(), fieldSchema.jsonSchema).defined(), + internal_message: fieldSchema.yupString().defined(), + internal_details: fieldSchema.yupRecord(fieldSchema.yupString(), fieldSchema.jsonSchema).defined(), + }).defined()).nullable().defined(), + status: fieldSchema.yupString().defined(), simple_status: fieldSchema.yupString().defined(), diff --git a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts index 8f9b331b21..9dd25fe9a4 100644 --- a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts +++ b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts @@ -733,6 +733,24 @@ export class _StackAdminAppImplIncomplete, + internal_message: string, + internal_details: Record, + }>).map((e) => ({ + attemptNumber: e.attempt_number, + timestamp: e.timestamp, + externalMessage: e.external_message, + externalDetails: e.external_details, + internalMessage: e.internal_message, + internalDetails: e.internal_details, + })) : null, }; // Rendered fields (available after rendering completes successfully) diff --git a/packages/template/src/lib/stack-app/email/index.ts b/packages/template/src/lib/stack-app/email/index.ts index 36a73e5ba8..c3cb3e8d51 100644 --- a/packages/template/src/lib/stack-app/email/index.ts +++ b/packages/template/src/lib/stack-app/email/index.ts @@ -36,6 +36,16 @@ export type AdminEmailOutboxSimpleStatus = | "ok" | "error"; +// Error entry from a failed send attempt +export type AdminSendAttemptError = { + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Record, + internalMessage: string, + internalDetails: Record, +}; + // =============================== BASE TYPES =============================== // Base fields present on all emails @@ -48,6 +58,10 @@ type AdminEmailOutboxBase = { isPaused: false, hasRendered: false, hasDelivered: false, + // Retry tracking fields + failedSendAttemptCount: number, + nextSendRetryAt: Date | null, + sendAttemptErrors: AdminSendAttemptError[] | null, }; // Fields available after rendering completes successfully diff --git a/packages/template/src/lib/stack-app/index.ts b/packages/template/src/lib/stack-app/index.ts index 87bf010a0f..01f9b11af1 100644 --- a/packages/template/src/lib/stack-app/index.ts +++ b/packages/template/src/lib/stack-app/index.ts @@ -56,6 +56,7 @@ export type { AdminEmailOutboxRecipient, AdminEmailOutboxSimpleStatus, AdminEmailOutboxStatus, + AdminSendAttemptError, AdminSentEmail } from "./email"; From 87b1c9c092ba6cee4ffeca36a17f37f92b34098a Mon Sep 17 00:00:00 2001 From: nams1570 Date: Wed, 11 Feb 2026 12:52:58 -0800 Subject: [PATCH 02/12] refactor: remove low-level email retry Spending too much time on retries in the low level email sending clogs the email-queue-step. This can lead to cascading delays if there is an outage with our sender provider. This will cause a vercel runtime failure. Removing the retry logic also obviates the need for separate funcs. --- apps/backend/src/lib/emails-low-level.tsx | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/lib/emails-low-level.tsx b/apps/backend/src/lib/emails-low-level.tsx index a91e31744c..a2d0e6240b 100644 --- a/apps/backend/src/lib/emails-low-level.tsx +++ b/apps/backend/src/lib/emails-low-level.tsx @@ -214,7 +214,23 @@ export async function lowLevelSendEmailDirectWithoutRetries(options: LowLevelSen canRetry: boolean, message?: string, }>> { - return await _lowLevelSendEmailWithoutRetries(options); + if (!options.to) { + throw new StackAssertionError("No recipient email address provided to sendEmail", omit(options, ['emailConfig'])); + } + + const result = await _lowLevelSendEmailWithoutRetries(options); + + if (result.status === 'error') { + console.warn("Failed to send email.", { + host: options.emailConfig.host, + from: options.emailConfig.senderEmail, + to: options.to, + subject: options.subject, + error: result.error, + }, result.error.rawError); + } + + return result; } // currently unused, although in the future we may want to use this to minimize the number of requests to Resend From ace7319471ab7b00489035ab64acf3579e224092 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Wed, 11 Feb 2026 13:10:36 -0800 Subject: [PATCH 03/12] refactor: nix unused low-level resend func The comment indicates this isn't used anywhere else. There are no uses of it in our codebase. --- apps/backend/src/lib/emails-low-level.tsx | 100 ---------------------- 1 file changed, 100 deletions(-) diff --git a/apps/backend/src/lib/emails-low-level.tsx b/apps/backend/src/lib/emails-low-level.tsx index a2d0e6240b..0183c28a70 100644 --- a/apps/backend/src/lib/emails-low-level.tsx +++ b/apps/backend/src/lib/emails-low-level.tsx @@ -10,8 +10,6 @@ import { runAsynchronously, wait } from '@stackframe/stack-shared/dist/utils/pro import { Result } from '@stackframe/stack-shared/dist/utils/results'; import { traceSpan } from '@stackframe/stack-shared/dist/utils/telemetry'; import nodemailer from 'nodemailer'; -import { Resend } from 'resend'; -import { getTenancy } from './tenancies'; export function isSecureEmailPort(port: number | string) { // "secure" in most SMTP clients means implicit TLS from byte 1 (SMTPS) @@ -232,101 +230,3 @@ export async function lowLevelSendEmailDirectWithoutRetries(options: LowLevelSen return result; } - -// currently unused, although in the future we may want to use this to minimize the number of requests to Resend -export async function lowLevelSendEmailResendBatchedDirect(resendApiKey: string, emailOptions: LowLevelSendEmailOptions[]) { - if (emailOptions.length === 0) { - return Result.ok([]); - } - if (emailOptions.length > 100) { - throw new StackAssertionError("sendEmailResendBatchedDirect expects at most 100 emails to be sent at once", { emailOptions }); - } - if (emailOptions.some(option => option.tenancyId !== emailOptions[0].tenancyId)) { - throw new StackAssertionError("sendEmailResendBatchedDirect expects all emails to be sent from the same tenancy", { emailOptions }); - } - const tenancy = await getTenancy(emailOptions[0].tenancyId); - if (!tenancy) { - throw new StackAssertionError("Tenancy not found"); - } - const resend = new Resend(resendApiKey); - const result = await Result.retry(async (_) => { - const { data, error } = await resend.batch.send(emailOptions.map((option) => ({ - from: option.emailConfig.senderEmail, - to: option.to, - subject: option.subject, - html: option.html ?? "", - text: option.text, - }))); - - if (data) { - return Result.ok(data.data); - } - if (error.name === "rate_limit_exceeded" || error.name === "internal_server_error") { - // these are the errors we want to retry - return Result.error(error); - } - throw new StackAssertionError("Failed to send email with Resend", { error }); - }, 3, { exponentialDelayBase: 2000 }); - - return result; -} - -export async function lowLevelSendEmailDirectViaProvider(options: LowLevelSendEmailOptions): Promise> { - if (!options.to) { - throw new StackAssertionError("No recipient email address provided to sendEmail", omit(options, ['emailConfig'])); - } - - class DoNotRetryError extends Error { - constructor(public readonly errorObj: { - rawError: any, - errorType: string, - canRetry: boolean, - message?: string, - }) { - super("This error should never be caught anywhere else but inside the lowLevelSendEmailDirectViaProvider function, something went wrong if you see this!"); - } - } - - let result; - try { - result = await Result.retry(async (attempt) => { - const result = await lowLevelSendEmailDirectWithoutRetries(options); - - if (result.status === 'error') { - const extraData = { - host: options.emailConfig.host, - from: options.emailConfig.senderEmail, - to: options.to, - subject: options.subject, - error: result.error, - }; - - if (result.error.canRetry) { - console.warn("Failed to send email, but error is possibly transient so retrying.", extraData, result.error.rawError); - return Result.error(result.error); - } - - console.warn("Failed to send email, and error is not transient, so not retrying.", extraData, result.error.rawError); - throw new DoNotRetryError(result.error); - } - - return result; - }, 9, { exponentialDelayBase: 125 }); - } catch (error) { - if (error instanceof DoNotRetryError) { - return Result.error(error.errorObj); - } - throw error; - } - - if (result.status === 'error') { - console.warn("Failed to send email after all retries!", result.error); - return Result.error(result.error.errors[0]); - } - return Result.ok(undefined); -} From 4897f62844627b15e7e2da8d1f15ae9727131cdf Mon Sep 17 00:00:00 2001 From: nams1570 Date: Wed, 11 Feb 2026 19:10:54 -0800 Subject: [PATCH 04/12] feat: rework email queue step retry logic Now we mark emails to be retried in one iteration, and retry them in the next iteration. --- .../src/app/api/latest/emails/outbox/crud.tsx | 29 ++ apps/backend/src/lib/email-queue-step.tsx | 149 ++++++-- .../api/v1/emails/email-queue.test.ts | 336 +++++++++++++++++- 3 files changed, 491 insertions(+), 23 deletions(-) diff --git a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx index a4aa26a916..42d76eec31 100644 --- a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx +++ b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx @@ -7,6 +7,7 @@ import { KnownErrors } from "@stackframe/stack-shared"; import { emailOutboxCrud, EmailOutboxCrud } from "@stackframe/stack-shared/dist/interface/crud/email-outbox"; import { yupObject, yupString } from "@stackframe/stack-shared/dist/schema-fields"; import { StackAssertionError, StatusError, throwErr } from "@stackframe/stack-shared/dist/utils/errors"; +import { Json } from "@stackframe/stack-shared/dist/utils/json"; import { createLazyProxy } from "@stackframe/stack-shared/dist/utils/proxies"; /** @@ -57,6 +58,25 @@ function prismaModelToCrud(prismaModel: EmailOutbox): EmailOutboxCrud["Server"][ to = { type: "custom-emails", emails: recipient?.emails ?? [] }; } + // Convert sendAttemptErrors from DB format (camelCase) to API format (snake_case) + const sendAttemptErrors = prismaModel.sendAttemptErrors + ? (prismaModel.sendAttemptErrors as Array<{ + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Record, + internalMessage: string, + internalDetails: Record, + }>).map(e => ({ + attempt_number: e.attemptNumber, + timestamp: e.timestamp, + external_message: e.externalMessage, + external_details: e.externalDetails, + internal_message: e.internalMessage, + internal_details: e.internalDetails, + })) + : null; + // Base fields present on all emails const base = { id: prismaModel.id, @@ -68,6 +88,9 @@ function prismaModelToCrud(prismaModel: EmailOutbox): EmailOutboxCrud["Server"][ variables: (prismaModel.extraRenderVariables ?? {}) as Record, skip_deliverability_check: prismaModel.shouldSkipDeliverabilityCheck, scheduled_at_millis: prismaModel.scheduledAt.getTime(), + failed_send_attempt_count: prismaModel.failedSendAttemptCount, + next_send_retry_at_millis: prismaModel.nextSendRetryAt?.getTime() ?? null, + send_attempt_errors: sendAttemptErrors, // Default flags (overridden in specific statuses) is_paused: false, has_rendered: false, @@ -395,6 +418,8 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( // If content changed, reset rendering and sending state if (needsRerenderReset) { set("isQueued", Prisma.sql`false`); + // Reset retry fields (failedSendAttemptCount to 0, others to null) + set("failedSendAttemptCount", Prisma.sql`0`); setNull( "renderedByWorkerId", "startedRenderingAt", "finishedRenderingAt", "renderErrorExternalMessage", "renderErrorExternalDetails", @@ -402,6 +427,7 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( "renderedHtml", "renderedText", "renderedSubject", "renderedIsTransactional", "renderedNotificationCategoryId", "startedSendingAt", "finishedSendingAt", + "nextSendRetryAt", "sendAttemptErrors", "sendServerErrorExternalMessage", "sendServerErrorExternalDetails", "sendServerErrorInternalMessage", "sendServerErrorInternalDetails", "skippedReason", "skippedDetails", "canHaveDeliveryInfo", @@ -494,6 +520,9 @@ function parseEmailOutboxFromJson(j: Record): EmailOutbox { scheduledAtIfNotYetQueued: dateOrNull("scheduledAtIfNotYetQueued"), startedSendingAt: dateOrNull("startedSendingAt"), finishedSendingAt: dateOrNull("finishedSendingAt"), + failedSendAttemptCount: j.failedSendAttemptCount as number, + nextSendRetryAt: dateOrNull("nextSendRetryAt"), + sendAttemptErrors: j.sendAttemptErrors as Prisma.JsonValue, sentAt: dateOrNull("sentAt"), sendServerErrorExternalMessage: j.sendServerErrorExternalMessage as string | null, sendServerErrorExternalDetails: j.sendServerErrorExternalDetails as Prisma.JsonValue, diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 0d72ea2e1f..e82b22d7a9 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -15,10 +15,40 @@ import { filterUndefined } from "@stackframe/stack-shared/dist/utils/objects"; import { Result } from "@stackframe/stack-shared/dist/utils/results"; import { traceSpan } from "@stackframe/stack-shared/dist/utils/telemetry"; import { randomUUID } from "node:crypto"; -import { lowLevelSendEmailDirectViaProvider } from "./emails-low-level"; +import { lowLevelSendEmailDirectWithoutRetries } from "./emails-low-level"; const MAX_RENDER_BATCH = 50; +const MAX_SEND_ATTEMPTS = 3; + +const RETRY_BACKOFF_BASE_MS = 2000; + +const calculateRetryBackoffMs = (attemptCount: number): number => { + return (Math.random() + 0.5) * RETRY_BACKOFF_BASE_MS * Math.pow(2, attemptCount); +}; + +/** + * Structure for tracking errors from each send attempt. + * Mirrors the pattern used for sendServerError* fields. + * Uses Prisma.InputJsonValue-compatible types for DB storage. + */ +type SendAttemptError = { + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Prisma.InputJsonObject, + internalMessage: string, + internalDetails: Prisma.InputJsonObject, +}; + +const appendSendAttemptError =( + existingErrors: SendAttemptError[] | null | undefined, + newError: SendAttemptError +): SendAttemptError[] => { + const errors = existingErrors ?? []; + return [...errors, newError]; +}; + // Track if email queue has run at least once since server start (used to suppress first-run delta warnings in dev) const emailQueueFirstRunKey = Symbol.for("__stack_email_queue_first_run_completed"); @@ -480,6 +510,7 @@ async function queueReadyEmails(): Promise<{ queuedCount: number }> { AND "finishedRenderingAt" IS NOT NULL AND "renderedHtml" IS NOT NULL AND "scheduledAt" <= NOW() + AND "nextSendRetryAt" IS NULL RETURNING "id"; `; return { @@ -488,11 +519,16 @@ async function queueReadyEmails(): Promise<{ queuedCount: number }> { } async function prepareSendPlan(deltaSeconds: number): Promise { + // Find tenancies with emails ready to send (either new emails or emails ready for retry) const tenancyIds = await globalPrismaClient.emailOutbox.findMany({ where: { - isQueued: true, isPaused: false, - startedSendingAt: null, + OR: [ + // Normal case: queued, not started, and no pending retry + { startedSendingAt: null, isQueued: true, nextSendRetryAt: null }, + // Retry case: past retry time + { nextSendRetryAt: { lte: new Date() } }, + ], }, distinct: ["tenancyId"], select: { tenancyId: true }, @@ -523,16 +559,23 @@ async function claimEmailsForSending(tx: PrismaClientTransaction, tenancyId: str SELECT "tenancyId", "id" FROM "EmailOutbox" WHERE "tenancyId" = ${tenancyId}::uuid - AND "isQueued" = TRUE AND "isPaused" = FALSE AND "finishedRenderingAt" IS NOT NULL - AND "startedSendingAt" IS NULL + AND ( + -- Normal case: queued, not started, and no pending retry + ("startedSendingAt" IS NULL AND "isQueued" = TRUE AND "nextSendRetryAt" IS NULL) + OR + -- Retry case: past retry time + ("nextSendRetryAt" IS NOT NULL AND "nextSendRetryAt" <= NOW()) + ) ORDER BY "priority" DESC, "scheduledAt" ASC, "createdAt" ASC LIMIT ${limit} FOR UPDATE SKIP LOCKED ) UPDATE "EmailOutbox" AS e - SET "startedSendingAt" = NOW() + SET + "startedSendingAt" = NOW(), + "nextSendRetryAt" = NULL FROM selected WHERE e."tenancyId" = selected."tenancyId" AND e."id" = selected."id" RETURNING e.*; @@ -640,7 +683,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO const result = getEnvBoolean("STACK_EMAIL_BRANCHING_DISABLE_QUEUE_SENDING") ? Result.error({ errorType: "email-sending-disabled", canRetry: false, message: "Email sending is disabled", rawError: new Error("Email sending is disabled") }) - : await lowLevelSendEmailDirectViaProvider({ + : await lowLevelSendEmailDirectWithoutRetries({ tenancyId: context.tenancy.id, emailConfig: context.emailConfig, to: resolution.emails, @@ -650,24 +693,86 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO }); if (result.status === "error") { - await globalPrismaClient.emailOutbox.update({ - where: { - tenancyId_id: { + const newAttemptCount = row.failedSendAttemptCount + 1; + const canRetry = result.error.canRetry && newAttemptCount < MAX_SEND_ATTEMPTS; + + // Build error entry for this attempt + const errorEntry: SendAttemptError = { + attemptNumber: newAttemptCount, + timestamp: new Date().toISOString(), + externalMessage: result.error.message ?? result.error.errorType, + externalDetails: { errorType: result.error.errorType }, + internalMessage: result.error.message ?? result.error.errorType, + internalDetails: { rawError: errorToNiceString(result.error.rawError), errorType: result.error.errorType }, + }; + const updatedErrors = appendSendAttemptError(row.sendAttemptErrors as SendAttemptError[] | null, errorEntry); + + if (canRetry) { + // Schedule retry: unclaim the email and set nextSendRetryAt + const backoffMs = calculateRetryBackoffMs(newAttemptCount); + await globalPrismaClient.emailOutbox.update({ + where: { + tenancyId_id: { + tenancyId: row.tenancyId, + id: row.id, + }, + finishedSendingAt: null, + }, + data: { + startedSendingAt: null, // Unclaim the email + isQueued: false, // Prevent normal queue path from picking it up + failedSendAttemptCount: newAttemptCount, + nextSendRetryAt: new Date(Date.now() + backoffMs), + sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, + }, + }); + } else { + // Mark as permanent failure - distinguish between "attempts exhausted" and "permanent error from provider" + const isAttemptsExhausted = result.error.canRetry && newAttemptCount >= MAX_SEND_ATTEMPTS; + const failureReason = isAttemptsExhausted ? "attempts_exhausted" : "permanent_error"; + + if (isAttemptsExhausted) { + captureError("email-queue-step-retries-exhausted", new StackAssertionError(`Email failed after ${newAttemptCount} attempts`, { + emailId: row.id, tenancyId: row.tenancyId, - id: row.id, + errorType: result.error.errorType, + errorMessage: result.error.message, + allAttemptErrors: updatedErrors, + })); + } + + const externalMessage = isAttemptsExhausted + ? "Email could not be delivered after multiple attempts. Please verify your email configuration and try again." + : result.error.message; + + await globalPrismaClient.emailOutbox.update({ + where: { + tenancyId_id: { + tenancyId: row.tenancyId, + id: row.id, + }, + finishedSendingAt: null, }, - finishedSendingAt: null, - }, - data: { - finishedSendingAt: new Date(), - canHaveDeliveryInfo: false, - sendServerErrorExternalMessage: result.error.message, - sendServerErrorExternalDetails: { errorType: result.error.errorType }, - sendServerErrorInternalMessage: result.error.message, - sendServerErrorInternalDetails: { rawError: errorToNiceString(result.error.rawError), errorType: result.error.errorType }, - }, - }); + data: { + finishedSendingAt: new Date(), + canHaveDeliveryInfo: false, + failedSendAttemptCount: newAttemptCount, + sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, + sendServerErrorExternalMessage: externalMessage, + sendServerErrorExternalDetails: { errorType: result.error.errorType }, + sendServerErrorInternalMessage: result.error.message, + sendServerErrorInternalDetails: { + rawError: errorToNiceString(result.error.rawError), + errorType: result.error.errorType, + attemptCount: newAttemptCount, + failureReason, + allAttemptErrors: updatedErrors as Json[], + }, + }, + }); + } } else { + // Success - mark as sent (don't increment failedSendAttemptCount since this wasn't a failure) await globalPrismaClient.emailOutbox.update({ where: { tenancyId_id: { diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts index 2bb050fd4f..adf1d8f2fa 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts @@ -1,7 +1,8 @@ import { wait } from "@stackframe/stack-shared/dist/utils/promises"; import { deindent, nicify } from "@stackframe/stack-shared/dist/utils/strings"; import beautify from "js-beautify"; -import { describe } from "vitest"; +import * as net from "net"; +import { afterAll, beforeAll, describe } from "vitest"; import { it, logIfTestFails } from "../../../../../helpers"; import { withPortPrefix } from "../../../../../helpers/ports"; import { Auth, Project, User, backendContext, bumpEmailAddress, getOutboxEmails, niceBackendFetch, waitForOutboxEmailWithStatus } from "../../../../backend-helpers"; @@ -412,6 +413,7 @@ describe("send email to all users", () => { "can_have_delivery_info": false, "created_at_millis": , "delivered_at_millis": , + "failed_send_attempt_count": 0, "has_delivered": true, "has_rendered": true, "html": "

All users test

", @@ -419,9 +421,11 @@ describe("send email to all users", () => { "is_high_priority": false, "is_paused": false, "is_transactional": true, + "next_send_retry_at_millis": null, "notification_category_id": "", "rendered_at_millis": , "scheduled_at_millis": , + "send_attempt_errors": null, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -449,6 +453,7 @@ describe("send email to all users", () => { "can_have_delivery_info": false, "created_at_millis": , "delivered_at_millis": , + "failed_send_attempt_count": 0, "has_delivered": true, "has_rendered": true, "html": "

All users test

", @@ -456,9 +461,11 @@ describe("send email to all users", () => { "is_high_priority": false, "is_paused": false, "is_transactional": true, + "next_send_retry_at_millis": null, "notification_category_id": "", "rendered_at_millis": , "scheduled_at_millis": , + "send_attempt_errors": null, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -1846,3 +1853,330 @@ describe("email outbox pagination", () => { }, 60_000); }); +// Invalid SMTP config - causes HOST_NOT_FOUND (non-retryable error) +const brokenSmtpConfig = { + type: "standard", + host: "this-host-does-not-exist.invalid", + port: 25, + username: "test", + password: "test", + sender_name: "Test Project", + sender_email: "test@example.com", +} as const; + +// SMTP server that responds with 450 (temporary failure) - retryable and fast +let tempFailServer: net.Server | null = null; +let tempFailPort: number | null = null; + +async function startTempFailSmtpServer(): Promise { + if (tempFailServer) { + return tempFailPort!; + } + + return await new Promise((resolve, reject) => { + tempFailServer = net.createServer((socket) => { + // Send SMTP greeting + socket.write('220 localhost SMTP Test Server\r\n'); + + socket.on('data', (data) => { + const command = data.toString().trim().toUpperCase(); + // Respond with 450 (temporary failure) to all commands + // This is a retryable error that happens immediately + if (command.startsWith('EHLO') || command.startsWith('HELO')) { + socket.write('250 localhost Hello\r\n'); + } else if (command.startsWith('MAIL FROM')) { + // Temporary failure - "mailbox unavailable, try again later" + socket.write('450 Requested mail action not taken: mailbox unavailable (test)\r\n'); + socket.end(); + } else if (command.startsWith('QUIT')) { + socket.write('221 Bye\r\n'); + socket.end(); + } else { + socket.write('450 Temporary failure (test)\r\n'); + } + }); + + socket.on('error', () => { + // Ignore errors (client may disconnect) + }); + }); + + tempFailServer.listen(0, '127.0.0.1', () => { + const address = tempFailServer!.address(); + if (typeof address === 'object' && address !== null) { + tempFailPort = address.port; + resolve(tempFailPort); + } else { + reject(new Error('Failed to get server address')); + } + }); + + tempFailServer.on('error', reject); + }); +} + +function stopTempFailSmtpServer(): void { + if (tempFailServer) { + tempFailServer.close(); + tempFailServer = null; + tempFailPort = null; + } +} + +// Factory function to create temp-fail SMTP config with dynamic port +function createTempFailSmtpConfig(port: number) { + return { + type: "standard", + host: "127.0.0.1", + port: port, + username: "test", + password: "test", + sender_name: "Test Project", + sender_email: "test@example.com", + } as const; +} + +// Helper type for send attempt error entries +type SendAttemptErrorEntry = { + attempt_number: number, + timestamp: string, + external_message: string, + external_details: Record, + internal_message: string, + internal_details: Record, +}; + +// Helper type for email outbox items with retry fields +type OutboxEmailWithRetryFields = OutboxEmail & { + failed_send_attempt_count: number, + next_send_retry_at_millis: number | null, + send_attempt_errors: SendAttemptErrorEntry[] | null, +}; + +// Helper to get detailed email from the outbox +async function getOutboxEmailById(emailId: string): Promise { + const response = await niceBackendFetch(`/api/v1/emails/outbox/${emailId}`, { + method: "GET", + accessType: "server", + }); + return response.body; +} + +// Helper to poll until an email with the given subject appears in the outbox +async function waitForOutboxEmail(subject: string, timeoutMs = 30000): Promise { + const startTime = Date.now(); + while (Date.now() - startTime < timeoutMs) { + const emails = await getOutboxEmails({ subject }); + if (emails.length > 0) { + return await getOutboxEmailById(emails[0].id); + } + await wait(500); + } + throw new StackAssertionError( + `Timeout waiting for email with subject "${subject}" to appear in outbox`, + { subject } + ); +} + +// Helper to poll until the email has reached a specific failed_send_attempt_count +// Note: Status may be "queued" or "sending" due to race conditions - that's expected +async function waitForAttemptCount(emailId: string, attemptCount: number, timeoutMs = 60000): Promise { + const startTime = Date.now(); + while (Date.now() - startTime < timeoutMs) { + const email = await getOutboxEmailById(emailId); + if (email.failed_send_attempt_count >= attemptCount) { + return email; + } + // Terminal state - no more retries will happen + if (email.status === "server-error") { + return email; + } + await wait(500); + } + const finalEmail = await getOutboxEmailById(emailId); + throw new StackAssertionError( + `Timeout waiting for email ${emailId} to reach failed_send_attempt_count >= ${attemptCount}`, + { emailId, attemptCount, finalState: { count: finalEmail.failed_send_attempt_count, status: finalEmail.status } } + ); +} + +describe("email queue deferred retry logic", () => { + it("should immediately mark non-retryable errors as server-error without retrying", async ({ expect }) => { + // brokenSmtpConfig causes HOST_NOT_FOUND which has canRetry: false + await Project.createAndSwitch({ + display_name: "Test Non-Retryable Error Project", + config: { + email_config: brokenSmtpConfig, + }, + }); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for non-retryable error", + subject: "Non-Retryable Error Test", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox and reach server-error state + const initialEmail = await waitForOutboxEmail("Non-Retryable Error Test"); + const emailId = initialEmail.id; + + // Wait for the email to reach server-error status + const maxWaitMs = 30000; + const startTime = Date.now(); + let email = initialEmail; + while (Date.now() - startTime < maxWaitMs && email.status !== "server-error") { + await wait(500); + email = await getOutboxEmailById(emailId); + } + + // Non-retryable errors should go directly to server-error + expect(email.failed_send_attempt_count).toBe(1); + expect(email.next_send_retry_at_millis).toBeNull(); + expect(email.send_attempt_errors).not.toBeNull(); + expect(email.send_attempt_errors?.length).toBe(1); + expect(email.status).toBe("server-error"); + + logIfTestFails("Email after non-retryable error", email); + }); + + describe("retryable errors (using temp-fail SMTP server)", () => { + // These tests use a local SMTP server that responds with 450 (temporary failure). + // This is fast (immediate response) and produces retryable errors. + + let tempFailSmtpConfig: ReturnType; + + beforeAll(async () => { + const port = await startTempFailSmtpServer(); + tempFailSmtpConfig = createTempFailSmtpConfig(port); + }); + + afterAll(() => { + stopTempFailSmtpServer(); + }); + + it("should schedule retry on retryable failure and release email for next iteration", { timeout: 60000 }, async ({ expect }) => { + await Project.createAndSwitch({ + display_name: "Test Deferred Retry Project", + config: { + email_config: tempFailSmtpConfig, + }, + }); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for retry logic", + subject: "Retry Test Email", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox + const initialEmail = await waitForOutboxEmail("Retry Test Email"); + const emailId = initialEmail.id; + + // Wait for first send attempt to complete (450 response is immediate) + const emailAfterFirstAttempt = await waitForAttemptCount(emailId, 1, 30000); + + // Verify the email was released for a DIFFERENT queue iteration to pick up + // - status should NOT be server-error (retries remaining) + expect(emailAfterFirstAttempt.failed_send_attempt_count).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors).not.toBeNull(); + expect(emailAfterFirstAttempt.send_attempt_errors?.length).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors?.[0].attempt_number).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors?.[0].external_message).toContain("450"); + expect(emailAfterFirstAttempt.status).not.toBe("server-error"); + + // Status should be "scheduled" (isQueued=false, waiting for nextSendRetryAt) or "sending" (next iteration already picked it up) + expect(["scheduled", "sending"]).toContain(emailAfterFirstAttempt.status); + + logIfTestFails("Email after first retry attempt", emailAfterFirstAttempt); + }); + + it("should retry emails until max attempts exhausted, then mark as server-error", { timeout: 90000 }, async ({ expect }) => { + await Project.createAndSwitch({ + display_name: "Test Retry Exhaustion Project", + config: { + email_config: tempFailSmtpConfig, + }, + }); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for retry exhaustion", + subject: "Retry Exhaustion Test", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox + const initialEmail = await waitForOutboxEmail("Retry Exhaustion Test"); + const emailId = initialEmail.id; + + // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 3) + // With 450 errors (immediate) + exponential backoff, this should complete in ~30s + const maxWaitMs = 60000; + const startTime = Date.now(); + let email = await getOutboxEmailById(emailId); + while (Date.now() - startTime < maxWaitMs && email.status !== "server-error") { + await wait(1000); + email = await getOutboxEmailById(emailId); + } + + expect(email.status).toBe("server-error"); + expect(email.failed_send_attempt_count).toBe(3); // MAX_SEND_ATTEMPTS + expect(email.send_attempt_errors?.length).toBe(3); + // No more retries scheduled + expect(email.next_send_retry_at_millis).toBeNull(); + + logIfTestFails("Email after all retries exhausted", email); + }); + }); +}); + From a9bebf222031b6c8dfabe603fa86fb9a2e3f2475 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Wed, 11 Feb 2026 19:51:28 -0800 Subject: [PATCH 05/12] fix: skipped, cancelled emails cant be retried now --- apps/backend/src/app/api/latest/emails/outbox/crud.tsx | 1 + apps/backend/src/lib/email-queue-step.tsx | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx index 42d76eec31..3b61c65a5d 100644 --- a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx +++ b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx @@ -381,6 +381,7 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( // Cancel action - mark as skipped set("isPaused", Prisma.sql`false`); set("isQueued", Prisma.sql`false`); + setNull("nextSendRetryAt"); // Clear any pending retry so it won't be picked up set("skippedReason", Prisma.sql`'MANUALLY_CANCELLED'::"EmailOutboxSkippedReason"`); set("skippedDetails", Prisma.sql`'{}'::jsonb`); } else { diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index e82b22d7a9..01a8b04951 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -523,6 +523,8 @@ async function prepareSendPlan(deltaSeconds: number): Promise Date: Wed, 11 Feb 2026 20:34:36 -0800 Subject: [PATCH 06/12] chore: rename failedSendAttempts and backoff number --- .../migration.sql | 8 +++---- apps/backend/prisma/schema.prisma | 12 +++++----- .../src/app/api/latest/emails/outbox/crud.tsx | 8 +++---- apps/backend/src/lib/email-queue-step.tsx | 12 +++++----- .../api/v1/emails/email-queue.test.ts | 24 +++++++++---------- .../api/v1/emails/outbox-api.test.ts | 3 +++ .../src/interface/crud/email-outbox.ts | 2 +- .../apps/implementations/admin-app-impl.ts | 2 +- .../template/src/lib/stack-app/email/index.ts | 2 +- 9 files changed, 38 insertions(+), 35 deletions(-) diff --git a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql index b8f3adc729..46243b8b79 100644 --- a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql +++ b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql @@ -3,20 +3,20 @@ -- instead of blocking the current iteration with inline retries. ALTER TABLE "EmailOutbox" - ADD COLUMN "failedSendAttemptCount" INTEGER NOT NULL DEFAULT 0, + ADD COLUMN "sendRetries" INTEGER NOT NULL DEFAULT 0, ADD COLUMN "nextSendRetryAt" TIMESTAMP(3), ADD COLUMN "sendAttemptErrors" JSONB; -- Constraint: nextSendRetryAt can only be set after at least one failed attempt --- (if failedSendAttemptCount is 0, no attempt has failed, so there's nothing to retry) +-- (if sendRetries is 0, no attempt has failed, so there's nothing to retry) ALTER TABLE "EmailOutbox" ADD CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure" - CHECK ("nextSendRetryAt" IS NULL OR "failedSendAttemptCount" > 0); + CHECK ("nextSendRetryAt" IS NULL OR "sendRetries" > 0); -- Constraint: sendAttemptErrors can only be set after at least one failed attempt ALTER TABLE "EmailOutbox" ADD CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure" - CHECK ("sendAttemptErrors" IS NULL OR "failedSendAttemptCount" > 0); + CHECK ("sendAttemptErrors" IS NULL OR "sendRetries" > 0); -- Constraint: nextSendRetryAt must be null when email has finished sending -- (if finishedSendingAt is set, there's nothing more to retry) diff --git a/apps/backend/prisma/schema.prisma b/apps/backend/prisma/schema.prisma index 7712ed8713..6d608df609 100644 --- a/apps/backend/prisma/schema.prisma +++ b/apps/backend/prisma/schema.prisma @@ -845,12 +845,12 @@ model EmailOutbox { finishedSendingAt DateTime? // Deferred retry fields for email sending - // Number of failed send attempts (starts at 0, incremented on each failure). Reset when email content is edited. - failedSendAttemptCount Int @default(0) - // When to retry sending (null = not waiting for retry). Set when a retryable error occurs. Reset when email content is edited. Must be null if failedSendAttemptCount is 0 (enforced by EmailOutbox_nextSendRetryAt_requires_failure). Must be null when finishedSendingAt is set (enforced by EmailOutbox_no_retry_after_finished). - nextSendRetryAt DateTime? - // JSON array of errors from each failed send attempt. Each entry has: { attemptNumber, timestamp, externalMessage, externalDetails, internalMessage, internalDetails }. Reset when email content is edited. Must be null if failedSendAttemptCount is 0 (enforced by EmailOutbox_sendAttemptErrors_requires_failure). - sendAttemptErrors Json? + // Number of send retries attempted (starts at 0, incremented on each failure). Reset when email content is edited. + sendRetries Int @default(0) + // When to retry sending (null = not waiting for retry). Set when a retryable error occurs. Reset when email content is edited. Must be null if sendRetries is 0 (enforced by EmailOutbox_nextSendRetryAt_requires_failure). Must be null when finishedSendingAt is set (enforced by EmailOutbox_no_retry_after_finished). + nextSendRetryAt DateTime? + // JSON array of errors from each failed send attempt. Each entry has: { attemptNumber, timestamp, externalMessage, externalDetails, internalMessage, internalDetails }. Reset when email content is edited. Must be null if sendRetries is 0 (enforced by EmailOutbox_sendAttemptErrors_requires_failure). + sendAttemptErrors Json? // A generated column that is equal to finishedSendingAt if canHaveDeliveryInfo is false, otherwise deliveredAt. sentAt DateTime? @default(dbgenerated("\nCASE\n WHEN (\"canHaveDeliveryInfo\" IS TRUE) THEN \"deliveredAt\"\n WHEN (\"canHaveDeliveryInfo\" IS FALSE) THEN \"finishedSendingAt\"\n ELSE NULL::timestamp without time zone\nEND")) diff --git a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx index 3b61c65a5d..6ecf5ad789 100644 --- a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx +++ b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx @@ -88,7 +88,7 @@ function prismaModelToCrud(prismaModel: EmailOutbox): EmailOutboxCrud["Server"][ variables: (prismaModel.extraRenderVariables ?? {}) as Record, skip_deliverability_check: prismaModel.shouldSkipDeliverabilityCheck, scheduled_at_millis: prismaModel.scheduledAt.getTime(), - failed_send_attempt_count: prismaModel.failedSendAttemptCount, + send_retries: prismaModel.sendRetries, next_send_retry_at_millis: prismaModel.nextSendRetryAt?.getTime() ?? null, send_attempt_errors: sendAttemptErrors, // Default flags (overridden in specific statuses) @@ -419,8 +419,8 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( // If content changed, reset rendering and sending state if (needsRerenderReset) { set("isQueued", Prisma.sql`false`); - // Reset retry fields (failedSendAttemptCount to 0, others to null) - set("failedSendAttemptCount", Prisma.sql`0`); + // Reset retry fields (sendRetries to 0, others to null) + set("sendRetries", Prisma.sql`0`); setNull( "renderedByWorkerId", "startedRenderingAt", "finishedRenderingAt", "renderErrorExternalMessage", "renderErrorExternalDetails", @@ -521,7 +521,7 @@ function parseEmailOutboxFromJson(j: Record): EmailOutbox { scheduledAtIfNotYetQueued: dateOrNull("scheduledAtIfNotYetQueued"), startedSendingAt: dateOrNull("startedSendingAt"), finishedSendingAt: dateOrNull("finishedSendingAt"), - failedSendAttemptCount: j.failedSendAttemptCount as number, + sendRetries: j.sendRetries as number, nextSendRetryAt: dateOrNull("nextSendRetryAt"), sendAttemptErrors: j.sendAttemptErrors as Prisma.JsonValue, sentAt: dateOrNull("sentAt"), diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 01a8b04951..cb84901be0 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -21,10 +21,10 @@ const MAX_RENDER_BATCH = 50; const MAX_SEND_ATTEMPTS = 3; -const RETRY_BACKOFF_BASE_MS = 2000; +const SEND_RETRY_BACKOFF_BASE_MS = 2000; const calculateRetryBackoffMs = (attemptCount: number): number => { - return (Math.random() + 0.5) * RETRY_BACKOFF_BASE_MS * Math.pow(2, attemptCount); + return (Math.random() + 0.5) * SEND_RETRY_BACKOFF_BASE_MS * Math.pow(2, attemptCount); }; /** @@ -697,7 +697,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO }); if (result.status === "error") { - const newAttemptCount = row.failedSendAttemptCount + 1; + const newAttemptCount = row.sendRetries + 1; const canRetry = result.error.canRetry && newAttemptCount < MAX_SEND_ATTEMPTS; // Build error entry for this attempt @@ -725,7 +725,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO data: { startedSendingAt: null, // Unclaim the email isQueued: false, // Prevent normal queue path from picking it up - failedSendAttemptCount: newAttemptCount, + sendRetries: newAttemptCount, nextSendRetryAt: new Date(Date.now() + backoffMs), sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, }, @@ -760,7 +760,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO data: { finishedSendingAt: new Date(), canHaveDeliveryInfo: false, - failedSendAttemptCount: newAttemptCount, + sendRetries: newAttemptCount, sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, sendServerErrorExternalMessage: externalMessage, sendServerErrorExternalDetails: { errorType: result.error.errorType }, @@ -776,7 +776,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO }); } } else { - // Success - mark as sent (don't increment failedSendAttemptCount since this wasn't a failure) + // Success - mark as sent (don't increment sendRetries since this wasn't a failure) await globalPrismaClient.emailOutbox.update({ where: { tenancyId_id: { diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts index adf1d8f2fa..aa92f66541 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts @@ -413,7 +413,6 @@ describe("send email to all users", () => { "can_have_delivery_info": false, "created_at_millis": , "delivered_at_millis": , - "failed_send_attempt_count": 0, "has_delivered": true, "has_rendered": true, "html": "

All users test

", @@ -426,6 +425,7 @@ describe("send email to all users", () => { "rendered_at_millis": , "scheduled_at_millis": , "send_attempt_errors": null, + "send_retries": 0, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -453,7 +453,6 @@ describe("send email to all users", () => { "can_have_delivery_info": false, "created_at_millis": , "delivered_at_millis": , - "failed_send_attempt_count": 0, "has_delivered": true, "has_rendered": true, "html": "

All users test

", @@ -466,6 +465,7 @@ describe("send email to all users", () => { "rendered_at_millis": , "scheduled_at_millis": , "send_attempt_errors": null, + "send_retries": 0, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -1948,7 +1948,7 @@ type SendAttemptErrorEntry = { // Helper type for email outbox items with retry fields type OutboxEmailWithRetryFields = OutboxEmail & { - failed_send_attempt_count: number, + send_retries: number, next_send_retry_at_millis: number | null, send_attempt_errors: SendAttemptErrorEntry[] | null, }; @@ -1978,13 +1978,13 @@ async function waitForOutboxEmail(subject: string, timeoutMs = 30000): Promise { const startTime = Date.now(); while (Date.now() - startTime < timeoutMs) { const email = await getOutboxEmailById(emailId); - if (email.failed_send_attempt_count >= attemptCount) { + if (email.send_retries >= attemptCount) { return email; } // Terminal state - no more retries will happen @@ -1995,8 +1995,8 @@ async function waitForAttemptCount(emailId: string, attemptCount: number, timeou } const finalEmail = await getOutboxEmailById(emailId); throw new StackAssertionError( - `Timeout waiting for email ${emailId} to reach failed_send_attempt_count >= ${attemptCount}`, - { emailId, attemptCount, finalState: { count: finalEmail.failed_send_attempt_count, status: finalEmail.status } } + `Timeout waiting for email ${emailId} to reach send_retries >= ${attemptCount}`, + { emailId, attemptCount, finalState: { count: finalEmail.send_retries, status: finalEmail.status } } ); } @@ -2047,7 +2047,7 @@ describe("email queue deferred retry logic", () => { } // Non-retryable errors should go directly to server-error - expect(email.failed_send_attempt_count).toBe(1); + expect(email.send_retries).toBe(1); expect(email.next_send_retry_at_millis).toBeNull(); expect(email.send_attempt_errors).not.toBeNull(); expect(email.send_attempt_errors?.length).toBe(1); @@ -2111,7 +2111,7 @@ describe("email queue deferred retry logic", () => { // Verify the email was released for a DIFFERENT queue iteration to pick up // - status should NOT be server-error (retries remaining) - expect(emailAfterFirstAttempt.failed_send_attempt_count).toBe(1); + expect(emailAfterFirstAttempt.send_retries).toBe(1); expect(emailAfterFirstAttempt.send_attempt_errors).not.toBeNull(); expect(emailAfterFirstAttempt.send_attempt_errors?.length).toBe(1); expect(emailAfterFirstAttempt.send_attempt_errors?.[0].attempt_number).toBe(1); @@ -2159,7 +2159,7 @@ describe("email queue deferred retry logic", () => { const initialEmail = await waitForOutboxEmail("Retry Exhaustion Test"); const emailId = initialEmail.id; - // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 3) + // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 5) // With 450 errors (immediate) + exponential backoff, this should complete in ~30s const maxWaitMs = 60000; const startTime = Date.now(); @@ -2170,8 +2170,8 @@ describe("email queue deferred retry logic", () => { } expect(email.status).toBe("server-error"); - expect(email.failed_send_attempt_count).toBe(3); // MAX_SEND_ATTEMPTS - expect(email.send_attempt_errors?.length).toBe(3); + expect(email.send_retries).toBe(5); // MAX_SEND_ATTEMPTS + expect(email.send_attempt_errors?.length).toBe(5); // No more retries scheduled expect(email.next_send_retry_at_millis).toBeNull(); diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts index ff31fa42db..503086930a 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts @@ -630,7 +630,10 @@ describe("email outbox API", () => { "has_rendered": false, "id": "", "is_paused": true, + "next_send_retry_at_millis": null, "scheduled_at_millis": , + "send_attempt_errors": null, + "send_retries": 0, "simple_status": "in-progress", "skip_deliverability_check": false, "status": "paused", diff --git a/packages/stack-shared/src/interface/crud/email-outbox.ts b/packages/stack-shared/src/interface/crud/email-outbox.ts index ddc785bd83..9759289cd0 100644 --- a/packages/stack-shared/src/interface/crud/email-outbox.ts +++ b/packages/stack-shared/src/interface/crud/email-outbox.ts @@ -41,7 +41,7 @@ const emailOutboxBaseSchema = fieldSchema.yupObject({ scheduled_at_millis: fieldSchema.yupNumber().defined(), // Retry-related fields (for debugging/testing deferred retry logic) - failed_send_attempt_count: fieldSchema.yupNumber().defined(), + send_retries: fieldSchema.yupNumber().defined(), next_send_retry_at_millis: fieldSchema.yupNumber().nullable().defined(), // Array of errors from each failed send attempt, each with internal/external messages send_attempt_errors: fieldSchema.yupArray(fieldSchema.yupObject({ diff --git a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts index 9dd25fe9a4..d7b0d41a93 100644 --- a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts +++ b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts @@ -734,7 +734,7 @@ export class _StackAdminAppImplIncomplete Date: Thu, 12 Feb 2026 10:26:34 -0800 Subject: [PATCH 07/12] chore: bump number of retries --- apps/backend/src/lib/email-queue-step.tsx | 2 +- apps/e2e/tests/backend/backend-helpers.ts | 2 +- .../api/v1/emails/email-queue.test.ts | 21 +++++++++++++------ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index cb84901be0..4d2118a778 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -19,7 +19,7 @@ import { lowLevelSendEmailDirectWithoutRetries } from "./emails-low-level"; const MAX_RENDER_BATCH = 50; -const MAX_SEND_ATTEMPTS = 3; +const MAX_SEND_ATTEMPTS = 5; const SEND_RETRY_BACKOFF_BASE_MS = 2000; diff --git a/apps/e2e/tests/backend/backend-helpers.ts b/apps/e2e/tests/backend/backend-helpers.ts index fc5d260dc5..cf87bad045 100644 --- a/apps/e2e/tests/backend/backend-helpers.ts +++ b/apps/e2e/tests/backend/backend-helpers.ts @@ -187,7 +187,7 @@ export async function bumpEmailAddress(options: { unindexed?: boolean } = {}) { } // Type for outbox email items (simplified - full type is EmailOutboxCrud["Server"]["Read"]) -type OutboxEmail = { +export type OutboxEmail = { id: string, subject?: string, status: string, diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts index aa92f66541..24d8d4aabf 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts @@ -1,3 +1,4 @@ +import { StackAssertionError } from "@stackframe/stack-shared/dist/utils/errors"; import { wait } from "@stackframe/stack-shared/dist/utils/promises"; import { deindent, nicify } from "@stackframe/stack-shared/dist/utils/strings"; import beautify from "js-beautify"; @@ -5,7 +6,7 @@ import * as net from "net"; import { afterAll, beforeAll, describe } from "vitest"; import { it, logIfTestFails } from "../../../../../helpers"; import { withPortPrefix } from "../../../../../helpers/ports"; -import { Auth, Project, User, backendContext, bumpEmailAddress, getOutboxEmails, niceBackendFetch, waitForOutboxEmailWithStatus } from "../../../../backend-helpers"; +import { Auth, InternalApiKey, OutboxEmail, Project, User, backendContext, bumpEmailAddress, getOutboxEmails, niceBackendFetch, waitForOutboxEmailWithStatus } from "../../../../backend-helpers"; const testEmailConfig = { type: "standard", @@ -1959,6 +1960,9 @@ async function getOutboxEmailById(emailId: string): Promise { email_config: tempFailSmtpConfig, }, }); + // Create API keys that don't expire (JWT admin tokens expire in 60s which is too short for retry tests) + await InternalApiKey.createAndSetProjectKeys(); const mailbox = backendContext.value.mailbox; const createUserResponse = await niceBackendFetch("/api/v1/users", { @@ -2124,13 +2130,15 @@ describe("email queue deferred retry logic", () => { logIfTestFails("Email after first retry attempt", emailAfterFirstAttempt); }); - it("should retry emails until max attempts exhausted, then mark as server-error", { timeout: 90000 }, async ({ expect }) => { + it("should retry emails until max attempts exhausted, then mark as server-error", { timeout: 150000 }, async ({ expect }) => { await Project.createAndSwitch({ display_name: "Test Retry Exhaustion Project", config: { email_config: tempFailSmtpConfig, }, }); + // Create API keys that don't expire (JWT admin tokens expire in 60s which is too short for retry tests) + await InternalApiKey.createAndSetProjectKeys(); const mailbox = backendContext.value.mailbox; const createUserResponse = await niceBackendFetch("/api/v1/users", { @@ -2160,8 +2168,8 @@ describe("email queue deferred retry logic", () => { const emailId = initialEmail.id; // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 5) - // With 450 errors (immediate) + exponential backoff, this should complete in ~30s - const maxWaitMs = 60000; + // With 450 errors (immediate) + exponential backoff (2s base * 2^attempt), worst case ~90s + const maxWaitMs = 120000; const startTime = Date.now(); let email = await getOutboxEmailById(emailId); while (Date.now() - startTime < maxWaitMs && email.status !== "server-error") { @@ -2169,13 +2177,14 @@ describe("email queue deferred retry logic", () => { email = await getOutboxEmailById(emailId); } + // Log the email object to help debug undefined status issues + logIfTestFails("Email after retry exhaustion loop", email); + expect(email.status).toBe("server-error"); expect(email.send_retries).toBe(5); // MAX_SEND_ATTEMPTS expect(email.send_attempt_errors?.length).toBe(5); // No more retries scheduled expect(email.next_send_retry_at_millis).toBeNull(); - - logIfTestFails("Email after all retries exhausted", email); }); }); }); From e669c08fac481e2287a473bd2c32f3ccc45c6af8 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Thu, 12 Feb 2026 10:57:43 -0800 Subject: [PATCH 08/12] refactor: add more info to error handling --- apps/backend/src/lib/email-queue-step.tsx | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 4d2118a778..0cc88eadae 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -698,7 +698,8 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO if (result.status === "error") { const newAttemptCount = row.sendRetries + 1; - const canRetry = result.error.canRetry && newAttemptCount < MAX_SEND_ATTEMPTS; + const isAttemptsExhausted = result.error.canRetry && newAttemptCount >= MAX_SEND_ATTEMPTS; + const canRetry = result.error.canRetry && !isAttemptsExhausted; // Build error entry for this attempt const errorEntry: SendAttemptError = { @@ -731,12 +732,12 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO }, }); } else { - // Mark as permanent failure - distinguish between "attempts exhausted" and "permanent error from provider" - const isAttemptsExhausted = result.error.canRetry && newAttemptCount >= MAX_SEND_ATTEMPTS; + // Mark as permanent failure - either "attempts_exhausted" (retryable but hit limit) or "permanent_error" (non-retryable) const failureReason = isAttemptsExhausted ? "attempts_exhausted" : "permanent_error"; if (isAttemptsExhausted) { captureError("email-queue-step-retries-exhausted", new StackAssertionError(`Email failed after ${newAttemptCount} attempts`, { + cause: result.error.rawError, emailId: row.id, tenancyId: row.tenancyId, errorType: result.error.errorType, @@ -745,10 +746,6 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO })); } - const externalMessage = isAttemptsExhausted - ? "Email could not be delivered after multiple attempts. Please verify your email configuration and try again." - : result.error.message; - await globalPrismaClient.emailOutbox.update({ where: { tenancyId_id: { @@ -762,7 +759,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO canHaveDeliveryInfo: false, sendRetries: newAttemptCount, sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, - sendServerErrorExternalMessage: externalMessage, + sendServerErrorExternalMessage: result.error.message, sendServerErrorExternalDetails: { errorType: result.error.errorType }, sendServerErrorInternalMessage: result.error.message, sendServerErrorInternalDetails: { From 6ea82d7e4c1e3beee4481df30ffd4b552dee839f Mon Sep 17 00:00:00 2001 From: nams1570 Date: Thu, 12 Feb 2026 15:32:50 -0800 Subject: [PATCH 09/12] fix: retryable emails no longer skip the queuing logic Before, retryable emails went from sending -> scheduled -> sending Now, we move through the queuing logic. This simplifies the state transition functions and lifts the retry checking logic to only one func queuing. We also add some checks to queuing to have it be robust. --- apps/backend/src/lib/email-queue-step.tsx | 56 ++++++++++--------- .../api/v1/emails/email-queue.test.ts | 29 +++++----- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 0cc88eadae..3200770308 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -502,15 +502,30 @@ async function renderTenancyEmails(workerId: string, tenancyId: string, group: E } async function queueReadyEmails(): Promise<{ queuedCount: number }> { + // Queue emails that are ready to send: + // - Fresh emails: scheduledAt has passed and no retry pending + // - Retry emails: both scheduledAt AND nextSendRetryAt have passed + // + // We always require scheduledAt <= NOW() to respect the original scheduling intent. + // nextSendRetryAt should not bypass scheduledAt (e.g., if data is corrupted or manually edited). + // + // Clear nextSendRetryAt when queuing so the email is in a clean "queued" state. const res = await globalPrismaClient.$queryRaw<{ id: string }[]>` UPDATE "EmailOutbox" - SET "isQueued" = TRUE + SET "isQueued" = TRUE, "nextSendRetryAt" = NULL WHERE "isQueued" = FALSE AND "isPaused" = FALSE + AND "skippedReason" IS NULL AND "finishedRenderingAt" IS NOT NULL AND "renderedHtml" IS NOT NULL AND "scheduledAt" <= NOW() - AND "nextSendRetryAt" IS NULL + AND ( + -- Fresh emails: no retry pending + "nextSendRetryAt" IS NULL + OR + -- Retry emails: retry time has passed + "nextSendRetryAt" <= NOW() + ) RETURNING "id"; `; return { @@ -519,18 +534,14 @@ async function queueReadyEmails(): Promise<{ queuedCount: number }> { } async function prepareSendPlan(deltaSeconds: number): Promise { - // Find tenancies with emails ready to send (either new emails or emails ready for retry) + // Find tenancies with queued emails ready to send const tenancyIds = await globalPrismaClient.emailOutbox.findMany({ where: { isPaused: false, - skippedReason: null, // Don't process skipped/cancelled emails - finishedSendingAt: null, // Don't process already-finished emails (defense in depth) - OR: [ - // Normal case: queued, not started, and no pending retry - { startedSendingAt: null, isQueued: true, nextSendRetryAt: null }, - // Retry case: past retry time - { nextSendRetryAt: { lte: new Date() } }, - ], + skippedReason: null, + finishedSendingAt: null, + startedSendingAt: null, + isQueued: true, }, distinct: ["tenancyId"], select: { tenancyId: true }, @@ -556,30 +567,25 @@ function stochasticQuota(value: number): number { } async function claimEmailsForSending(tx: PrismaClientTransaction, tenancyId: string, limit: number): Promise { + // Claim queued emails for sending + // Note: queueReadyEmails() handles the time-based logic, so we just look for isQueued = TRUE return await tx.$queryRaw(Prisma.sql` WITH selected AS ( SELECT "tenancyId", "id" FROM "EmailOutbox" WHERE "tenancyId" = ${tenancyId}::uuid AND "isPaused" = FALSE - AND "skippedReason" IS NULL -- Don't process skipped/cancelled emails - AND "finishedSendingAt" IS NULL -- Don't process already-finished emails (defense in depth) + AND "skippedReason" IS NULL + AND "finishedSendingAt" IS NULL AND "finishedRenderingAt" IS NOT NULL - AND ( - -- Normal case: queued, not started, and no pending retry - ("startedSendingAt" IS NULL AND "isQueued" = TRUE AND "nextSendRetryAt" IS NULL) - OR - -- Retry case: past retry time - ("nextSendRetryAt" IS NOT NULL AND "nextSendRetryAt" <= NOW()) - ) + AND "startedSendingAt" IS NULL + AND "isQueued" = TRUE ORDER BY "priority" DESC, "scheduledAt" ASC, "createdAt" ASC LIMIT ${limit} FOR UPDATE SKIP LOCKED ) UPDATE "EmailOutbox" AS e - SET - "startedSendingAt" = NOW(), - "nextSendRetryAt" = NULL + SET "startedSendingAt" = NOW() FROM selected WHERE e."tenancyId" = selected."tenancyId" AND e."id" = selected."id" RETURNING e.*; @@ -724,8 +730,8 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO finishedSendingAt: null, }, data: { - startedSendingAt: null, // Unclaim the email - isQueued: false, // Prevent normal queue path from picking it up + startedSendingAt: null, + isQueued: false, sendRetries: newAttemptCount, nextSendRetryAt: new Date(Date.now() + backoffMs), sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts index 24d8d4aabf..50b23f17de 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts @@ -1,4 +1,4 @@ -import { StackAssertionError } from "@stackframe/stack-shared/dist/utils/errors"; +import { StackAssertionError, throwErr } from "@stackframe/stack-shared/dist/utils/errors"; import { wait } from "@stackframe/stack-shared/dist/utils/promises"; import { deindent, nicify } from "@stackframe/stack-shared/dist/utils/strings"; import beautify from "js-beautify"; @@ -1871,7 +1871,7 @@ let tempFailPort: number | null = null; async function startTempFailSmtpServer(): Promise { if (tempFailServer) { - return tempFailPort!; + return tempFailPort ?? throwErr("tempFailServer not initialized"); } return await new Promise((resolve, reject) => { @@ -1903,7 +1903,7 @@ async function startTempFailSmtpServer(): Promise { }); tempFailServer.listen(0, '127.0.0.1', () => { - const address = tempFailServer!.address(); + const address = (tempFailServer ?? throwErr("tempFailServer unexpectedly null in listen callback")).address(); if (typeof address === 'object' && address !== null) { tempFailPort = address.port; resolve(tempFailPort); @@ -1968,8 +1968,8 @@ async function getOutboxEmailById(emailId: string): Promise { - const startTime = Date.now(); - while (Date.now() - startTime < timeoutMs) { + const startTime = performance.now(); + while (performance.now() - startTime < timeoutMs) { const emails = await getOutboxEmails({ subject }); if (emails.length > 0) { return await getOutboxEmailById(emails[0].id); @@ -1985,8 +1985,8 @@ async function waitForOutboxEmail(subject: string, timeoutMs = 30000): Promise { - const startTime = Date.now(); - while (Date.now() - startTime < timeoutMs) { + const startTime = performance.now(); + while (performance.now() - startTime < timeoutMs) { const email = await getOutboxEmailById(emailId); if (email.send_retries >= attemptCount) { return email; @@ -2043,9 +2043,9 @@ describe("email queue deferred retry logic", () => { // Wait for the email to reach server-error status const maxWaitMs = 30000; - const startTime = Date.now(); + const startTime = performance.now(); let email = initialEmail; - while (Date.now() - startTime < maxWaitMs && email.status !== "server-error") { + while (performance.now() - startTime < maxWaitMs && email.status !== "server-error") { await wait(500); email = await getOutboxEmailById(emailId); } @@ -2124,8 +2124,11 @@ describe("email queue deferred retry logic", () => { expect(emailAfterFirstAttempt.send_attempt_errors?.[0].external_message).toContain("450"); expect(emailAfterFirstAttempt.status).not.toBe("server-error"); - // Status should be "scheduled" (isQueued=false, waiting for nextSendRetryAt) or "sending" (next iteration already picked it up) - expect(["scheduled", "sending"]).toContain(emailAfterFirstAttempt.status); + // Status could be: + // - "scheduled": isQueued=false, waiting for nextSendRetryAt to pass + // - "queued": queueReadyEmails() ran, isQueued=true, waiting to be claimed + // - "sending": next iteration already picked it up + expect(["scheduled", "queued", "sending"]).toContain(emailAfterFirstAttempt.status); logIfTestFails("Email after first retry attempt", emailAfterFirstAttempt); }); @@ -2170,9 +2173,9 @@ describe("email queue deferred retry logic", () => { // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 5) // With 450 errors (immediate) + exponential backoff (2s base * 2^attempt), worst case ~90s const maxWaitMs = 120000; - const startTime = Date.now(); + const startTime = performance.now(); let email = await getOutboxEmailById(emailId); - while (Date.now() - startTime < maxWaitMs && email.status !== "server-error") { + while (performance.now() - startTime < maxWaitMs && email.status !== "server-error") { await wait(1000); email = await getOutboxEmailById(emailId); } From 725361e941e45a3c426be3bf4e1a6f333928fbe1 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Thu, 12 Feb 2026 17:16:27 -0800 Subject: [PATCH 10/12] feat: add new index, constraints on isQueued Our queuing logic will be slow if we dont use an index. isQueued is most of the time true, since emailoutbox table accounts for historical data and all sent emails have isQueued set to true. So the index will actually be quite selective. We can't just use the partial index on isQueued because its build on tenancyId, and the queuing query doesnt check tenancy id. --- .../migration.sql | 6 ++++++ apps/backend/prisma/schema.prisma | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql diff --git a/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql b/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql new file mode 100644 index 0000000000..49f0db985e --- /dev/null +++ b/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql @@ -0,0 +1,6 @@ +-- SPLIT_STATEMENT_SENTINEL +-- SINGLE_STATEMENT_SENTINEL +-- RUN_OUTSIDE_TRANSACTION_SENTINEL +-- Index on isQueued for efficient queueReadyEmails() queries. +-- Most emails have isQueued=TRUE (already processed), so filtering for FALSE is highly selective. +CREATE INDEX CONCURRENTLY IF NOT EXISTS "EmailOutbox_isQueued_idx" ON /* SCHEMA_NAME_SENTINEL */."EmailOutbox" ("isQueued"); diff --git a/apps/backend/prisma/schema.prisma b/apps/backend/prisma/schema.prisma index 6d608df609..df43c5c39c 100644 --- a/apps/backend/prisma/schema.prisma +++ b/apps/backend/prisma/schema.prisma @@ -833,7 +833,7 @@ model EmailOutbox { // The scheduled time of when the email should be added to the queue. Can be edited, but only if the email has not yet started sending. Doing so should set isQueued to false. scheduledAt DateTime - // The scheduled time of the email if it is in the future. + // Whether the email has been queued for sending. Once queued, this stays true unless the email is retried or rescheduled. isQueued Boolean @default(false) // A generated column that is equal to scheduledAt if isQueued is false, otherwise null. See the note above on EmailOutboxStatus.status for more details on dbgenerated values. @@ -886,6 +886,7 @@ model EmailOutbox { @@index([tenancyId, finishedSendingAt(sort: Desc), scheduledAtIfNotYetQueued(sort: Desc), priority, id], map: "EmailOutbox_ordering_idx") @@index([tenancyId, simpleStatus], map: "EmailOutbox_simple_status_tenancy_idx") @@index([tenancyId, status], map: "EmailOutbox_status_tenancy_idx") + @@index([isQueued], map: "EmailOutbox_isQueued_idx") } model EmailOutboxProcessingMetadata { From de35c1c4422364eda6358788995a8ab9a46ec05e Mon Sep 17 00:00:00 2001 From: nams1570 Date: Thu, 12 Feb 2026 17:23:25 -0800 Subject: [PATCH 11/12] refactor: split queuing query into two Rather than using an OR which may/may not use the index correctly, we use two queries. --- apps/backend/src/lib/email-queue-step.tsx | 36 +++++++++++++---------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 3200770308..e120b34534 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -502,15 +502,26 @@ async function renderTenancyEmails(workerId: string, tenancyId: string, group: E } async function queueReadyEmails(): Promise<{ queuedCount: number }> { - // Queue emails that are ready to send: - // - Fresh emails: scheduledAt has passed and no retry pending - // - Retry emails: both scheduledAt AND nextSendRetryAt have passed - // + // Queue emails that are ready to send. Split into two queries for clarity and index usage. // We always require scheduledAt <= NOW() to respect the original scheduling intent. - // nextSendRetryAt should not bypass scheduledAt (e.g., if data is corrupted or manually edited). - // + + // Query 1: Fresh emails (scheduledAt has passed, no retry pending) + const freshEmails = await globalPrismaClient.$queryRaw<{ id: string }[]>` + UPDATE "EmailOutbox" + SET "isQueued" = TRUE + WHERE "isQueued" = FALSE + AND "isPaused" = FALSE + AND "skippedReason" IS NULL + AND "finishedRenderingAt" IS NOT NULL + AND "renderedHtml" IS NOT NULL + AND "scheduledAt" <= NOW() + AND "nextSendRetryAt" IS NULL + RETURNING "id"; + `; + + // Query 2: Retry emails (both scheduledAt AND nextSendRetryAt have passed) // Clear nextSendRetryAt when queuing so the email is in a clean "queued" state. - const res = await globalPrismaClient.$queryRaw<{ id: string }[]>` + const retryEmails = await globalPrismaClient.$queryRaw<{ id: string }[]>` UPDATE "EmailOutbox" SET "isQueued" = TRUE, "nextSendRetryAt" = NULL WHERE "isQueued" = FALSE @@ -519,17 +530,12 @@ async function queueReadyEmails(): Promise<{ queuedCount: number }> { AND "finishedRenderingAt" IS NOT NULL AND "renderedHtml" IS NOT NULL AND "scheduledAt" <= NOW() - AND ( - -- Fresh emails: no retry pending - "nextSendRetryAt" IS NULL - OR - -- Retry emails: retry time has passed - "nextSendRetryAt" <= NOW() - ) + AND "nextSendRetryAt" <= NOW() RETURNING "id"; `; + return { - queuedCount: res.length, + queuedCount: freshEmails.length + retryEmails.length, }; } From 091616af93c5df91088b8b293f2d04a20a999797 Mon Sep 17 00:00:00 2001 From: nams1570 Date: Fri, 13 Feb 2026 11:37:50 -0800 Subject: [PATCH 12/12] fix: decouple validation and constraint adding for db Without the not valid, the migration tries to create the constraint and validate it at the same time which can lock the entire table. We decouple the validation and the constraint creation to avoid transaction timeouts --- AGENTS.md | 1 + .../20260210000000_deferred_email_retry/migration.sql | 8 +++++--- .../migration.sql | 7 +++++++ 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql diff --git a/AGENTS.md b/AGENTS.md index c25b62c3ef..de951248e0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,6 +101,7 @@ To see all development ports, refer to the index.html of `apps/dev-launchpad/pub - Fail early, fail loud. Fail fast with an error instead of silently continuing. - Do NOT use `as`/`any`/type casts or anything else like that to bypass the type system unless you specifically asked the user about it. Most of the time a place where you would use type casts is not one where you actually need them. Avoid wherever possible. - When writing database migration files, assume that we have >1,000,000 rows in every table (unless otherwise specified). This means you may have to use CONDITIONALLY_REPEAT_MIGRATION_SENTINEL to avoid running the migration and things like concurrent index builds; see the existing migrations for examples. +- Each migration file runs in its own transaction with a relatively short timeout. Split long-running operations into separate migration files to avoid timeouts. For example, when adding CHECK constraints, use `NOT VALID` in one migration, then `VALIDATE CONSTRAINT` in a separate migration file. - **When building frontend code, always carefully deal with loading and error states.** Be very explicit with these; some components make this easy, eg. the button onClick already takes an async callback for loading state, but make sure this is done everywhere, and make sure errors are NEVER just silently swallowed. - Unless very clearly equivalent from types, prefer explicit null/undefinedness checks over boolean checks, eg. `foo == null` instead of `!foo`. diff --git a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql index 46243b8b79..1b54a669ae 100644 --- a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql +++ b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql @@ -9,17 +9,19 @@ ALTER TABLE "EmailOutbox" -- Constraint: nextSendRetryAt can only be set after at least one failed attempt -- (if sendRetries is 0, no attempt has failed, so there's nothing to retry) +-- Use NOT VALID to avoid holding ACCESS EXCLUSIVE lock during full-table validation. +-- Validation happens in a separate migration to avoid transaction timeout. ALTER TABLE "EmailOutbox" ADD CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure" - CHECK ("nextSendRetryAt" IS NULL OR "sendRetries" > 0); + CHECK ("nextSendRetryAt" IS NULL OR "sendRetries" > 0) NOT VALID; -- Constraint: sendAttemptErrors can only be set after at least one failed attempt ALTER TABLE "EmailOutbox" ADD CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure" - CHECK ("sendAttemptErrors" IS NULL OR "sendRetries" > 0); + CHECK ("sendAttemptErrors" IS NULL OR "sendRetries" > 0) NOT VALID; -- Constraint: nextSendRetryAt must be null when email has finished sending -- (if finishedSendingAt is set, there's nothing more to retry) ALTER TABLE "EmailOutbox" ADD CONSTRAINT "EmailOutbox_no_retry_after_finished" - CHECK ("finishedSendingAt" IS NULL OR "nextSendRetryAt" IS NULL); + CHECK ("finishedSendingAt" IS NULL OR "nextSendRetryAt" IS NULL) NOT VALID; diff --git a/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql b/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql new file mode 100644 index 0000000000..db7491d19b --- /dev/null +++ b/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql @@ -0,0 +1,7 @@ +-- Validate the deferred retry constraints added in the previous migration. +-- This runs in a separate transaction to avoid timeout, and only takes +-- SHARE UPDATE EXCLUSIVE lock (allows concurrent reads/writes). + +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure"; +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure"; +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_no_retry_after_finished";