diff --git a/AGENTS.md b/AGENTS.md index c25b62c3ef..de951248e0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,6 +101,7 @@ To see all development ports, refer to the index.html of `apps/dev-launchpad/pub - Fail early, fail loud. Fail fast with an error instead of silently continuing. - Do NOT use `as`/`any`/type casts or anything else like that to bypass the type system unless you specifically asked the user about it. Most of the time a place where you would use type casts is not one where you actually need them. Avoid wherever possible. - When writing database migration files, assume that we have >1,000,000 rows in every table (unless otherwise specified). This means you may have to use CONDITIONALLY_REPEAT_MIGRATION_SENTINEL to avoid running the migration and things like concurrent index builds; see the existing migrations for examples. +- Each migration file runs in its own transaction with a relatively short timeout. Split long-running operations into separate migration files to avoid timeouts. For example, when adding CHECK constraints, use `NOT VALID` in one migration, then `VALIDATE CONSTRAINT` in a separate migration file. - **When building frontend code, always carefully deal with loading and error states.** Be very explicit with these; some components make this easy, eg. the button onClick already takes an async callback for loading state, but make sure this is done everywhere, and make sure errors are NEVER just silently swallowed. - Unless very clearly equivalent from types, prefer explicit null/undefinedness checks over boolean checks, eg. `foo == null` instead of `!foo`. diff --git a/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql new file mode 100644 index 0000000000..1b54a669ae --- /dev/null +++ b/apps/backend/prisma/migrations/20260210000000_deferred_email_retry/migration.sql @@ -0,0 +1,27 @@ +-- Add deferred retry fields for email sending +-- These fields allow the email queue to schedule retries for later iterations +-- instead of blocking the current iteration with inline retries. + +ALTER TABLE "EmailOutbox" + ADD COLUMN "sendRetries" INTEGER NOT NULL DEFAULT 0, + ADD COLUMN "nextSendRetryAt" TIMESTAMP(3), + ADD COLUMN "sendAttemptErrors" JSONB; + +-- Constraint: nextSendRetryAt can only be set after at least one failed attempt +-- (if sendRetries is 0, no attempt has failed, so there's nothing to retry) +-- Use NOT VALID to avoid holding ACCESS EXCLUSIVE lock during full-table validation. +-- Validation happens in a separate migration to avoid transaction timeout. +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure" + CHECK ("nextSendRetryAt" IS NULL OR "sendRetries" > 0) NOT VALID; + +-- Constraint: sendAttemptErrors can only be set after at least one failed attempt +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure" + CHECK ("sendAttemptErrors" IS NULL OR "sendRetries" > 0) NOT VALID; + +-- Constraint: nextSendRetryAt must be null when email has finished sending +-- (if finishedSendingAt is set, there's nothing more to retry) +ALTER TABLE "EmailOutbox" + ADD CONSTRAINT "EmailOutbox_no_retry_after_finished" + CHECK ("finishedSendingAt" IS NULL OR "nextSendRetryAt" IS NULL) NOT VALID; diff --git a/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql b/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql new file mode 100644 index 0000000000..db7491d19b --- /dev/null +++ b/apps/backend/prisma/migrations/20260210000001_deferred_email_retry_validate/migration.sql @@ -0,0 +1,7 @@ +-- Validate the deferred retry constraints added in the previous migration. +-- This runs in a separate transaction to avoid timeout, and only takes +-- SHARE UPDATE EXCLUSIVE lock (allows concurrent reads/writes). + +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_nextSendRetryAt_requires_failure"; +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_sendAttemptErrors_requires_failure"; +ALTER TABLE "EmailOutbox" VALIDATE CONSTRAINT "EmailOutbox_no_retry_after_finished"; diff --git a/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql b/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql new file mode 100644 index 0000000000..49f0db985e --- /dev/null +++ b/apps/backend/prisma/migrations/20260213004424_email_outbox_is_queued_index/migration.sql @@ -0,0 +1,6 @@ +-- SPLIT_STATEMENT_SENTINEL +-- SINGLE_STATEMENT_SENTINEL +-- RUN_OUTSIDE_TRANSACTION_SENTINEL +-- Index on isQueued for efficient queueReadyEmails() queries. +-- Most emails have isQueued=TRUE (already processed), so filtering for FALSE is highly selective. +CREATE INDEX CONCURRENTLY IF NOT EXISTS "EmailOutbox_isQueued_idx" ON /* SCHEMA_NAME_SENTINEL */."EmailOutbox" ("isQueued"); diff --git a/apps/backend/prisma/schema.prisma b/apps/backend/prisma/schema.prisma index 21b781e81a..df43c5c39c 100644 --- a/apps/backend/prisma/schema.prisma +++ b/apps/backend/prisma/schema.prisma @@ -99,8 +99,8 @@ model ExternalDbSyncMetadata { singleton BooleanTrue @unique @default(TRUE) - sequencerEnabled Boolean @default(true) - pollerEnabled Boolean @default(true) + sequencerEnabled Boolean @default(true) + pollerEnabled Boolean @default(true) createdAt DateTime @default(now()) updatedAt DateTime @updatedAt @@ -833,7 +833,7 @@ model EmailOutbox { // The scheduled time of when the email should be added to the queue. Can be edited, but only if the email has not yet started sending. Doing so should set isQueued to false. scheduledAt DateTime - // The scheduled time of the email if it is in the future. + // Whether the email has been queued for sending. Once queued, this stays true unless the email is retried or rescheduled. isQueued Boolean @default(false) // A generated column that is equal to scheduledAt if isQueued is false, otherwise null. See the note above on EmailOutboxStatus.status for more details on dbgenerated values. @@ -844,6 +844,14 @@ model EmailOutbox { // if startedSendingAt is not set, then finishedSendingAt is also not set finishedSendingAt DateTime? + // Deferred retry fields for email sending + // Number of send retries attempted (starts at 0, incremented on each failure). Reset when email content is edited. + sendRetries Int @default(0) + // When to retry sending (null = not waiting for retry). Set when a retryable error occurs. Reset when email content is edited. Must be null if sendRetries is 0 (enforced by EmailOutbox_nextSendRetryAt_requires_failure). Must be null when finishedSendingAt is set (enforced by EmailOutbox_no_retry_after_finished). + nextSendRetryAt DateTime? + // JSON array of errors from each failed send attempt. Each entry has: { attemptNumber, timestamp, externalMessage, externalDetails, internalMessage, internalDetails }. Reset when email content is edited. Must be null if sendRetries is 0 (enforced by EmailOutbox_sendAttemptErrors_requires_failure). + sendAttemptErrors Json? + // A generated column that is equal to finishedSendingAt if canHaveDeliveryInfo is false, otherwise deliveredAt. sentAt DateTime? @default(dbgenerated("\nCASE\n WHEN (\"canHaveDeliveryInfo\" IS TRUE) THEN \"deliveredAt\"\n WHEN (\"canHaveDeliveryInfo\" IS FALSE) THEN \"finishedSendingAt\"\n ELSE NULL::timestamp without time zone\nEND")) @@ -878,6 +886,7 @@ model EmailOutbox { @@index([tenancyId, finishedSendingAt(sort: Desc), scheduledAtIfNotYetQueued(sort: Desc), priority, id], map: "EmailOutbox_ordering_idx") @@index([tenancyId, simpleStatus], map: "EmailOutbox_simple_status_tenancy_idx") @@index([tenancyId, status], map: "EmailOutbox_status_tenancy_idx") + @@index([isQueued], map: "EmailOutbox_isQueued_idx") } model EmailOutboxProcessingMetadata { @@ -1088,7 +1097,7 @@ model OutgoingRequest { qstashOptions Json startedFulfillingAt DateTime? - deduplicationKey String? + deduplicationKey String? // Partial unique index on deduplicationKey WHERE startedFulfillingAt IS NULL // is created in a custom migration (not expressible in Prisma schema) diff --git a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx index a4aa26a916..6ecf5ad789 100644 --- a/apps/backend/src/app/api/latest/emails/outbox/crud.tsx +++ b/apps/backend/src/app/api/latest/emails/outbox/crud.tsx @@ -7,6 +7,7 @@ import { KnownErrors } from "@stackframe/stack-shared"; import { emailOutboxCrud, EmailOutboxCrud } from "@stackframe/stack-shared/dist/interface/crud/email-outbox"; import { yupObject, yupString } from "@stackframe/stack-shared/dist/schema-fields"; import { StackAssertionError, StatusError, throwErr } from "@stackframe/stack-shared/dist/utils/errors"; +import { Json } from "@stackframe/stack-shared/dist/utils/json"; import { createLazyProxy } from "@stackframe/stack-shared/dist/utils/proxies"; /** @@ -57,6 +58,25 @@ function prismaModelToCrud(prismaModel: EmailOutbox): EmailOutboxCrud["Server"][ to = { type: "custom-emails", emails: recipient?.emails ?? [] }; } + // Convert sendAttemptErrors from DB format (camelCase) to API format (snake_case) + const sendAttemptErrors = prismaModel.sendAttemptErrors + ? (prismaModel.sendAttemptErrors as Array<{ + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Record, + internalMessage: string, + internalDetails: Record, + }>).map(e => ({ + attempt_number: e.attemptNumber, + timestamp: e.timestamp, + external_message: e.externalMessage, + external_details: e.externalDetails, + internal_message: e.internalMessage, + internal_details: e.internalDetails, + })) + : null; + // Base fields present on all emails const base = { id: prismaModel.id, @@ -68,6 +88,9 @@ function prismaModelToCrud(prismaModel: EmailOutbox): EmailOutboxCrud["Server"][ variables: (prismaModel.extraRenderVariables ?? {}) as Record, skip_deliverability_check: prismaModel.shouldSkipDeliverabilityCheck, scheduled_at_millis: prismaModel.scheduledAt.getTime(), + send_retries: prismaModel.sendRetries, + next_send_retry_at_millis: prismaModel.nextSendRetryAt?.getTime() ?? null, + send_attempt_errors: sendAttemptErrors, // Default flags (overridden in specific statuses) is_paused: false, has_rendered: false, @@ -358,6 +381,7 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( // Cancel action - mark as skipped set("isPaused", Prisma.sql`false`); set("isQueued", Prisma.sql`false`); + setNull("nextSendRetryAt"); // Clear any pending retry so it won't be picked up set("skippedReason", Prisma.sql`'MANUALLY_CANCELLED'::"EmailOutboxSkippedReason"`); set("skippedDetails", Prisma.sql`'{}'::jsonb`); } else { @@ -395,6 +419,8 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( // If content changed, reset rendering and sending state if (needsRerenderReset) { set("isQueued", Prisma.sql`false`); + // Reset retry fields (sendRetries to 0, others to null) + set("sendRetries", Prisma.sql`0`); setNull( "renderedByWorkerId", "startedRenderingAt", "finishedRenderingAt", "renderErrorExternalMessage", "renderErrorExternalDetails", @@ -402,6 +428,7 @@ export const emailOutboxCrudHandlers = createLazyProxy(() => createCrudHandlers( "renderedHtml", "renderedText", "renderedSubject", "renderedIsTransactional", "renderedNotificationCategoryId", "startedSendingAt", "finishedSendingAt", + "nextSendRetryAt", "sendAttemptErrors", "sendServerErrorExternalMessage", "sendServerErrorExternalDetails", "sendServerErrorInternalMessage", "sendServerErrorInternalDetails", "skippedReason", "skippedDetails", "canHaveDeliveryInfo", @@ -494,6 +521,9 @@ function parseEmailOutboxFromJson(j: Record): EmailOutbox { scheduledAtIfNotYetQueued: dateOrNull("scheduledAtIfNotYetQueued"), startedSendingAt: dateOrNull("startedSendingAt"), finishedSendingAt: dateOrNull("finishedSendingAt"), + sendRetries: j.sendRetries as number, + nextSendRetryAt: dateOrNull("nextSendRetryAt"), + sendAttemptErrors: j.sendAttemptErrors as Prisma.JsonValue, sentAt: dateOrNull("sentAt"), sendServerErrorExternalMessage: j.sendServerErrorExternalMessage as string | null, sendServerErrorExternalDetails: j.sendServerErrorExternalDetails as Prisma.JsonValue, diff --git a/apps/backend/src/lib/email-queue-step.tsx b/apps/backend/src/lib/email-queue-step.tsx index 0d72ea2e1f..e120b34534 100644 --- a/apps/backend/src/lib/email-queue-step.tsx +++ b/apps/backend/src/lib/email-queue-step.tsx @@ -15,10 +15,40 @@ import { filterUndefined } from "@stackframe/stack-shared/dist/utils/objects"; import { Result } from "@stackframe/stack-shared/dist/utils/results"; import { traceSpan } from "@stackframe/stack-shared/dist/utils/telemetry"; import { randomUUID } from "node:crypto"; -import { lowLevelSendEmailDirectViaProvider } from "./emails-low-level"; +import { lowLevelSendEmailDirectWithoutRetries } from "./emails-low-level"; const MAX_RENDER_BATCH = 50; +const MAX_SEND_ATTEMPTS = 5; + +const SEND_RETRY_BACKOFF_BASE_MS = 2000; + +const calculateRetryBackoffMs = (attemptCount: number): number => { + return (Math.random() + 0.5) * SEND_RETRY_BACKOFF_BASE_MS * Math.pow(2, attemptCount); +}; + +/** + * Structure for tracking errors from each send attempt. + * Mirrors the pattern used for sendServerError* fields. + * Uses Prisma.InputJsonValue-compatible types for DB storage. + */ +type SendAttemptError = { + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Prisma.InputJsonObject, + internalMessage: string, + internalDetails: Prisma.InputJsonObject, +}; + +const appendSendAttemptError =( + existingErrors: SendAttemptError[] | null | undefined, + newError: SendAttemptError +): SendAttemptError[] => { + const errors = existingErrors ?? []; + return [...errors, newError]; +}; + // Track if email queue has run at least once since server start (used to suppress first-run delta warnings in dev) const emailQueueFirstRunKey = Symbol.for("__stack_email_queue_first_run_completed"); @@ -472,27 +502,52 @@ async function renderTenancyEmails(workerId: string, tenancyId: string, group: E } async function queueReadyEmails(): Promise<{ queuedCount: number }> { - const res = await globalPrismaClient.$queryRaw<{ id: string }[]>` + // Queue emails that are ready to send. Split into two queries for clarity and index usage. + // We always require scheduledAt <= NOW() to respect the original scheduling intent. + + // Query 1: Fresh emails (scheduledAt has passed, no retry pending) + const freshEmails = await globalPrismaClient.$queryRaw<{ id: string }[]>` UPDATE "EmailOutbox" SET "isQueued" = TRUE WHERE "isQueued" = FALSE AND "isPaused" = FALSE + AND "skippedReason" IS NULL AND "finishedRenderingAt" IS NOT NULL AND "renderedHtml" IS NOT NULL AND "scheduledAt" <= NOW() + AND "nextSendRetryAt" IS NULL RETURNING "id"; `; + + // Query 2: Retry emails (both scheduledAt AND nextSendRetryAt have passed) + // Clear nextSendRetryAt when queuing so the email is in a clean "queued" state. + const retryEmails = await globalPrismaClient.$queryRaw<{ id: string }[]>` + UPDATE "EmailOutbox" + SET "isQueued" = TRUE, "nextSendRetryAt" = NULL + WHERE "isQueued" = FALSE + AND "isPaused" = FALSE + AND "skippedReason" IS NULL + AND "finishedRenderingAt" IS NOT NULL + AND "renderedHtml" IS NOT NULL + AND "scheduledAt" <= NOW() + AND "nextSendRetryAt" <= NOW() + RETURNING "id"; + `; + return { - queuedCount: res.length, + queuedCount: freshEmails.length + retryEmails.length, }; } async function prepareSendPlan(deltaSeconds: number): Promise { + // Find tenancies with queued emails ready to send const tenancyIds = await globalPrismaClient.emailOutbox.findMany({ where: { - isQueued: true, isPaused: false, + skippedReason: null, + finishedSendingAt: null, startedSendingAt: null, + isQueued: true, }, distinct: ["tenancyId"], select: { tenancyId: true }, @@ -518,15 +573,19 @@ function stochasticQuota(value: number): number { } async function claimEmailsForSending(tx: PrismaClientTransaction, tenancyId: string, limit: number): Promise { + // Claim queued emails for sending + // Note: queueReadyEmails() handles the time-based logic, so we just look for isQueued = TRUE return await tx.$queryRaw(Prisma.sql` WITH selected AS ( SELECT "tenancyId", "id" FROM "EmailOutbox" WHERE "tenancyId" = ${tenancyId}::uuid - AND "isQueued" = TRUE AND "isPaused" = FALSE + AND "skippedReason" IS NULL + AND "finishedSendingAt" IS NULL AND "finishedRenderingAt" IS NOT NULL AND "startedSendingAt" IS NULL + AND "isQueued" = TRUE ORDER BY "priority" DESC, "scheduledAt" ASC, "createdAt" ASC LIMIT ${limit} FOR UPDATE SKIP LOCKED @@ -640,7 +699,7 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO const result = getEnvBoolean("STACK_EMAIL_BRANCHING_DISABLE_QUEUE_SENDING") ? Result.error({ errorType: "email-sending-disabled", canRetry: false, message: "Email sending is disabled", rawError: new Error("Email sending is disabled") }) - : await lowLevelSendEmailDirectViaProvider({ + : await lowLevelSendEmailDirectWithoutRetries({ tenancyId: context.tenancy.id, emailConfig: context.emailConfig, to: resolution.emails, @@ -650,24 +709,83 @@ async function processSingleEmail(context: TenancyProcessingContext, row: EmailO }); if (result.status === "error") { - await globalPrismaClient.emailOutbox.update({ - where: { - tenancyId_id: { + const newAttemptCount = row.sendRetries + 1; + const isAttemptsExhausted = result.error.canRetry && newAttemptCount >= MAX_SEND_ATTEMPTS; + const canRetry = result.error.canRetry && !isAttemptsExhausted; + + // Build error entry for this attempt + const errorEntry: SendAttemptError = { + attemptNumber: newAttemptCount, + timestamp: new Date().toISOString(), + externalMessage: result.error.message ?? result.error.errorType, + externalDetails: { errorType: result.error.errorType }, + internalMessage: result.error.message ?? result.error.errorType, + internalDetails: { rawError: errorToNiceString(result.error.rawError), errorType: result.error.errorType }, + }; + const updatedErrors = appendSendAttemptError(row.sendAttemptErrors as SendAttemptError[] | null, errorEntry); + + if (canRetry) { + // Schedule retry: unclaim the email and set nextSendRetryAt + const backoffMs = calculateRetryBackoffMs(newAttemptCount); + await globalPrismaClient.emailOutbox.update({ + where: { + tenancyId_id: { + tenancyId: row.tenancyId, + id: row.id, + }, + finishedSendingAt: null, + }, + data: { + startedSendingAt: null, + isQueued: false, + sendRetries: newAttemptCount, + nextSendRetryAt: new Date(Date.now() + backoffMs), + sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, + }, + }); + } else { + // Mark as permanent failure - either "attempts_exhausted" (retryable but hit limit) or "permanent_error" (non-retryable) + const failureReason = isAttemptsExhausted ? "attempts_exhausted" : "permanent_error"; + + if (isAttemptsExhausted) { + captureError("email-queue-step-retries-exhausted", new StackAssertionError(`Email failed after ${newAttemptCount} attempts`, { + cause: result.error.rawError, + emailId: row.id, tenancyId: row.tenancyId, - id: row.id, + errorType: result.error.errorType, + errorMessage: result.error.message, + allAttemptErrors: updatedErrors, + })); + } + + await globalPrismaClient.emailOutbox.update({ + where: { + tenancyId_id: { + tenancyId: row.tenancyId, + id: row.id, + }, + finishedSendingAt: null, }, - finishedSendingAt: null, - }, - data: { - finishedSendingAt: new Date(), - canHaveDeliveryInfo: false, - sendServerErrorExternalMessage: result.error.message, - sendServerErrorExternalDetails: { errorType: result.error.errorType }, - sendServerErrorInternalMessage: result.error.message, - sendServerErrorInternalDetails: { rawError: errorToNiceString(result.error.rawError), errorType: result.error.errorType }, - }, - }); + data: { + finishedSendingAt: new Date(), + canHaveDeliveryInfo: false, + sendRetries: newAttemptCount, + sendAttemptErrors: updatedErrors as Prisma.InputJsonArray, + sendServerErrorExternalMessage: result.error.message, + sendServerErrorExternalDetails: { errorType: result.error.errorType }, + sendServerErrorInternalMessage: result.error.message, + sendServerErrorInternalDetails: { + rawError: errorToNiceString(result.error.rawError), + errorType: result.error.errorType, + attemptCount: newAttemptCount, + failureReason, + allAttemptErrors: updatedErrors as Json[], + }, + }, + }); + } } else { + // Success - mark as sent (don't increment sendRetries since this wasn't a failure) await globalPrismaClient.emailOutbox.update({ where: { tenancyId_id: { diff --git a/apps/backend/src/lib/emails-low-level.tsx b/apps/backend/src/lib/emails-low-level.tsx index a91e31744c..0183c28a70 100644 --- a/apps/backend/src/lib/emails-low-level.tsx +++ b/apps/backend/src/lib/emails-low-level.tsx @@ -10,8 +10,6 @@ import { runAsynchronously, wait } from '@stackframe/stack-shared/dist/utils/pro import { Result } from '@stackframe/stack-shared/dist/utils/results'; import { traceSpan } from '@stackframe/stack-shared/dist/utils/telemetry'; import nodemailer from 'nodemailer'; -import { Resend } from 'resend'; -import { getTenancy } from './tenancies'; export function isSecureEmailPort(port: number | string) { // "secure" in most SMTP clients means implicit TLS from byte 1 (SMTPS) @@ -213,104 +211,22 @@ export async function lowLevelSendEmailDirectWithoutRetries(options: LowLevelSen errorType: string, canRetry: boolean, message?: string, -}>> { - return await _lowLevelSendEmailWithoutRetries(options); -} - -// currently unused, although in the future we may want to use this to minimize the number of requests to Resend -export async function lowLevelSendEmailResendBatchedDirect(resendApiKey: string, emailOptions: LowLevelSendEmailOptions[]) { - if (emailOptions.length === 0) { - return Result.ok([]); - } - if (emailOptions.length > 100) { - throw new StackAssertionError("sendEmailResendBatchedDirect expects at most 100 emails to be sent at once", { emailOptions }); - } - if (emailOptions.some(option => option.tenancyId !== emailOptions[0].tenancyId)) { - throw new StackAssertionError("sendEmailResendBatchedDirect expects all emails to be sent from the same tenancy", { emailOptions }); - } - const tenancy = await getTenancy(emailOptions[0].tenancyId); - if (!tenancy) { - throw new StackAssertionError("Tenancy not found"); - } - const resend = new Resend(resendApiKey); - const result = await Result.retry(async (_) => { - const { data, error } = await resend.batch.send(emailOptions.map((option) => ({ - from: option.emailConfig.senderEmail, - to: option.to, - subject: option.subject, - html: option.html ?? "", - text: option.text, - }))); - - if (data) { - return Result.ok(data.data); - } - if (error.name === "rate_limit_exceeded" || error.name === "internal_server_error") { - // these are the errors we want to retry - return Result.error(error); - } - throw new StackAssertionError("Failed to send email with Resend", { error }); - }, 3, { exponentialDelayBase: 2000 }); - - return result; -} - -export async function lowLevelSendEmailDirectViaProvider(options: LowLevelSendEmailOptions): Promise> { if (!options.to) { throw new StackAssertionError("No recipient email address provided to sendEmail", omit(options, ['emailConfig'])); } - class DoNotRetryError extends Error { - constructor(public readonly errorObj: { - rawError: any, - errorType: string, - canRetry: boolean, - message?: string, - }) { - super("This error should never be caught anywhere else but inside the lowLevelSendEmailDirectViaProvider function, something went wrong if you see this!"); - } - } - - let result; - try { - result = await Result.retry(async (attempt) => { - const result = await lowLevelSendEmailDirectWithoutRetries(options); - - if (result.status === 'error') { - const extraData = { - host: options.emailConfig.host, - from: options.emailConfig.senderEmail, - to: options.to, - subject: options.subject, - error: result.error, - }; - - if (result.error.canRetry) { - console.warn("Failed to send email, but error is possibly transient so retrying.", extraData, result.error.rawError); - return Result.error(result.error); - } - - console.warn("Failed to send email, and error is not transient, so not retrying.", extraData, result.error.rawError); - throw new DoNotRetryError(result.error); - } - - return result; - }, 9, { exponentialDelayBase: 125 }); - } catch (error) { - if (error instanceof DoNotRetryError) { - return Result.error(error.errorObj); - } - throw error; - } + const result = await _lowLevelSendEmailWithoutRetries(options); if (result.status === 'error') { - console.warn("Failed to send email after all retries!", result.error); - return Result.error(result.error.errors[0]); + console.warn("Failed to send email.", { + host: options.emailConfig.host, + from: options.emailConfig.senderEmail, + to: options.to, + subject: options.subject, + error: result.error, + }, result.error.rawError); } - return Result.ok(undefined); + + return result; } diff --git a/apps/e2e/tests/backend/backend-helpers.ts b/apps/e2e/tests/backend/backend-helpers.ts index fc5d260dc5..cf87bad045 100644 --- a/apps/e2e/tests/backend/backend-helpers.ts +++ b/apps/e2e/tests/backend/backend-helpers.ts @@ -187,7 +187,7 @@ export async function bumpEmailAddress(options: { unindexed?: boolean } = {}) { } // Type for outbox email items (simplified - full type is EmailOutboxCrud["Server"]["Read"]) -type OutboxEmail = { +export type OutboxEmail = { id: string, subject?: string, status: string, diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts index 2bb050fd4f..50b23f17de 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/email-queue.test.ts @@ -1,10 +1,12 @@ +import { StackAssertionError, throwErr } from "@stackframe/stack-shared/dist/utils/errors"; import { wait } from "@stackframe/stack-shared/dist/utils/promises"; import { deindent, nicify } from "@stackframe/stack-shared/dist/utils/strings"; import beautify from "js-beautify"; -import { describe } from "vitest"; +import * as net from "net"; +import { afterAll, beforeAll, describe } from "vitest"; import { it, logIfTestFails } from "../../../../../helpers"; import { withPortPrefix } from "../../../../../helpers/ports"; -import { Auth, Project, User, backendContext, bumpEmailAddress, getOutboxEmails, niceBackendFetch, waitForOutboxEmailWithStatus } from "../../../../backend-helpers"; +import { Auth, InternalApiKey, OutboxEmail, Project, User, backendContext, bumpEmailAddress, getOutboxEmails, niceBackendFetch, waitForOutboxEmailWithStatus } from "../../../../backend-helpers"; const testEmailConfig = { type: "standard", @@ -419,9 +421,12 @@ describe("send email to all users", () => { "is_high_priority": false, "is_paused": false, "is_transactional": true, + "next_send_retry_at_millis": null, "notification_category_id": "", "rendered_at_millis": , "scheduled_at_millis": , + "send_attempt_errors": null, + "send_retries": 0, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -456,9 +461,12 @@ describe("send email to all users", () => { "is_high_priority": false, "is_paused": false, "is_transactional": true, + "next_send_retry_at_millis": null, "notification_category_id": "", "rendered_at_millis": , "scheduled_at_millis": , + "send_attempt_errors": null, + "send_retries": 0, "simple_status": "ok", "skip_deliverability_check": false, "started_rendering_at_millis": , @@ -1846,3 +1854,341 @@ describe("email outbox pagination", () => { }, 60_000); }); +// Invalid SMTP config - causes HOST_NOT_FOUND (non-retryable error) +const brokenSmtpConfig = { + type: "standard", + host: "this-host-does-not-exist.invalid", + port: 25, + username: "test", + password: "test", + sender_name: "Test Project", + sender_email: "test@example.com", +} as const; + +// SMTP server that responds with 450 (temporary failure) - retryable and fast +let tempFailServer: net.Server | null = null; +let tempFailPort: number | null = null; + +async function startTempFailSmtpServer(): Promise { + if (tempFailServer) { + return tempFailPort ?? throwErr("tempFailServer not initialized"); + } + + return await new Promise((resolve, reject) => { + tempFailServer = net.createServer((socket) => { + // Send SMTP greeting + socket.write('220 localhost SMTP Test Server\r\n'); + + socket.on('data', (data) => { + const command = data.toString().trim().toUpperCase(); + // Respond with 450 (temporary failure) to all commands + // This is a retryable error that happens immediately + if (command.startsWith('EHLO') || command.startsWith('HELO')) { + socket.write('250 localhost Hello\r\n'); + } else if (command.startsWith('MAIL FROM')) { + // Temporary failure - "mailbox unavailable, try again later" + socket.write('450 Requested mail action not taken: mailbox unavailable (test)\r\n'); + socket.end(); + } else if (command.startsWith('QUIT')) { + socket.write('221 Bye\r\n'); + socket.end(); + } else { + socket.write('450 Temporary failure (test)\r\n'); + } + }); + + socket.on('error', () => { + // Ignore errors (client may disconnect) + }); + }); + + tempFailServer.listen(0, '127.0.0.1', () => { + const address = (tempFailServer ?? throwErr("tempFailServer unexpectedly null in listen callback")).address(); + if (typeof address === 'object' && address !== null) { + tempFailPort = address.port; + resolve(tempFailPort); + } else { + reject(new Error('Failed to get server address')); + } + }); + + tempFailServer.on('error', reject); + }); +} + +function stopTempFailSmtpServer(): void { + if (tempFailServer) { + tempFailServer.close(); + tempFailServer = null; + tempFailPort = null; + } +} + +// Factory function to create temp-fail SMTP config with dynamic port +function createTempFailSmtpConfig(port: number) { + return { + type: "standard", + host: "127.0.0.1", + port: port, + username: "test", + password: "test", + sender_name: "Test Project", + sender_email: "test@example.com", + } as const; +} + +// Helper type for send attempt error entries +type SendAttemptErrorEntry = { + attempt_number: number, + timestamp: string, + external_message: string, + external_details: Record, + internal_message: string, + internal_details: Record, +}; + +// Helper type for email outbox items with retry fields +type OutboxEmailWithRetryFields = OutboxEmail & { + send_retries: number, + next_send_retry_at_millis: number | null, + send_attempt_errors: SendAttemptErrorEntry[] | null, +}; + +// Helper to get detailed email from the outbox +async function getOutboxEmailById(emailId: string): Promise { + const response = await niceBackendFetch(`/api/v1/emails/outbox/${emailId}`, { + method: "GET", + accessType: "server", + }); + if (response.status !== 200) { + throw new StackAssertionError(`Failed to get email ${emailId}: status ${response.status}`, { response }); + } + return response.body; +} + +// Helper to poll until an email with the given subject appears in the outbox +async function waitForOutboxEmail(subject: string, timeoutMs = 30000): Promise { + const startTime = performance.now(); + while (performance.now() - startTime < timeoutMs) { + const emails = await getOutboxEmails({ subject }); + if (emails.length > 0) { + return await getOutboxEmailById(emails[0].id); + } + await wait(500); + } + throw new StackAssertionError( + `Timeout waiting for email with subject "${subject}" to appear in outbox`, + { subject } + ); +} + +// Helper to poll until the email has reached a specific send_retries +// Note: Status may be "queued" or "sending" due to race conditions - that's expected +async function waitForAttemptCount(emailId: string, attemptCount: number, timeoutMs = 60000): Promise { + const startTime = performance.now(); + while (performance.now() - startTime < timeoutMs) { + const email = await getOutboxEmailById(emailId); + if (email.send_retries >= attemptCount) { + return email; + } + // Terminal state - no more retries will happen + if (email.status === "server-error") { + return email; + } + await wait(500); + } + const finalEmail = await getOutboxEmailById(emailId); + throw new StackAssertionError( + `Timeout waiting for email ${emailId} to reach send_retries >= ${attemptCount}`, + { emailId, attemptCount, finalState: { count: finalEmail.send_retries, status: finalEmail.status } } + ); +} + +describe("email queue deferred retry logic", () => { + it("should immediately mark non-retryable errors as server-error without retrying", async ({ expect }) => { + // brokenSmtpConfig causes HOST_NOT_FOUND which has canRetry: false + await Project.createAndSwitch({ + display_name: "Test Non-Retryable Error Project", + config: { + email_config: brokenSmtpConfig, + }, + }); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for non-retryable error", + subject: "Non-Retryable Error Test", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox and reach server-error state + const initialEmail = await waitForOutboxEmail("Non-Retryable Error Test"); + const emailId = initialEmail.id; + + // Wait for the email to reach server-error status + const maxWaitMs = 30000; + const startTime = performance.now(); + let email = initialEmail; + while (performance.now() - startTime < maxWaitMs && email.status !== "server-error") { + await wait(500); + email = await getOutboxEmailById(emailId); + } + + // Non-retryable errors should go directly to server-error + expect(email.send_retries).toBe(1); + expect(email.next_send_retry_at_millis).toBeNull(); + expect(email.send_attempt_errors).not.toBeNull(); + expect(email.send_attempt_errors?.length).toBe(1); + expect(email.status).toBe("server-error"); + + logIfTestFails("Email after non-retryable error", email); + }); + + describe("retryable errors (using temp-fail SMTP server)", () => { + // These tests use a local SMTP server that responds with 450 (temporary failure). + // This is fast (immediate response) and produces retryable errors. + + let tempFailSmtpConfig: ReturnType; + + beforeAll(async () => { + const port = await startTempFailSmtpServer(); + tempFailSmtpConfig = createTempFailSmtpConfig(port); + }); + + afterAll(() => { + stopTempFailSmtpServer(); + }); + + it("should schedule retry on retryable failure and release email for next iteration", { timeout: 60000 }, async ({ expect }) => { + await Project.createAndSwitch({ + display_name: "Test Deferred Retry Project", + config: { + email_config: tempFailSmtpConfig, + }, + }); + // Create API keys that don't expire (JWT admin tokens expire in 60s which is too short for retry tests) + await InternalApiKey.createAndSetProjectKeys(); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for retry logic", + subject: "Retry Test Email", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox + const initialEmail = await waitForOutboxEmail("Retry Test Email"); + const emailId = initialEmail.id; + + // Wait for first send attempt to complete (450 response is immediate) + const emailAfterFirstAttempt = await waitForAttemptCount(emailId, 1, 30000); + + // Verify the email was released for a DIFFERENT queue iteration to pick up + // - status should NOT be server-error (retries remaining) + expect(emailAfterFirstAttempt.send_retries).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors).not.toBeNull(); + expect(emailAfterFirstAttempt.send_attempt_errors?.length).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors?.[0].attempt_number).toBe(1); + expect(emailAfterFirstAttempt.send_attempt_errors?.[0].external_message).toContain("450"); + expect(emailAfterFirstAttempt.status).not.toBe("server-error"); + + // Status could be: + // - "scheduled": isQueued=false, waiting for nextSendRetryAt to pass + // - "queued": queueReadyEmails() ran, isQueued=true, waiting to be claimed + // - "sending": next iteration already picked it up + expect(["scheduled", "queued", "sending"]).toContain(emailAfterFirstAttempt.status); + + logIfTestFails("Email after first retry attempt", emailAfterFirstAttempt); + }); + + it("should retry emails until max attempts exhausted, then mark as server-error", { timeout: 150000 }, async ({ expect }) => { + await Project.createAndSwitch({ + display_name: "Test Retry Exhaustion Project", + config: { + email_config: tempFailSmtpConfig, + }, + }); + // Create API keys that don't expire (JWT admin tokens expire in 60s which is too short for retry tests) + await InternalApiKey.createAndSetProjectKeys(); + + const mailbox = backendContext.value.mailbox; + const createUserResponse = await niceBackendFetch("/api/v1/users", { + method: "POST", + accessType: "server", + body: { + primary_email: mailbox.emailAddress, + primary_email_verified: true, + }, + }); + expect(createUserResponse.status).toBe(201); + const userId = createUserResponse.body.id; + + const sendResponse = await niceBackendFetch("/api/v1/emails/send-email", { + method: "POST", + accessType: "server", + body: { + user_ids: [userId], + html: "Test email for retry exhaustion", + subject: "Retry Exhaustion Test", + }, + }); + expect(sendResponse.status).toBe(200); + + // Wait for the email to appear in the outbox + const initialEmail = await waitForOutboxEmail("Retry Exhaustion Test"); + const emailId = initialEmail.id; + + // Wait for all retries to exhaust (MAX_SEND_ATTEMPTS = 5) + // With 450 errors (immediate) + exponential backoff (2s base * 2^attempt), worst case ~90s + const maxWaitMs = 120000; + const startTime = performance.now(); + let email = await getOutboxEmailById(emailId); + while (performance.now() - startTime < maxWaitMs && email.status !== "server-error") { + await wait(1000); + email = await getOutboxEmailById(emailId); + } + + // Log the email object to help debug undefined status issues + logIfTestFails("Email after retry exhaustion loop", email); + + expect(email.status).toBe("server-error"); + expect(email.send_retries).toBe(5); // MAX_SEND_ATTEMPTS + expect(email.send_attempt_errors?.length).toBe(5); + // No more retries scheduled + expect(email.next_send_retry_at_millis).toBeNull(); + }); + }); +}); + diff --git a/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts b/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts index ff31fa42db..503086930a 100644 --- a/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts +++ b/apps/e2e/tests/backend/endpoints/api/v1/emails/outbox-api.test.ts @@ -630,7 +630,10 @@ describe("email outbox API", () => { "has_rendered": false, "id": "", "is_paused": true, + "next_send_retry_at_millis": null, "scheduled_at_millis": , + "send_attempt_errors": null, + "send_retries": 0, "simple_status": "in-progress", "skip_deliverability_check": false, "status": "paused", diff --git a/packages/stack-shared/src/interface/crud/email-outbox.ts b/packages/stack-shared/src/interface/crud/email-outbox.ts index 3629f81b7e..9759289cd0 100644 --- a/packages/stack-shared/src/interface/crud/email-outbox.ts +++ b/packages/stack-shared/src/interface/crud/email-outbox.ts @@ -40,6 +40,19 @@ const emailOutboxBaseSchema = fieldSchema.yupObject({ skip_deliverability_check: fieldSchema.yupBoolean().defined(), scheduled_at_millis: fieldSchema.yupNumber().defined(), + // Retry-related fields (for debugging/testing deferred retry logic) + send_retries: fieldSchema.yupNumber().defined(), + next_send_retry_at_millis: fieldSchema.yupNumber().nullable().defined(), + // Array of errors from each failed send attempt, each with internal/external messages + send_attempt_errors: fieldSchema.yupArray(fieldSchema.yupObject({ + attempt_number: fieldSchema.yupNumber().defined(), + timestamp: fieldSchema.yupString().defined(), + external_message: fieldSchema.yupString().defined(), + external_details: fieldSchema.yupRecord(fieldSchema.yupString(), fieldSchema.jsonSchema).defined(), + internal_message: fieldSchema.yupString().defined(), + internal_details: fieldSchema.yupRecord(fieldSchema.yupString(), fieldSchema.jsonSchema).defined(), + }).defined()).nullable().defined(), + status: fieldSchema.yupString().defined(), simple_status: fieldSchema.yupString().defined(), diff --git a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts index 8f9b331b21..d7b0d41a93 100644 --- a/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts +++ b/packages/template/src/lib/stack-app/apps/implementations/admin-app-impl.ts @@ -733,6 +733,24 @@ export class _StackAdminAppImplIncomplete, + internal_message: string, + internal_details: Record, + }>).map((e) => ({ + attemptNumber: e.attempt_number, + timestamp: e.timestamp, + externalMessage: e.external_message, + externalDetails: e.external_details, + internalMessage: e.internal_message, + internalDetails: e.internal_details, + })) : null, }; // Rendered fields (available after rendering completes successfully) diff --git a/packages/template/src/lib/stack-app/email/index.ts b/packages/template/src/lib/stack-app/email/index.ts index 36a73e5ba8..e33ea61166 100644 --- a/packages/template/src/lib/stack-app/email/index.ts +++ b/packages/template/src/lib/stack-app/email/index.ts @@ -36,6 +36,16 @@ export type AdminEmailOutboxSimpleStatus = | "ok" | "error"; +// Error entry from a failed send attempt +export type AdminSendAttemptError = { + attemptNumber: number, + timestamp: string, + externalMessage: string, + externalDetails: Record, + internalMessage: string, + internalDetails: Record, +}; + // =============================== BASE TYPES =============================== // Base fields present on all emails @@ -48,6 +58,10 @@ type AdminEmailOutboxBase = { isPaused: false, hasRendered: false, hasDelivered: false, + // Retry tracking fields + sendRetries: number, + nextSendRetryAt: Date | null, + sendAttemptErrors: AdminSendAttemptError[] | null, }; // Fields available after rendering completes successfully diff --git a/packages/template/src/lib/stack-app/index.ts b/packages/template/src/lib/stack-app/index.ts index 87bf010a0f..01f9b11af1 100644 --- a/packages/template/src/lib/stack-app/index.ts +++ b/packages/template/src/lib/stack-app/index.ts @@ -56,6 +56,7 @@ export type { AdminEmailOutboxRecipient, AdminEmailOutboxSimpleStatus, AdminEmailOutboxStatus, + AdminSendAttemptError, AdminSentEmail } from "./email";