
Commit c65cb27

Fix: [AEA-0000] - Add an upper time limit for the lambda runtime (#1846)
## Summary

- Routine Change

### Details

Lambdas have a timeout. Under heavy traffic the NotifyProcessor lambda kept processing the queue without ever observing it empty, so it ran past that timeout. This change adds an upper limit on how long a lambda may spend processing the queue before it gives up and yields. Note that the old behaviour still works, but it throws a timeout error that we don't want to see.
1 parent 0b0518c commit c65cb27

5 files changed: 88 additions & 4 deletions
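
The heart of the change is in `packages/nhsNotifyLambda/src/nhsNotifyLambda.ts`. A consolidated sketch of the loop as it reads after this commit (`drainQueue`, `processBatch`, and `logger` are the package's existing helpers, used here exactly as in the diff below):

```typescript
// Stop polling before the Lambda's own 15-minute timeout can fire.
const MAX_QUEUE_RUNTIME = 14 * 60 * 1000 // 14 minutes in milliseconds

/**
 * Drain the queue until empty or the MAX_QUEUE_RUNTIME has passed,
 * processing each batch.
 */
async function drainAndProcess(routingId: string): Promise<void> {
  const start = Date.now()
  let empty = false
  while (!empty) {
    // Yield with a warning instead of letting the runtime kill the invocation.
    if (Date.now() - start >= MAX_QUEUE_RUNTIME) {
      logger.warn("drainAndProcess timed out; exiting before queue is empty", {
        maxRuntimeMilliseconds: MAX_QUEUE_RUNTIME
      })
      break
    }
    const {messages, isEmpty} = await drainQueue(logger, 100)
    empty = isEmpty
    await processBatch(messages, routingId)
  }
}
```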


SAMtemplates/alarms/main.yaml

Lines changed: 40 additions & 0 deletions
```diff
@@ -18,6 +18,9 @@ Parameters:
 
   ConvertRequestToFhirFormatFunctionName:
     Type: String
+
+  NotifyProcessorFunctionName:
+    Type: String
 
   EnableAlerts:
     Type: String
@@ -309,3 +312,40 @@ Resources:
         - !ImportValue lambda-resources:SlackAlertsSnsTopicArn
       OKActions:
         - !ImportValue lambda-resources:SlackAlertsSnsTopicArn
+
+  NotifyProcessorTimeoutsMetricFilter:
+    Type: AWS::Logs::MetricFilter
+    Properties:
+      FilterName: NotifyProcessorTimeouts
+      FilterPattern: !Sub '{ ($.level = "INFO") && ($.function_name = "${NotifyProcessorFunctionName}") && ($.message = %drainAndProcess timed out; exiting before queue is empty%) }'
+      LogGroupName:
+        Fn::ImportValue: !Sub ${StackName}:functions:${NotifyProcessorFunctionName}:LambdaLogGroupName
+      MetricTransformations:
+        - MetricNamespace: LambdaLogFilterMetrics
+          MetricName: TimeoutCount
+          MetricValue: 1
+          Unit: Count
+          Dimensions:
+            - Key: FunctionName
+              Value: $.function_name
+
+  NotifyProcessorTimeoutsAlarm:
+    Type: AWS::CloudWatch::Alarm
+    Properties:
+      AlarmDescription: Count of NotifyProcessor invocations that timed out
+      AlarmName: !Sub ${StackName}_NotifyProcessor_Timeouts
+      Namespace: LambdaLogFilterMetrics
+      MetricName: TimeoutCount
+      Period: 60 # seconds
+      EvaluationPeriods: 1
+      Statistic: Sum
+      ComparisonOperator: GreaterThanOrEqualToThreshold
+      Threshold: 1
+      TreatMissingData: notBreaching
+      ActionsEnabled: !Ref EnableAlerts
+      AlarmActions:
+        - !ImportValue lambda-resources:SlackAlertsSnsTopicArn
+      InsufficientDataActions:
+        - !ImportValue lambda-resources:SlackAlertsSnsTopicArn
+      OKActions:
+        - !ImportValue lambda-resources:SlackAlertsSnsTopicArn
```
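
The metric filter only produces a data point if its pattern matches the JSON structure the logger actually emits, so it is worth verifying the pattern against a sample event before deploying. A sketch using the CloudWatch Logs `TestMetricFilter` API (the sample event below is hand-written for illustration, not captured output, and the pattern is simplified without the `!Sub` substitution):

```typescript
import {CloudWatchLogsClient, TestMetricFilterCommand} from "@aws-sdk/client-cloudwatch-logs"

const client = new CloudWatchLogsClient({})

// Replay a hand-written log line against the filter pattern to see
// whether CloudWatch would count it.
const {matches} = await client.send(new TestMetricFilterCommand({
  filterPattern: '{ ($.level = "INFO") && ($.message = %drainAndProcess timed out%) }',
  logEventMessages: [
    JSON.stringify({
      level: "INFO",
      function_name: "psu-NotifyProcessor", // illustrative value, not the deployed name
      message: "drainAndProcess timed out; exiting before queue is empty"
    })
  ]
}))

console.log(matches) // non-empty if the pattern captures the event
```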

SAMtemplates/functions/main.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -392,6 +392,7 @@ Resources:
     Type: AWS::Serverless::Function
     Properties:
       FunctionName: !Sub ${StackName}-NotifyProcessor
+      Timeout: 900 # 15 minutes
       CodeUri: ../../packages/
       Handler: nhsNotifyLambda.handler
       Role: !GetAtt NotifyProcessorResources.Outputs.LambdaRoleArn
```
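
900 seconds is the longest timeout Lambda allows, and the 14-minute `MAX_QUEUE_RUNTIME` in the handler leaves about a minute of headroom beneath it. An alternative, not used by this commit, would be to derive the budget from the invocation context so the two values cannot drift apart; a hypothetical sketch:

```typescript
import type {Context} from "aws-lambda"

// Hypothetical helper: budget the drain loop off the real remaining time
// rather than a hard-coded constant, keeping a fixed safety margin.
const SAFETY_MARGIN_MS = 60 * 1000 // one minute of headroom (illustrative)

function queueRuntimeBudget(context: Context): number {
  return context.getRemainingTimeInMillis() - SAFETY_MARGIN_MS
}
```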

SAMtemplates/main_template.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -193,5 +193,6 @@ Resources:
       UpdatePrescriptionStatusFunctionName: !GetAtt Functions.Outputs.UpdatePrescriptionStatusFunctionName
       PrescriptionStatusUpdatesTableName: !GetAtt Tables.Outputs.PrescriptionStatusUpdatesTableName
       ConvertRequestToFhirFormatFunctionName: !GetAtt Functions.Outputs.ConvertRequestToFhirFormatFunctionName
+      NotifyProcessorFunctionName: !GetAtt Functions.Outputs.NotifyProcessorFunctionName
       DynamoDBUtilizationPercentageThreshold: !Ref DynamoDBUtilizationPercentageThreshold
       EnableAlerts: !Ref EnableAlerts
```

packages/nhsNotifyLambda/src/nhsNotifyLambda.ts

Lines changed: 12 additions & 3 deletions
```diff
@@ -20,6 +20,8 @@ import {
 
 const logger = new Logger({serviceName: "nhsNotify"})
 
+const MAX_QUEUE_RUNTIME = 14*60*1000 // 14 minutes, to avoid Lambda timeout issues (timeout is 15 minutes)
+
 /**
  * Process a single batch of SQS messages: filter, notify, persist, and clean up.
  */
@@ -28,7 +30,6 @@ async function processBatch(
   routingId: string
 ): Promise<void> {
   if (messages.length === 0) {
-    console.log("No messages to process")
     logger.info("No messages to process")
     return
   }
@@ -80,14 +81,22 @@ function logSuppression(suppressedCount: number, total: number): void {
 }
 
 /**
- * Drain the queue until empty, processing each batch.
+ * Drain the queue until empty or the MAX_QUEUE_RUNTIME has passed, processing each batch.
  */
 async function drainAndProcess(routingId: string): Promise<void> {
+  const start = Date.now()
   let empty = false
   while (!empty) {
+    if (Date.now() - start >= MAX_QUEUE_RUNTIME) {
+      logger.warn("drainAndProcess timed out; exiting before queue is empty",
+        {maxRuntimeMilliseconds: MAX_QUEUE_RUNTIME}
+      )
+      break
+    }
+
     const {messages, isEmpty} = await drainQueue(logger, 100)
     empty = isEmpty
-    console.log(messages)
+
     await processBatch(messages, routingId)
   }
 }
```

packages/nhsNotifyLambda/tests/testNhsNotifyLambda.test.ts

Lines changed: 34 additions & 1 deletion
```diff
@@ -39,13 +39,15 @@ jest.unstable_mockModule(
 
 const mockInfo = jest.fn()
 const mockError = jest.fn()
+const mockWarn = jest.fn()
 jest.unstable_mockModule(
   "@aws-lambda-powertools/logger",
   async () => ({
     __esModule: true,
     Logger: jest.fn().mockImplementation(() => ({
       info: mockInfo,
-      error: mockError
+      error: mockError,
+      warn: mockWarn
     }))
   })
 )
@@ -282,4 +284,35 @@ describe("Unit test for NHS Notify lambda handler", () => {
       {suppressedCount: 1, totalFetched: 1}
     )
   })
+
+  it("Stops draining after 14 minutes", async () => {
+    const msg = constructPSUDataItemMessage()
+
+    // call returns a non-empty batch so the loop should continue
+    mockDrainQueue.mockImplementation(() =>
+      Promise.resolve({messages: [msg], isEmpty: false})
+    )
+
+    const nowSpy = jest.spyOn(Date, "now")
+      .mockImplementationOnce(() => 0) // start time
+      .mockImplementationOnce(() => (14 * 60 * 1000) + 1)
+
+    // Happy-path for everything else
+    mockCheckCooldownForUpdate.mockReturnValueOnce(Promise.resolve(true))
+    mockMakeBatchNotifyRequest.mockReturnValueOnce(Promise.resolve([
+      {...msg, success: true, notifyMessageId: "m1"}
+    ]))
+    mockAddPrescriptionMessagesToNotificationStateStore.mockReturnValueOnce(() => Promise.resolve())
+    mockRemoveSQSMessages.mockReturnValueOnce(() => Promise.resolve())
+
+    await expect(lambdaHandler(mockEventBridgeEvent)).resolves.not.toThrow()
+
+    // Calls a warning
+    expect(mockWarn).toHaveBeenCalledTimes(1)
+
+    // Since the timer advances before the promise resolves, this never gets called
+    expect(mockDrainQueue).toHaveBeenCalledTimes(0)
+
+    nowSpy.mockRestore()
+  })
 })
```
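
The `Date.now` spy above is consumed in call order: the first call records `start`, and the second is the loop's budget check, which already exceeds `MAX_QUEUE_RUNTIME`, so the handler warns and exits before `drainQueue` ever runs. Jest's modern fake timers offer another way to stage this, advancing the clock from inside the mock so exactly one batch is processed first; a sketch reusing the test's mocks (assumes nothing else in the handler awaits real timers):

```typescript
jest.useFakeTimers()
jest.setSystemTime(0) // drainAndProcess records start = 0

// Advance the fake clock from inside the drainQueue mock so the loop's
// next budget check trips after one batch has been processed.
mockDrainQueue.mockImplementation(async () => {
  jest.setSystemTime(14 * 60 * 1000 + 1)
  return {messages: [msg], isEmpty: false}
})

await expect(lambdaHandler(mockEventBridgeEvent)).resolves.not.toThrow()

expect(mockWarn).toHaveBeenCalledTimes(1) // the timeout warning
expect(mockDrainQueue).toHaveBeenCalledTimes(1) // one batch, then the budget check exits

jest.useRealTimers()
```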
