From 0aa1ccede7b694d7513cdba60e1448b2bd94db38 Mon Sep 17 00:00:00 2001
From: Edd Almond <102675624+eddalmond1@users.noreply.github.com>
Date: Tue, 5 Aug 2025 14:57:32 +0100
Subject: [PATCH 1/5] eli-285 and eli-349 adding cloudwatch alarms for a) security and b) ops - API Gateway and Lambda execution

---
 .../stacks/api-layer/cloudwatch_alarms.tf     | 402 ++++++++++++++++++
 .../github_actions_policies.tf                |  58 +++
 2 files changed, 460 insertions(+)
 create mode 100644 infrastructure/stacks/api-layer/cloudwatch_alarms.tf

diff --git a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
new file mode 100644
index 000000000..196b85522
--- /dev/null
+++ b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
@@ -0,0 +1,402 @@
+locals {
+  # Security alarms based on CloudTrail custom metrics
+  cloudwatch_alarm_config = {
+    UnauthorizedApiCalls = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Unauthorized API calls detected - immediate alert on any occurrence"
+      actions_enabled     = true
+    }
+    ConsoleAuthenticationFailures = {
+      threshold           = 3
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Multiple console authentication failures detected within 5 minutes"
+      actions_enabled     = true
+    }
+    CloudTrailConfigChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "CloudTrail configuration changes detected - immediate alert"
+      actions_enabled     = true
+    }
+    VPCChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "VPC configuration changes detected"
+      actions_enabled     = true
+    }
+    AWSConfigChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "AWS Config service changes detected"
+      actions_enabled     = true
+    }
+    ModificationOfCMKs = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "KMS Customer Managed Key modifications detected - critical security alert"
+      actions_enabled     = true
+    }
+    UnsuccessfulSwitchRole = {
+      threshold           = 5
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 900
+      statistic           = "Sum"
+      alarm_description   = "Multiple unsuccessful role switch attempts detected within 15 minutes"
+      actions_enabled     = true
+    }
+    ConsoleLoginNoMFA = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Console login without MFA detected - security policy violation"
+      actions_enabled     = true
+    }
+    RootAccountUsage = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Root account usage detected - immediate critical alert"
+      actions_enabled     = true
+    }
+    SecurityGroupChange = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Security group changes detected"
+      actions_enabled     = true
+    }
+    RouteTableChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Route table changes detected"
+      actions_enabled     = true
+    }
+    IAMPolicyChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "IAM policy changes detected - immediate security alert"
+      actions_enabled     = true
+    }
+    s3BucketPolicyChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "S3 bucket policy changes detected"
+      actions_enabled     = true
+    }
+    ChangesToNetworkGateways = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Network gateway changes detected"
+      actions_enabled     = true
+    }
+    ChangesToNACLs = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "Network ACL changes detected"
+      actions_enabled     = true
+    }
+    KMSKeyPolicyChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "KMS key policy changes detected - critical security alert"
+      actions_enabled     = true
+    }
+    s3PublicAccessChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "S3 public access changes detected - potential data exposure risk"
+      actions_enabled     = true
+    }
+    CloudWatchAlarmChanges = {
+      threshold           = 1
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      statistic           = "Sum"
+      alarm_description   = "CloudWatch alarm configuration changes detected"
+      actions_enabled     = true
+    }
+    LambdaFunctionChanges = {
+      threshold           = 2
+      comparison_operator = "GreaterThanOrEqualToThreshold"
+      evaluation_periods  = 1
+      period              = 600
+      statistic           = "Sum"
+      alarm_description   = "Multiple Lambda function changes detected within 10 minutes"
+      actions_enabled     = true
+    }
+  }
+
+  # API Gateway alarm configuration
+  api_gateway_alarm_config = {
+    "5XXError" = {
+      metric_name         = "5XXError"
+      namespace           = "AWS/ApiGateway"
+      statistic           = "Sum"
+      threshold           = 0
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      alarm_description   = "API Gateway 5XX errors detected - critical server-side issues"
+      severity            = "critical"
+      treat_missing_data  = "notBreaching"
+    }
+    "4XXError" = {
+      metric_name         = "4XXError"
+      namespace           = "AWS/ApiGateway"
+      statistic           = "Sum"
+      threshold           = 50 # Adjust based on expected traffic
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 2
+      period              = 300
+      alarm_description   = "High rate of API Gateway 4XX errors - client-side issues or auth problems"
+      severity            = "high"
+      treat_missing_data  = "notBreaching"
+    }
+    "LatencyP95" = {
+      metric_name         = "Latency"
+      namespace           = "AWS/ApiGateway"
+      statistic           = "Average" # Ignored by the alarm resource when extended_statistic is set
+      extended_statistic  = "p95"
+      threshold           = 1000
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      alarm_description   = "API Gateway P95 latency > 1000ms - performance degradation"
+      severity            = "high"
+      treat_missing_data  = "notBreaching"
+    }
+    "IntegrationLatencyP95" = {
+      metric_name         = "IntegrationLatency"
+      namespace           = "AWS/ApiGateway"
+      statistic           = "Average" # Ignored by the alarm resource when extended_statistic is set
+      extended_statistic  = "p95"
+      threshold           = 900
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      alarm_description   = "API Gateway backend (Lambda) P95 latency > 900ms - backend performance issues"
+      severity            = "high"
+      treat_missing_data  = "notBreaching"
+    }
+    "CountDrop" = {
+      metric_name         = "Count"
+      namespace           = "AWS/ApiGateway"
+      statistic           = "Sum"
+      threshold           = 10 # Minimum expected requests per 5min - adjust when live
+      comparison_operator = "LessThanThreshold"
+      evaluation_periods  = 2
+      period              = 300
+      alarm_description   = "API Gateway request volume drop - possible outage (enable when service is live)"
+      severity            = "high"
+      treat_missing_data  = "breaching" # Missing data could indicate outage
+      actions_enabled     = false       # Disable until service is live
+    }
+  }
+
+  # Lambda alarm configuration
+  lambda_alarm_config = {
+    "Errors" = {
+      metric_name         = "Errors"
+      namespace           = "AWS/Lambda"
+      statistic           = "Sum"
+      threshold           = 0
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      alarm_description   = "Lambda invocation errors detected - critical function failures"
+      severity            = "critical"
+      treat_missing_data  = "notBreaching"
+    }
+    "Throttles" = {
+      metric_name         = "Throttles"
+      namespace           = "AWS/Lambda"
+      statistic           = "Sum"
+      threshold           = 0
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 1
+      period              = 300
+      alarm_description   = "Lambda throttling detected - concurrency limits reached"
+      severity            = "critical"
+      treat_missing_data  = "notBreaching"
+    }
+    "Duration" = {
+      metric_name         = "Duration"
+      namespace           = "AWS/Lambda"
+      statistic           = "Average"
+      threshold           = 27000 # 90% of 30s timeout (adjust based on actual timeout)
+      comparison_operator = "GreaterThanThreshold"
+      evaluation_periods  = 2
+      period              = 300
+      alarm_description   = "Lambda duration approaching timeout - function performance warning"
+      severity            = "warning"
+      treat_missing_data  = "notBreaching"
+    }
+    "InvocationsDrop" = {
+      metric_name         = "Invocations"
+      namespace           = "AWS/Lambda"
+      statistic           = "Sum"
+      threshold           = 5 # Minimum expected invocations per 5min - adjust when live
+      comparison_operator = "LessThanThreshold"
+      evaluation_periods  = 2
+      period              = 300
+      alarm_description   = "Lambda invocation volume drop - possible outage (enable when service is live)"
+      severity            = "high"
+      treat_missing_data  = "breaching" # Missing data could indicate outage
+      actions_enabled     = false       # Disable until service is live
+    }
+  }
+}
+
+# SNS Topic for CloudWatch Alarms
+resource "aws_sns_topic" "cloudwatch_alarms" {
+  name = "cloudwatch-security-alarms"
+
+  tags = {
+    Environment = var.environment
+    Purpose     = "security-alerting"
+    ManagedBy   = "terraform"
+  }
+}
+
+# Security Alarms (CloudTrail-based)
+resource "aws_cloudwatch_metric_alarm" "cloudtrail_custom_metric_alarms" {
+  for_each = local.cloudwatch_alarm_config
+
+  alarm_name          = "SecurityAlert-${each.key}"
+  alarm_description   = each.value.alarm_description
+  actions_enabled     = each.value.actions_enabled
+  metric_name         = each.key
+  namespace           = "security"
+  statistic           = each.value.statistic
+  period              = each.value.period
+  evaluation_periods  = each.value.evaluation_periods
+  threshold           = each.value.threshold
+  comparison_operator = each.value.comparison_operator
+
+  # Treat missing data as not breaching (common for security metrics)
+  treat_missing_data = "notBreaching"
+
+  # Add standard tags for organization
+  tags = {
+    Environment = var.environment
+    AlertType   = "security"
+    Severity    = contains(["RootAccountUsage", "ModificationOfCMKs", "KMSKeyPolicyChanges", "ConsoleLoginNoMFA"], each.key) ? "critical" : "high"
+    ManagedBy   = "terraform"
+  }
+
+  alarm_actions = [aws_sns_topic.cloudwatch_alarms.arn]
+}
+
+# API Gateway CloudWatch Alarms
+resource "aws_cloudwatch_metric_alarm" "api_gateway_alarms" {
+  for_each = local.api_gateway_alarm_config
+
+  alarm_name          = "APIGateway-${each.key}"
+  alarm_description   = each.value.alarm_description
+  actions_enabled     = lookup(each.value, "actions_enabled", true)
+  metric_name         = each.value.metric_name
+  namespace           = each.value.namespace
+  statistic           = lookup(each.value, "extended_statistic", null) == null ? each.value.statistic : null
+  extended_statistic  = lookup(each.value, "extended_statistic", null)
+  period              = each.value.period
+  evaluation_periods  = each.value.evaluation_periods
+  threshold           = each.value.threshold
+  comparison_operator = each.value.comparison_operator
+  treat_missing_data  = each.value.treat_missing_data
+
+  # Add dimensions for API Gateway
+  dimensions = {
+    ApiName = "eligibility-signposting-api"
+  }
+
+  tags = {
+    Environment = var.environment
+    AlertType   = "performance"
+    Service     = "api-gateway"
+    Severity    = each.value.severity
+    ManagedBy   = "terraform"
+  }
+
+  alarm_actions = [aws_sns_topic.cloudwatch_alarms.arn]
+}
+
+# Lambda CloudWatch Alarms
+resource "aws_cloudwatch_metric_alarm" "lambda_alarms" {
+  # checkov:skip=CKV_AWS_319: Disabling some alarms until service is live
+  for_each = local.lambda_alarm_config
+
+  alarm_name          = "Lambda-${each.key}"
+  alarm_description   = each.value.alarm_description
+  actions_enabled     = lookup(each.value, "actions_enabled", true)
+  metric_name         = each.value.metric_name
+  namespace           = each.value.namespace
+  statistic           = each.value.statistic
+  period              = each.value.period
+  evaluation_periods  = each.value.evaluation_periods
+  threshold           = each.value.threshold
+  comparison_operator = each.value.comparison_operator
+  treat_missing_data  = each.value.treat_missing_data
+
+  # Add dimensions for Lambda
+  dimensions = {
+    FunctionName = module.eligibility_signposting_lambda_function.aws_lambda_function_name
+  }
+
+  tags = {
+    Environment = var.environment
+    AlertType   = "performance"
+    Service     = "lambda"
+    Severity    = each.value.severity
+    ManagedBy   = "terraform"
+  }
+
+  alarm_actions = [aws_sns_topic.cloudwatch_alarms.arn]
+}
diff --git a/infrastructure/stacks/iams-developer-roles/github_actions_policies.tf b/infrastructure/stacks/iams-developer-roles/github_actions_policies.tf
index 1227415e7..6b69b5025 100644
--- a/infrastructure/stacks/iams-developer-roles/github_actions_policies.tf
+++ b/infrastructure/stacks/iams-developer-roles/github_actions_policies.tf
@@ -475,6 +475,59 @@ resource "aws_iam_policy" "firehose_readonly" {
   tags = merge(local.tags, { Name = "firehose-describe-access" })
 }
 
+resource "aws_iam_policy" "cloudwatch_alarms" {
+  name        = "cloudwatch-alarms-management"
+  description = "Allow GitHub Actions to manage CloudWatch alarms and SNS topics"
+  path        = "/service-policies/"
+
+  policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      {
+        Effect = "Allow",
+        Action = [
+          # CloudWatch Alarms management
+          "cloudwatch:PutMetricAlarm",
+          "cloudwatch:DeleteAlarms",
+          "cloudwatch:DescribeAlarms",
+          "cloudwatch:ListTagsForResource",
+          "cloudwatch:TagResource",
+          "cloudwatch:UntagResource",
+          # SNS Topic management for alarm notifications
+          "sns:CreateTopic",
+          "sns:DeleteTopic",
+          "sns:GetTopicAttributes",
+          "sns:SetTopicAttributes",
+          "sns:ListTagsForResource",
+          "sns:TagResource",
+          "sns:UntagResource",
+          "sns:Subscribe",
+          "sns:Unsubscribe",
+          "sns:ListSubscriptionsByTopic"
+        ],
+        Resource = [
+          "arn:aws:cloudwatch:${var.default_aws_region}:${data.aws_caller_identity.current.account_id}:alarm:*",
+          "arn:aws:sns:${var.default_aws_region}:${data.aws_caller_identity.current.account_id}:cloudwatch-security-alarms*"
+        ]
+      },
+      {
+        # These read-only actions have no resource type in IAM, so they are only
+        # granted when Resource is "*". NOTE(review): sns:Unsubscribe above is
+        # likely wildcard-only too - confirm before relying on it.
+        Effect = "Allow",
+        Action = [
+          "cloudwatch:DescribeAlarmsForMetric",
+          "sns:ListTopics",
+          "sns:ListSubscriptions"
+        ],
+        Resource = "*"
+      }
+    ]
+  })
+
+  tags = merge(local.tags, { Name = "cloudwatch-alarms-management" })
+}
+
 # Attach the policies to the role
 resource "aws_iam_role_policy_attachment" "terraform_state" {
   role       = aws_iam_role.github_actions.name
@@ -520,3 +573,8 @@ resource "aws_iam_role_policy_attachment" "firehose_readonly_attach" {
   role       = aws_iam_role.github_actions.name
   policy_arn = aws_iam_policy.firehose_readonly.arn
 }
+
+resource "aws_iam_role_policy_attachment" "cloudwatch_alarms" {
+  role       = aws_iam_role.github_actions.name
+  policy_arn = aws_iam_policy.cloudwatch_alarms.arn
+}

From 28a295817b6dee97deb750be184304795d4260ae Mon Sep 17 00:00:00 2001
From: Edd Almond <102675624+eddalmond1@users.noreply.github.com>
Date: Tue, 5 Aug 2025 16:59:56 +0100
Subject: [PATCH 2/5] eli-285 - disabling action on API calls as our internal security are triggering this

---
 infrastructure/stacks/api-layer/cloudwatch_alarms.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
index 196b85522..a3ed27834 100644
--- a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
+++ b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
@@ -8,7 +8,7 @@ locals {
       period              = 300
       statistic           = "Sum"
       alarm_description   = "Unauthorized API calls detected - immediate alert on any occurrence"
-      actions_enabled     = true
+      actions_enabled     = false # Disabling as cloudhealth role is triggering this alarm
     }
     ConsoleAuthenticationFailures = {
       threshold           = 3

From 1ad50a965cd701dbb1f75ab236bbc07b4cb748fe Mon Sep 17 00:00:00 2001
From: Edd Almond <102675624+eddalmond1@users.noreply.github.com>
Date: Tue, 5 Aug 2025 17:08:11 +0100
Subject: [PATCH 3/5] eli-285 and 349 adding kms for sns, checkov skip for disabled alarms

---
 .../stacks/api-layer/cloudwatch_alarms.tf     | 16 ++++++
 .../stacks/api-layer/iam_policies.tf          | 49 +++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
index a3ed27834..6ec0d2209 100644
--- a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
+++ b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
@@ -299,6 +299,8 @@ locals {
 resource "aws_sns_topic" "cloudwatch_alarms" {
   name = "cloudwatch-security-alarms"
 
+  kms_master_key_id = aws_kms_key.sns_encryption_key.id
+
   tags = {
     Environment = var.environment
     Purpose     = "security-alerting"
@@ -306,8 +308,21 @@ resource "aws_sns_topic" "cloudwatch_alarms" {
   }
 }
 
+resource "aws_kms_key" "sns_encryption_key" {
+  description             = "KMS key for encrypting CloudWatch alarms SNS topic"
+  deletion_window_in_days = 7
+
+  tags = {
+    Name        = "cloudwatch-alarms-sns-encryption-key"
+    Environment = var.environment
+    Purpose     = "sns-encryption"
+    ManagedBy   = "terraform"
+  }
+}
+
 # Security Alarms (CloudTrail-based)
 resource "aws_cloudwatch_metric_alarm" "cloudtrail_custom_metric_alarms" {
+  # checkov:skip=CKV_AWS_319: Disabling some alarms until service is live
   for_each = local.cloudwatch_alarm_config
 
   alarm_name          = "SecurityAlert-${each.key}"
@@ -337,6 +352,7 @@ resource "aws_cloudwatch_metric_alarm" "cloudtrail_custom_metric_alarms" {
 
 # API Gateway CloudWatch Alarms
 resource "aws_cloudwatch_metric_alarm" "api_gateway_alarms" {
+  # checkov:skip=CKV_AWS_319: Disabling some alarms until service is live
   for_each = local.api_gateway_alarm_config
 
   alarm_name          = "APIGateway-${each.key}"
diff --git a/infrastructure/stacks/api-layer/iam_policies.tf b/infrastructure/stacks/api-layer/iam_policies.tf
index 8af65233e..5f384895c 100644
--- a/infrastructure/stacks/api-layer/iam_policies.tf
+++ b/infrastructure/stacks/api-layer/iam_policies.tf
@@ -358,3 +358,52 @@ resource "aws_iam_role_policy" "lambda_xray_tracing_policy" {
   role   = aws_iam_role.eligibility_lambda_role.id
   policy = data.aws_iam_policy_document.lambda_xray_tracing_permissions_policy.json
 }
+
+# KMS Key Policy for SNS encryption
+resource "aws_kms_key_policy" "sns_encryption_key_policy" {
+  key_id = aws_kms_key.sns_encryption_key.id
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Sid    = "EnableIAMRootPermissions"
+        Effect = "Allow"
+        Principal = {
+          AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
+        }
+        Action   = "kms:*"
+        Resource = "*"
+      },
+      {
+        Sid    = "AllowCloudWatchAlarmsAccess"
+        Effect = "Allow"
+        Principal = {
+          Service = "cloudwatch.amazonaws.com"
+        }
+        Action = [
+          "kms:Encrypt",
+          "kms:Decrypt",
+          "kms:ReEncrypt*",
+          "kms:GenerateDataKey*",
+          "kms:DescribeKey"
+        ]
+        Resource = "*"
+      },
+      {
+        Sid    = "AllowSNSServiceAccess"
+        Effect = "Allow"
+        Principal = {
+          Service = "sns.amazonaws.com"
+        }
+        Action = [
+          "kms:Encrypt",
+          "kms:Decrypt",
+          "kms:ReEncrypt*",
+          "kms:GenerateDataKey*",
+          "kms:DescribeKey"
+        ]
+        Resource = "*"
+      }
+    ]
+  })
+}

From 4e18992efe9596f2be137327f1ec5ae793062f5c Mon Sep 17 00:00:00 2001
From: Edd Almond <102675624+eddalmond1@users.noreply.github.com>
Date: Tue, 5 Aug 2025 17:13:20 +0100
Subject: [PATCH 4/5] eli-285 enable kms key rotation

---
 infrastructure/stacks/api-layer/cloudwatch_alarms.tf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
index 6ec0d2209..4c252a38d 100644
--- a/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
+++ b/infrastructure/stacks/api-layer/cloudwatch_alarms.tf
@@ -311,6 +311,7 @@ resource "aws_sns_topic" "cloudwatch_alarms" {
 resource "aws_kms_key" "sns_encryption_key" {
   description             = "KMS key for encrypting CloudWatch alarms SNS topic"
   deletion_window_in_days = 7
+  enable_key_rotation     = true
 
   tags = {
     Name        = "cloudwatch-alarms-sns-encryption-key"

From fdb4f0eb4c981760dda3918ed73eb27fb1a3e81c Mon Sep 17 00:00:00 2001
From: Edd Almond <102675624+eddalmond1@users.noreply.github.com>
Date: Tue, 5 Aug 2025 17:18:29 +0100
Subject: [PATCH 5/5] eli-285 get rid of false flag gitleak

---
 scripts/config/gitleaks.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/config/gitleaks.toml b/scripts/config/gitleaks.toml
index 175e20678..66a3d7e94 100644
--- a/scripts/config/gitleaks.toml
+++ b/scripts/config/gitleaks.toml
@@ -17,4 +17,4 @@ regexes = [
 
 [allowlist]
 paths = ['''.terraform.lock.hcl''', '''poetry.lock''', '''yarn.lock''']
-stopwords = ['''dummy_key''', '''dummy_secret''', '''192.0.0.1''']
+stopwords = ['''dummy_key''', '''dummy_secret''', '''192.0.0.1''', '''prance = "^25.4.8.0"''', '''25.4.8.0''']