Merge branch 'main' into feature/eja-add-custom-review-instructions

eddalmond1 · web-flow · commit 0c369bc41fa0 · 2026-03-05T10:53:12.000Z
diff --git a/infrastructure/stacks/api-layer/variables.tf b/infrastructure/stacks/api-layer/variables.tf
@@ -12,8 +12,8 @@ variable "SPLUNK_HEC_ENDPOINT" {
 # WAF deployment environments (list of environment names where WAF should be deployed)
 variable "waf_enabled_environments" {
   type        = list(string)
-  description = "Environments in which WAF resources are deployed. Adjust to disable in test after evaluation."
-  default     = ["dev", "preprod", "prod"]
+  description = "Environments in which WAF resources are deployed"
+  default     = ["preprod", "prod"]
 }
 
 variable "OPERATOR_EMAILS" {
diff --git a/infrastructure/stacks/api-layer/waf.tf b/infrastructure/stacks/api-layer/waf.tf
@@ -1,11 +1,8 @@
 # WAF Web ACL for API Gateway
-# Only deployed in production environment for cost optimization
-# Initially all rules are in COUNT mode to monitor traffic patterns
-
 resource "aws_wafv2_web_acl" "api_gateway" {
   count       = local.waf_enabled ? 1 : 0
   name        = "${local.workspace}-eligibility-signposting-api-waf"
-  description = "WAF Web ACL for Eligibility Signposting API Gateway - Production"
+  description = "WAF Web ACL for Eligibility Signposting API Gateway"
   scope       = "REGIONAL"
 
   default_action {
@@ -19,7 +16,7 @@ resource "aws_wafv2_web_acl" "api_gateway" {
     priority = 10
 
     override_action {
-      count {} # Start in count mode - change to none {} when ready to block
+      none {}
     }
 
     statement {
@@ -43,13 +40,21 @@ resource "aws_wafv2_web_acl" "api_gateway" {
     priority = 20
 
     override_action {
-      count {} # Start in count mode - change to none {} when ready to block
+      none {}
     }
 
     statement {
       managed_rule_group_statement {
         vendor_name = "AWS"
         name        = "AWSManagedRulesCommonRuleSet"
+
+        # Override NoUserAgent_Header to count only - APIM health checks send no User-Agent
+        rule_action_override {
+          name = "NoUserAgent_Header"
+          action_to_use {
+            count {}
+          }
+        }
       }
     }
 
@@ -93,12 +98,12 @@ resource "aws_wafv2_web_acl" "api_gateway" {
     priority = 40
 
     action {
-      count {} # Start in count mode - change to block {} when ready
+      block {}
     }
 
     statement {
       rate_based_statement {
-        limit              = 2000 # Requests per 5-minute period per IP
+        limit              = 300000 # 1000 TPS - we should tie this to other rate limits
         aggregate_key_type = "IP"
       }
     }
@@ -110,30 +115,31 @@ resource "aws_wafv2_web_acl" "api_gateway" {
     }
   }
 
-  # Rule 5: Geographic Monitoring Rule - Monitor non-UK traffic (COUNT only)
-  # NHS-specific requirement: initially monitor requests originating from outside GB
-  # This rule COUNTS any request whose geo country code is not GB (does not block)
+  # Rule 5: Geographic Block Rule - Block non-UK traffic
+  # Blocks requests from outside the allowed country list.
+  # In prod: GB only - all legitimate traffic must originate from within the UK
+  # In preprod: GB + US - GitHub Actions integration tests run from US-based servers
   rule {
-    name     = "MonitorNonUK"
+    name     = "BlockNonUK"
     priority = 50
 
     action {
-      count {}
+      block {}
     }
 
     statement {
       not_statement {
         statement {
           geo_match_statement {
-            country_codes = ["GB"] # United Kingdom only (does NOT include Crown Dependencies)
+            country_codes = var.environment == "preprod" ? ["GB", "US"] : ["GB"]
           }
         }
       }
     }
 
     visibility_config {
       cloudwatch_metrics_enabled = true
-      metric_name                = "MonitorNonUK"
+      metric_name                = "BlockNonUK"
       sampled_requests_enabled   = true
     }
   }
diff --git a/infrastructure/stacks/api-layer/waf_alarms.tf b/infrastructure/stacks/api-layer/waf_alarms.tf
@@ -98,17 +98,20 @@ resource "aws_cloudwatch_metric_alarm" "waf_bad_inputs_blocks" {
 }
 
 # Alarm for rate limit violations (overall)
+# Rate limit is set to 300,000 requests per 5 minutes per source IP (1000 TPS, i.e. 2x the 500 TPS peak).
+# Any block at this threshold is a serious incident - a single IP would need to exceed
+# 300k requests in 5 minutes, which indicates a runaway or compromised proxy.
 resource "aws_cloudwatch_metric_alarm" "waf_rate_limit_blocks" {
   count               = local.waf_enabled ? 1 : 0
   alarm_name          = "WAF-RateLimit-Blocks-${local.workspace}"
-  alarm_description   = "Alerts when requests are rate-limited (potential DDoS)"
+  alarm_description   = "Alerts when requests are rate-limited - at 300k/5min limit this indicates a runaway or compromised proxy"
   comparison_operator = "GreaterThanThreshold"
-  evaluation_periods  = 2
+  evaluation_periods  = 1
   metric_name         = "BlockedRequests"
   namespace           = "AWS/WAFV2"
   period              = 300
   statistic           = "Sum"
-  threshold           = 50 # Alert after 50 rate-limited requests
+  threshold           = 1 # Any block at this limit is a serious incident
   treat_missing_data  = "notBreaching"
 
   dimensions = {
@@ -129,14 +132,16 @@ resource "aws_cloudwatch_metric_alarm" "waf_rate_limit_blocks" {
   )
 }
 
-# Alarm for non-UK rate limit violations
-resource "aws_cloudwatch_metric_alarm" "waf_non_uk_counted" {
+# Alarm for blocked non-UK requests
+# In preprod US is also allowed (for GitHub Actions), so this alarm fires on traffic
+# from countries outside GB+US. In prod it fires on anything outside GB.
+resource "aws_cloudwatch_metric_alarm" "waf_non_uk_blocked" {
   count               = local.waf_enabled ? 1 : 0
-  alarm_name          = "WAF-NonUK-CountedRequests-${local.workspace}"
-  alarm_description   = "Alerts when non-UK requests are observed (COUNT mode) by geo rule"
+  alarm_name          = "WAF-NonUK-BlockedRequests-${local.workspace}"
+  alarm_description   = "Alerts when non-UK requests are blocked by geo rule - may indicate stolen mTLS cert use from outside UK"
   comparison_operator = "GreaterThanThreshold"
   evaluation_periods  = 2
-  metric_name         = "CountedRequests"
+  metric_name         = "BlockedRequests"
   namespace           = "AWS/WAFV2"
   period              = 300
   statistic           = "Sum"
@@ -145,7 +150,7 @@ resource "aws_cloudwatch_metric_alarm" "waf_non_uk_counted" {
 
   dimensions = {
     Region = var.default_aws_region
-    Rule   = "MonitorNonUK"
+    Rule   = "BlockNonUK"
     WebACL = aws_wafv2_web_acl.api_gateway[0].name
   }
 
@@ -154,8 +159,8 @@ resource "aws_cloudwatch_metric_alarm" "waf_non_uk_counted" {
   tags = merge(
     local.tags,
     {
-      Name        = "WAF-NonUK-CountedRequests"
-      Severity    = "medium"
+      Name        = "WAF-NonUK-BlockedRequests"
+      Severity    = "high"
       Environment = var.environment
     }
   )
@@ -165,14 +170,14 @@ resource "aws_cloudwatch_metric_alarm" "waf_non_uk_counted" {
 resource "aws_cloudwatch_metric_alarm" "waf_all_requests_high" {
   count               = local.waf_enabled ? 1 : 0
   alarm_name          = "WAF-AllRequests-High-${local.workspace}"
-  alarm_description   = "Monitors total request volume through WAF"
+  alarm_description   = "Monitors total allowed request volume through WAF"
   comparison_operator = "GreaterThanThreshold"
   evaluation_periods  = 2
   metric_name         = "AllowedRequests"
   namespace           = "AWS/WAFV2"
   period              = 300
   statistic           = "Sum"
-  threshold           = 10000 # Adjust based on expected traffic
+  threshold           = 300000 # 2x peak (500 TPS = 150k/5min); alert above 300k/5min
   treat_missing_data  = "notBreaching"
 
   dimensions = {
@@ -192,19 +197,21 @@ resource "aws_cloudwatch_metric_alarm" "waf_all_requests_high" {
   )
 }
 
-# Alarm for monitoring counted requests (during initial count mode)
-# This helps identify if rules would block legitimate traffic
+# Alarm for counted requests (NoUserAgent_Header override)
+# The CRS NoUserAgent_Header sub-rule is kept in COUNT to allow the API proxy healthcheck.
+# This alarm alerts if count spikes unexpectedly, which could indicate rule misconfiguration
+# or unexpected traffic patterns hitting that override.
 resource "aws_cloudwatch_metric_alarm" "waf_counted_requests_monitoring" {
   count               = local.waf_enabled ? 1 : 0
   alarm_name          = "WAF-CountedRequests-Monitoring-${local.workspace}"
-  alarm_description   = "Monitors requests that would be blocked if rules were active (COUNT mode)"
+  alarm_description   = "Monitors counted requests - expected to be low volume (healthcheck NoUserAgent_Header override only)"
   comparison_operator = "GreaterThanThreshold"
   evaluation_periods  = 1
   metric_name         = "CountedRequests"
   namespace           = "AWS/WAFV2"
   period              = 300
   statistic           = "Sum"
-  threshold           = 100 # Alert if many requests would be blocked
+  threshold           = 100 # Alert if count spikes beyond normal healthcheck frequency
   treat_missing_data  = "notBreaching"
 
   dimensions = {
@@ -220,7 +227,7 @@ resource "aws_cloudwatch_metric_alarm" "waf_counted_requests_monitoring" {
       Name        = "WAF-CountedRequests-Monitoring"
       Severity    = "low"
       Environment = var.environment
-      Purpose     = "Initial monitoring during COUNT mode phase"
+      Purpose     = "Monitor NoUserAgent_Header count override for healthcheck proxy"
     }
   )
 }
diff --git a/poetry.lock b/poetry.lock

Original file line number	Diff line number	Diff line change
`@@ -1,11 +1,8 @@`
`1`	`1`	`# WAF Web ACL for API Gateway`
`2`		`-# Only deployed in production environment for cost optimization`
`3`		`-# Initially all rules are in COUNT mode to monitor traffic patterns`
`4`		`-`
`5`	`2`	`resource "aws_wafv2_web_acl" "api_gateway" {`
`6`	`3`	`count = local.waf_enabled ? 1 : 0`
`7`	`4`	`name = "${local.workspace}-eligibility-signposting-api-waf"`
`8`		`- description = "WAF Web ACL for Eligibility Signposting API Gateway - Production"`
	`5`	`+ description = "WAF Web ACL for Eligibility Signposting API Gateway"`
`9`	`6`	`scope = "REGIONAL"`
`10`	`7`
`11`	`8`	`default_action {`
`@@ -19,7 +16,7 @@ resource "aws_wafv2_web_acl" "api_gateway" {`
`19`	`16`	`priority = 10`
`20`	`17`
`21`	`18`	`override_action {`
`22`		`- count {} # Start in count mode - change to none {} when ready to block`
	`19`	`+ none {}`
`23`	`20`	`}`
`24`	`21`
`25`	`22`	`statement {`
`@@ -43,13 +40,21 @@ resource "aws_wafv2_web_acl" "api_gateway" {`
`43`	`40`	`priority = 20`
`44`	`41`
`45`	`42`	`override_action {`
`46`		`- count {} # Start in count mode - change to none {} when ready to block`
	`43`	`+ none {}`
`47`	`44`	`}`
`48`	`45`
`49`	`46`	`statement {`
`50`	`47`	`managed_rule_group_statement {`
`51`	`48`	`vendor_name = "AWS"`
`52`	`49`	`name = "AWSManagedRulesCommonRuleSet"`
	`50`	`+`
	`51`	`+ # Override NoUserAgent_Header to count only - APIM health checks send no User-Agent`
	`52`	`+ rule_action_override {`
	`53`	`+ name = "NoUserAgent_Header"`
	`54`	`+ action_to_use {`
	`55`	`+ count {}`
	`56`	`+ }`
	`57`	`+ }`
`53`	`58`	`}`
`54`	`59`	`}`
`55`	`60`
`@@ -93,12 +98,12 @@ resource "aws_wafv2_web_acl" "api_gateway" {`
`93`	`98`	`priority = 40`
`94`	`99`
`95`	`100`	`action {`
`96`		`- count {} # Start in count mode - change to block {} when ready`
	`101`	`+ block {}`
`97`	`102`	`}`
`98`	`103`
`99`	`104`	`statement {`
`100`	`105`	`rate_based_statement {`
`101`		`- limit = 2000 # Requests per 5-minute period per IP`
	`106`	`+ limit = 300000 # 1000 TPS - we should tie this to other rate limits`
`102`	`107`	`aggregate_key_type = "IP"`
`103`	`108`	`}`
`104`	`109`	`}`
`@@ -110,30 +115,31 @@ resource "aws_wafv2_web_acl" "api_gateway" {`
`110`	`115`	`}`
`111`	`116`	`}`
`112`	`117`
`113`		`- # Rule 5: Geographic Monitoring Rule - Monitor non-UK traffic (COUNT only)`
`114`		`- # NHS-specific requirement: initially monitor requests originating from outside GB`
`115`		`- # This rule COUNTS any request whose geo country code is not GB (does not block)`
	`118`	`+ # Rule 5: Geographic Block Rule - Block non-UK traffic`
	`119`	`+ # Blocks requests from outside the allowed country list.`
	`120`	`+ # In prod: GB only - all legitimate traffic must originate from within the UK`
	`121`	`+ # In preprod: GB + US - GitHub Actions integration tests run from US-based servers`
`116`	`122`	`rule {`
`117`		`- name = "MonitorNonUK"`
	`123`	`+ name = "BlockNonUK"`
`118`	`124`	`priority = 50`
`119`	`125`
`120`	`126`	`action {`
`121`		`- count {}`
	`127`	`+ block {}`
`122`	`128`	`}`
`123`	`129`
`124`	`130`	`statement {`
`125`	`131`	`not_statement {`
`126`	`132`	`statement {`
`127`	`133`	`geo_match_statement {`
`128`		`- country_codes = ["GB"] # United Kingdom only (does NOT include Crown Dependencies)`
	`134`	`+ country_codes = var.environment == "preprod" ? ["GB", "US"] : ["GB"]`
`129`	`135`	`}`
`130`	`136`	`}`
`131`	`137`	`}`
`132`	`138`	`}`
`133`	`139`
`134`	`140`	`visibility_config {`
`135`	`141`	`cloudwatch_metrics_enabled = true`
`136`		`- metric_name = "MonitorNonUK"`
	`142`	`+ metric_name = "BlockNonUK"`
`137`	`143`	`sampled_requests_enabled = true`
`138`	`144`	`}`
`139`	`145`	`}`