From efaae921aaf8225e87836dbfc7261698ec69b071 Mon Sep 17 00:00:00 2001 From: ariagraham-nhs Date: Fri, 1 May 2026 16:47:03 +0100 Subject: [PATCH 1/6] VED-1170: CloudWatch Alarm without actions --- .../fhir_api_perf_errors_slack_chatbot.tf | 24 +++++++++++++++++++ .../account/fhir_api_perf_errors_sns_topic.tf | 29 +++++++++++++++++++++++ infrastructure/account/kms.tf | 6 +++++ 3 files changed, 59 insertions(+) create mode 100644 infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf create mode 100644 infrastructure/account/fhir_api_perf_errors_sns_topic.tf diff --git a/infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf b/infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf new file mode 100644 index 000000000..88fe488df --- /dev/null +++ b/infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf @@ -0,0 +1,24 @@ +resource "aws_chatbot_slack_channel_configuration" "fhir_api_perf_alerts" { + configuration_name = "${var.environment}-fhir-api-perf-alerts-slack-channel-config" + iam_role_arn = aws_iam_role.fhir_api_perf_alerts_chatbot.arn + slack_channel_id = var.environment == "prod" ? 
"C0B11MJPQ6A" : "C0B1GKZ5S4R" + slack_team_id = "TJ00QR03U" + sns_topic_arns = [aws_sns_topic.fhir_api_perf_alerts.arn] +} + +resource "aws_iam_role" "fhir_api_perf_alerts_chatbot" { + name = "${var.environment}-fhir-api-perf-alerts-chatbot-channel-role" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Sid = "AssumeChatbotRole" + Principal = { + Service = "chatbot.amazonaws.com" + } + }, + ] + }) +} diff --git a/infrastructure/account/fhir_api_perf_errors_sns_topic.tf b/infrastructure/account/fhir_api_perf_errors_sns_topic.tf new file mode 100644 index 000000000..00fcda457 --- /dev/null +++ b/infrastructure/account/fhir_api_perf_errors_sns_topic.tf @@ -0,0 +1,29 @@ +# Topic that CloudWatch alarms publish performance alerts to; encrypted with the existing error-alerts SNS KMS key +resource "aws_sns_topic" "fhir_api_perf_alerts" { + name = "${var.environment}-fhir-api-perf-alerts" + kms_master_key_id = aws_kms_key.error_alerts_sns_encryption_key.arn +} + +# Allow the CloudWatch service to publish, but only on behalf of alarms owned by this account (confused-deputy guard) +resource "aws_sns_topic_policy" "fhir_api_perf_alerts_topic_policy" { + arn = aws_sns_topic.fhir_api_perf_alerts.arn + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Sid = "AllowCloudWatchToPublish", + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "SNS:Publish", + Resource = aws_sns_topic.fhir_api_perf_alerts.arn, + Condition = { + StringEquals = { + "AWS:SourceAccount" = var.imms_account_id + } + } + } + ] + }) +} diff --git a/infrastructure/account/kms.tf b/infrastructure/account/kms.tf index 21e5e2a78..563c7bdc7 100644 --- a/infrastructure/account/kms.tf +++ b/infrastructure/account/kms.tf @@ -225,3 +225,9 @@ resource "aws_kms_alias" "fhir_api_errors_sns_encryption_key" { name = "alias/${var.environment}-fhir-api-errors-imms-sns-encryption" target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id } + +resource "aws_kms_alias" "fhir_api_perf_alerts_sns_encryption_key" { + name = "alias/${var.environment}-fhir-api-perf-alerts-imms-sns-encryption" + target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id +} + From 
70ac3c1b8f94bed160101249489af0028220cb5f Mon Sep 17 00:00:00 2001 From: ariagraham-nhs Date: Tue, 5 May 2026 11:59:43 +0100 Subject: [PATCH 2/6] Add action to existing alarm --- ...lack_chatbot.tf => fhir_api_perf_alerts_slack_chatbot.tf} | 0 ...errors_sns_topic.tf => fhir_api_perf_alerts_sns_topic.tf} | 0 infrastructure/instance/modules/lambda/lambda.tf | 5 +++++ 3 files changed, 5 insertions(+) rename infrastructure/account/{fhir_api_perf_errors_slack_chatbot.tf => fhir_api_perf_alerts_slack_chatbot.tf} (100%) rename infrastructure/account/{fhir_api_perf_errors_sns_topic.tf => fhir_api_perf_alerts_sns_topic.tf} (100%) diff --git a/infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf b/infrastructure/account/fhir_api_perf_alerts_slack_chatbot.tf similarity index 100% rename from infrastructure/account/fhir_api_perf_errors_slack_chatbot.tf rename to infrastructure/account/fhir_api_perf_alerts_slack_chatbot.tf diff --git a/infrastructure/account/fhir_api_perf_errors_sns_topic.tf b/infrastructure/account/fhir_api_perf_alerts_sns_topic.tf similarity index 100% rename from infrastructure/account/fhir_api_perf_errors_sns_topic.tf rename to infrastructure/account/fhir_api_perf_alerts_sns_topic.tf diff --git a/infrastructure/instance/modules/lambda/lambda.tf b/infrastructure/instance/modules/lambda/lambda.tf index 9714614c0..87e38808d 100644 --- a/infrastructure/instance/modules/lambda/lambda.tf +++ b/infrastructure/instance/modules/lambda/lambda.tf @@ -24,6 +24,10 @@ module "lambda_function_container_image" { image_config_command = ["${var.function_name}_handler.${var.function_name}_handler"] } +data "aws_sns_topic" "fhir_api_perf_alerts" { + name = "${var.environment}-fhir-api-perf-alerts" +} + resource "aws_cloudwatch_metric_alarm" "memory_alarm" { alarm_name = "${var.short_prefix}_${var.function_name} memory alarm" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -34,6 +38,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_alarm" { statistic = 
"Maximum" threshold = 256 alarm_description = "This metric monitors Lambda memory usage" + alarm_actions = [data.aws_sns_topic.fhir_api_perf_alerts.arn] insufficient_data_actions = [] } From b2bf711da84e5131b9e172befe6ad2dab0fd1ac2 Mon Sep 17 00:00:00 2001 From: ariagraham-nhs Date: Wed, 6 May 2026 11:32:20 +0100 Subject: [PATCH 3/6] Add action to DDoS alarm --- infrastructure/account/shield_protection.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/infrastructure/account/shield_protection.tf b/infrastructure/account/shield_protection.tf index 0809c97d0..a7a677083 100644 --- a/infrastructure/account/shield_protection.tf +++ b/infrastructure/account/shield_protection.tf @@ -41,6 +41,8 @@ resource "aws_cloudwatch_metric_alarm" "ddos_protection_regional" { alarm_name = "imms-${var.environment}-shield_ddos_${each.key}" alarm_description = "Alarm when Shield detects DDoS on ${each.key}" + # Publish to the perf-alerts topic managed in this same module; a direct reference gives Terraform the dependency a by-name data lookup lacks + alarm_actions = [aws_sns_topic.fhir_api_perf_alerts.arn] namespace = "AWS/DDoSProtection" metric_name = "DDoSDetected" From d8108c2f37bf8b663b1112349e623a633bb92dc0 Mon Sep 17 00:00:00 2001 From: Thomas-Boyle Date: Wed, 6 May 2026 14:36:26 +0100 Subject: [PATCH 4/6] Add CloudWatch metrics and alarms for ACK and Forwarding Lambda functions - Introduced CloudWatch log metric filters and alarms for error logging in both the ACK and Forwarding Lambda functions. - Updated existing CloudWatch dashboard metrics to include new Lambda functions and their respective error handling. - Enhanced alarm configurations to trigger notifications based on error logs detected in the Lambda functions. 
--- .../account/cloudwatch_dashboards.tf | 182 +++++++++++++----- infrastructure/instance/ack_lambda.tf | 30 +++ infrastructure/instance/forwarder_lambda.tf | 30 +++ 3 files changed, 189 insertions(+), 53 deletions(-) diff --git a/infrastructure/account/cloudwatch_dashboards.tf b/infrastructure/account/cloudwatch_dashboards.tf index 6dd1966a2..56961ce06 100644 --- a/infrastructure/account/cloudwatch_dashboards.tf +++ b/infrastructure/account/cloudwatch_dashboards.tf @@ -31,6 +31,7 @@ locals { [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-delta-lambda"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}_get_status"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-redis-sync-lambda"], + [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-mns-publisher-lambda"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-mesh-processor-lambda" if var.environment != "dev"], ]) @@ -42,7 +43,14 @@ locals { "immunisation-batch-${var.environment == "dev" ? "internal-dev" : var.environment}-audit-table", var.environment == "dev" ? "imms-internal-qa-delta" : "", var.environment == "dev" ? "imms-internal-qa-imms-events" : "", - var.environment == "dev" ? "imms-internal-qa-audit-table" : "", + var.environment == "dev" ? "immunisation-batch-internal-qa-audit-table" : "", + ]) + + mns_resource_scopes = var.environment == "dev" ? local.sub_environments_map[var.environment] : [var.environment] + mns_sqs_queues = flatten([ + [for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-outbound-events-queue"], + [for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-outbound-events-dead-letter-queue"], + var.environment == "dev" ? 
[for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-test-notification-queue"] : [], ]) sqs_queues = distinct(flatten([ @@ -52,13 +60,15 @@ locals { [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-metadata-queue.fifo"], var.environment == "dev" ? [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-id-sync-dlq"] : ["imms-${var.environment}-id-sync-dlq"], var.environment == "dev" ? [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-id-sync-queue"] : ["imms-${var.environment}-id-sync-queue"], + local.mns_sqs_queues, ])) # ECS (cluster names match instance short_prefix: imms--ecs-cluster) - ecs_clusters = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-ecs-cluster"] + ecs_clusters = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-ecs-cluster"] + ecs_task_definition_families = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-processor-task"] # Alarms - alarms = [ + alarms = concat([ "_create_imms-lambda-error", "_create_imms memory alarm", "_get_imms-lambda-error", @@ -74,17 +84,23 @@ locals { "-record-processor-task-error", "-file-name-processor-lambda-error", "-batch-processor-filter-lambda-error", + "-ack-lambda-error", + "-forwarding-lambda-error", "-id-sync-lambda-error", "-redis-sync-lambda-error", "-delta-lambda-error", + "-mns-publisher-lambda-error", "_not_found-lambda-error", "_not_found memory alarm" - ] + ], var.environment == "dev" ? 
[] : ["-mesh-processor-lambda-error"]) # Alarms are turned off in internal-qa as testing could cause unnecessary noise dev_alarms = [for alarm in local.alarms : "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-internal-dev${alarm}"] non_dev_alarms = flatten([for sub_env in local.sub_environments_map[var.environment] : [for alarm in local.alarms : "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-${sub_env}${alarm}"] if var.environment != "dev"]) - alarms_properties = var.environment == "dev" ? local.dev_alarms : local.non_dev_alarms + shared_alarms = var.environment == "dev" ? [] : [ + "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-${var.environment}-mesh-processor-no-lambda-invocation" + ] + alarms_properties = concat(var.environment == "dev" ? local.dev_alarms : local.non_dev_alarms, local.shared_alarms) } @@ -224,8 +240,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.api_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.api_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"apiinvocations\"))", label : "API Invocations", id : "e1", region : var.aws_region }], + [{ expression : "SUM(METRICS(\"apierrors\"))", label : "API Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "apiinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "apierrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -243,8 +263,8 @@ 
resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"apiduration\"))", label : "API Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "apiduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -261,9 +281,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.api_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"apiconcurrency\"))", label : "API ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "apiconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -308,8 +329,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.batch_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"batchinvocations\"))", label : "Batch Invocations", id : "e1", region : var.aws_region }], + [{ expression : 
"SUM(METRICS(\"batcherrors\"))", label : "Batch Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "batchinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "batcherrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -327,8 +352,8 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"batchduration\"))", label : "Batch Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "batchduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -345,9 +370,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.batch_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"batchconcurrency\"))", label : "Batch ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "batchconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : 
false, "region" : var.aws_region, @@ -364,7 +390,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 3, "properties" : { "metrics" : concat( - [[{ expression : "SUM(METRICS())", label : "API Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], + [[{ expression : "SUM(METRICS())", label : "Batch Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region, id : "m${i + 1}", visible : false }]] ), "sparkline" : true, @@ -392,8 +418,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.ancillary_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"ancillaryinvocations\"))", label : "Ancillary Invocations", id : "e1", region : var.aws_region }], + [{ expression : "SUM(METRICS(\"ancillaryerrors\"))", label : "Ancillary Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "ancillaryinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "ancillaryerrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -411,8 +441,8 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : 
"e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ancillaryduration\"))", label : "Ancillary Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "ancillaryduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -429,9 +459,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.ancillary_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ancillaryconcurrency\"))", label : "Ancillary ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "ancillaryconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -448,7 +479,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 3, "properties" : { "metrics" : concat( - [[{ expression : "SUM(METRICS())", label : "API Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], + [[{ expression : "SUM(METRICS())", label : "Ancillary Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region, id : "m${i + 1}", visible : false }]] ), "sparkline" : true, @@ -476,8 +507,9 @@ resource "aws_cloudwatch_dashboard" 
"imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { region : var.aws_region }]] + [[{ expression : "SUM(METRICS(\"ddbreadcount\"))", label : "Successful Read Requests", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { id : "ddbreadcountget${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { id : "ddbreadcountquery${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -500,8 +532,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ddbreadlatency\"))", label : "Average Read Latency", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { id : "ddbreadlatencyget${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { id : "ddbreadlatencyquery${i}", visible : false, region : var.aws_region }]] ), "view" : 
"timeSeries", "stacked" : false, @@ -518,9 +551,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", table] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ddbreadcapacity\"))", label : "Consumed Read Capacity Units", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", table, { id : "ddbreadcapacity${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -560,8 +594,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { region : var.aws_region }]] + [[{ expression : "SUM(METRICS(\"ddbwritecount\"))", label : "Successful Write Requests", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { id : "ddbwritecountput${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { id : "ddbwritecountupdate${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -584,8 +619,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, 
"Operation", "PutItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ddbwritelatency\"))", label : "Average Write Latency", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { id : "ddbwritelatencyput${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { id : "ddbwritelatencyupdate${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -602,9 +638,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedWriteCapacityUnits", "TableName", table] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ddbwritecapacity\"))", label : "Consumed Write Capacity Units", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedWriteCapacityUnits", "TableName", table, { id : "ddbwritecapacity${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -631,14 +668,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "TaskCount", "ClusterName", cluster, { region : var.aws_region }] + for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "TaskCount", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, 
"region" : var.aws_region, - "stat" : "SampleCount", + "stat" : "Average", "period" : 300, - "title" : "ECS - Task Count" + "title" : "ECS Batch Processor - Task Count" } }, { @@ -649,14 +686,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", cluster, { region : var.aws_region }] + for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, "stat" : "Maximum", "period" : 300, - "title" : "ECS - CPU Utilization" + "title" : "ECS Batch Processor - CPU Utilization" } }, { @@ -667,14 +704,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", cluster, { region : var.aws_region }] + for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, "stat" : "Maximum", "period" : 300, - "title" : "ECS - Memory Utilization" + "title" : "ECS Batch Processor - Memory Utilization" } }, { @@ -724,9 +761,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for queue in local.sqs_queues : ["AWS/SQS", "NumberOfMessagesSent", "QueueName", queue, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"sqssent\"))", label : "Messages Sent", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "NumberOfMessagesSent", 
"QueueName", queue, { id : "sqssent${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -735,12 +773,50 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "stat" : "Sum" } }, + { + "type" : "metric", + "x" : 18, + "y" : 51, + "width" : 6, + "height" : 6, + "properties" : { + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"sqsvisible\"))", label : "Visible Messages", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", queue, { id : "sqsvisible${i}", visible : false, region : var.aws_region }]] + ), + "view" : "timeSeries", + "stacked" : false, + "region" : var.aws_region, + "title" : "SQS Queues - Visible Messages", + "period" : 300, + "stat" : "Maximum" + } + }, { "type" : "metric", "x" : 0, "y" : 57, "width" : 6, "height" : 6, + "properties" : { + "metrics" : concat( + [[{ expression : "MAX(METRICS(\"sqsoldest\"))", label : "Oldest Message Age", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "ApproximateAgeOfOldestMessage", "QueueName", queue, { id : "sqsoldest${i}", visible : false, region : var.aws_region }]] + ), + "view" : "timeSeries", + "stacked" : false, + "region" : var.aws_region, + "title" : "SQS Queues - Oldest Message Age", + "period" : 300, + "stat" : "Maximum" + } + }, + { + "type" : "metric", + "x" : 6, + "y" : 57, + "width" : 6, + "height" : 6, "properties" : { "view" : "timeSeries", "stacked" : false, @@ -754,7 +830,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { }, { "type" : "metric", - "x" : 6, + "x" : 12, "y" : 57, "width" : 6, "height" : 6, diff --git a/infrastructure/instance/ack_lambda.tf b/infrastructure/instance/ack_lambda.tf index 7bf65dd08..ec04135ea 100644 --- a/infrastructure/instance/ack_lambda.tf +++ b/infrastructure/instance/ack_lambda.tf @@ -154,3 +154,33 @@ resource 
"aws_lambda_event_source_mapping" "sqs_to_lambda" { batch_size = 1 # VED-734 - forwarder lambda already sends a list of up to 100 messages in the body enabled = true } + +resource "aws_cloudwatch_log_metric_filter" "ack_lambda_error_logs" { + count = var.error_alarm_notifications_enabled ? 1 : 0 + + name = "${local.short_prefix}-AckLambdaErrorLogsFilter" + pattern = "%\\[ERROR\\]%" + log_group_name = aws_cloudwatch_log_group.ack_lambda_log_group.name + + metric_transformation { + name = "${local.short_prefix}-AckLambdaErrorLogs" + namespace = "${local.short_prefix}-AckLambda" + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "ack_lambda_error_alarm" { + count = var.error_alarm_notifications_enabled ? 1 : 0 + + alarm_name = "${local.ack_lambda_name}-error" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + metric_name = "${local.short_prefix}-AckLambdaErrorLogs" + namespace = "${local.short_prefix}-AckLambda" + period = 120 + statistic = "Sum" + threshold = 1 + alarm_description = "This sets off an alarm for any error logs found in the ack Lambda function" + alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn] + treat_missing_data = "notBreaching" +} diff --git a/infrastructure/instance/forwarder_lambda.tf b/infrastructure/instance/forwarder_lambda.tf index 1a8ee9ca0..673bf5319 100644 --- a/infrastructure/instance/forwarder_lambda.tf +++ b/infrastructure/instance/forwarder_lambda.tf @@ -178,3 +178,33 @@ resource "aws_cloudwatch_log_group" "forwarding_lambda_log_group" { name = "/aws/lambda/${local.forwarder_lambda_name}" retention_in_days = 30 } + +resource "aws_cloudwatch_log_metric_filter" "forwarding_lambda_error_logs" { + count = var.error_alarm_notifications_enabled ? 
1 : 0 + + name = "${local.short_prefix}-ForwardingLambdaErrorLogsFilter" + pattern = "%\\[ERROR\\]%" + log_group_name = aws_cloudwatch_log_group.forwarding_lambda_log_group.name + + metric_transformation { + name = "${local.short_prefix}-ForwardingLambdaErrorLogs" + namespace = "${local.short_prefix}-ForwardingLambda" + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "forwarding_lambda_error_alarm" { + count = var.error_alarm_notifications_enabled ? 1 : 0 + + alarm_name = "${local.forwarder_lambda_name}-error" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + metric_name = "${local.short_prefix}-ForwardingLambdaErrorLogs" + namespace = "${local.short_prefix}-ForwardingLambda" + period = 120 + statistic = "Sum" + threshold = 1 + alarm_description = "This sets off an alarm for any error logs found in the forwarding Lambda function" + alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn] + treat_missing_data = "notBreaching" +} From 16b70c69b92f999b2053f563ac4bae39af12260e Mon Sep 17 00:00:00 2001 From: Thomas-Boyle Date: Wed, 6 May 2026 16:02:30 +0100 Subject: [PATCH 5/6] Refactor alarm properties logic in CloudWatch dashboard configuration --- infrastructure/account/cloudwatch_dashboards.tf | 2 +- infrastructure/account/kms.tf | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/infrastructure/account/cloudwatch_dashboards.tf b/infrastructure/account/cloudwatch_dashboards.tf index 56961ce06..8fce63138 100644 --- a/infrastructure/account/cloudwatch_dashboards.tf +++ b/infrastructure/account/cloudwatch_dashboards.tf @@ -100,7 +100,7 @@ locals { shared_alarms = var.environment == "dev" ? [] : [ "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-${var.environment}-mesh-processor-no-lambda-invocation" ] - alarms_properties = concat(var.environment == "dev" ? local.dev_alarms : local.non_dev_alarms, local.shared_alarms) + alarms_properties = var.environment == "dev" ? 
local.dev_alarms : concat(local.non_dev_alarms, local.shared_alarms) } diff --git a/infrastructure/account/kms.tf b/infrastructure/account/kms.tf index 563c7bdc7..743cbb2ba 100644 --- a/infrastructure/account/kms.tf +++ b/infrastructure/account/kms.tf @@ -230,4 +230,3 @@ resource "aws_kms_alias" "fhir_api_perf_alerts_sns_encryption_key" { name = "alias/${var.environment}-fhir-api-perf-alerts-imms-sns-encryption" target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id } - From 8046c2a31deacdf43fd4fecb6b37f6a4deb298cf Mon Sep 17 00:00:00 2001 From: Thomas-Boyle Date: Fri, 8 May 2026 11:46:49 +0100 Subject: [PATCH 6/6] Refactor CloudWatch dashboard metrics to use structured ECS task definitions - Updated ECS metrics in the CloudWatch dashboard to utilize a structured format for batch processor tasks, enhancing clarity and maintainability. - Modified alarm configurations for ACK and Forwarding Lambda functions to reference dynamic metric names and namespaces from log metric filters. 
--- infrastructure/account/cloudwatch_dashboards.tf | 12 +++++++----- infrastructure/instance/ack_lambda.tf | 4 ++-- infrastructure/instance/forwarder_lambda.tf | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/infrastructure/account/cloudwatch_dashboards.tf b/infrastructure/account/cloudwatch_dashboards.tf index 8fce63138..016312ffc 100644 --- a/infrastructure/account/cloudwatch_dashboards.tf +++ b/infrastructure/account/cloudwatch_dashboards.tf @@ -64,8 +64,10 @@ locals { ])) # ECS (cluster names match instance short_prefix: imms-<sub_env>-ecs-cluster) - ecs_clusters = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-ecs-cluster"] - ecs_task_definition_families = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-processor-task"] + ecs_batch_processor_tasks = [for sub_env in local.sub_environments_map[var.environment] : { + cluster = "imms-${sub_env}-ecs-cluster" + task_definition_family = "imms-${sub_env}-processor-task" + }] # Alarms alarms = concat([ @@ -668,7 +670,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "TaskCount", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks : ["ECS/ContainerInsights", "TaskCount", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, @@ -686,7 +688,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks :
["ECS/ContainerInsights", "CpuUtilized", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, @@ -704,7 +706,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for i, cluster in local.ecs_clusters : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", cluster, "TaskDefinitionFamily", local.ecs_task_definition_families[i], { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, diff --git a/infrastructure/instance/ack_lambda.tf b/infrastructure/instance/ack_lambda.tf index ec04135ea..29881f984 100644 --- a/infrastructure/instance/ack_lambda.tf +++ b/infrastructure/instance/ack_lambda.tf @@ -175,8 +175,8 @@ resource "aws_cloudwatch_metric_alarm" "ack_lambda_error_alarm" { alarm_name = "${local.ack_lambda_name}-error" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 1 - metric_name = "${local.short_prefix}-AckLambdaErrorLogs" - namespace = "${local.short_prefix}-AckLambda" + metric_name = aws_cloudwatch_log_metric_filter.ack_lambda_error_logs[count.index].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.ack_lambda_error_logs[count.index].metric_transformation[0].namespace period = 120 statistic = "Sum" threshold = 1 diff --git a/infrastructure/instance/forwarder_lambda.tf b/infrastructure/instance/forwarder_lambda.tf index 673bf5319..27f5dfc6b 100644 --- a/infrastructure/instance/forwarder_lambda.tf +++ b/infrastructure/instance/forwarder_lambda.tf @@ -199,8 +199,8 @@ resource "aws_cloudwatch_metric_alarm" "forwarding_lambda_error_alarm" { alarm_name = "${local.forwarder_lambda_name}-error" comparison_operator = 
"GreaterThanOrEqualToThreshold" evaluation_periods = 1 - metric_name = "${local.short_prefix}-ForwardingLambdaErrorLogs" - namespace = "${local.short_prefix}-ForwardingLambda" + metric_name = aws_cloudwatch_log_metric_filter.forwarding_lambda_error_logs[count.index].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.forwarding_lambda_error_logs[count.index].metric_transformation[0].namespace period = 120 statistic = "Sum" threshold = 1