diff --git a/infrastructure/account/cloudwatch_dashboards.tf b/infrastructure/account/cloudwatch_dashboards.tf index 6dd1966a2..016312ffc 100644 --- a/infrastructure/account/cloudwatch_dashboards.tf +++ b/infrastructure/account/cloudwatch_dashboards.tf @@ -31,6 +31,7 @@ locals { [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-delta-lambda"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}_get_status"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-redis-sync-lambda"], + [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-mns-publisher-lambda"], [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-mesh-processor-lambda" if var.environment != "dev"], ]) @@ -42,7 +43,14 @@ locals { "immunisation-batch-${var.environment == "dev" ? "internal-dev" : var.environment}-audit-table", var.environment == "dev" ? "imms-internal-qa-delta" : "", var.environment == "dev" ? "imms-internal-qa-imms-events" : "", - var.environment == "dev" ? "imms-internal-qa-audit-table" : "", + var.environment == "dev" ? "immunisation-batch-internal-qa-audit-table" : "", + ]) + + mns_resource_scopes = var.environment == "dev" ? local.sub_environments_map[var.environment] : [var.environment] + mns_sqs_queues = flatten([ + [for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-outbound-events-queue"], + [for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-outbound-events-dead-letter-queue"], + var.environment == "dev" ? [for resource_scope in local.mns_resource_scopes : "${resource_scope}-mns-test-notification-queue"] : [], ]) sqs_queues = distinct(flatten([ @@ -52,13 +60,17 @@ locals { [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-metadata-queue.fifo"], var.environment == "dev" ? 
[for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-id-sync-dlq"] : ["imms-${var.environment}-id-sync-dlq"], var.environment == "dev" ? [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-id-sync-queue"] : ["imms-${var.environment}-id-sync-queue"], + local.mns_sqs_queues, ])) # ECS (cluster names match instance short_prefix: imms--ecs-cluster) - ecs_clusters = [for sub_env in local.sub_environments_map[var.environment] : "imms-${sub_env}-ecs-cluster"] + ecs_batch_processor_tasks = [for sub_env in local.sub_environments_map[var.environment] : { + cluster = "imms-${sub_env}-ecs-cluster" + task_definition_family = "imms-${sub_env}-processor-task" + }] # Alarms - alarms = [ + alarms = concat([ "_create_imms-lambda-error", "_create_imms memory alarm", "_get_imms-lambda-error", @@ -74,17 +86,23 @@ locals { "-record-processor-task-error", "-file-name-processor-lambda-error", "-batch-processor-filter-lambda-error", + "-ack-lambda-error", + "-forwarding-lambda-error", "-id-sync-lambda-error", "-redis-sync-lambda-error", "-delta-lambda-error", + "-mns-publisher-lambda-error", "_not_found-lambda-error", "_not_found memory alarm" - ] + ], var.environment == "dev" ? [] : ["-mesh-processor-lambda-error"]) # Alarms are turned off in internal-qa as testing could cause unnecessary noise dev_alarms = [for alarm in local.alarms : "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-internal-dev${alarm}"] non_dev_alarms = flatten([for sub_env in local.sub_environments_map[var.environment] : [for alarm in local.alarms : "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-${sub_env}${alarm}"] if var.environment != "dev"]) - alarms_properties = var.environment == "dev" ? local.dev_alarms : local.non_dev_alarms + shared_alarms = var.environment == "dev" ? 
[] : [ + "arn:aws:cloudwatch:${var.aws_region}:${var.imms_account_id}:alarm:imms-${var.environment}-mesh-processor-no-lambda-invocation" + ] + alarms_properties = var.environment == "dev" ? local.dev_alarms : concat(local.non_dev_alarms, local.shared_alarms) } @@ -224,8 +242,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.api_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.api_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"apiinvocations\"))", label : "API Invocations", id : "e1", region : var.aws_region }], + [{ expression : "SUM(METRICS(\"apierrors\"))", label : "API Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "apiinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "apierrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -243,8 +265,8 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"apiduration\"))", label : "API Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat 
: "Maximum", id : "apiduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -261,9 +283,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.api_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"apiconcurrency\"))", label : "API ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.api_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "apiconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -308,8 +331,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.batch_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"batchinvocations\"))", label : "Batch Invocations", id : "e1", region : var.aws_region }], + [{ expression : "SUM(METRICS(\"batcherrors\"))", label : "Batch Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "batchinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "batcherrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -327,8 +354,8 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { 
"height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"batchduration\"))", label : "Batch Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "batchduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -345,9 +372,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.batch_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"batchconcurrency\"))", label : "Batch ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "batchconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -364,7 +392,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 3, "properties" : { "metrics" : concat( - [[{ expression : "SUM(METRICS())", label : "API Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], + [[{ expression : "SUM(METRICS())", label : "Batch Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], [for i, lambda in local.batch_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region, id : "m${i + 1}", visible : false }]] ), 
"sparkline" : true, @@ -392,8 +420,12 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for lambda in local.ancillary_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { region : var.aws_region }]], - [for lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region }]] + [ + [{ expression : "SUM(METRICS(\"ancillaryinvocations\"))", label : "Ancillary Invocations", id : "e1", region : var.aws_region }], + [{ expression : "SUM(METRICS(\"ancillaryerrors\"))", label : "Ancillary Errors", id : "e2", color : local.errors_colour_code, region : var.aws_region }] + ], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Invocations", "FunctionName", lambda, { id : "ancillaryinvocations${i}", visible : false, region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { id : "ancillaryerrors${i}", visible : false, color : local.errors_colour_code, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -411,8 +443,8 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [[{ expression : "AVG(METRICS())", label : "Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], - [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "m${i + 1}", region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ancillaryduration\"))", label : "Ancillary Average Duration", id : "e1", stat : "Maximum", region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Duration", "FunctionName", lambda, { stat : "Maximum", id : "ancillaryduration${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -429,9 +461,10 @@ 
resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for lambda in local.ancillary_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ancillaryconcurrency\"))", label : "Ancillary ConcurrentExecutions", id : "e1", region : var.aws_region }]], + [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", lambda, { id : "ancillaryconcurrency${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -448,7 +481,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 3, "properties" : { "metrics" : concat( - [[{ expression : "SUM(METRICS())", label : "API Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], + [[{ expression : "SUM(METRICS())", label : "Ancillary Errors", id : "e1", region : var.aws_region, color : local.errors_colour_code }]], [for i, lambda in local.ancillary_lambdas : ["AWS/Lambda", "Errors", "FunctionName", lambda, { color : local.errors_colour_code, region : var.aws_region, id : "m${i + 1}", visible : false }]] ), "sparkline" : true, @@ -476,8 +509,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { region : var.aws_region }]] + [[{ expression : "SUM(METRICS(\"ddbreadcount\"))", label : "Successful Read Requests", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, 
"Operation", "GetItem", { id : "ddbreadcountget${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { id : "ddbreadcountquery${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -500,8 +534,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ddbreadlatency\"))", label : "Average Read Latency", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "GetItem", { id : "ddbreadlatencyget${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "Query", { id : "ddbreadlatencyquery${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -518,9 +553,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", table] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ddbreadcapacity\"))", label : "Consumed Read Capacity Units", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", table, { id : "ddbreadcapacity${i}", visible : false, region : var.aws_region }]] + ), 
"view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -560,8 +596,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { region : var.aws_region }]] + [[{ expression : "SUM(METRICS(\"ddbwritecount\"))", label : "Successful Write Requests", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { id : "ddbwritecountput${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { id : "ddbwritecountupdate${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -584,8 +621,9 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : concat( - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { region : var.aws_region }]], - [for table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { region : var.aws_region }]] + [[{ expression : "AVG(METRICS(\"ddbwritelatency\"))", label : "Average Write Latency", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "SuccessfulRequestLatency", "TableName", table, "Operation", "PutItem", { id : "ddbwritelatencyput${i}", visible : false, region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", 
"SuccessfulRequestLatency", "TableName", table, "Operation", "UpdateItem", { id : "ddbwritelatencyupdate${i}", visible : false, region : var.aws_region }]] ), "view" : "timeSeries", "stacked" : false, @@ -602,9 +640,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedWriteCapacityUnits", "TableName", table] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"ddbwritecapacity\"))", label : "Consumed Write Capacity Units", id : "e1", region : var.aws_region }]], + [for i, table in local.dynamodb_tables : ["AWS/DynamoDB", "ConsumedWriteCapacityUnits", "TableName", table, { id : "ddbwritecapacity${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -631,14 +670,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "TaskCount", "ClusterName", cluster, { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks : ["ECS/ContainerInsights", "TaskCount", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, - "stat" : "SampleCount", + "stat" : "Average", "period" : 300, - "title" : "ECS - Task Count" + "title" : "ECS Batch Processor - Task Count" } }, { @@ -649,14 +688,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", cluster, { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks : ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : 
var.aws_region }] ], "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, "stat" : "Maximum", "period" : 300, - "title" : "ECS - CPU Utilization" + "title" : "ECS Batch Processor - CPU Utilization" } }, { @@ -667,14 +706,14 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "height" : 6, "properties" : { "metrics" : [ - for cluster in local.ecs_clusters : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", cluster, { region : var.aws_region }] + for task in local.ecs_batch_processor_tasks : ["ECS/ContainerInsights", "MemoryUtilized", "ClusterName", task.cluster, "TaskDefinitionFamily", task.task_definition_family, { region : var.aws_region }] ], "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, "stat" : "Maximum", "period" : 300, - "title" : "ECS - Memory Utilization" + "title" : "ECS Batch Processor - Memory Utilization" } }, { @@ -724,9 +763,10 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "width" : 6, "height" : 6, "properties" : { - "metrics" : [ - for queue in local.sqs_queues : ["AWS/SQS", "NumberOfMessagesSent", "QueueName", queue, { region : var.aws_region }] - ], + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"sqssent\"))", label : "Messages Sent", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "NumberOfMessagesSent", "QueueName", queue, { id : "sqssent${i}", visible : false, region : var.aws_region }]] + ), "view" : "timeSeries", "stacked" : false, "region" : var.aws_region, @@ -735,12 +775,50 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { "stat" : "Sum" } }, + { + "type" : "metric", + "x" : 18, + "y" : 51, + "width" : 6, + "height" : 6, + "properties" : { + "metrics" : concat( + [[{ expression : "SUM(METRICS(\"sqsvisible\"))", label : "Visible Messages", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", queue, { id : 
"sqsvisible${i}", visible : false, region : var.aws_region }]] + ), + "view" : "timeSeries", + "stacked" : false, + "region" : var.aws_region, + "title" : "SQS Queues - Visible Messages", + "period" : 300, + "stat" : "Maximum" + } + }, { "type" : "metric", "x" : 0, "y" : 57, "width" : 6, "height" : 6, + "properties" : { + "metrics" : concat( + [[{ expression : "MAX(METRICS(\"sqsoldest\"))", label : "Oldest Message Age", id : "e1", region : var.aws_region }]], + [for i, queue in local.sqs_queues : ["AWS/SQS", "ApproximateAgeOfOldestMessage", "QueueName", queue, { id : "sqsoldest${i}", visible : false, region : var.aws_region }]] + ), + "view" : "timeSeries", + "stacked" : false, + "region" : var.aws_region, + "title" : "SQS Queues - Oldest Message Age", + "period" : 300, + "stat" : "Maximum" + } + }, + { + "type" : "metric", + "x" : 6, + "y" : 57, + "width" : 6, + "height" : 6, "properties" : { "view" : "timeSeries", "stacked" : false, @@ -754,7 +832,7 @@ resource "aws_cloudwatch_dashboard" "imms-metrics-dashboard" { }, { "type" : "metric", - "x" : 6, + "x" : 12, "y" : 57, "width" : 6, "height" : 6, diff --git a/infrastructure/account/fhir_api_perf_alerts_slack_chatbot.tf b/infrastructure/account/fhir_api_perf_alerts_slack_chatbot.tf new file mode 100644 index 000000000..88fe488df --- /dev/null +++ b/infrastructure/account/fhir_api_perf_alerts_slack_chatbot.tf @@ -0,0 +1,24 @@ +resource "aws_chatbot_slack_channel_configuration" "fhir_api_perf_alerts" { + configuration_name = "${var.environment}-fhir-api-perf-alerts-slack-channel-config" + iam_role_arn = aws_iam_role.fhir_api_perf_alerts_chatbot.arn + slack_channel_id = var.environment == "prod" ? 
"C0B11MJPQ6A" : "C0B1GKZ5S4R" + slack_team_id = "TJ00QR03U" + sns_topic_arns = [aws_sns_topic.fhir_api_perf_alerts.arn] +} + +resource "aws_iam_role" "fhir_api_perf_alerts_chatbot" { + name = "${var.environment}-fhir-api-perf-alerts-chatbot-channel-role" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Sid = "AssumeChatbotRole" + Principal = { + Service = "chatbot.amazonaws.com" + } + }, + ] + }) +} diff --git a/infrastructure/account/fhir_api_perf_alerts_sns_topic.tf b/infrastructure/account/fhir_api_perf_alerts_sns_topic.tf new file mode 100644 index 000000000..00fcda457 --- /dev/null +++ b/infrastructure/account/fhir_api_perf_alerts_sns_topic.tf @@ -0,0 +1,22 @@ +resource "aws_sns_topic" "fhir_api_perf_alerts" { + name = "${var.environment}-fhir-api-perf-alerts" + kms_master_key_id = aws_kms_key.error_alerts_sns_encryption_key.arn +} + +resource "aws_sns_topic_policy" "fhir_api_perf_alerts_topic_policy" { + arn = aws_sns_topic.fhir_api_perf_alerts.arn + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Sid = "AllowCloudWatchToPublish", + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "SNS:Publish", + Resource = aws_sns_topic.fhir_api_perf_alerts.arn + } + ] + }) +} diff --git a/infrastructure/account/kms.tf b/infrastructure/account/kms.tf index 21e5e2a78..743cbb2ba 100644 --- a/infrastructure/account/kms.tf +++ b/infrastructure/account/kms.tf @@ -225,3 +225,8 @@ resource "aws_kms_alias" "fhir_api_errors_sns_encryption_key" { name = "alias/${var.environment}-fhir-api-errors-imms-sns-encryption" target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id } + +resource "aws_kms_alias" "fhir_api_perf_alerts_sns_encryption_key" { + name = "alias/${var.environment}-fhir-api-perf-alerts-imms-sns-encryption" + target_key_id = aws_kms_key.error_alerts_sns_encryption_key.key_id +} diff --git 
a/infrastructure/account/shield_protection.tf b/infrastructure/account/shield_protection.tf index 0809c97d0..a7a677083 100644 --- a/infrastructure/account/shield_protection.tf +++ b/infrastructure/account/shield_protection.tf @@ -41,6 +41,9 @@ resource "aws_cloudwatch_metric_alarm" "ddos_protection_regional" { alarm_name = "imms-${var.environment}-shield_ddos_${each.key}" alarm_description = "Alarm when Shield detects DDoS on ${each.key}" + # Publish to the perf-alerts topic that this same root module manages (fhir_api_perf_alerts_sns_topic.tf). + # Reference the resource, not a data lookup by name: a data source is read at plan time and fails before the topic has been created. + alarm_actions = [aws_sns_topic.fhir_api_perf_alerts.arn] namespace = "AWS/DDoSProtection" metric_name = "DDoSDetected" diff --git a/infrastructure/instance/ack_lambda.tf b/infrastructure/instance/ack_lambda.tf index 7bf65dd08..29881f984 100644 --- a/infrastructure/instance/ack_lambda.tf +++ b/infrastructure/instance/ack_lambda.tf @@ -154,3 +154,33 @@ resource "aws_lambda_event_source_mapping" "sqs_to_lambda" { batch_size = 1 # VED-734 - forwarder lambda already sends a list of up to 100 messages in the body enabled = true } + +resource "aws_cloudwatch_log_metric_filter" "ack_lambda_error_logs" { + count = var.error_alarm_notifications_enabled ? 1 : 0 + + name = "${local.short_prefix}-AckLambdaErrorLogsFilter" + pattern = "%\\[ERROR\\]%" + log_group_name = aws_cloudwatch_log_group.ack_lambda_log_group.name + + metric_transformation { + name = "${local.short_prefix}-AckLambdaErrorLogs" + namespace = "${local.short_prefix}-AckLambda" + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "ack_lambda_error_alarm" { + count = var.error_alarm_notifications_enabled ? 
1 : 0 + + alarm_name = "${local.ack_lambda_name}-error" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + metric_name = aws_cloudwatch_log_metric_filter.ack_lambda_error_logs[count.index].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.ack_lambda_error_logs[count.index].metric_transformation[0].namespace + period = 120 + statistic = "Sum" + threshold = 1 + alarm_description = "This sets off an alarm for any error logs found in the ack Lambda function" + alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn] + treat_missing_data = "notBreaching" +} diff --git a/infrastructure/instance/forwarder_lambda.tf b/infrastructure/instance/forwarder_lambda.tf index 1a8ee9ca0..27f5dfc6b 100644 --- a/infrastructure/instance/forwarder_lambda.tf +++ b/infrastructure/instance/forwarder_lambda.tf @@ -178,3 +178,33 @@ resource "aws_cloudwatch_log_group" "forwarding_lambda_log_group" { name = "/aws/lambda/${local.forwarder_lambda_name}" retention_in_days = 30 } + +resource "aws_cloudwatch_log_metric_filter" "forwarding_lambda_error_logs" { + count = var.error_alarm_notifications_enabled ? 1 : 0 + + name = "${local.short_prefix}-ForwardingLambdaErrorLogsFilter" + pattern = "%\\[ERROR\\]%" + log_group_name = aws_cloudwatch_log_group.forwarding_lambda_log_group.name + + metric_transformation { + name = "${local.short_prefix}-ForwardingLambdaErrorLogs" + namespace = "${local.short_prefix}-ForwardingLambda" + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "forwarding_lambda_error_alarm" { + count = var.error_alarm_notifications_enabled ? 
1 : 0 + + alarm_name = "${local.forwarder_lambda_name}-error" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + metric_name = aws_cloudwatch_log_metric_filter.forwarding_lambda_error_logs[count.index].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.forwarding_lambda_error_logs[count.index].metric_transformation[0].namespace + period = 120 + statistic = "Sum" + threshold = 1 + alarm_description = "This sets off an alarm for any error logs found in the forwarding Lambda function" + alarm_actions = [data.aws_sns_topic.imms_system_alert_errors.arn] + treat_missing_data = "notBreaching" +} diff --git a/infrastructure/instance/modules/lambda/lambda.tf b/infrastructure/instance/modules/lambda/lambda.tf index 9714614c0..87e38808d 100644 --- a/infrastructure/instance/modules/lambda/lambda.tf +++ b/infrastructure/instance/modules/lambda/lambda.tf @@ -24,6 +24,10 @@ module "lambda_function_container_image" { image_config_command = ["${var.function_name}_handler.${var.function_name}_handler"] } +data "aws_sns_topic" "fhir_api_perf_alerts" { + name = "${var.environment}-fhir-api-perf-alerts" +} + resource "aws_cloudwatch_metric_alarm" "memory_alarm" { alarm_name = "${var.short_prefix}_${var.function_name} memory alarm" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -34,6 +38,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_alarm" { statistic = "Maximum" threshold = 256 alarm_description = "This metric monitors Lambda memory usage" + alarm_actions = [data.aws_sns_topic.fhir_api_perf_alerts.arn] insufficient_data_actions = [] }