diff --git a/infrastructure/bootstrap/hub.bicep b/infrastructure/bootstrap/hub.bicep index b17eaa81..6c18d65d 100644 --- a/infrastructure/bootstrap/hub.bicep +++ b/infrastructure/bootstrap/hub.bicep @@ -18,6 +18,7 @@ targetScope = 'subscription' +// param devopsInfrastructureId string param devopsSubnetAddressPrefix string param privateEndpointSubnetAddressPrefix string param hubType string // live / nonlive diff --git a/infrastructure/environments/dev/variables.tfvars b/infrastructure/environments/dev/variables.tfvars index 65187411..eccf7295 100644 --- a/infrastructure/environments/dev/variables.tfvars +++ b/infrastructure/environments/dev/variables.tfvars @@ -15,3 +15,6 @@ postgres_geo_redundant_backup_enabled = false protect_keyvault = false vnet_address_space = "10.12.0.0/16" seed_demo_data = true +enable_alerting = true +min_replicas = 1 +container_memory = "1" diff --git a/infrastructure/environments/preprod/variables.tfvars b/infrastructure/environments/preprod/variables.tfvars index c9becadf..ee760594 100644 --- a/infrastructure/environments/preprod/variables.tfvars +++ b/infrastructure/environments/preprod/variables.tfvars @@ -15,3 +15,6 @@ postgres_geo_redundant_backup_enabled = false protect_keyvault = true vnet_address_space = "10.14.0.0/16" seed_demo_data = true +enable_alerting = true +min_replicas = 2 +container_memory = "1" diff --git a/infrastructure/environments/prod/variables.tfvars b/infrastructure/environments/prod/variables.tfvars index dae1de65..36ded45b 100644 --- a/infrastructure/environments/prod/variables.tfvars +++ b/infrastructure/environments/prod/variables.tfvars @@ -15,3 +15,6 @@ protect_keyvault = true vnet_address_space = "10.15.0.0/16" use_apex_domain = true cae_zone_redundancy_enabled = true +enable_alerting = true +min_replicas = 2 +container_memory = "1" diff --git a/infrastructure/modules/container-apps/alerts.tf b/infrastructure/modules/container-apps/alerts.tf new file mode 100644 index 00000000..a9ea412b --- /dev/null +++ b/infrastructure/modules/container-apps/alerts.tf @@ -0,0 +1,38 @@ +resource "azurerm_monitor_scheduled_query_rules_alert_v2" "failure_event" { + count = var.enable_alerting ? 1 : 0 + + auto_mitigation_enabled = false + description = "An alert triggered by a custom event batch_marked_as_failed logged in code" + enabled = var.enable_alerting + evaluation_frequency = "PT5M" + location = var.region + name = "${var.app_short_name}-batch-failed-alert" + resource_group_name = azurerm_resource_group.main.name + scopes = [var.app_insights_id] + severity = 2 + skip_query_validation = false + target_resource_types = ["microsoft.insights/components"] + window_duration = "PT5M" + workspace_alerts_storage_enabled = false + + action { + action_groups = [var.action_group_id] + } + + criteria { + operator = "GreaterThan" + query = <<-QUERY + customEvents + | where name == "batch_marked_as_failed" + | project timestamp, name + | project-rename TimeGenerated=timestamp + QUERY + threshold = 0 + time_aggregation_method = "Count" + + failing_periods { + minimum_failing_periods_to_trigger_alert = 1 + number_of_evaluation_periods = 1 + } + } +} diff --git a/infrastructure/modules/container-apps/main.tf b/infrastructure/modules/container-apps/main.tf index 0b481ed5..11629b7d 100644 --- a/infrastructure/modules/container-apps/main.tf +++ b/infrastructure/modules/container-apps/main.tf @@ -13,6 +13,13 @@ module "webapp" { name = "${var.app_short_name}-web-${var.environment}" container_app_environment_id = var.container_app_environment_id + + # alerts + action_group_id = var.action_group_id + enable_alerting = var.enable_alerting + alert_memory_threshold = 80 + alert_cpu_threshold = 90 + resource_group_name = azurerm_resource_group.main.name fetch_secrets_from_app_key_vault = var.fetch_secrets_from_app_key_vault infra_key_vault_name = "kv-${var.app_short_name}-${var.env_config}-inf" @@ -31,4 +38,7 @@ module "webapp" { secret_variables = var.deploy_database_as_container ? { DATABASE_PASSWORD = resource.random_password.admin_password[0].result } : {} is_web_app = true port = 8000 + probe_path = "/healthcheck" + # min_replicas = var.min_replicas + # memory = var.container_memory } diff --git a/infrastructure/modules/container-apps/postgres.tf b/infrastructure/modules/container-apps/postgres.tf index 7939db25..20125b12 100644 --- a/infrastructure/modules/container-apps/postgres.tf +++ b/infrastructure/modules/container-apps/postgres.tf @@ -48,6 +48,13 @@ module "postgres" { public_network_access_enabled = !var.features.private_networking + # alerts + action_group_id = var.action_group_id + enable_alerting = var.enable_alerting + alert_memory_threshold = 80 + alert_cpu_threshold = 90 + alert_storage_threshold = 80 + databases = { db1 = { collation = "en_US.utf8" @@ -94,6 +101,13 @@ module "database_container" { POSTGRES_USER = local.database_user POSTGRES_DB = local.database_name } + + # alerts + action_group_id = var.action_group_id + enable_alerting = var.enable_alerting + alert_memory_threshold = 80 + alert_cpu_threshold = 90 + resource_group_name = azurerm_resource_group.main.name is_tcp_app = true # postgres has a port of 5432 diff --git a/infrastructure/modules/container-apps/variables.tf b/infrastructure/modules/container-apps/variables.tf index 44274627..d7fd503f 100644 --- a/infrastructure/modules/container-apps/variables.tf +++ b/infrastructure/modules/container-apps/variables.tf @@ -40,6 +40,7 @@ variable "enable_entra_id_authentication" { type = bool } + variable "env_config" { description = "Environment configuration. Different environments may share the same environment config and the same infrastructure" type = string @@ -128,6 +129,17 @@ variable "main_subnet_id" { type = string } +variable "min_replicas" { + description = "Minimum number of container replicas" + type = number +} + +variable "app_insights_id" { + description = "The Application Insights id." + type = string +} + + variable "region" { description = "The region to deploy in" type = string @@ -144,6 +156,45 @@ variable "use_apex_domain" { type = bool } +variable "enable_alerting" { + description = "Whether monitoring and alerting is enabled." + type = bool +} + +# variable "target_url" { +# description = "The external url" +# type = string +# } + +variable "alert_window_size" { + type = string + nullable = false + validation { + condition = contains(["PT1M", "PT5M", "PT15M", "PT30M", "PT1H", "PT6H", "PT12H"], var.alert_window_size) + error_message = "The alert_window_size must be one of: PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H" + } + description = "The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly." +} + +variable "container_memory" { + description = "Memory allocated to the webapp container in Gi. CPU is automatically set to half the memory value by the container-app module." + type = string +} + +variable "action_group_id" { + type = string + description = "ID of the action group to notify." +} + +variable "infra_key_vault_name" { + description = "Name of the infra key vault" + type = string +} + +variable "infra_key_vault_rg" { + description = "Name of the infra key vault resource group" + type = string +} locals { resource_group_name = "rg-${var.app_short_name}-${var.environment}-container-app-uks" diff --git a/infrastructure/modules/infra/alerts.tf b/infrastructure/modules/infra/alerts.tf new file mode 100644 index 00000000..fafc3c5b --- /dev/null +++ b/infrastructure/modules/infra/alerts.tf @@ -0,0 +1,41 @@ +module "service_health_alert" { + source = "../dtos-devops-templates/infrastructure/modules/monitor-activity-log-alert" + + name = "service-health-alerts-${var.app_short_name}-${var.environment}" + location = "global" + resource_group_name = azurerm_resource_group.main.name + description = "Azure Service Health alert for services impacting ${var.app_short_name} in ${var.environment}" + + scopes = [data.azurerm_subscription.current.id] + + criteria = { + category = "ServiceHealth" + level = null + + service_health = { + events = ["Incident", "Maintenance", "Informational", "ActionRequired", "Security"] + locations = [var.region] + + # Only monitor Azure services used by this application + # This reduces noise from unrelated service health events + services = [ + "Application Insights", + "Azure Container Apps", + "Azure Container Service", + "Azure Container Storage", + "Azure Database for PostgreSQL flexible servers", + "Azure DNS", + "Azure Frontdoor", + "Azure Monitor", + "Azure Private Link", + "Key Vault", + "Log Analytics", + "Storage", + "Virtual Network", + "Windows Virtual Desktop" + ] + } + } + + action_group_id = module.monitor_action_group.monitor_action_group.id +} diff --git a/infrastructure/modules/infra/data.tf b/infrastructure/modules/infra/data.tf index 227dd1e7..7e9605eb 100644 --- a/infrastructure/modules/infra/data.tf +++ b/infrastructure/modules/infra/data.tf @@ -1,3 +1,5 @@ +data "azurerm_subscription" "current" {} + data "azuread_service_principal" "github-mi" { display_name = var.github_mi_name } @@ -7,3 +9,15 @@ data "azuread_group" "kv_officers" { display_name = each.value } + +data "azurerm_key_vault" "infra" { + provider = azurerm.hub + + name = var.infra_key_vault_name + resource_group_name = var.infra_key_vault_rg +} + +data "azurerm_key_vault_secret" "infra" { + name = "monitoring-email-address" + key_vault_id = data.azurerm_key_vault.infra.id +} diff --git a/infrastructure/modules/infra/main.tf b/infrastructure/modules/infra/main.tf index 7490a581..071e06d1 100644 --- a/infrastructure/modules/infra/main.tf +++ b/infrastructure/modules/infra/main.tf @@ -70,3 +70,44 @@ module "container-app-environment" { private_dns_zone_rg_name = var.features.private_networking ? "rg-hub-${var.hub}-uks-private-dns-zones" : null zone_redundancy_enabled = var.cae_zone_redundancy_enabled } + +module "app_insights_audit" { + source = "../dtos-devops-templates/infrastructure/modules/app-insights" + + name = "appi-${var.environment}-uks-${var.app_short_name}" + location = var.region + resource_group_name = azurerm_resource_group.main.name + appinsights_type = "web" + + log_analytics_workspace_id = module.log_analytics_workspace_audit.id + + # alerts + action_group_id = module.monitor_action_group.monitor_action_group.id + enable_alerting = var.enable_alerting +} + +module "private_link_scoped_service_law" { + source = "../dtos-devops-templates/infrastructure/modules/private-link-scoped-service" + + providers = { + azurerm = azurerm.hub + } + + name = "pls-${var.app_short_name}-${var.environment}-law" + resource_group_name = "rg-hub-${var.hub}-uks-hub-private-endpoints" + linked_resource_id = module.log_analytics_workspace_audit.id + scope_name = "ampls-${var.hub}hub" +} + +module "private_link_scoped_service_app_insights" { + source = "../dtos-devops-templates/infrastructure/modules/private-link-scoped-service" + + providers = { + azurerm = azurerm.hub + } + + name = "pls-${var.app_short_name}-${var.environment}-appinsights" + resource_group_name = "rg-hub-${var.hub}-uks-hub-private-endpoints" + linked_resource_id = module.app_insights_audit.id + scope_name = "ampls-${var.hub}hub" +} diff --git a/infrastructure/modules/infra/monitor_action_group.tf b/infrastructure/modules/infra/monitor_action_group.tf new file mode 100644 index 00000000..cafeb383 --- /dev/null +++ b/infrastructure/modules/infra/monitor_action_group.tf @@ -0,0 +1,14 @@ +module "monitor_action_group" { + source = "../dtos-devops-templates/infrastructure/modules/monitor-action-group" + + name = "ag-${var.environment}-uks-${var.app_short_name}-${var.environment}" + resource_group_name = azurerm_resource_group.main.name + location = var.region + short_name = "ag-${var.environment}" + email_receiver = { + email = { + name = "email" + email_address = data.azurerm_key_vault_secret.infra.value + } + } +} diff --git a/infrastructure/modules/infra/output.tf b/infrastructure/modules/infra/output.tf index 10baa0b4..922cf5c1 100644 --- a/infrastructure/modules/infra/output.tf +++ b/infrastructure/modules/infra/output.tf @@ -14,6 +14,14 @@ output "log_analytics_workspace_audit_id" { value = module.log_analytics_workspace_audit.id } +output "app_insights_id" { + value = module.app_insights_audit.id +} + +output "monitor_action_group_id" { + value = module.monitor_action_group.monitor_action_group.id +} + output "default_domain" { value = module.container-app-environment.default_domain } diff --git a/infrastructure/modules/infra/variables.tf b/infrastructure/modules/infra/variables.tf index 68bf1a18..36726480 100644 --- a/infrastructure/modules/infra/variables.tf +++ b/infrastructure/modules/infra/variables.tf @@ -58,6 +58,21 @@ variable "protect_keyvault" { default = true } +variable "infra_key_vault_name" { + description = "Name of the infra key vault" + type = string +} + +variable "infra_key_vault_rg" { + description = "Name of the infra key vault resource group" + type = string +} + +variable "enable_alerting" { + description = "Whether monitoring and alerting is enabled." + type = bool +} + locals { hub_vnet_rg_name = "rg-hub-${var.hub}-uks-bootstrap" hub_vnet_name = "vnet-hub-${var.hub}-uks" diff --git a/infrastructure/terraform/spoke/data.tf b/infrastructure/terraform/spoke/data.tf index 4f0a60ea..855734fc 100644 --- a/infrastructure/terraform/spoke/data.tf +++ b/infrastructure/terraform/spoke/data.tf @@ -35,3 +35,17 @@ data "azurerm_subnet" "main" { virtual_network_name = "vnet-${var.env_config}-uks-${var.app_short_name}" resource_group_name = local.resource_group_name } + +data "azurerm_monitor_action_group" "main" { + count = var.deploy_infra ? 0 : 1 + + name = "ag-${var.env_config}-uks-${var.app_short_name}-${var.env_config}" + resource_group_name = local.resource_group_name +} + +data "azurerm_application_insights" "app_insights" { + count = var.deploy_infra ? 0 : 1 + + name = "appi-${var.env_config}-uks-${var.app_short_name}" + resource_group_name = local.resource_group_name +} diff --git a/infrastructure/terraform/spoke/main.tf b/infrastructure/terraform/spoke/main.tf index 203b36d7..3c6b8fb5 100644 --- a/infrastructure/terraform/spoke/main.tf +++ b/infrastructure/terraform/spoke/main.tf @@ -10,6 +10,8 @@ module "infra" { region = local.region resource_group_name = local.resource_group_name + infra_key_vault_name = local.infra_key_vault_name + infra_key_vault_rg = local.infra_key_vault_rg app_short_name = var.app_short_name environment = var.env_config features = var.features @@ -19,6 +21,7 @@ module "infra" { protect_keyvault = var.protect_keyvault vnet_address_space = var.vnet_address_space cae_zone_redundancy_enabled = var.cae_zone_redundancy_enabled + enable_alerting = var.enable_alerting } module "container-apps" { @@ -32,8 +35,12 @@ module "container-apps" { } region = local.region + action_group_id = var.deploy_infra ? module.infra[0].monitor_action_group_id : data.azurerm_monitor_action_group.main[0].id + alert_window_size = var.alert_window_size + enable_alerting = var.enable_alerting app_key_vault_id = var.deploy_infra ? module.infra[0].app_key_vault_id : data.azurerm_key_vault.app_key_vault[0].id app_short_name = var.app_short_name + app_insights_id = var.deploy_infra ? module.infra[0].app_insights_id : data.azurerm_application_insights.app_insights[0].id container_app_environment_id = var.deploy_infra ? module.infra[0].container_app_environment_id : data.azurerm_container_app_environment.this[0].id default_domain = var.deploy_infra ? module.infra[0].default_domain : data.azurerm_container_app_environment.this[0].default_domain dns_zone_name = var.dns_zone_name @@ -56,5 +63,10 @@ module "container-apps" { postgres_subnet_id = var.deploy_infra ? module.infra[0].postgres_subnet_id : data.azurerm_subnet.postgres[0].id main_subnet_id = var.deploy_infra ? module.infra[0].main_subnet_id : data.azurerm_subnet.main[0].id seed_demo_data = var.seed_demo_data + infra_key_vault_name = local.infra_key_vault_name + infra_key_vault_rg = local.infra_key_vault_rg use_apex_domain = var.use_apex_domain + # target_url = var.deploy_container_apps ? "${module.container-apps[0].external_url}healthcheck" : null + container_memory = var.container_memory + min_replicas = var.min_replicas } diff --git a/infrastructure/terraform/spoke/variables.tf b/infrastructure/terraform/spoke/variables.tf index fd5ff666..8376be0c 100644 --- a/infrastructure/terraform/spoke/variables.tf +++ b/infrastructure/terraform/spoke/variables.tf @@ -152,8 +152,38 @@ variable "vnet_address_space" { type = string } +variable "alert_window_size" { + type = string + nullable = false + default = "PT15M" + validation { + condition = contains(["PT1M", "PT5M", "PT15M", "PT30M", "PT1H", "PT6H", "PT12H"], var.alert_window_size) + error_message = "The alert_window_size must be one of: PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H" + } + description = "The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly." +} + +variable "enable_alerting" { + description = "Whether monitoring and alerting is enabled." + type = bool + default = false +} + +variable "container_memory" { + description = "Memory allocated to the webapp container in Gi. CPU is automatically set to half the memory value by the container-app module." + type = string + default = "0.5" +} + +variable "min_replicas" { + description = "Minimum number of container replicas" + type = number + default = 1 +} locals { region = "uksouth" resource_group_name = "rg-${var.app_short_name}-${var.env_config}-uks" + infra_key_vault_name = "kv-${var.app_short_name}-${var.env_config}-inf" + infra_key_vault_rg = "rg-${var.app_short_name}-${var.env_config}-infra" }