From b77f723c547a47e1a263030a28e9ddb3c969bcf8 Mon Sep 17 00:00:00 2001
From: Kim Gustyr <kim.gustyr@flagsmith.com>
Date: Fri, 26 Jun 2026 11:27:22 +0100
Subject: [PATCH] fix(infra): Task processor tasks killed during startup by
 health check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The task-processor health check (`flagsmith healthcheck tcp`) passes only
once the container binds port 8000, but the entrypoint runs full bootstrap
(migrations, createcachetable, ClickHouse migrate, waitfordb) first, taking
~80s to bind. With startPeriod 5s (plus 5 retries at a 10s interval) the
container was marked unhealthy and SIGTERMed after ~55s — often just
before it came up — recycling tasks several times a day and flapping
deploys.

Raise startPeriod to 120s in the prod and staging ECS task definitions to
cover the observed cold-start and its variance.

beep boop
---
 .../aws/production/ecs-task-definition-task-processor.json      | 2 +-
 .../aws/staging/ecs-task-definition-task-processor.json         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/infrastructure/aws/production/ecs-task-definition-task-processor.json b/infrastructure/aws/production/ecs-task-definition-task-processor.json
index c5f5f35d106a..4c73ad144510 100644
--- a/infrastructure/aws/production/ecs-task-definition-task-processor.json
+++ b/infrastructure/aws/production/ecs-task-definition-task-processor.json
@@ -29,7 +29,7 @@
                 "interval": 10,
                 "timeout": 2,
                 "retries": 5,
-                "startPeriod": 5
+                "startPeriod": 120
             },
             "essential": true,
             "environment": [
diff --git a/infrastructure/aws/staging/ecs-task-definition-task-processor.json b/infrastructure/aws/staging/ecs-task-definition-task-processor.json
index cf24c91e1948..e840904fe471 100644
--- a/infrastructure/aws/staging/ecs-task-definition-task-processor.json
+++ b/infrastructure/aws/staging/ecs-task-definition-task-processor.json
@@ -24,7 +24,7 @@
                 "interval": 10,
                 "timeout": 2,
                 "retries": 5,
-                "startPeriod": 5
+                "startPeriod": 120
             },
             "essential": true,
             "environment": [