From b77f723c547a47e1a263030a28e9ddb3c969bcf8 Mon Sep 17 00:00:00 2001
From: Kim Gustyr
Date: Fri, 26 Jun 2026 11:27:22 +0100
Subject: [PATCH] fix(infra): Task processor tasks killed during startup by health check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The task-processor health check (`flagsmith healthcheck tcp`) passes
only once the container binds port 8000, but the entrypoint runs full
bootstrap (migrations, createcachetable, ClickHouse migrate, waitfordb)
first, taking ~80s to bind. With startPeriod 5s the container was marked
unhealthy and SIGTERMed after ~55s — often just before it came up —
recycling tasks several times a day and flapping deploys.

Raise startPeriod to 120s in the prod and staging ECS task definitions
to cover the observed cold-start and its variance.

beep boop
---
 .../aws/production/ecs-task-definition-task-processor.json | 2 +-
 .../aws/staging/ecs-task-definition-task-processor.json    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/infrastructure/aws/production/ecs-task-definition-task-processor.json b/infrastructure/aws/production/ecs-task-definition-task-processor.json
index c5f5f35d106a..4c73ad144510 100644
--- a/infrastructure/aws/production/ecs-task-definition-task-processor.json
+++ b/infrastructure/aws/production/ecs-task-definition-task-processor.json
@@ -29,7 +29,7 @@
                 "interval": 10,
                 "timeout": 2,
                 "retries": 5,
-                "startPeriod": 5
+                "startPeriod": 120
             },
             "essential": true,
             "environment": [
diff --git a/infrastructure/aws/staging/ecs-task-definition-task-processor.json b/infrastructure/aws/staging/ecs-task-definition-task-processor.json
index cf24c91e1948..e840904fe471 100644
--- a/infrastructure/aws/staging/ecs-task-definition-task-processor.json
+++ b/infrastructure/aws/staging/ecs-task-definition-task-processor.json
@@ -24,7 +24,7 @@
                 "interval": 10,
                 "timeout": 2,
                 "retries": 5,
-                "startPeriod": 5
+                "startPeriod": 120
             },
             "essential": true,
             "environment": [