From 8651f5bc20bfcaa77a45f6c383c30ca6fec9e5f4 Mon Sep 17 00:00:00 2001 From: Omni Date: Thu, 21 May 2026 05:11:49 +0000 Subject: [PATCH 1/2] Harden explorer docker compose resources --- docker-compose/README.md | 3 + docker-compose/production-hardening.md | 83 ++++++++++++++++++++++++++ docker-compose/services/backend.yml | 15 ++++- docker-compose/services/db.yml | 7 +++ docker-compose/services/redis.yml | 8 +++ docker-compose/services/stats.yml | 14 +++++ 6 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 docker-compose/production-hardening.md diff --git a/docker-compose/README.md b/docker-compose/README.md index 97b6e2abdfbf..7934e73198ae 100644 --- a/docker-compose/README.md +++ b/docker-compose/README.md @@ -67,6 +67,9 @@ You can adjust BlockScout environment variables: - for visualizer in `./envs/common-visualizer.env` - for user-ops-indexer in `./envs/common-user-ops-indexer.env` +For production resource limits, health checks, log rotation, and monitoring +requirements, see [`production-hardening.md`](./production-hardening.md). + Descriptions of the ENVs are available - for [backend](https://docs.blockscout.com/setup/env-variables) diff --git a/docker-compose/production-hardening.md b/docker-compose/production-hardening.md new file mode 100644 index 000000000000..782cb6410cd0 --- /dev/null +++ b/docker-compose/production-hardening.md @@ -0,0 +1,83 @@ +# Production Hardening + +This checklist keeps the public Docker Compose files safe to publish while +covering the failure mode observed on the Numbers mainnet explorer: long-running +containers consumed nearly all VM memory, then the VM kept receiving packets but +stopped returning responses. 
+ +## Resource Guards + +The compose service files define non-secret defaults that can be overridden by +the production environment: + +```sh +BACKEND_MEM_LIMIT=5g +BACKEND_MEMSWAP_LIMIT=5g +DB_MEM_LIMIT=5g +DB_MEMSWAP_LIMIT=5g +STATS_MEM_LIMIT=1g +STATS_MEMSWAP_LIMIT=1g +STATS_DB_MEM_LIMIT=1g +STATS_DB_MEMSWAP_LIMIT=1g +REDIS_MEM_LIMIT=512m +REDIS_MEMSWAP_LIMIT=512m +``` + +Keep total container limits below host memory so the OS, nginx, Docker, and the +monitoring agent have headroom. On a 16 GiB VM, reserve at least 3 GiB for the +host. + +## Health Checks + +The backend container now exposes a Docker healthcheck against: + +```text +http://localhost:${PORT:-4000}/api/v2/main-page/indexing-status +``` + +Production monitoring should also check the public endpoint: + +```sh +curl -fsS --max-time 10 \ + https://mainnet.num.network/api/v2/main-page/indexing-status +``` + +Alert if this endpoint is non-200 or exceeds the expected latency for multiple +consecutive checks. + +## Log Rotation + +Docker `json-file` log rotation is enabled for the high-volume services. The +defaults are intentionally conservative and can be adjusted without changing the +compose files: + +```sh +BACKEND_LOG_MAX_SIZE=50m +BACKEND_LOG_MAX_FILE=5 +DB_LOG_MAX_SIZE=50m +DB_LOG_MAX_FILE=5 +STATS_LOG_MAX_SIZE=50m +STATS_LOG_MAX_FILE=5 +STATS_DB_LOG_MAX_SIZE=25m +STATS_DB_LOG_MAX_FILE=5 +REDIS_LOG_MAX_SIZE=25m +REDIS_LOG_MAX_FILE=5 +``` + +## Monitoring + +Required production alerts: + +- VM memory used > 85% for 10 minutes. +- VM outbound bytes = 0 while inbound bytes > 0 for 5 minutes. +- Public explorer health endpoint returns non-200 for 3 consecutive checks. +- Docker container health is `unhealthy` for backend. + +Enable process-level memory metrics on the VM. Without process RSS history, an +incident can prove memory exhaustion but cannot identify which process caused it. 
+Do not exclude these Ops Agent metrics in production: + +```yaml +agent.googleapis.com/processes/* +agent.googleapis.com/swap/* +``` diff --git a/docker-compose/services/backend.yml b/docker-compose/services/backend.yml index 46fe299ab24d..f31b21eee1ed 100644 --- a/docker-compose/services/backend.yml +++ b/docker-compose/services/backend.yml @@ -5,6 +5,8 @@ services: image: blockscout/${DOCKER_REPO:-blockscout}:${DOCKER_TAG:-latest} pull_policy: always restart: always + mem_limit: ${BACKEND_MEM_LIMIT:-5g} + memswap_limit: ${BACKEND_MEMSWAP_LIMIT:-5g} stop_grace_period: 5m container_name: 'backend' command: sh -c "bin/blockscout eval \"Elixir.Explorer.ReleaseTasks.create_and_migrate()\" && bin/blockscout start" @@ -14,4 +16,15 @@ services: - ../envs/common-blockscout.env volumes: - ./logs/:/app/logs/ - - ./dets/:/app/dets/ \ No newline at end of file + - ./dets/:/app/dets/ + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:$${PORT:-4000}/api/v2/main-page/indexing-status >/dev/null || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 2m + logging: + driver: json-file + options: + max-size: ${BACKEND_LOG_MAX_SIZE:-50m} + max-file: ${BACKEND_LOG_MAX_FILE:-5} diff --git a/docker-compose/services/db.yml b/docker-compose/services/db.yml index 2db8647d13dc..b4bde7f0ace4 100644 --- a/docker-compose/services/db.yml +++ b/docker-compose/services/db.yml @@ -16,6 +16,8 @@ services: user: 2000:2000 shm_size: 256m restart: always + mem_limit: ${DB_MEM_LIMIT:-5g} + memswap_limit: ${DB_MEMSWAP_LIMIT:-5g} container_name: 'db' command: postgres -c 'max_connections=200' -c 'client_connection_check_interval=60000' environment: @@ -33,3 +35,8 @@ services: timeout: 5s retries: 5 start_period: 10s + logging: + driver: json-file + options: + max-size: ${DB_LOG_MAX_SIZE:-50m} + max-file: ${DB_LOG_MAX_FILE:-5} diff --git a/docker-compose/services/redis.yml b/docker-compose/services/redis.yml index 93f616686de6..03437478b55e 100644 --- 
a/docker-compose/services/redis.yml +++ b/docker-compose/services/redis.yml @@ -5,5 +5,13 @@ services: image: 'redis:alpine' container_name: redis-db command: redis-server + restart: always + mem_limit: ${REDIS_MEM_LIMIT:-512m} + memswap_limit: ${REDIS_MEMSWAP_LIMIT:-512m} volumes: - ./redis-data:/data + logging: + driver: json-file + options: + max-size: ${REDIS_LOG_MAX_SIZE:-25m} + max-file: ${REDIS_LOG_MAX_FILE:-5} diff --git a/docker-compose/services/stats.yml b/docker-compose/services/stats.yml index d4f7e8dffcd6..53d6b81b131b 100644 --- a/docker-compose/services/stats.yml +++ b/docker-compose/services/stats.yml @@ -16,6 +16,8 @@ services: user: 2000:2000 shm_size: 256m restart: always + mem_limit: ${STATS_DB_MEM_LIMIT:-1g} + memswap_limit: ${STATS_DB_MEMSWAP_LIMIT:-1g} container_name: 'stats-db' command: postgres -c 'max_connections=200' environment: @@ -33,12 +35,19 @@ services: timeout: 5s retries: 5 start_period: 10s + logging: + driver: json-file + options: + max-size: ${STATS_DB_LOG_MAX_SIZE:-25m} + max-file: ${STATS_DB_LOG_MAX_FILE:-5} stats: image: ghcr.io/blockscout/stats:${STATS_DOCKER_TAG:-latest} pull_policy: always platform: linux/amd64 restart: always + mem_limit: ${STATS_MEM_LIMIT:-1g} + memswap_limit: ${STATS_MEMSWAP_LIMIT:-1g} container_name: 'stats' extra_hosts: - 'host.docker.internal:host-gateway' @@ -49,3 +58,8 @@ services: - STATS__BLOCKSCOUT_DB_URL=${STATS__BLOCKSCOUT_DB_URL:-postgresql://blockscout:ceWb1MeLBEeOIfk65gU8EjF8@db:5432/blockscout} - STATS__CREATE_DATABASE=${STATS__CREATE_DATABASE:-true} - STATS__RUN_MIGRATIONS=${STATS__RUN_MIGRATIONS:-true} + logging: + driver: json-file + options: + max-size: ${STATS_LOG_MAX_SIZE:-50m} + max-file: ${STATS_LOG_MAX_FILE:-5} From e5f6c8ef8c68a1f3065a30472dc9a86808df0e37 Mon Sep 17 00:00:00 2001 From: Omni Date: Thu, 21 May 2026 05:58:10 +0000 Subject: [PATCH 2/2] Tune production resource defaults --- docker-compose/production-hardening.md | 8 ++++---- docker-compose/services/backend.yml | 4 
++-- docker-compose/services/db.yml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose/production-hardening.md b/docker-compose/production-hardening.md index 782cb6410cd0..02912e564116 100644 --- a/docker-compose/production-hardening.md +++ b/docker-compose/production-hardening.md @@ -11,10 +11,10 @@ The compose service files define non-secret defaults that can be overridden by the production environment: ```sh -BACKEND_MEM_LIMIT=5g -BACKEND_MEMSWAP_LIMIT=5g -DB_MEM_LIMIT=5g -DB_MEMSWAP_LIMIT=5g +BACKEND_MEM_LIMIT=4g +BACKEND_MEMSWAP_LIMIT=4g +DB_MEM_LIMIT=6g +DB_MEMSWAP_LIMIT=6g STATS_MEM_LIMIT=1g STATS_MEMSWAP_LIMIT=1g STATS_DB_MEM_LIMIT=1g diff --git a/docker-compose/services/backend.yml b/docker-compose/services/backend.yml index f31b21eee1ed..fe6425ec755b 100644 --- a/docker-compose/services/backend.yml +++ b/docker-compose/services/backend.yml @@ -5,8 +5,8 @@ services: image: blockscout/${DOCKER_REPO:-blockscout}:${DOCKER_TAG:-latest} pull_policy: always restart: always - mem_limit: ${BACKEND_MEM_LIMIT:-5g} - memswap_limit: ${BACKEND_MEMSWAP_LIMIT:-5g} + mem_limit: ${BACKEND_MEM_LIMIT:-4g} + memswap_limit: ${BACKEND_MEMSWAP_LIMIT:-4g} stop_grace_period: 5m container_name: 'backend' command: sh -c "bin/blockscout eval \"Elixir.Explorer.ReleaseTasks.create_and_migrate()\" && bin/blockscout start" diff --git a/docker-compose/services/db.yml b/docker-compose/services/db.yml index b4bde7f0ace4..8fda2e31e5e3 100644 --- a/docker-compose/services/db.yml +++ b/docker-compose/services/db.yml @@ -16,8 +16,8 @@ services: user: 2000:2000 shm_size: 256m restart: always - mem_limit: ${DB_MEM_LIMIT:-5g} - memswap_limit: ${DB_MEMSWAP_LIMIT:-5g} + mem_limit: ${DB_MEM_LIMIT:-6g} + memswap_limit: ${DB_MEMSWAP_LIMIT:-6g} container_name: 'db' command: postgres -c 'max_connections=200' -c 'client_connection_check_interval=60000' environment: