From 08616067134f8689e572cc2016395ec0dac4c848 Mon Sep 17 00:00:00 2001
From: bdchatham
Date: Wed, 24 Jun 2026 10:38:55 -0700
Subject: [PATCH 1/2] fix(harness): unique chain id per run; stop reusing stale genesis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the nightly hang: the cronjobs pass a static SEI_CHAIN_ID
("bench"/"rel"), so every run reuses the prior run's persisted genesis.
The nodes boot with fresh validator/P2P keys, but the genesis — keyed by
chain id — still names the prior run's validator set, so the live nodes
are not the validators genesis expects. Consensus can never reach 2/3 of
the genesis set and halts at height 1 (validators stuck in
RoundStepPropose, voting_power=0, dialing phantom peer NodeIDs that exist
nowhere in the live config). With no blocks, the EVM RPC never serves and
the harness's EVM-readiness gate blocks → NightlyRunFailed.

benchmark/release used SEI_CHAIN_ID raw; chaos suffixed only by scenario,
so it collided across runs too. The harness image is distroless (no
shell), so the id can't be made unique in the CronJob — derive it
in-process: runChainID appends a per-run token to the base, matching the
chaos suite's existing base semantics.

Also reverts the storage.state_store.enable=false override from #445 —
that was based on an incorrect state-store-deadlock diagnosis (seid was
never wedged; it ran fine, the chain was halted). The state store returns
to its image default.

Co-Authored-By: Claude Opus 4.8 --- test/integration/benchmark_test.go | 4 ++-- test/integration/chaossuite_test.go | 2 +- test/integration/harness_test.go | 24 ++++++++++++++---------- test/integration/release_test.go | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/test/integration/benchmark_test.go b/test/integration/benchmark_test.go index 6c33d01..80c6c5d 100644 --- a/test/integration/benchmark_test.go +++ b/test/integration/benchmark_test.go @@ -16,7 +16,7 @@ import ( // // Inputs (env, mirroring k8s_nightly.yml): // -// SEI_CHAIN_ID per-run chain id (e.g. bench-) [required] +// SEI_CHAIN_ID base chain id (a per-run token is appended) [required] // SEID_IMAGE seid image under test [required] // SEILOAD_IMAGE sei-load benchmark image [required] // SEI_RUN_ID unique run id (sei.io/harness-run) [default: SEI_CHAIN_ID] @@ -33,7 +33,7 @@ import ( func TestBenchmark(t *testing.T) { requireCluster(t) - chainID := mustEnv(t, "SEI_CHAIN_ID") + chainID := runChainID(mustEnv(t, "SEI_CHAIN_ID")) s := spec{ chainID: chainID, runID: envOr("SEI_RUN_ID", chainID), diff --git a/test/integration/chaossuite_test.go b/test/integration/chaossuite_test.go index a14acf5..e5fe611 100644 --- a/test/integration/chaossuite_test.go +++ b/test/integration/chaossuite_test.go @@ -71,7 +71,7 @@ var chaosScenarios = []chaosScenario{ // CHAOS_DURATION [optional]. Run with -test.timeout 0 (see TestBenchmark). func TestChaosSuite(t *testing.T) { requireCluster(t) - base := mustEnv(t, "SEI_CHAIN_ID") + base := runChainID(mustEnv(t, "SEI_CHAIN_ID")) seid := mustEnv(t, "SEID_IMAGE") ns := envOr("SEI_NAMESPACE", "") duration := envOr("CHAOS_DURATION", "3m") diff --git a/test/integration/harness_test.go b/test/integration/harness_test.go index b77bf3b..defd2bf 100644 --- a/test/integration/harness_test.go +++ b/test/integration/harness_test.go @@ -40,18 +40,22 @@ import ( // DeletionPolicy cascade are the cleanup path. 
const runLabelKey = "sei.io/harness-run" -// memiavlStorageConfig pins storage for the load/release/chaos suites (the -// major-upgrade suite omits it — it tests the migration path). State commitment -// stays on memiavl; the SeiDB state store is disabled because the latest image -// defaults it on for full nodes, and enabling it on a fresh-genesis RPC follower -// deadlocks seid at store-open before it binds listeners. Matches the validators -// (ss-enable=false); FlatKV-migration coverage is unaffected — that's the SC -// layer, not the historical state store. (storage.state_store.write_mode is gone: -// the SS layer has no write-mode field on current sei-chain — EVM routing is the -// evm-split bool — so the old key was a silently-ignored no-op.) +// memiavlStorageConfig pins state commitment to memiavl for the load/release/chaos +// suites (the major-upgrade suite omits it — it tests the migration path). The +// controller default write-mode is rejected by the nightly image, so it must be +// set explicitly; the state store is left at its image default. var memiavlStorageConfig = map[string]string{ "storage.state_commit.write_mode": "memiavl_only", - "storage.state_store.enable": "false", +} + +// runChainID appends a per-run token to the base chain id so every run gets a +// fresh genesis and node keys. A static id reused across runs (e.g. "bench") +// collides with the prior run's persisted genesis: the nodes boot with new keys +// but the genesis — keyed by chain id — still names the prior run's validator +// set, so the live nodes are not the validators genesis expects and consensus +// halts at height 1. +func runChainID(base string) string { + return base + "-" + strconv.FormatInt(time.Now().Unix(), 36) } // mergeConfig returns base overlaid with extra; extra wins on key collision. 
diff --git a/test/integration/release_test.go b/test/integration/release_test.go index 49ae922..7d18872 100644 --- a/test/integration/release_test.go +++ b/test/integration/release_test.go @@ -62,7 +62,7 @@ var releaseRPCConfig = map[string]string{ // -test.timeout 0 (see TestBenchmark). func TestRelease(t *testing.T) { requireCluster(t) - chainID := mustEnv(t, "SEI_CHAIN_ID") + chainID := runChainID(mustEnv(t, "SEI_CHAIN_ID")) seid := mustEnv(t, "SEID_IMAGE") releaseImage := mustEnv(t, "RELEASE_TEST_IMAGE") ns := envOr("SEI_NAMESPACE", "") From c484971f775b43fa0fb564d7124d9cb1e06a0176 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 10:51:15 -0700 Subject: [PATCH 2/2] fix(harness): nanosecond run token + correct stale GC comment (xreview #446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xreview hardening: - runChainID uses UnixNano (was Unix): 1-second resolution could alias a same-second manual re-trigger onto a prior run's not-yet-reaped chain and reproduce the height-1 halt. Nanosecond resolution closes that window. - Correct the runLabelKey comment: the nightly-gc label sweep (sei.io/harness-run, >5h) already ships in platform and reaps abnormal-exit orphans — the prior "pending platform deliverable" note was stale and implied an unbounded leak. Co-Authored-By: Claude Opus 4.8 --- test/integration/harness_test.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/integration/harness_test.go b/test/integration/harness_test.go index defd2bf..ef20391 100644 --- a/test/integration/harness_test.go +++ b/test/integration/harness_test.go @@ -35,9 +35,10 @@ import ( // runLabelKey marks a run's resources for the abnormal-exit reaper (t.Cleanup is // skipped on SIGKILL / a -test.timeout breach). provision stamps it on the // network + every node; a suite's directly-applied seiload Job + fault CRs stamp -// it too. 
The matching nightly label-GC sweep is a pending platform deliverable; -// until it ships, normal-exit teardown (t.Cleanup) + the SeiNetwork -// DeletionPolicy cascade are the cleanup path. +// it too. The nightly-gc CronJob reaps these by label (resources older than 5h — +// above the longest suite deadline, so it never races a live run), cascading a +// SeiNetwork delete to its validators + PVCs; normal-exit t.Cleanup is the fast +// path. const runLabelKey = "sei.io/harness-run" // memiavlStorageConfig pins state commitment to memiavl for the load/release/chaos @@ -53,9 +54,10 @@ var memiavlStorageConfig = map[string]string{ // collides with the prior run's persisted genesis: the nodes boot with new keys // but the genesis — keyed by chain id — still names the prior run's validator // set, so the live nodes are not the validators genesis expects and consensus -// halts at height 1. +// halts at height 1. Nanosecond resolution so a same-second manual re-trigger +// can't alias a prior run's not-yet-reaped chain. func runChainID(base string) string { - return base + "-" + strconv.FormatInt(time.Now().Unix(), 36) + return base + "-" + strconv.FormatInt(time.Now().UnixNano(), 36) } // mergeConfig returns base overlaid with extra; extra wins on key collision.