From 4568ec8d3491a7a66a9f9d07fef2acc963414a1d Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 12:12:18 -0700 Subject: [PATCH 01/31] persist: replace file-per-entry with WAL-based persistence Replace the file-per-block and file-per-commitqc persistence with sei-db/wal. Blocks use one WAL per lane so that truncation is independent (no stale-lane problem). CommitQCs use a single WAL with a linear RoadIndex-to-WAL-index mapping. Key changes: - BlockPersister: per-lane WAL in blocks/{hex-lane-id}/ subdirs, lazy lane creation, independent per-lane TruncateBefore. - CommitQCPersister: single WAL in commitqcs/, tracks firstWALIdx and nextWALIdx locally for correct truncation mapping. - Remove all file-per-entry code: filename construction/parsing, directory scanning, individual file read/write/delete, corrupt file skipping. - Rewrite tests for WAL semantics (append-only, truncation, replay). Made-with: Cursor --- .../internal/autobahn/avail/state.go | 6 +- .../autobahn/consensus/persist/blocks.go | 311 ++++++++++-------- .../autobahn/consensus/persist/blocks_test.go | 214 ++++-------- .../autobahn/consensus/persist/commitqcs.go | 191 +++++------ .../consensus/persist/commitqcs_test.go | 148 +++------ 5 files changed, 381 insertions(+), 489 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state.go b/sei-tendermint/internal/autobahn/avail/state.go index aa070096d8..6d1b3cba7d 100644 --- a/sei-tendermint/internal/autobahn/avail/state.go +++ b/sei-tendermint/internal/autobahn/avail/state.go @@ -157,7 +157,7 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin return nil, err } - // Delete files below the prune anchor that were filtered out by + // Truncate WAL entries below the prune anchor that were filtered out by // loadPersistedState. Also reset the CommitQC persister's cursor to // match the post-prune range. 
laneFirsts := make(map[types.LaneID]types.BlockNumber, len(inner.blocks)) @@ -165,10 +165,10 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin laneFirsts[lane] = q.first } if err := pers.blocks.DeleteBefore(laneFirsts); err != nil { - return nil, fmt.Errorf("prune stale block files: %w", err) + return nil, fmt.Errorf("prune stale block WAL entries: %w", err) } if err := pers.commitQCs.DeleteBefore(inner.commitQCs.first); err != nil { - return nil, fmt.Errorf("prune stale commitQC files: %w", err) + return nil, fmt.Errorf("prune stale commitQC WAL entries: %w", err) } pers.commitQCs.ResetNext(inner.commitQCs.next) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 93f233ee7d..bf83768b56 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -1,30 +1,17 @@ -// TODO: Block file persistence is a temporary solution that will be replaced by -// a WAL (Write-Ahead Log) library before launch. CommitQC file persistence -// (commitqcs.go) shares the same migration plan. With a WAL, atomic appends -// eliminate several complexities in both files: -// - Corrupt file handling (WAL handles its own integrity). -// - Per-file naming, parsing, and directory scanning. -// - Orphaned file cleanup (WAL truncation replaces DeleteBefore). -// - Gap handling in newInner (WAL replay is always contiguous). -// -// What survives: the BlockPersister abstraction (PersistBlock/DeleteBefore). 
- package persist import ( + "cmp" + "context" "encoding/hex" "fmt" - "maps" "os" "path/filepath" "slices" - "strconv" - "strings" "log/slog" - "google.golang.org/protobuf/proto" - + dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" "github.com/sei-protocol/seilog" @@ -38,27 +25,64 @@ type LoadedBlock struct { Proposal *types.Signed[*types.LaneProposal] } -// BlockPersister manages individual block files in a blocks/ subdirectory. -// Each block is stored as _.pb. -// The caller is responsible for driving persistence (typically a goroutine that -// watches in-memory block state and calls PersistBlock / DeleteBefore). +// laneWAL wraps a per-lane WAL with a linear index mapping between +// BlockNumber and WAL index. Blocks within a lane are sequential, so the +// mapping is: walIndex = firstWALIdx + (blockNumber - firstBlockNumber). +type laneWAL struct { + wal *dbwal.WAL[[]byte] + firstWALIdx uint64 + nextWALIdx uint64 + // firstBlockNum is the block number corresponding to firstWALIdx. + firstBlockNum types.BlockNumber + // nextBlockNum is the next block number to be persisted. + nextBlockNum types.BlockNumber +} + +// BlockPersister manages block persistence using one WAL per lane. +// Each lane gets its own WAL in a subdirectory named by hex-encoded lane ID, +// so truncation is independent per lane. // When noop is true, all disk I/O is skipped. type BlockPersister struct { - dir string // full path to the blocks/ subdirectory; empty when noop - noop bool + dir string + lanes map[types.LaneID]*laneWAL + noop bool } -// newNoOpBlockPersister returns a BlockPersister that skips all disk I/O. -// Used when persistence is disabled. 
func newNoOpBlockPersister() *BlockPersister { - return &BlockPersister{noop: true} + return &BlockPersister{noop: true, lanes: map[types.LaneID]*laneWAL{}} +} + +func laneDir(lane types.LaneID) string { + return hex.EncodeToString(lane.Bytes()) +} + +func newLaneWAL(dir string) (*laneWAL, error) { + if err := os.MkdirAll(dir, 0700); err != nil { + return nil, fmt.Errorf("create lane dir %s: %w", dir, err) + } + w, err := dbwal.NewWAL[[]byte]( + context.Background(), + func(data []byte) ([]byte, error) { return data, nil }, + func(data []byte) ([]byte, error) { return data, nil }, + logger, + dir, + dbwal.Config{ + WriteBufferSize: 0, // synchronous writes + WriteBatchSize: 1, // no batching + FsyncEnabled: true, + }, + ) + if err != nil { + return nil, err + } + return &laneWAL{wal: w, nextWALIdx: 1}, nil } -// NewBlockPersister creates the blocks/ subdirectory if it doesn't exist and -// returns a block persister. Loads all persisted blocks from disk as sorted -// slices per lane. Corrupt files are skipped; the caller (newInner) returns -// an error if the resulting slices are non-contiguous. -// When stateDir is None, returns a no-op persister that skips all disk I/O. +// NewBlockPersister opens (or creates) per-lane WALs in subdirectories of +// blocks/ and replays all persisted entries. Returns the persister and loaded +// blocks grouped by lane (sorted by block number). Corrupt tail entries are +// auto-truncated by the WAL library. +// When stateDir is None, returns a no-op persister. 
func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[types.LaneID][]LoadedBlock, error) { sd, ok := stateDir.Get() if !ok { @@ -69,150 +93,151 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type return nil, nil, fmt.Errorf("create blocks dir %s: %w", dir, err) } - bp := &BlockPersister{ - dir: dir, - } - blocks, err := bp.loadAll() - if err != nil { - return nil, nil, err - } - return bp, blocks, nil -} - -func blockFilename(lane types.LaneID, n types.BlockNumber) string { - return hex.EncodeToString(lane.Bytes()) + "_" + strconv.FormatUint(uint64(n), 10) + ".pb" -} + bp := &BlockPersister{dir: dir, lanes: map[types.LaneID]*laneWAL{}} -func parseBlockFilename(name string) (types.LaneID, types.BlockNumber, error) { - name = strings.TrimSuffix(name, ".pb") - parts := strings.SplitN(name, "_", 2) - if len(parts) != 2 { - return types.PublicKey{}, 0, fmt.Errorf("bad block filename %q", name) - } - keyBytes, err := hex.DecodeString(parts[0]) + entries, err := os.ReadDir(dir) if err != nil { - return types.PublicKey{}, 0, fmt.Errorf("bad lane hex in %q: %w", name, err) + return nil, nil, fmt.Errorf("read blocks dir %s: %w", dir, err) } - lane, err := types.PublicKeyFromBytes(keyBytes) - if err != nil { - return types.PublicKey{}, 0, fmt.Errorf("bad lane key in %q: %w", name, err) - } - n, err := strconv.ParseUint(parts[1], 10, 64) - if err != nil { - return types.PublicKey{}, 0, fmt.Errorf("bad block number in %q: %w", name, err) + + allBlocks := map[types.LaneID][]LoadedBlock{} + for _, e := range entries { + if !e.IsDir() { + continue + } + laneDir := filepath.Join(dir, e.Name()) + lw, err := newLaneWAL(laneDir) + if err != nil { + bp.Close() + return nil, nil, fmt.Errorf("open lane WAL in %s: %w", laneDir, err) + } + loaded, lane, err := lw.loadAll() + if err != nil { + _ = lw.wal.Close() + bp.Close() + return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) + } + if lane == nil { + _ = lw.wal.Close() 
+ continue + } + bp.lanes[*lane] = lw + if len(loaded) > 0 { + allBlocks[*lane] = loaded + } } - return lane, types.BlockNumber(n), nil + + return bp, allBlocks, nil } -// PersistBlock writes a signed lane proposal to its own file. +// PersistBlock writes a signed lane proposal to the per-lane WAL. +// Creates the lane WAL lazily if this is the first block for the lane. func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposal]) error { if bp.noop { return nil } h := proposal.Msg().Block().Header() - pb := types.SignedMsgConv[*types.LaneProposal]().Encode(proposal) - data, err := proto.Marshal(pb) - if err != nil { - return fmt.Errorf("marshal block %s/%d: %w", h.Lane(), h.BlockNumber(), err) + lane := h.Lane() + lw, ok := bp.lanes[lane] + if !ok { + var err error + lw, err = newLaneWAL(filepath.Join(bp.dir, laneDir(lane))) + if err != nil { + return fmt.Errorf("create lane WAL for %s: %w", lane, err) + } + bp.lanes[lane] = lw + } + data := types.SignedMsgConv[*types.LaneProposal]().Marshal(proposal) + if err := lw.wal.Write(data); err != nil { + return fmt.Errorf("persist block %s/%d: %w", lane, h.BlockNumber(), err) } - path := filepath.Join(bp.dir, blockFilename(h.Lane(), h.BlockNumber())) - return writeAndSync(path, data) + if lw.firstWALIdx == 0 { + lw.firstWALIdx = lw.nextWALIdx + lw.firstBlockNum = h.BlockNumber() + } + lw.nextWALIdx++ + lw.nextBlockNum = h.BlockNumber() + 1 + return nil } -// DeleteBefore removes persisted block files that are no longer needed. -// For lanes in laneFirsts, deletes files with block number below the map value. -// For lanes NOT in laneFirsts (orphaned from a previous committee/epoch), -// deletes all files — old blocks are not reusable after a committee change. -// An empty/nil laneFirsts is a no-op (no committee info available to judge orphans). -// Returns an error if the directory cannot be read; individual file removal -// failures are logged but do not cause an error. 
+// DeleteBefore removes persisted blocks per lane by truncating each lane's +// WAL independently. For each lane in the map, blocks below the given +// BlockNumber are removed. Lanes not in the map are left untouched. func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNumber) error { - if bp.noop || len(laneFirsts) == 0 { + if bp.noop { return nil } - entries, err := os.ReadDir(bp.dir) - if err != nil { - return fmt.Errorf("list blocks dir for cleanup: %w", err) - } - for _, entry := range entries { - if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".pb") { - continue - } - lane, fileN, err := parseBlockFilename(entry.Name()) - if err != nil { - continue - } - first, ok := laneFirsts[lane] - if ok && fileN >= first { + for lane, first := range laneFirsts { + lw, ok := bp.lanes[lane] + if !ok || first <= lw.firstBlockNum { continue } - path := filepath.Join(bp.dir, entry.Name()) - if err := os.Remove(path); err != nil && !os.IsNotExist(err) { - logger.Warn("failed to delete block file", "path", path, "err", err) + walIdx := lw.firstWALIdx + uint64(first-lw.firstBlockNum) + if err := lw.wal.TruncateBefore(walIdx); err != nil { + return fmt.Errorf("truncate lane %s WAL before block %d: %w", lane, first, err) } + lw.firstWALIdx = walIdx + lw.firstBlockNum = first } return nil } -// loadAll loads all persisted blocks from the blocks/ directory. -// Returns sorted slices per lane. Corrupt files are skipped; the caller -// (newInner) returns an error on gaps or parent-hash mismatches. -func (bp *BlockPersister) loadAll() (map[types.LaneID][]LoadedBlock, error) { - entries, err := os.ReadDir(bp.dir) +// Close shuts down all per-lane WALs. 
+func (bp *BlockPersister) Close() error { + if bp.noop { + return nil + } + var firstErr error + for _, lw := range bp.lanes { + if err := lw.wal.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +// loadAll replays a lane WAL and returns the loaded blocks plus the lane ID +// (extracted from the first entry). Returns nil lane if the WAL is empty. +func (lw *laneWAL) loadAll() ([]LoadedBlock, *types.LaneID, error) { + first, err := lw.wal.FirstOffset() + if err != nil { + return nil, nil, fmt.Errorf("first offset: %w", err) + } + last, err := lw.wal.LastOffset() if err != nil { - return nil, fmt.Errorf("read blocks dir %s: %w", bp.dir, err) + return nil, nil, fmt.Errorf("last offset: %w", err) + } + if first == 0 && last == 0 { + return nil, nil, nil } - raw := map[types.LaneID]map[types.BlockNumber]*types.Signed[*types.LaneProposal]{} - for _, entry := range entries { - if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".pb") { - continue - } - lane, n, err := parseBlockFilename(entry.Name()) - if err != nil { - logger.Warn("skipping unrecognized block file", "file", entry.Name(), "err", err) - continue - } - proposal, err := loadBlockFile(filepath.Join(bp.dir, entry.Name())) + lw.firstWALIdx = first + lw.nextWALIdx = last + 1 + + var loaded []LoadedBlock + var lane *types.LaneID + err = lw.wal.Replay(first, last, func(index uint64, data []byte) error { + conv := types.SignedMsgConv[*types.LaneProposal]() + proposal, err := conv.Unmarshal(data) if err != nil { - logger.Warn("skipping corrupt block file", "file", entry.Name(), "err", err) - continue + return fmt.Errorf("unmarshal block at WAL index %d: %w", index, err) } h := proposal.Msg().Block().Header() - if h.Lane() != lane || h.BlockNumber() != n { - logger.Warn("skipping block file with mismatched header", - "file", entry.Name(), - "headerLane", h.Lane(), - slog.Uint64("headerNum", uint64(h.BlockNumber())), - "filenameLane", lane, - slog.Uint64("filenameNum", 
uint64(n)), - ) - continue - } - if raw[lane] == nil { - raw[lane] = map[types.BlockNumber]*types.Signed[*types.LaneProposal]{} + if lane == nil { + l := h.Lane() + lane = &l + lw.firstBlockNum = h.BlockNumber() } - raw[lane][n] = proposal - logger.Info("loaded persisted block", "lane", lane.String(), slog.Uint64("block", uint64(n))) - } - - result := map[types.LaneID][]LoadedBlock{} - for lane, bs := range raw { - sorted := slices.Sorted(maps.Keys(bs)) - blocks := make([]LoadedBlock, 0, len(sorted)) - for _, n := range sorted { - blocks = append(blocks, LoadedBlock{Number: n, Proposal: bs[n]}) - } - result[lane] = blocks - } - return result, nil -} - -func loadBlockFile(path string) (*types.Signed[*types.LaneProposal], error) { - data, err := os.ReadFile(path) //nolint:gosec // path is constructed from operator-configured stateDir + hardcoded filename; not user-controlled + lw.nextBlockNum = h.BlockNumber() + 1 + loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) + logger.Info("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) + return nil + }) if err != nil { - return nil, err + return nil, nil, err } - conv := types.SignedMsgConv[*types.LaneProposal]() - return conv.Unmarshal(data) + slices.SortFunc(loaded, func(a, b LoadedBlock) int { + return cmp.Compare(a.Number, b.Number) + }) + return loaded, lane, nil } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index e248ca2bdf..e2d45ac56d 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -1,7 +1,6 @@ package persist import ( - "encoding/hex" "os" "path/filepath" "testing" @@ -23,10 +22,10 @@ func TestNewBlockPersisterEmptyDir(t *testing.T) { require.NoError(t, err) require.NotNil(t, bp) require.Equal(t, 0, len(blocks)) - // blocks/ 
subdirectory should exist fi, err := os.Stat(filepath.Join(dir, "blocks")) require.NoError(t, err) require.True(t, fi.IsDir()) + require.NoError(t, bp.Close()) } func TestPersistBlockAndLoad(t *testing.T) { @@ -42,6 +41,7 @@ func TestPersistBlockAndLoad(t *testing.T) { b1 := testSignedProposal(rng, key, 1) require.NoError(t, bp.PersistBlock(b0)) require.NoError(t, bp.PersistBlock(b1)) + require.NoError(t, bp.Close()) bp2, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -52,6 +52,7 @@ func TestPersistBlockAndLoad(t *testing.T) { require.Equal(t, types.BlockNumber(1), blocks[lane][1].Number) require.NoError(t, utils.TestDiff(b0, blocks[lane][0].Proposal)) require.NoError(t, utils.TestDiff(b1, blocks[lane][1].Proposal)) + require.NoError(t, bp2.Close()) } func TestPersistBlockMultipleLanes(t *testing.T) { @@ -69,6 +70,7 @@ func TestPersistBlockMultipleLanes(t *testing.T) { b2 := testSignedProposal(rng, key2, 0) require.NoError(t, bp.PersistBlock(b1)) require.NoError(t, bp.PersistBlock(b2)) + require.NoError(t, bp.Close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -79,55 +81,7 @@ func TestPersistBlockMultipleLanes(t *testing.T) { require.NoError(t, utils.TestDiff(b2, blocks[lane2][0].Proposal)) } -func TestLoadSkipsCorruptBlockFile(t *testing.T) { - rng := utils.TestRng() - dir := t.TempDir() - - key := types.GenSecretKey(rng) - lane := key.Public() - bp, _, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - - // Write a good block - b0 := testSignedProposal(rng, key, 0) - require.NoError(t, bp.PersistBlock(b0)) - - // Write a corrupt file with a valid filename - corruptName := blockFilename(lane, 1) - require.NoError(t, os.WriteFile(filepath.Join(dir, "blocks", corruptName), []byte("corrupt"), 0600)) - - _, blocks, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 1, len(blocks[lane]), "should only load the valid block") - require.NoError(t, 
utils.TestDiff(b0, blocks[lane][0].Proposal)) -} - -func TestLoadCorruptMidSequenceCreatesGap(t *testing.T) { - rng := utils.TestRng() - dir := t.TempDir() - - key := types.GenSecretKey(rng) - lane := key.Public() - bp, _, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - - // Persist blocks 0, 2 (valid) and corrupt block 1. - // After skipping corrupt-1, raw has {0, 2} → returned with gap. - b0 := testSignedProposal(rng, key, 0) - b2 := testSignedProposal(rng, key, 2) - require.NoError(t, bp.PersistBlock(b0)) - require.NoError(t, bp.PersistBlock(b2)) - corruptName := blockFilename(lane, 1) - require.NoError(t, os.WriteFile(filepath.Join(dir, "blocks", corruptName), []byte("corrupt"), 0600)) - - _, blocks, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 2, len(blocks[lane]), "corrupt skipped; both valid blocks returned") - require.Equal(t, types.BlockNumber(0), blocks[lane][0].Number) - require.Equal(t, types.BlockNumber(2), blocks[lane][1].Number) -} - -func TestLoadReturnsAllWithGap(t *testing.T) { +func TestDeleteBeforeRemovesOldKeepsNew(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() @@ -136,21 +90,21 @@ func TestLoadReturnsAllWithGap(t *testing.T) { bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - // Persist blocks 3, 4, 6, 7 (gap at 5). All four returned sorted. 
- for _, n := range []types.BlockNumber{3, 4, 6, 7} { - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, n))) + for i := types.BlockNumber(0); i < 5; i++ { + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, i))) } + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 3})) + require.NoError(t, bp.Close()) + _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 4, len(blocks[lane]), "should return all valid files including after gap") + require.Equal(t, 2, len(blocks[lane]), "should have blocks 3 and 4") require.Equal(t, types.BlockNumber(3), blocks[lane][0].Number) require.Equal(t, types.BlockNumber(4), blocks[lane][1].Number) - require.Equal(t, types.BlockNumber(6), blocks[lane][2].Number) - require.Equal(t, types.BlockNumber(7), blocks[lane][3].Number) } -func TestLoadSkipsMismatchedHeader(t *testing.T) { +func TestDeleteBeforeMultipleLanes(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() @@ -161,40 +115,57 @@ func TestLoadSkipsMismatchedHeader(t *testing.T) { bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - // Write block for lane1 but save it under lane2's filename - b := testSignedProposal(rng, key1, 5) - require.NoError(t, bp.PersistBlock(b)) + for i := types.BlockNumber(0); i < 3; i++ { + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, i))) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) + } - // Rename the file to use lane2 in the filename - oldPath := filepath.Join(dir, "blocks", blockFilename(lane1, 5)) - newPath := filepath.Join(dir, "blocks", blockFilename(lane2, 5)) - require.NoError(t, os.Rename(oldPath, newPath)) + // Delete lane1 < 2, lane2 < 1 — independent per-lane truncation. 
+ require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 2, lane2: 1})) + require.NoError(t, bp.Close()) - // Reload — should skip the mismatched file _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 0, len(blocks), "mismatched header should be skipped") + require.Equal(t, 1, len(blocks[lane1]), "lane1 should have block 2") + require.Equal(t, types.BlockNumber(2), blocks[lane1][0].Number) + require.Equal(t, 2, len(blocks[lane2]), "lane2 should have blocks 1,2") + require.Equal(t, types.BlockNumber(1), blocks[lane2][0].Number) + require.Equal(t, types.BlockNumber(2), blocks[lane2][1].Number) } -func TestLoadSkipsUnrecognizedFilename(t *testing.T) { +func TestDeleteBeforeEmptyMap(t *testing.T) { + rng := utils.TestRng() dir := t.TempDir() + key := types.GenSecretKey(rng) + lane := key.Public() bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - _ = bp - // Write files with bad names - blocksDir := filepath.Join(dir, "blocks") - require.NoError(t, os.WriteFile(filepath.Join(blocksDir, "notablock.pb"), []byte("data"), 0600)) - require.NoError(t, os.WriteFile(filepath.Join(blocksDir, "readme.txt"), []byte("hi"), 0600)) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) + + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) + require.NoError(t, bp.Close()) - // Reload — should skip both _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) + require.Equal(t, 1, len(blocks[lane])) +} + +func TestNoOpBlockPersister(t *testing.T) { + bp, blocks, err := NewBlockPersister(utils.None[string]()) + require.NoError(t, err) + require.NotNil(t, bp) require.Equal(t, 0, len(blocks)) + + rng := utils.TestRng() + key := types.GenSecretKey(rng) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) + require.NoError(t, bp.Close()) } -func 
TestDeleteBeforeRemovesOldKeepsNew(t *testing.T) { +func TestDeleteBeforeThenPersistMore(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() @@ -203,22 +174,23 @@ func TestDeleteBeforeRemovesOldKeepsNew(t *testing.T) { bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - // Persist blocks 0..4 + // Persist 0..4, delete before 3, then persist 5. for i := types.BlockNumber(0); i < 5; i++ { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, i))) } - - // Delete blocks before 3 require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 3})) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 5))) + require.NoError(t, bp.Close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 2, len(blocks[lane]), "should have blocks 3 and 4") + require.Equal(t, 3, len(blocks[lane]), "should have blocks 3, 4, 5") require.Equal(t, types.BlockNumber(3), blocks[lane][0].Number) require.Equal(t, types.BlockNumber(4), blocks[lane][1].Number) + require.Equal(t, types.BlockNumber(5), blocks[lane][2].Number) } -func TestDeleteBeforeMultipleLanes(t *testing.T) { +func TestPerLaneIndependentTruncation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() @@ -229,83 +201,41 @@ func TestDeleteBeforeMultipleLanes(t *testing.T) { bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - // Lane1: blocks 0,1,2; Lane2: blocks 0,1,2 - for i := types.BlockNumber(0); i < 3; i++ { + // Lane1: blocks 0..9, Lane2: blocks 0..2 + for i := types.BlockNumber(0); i < 10; i++ { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, i))) + } + for i := types.BlockNumber(0); i < 3; i++ { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) } - // Delete lane1 < 2, lane2 < 1 - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 2, lane2: 1})) - - _, blocks, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - 
require.Equal(t, 1, len(blocks[lane1]), "lane1 should have block 2") - require.Equal(t, types.BlockNumber(2), blocks[lane1][0].Number) - require.Equal(t, 2, len(blocks[lane2]), "lane2 should have blocks 1,2") - require.Equal(t, types.BlockNumber(1), blocks[lane2][0].Number) - require.Equal(t, types.BlockNumber(2), blocks[lane2][1].Number) -} - -func TestDeleteBeforeEmptyMap(t *testing.T) { - rng := utils.TestRng() - dir := t.TempDir() - - key := types.GenSecretKey(rng) - lane := key.Public() - bp, _, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - - // Empty map — should not delete anything - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) + // Truncate lane1 aggressively, leave lane2 untouched. + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 8})) + require.NoError(t, bp.Close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 1, len(blocks[lane])) + require.Equal(t, 2, len(blocks[lane1]), "lane1: blocks 8,9") + require.Equal(t, types.BlockNumber(8), blocks[lane1][0].Number) + require.Equal(t, 3, len(blocks[lane2]), "lane2: all 3 blocks intact") } -func TestDeleteBeforeRemovesOrphanedLanes(t *testing.T) { +func TestLazyLaneCreation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() - key1 := types.GenSecretKey(rng) - lane1 := key1.Public() - key2 := types.GenSecretKey(rng) - lane2 := key2.Public() - bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - // Persist blocks on both lanes. - for n := types.BlockNumber(0); n < 3; n++ { - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, n))) - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, n))) - } - - // Only lane1 is in the current committee; lane2 is orphaned. 
- require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 1})) - - _, blocks, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - - // lane1: block 0 deleted, blocks 1-2 kept. - require.Equal(t, 2, len(blocks[lane1])) - require.Equal(t, types.BlockNumber(1), blocks[lane1][0].Number) - - // lane2: all blocks deleted (orphaned lane). - require.Equal(t, 0, len(blocks[lane2])) -} + // No lanes exist yet. + entries, _ := os.ReadDir(filepath.Join(dir, "blocks")) + require.Equal(t, 0, len(entries)) -func TestBlockFilenameRoundTrip(t *testing.T) { - rng := utils.TestRng() - lane := types.GenSecretKey(rng).Public() - n := types.BlockNumber(42) + // First persist for a lane creates its directory and WAL. + key := types.GenSecretKey(rng) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - name := blockFilename(lane, n) - parsedLane, parsedN, err := parseBlockFilename(name) - require.NoError(t, err) - require.Equal(t, hex.EncodeToString(lane.Bytes()), hex.EncodeToString(parsedLane.Bytes())) - require.Equal(t, n, parsedN) + entries, _ = os.ReadDir(filepath.Join(dir, "blocks")) + require.Equal(t, 1, len(entries), "should have 1 lane directory") + require.NoError(t, bp.Close()) } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 4ade5e712c..d9812fb5ba 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -1,22 +1,12 @@ -// TODO: CommitQC file persistence is a temporary solution that will be replaced -// by the same WAL (Write-Ahead Log) library as block persistence (see blocks.go). -// With a WAL, atomic appends eliminate corrupt file handling, per-file -// naming/parsing, directory scanning, and DeleteBefore cleanup -// (WAL replay is always contiguous). 
- package persist import ( + "context" "fmt" - "maps" "os" "path/filepath" - "slices" - "strconv" - "strings" - - "log/slog" + dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" ) @@ -27,28 +17,34 @@ type LoadedCommitQC struct { QC *types.CommitQC } -// CommitQCPersister manages individual CommitQC files in a commitqcs/ subdirectory. -// Each CommitQC is stored as .pb. -// The caller is responsible for driving persistence (typically a goroutine that -// watches in-memory state and calls PersistCommitQC / DeleteBefore). +// CommitQCPersister manages CommitQC persistence using a WAL. +// Entries are appended in order; each entry is self-describing (the serialized +// CommitQC contains its RoadIndex). The WAL index is append order, not +// RoadIndex — we track the WAL index of the first entry to enable truncation. // When noop is true, all disk I/O is skipped but cursor tracking still works. type CommitQCPersister struct { - dir string // full path to the commitqcs/ subdirectory; empty when noop + wal *dbwal.WAL[[]byte] noop bool next types.RoadIndex + + // WAL index of the first (oldest) entry still in the WAL. + // Used to compute the truncation point for TruncateBefore. + // Zero when the WAL is empty. + firstWALIdx uint64 + // nextWALIdx is the WAL index that the next Write will be assigned. + // Tracked locally: initialized from LastOffset()+1, incremented after + // each successful Write. Safe because we are the sole synchronous writer. + nextWALIdx uint64 } -// newNoOpCommitQCPersister returns a CommitQCPersister that skips all disk I/O -// but still tracks the next index. Used when persistence is disabled. func newNoOpCommitQCPersister() *CommitQCPersister { return &CommitQCPersister{noop: true} } -// NewCommitQCPersister creates the commitqcs/ subdirectory if it doesn't exist -// and returns a persister. 
Loads all persisted CommitQCs from disk as a sorted -// slice. Corrupt files are skipped; the caller (newInner) returns an error if -// the resulting slice is non-contiguous. -// When stateDir is None, returns a no-op persister that skips all disk I/O. +// NewCommitQCPersister opens (or creates) a WAL in the commitqcs/ subdirectory +// and replays all persisted entries. Returns the persister and a sorted slice of +// loaded CommitQCs. Corrupt tail entries are auto-truncated by the WAL library. +// When stateDir is None, returns a no-op persister. func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, []LoadedCommitQC, error) { sd, ok := stateDir.Get() if !ok { @@ -59,9 +55,26 @@ func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, [] return nil, nil, fmt.Errorf("create commitqcs dir %s: %w", dir, err) } - cp := &CommitQCPersister{dir: dir} + w, err := dbwal.NewWAL[[]byte]( + context.Background(), + func(data []byte) ([]byte, error) { return data, nil }, + func(data []byte) ([]byte, error) { return data, nil }, + logger, + dir, + dbwal.Config{ + WriteBufferSize: 0, // synchronous writes + WriteBatchSize: 1, // no batching + FsyncEnabled: true, + }, + ) + if err != nil { + return nil, nil, fmt.Errorf("open commitqc WAL in %s: %w", dir, err) + } + + cp := &CommitQCPersister{wal: w, nextWALIdx: 1} loaded, err := cp.loadAll() if err != nil { + _ = w.Close() return nil, nil, err } if len(loaded) > 0 { @@ -78,26 +91,12 @@ func (cp *CommitQCPersister) LoadNext() types.RoadIndex { // ResetNext overrides the next-to-persist cursor. Called after newInner // applies prune(), which may advance commitQCs.next beyond the raw loader's -// cursor. Without this, PersistCommitQC would reject valid new QCs as -// "already persisted". +// cursor. 
func (cp *CommitQCPersister) ResetNext(idx types.RoadIndex) { cp.next = idx } -func commitQCFilename(idx types.RoadIndex) string { - return strconv.FormatUint(uint64(idx), 10) + ".pb" -} - -func parseCommitQCFilename(name string) (types.RoadIndex, error) { - name = strings.TrimSuffix(name, ".pb") - n, err := strconv.ParseUint(name, 10, 64) - if err != nil { - return 0, fmt.Errorf("bad commitqc filename %q: %w", name, err) - } - return types.RoadIndex(n), nil -} - -// PersistCommitQC writes a CommitQC to its own file. +// PersistCommitQC writes a CommitQC to the WAL. // The caller must persist CommitQCs in order; idx < cp.next is a bug. func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { idx := qc.Index() @@ -106,97 +105,75 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { } if !cp.noop { data := types.CommitQCConv.Marshal(qc) - path := filepath.Join(cp.dir, commitQCFilename(idx)) - if err := writeAndSync(path, data); err != nil { + if err := cp.wal.Write(data); err != nil { return fmt.Errorf("persist commitqc %d: %w", idx, err) } + if cp.firstWALIdx == 0 { + cp.firstWALIdx = cp.nextWALIdx + } + cp.nextWALIdx++ } cp.next = idx + 1 return nil } -// DeleteBefore removes persisted CommitQC files with road index below idx. -// Returns an error if the directory cannot be read; individual file removal -// failures are logged but do not cause an error. +// DeleteBefore removes persisted CommitQCs with road index below idx +// by truncating the front of the WAL. +// The mapping from RoadIndex to WAL index is linear: entries are written +// sequentially, so WAL index = firstWALIdx + (roadIndex - firstRoadIndex). func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { if cp.noop || idx == 0 { return nil } - entries, err := os.ReadDir(cp.dir) - if err != nil { - return fmt.Errorf("list commitqcs dir for cleanup: %w", err) + // Compute the WAL index corresponding to RoadIndex idx. 
+ // Entries are sequential: firstWALIdx corresponds to firstRoadIndex. + firstRoadIndex := cp.next - types.RoadIndex(cp.nextWALIdx-cp.firstWALIdx) + if idx <= firstRoadIndex { + return nil } - for _, entry := range entries { - if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".pb") { - continue - } - fileIdx, err := parseCommitQCFilename(entry.Name()) - if err != nil { - continue - } - if fileIdx >= idx { - continue - } - path := filepath.Join(cp.dir, entry.Name()) - if err := os.Remove(path); err != nil && !os.IsNotExist(err) { - logger.Warn("failed to delete commitqc file", "path", path, "err", err) - } + walIdx := cp.firstWALIdx + uint64(idx-firstRoadIndex) + if err := cp.wal.TruncateBefore(walIdx); err != nil { + return fmt.Errorf("truncate commitqc WAL before %d: %w", walIdx, err) } + cp.firstWALIdx = walIdx return nil } -// loadAll loads all persisted CommitQCs from the commitqcs/ directory. -// Returns a sorted slice of all valid files. Corrupt or mismatched files -// are skipped; the caller (newInner) returns an error on gaps. +// Close shuts down the WAL. 
+func (cp *CommitQCPersister) Close() error { + if cp.noop { + return nil + } + return cp.wal.Close() +} + func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { - entries, err := os.ReadDir(cp.dir) + first, err := cp.wal.FirstOffset() if err != nil { - return nil, fmt.Errorf("read commitqcs dir %s: %w", cp.dir, err) + return nil, fmt.Errorf("commitqc WAL first offset: %w", err) } - - raw := map[types.RoadIndex]*types.CommitQC{} - for _, entry := range entries { - if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".pb") { - continue - } - idx, err := parseCommitQCFilename(entry.Name()) - if err != nil { - logger.Warn("skipping unrecognized commitqc file", "file", entry.Name(), "err", err) - continue - } - qc, err := loadCommitQCFile(filepath.Join(cp.dir, entry.Name())) - if err != nil { - logger.Warn("skipping corrupt commitqc file", "file", entry.Name(), "err", err) - continue - } - if qc.Index() != idx { - logger.Warn("skipping commitqc file with mismatched index", - "file", entry.Name(), - slog.Uint64("headerIdx", uint64(qc.Index())), - slog.Uint64("filenameIdx", uint64(idx)), - ) - continue - } - raw[idx] = qc - logger.Info("loaded persisted commitqc", slog.Uint64("roadIndex", uint64(idx))) + last, err := cp.wal.LastOffset() + if err != nil { + return nil, fmt.Errorf("commitqc WAL last offset: %w", err) } - - if len(raw) == 0 { + if first == 0 && last == 0 { return nil, nil } - sorted := slices.Sorted(maps.Keys(raw)) - result := make([]LoadedCommitQC, 0, len(sorted)) - for _, idx := range sorted { - result = append(result, LoadedCommitQC{Index: idx, QC: raw[idx]}) - } - return result, nil -} + cp.firstWALIdx = first + cp.nextWALIdx = last + 1 -func loadCommitQCFile(path string) (*types.CommitQC, error) { - data, err := os.ReadFile(path) //nolint:gosec // path is constructed from operator-configured stateDir + hardcoded filename; not user-controlled + var loaded []LoadedCommitQC + err = cp.wal.Replay(first, last, func(index uint64, data []byte) 
error { + qc, err := types.CommitQCConv.Unmarshal(data) + if err != nil { + return fmt.Errorf("unmarshal commitqc at WAL index %d: %w", index, err) + } + loaded = append(loaded, LoadedCommitQC{Index: qc.Index(), QC: qc}) + return nil + }) if err != nil { return nil, err } - return types.CommitQCConv.Unmarshal(data) + return loaded, nil } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index dccaa341fc..e1eeaf7ddb 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -71,6 +71,7 @@ func TestNewCommitQCPersisterEmptyDir(t *testing.T) { fi, err := os.Stat(filepath.Join(dir, "commitqcs")) require.NoError(t, err) require.True(t, fi.IsDir()) + require.NoError(t, cp.Close()) } func TestPersistCommitQCAndLoad(t *testing.T) { @@ -87,6 +88,7 @@ func TestPersistCommitQCAndLoad(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qc)) } require.Equal(t, types.RoadIndex(3), cp.LoadNext()) + require.NoError(t, cp.Close()) cp2, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) @@ -97,52 +99,59 @@ func TestPersistCommitQCAndLoad(t *testing.T) { require.NoError(t, utils.TestDiff(qcs[i], lqc.QC)) } require.Equal(t, types.RoadIndex(3), cp2.LoadNext()) + require.NoError(t, cp2.Close()) } -func TestLoadSkipsCorruptCommitQCFile(t *testing.T) { +func TestCommitQCDeleteBeforeRemovesOldKeepsNew(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) dir := t.TempDir() - qcs := makeSequentialCommitQCs(rng, committee, keys, 1) + qcs := makeSequentialCommitQCs(rng, committee, keys, 5) cp, _, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - require.NoError(t, cp.PersistCommitQC(qcs[0])) + for _, qc := range qcs { + require.NoError(t, cp.PersistCommitQC(qc)) + } - // Write a corrupt file for index 1 - 
corruptPath := filepath.Join(dir, "commitqcs", commitQCFilename(1)) - require.NoError(t, os.WriteFile(corruptPath, []byte("corrupt"), 0600)) + require.NoError(t, cp.DeleteBefore(3)) + require.NoError(t, cp.Close()) _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 1, len(loaded), "should only load valid commitqc") - require.NoError(t, utils.TestDiff(qcs[0], loaded[0].QC)) + require.Equal(t, 2, len(loaded), "should have indices 3 and 4") + require.Equal(t, types.RoadIndex(3), loaded[0].Index) + require.Equal(t, types.RoadIndex(4), loaded[1].Index) } -func TestLoadCommitQCReturnsAllWithGap(t *testing.T) { +func TestCommitQCDeleteBeforeZero(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) dir := t.TempDir() - qcs := makeSequentialCommitQCs(rng, committee, keys, 4) + qcs := makeSequentialCommitQCs(rng, committee, keys, 2) cp, _, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) + for _, qc := range qcs { + require.NoError(t, cp.PersistCommitQC(qc)) + } - // Persist 0, 1, skip 2, persist 3 → gap at 2, all three returned sorted. 
- require.NoError(t, cp.PersistCommitQC(qcs[0])) - require.NoError(t, cp.PersistCommitQC(qcs[1])) - require.NoError(t, cp.PersistCommitQC(qcs[3])) + require.NoError(t, cp.DeleteBefore(0)) + require.NoError(t, cp.Close()) - cp2, loaded, err := NewCommitQCPersister(utils.Some(dir)) + _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 3, len(loaded), "should return all valid files including after gap") - require.Equal(t, types.RoadIndex(0), loaded[0].Index) - require.Equal(t, types.RoadIndex(1), loaded[1].Index) - require.Equal(t, types.RoadIndex(3), loaded[2].Index) - require.Equal(t, types.RoadIndex(4), cp2.LoadNext(), "next should be max index + 1") + require.Equal(t, 2, len(loaded)) } -func TestLoadCommitQCCorruptMidSequenceCreatesGap(t *testing.T) { +func TestCommitQCResetNext(t *testing.T) { + cp := newNoOpCommitQCPersister() + require.Equal(t, types.RoadIndex(0), cp.LoadNext()) + cp.ResetNext(5) + require.Equal(t, types.RoadIndex(5), cp.LoadNext()) +} + +func TestCommitQCPersistInOrder(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) dir := t.TempDir() @@ -151,99 +160,50 @@ func TestLoadCommitQCCorruptMidSequenceCreatesGap(t *testing.T) { cp, _, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - // Persist 0, corrupt 1, persist 2 → corrupt skipped, returns [0, 2] with gap. require.NoError(t, cp.PersistCommitQC(qcs[0])) - require.NoError(t, cp.PersistCommitQC(qcs[2])) - corruptPath := filepath.Join(dir, "commitqcs", commitQCFilename(1)) - require.NoError(t, os.WriteFile(corruptPath, []byte("corrupt"), 0600)) + require.NoError(t, cp.PersistCommitQC(qcs[1])) + // Persisting qcs[0] again should fail (idx < next). 
+ err = cp.PersistCommitQC(qcs[0]) + require.Error(t, err) + require.NoError(t, cp.Close()) +} - _, loaded, err := NewCommitQCPersister(utils.Some(dir)) +func TestNoOpCommitQCPersister(t *testing.T) { + cp, loaded, err := NewCommitQCPersister(utils.None[string]()) require.NoError(t, err) - require.Equal(t, 2, len(loaded), "corrupt skipped; both valid files returned") - require.Equal(t, types.RoadIndex(0), loaded[0].Index) - require.Equal(t, types.RoadIndex(2), loaded[1].Index) -} + require.NotNil(t, cp) + require.Equal(t, 0, len(loaded)) -func TestLoadCommitQCSkipsMismatchedIndex(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) - dir := t.TempDir() - - qcs := makeSequentialCommitQCs(rng, committee, keys, 2) - cp, _, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - - // Persist qc[0] (index 0) but save it under filename for index 5 + qcs := makeSequentialCommitQCs(rng, committee, keys, 1) require.NoError(t, cp.PersistCommitQC(qcs[0])) - oldPath := filepath.Join(dir, "commitqcs", commitQCFilename(0)) - newPath := filepath.Join(dir, "commitqcs", commitQCFilename(5)) - require.NoError(t, os.Rename(oldPath, newPath)) - - _, loaded, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 0, len(loaded), "mismatched index should be skipped") -} - -func TestLoadCommitQCSkipsUnrecognizedFilename(t *testing.T) { - dir := t.TempDir() - cp, _, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - _ = cp - - qcDir := filepath.Join(dir, "commitqcs") - require.NoError(t, os.WriteFile(filepath.Join(qcDir, "notaqc.pb"), []byte("data"), 0600)) - require.NoError(t, os.WriteFile(filepath.Join(qcDir, "readme.txt"), []byte("hi"), 0600)) - - _, loaded, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 0, len(loaded)) + require.Equal(t, types.RoadIndex(1), cp.LoadNext()) + require.NoError(t, cp.DeleteBefore(0)) + require.NoError(t, 
cp.Close()) } -func TestCommitQCDeleteBeforeRemovesOldKeepsNew(t *testing.T) { +func TestCommitQCDeleteBeforeThenPersistMore(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) dir := t.TempDir() - qcs := makeSequentialCommitQCs(rng, committee, keys, 5) + qcs := makeSequentialCommitQCs(rng, committee, keys, 6) cp, _, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - for _, qc := range qcs { - require.NoError(t, cp.PersistCommitQC(qc)) - } + // Persist 0..4, delete before 3, then persist 5. + for i := 0; i < 5; i++ { + require.NoError(t, cp.PersistCommitQC(qcs[i])) + } require.NoError(t, cp.DeleteBefore(3)) + require.NoError(t, cp.PersistCommitQC(qcs[5])) + require.NoError(t, cp.Close()) _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 2, len(loaded), "should have indices 3 and 4") + require.Equal(t, 3, len(loaded), "should have indices 3, 4, 5") require.Equal(t, types.RoadIndex(3), loaded[0].Index) require.Equal(t, types.RoadIndex(4), loaded[1].Index) -} - -func TestCommitQCDeleteBeforeZero(t *testing.T) { - rng := utils.TestRng() - committee, keys := types.GenCommittee(rng, 4) - dir := t.TempDir() - - qcs := makeSequentialCommitQCs(rng, committee, keys, 2) - cp, _, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - for _, qc := range qcs { - require.NoError(t, cp.PersistCommitQC(qc)) - } - - // idx=0 should be a no-op - require.NoError(t, cp.DeleteBefore(0)) - - _, loaded, err := NewCommitQCPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 2, len(loaded)) -} - -func TestCommitQCFilenameRoundTrip(t *testing.T) { - idx := types.RoadIndex(42) - name := commitQCFilename(idx) - parsed, err := parseCommitQCFilename(name) - require.NoError(t, err) - require.Equal(t, idx, parsed) + require.Equal(t, types.RoadIndex(5), loaded[2].Index) } From 9bea6b021873d5e989feb528db45b1ec6f81d798 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 
9 Mar 2026 15:23:57 -0700 Subject: [PATCH 02/31] persist: refactor block/commitqc WAL into generic indexedWAL Extract common WAL mechanics (index tracking, typed write/replay, truncation) into a generic indexedWAL[T] backed by sei-db/wal, replacing the duplicated raw-bytes WAL setup in both blocks.go and commitqcs.go. Key changes: - Add indexedWAL[T] with codec[T] interface for typed serialization - laneWAL embeds indexedWAL; firstBlockNum() returns Option for safety - DeleteBefore now removes stale lane WALs (validators no longer in committee) and their directories - Add empty-WAL guard to CommitQCPersister.DeleteBefore - Add direct unit tests for indexedWAL (wal_test.go) - Add TODO for dynamic committee membership support Made-with: Cursor --- .../internal/autobahn/avail/inner.go | 6 + .../internal/autobahn/avail/state.go | 4 + .../autobahn/consensus/persist/blocks.go | 101 ++++----- .../autobahn/consensus/persist/blocks_test.go | 39 +++- .../autobahn/consensus/persist/commitqcs.go | 80 ++----- .../autobahn/consensus/persist/wal.go | 109 ++++++++++ .../autobahn/consensus/persist/wal_test.go | 203 ++++++++++++++++++ 7 files changed, 405 insertions(+), 137 deletions(-) create mode 100644 sei-tendermint/internal/autobahn/consensus/persist/wal.go create mode 100644 sei-tendermint/internal/autobahn/consensus/persist/wal_test.go diff --git a/sei-tendermint/internal/autobahn/avail/inner.go b/sei-tendermint/internal/autobahn/avail/inner.go index 167c02e486..cfb9cebfee 100644 --- a/sei-tendermint/internal/autobahn/avail/inner.go +++ b/sei-tendermint/internal/autobahn/avail/inner.go @@ -9,6 +9,12 @@ import ( "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" ) +// TODO: when dynamic committee changes are supported, newly joined members +// must be added to blocks, votes, nextBlockToPersist, and persistedBlockStart. +// Currently all four are initialized once in newInner from c.Lanes().All(). 
+// BlockPersister already handles lazy lane WAL creation, but DeleteBefore +// removes lanes not in laneFirsts, so the new member must also appear in +// inner.blocks before the next persist cycle. type inner struct { latestAppQC utils.Option[*types.AppQC] latestCommitQC utils.AtomicSend[utils.Option[*types.CommitQC]] diff --git a/sei-tendermint/internal/autobahn/avail/state.go b/sei-tendermint/internal/autobahn/avail/state.go index 6d1b3cba7d..139503a7c8 100644 --- a/sei-tendermint/internal/autobahn/avail/state.go +++ b/sei-tendermint/internal/autobahn/avail/state.go @@ -160,6 +160,8 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin // Truncate WAL entries below the prune anchor that were filtered out by // loadPersistedState. Also reset the CommitQC persister's cursor to // match the post-prune range. + // Must include all current committee members: DeleteBefore removes + // lane WALs not present in the map. laneFirsts := make(map[types.LaneID]types.BlockNumber, len(inner.blocks)) for lane, q := range inner.blocks { laneFirsts[lane] = q.first @@ -759,6 +761,8 @@ func (s *State) collectPersistBatch(ctx context.Context, lastPersistedAppQCNext }); err != nil { return b, err } + // Must include all current committee members: DeleteBefore removes + // lane WALs not present in the map. 
b.laneFirsts = make(map[types.LaneID]types.BlockNumber, len(inner.blocks)) for lane, q := range inner.blocks { start := max(inner.nextBlockToPersist[lane], q.first) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index bf83768b56..168c30ddee 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -2,7 +2,6 @@ package persist import ( "cmp" - "context" "encoding/hex" "fmt" "os" @@ -11,7 +10,6 @@ import ( "log/slog" - dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" "github.com/sei-protocol/seilog" @@ -25,19 +23,21 @@ type LoadedBlock struct { Proposal *types.Signed[*types.LaneProposal] } -// laneWAL wraps a per-lane WAL with a linear index mapping between -// BlockNumber and WAL index. Blocks within a lane are sequential, so the -// mapping is: walIndex = firstWALIdx + (blockNumber - firstBlockNumber). +// laneWAL wraps an indexedWAL with a per-lane block number cursor. +// Block numbers within a lane are contiguous, so the first block number +// is derived: nextBlockNum - Count(). type laneWAL struct { - wal *dbwal.WAL[[]byte] - firstWALIdx uint64 - nextWALIdx uint64 - // firstBlockNum is the block number corresponding to firstWALIdx. - firstBlockNum types.BlockNumber - // nextBlockNum is the next block number to be persisted. + *indexedWAL[*types.Signed[*types.LaneProposal]] nextBlockNum types.BlockNumber } +func (lw *laneWAL) firstBlockNum() utils.Option[types.BlockNumber] { + if lw.Count() == 0 { + return utils.None[types.BlockNumber]() + } + return utils.Some(lw.nextBlockNum - types.BlockNumber(lw.Count())) +} + // BlockPersister manages block persistence using one WAL per lane. 
// Each lane gets its own WAL in a subdirectory named by hex-encoded lane ID, // so truncation is independent per lane. @@ -57,25 +57,11 @@ func laneDir(lane types.LaneID) string { } func newLaneWAL(dir string) (*laneWAL, error) { - if err := os.MkdirAll(dir, 0700); err != nil { - return nil, fmt.Errorf("create lane dir %s: %w", dir, err) - } - w, err := dbwal.NewWAL[[]byte]( - context.Background(), - func(data []byte) ([]byte, error) { return data, nil }, - func(data []byte) ([]byte, error) { return data, nil }, - logger, - dir, - dbwal.Config{ - WriteBufferSize: 0, // synchronous writes - WriteBatchSize: 1, // no batching - FsyncEnabled: true, - }, - ) + iw, err := openIndexedWAL(dir, types.SignedMsgConv[*types.LaneProposal]()) if err != nil { return nil, err } - return &laneWAL{wal: w, nextWALIdx: 1}, nil + return &laneWAL{indexedWAL: iw}, nil } // NewBlockPersister opens (or creates) per-lane WALs in subdirectories of @@ -113,12 +99,12 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type } loaded, lane, err := lw.loadAll() if err != nil { - _ = lw.wal.Close() + _ = lw.Close() bp.Close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) } if lane == nil { - _ = lw.wal.Close() + _ = lw.Close() continue } bp.lanes[*lane] = lw @@ -147,37 +133,43 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa } bp.lanes[lane] = lw } - data := types.SignedMsgConv[*types.LaneProposal]().Marshal(proposal) - if err := lw.wal.Write(data); err != nil { + if err := lw.Write(proposal); err != nil { return fmt.Errorf("persist block %s/%d: %w", lane, h.BlockNumber(), err) } - if lw.firstWALIdx == 0 { - lw.firstWALIdx = lw.nextWALIdx - lw.firstBlockNum = h.BlockNumber() - } - lw.nextWALIdx++ lw.nextBlockNum = h.BlockNumber() + 1 return nil } // DeleteBefore removes persisted blocks per lane by truncating each lane's // WAL independently. 
For each lane in the map, blocks below the given -// BlockNumber are removed. Lanes not in the map are left untouched. +// BlockNumber are removed. Lanes NOT in the map are considered stale +// (validator no longer in committee): their WALs are closed and directories +// removed. func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNumber) error { if bp.noop { return nil } for lane, first := range laneFirsts { lw, ok := bp.lanes[lane] - if !ok || first <= lw.firstBlockNum { + if !ok { + continue + } + firstBN, ok := lw.firstBlockNum().Get() + if !ok || first <= firstBN { continue } - walIdx := lw.firstWALIdx + uint64(first-lw.firstBlockNum) - if err := lw.wal.TruncateBefore(walIdx); err != nil { + walIdx := lw.firstIdx + uint64(first-firstBN) + if err := lw.TruncateBefore(walIdx); err != nil { return fmt.Errorf("truncate lane %s WAL before block %d: %w", lane, first, err) } - lw.firstWALIdx = walIdx - lw.firstBlockNum = first + } + for lane, lw := range bp.lanes { + if _, ok := laneFirsts[lane]; ok { + continue + } + _ = lw.Close() + os.RemoveAll(filepath.Join(bp.dir, laneDir(lane))) + delete(bp.lanes, lane) } return nil } @@ -189,7 +181,7 @@ func (bp *BlockPersister) Close() error { } var firstErr error for _, lw := range bp.lanes { - if err := lw.wal.Close(); err != nil && firstErr == nil { + if err := lw.Close(); err != nil && firstErr == nil { firstErr = err } } @@ -199,34 +191,13 @@ func (bp *BlockPersister) Close() error { // loadAll replays a lane WAL and returns the loaded blocks plus the lane ID // (extracted from the first entry). Returns nil lane if the WAL is empty. 
func (lw *laneWAL) loadAll() ([]LoadedBlock, *types.LaneID, error) { - first, err := lw.wal.FirstOffset() - if err != nil { - return nil, nil, fmt.Errorf("first offset: %w", err) - } - last, err := lw.wal.LastOffset() - if err != nil { - return nil, nil, fmt.Errorf("last offset: %w", err) - } - if first == 0 && last == 0 { - return nil, nil, nil - } - - lw.firstWALIdx = first - lw.nextWALIdx = last + 1 - var loaded []LoadedBlock var lane *types.LaneID - err = lw.wal.Replay(first, last, func(index uint64, data []byte) error { - conv := types.SignedMsgConv[*types.LaneProposal]() - proposal, err := conv.Unmarshal(data) - if err != nil { - return fmt.Errorf("unmarshal block at WAL index %d: %w", index, err) - } + err := lw.Replay(func(index uint64, proposal *types.Signed[*types.LaneProposal]) error { h := proposal.Msg().Block().Header() if lane == nil { l := h.Lane() lane = &l - lw.firstBlockNum = h.BlockNumber() } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index e2d45ac56d..5b79230ec5 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -133,23 +133,23 @@ func TestDeleteBeforeMultipleLanes(t *testing.T) { require.Equal(t, types.BlockNumber(2), blocks[lane2][1].Number) } -func TestDeleteBeforeEmptyMap(t *testing.T) { +func TestDeleteBeforeEmptyMapRemovesAll(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() key := types.GenSecretKey(rng) - lane := key.Public() bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) + // Empty map = no committee members, so all lanes are stale. 
require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) require.NoError(t, bp.Close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 1, len(blocks[lane])) + require.Equal(t, 0, len(blocks)) } func TestNoOpBlockPersister(t *testing.T) { @@ -209,8 +209,8 @@ func TestPerLaneIndependentTruncation(t *testing.T) { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) } - // Truncate lane1 aggressively, leave lane2 untouched. - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 8})) + // Truncate lane1 aggressively, keep lane2 at 0 (both in the map). + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 8, lane2: 0})) require.NoError(t, bp.Close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) @@ -220,6 +220,35 @@ func TestPerLaneIndependentTruncation(t *testing.T) { require.Equal(t, 3, len(blocks[lane2]), "lane2: all 3 blocks intact") } +func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key1 := types.GenSecretKey(rng) + key2 := types.GenSecretKey(rng) + lane1 := key1.Public() + lane2 := key2.Public() + bp, _, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + + for i := types.BlockNumber(0); i < 3; i++ { + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, i))) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) + } + + // Only lane1 in laneFirsts — lane2 is stale and should be removed. 
+ require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) + require.NoError(t, bp.Close()) + + _, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 3, len(blocks[lane1]), "lane1: all blocks intact") + require.Equal(t, 0, len(blocks[lane2]), "lane2: removed as stale") + + entries, _ := os.ReadDir(filepath.Join(dir, "blocks")) + require.Equal(t, 1, len(entries), "only lane1 directory remains") +} + func TestLazyLaneCreation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index d9812fb5ba..4c7c627a5b 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -1,12 +1,9 @@ package persist import ( - "context" "fmt" - "os" "path/filepath" - dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" ) @@ -20,21 +17,12 @@ type LoadedCommitQC struct { // CommitQCPersister manages CommitQC persistence using a WAL. // Entries are appended in order; each entry is self-describing (the serialized // CommitQC contains its RoadIndex). The WAL index is append order, not -// RoadIndex — we track the WAL index of the first entry to enable truncation. +// RoadIndex — the indexedWAL tracks first/next indices to enable truncation. // When noop is true, all disk I/O is skipped but cursor tracking still works. type CommitQCPersister struct { - wal *dbwal.WAL[[]byte] + iw *indexedWAL[*types.CommitQC] noop bool next types.RoadIndex - - // WAL index of the first (oldest) entry still in the WAL. - // Used to compute the truncation point for TruncateBefore. - // Zero when the WAL is empty. 
- firstWALIdx uint64 - // nextWALIdx is the WAL index that the next Write will be assigned. - // Tracked locally: initialized from LastOffset()+1, incremented after - // each successful Write. Safe because we are the sole synchronous writer. - nextWALIdx uint64 } func newNoOpCommitQCPersister() *CommitQCPersister { @@ -51,30 +39,15 @@ func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, [] return newNoOpCommitQCPersister(), nil, nil } dir := filepath.Join(sd, "commitqcs") - if err := os.MkdirAll(dir, 0700); err != nil { - return nil, nil, fmt.Errorf("create commitqcs dir %s: %w", dir, err) - } - - w, err := dbwal.NewWAL[[]byte]( - context.Background(), - func(data []byte) ([]byte, error) { return data, nil }, - func(data []byte) ([]byte, error) { return data, nil }, - logger, - dir, - dbwal.Config{ - WriteBufferSize: 0, // synchronous writes - WriteBatchSize: 1, // no batching - FsyncEnabled: true, - }, - ) + iw, err := openIndexedWAL(dir, types.CommitQCConv) if err != nil { return nil, nil, fmt.Errorf("open commitqc WAL in %s: %w", dir, err) } - cp := &CommitQCPersister{wal: w, nextWALIdx: 1} + cp := &CommitQCPersister{iw: iw} loaded, err := cp.loadAll() if err != nil { - _ = w.Close() + _ = iw.Close() return nil, nil, err } if len(loaded) > 0 { @@ -104,14 +77,9 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { return fmt.Errorf("commitqc %d already persisted (next=%d)", idx, cp.next) } if !cp.noop { - data := types.CommitQCConv.Marshal(qc) - if err := cp.wal.Write(data); err != nil { + if err := cp.iw.Write(qc); err != nil { return fmt.Errorf("persist commitqc %d: %w", idx, err) } - if cp.firstWALIdx == 0 { - cp.firstWALIdx = cp.nextWALIdx - } - cp.nextWALIdx++ } cp.next = idx + 1 return nil @@ -120,22 +88,19 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { // DeleteBefore removes persisted CommitQCs with road index below idx // by truncating the front of the WAL. 
// The mapping from RoadIndex to WAL index is linear: entries are written -// sequentially, so WAL index = firstWALIdx + (roadIndex - firstRoadIndex). +// sequentially, so WAL index = firstIdx + (roadIndex - firstRoadIndex). func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { - if cp.noop || idx == 0 { + if cp.noop || idx == 0 || cp.iw.Count() == 0 { return nil } - // Compute the WAL index corresponding to RoadIndex idx. - // Entries are sequential: firstWALIdx corresponds to firstRoadIndex. - firstRoadIndex := cp.next - types.RoadIndex(cp.nextWALIdx-cp.firstWALIdx) + firstRoadIndex := cp.next - types.RoadIndex(cp.iw.Count()) if idx <= firstRoadIndex { return nil } - walIdx := cp.firstWALIdx + uint64(idx-firstRoadIndex) - if err := cp.wal.TruncateBefore(walIdx); err != nil { + walIdx := cp.iw.firstIdx + uint64(idx-firstRoadIndex) + if err := cp.iw.TruncateBefore(walIdx); err != nil { return fmt.Errorf("truncate commitqc WAL before %d: %w", walIdx, err) } - cp.firstWALIdx = walIdx return nil } @@ -144,31 +109,12 @@ func (cp *CommitQCPersister) Close() error { if cp.noop { return nil } - return cp.wal.Close() + return cp.iw.Close() } func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { - first, err := cp.wal.FirstOffset() - if err != nil { - return nil, fmt.Errorf("commitqc WAL first offset: %w", err) - } - last, err := cp.wal.LastOffset() - if err != nil { - return nil, fmt.Errorf("commitqc WAL last offset: %w", err) - } - if first == 0 && last == 0 { - return nil, nil - } - - cp.firstWALIdx = first - cp.nextWALIdx = last + 1 - var loaded []LoadedCommitQC - err = cp.wal.Replay(first, last, func(index uint64, data []byte) error { - qc, err := types.CommitQCConv.Unmarshal(data) - if err != nil { - return fmt.Errorf("unmarshal commitqc at WAL index %d: %w", index, err) - } + err := cp.iw.Replay(func(index uint64, qc *types.CommitQC) error { loaded = append(loaded, LoadedCommitQC{Index: qc.Index(), QC: qc}) return nil }) diff --git 
a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go new file mode 100644 index 0000000000..899d709fde --- /dev/null +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -0,0 +1,109 @@ +package persist + +import ( + "context" + "fmt" + "os" + + dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" +) + +// codec is the marshal/unmarshal pair needed to store T in a WAL. +// protoutils.Conv[T, P] satisfies this interface automatically. +type codec[T any] interface { + Marshal(T) []byte + Unmarshal([]byte) (T, error) +} + +// indexedWAL wraps a WAL with monotonic index tracking and typed entries. +// Callers map domain-specific indices (BlockNumber, RoadIndex) to WAL +// indices via Count() and firstIdx. Not safe for concurrent use. +type indexedWAL[T any] struct { + wal *dbwal.WAL[T] + firstIdx uint64 // WAL index of the oldest entry; 0 when empty + nextIdx uint64 // WAL index that the next Write will be assigned +} + +// openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched, +// fsync-enabled writes. Initializes index tracking from the WAL's stored +// offsets so the caller can immediately Write, Replay, or TruncateBefore. +func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { + if err := os.MkdirAll(dir, 0700); err != nil { + return nil, fmt.Errorf("create dir %s: %w", dir, err) + } + w, err := dbwal.NewWAL( + context.Background(), + func(entry T) ([]byte, error) { return codec.Marshal(entry), nil }, + codec.Unmarshal, + logger, + dir, + dbwal.Config{ + WriteBufferSize: 0, // synchronous writes + WriteBatchSize: 1, // no batching + FsyncEnabled: true, + }, + ) + if err != nil { + return nil, err + } + // tidwall/wal uses 1-based indexing; overwritten below if the WAL has data. 
+ iw := &indexedWAL[T]{wal: w, nextIdx: 1} + first, err := w.FirstOffset() + if err != nil { + _ = w.Close() + return nil, fmt.Errorf("first offset: %w", err) + } + last, err := w.LastOffset() + if err != nil { + _ = w.Close() + return nil, fmt.Errorf("last offset: %w", err) + } + if first != 0 || last != 0 { + iw.firstIdx = first + iw.nextIdx = last + 1 + } + return iw, nil +} + +// Write appends entry to the WAL, advancing nextIdx. +// On the first write, also records firstIdx. +func (w *indexedWAL[T]) Write(entry T) error { + if err := w.wal.Write(entry); err != nil { + return err + } + if w.firstIdx == 0 { + w.firstIdx = w.nextIdx + } + w.nextIdx++ + return nil +} + +// TruncateBefore removes entries before walIdx and advances firstIdx. +func (w *indexedWAL[T]) TruncateBefore(walIdx uint64) error { + if err := w.wal.TruncateBefore(walIdx); err != nil { + return err + } + w.firstIdx = walIdx + return nil +} + +// Replay iterates all entries in [firstIdx, nextIdx-1]. No-op if empty. +func (w *indexedWAL[T]) Replay(fn func(index uint64, entry T) error) error { + if w.firstIdx == 0 { + return nil + } + return w.wal.Replay(w.firstIdx, w.nextIdx-1, fn) +} + +// Count returns the number of entries in the WAL. +func (w *indexedWAL[T]) Count() uint64 { + if w.firstIdx == 0 { + return 0 + } + return w.nextIdx - w.firstIdx +} + +// Close shuts down the underlying WAL. +func (w *indexedWAL[T]) Close() error { + return w.wal.Close() +} diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go new file mode 100644 index 0000000000..2d329ef378 --- /dev/null +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -0,0 +1,203 @@ +package persist + +import ( + "fmt" + "testing" + + "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" +) + +// stringCodec is a trivial codec for testing indexedWAL with strings. 
+type stringCodec struct{} + +func (stringCodec) Marshal(s string) []byte { return []byte(s) } +func (stringCodec) Unmarshal(b []byte) (string, error) { return string(b), nil } + +func TestIndexedWAL_EmptyStart(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.Equal(t, uint64(0), iw.Count()) + require.Equal(t, uint64(0), iw.firstIdx) + require.Equal(t, uint64(1), iw.nextIdx) + + // Replay on empty WAL is a no-op. + called := false + require.NoError(t, iw.Replay(func(index uint64, entry string) error { + called = true + return nil + })) + require.False(t, called) + + require.NoError(t, iw.Close()) +} + +func TestIndexedWAL_WriteAndReplay(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.NoError(t, iw.Write("c")) + + require.Equal(t, uint64(3), iw.Count()) + require.Equal(t, uint64(1), iw.firstIdx) + require.Equal(t, uint64(4), iw.nextIdx) + + var entries []string + require.NoError(t, iw.Replay(func(index uint64, entry string) error { + entries = append(entries, entry) + return nil + })) + require.Equal(t, 3, len(entries)) + require.Equal(t, "a", entries[0]) + require.Equal(t, "b", entries[1]) + require.Equal(t, "c", entries[2]) + + require.NoError(t, iw.Close()) +} + +func TestIndexedWAL_ReopenWithData(t *testing.T) { + dir := t.TempDir() + + // Write some entries and close. + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.NoError(t, iw.Write("x")) + require.NoError(t, iw.Write("y")) + require.NoError(t, iw.Close()) + + // Reopen — should recover firstIdx, nextIdx, and replay entries. 
+ iw2, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.Equal(t, uint64(2), iw2.Count()) + require.Equal(t, uint64(1), iw2.firstIdx) + require.Equal(t, uint64(3), iw2.nextIdx) + + var entries []string + require.NoError(t, iw2.Replay(func(index uint64, entry string) error { + entries = append(entries, entry) + return nil + })) + require.Equal(t, 2, len(entries)) + require.Equal(t, "x", entries[0]) + require.Equal(t, "y", entries[1]) + + require.NoError(t, iw2.Close()) +} + +func TestIndexedWAL_ReopenAfterTruncate(t *testing.T) { + dir := t.TempDir() + + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + for _, s := range []string{"a", "b", "c", "d", "e"} { + require.NoError(t, iw.Write(s)) + } + // Truncate first 3 entries (indices 1,2,3); keep 4,5. + require.NoError(t, iw.TruncateBefore(4)) + require.Equal(t, uint64(2), iw.Count()) + require.NoError(t, iw.Close()) + + // Reopen — should see only the surviving entries. + iw2, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.Equal(t, uint64(2), iw2.Count()) + require.Equal(t, uint64(4), iw2.firstIdx) + require.Equal(t, uint64(6), iw2.nextIdx) + + var entries []string + require.NoError(t, iw2.Replay(func(index uint64, entry string) error { + entries = append(entries, entry) + return nil + })) + require.Equal(t, 2, len(entries)) + require.Equal(t, "d", entries[0]) + require.Equal(t, "e", entries[1]) + + require.NoError(t, iw2.Close()) +} + +func TestIndexedWAL_TruncateAllButLast(t *testing.T) { + dir := t.TempDir() + + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.NoError(t, iw.Write("c")) + + // TruncateBefore keeps the entry at the given index; remove all but last. 
+ require.NoError(t, iw.TruncateBefore(3)) + require.Equal(t, uint64(1), iw.Count()) + require.Equal(t, uint64(3), iw.firstIdx) + require.NoError(t, iw.Close()) + + // Reopen — should see one entry. + iw2, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.Equal(t, uint64(1), iw2.Count()) + + var entries []string + require.NoError(t, iw2.Replay(func(index uint64, entry string) error { + entries = append(entries, entry) + return nil + })) + require.Equal(t, 1, len(entries)) + require.Equal(t, "c", entries[0]) + + require.NoError(t, iw2.Close()) +} + +func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { + dir := t.TempDir() + + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.NoError(t, iw.Write("c")) + + // Truncate "a" and "b". + require.NoError(t, iw.TruncateBefore(3)) + require.Equal(t, uint64(1), iw.Count()) + + // Write more after truncation. + require.NoError(t, iw.Write("d")) + require.NoError(t, iw.Write("e")) + require.Equal(t, uint64(3), iw.Count()) + require.Equal(t, uint64(3), iw.firstIdx) + require.Equal(t, uint64(6), iw.nextIdx) + + var entries []string + require.NoError(t, iw.Replay(func(index uint64, entry string) error { + entries = append(entries, entry) + return nil + })) + require.Equal(t, 3, len(entries)) + require.Equal(t, "c", entries[0]) + require.Equal(t, "d", entries[1]) + require.Equal(t, "e", entries[2]) + + require.NoError(t, iw.Close()) +} + +func TestIndexedWAL_ReplayError(t *testing.T) { + dir := t.TempDir() + + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + + errBoom := fmt.Errorf("boom") + err = iw.Replay(func(index uint64, entry string) error { + return errBoom + }) + require.Error(t, err) + + require.NoError(t, iw.Close()) +} From 89a5fcaf9734cd4d20021c05b792bf4844658a29 Mon Sep 17 00:00:00 2001 From: 
Wen Date: Mon, 9 Mar 2026 15:40:48 -0700 Subject: [PATCH 03/31] persist: fix errcheck lint warnings in blocks.go Made-with: Cursor --- .../internal/autobahn/consensus/persist/blocks.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 168c30ddee..3c926cfb20 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -94,13 +94,13 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type laneDir := filepath.Join(dir, e.Name()) lw, err := newLaneWAL(laneDir) if err != nil { - bp.Close() + _ = bp.Close() return nil, nil, fmt.Errorf("open lane WAL in %s: %w", laneDir, err) } loaded, lane, err := lw.loadAll() if err != nil { _ = lw.Close() - bp.Close() + _ = bp.Close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) } if lane == nil { @@ -168,7 +168,7 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu continue } _ = lw.Close() - os.RemoveAll(filepath.Join(bp.dir, laneDir(lane))) + _ = os.RemoveAll(filepath.Join(bp.dir, laneDir(lane))) delete(bp.lanes, lane) } return nil From dddd764ff28f773ad5e475aab85ebe67fc1126a8 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 16:02:29 -0700 Subject: [PATCH 04/31] persist: fix gofmt alignment in wal_test.go Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/wal_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 2d329ef378..2a98590f91 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -10,7 +10,7 @@ import ( // stringCodec is a trivial codec for 
testing indexedWAL with strings. type stringCodec struct{} -func (stringCodec) Marshal(s string) []byte { return []byte(s) } +func (stringCodec) Marshal(s string) []byte { return []byte(s) } func (stringCodec) Unmarshal(b []byte) (string, error) { return string(b), nil } func TestIndexedWAL_EmptyStart(t *testing.T) { From a05ca12003231f2368326c1400735ba55b2f8fb4 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 18:00:45 -0700 Subject: [PATCH 05/31] persist: skip non-lane directories in blocks dir with warning Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/blocks.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 3c926cfb20..fc8bbd237d 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -91,6 +91,10 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type if !e.IsDir() { continue } + if _, err := hex.DecodeString(e.Name()); err != nil { + logger.Warn("skipping unexpected entry in blocks dir", "name", e.Name()) + continue + } laneDir := filepath.Join(dir, e.Name()) lw, err := newLaneWAL(laneDir) if err != nil { From adb2b9cca2806b575de642edc31d456f0e9c7062 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 18:13:23 -0700 Subject: [PATCH 06/31] persist: use Option[LaneID] instead of *LaneID in loadAll Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index fc8bbd237d..e882e71203 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -107,13 +107,14 @@ func 
NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type _ = bp.Close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) } - if lane == nil { + l, ok := lane.Get() + if !ok { _ = lw.Close() continue } - bp.lanes[*lane] = lw + bp.lanes[l] = lw if len(loaded) > 0 { - allBlocks[*lane] = loaded + allBlocks[l] = loaded } } @@ -193,15 +194,14 @@ func (bp *BlockPersister) Close() error { } // loadAll replays a lane WAL and returns the loaded blocks plus the lane ID -// (extracted from the first entry). Returns nil lane if the WAL is empty. -func (lw *laneWAL) loadAll() ([]LoadedBlock, *types.LaneID, error) { +// (extracted from the first entry). Returns None lane if the WAL is empty. +func (lw *laneWAL) loadAll() ([]LoadedBlock, utils.Option[types.LaneID], error) { var loaded []LoadedBlock - var lane *types.LaneID + lane := utils.None[types.LaneID]() err := lw.Replay(func(index uint64, proposal *types.Signed[*types.LaneProposal]) error { h := proposal.Msg().Block().Header() - if lane == nil { - l := h.Lane() - lane = &l + if !lane.IsPresent() { + lane = utils.Some(h.Lane()) } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) @@ -209,7 +209,7 @@ func (lw *laneWAL) loadAll() ([]LoadedBlock, *types.LaneID, error) { return nil }) if err != nil { - return nil, nil, err + return nil, lane, err } slices.SortFunc(loaded, func(a, b LoadedBlock) int { return cmp.Compare(a.Number, b.Number) From d23861e601bf66ae28f9238a8bf62c72744ce035 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 18:26:50 -0700 Subject: [PATCH 07/31] persist: derive lane ID from directory name, keep empty WALs open Instead of extracting the lane ID from the first replayed entry (and closing/skipping empty WALs), decode it from the hex directory name. This keeps the WAL open so the lane can receive blocks without reopening it. 
Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index e882e71203..f10121b543 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -101,20 +101,22 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type _ = bp.Close() return nil, nil, fmt.Errorf("open lane WAL in %s: %w", laneDir, err) } - loaded, lane, err := lw.loadAll() + loaded, err := lw.loadAll() if err != nil { _ = lw.Close() _ = bp.Close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) } - l, ok := lane.Get() - if !ok { + laneBytes, _ := hex.DecodeString(e.Name()) + lane, err := types.PublicKeyFromBytes(laneBytes) + if err != nil { _ = lw.Close() + logger.Warn("skipping lane dir with invalid key", "name", e.Name(), "err", err) continue } - bp.lanes[l] = lw + bp.lanes[lane] = lw if len(loaded) > 0 { - allBlocks[l] = loaded + allBlocks[lane] = loaded } } @@ -193,26 +195,22 @@ func (bp *BlockPersister) Close() error { return firstErr } -// loadAll replays a lane WAL and returns the loaded blocks plus the lane ID -// (extracted from the first entry). Returns None lane if the WAL is empty. -func (lw *laneWAL) loadAll() ([]LoadedBlock, utils.Option[types.LaneID], error) { +// loadAll replays a lane WAL and returns the loaded blocks. +// Also restores nextBlockNum from the replayed entries. 
+func (lw *laneWAL) loadAll() ([]LoadedBlock, error) { var loaded []LoadedBlock - lane := utils.None[types.LaneID]() err := lw.Replay(func(index uint64, proposal *types.Signed[*types.LaneProposal]) error { h := proposal.Msg().Block().Header() - if !lane.IsPresent() { - lane = utils.Some(h.Lane()) - } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) logger.Info("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) return nil }) if err != nil { - return nil, lane, err + return nil, err } slices.SortFunc(loaded, func(a, b LoadedBlock) int { return cmp.Compare(a.Number, b.Number) }) - return loaded, lane, nil + return loaded, nil } From 0c0d8e61aaa66d641ffea229ed0329461f7a6db3 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 18:29:48 -0700 Subject: [PATCH 08/31] persist: add TestEmptyLaneWALSurvivesReopen Made-with: Cursor --- .../autobahn/consensus/persist/blocks_test.go | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index 5b79230ec5..0a34e8039f 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -249,6 +249,34 @@ func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { require.Equal(t, 1, len(entries), "only lane1 directory remains") } +func TestEmptyLaneWALSurvivesReopen(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key := types.GenSecretKey(rng) + lane := key.Public() + + // Simulate a crash after lazy lane directory creation but before any write: + // create the lane subdirectory so NewBlockPersister discovers it on open. 
+ blocksDir := filepath.Join(dir, "blocks") + require.NoError(t, os.MkdirAll(filepath.Join(blocksDir, laneDir(lane)), 0700)) + + // Reopen — empty lane WAL should be loaded and usable. + bp, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 0, len(blocks[lane]), "no blocks loaded") + + // Persist a new block into the lane without needing lazy creation. + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) + require.NoError(t, bp.Close()) + + // Reopen — should see the new block. + _, blocks2, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 1, len(blocks2[lane])) + require.Equal(t, types.BlockNumber(0), blocks2[lane][0].Number) +} + func TestLazyLaneCreation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() From 2d02be8cacf85c569c872ff9e2986da890e7e412 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 18:41:13 -0700 Subject: [PATCH 09/31] persist: decode lane ID once and add tests for skip paths Validate the directory name (hex decode + PublicKeyFromBytes) before opening the WAL, avoiding a redundant hex.DecodeString call. Add tests for both skip paths: non-hex directory name and valid hex but invalid public key length. 
Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 15 +++++----- .../autobahn/consensus/persist/blocks_test.go | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index f10121b543..02656dc465 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -91,10 +91,16 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type if !e.IsDir() { continue } - if _, err := hex.DecodeString(e.Name()); err != nil { + laneBytes, err := hex.DecodeString(e.Name()) + if err != nil { logger.Warn("skipping unexpected entry in blocks dir", "name", e.Name()) continue } + lane, err := types.PublicKeyFromBytes(laneBytes) + if err != nil { + logger.Warn("skipping lane dir with invalid key", "name", e.Name(), "err", err) + continue + } laneDir := filepath.Join(dir, e.Name()) lw, err := newLaneWAL(laneDir) if err != nil { @@ -107,13 +113,6 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type _ = bp.Close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) } - laneBytes, _ := hex.DecodeString(e.Name()) - lane, err := types.PublicKeyFromBytes(laneBytes) - if err != nil { - _ = lw.Close() - logger.Warn("skipping lane dir with invalid key", "name", e.Name(), "err", err) - continue - } bp.lanes[lane] = lw if len(loaded) > 0 { allBlocks[lane] = loaded diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index 0a34e8039f..e86915f6b4 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -277,6 +277,35 @@ func TestEmptyLaneWALSurvivesReopen(t *testing.T) { require.Equal(t, 
types.BlockNumber(0), blocks2[lane][0].Number) } +func TestNewBlockPersisterSkipsNonHexDir(t *testing.T) { + dir := t.TempDir() + blocksDir := filepath.Join(dir, "blocks") + require.NoError(t, os.MkdirAll(blocksDir, 0700)) + + // Create a non-hex directory and a regular file — both should be skipped. + require.NoError(t, os.Mkdir(filepath.Join(blocksDir, "not-valid-hex"), 0700)) + require.NoError(t, os.WriteFile(filepath.Join(blocksDir, "stray-file.txt"), []byte("hi"), 0600)) + + bp, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 0, len(blocks)) + require.NoError(t, bp.Close()) +} + +func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { + dir := t.TempDir() + blocksDir := filepath.Join(dir, "blocks") + require.NoError(t, os.MkdirAll(blocksDir, 0700)) + + // Valid hex but too short to be a valid ed25519 public key. + require.NoError(t, os.Mkdir(filepath.Join(blocksDir, "abcd"), 0700)) + + bp, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 0, len(blocks)) + require.NoError(t, bp.Close()) +} + func TestLazyLaneCreation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() From 5331e882882f0556bebd4e865acaa27a3625dba4 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 19:03:45 -0700 Subject: [PATCH 10/31] persist: replace Replay with ReadAll, remove unnecessary sort Replace callback-based Replay on indexedWAL with ReadAll that returns a slice. Remove the defensive sort in blocks loadAll since WAL entries are already in append order. Fix stale Replay reference in godoc. 
Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 26 +++---- .../autobahn/consensus/persist/commitqcs.go | 10 +-- .../autobahn/consensus/persist/wal.go | 16 +++-- .../autobahn/consensus/persist/wal_test.go | 67 +++++-------------- 4 files changed, 42 insertions(+), 77 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 02656dc465..9525af597a 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -1,12 +1,10 @@ package persist import ( - "cmp" "encoding/hex" "fmt" "os" "path/filepath" - "slices" "log/slog" @@ -133,8 +131,7 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa lw, ok := bp.lanes[lane] if !ok { var err error - lw, err = newLaneWAL(filepath.Join(bp.dir, laneDir(lane))) - if err != nil { + if lw, err = newLaneWAL(filepath.Join(bp.dir, laneDir(lane))); err != nil { return fmt.Errorf("create lane WAL for %s: %w", lane, err) } bp.lanes[lane] = lw @@ -158,7 +155,7 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu for lane, first := range laneFirsts { lw, ok := bp.lanes[lane] if !ok { - continue + continue // no WAL yet; PersistBlock will create one lazily } firstBN, ok := lw.firstBlockNum().Get() if !ok || first <= firstBN { @@ -194,22 +191,19 @@ func (bp *BlockPersister) Close() error { return firstErr } -// loadAll replays a lane WAL and returns the loaded blocks. -// Also restores nextBlockNum from the replayed entries. +// loadAll reads all entries from the lane WAL and returns the loaded blocks. +// Also restores nextBlockNum from the last entry. 
func (lw *laneWAL) loadAll() ([]LoadedBlock, error) { - var loaded []LoadedBlock - err := lw.Replay(func(index uint64, proposal *types.Signed[*types.LaneProposal]) error { + entries, err := lw.ReadAll() + if err != nil { + return nil, err + } + loaded := make([]LoadedBlock, 0, len(entries)) + for _, proposal := range entries { h := proposal.Msg().Block().Header() lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) logger.Info("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) - return nil - }) - if err != nil { - return nil, err } - slices.SortFunc(loaded, func(a, b LoadedBlock) int { - return cmp.Compare(a.Number, b.Number) - }) return loaded, nil } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 4c7c627a5b..1b527d09eb 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -113,13 +113,13 @@ func (cp *CommitQCPersister) Close() error { } func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { - var loaded []LoadedCommitQC - err := cp.iw.Replay(func(index uint64, qc *types.CommitQC) error { - loaded = append(loaded, LoadedCommitQC{Index: qc.Index(), QC: qc}) - return nil - }) + entries, err := cp.iw.ReadAll() if err != nil { return nil, err } + loaded := make([]LoadedCommitQC, 0, len(entries)) + for _, qc := range entries { + loaded = append(loaded, LoadedCommitQC{Index: qc.Index(), QC: qc}) + } return loaded, nil } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index 899d709fde..eb23210cfb 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -26,7 +26,7 @@ type indexedWAL[T any] 
struct { // openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched, // fsync-enabled writes. Initializes index tracking from the WAL's stored -// offsets so the caller can immediately Write, Replay, or TruncateBefore. +// offsets so the caller can immediately Write, ReadAll, or TruncateBefore. func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { if err := os.MkdirAll(dir, 0700); err != nil { return nil, fmt.Errorf("create dir %s: %w", dir, err) @@ -87,12 +87,20 @@ func (w *indexedWAL[T]) TruncateBefore(walIdx uint64) error { return nil } -// Replay iterates all entries in [firstIdx, nextIdx-1]. No-op if empty. -func (w *indexedWAL[T]) Replay(fn func(index uint64, entry T) error) error { +// ReadAll returns all entries in the WAL. Returns nil if empty. +func (w *indexedWAL[T]) ReadAll() ([]T, error) { if w.firstIdx == 0 { + return nil, nil + } + entries := make([]T, 0, w.Count()) + err := w.wal.Replay(w.firstIdx, w.nextIdx-1, func(_ uint64, entry T) error { + entries = append(entries, entry) return nil + }) + if err != nil { + return nil, err } - return w.wal.Replay(w.firstIdx, w.nextIdx-1, fn) + return entries, nil } // Count returns the number of entries in the WAL. diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 2a98590f91..9467f8e979 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -1,7 +1,6 @@ package persist import ( - "fmt" "testing" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" @@ -22,18 +21,14 @@ func TestIndexedWAL_EmptyStart(t *testing.T) { require.Equal(t, uint64(0), iw.firstIdx) require.Equal(t, uint64(1), iw.nextIdx) - // Replay on empty WAL is a no-op. 
- called := false - require.NoError(t, iw.Replay(func(index uint64, entry string) error { - called = true - return nil - })) - require.False(t, called) + entries, err := iw.ReadAll() + require.NoError(t, err) + require.Equal(t, 0, len(entries)) require.NoError(t, iw.Close()) } -func TestIndexedWAL_WriteAndReplay(t *testing.T) { +func TestIndexedWAL_WriteAndReadAll(t *testing.T) { dir := t.TempDir() iw, err := openIndexedWAL(dir, stringCodec{}) require.NoError(t, err) @@ -46,11 +41,8 @@ func TestIndexedWAL_WriteAndReplay(t *testing.T) { require.Equal(t, uint64(1), iw.firstIdx) require.Equal(t, uint64(4), iw.nextIdx) - var entries []string - require.NoError(t, iw.Replay(func(index uint64, entry string) error { - entries = append(entries, entry) - return nil - })) + entries, err := iw.ReadAll() + require.NoError(t, err) require.Equal(t, 3, len(entries)) require.Equal(t, "a", entries[0]) require.Equal(t, "b", entries[1]) @@ -69,7 +61,7 @@ func TestIndexedWAL_ReopenWithData(t *testing.T) { require.NoError(t, iw.Write("y")) require.NoError(t, iw.Close()) - // Reopen — should recover firstIdx, nextIdx, and replay entries. + // Reopen — should recover firstIdx, nextIdx, and entries. 
iw2, err := openIndexedWAL(dir, stringCodec{}) require.NoError(t, err) @@ -77,11 +69,8 @@ func TestIndexedWAL_ReopenWithData(t *testing.T) { require.Equal(t, uint64(1), iw2.firstIdx) require.Equal(t, uint64(3), iw2.nextIdx) - var entries []string - require.NoError(t, iw2.Replay(func(index uint64, entry string) error { - entries = append(entries, entry) - return nil - })) + entries, err := iw2.ReadAll() + require.NoError(t, err) require.Equal(t, 2, len(entries)) require.Equal(t, "x", entries[0]) require.Equal(t, "y", entries[1]) @@ -109,11 +98,8 @@ func TestIndexedWAL_ReopenAfterTruncate(t *testing.T) { require.Equal(t, uint64(4), iw2.firstIdx) require.Equal(t, uint64(6), iw2.nextIdx) - var entries []string - require.NoError(t, iw2.Replay(func(index uint64, entry string) error { - entries = append(entries, entry) - return nil - })) + entries, err := iw2.ReadAll() + require.NoError(t, err) require.Equal(t, 2, len(entries)) require.Equal(t, "d", entries[0]) require.Equal(t, "e", entries[1]) @@ -141,11 +127,8 @@ func TestIndexedWAL_TruncateAllButLast(t *testing.T) { require.NoError(t, err) require.Equal(t, uint64(1), iw2.Count()) - var entries []string - require.NoError(t, iw2.Replay(func(index uint64, entry string) error { - entries = append(entries, entry) - return nil - })) + entries, err := iw2.ReadAll() + require.NoError(t, err) require.Equal(t, 1, len(entries)) require.Equal(t, "c", entries[0]) @@ -172,11 +155,8 @@ func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { require.Equal(t, uint64(3), iw.firstIdx) require.Equal(t, uint64(6), iw.nextIdx) - var entries []string - require.NoError(t, iw.Replay(func(index uint64, entry string) error { - entries = append(entries, entry) - return nil - })) + entries, err := iw.ReadAll() + require.NoError(t, err) require.Equal(t, 3, len(entries)) require.Equal(t, "c", entries[0]) require.Equal(t, "d", entries[1]) @@ -184,20 +164,3 @@ func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { require.NoError(t, iw.Close()) } - 
-func TestIndexedWAL_ReplayError(t *testing.T) { - dir := t.TempDir() - - iw, err := openIndexedWAL(dir, stringCodec{}) - require.NoError(t, err) - require.NoError(t, iw.Write("a")) - require.NoError(t, iw.Write("b")) - - errBoom := fmt.Errorf("boom") - err = iw.Replay(func(index uint64, entry string) error { - return errBoom - }) - require.Error(t, err) - - require.NoError(t, iw.Close()) -} From 1dd6adb273a89fc9bd80fcdb8c9c893418e419d1 Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 20:51:00 -0700 Subject: [PATCH 11/31] persist: add defense-in-depth checks for WAL index mapping TruncateBefore now reads and verifies the entry at the target WAL index before truncating, catching index-mapping corruption before data loss. PersistCommitQC and PersistBlock enforce strict sequential order to prevent gaps that would break the linear domain-to-WAL-index mapping. Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 10 ++++- .../autobahn/consensus/persist/commitqcs.go | 15 +++++-- .../consensus/persist/commitqcs_test.go | 21 +++++++++- .../autobahn/consensus/persist/wal.go | 14 ++++++- .../autobahn/consensus/persist/wal_test.go | 39 +++++++++++++++++-- 5 files changed, 88 insertions(+), 11 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 9525af597a..571bd24157 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -136,6 +136,9 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa } bp.lanes[lane] = lw } + if lw.Count() > 0 && h.BlockNumber() != lw.nextBlockNum { + return fmt.Errorf("block %s/%d out of sequence (next=%d)", lane, h.BlockNumber(), lw.nextBlockNum) + } if err := lw.Write(proposal); err != nil { return fmt.Errorf("persist block %s/%d: %w", lane, h.BlockNumber(), err) } @@ -162,7 +165,12 @@ func (bp *BlockPersister) 
DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu continue } walIdx := lw.firstIdx + uint64(first-firstBN) - if err := lw.TruncateBefore(walIdx); err != nil { + if err := lw.TruncateBefore(walIdx, func(entry *types.Signed[*types.LaneProposal]) error { + if got := entry.Msg().Block().Header().BlockNumber(); got != first { + return fmt.Errorf("block at WAL index %d has number %d, expected %d (index mapping broken)", walIdx, got, first) + } + return nil + }); err != nil { return fmt.Errorf("truncate lane %s WAL before block %d: %w", lane, first, err) } } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 1b527d09eb..739e5e79a8 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -70,11 +70,13 @@ func (cp *CommitQCPersister) ResetNext(idx types.RoadIndex) { } // PersistCommitQC writes a CommitQC to the WAL. -// The caller must persist CommitQCs in order; idx < cp.next is a bug. +// CommitQCs must be persisted in strict sequential order; any gap or +// duplicate breaks the linear RoadIndex-to-WAL-index mapping that +// DeleteBefore relies on. 
func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { idx := qc.Index() - if idx < cp.next { - return fmt.Errorf("commitqc %d already persisted (next=%d)", idx, cp.next) + if idx != cp.next { + return fmt.Errorf("commitqc %d out of sequence (next=%d)", idx, cp.next) } if !cp.noop { if err := cp.iw.Write(qc); err != nil { @@ -98,7 +100,12 @@ func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { return nil } walIdx := cp.iw.firstIdx + uint64(idx-firstRoadIndex) - if err := cp.iw.TruncateBefore(walIdx); err != nil { + if err := cp.iw.TruncateBefore(walIdx, func(entry *types.CommitQC) error { + if entry.Index() != idx { + return fmt.Errorf("commitqc at WAL index %d has road index %d, expected %d (index mapping broken)", walIdx, entry.Index(), idx) + } + return nil + }); err != nil { return fmt.Errorf("truncate commitqc WAL before %d: %w", walIdx, err) } return nil diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index e1eeaf7ddb..47c750ed38 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -162,9 +162,28 @@ func TestCommitQCPersistInOrder(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qcs[0])) require.NoError(t, cp.PersistCommitQC(qcs[1])) - // Persisting qcs[0] again should fail (idx < next). + // Persisting qcs[0] again should fail (duplicate). 
err = cp.PersistCommitQC(qcs[0]) require.Error(t, err) + require.Contains(t, err.Error(), "out of sequence") + require.NoError(t, cp.Close()) +} + +func TestCommitQCPersistGapRejected(t *testing.T) { + rng := utils.TestRng() + committee, keys := types.GenCommittee(rng, 4) + dir := t.TempDir() + + qcs := makeSequentialCommitQCs(rng, committee, keys, 5) + cp, _, err := NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + + require.NoError(t, cp.PersistCommitQC(qcs[0])) + require.NoError(t, cp.PersistCommitQC(qcs[1])) + // Skip qcs[2], try to persist qcs[3] — should fail because idx(3) != next(2). + err = cp.PersistCommitQC(qcs[3]) + require.Error(t, err) + require.Contains(t, err.Error(), "out of sequence") require.NoError(t, cp.Close()) } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index eb23210cfb..41ed399876 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -78,8 +78,18 @@ func (w *indexedWAL[T]) Write(entry T) error { return nil } -// TruncateBefore removes entries before walIdx and advances firstIdx. -func (w *indexedWAL[T]) TruncateBefore(walIdx uint64) error { +// TruncateBefore reads the entry at walIdx, passes it to verify, and — if +// verify returns nil — removes all entries before walIdx. The verify callback +// lets callers assert that the WAL index maps to the expected domain object +// before a destructive operation. 
+func (w *indexedWAL[T]) TruncateBefore(walIdx uint64, verify func(T) error) error { + entry, err := w.wal.ReadAt(walIdx) + if err != nil { + return fmt.Errorf("read at WAL index %d: %w", walIdx, err) + } + if err := verify(entry); err != nil { + return err + } if err := w.wal.TruncateBefore(walIdx); err != nil { return err } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 9467f8e979..b788b7624d 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -1,11 +1,14 @@ package persist import ( + "fmt" "testing" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" ) +func acceptAny(string) error { return nil } + // stringCodec is a trivial codec for testing indexedWAL with strings. type stringCodec struct{} @@ -87,7 +90,7 @@ func TestIndexedWAL_ReopenAfterTruncate(t *testing.T) { require.NoError(t, iw.Write(s)) } // Truncate first 3 entries (indices 1,2,3); keep 4,5. - require.NoError(t, iw.TruncateBefore(4)) + require.NoError(t, iw.TruncateBefore(4, acceptAny)) require.Equal(t, uint64(2), iw.Count()) require.NoError(t, iw.Close()) @@ -117,7 +120,7 @@ func TestIndexedWAL_TruncateAllButLast(t *testing.T) { require.NoError(t, iw.Write("c")) // TruncateBefore keeps the entry at the given index; remove all but last. 
- require.NoError(t, iw.TruncateBefore(3)) + require.NoError(t, iw.TruncateBefore(3, acceptAny)) require.Equal(t, uint64(1), iw.Count()) require.Equal(t, uint64(3), iw.firstIdx) require.NoError(t, iw.Close()) @@ -135,6 +138,36 @@ func TestIndexedWAL_TruncateAllButLast(t *testing.T) { require.NoError(t, iw2.Close()) } +func TestIndexedWAL_TruncateBeforeVerifiesEntry(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.NoError(t, iw.Write("c")) + + // Verify callback receives the correct entry. + var got string + require.NoError(t, iw.TruncateBefore(2, func(s string) error { + got = s + return nil + })) + require.Equal(t, "b", got) + require.Equal(t, uint64(2), iw.firstIdx) + + // Verify callback can reject the truncation. + err = iw.TruncateBefore(3, func(s string) error { + return fmt.Errorf("rejected: %s", s) + }) + require.Error(t, err) + require.Contains(t, err.Error(), "rejected: c") + // firstIdx should NOT have advanced since verify rejected. + require.Equal(t, uint64(2), iw.firstIdx) + + require.NoError(t, iw.Close()) +} + func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { dir := t.TempDir() @@ -145,7 +178,7 @@ func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { require.NoError(t, iw.Write("c")) // Truncate "a" and "b". - require.NoError(t, iw.TruncateBefore(3)) + require.NoError(t, iw.TruncateBefore(3, acceptAny)) require.Equal(t, uint64(1), iw.Count()) // Write more after truncation. From 008c30c7cbd24542bc6882eb6e7005f5e1a1915b Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 21:18:00 -0700 Subject: [PATCH 12/31] fix: update state_test to expect write-time sequence check The non-contiguous commitQC test now expects the gap to be caught at PersistCommitQC time ("out of sequence") rather than at NewState load time, matching the defense-in-depth guard added earlier. 
Made-with: Cursor --- .../internal/autobahn/avail/state_test.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state_test.go b/sei-tendermint/internal/autobahn/avail/state_test.go index d0e95a5bc0..62f2a06ae8 100644 --- a/sei-tendermint/internal/autobahn/avail/state_test.go +++ b/sei-tendermint/internal/autobahn/avail/state_test.go @@ -629,7 +629,6 @@ func TestNewStateWithPersistence(t *testing.T) { t.Run("non-contiguous commitQC files return error", func(t *testing.T) { dir := t.TempDir() - ds := data.NewState(&data.Config{Committee: committee}, utils.None[data.BlockStore]()) // Build 6 sequential CommitQCs (indices 0-5). allQCs := make([]*types.CommitQC, 6) @@ -649,19 +648,18 @@ func TestNewStateWithPersistence(t *testing.T) { CommitQc: types.CommitQCConv.Encode(allQCs[0]), })) - // Persist QCs 0, 1, 2 contiguously, then skip to 5 (simulating - // corruption or manual tampering). Since the anchor is persisted - // first, a gap should never occur normally — treat it as an error. + // Persist QCs 0, 1, 2 contiguously, then try to skip to 5. + // PersistCommitQC enforces strict sequential order, so the gap + // is caught at write time rather than at load time. 
cp, _, err := persist.NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) for i := 0; i < 3; i++ { require.NoError(t, cp.PersistCommitQC(allQCs[i])) } - require.NoError(t, cp.PersistCommitQC(allQCs[5])) - - _, err = NewState(keys[0], ds, utils.Some(dir)) + err = cp.PersistCommitQC(allQCs[5]) require.Error(t, err) - require.Contains(t, err.Error(), "non-contiguous") + require.Contains(t, err.Error(), "out of sequence") + require.NoError(t, cp.Close()) }) t.Run("corrupt AppQC data returns error", func(t *testing.T) { From 3c70d1d461845246f8c5fac344de94dfe6917cbc Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 21:30:13 -0700 Subject: [PATCH 13/31] persist: simplify lazy lane WAL creation in PersistBlock Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/blocks.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 571bd24157..051979e639 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -130,10 +130,11 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa lane := h.Lane() lw, ok := bp.lanes[lane] if !ok { - var err error - if lw, err = newLaneWAL(filepath.Join(bp.dir, laneDir(lane))); err != nil { + newLW, err := newLaneWAL(filepath.Join(bp.dir, laneDir(lane))) + if err != nil { return fmt.Errorf("create lane WAL for %s: %w", lane, err) } + lw = newLW bp.lanes[lane] = lw } if lw.Count() > 0 && h.BlockNumber() != lw.nextBlockNum { From 02121466e6d352cc86ae5a054b09663966f7e30e Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 21:33:41 -0700 Subject: [PATCH 14/31] persist: detect gaps in loadAll for blocks and commitQCs Add contiguity checks during WAL replay to catch on-disk corruption that bypasses write-time guards. 
Includes tests that write directly to the WAL to simulate corrupted data. Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 5 +++- .../autobahn/consensus/persist/blocks_test.go | 21 +++++++++++++++++ .../autobahn/consensus/persist/commitqcs.go | 5 +++- .../consensus/persist/commitqcs_test.go | 23 +++++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 051979e639..c4207fa5ab 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -208,8 +208,11 @@ func (lw *laneWAL) loadAll() ([]LoadedBlock, error) { return nil, err } loaded := make([]LoadedBlock, 0, len(entries)) - for _, proposal := range entries { + for i, proposal := range entries { h := proposal.Msg().Block().Header() + if i > 0 && h.BlockNumber() != lw.nextBlockNum { + return nil, fmt.Errorf("gap in lane %s: block %d follows %d", h.Lane(), h.BlockNumber(), lw.nextBlockNum-1) + } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) logger.Info("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index e86915f6b4..500a984abb 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -306,6 +306,27 @@ func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { require.NoError(t, bp.Close()) } +func TestLoadAllDetectsBlockGap(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + key := types.GenSecretKey(rng) + lane := key.Public() + + // Write directly to a lane WAL, bypassing PersistBlock's 
contiguity check + // to simulate on-disk corruption (block 0 then block 2, skipping 1). + ld := filepath.Join(dir, "blocks", laneDir(lane)) + require.NoError(t, os.MkdirAll(ld, 0700)) + lw, err := newLaneWAL(ld) + require.NoError(t, err) + require.NoError(t, lw.Write(testSignedProposal(rng, key, 0))) + require.NoError(t, lw.Write(testSignedProposal(rng, key, 2))) + require.NoError(t, lw.Close()) + + _, _, err = NewBlockPersister(utils.Some(dir)) + require.Error(t, err) + require.Contains(t, err.Error(), "gap") +} + func TestLazyLaneCreation(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 739e5e79a8..92d0f89f57 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -125,7 +125,10 @@ func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { return nil, err } loaded := make([]LoadedCommitQC, 0, len(entries)) - for _, qc := range entries { + for i, qc := range entries { + if i > 0 && qc.Index() != loaded[i-1].Index+1 { + return nil, fmt.Errorf("gap in commitQCs: index %d follows %d", qc.Index(), loaded[i-1].Index) + } loaded = append(loaded, LoadedCommitQC{Index: qc.Index(), QC: qc}) } return loaded, nil diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index 47c750ed38..fb7db8c3da 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -187,6 +187,29 @@ func TestCommitQCPersistGapRejected(t *testing.T) { require.NoError(t, cp.Close()) } +func TestLoadAllDetectsCommitQCGap(t *testing.T) { + rng := utils.TestRng() + committee, keys := types.GenCommittee(rng, 4) + dir := t.TempDir() + + // Build 3 sequential 
CommitQCs (indices 0, 1, 2). + qcs := makeSequentialCommitQCs(rng, committee, keys, 3) + + // Write directly to the WAL, bypassing PersistCommitQC's contiguity + // check to simulate on-disk corruption (index 0 then index 2, skipping 1). + walDir := filepath.Join(dir, "commitqcs") + require.NoError(t, os.MkdirAll(walDir, 0700)) + iw, err := openIndexedWAL(walDir, types.CommitQCConv) + require.NoError(t, err) + require.NoError(t, iw.Write(qcs[0])) + require.NoError(t, iw.Write(qcs[2])) + require.NoError(t, iw.Close()) + + _, _, err = NewCommitQCPersister(utils.Some(dir)) + require.Error(t, err) + require.Contains(t, err.Error(), "gap") +} + func TestNoOpCommitQCPersister(t *testing.T) { cp, loaded, err := NewCommitQCPersister(utils.None[string]()) require.NoError(t, err) From fff1fc3a8291178b09f3bf5d6d0f44894a0a912e Mon Sep 17 00:00:00 2001 From: Wen Date: Mon, 9 Mar 2026 21:35:25 -0700 Subject: [PATCH 15/31] persist: demote per-block load log to DEBUG Reduces log noise on restart with many validators and blocks. 
Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/blocks.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index c4207fa5ab..28a4a9e36c 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -215,7 +215,7 @@ func (lw *laneWAL) loadAll() ([]LoadedBlock, error) { } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) - logger.Info("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) + logger.Debug("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) } return loaded, nil } From 1e34e34d3130e079c39231587777b5054ed87473 Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 11:33:33 -0700 Subject: [PATCH 16/31] persist: document why blocks use per-lane WALs Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/blocks.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 28a4a9e36c..a31b2066c2 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -38,7 +38,10 @@ func (lw *laneWAL) firstBlockNum() utils.Option[types.BlockNumber] { // BlockPersister manages block persistence using one WAL per lane. // Each lane gets its own WAL in a subdirectory named by hex-encoded lane ID, -// so truncation is independent per lane. +// so truncation is independent per lane. A single shared WAL would be simpler +// but a lane whose blocks are never included in a committed block (e.g. 
the +// validator is removed from the committee) would prevent truncation of all +// other lanes' entries that follow it. // When noop is true, all disk I/O is skipped. type BlockPersister struct { dir string From 01cce1c9f18f913479f42c2e361f1fa36cb646cb Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 11:50:59 -0700 Subject: [PATCH 17/31] persist: remove noop flag, use Option for dir/iw instead BlockPersister uses Option[string] for dir, CommitQCPersister uses Option[*indexedWAL] for iw. None = no-op mode, Some = real persistence. The noop behavior is now structurally implied rather than a separate flag. Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 25 ++++++------ .../autobahn/consensus/persist/commitqcs.go | 38 +++++++++---------- .../consensus/persist/commitqcs_test.go | 2 +- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index a31b2066c2..661bbebd46 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -42,15 +42,10 @@ func (lw *laneWAL) firstBlockNum() utils.Option[types.BlockNumber] { // but a lane whose blocks are never included in a committed block (e.g. the // validator is removed from the committee) would prevent truncation of all // other lanes' entries that follow it. -// When noop is true, all disk I/O is skipped. +// When dir is None, all disk I/O is skipped (no-op mode). 
type BlockPersister struct { - dir string + dir utils.Option[string] lanes map[types.LaneID]*laneWAL - noop bool -} - -func newNoOpBlockPersister() *BlockPersister { - return &BlockPersister{noop: true, lanes: map[types.LaneID]*laneWAL{}} } func laneDir(lane types.LaneID) string { @@ -73,14 +68,14 @@ func newLaneWAL(dir string) (*laneWAL, error) { func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[types.LaneID][]LoadedBlock, error) { sd, ok := stateDir.Get() if !ok { - return newNoOpBlockPersister(), nil, nil + return &BlockPersister{lanes: map[types.LaneID]*laneWAL{}}, nil, nil } dir := filepath.Join(sd, "blocks") if err := os.MkdirAll(dir, 0700); err != nil { return nil, nil, fmt.Errorf("create blocks dir %s: %w", dir, err) } - bp := &BlockPersister{dir: dir, lanes: map[types.LaneID]*laneWAL{}} + bp := &BlockPersister{dir: utils.Some(dir), lanes: map[types.LaneID]*laneWAL{}} entries, err := os.ReadDir(dir) if err != nil { @@ -126,14 +121,15 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type // PersistBlock writes a signed lane proposal to the per-lane WAL. // Creates the lane WAL lazily if this is the first block for the lane. func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposal]) error { - if bp.noop { + dir, ok := bp.dir.Get() + if !ok { return nil } h := proposal.Msg().Block().Header() lane := h.Lane() lw, ok := bp.lanes[lane] if !ok { - newLW, err := newLaneWAL(filepath.Join(bp.dir, laneDir(lane))) + newLW, err := newLaneWAL(filepath.Join(dir, laneDir(lane))) if err != nil { return fmt.Errorf("create lane WAL for %s: %w", lane, err) } @@ -156,7 +152,8 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa // (validator no longer in committee): their WALs are closed and directories // removed. 
func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNumber) error { - if bp.noop { + dir, ok := bp.dir.Get() + if !ok { return nil } for lane, first := range laneFirsts { @@ -183,7 +180,7 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu continue } _ = lw.Close() - _ = os.RemoveAll(filepath.Join(bp.dir, laneDir(lane))) + _ = os.RemoveAll(filepath.Join(dir, laneDir(lane))) delete(bp.lanes, lane) } return nil @@ -191,7 +188,7 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu // Close shuts down all per-lane WALs. func (bp *BlockPersister) Close() error { - if bp.noop { + if _, ok := bp.dir.Get(); !ok { return nil } var firstErr error diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 92d0f89f57..7dfbacc2ad 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -18,17 +18,12 @@ type LoadedCommitQC struct { // Entries are appended in order; each entry is self-describing (the serialized // CommitQC contains its RoadIndex). The WAL index is append order, not // RoadIndex — the indexedWAL tracks first/next indices to enable truncation. -// When noop is true, all disk I/O is skipped but cursor tracking still works. +// When iw is None, all disk I/O is skipped but cursor tracking still works. type CommitQCPersister struct { - iw *indexedWAL[*types.CommitQC] - noop bool + iw utils.Option[*indexedWAL[*types.CommitQC]] next types.RoadIndex } -func newNoOpCommitQCPersister() *CommitQCPersister { - return &CommitQCPersister{noop: true} -} - // NewCommitQCPersister opens (or creates) a WAL in the commitqcs/ subdirectory // and replays all persisted entries. Returns the persister and a sorted slice of // loaded CommitQCs. Corrupt tail entries are auto-truncated by the WAL library. 
@@ -36,7 +31,7 @@ func newNoOpCommitQCPersister() *CommitQCPersister { func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, []LoadedCommitQC, error) { sd, ok := stateDir.Get() if !ok { - return newNoOpCommitQCPersister(), nil, nil + return &CommitQCPersister{}, nil, nil } dir := filepath.Join(sd, "commitqcs") iw, err := openIndexedWAL(dir, types.CommitQCConv) @@ -44,7 +39,7 @@ func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, [] return nil, nil, fmt.Errorf("open commitqc WAL in %s: %w", dir, err) } - cp := &CommitQCPersister{iw: iw} + cp := &CommitQCPersister{iw: utils.Some(iw)} loaded, err := cp.loadAll() if err != nil { _ = iw.Close() @@ -78,8 +73,8 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { if idx != cp.next { return fmt.Errorf("commitqc %d out of sequence (next=%d)", idx, cp.next) } - if !cp.noop { - if err := cp.iw.Write(qc); err != nil { + if iw, ok := cp.iw.Get(); ok { + if err := iw.Write(qc); err != nil { return fmt.Errorf("persist commitqc %d: %w", idx, err) } } @@ -92,15 +87,16 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { // The mapping from RoadIndex to WAL index is linear: entries are written // sequentially, so WAL index = firstIdx + (roadIndex - firstRoadIndex). 
func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { - if cp.noop || idx == 0 || cp.iw.Count() == 0 { + iw, ok := cp.iw.Get() + if !ok || idx == 0 || iw.Count() == 0 { return nil } - firstRoadIndex := cp.next - types.RoadIndex(cp.iw.Count()) + firstRoadIndex := cp.next - types.RoadIndex(iw.Count()) if idx <= firstRoadIndex { return nil } - walIdx := cp.iw.firstIdx + uint64(idx-firstRoadIndex) - if err := cp.iw.TruncateBefore(walIdx, func(entry *types.CommitQC) error { + walIdx := iw.firstIdx + uint64(idx-firstRoadIndex) + if err := iw.TruncateBefore(walIdx, func(entry *types.CommitQC) error { if entry.Index() != idx { return fmt.Errorf("commitqc at WAL index %d has road index %d, expected %d (index mapping broken)", walIdx, entry.Index(), idx) } @@ -113,14 +109,18 @@ func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { // Close shuts down the WAL. func (cp *CommitQCPersister) Close() error { - if cp.noop { - return nil + if iw, ok := cp.iw.Get(); ok { + return iw.Close() } - return cp.iw.Close() + return nil } func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { - entries, err := cp.iw.ReadAll() + iw, ok := cp.iw.Get() + if !ok { + return nil, nil + } + entries, err := iw.ReadAll() if err != nil { return nil, err } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index fb7db8c3da..cf01f7813a 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -145,7 +145,7 @@ func TestCommitQCDeleteBeforeZero(t *testing.T) { } func TestCommitQCResetNext(t *testing.T) { - cp := newNoOpCommitQCPersister() + cp := &CommitQCPersister{} require.Equal(t, types.RoadIndex(0), cp.LoadNext()) cp.ResetNext(5) require.Equal(t, types.RoadIndex(5), cp.LoadNext()) From 9169cfab4fcb4dda31ad5f761b2bdc01e6de5032 Mon Sep 17 00:00:00 2001 From: 
Wen Date: Tue, 10 Mar 2026 13:46:36 -0700 Subject: [PATCH 18/31] persist: simplify Close, consolidate loadAll log, drop slog import Made-with: Cursor --- .../internal/autobahn/consensus/persist/blocks.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 661bbebd46..490cdf921e 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -6,8 +6,6 @@ import ( "os" "path/filepath" - "log/slog" - "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" "github.com/sei-protocol/seilog" @@ -191,13 +189,12 @@ func (bp *BlockPersister) Close() error { if _, ok := bp.dir.Get(); !ok { return nil } - var firstErr error for _, lw := range bp.lanes { - if err := lw.Close(); err != nil && firstErr == nil { - firstErr = err + if err := lw.Close(); err != nil { + return err } } - return firstErr + return nil } // loadAll reads all entries from the lane WAL and returns the loaded blocks. 
@@ -215,7 +212,11 @@ func (lw *laneWAL) loadAll() ([]LoadedBlock, error) { } lw.nextBlockNum = h.BlockNumber() + 1 loaded = append(loaded, LoadedBlock{Number: h.BlockNumber(), Proposal: proposal}) - logger.Debug("loaded persisted block", "lane", h.Lane().String(), slog.Uint64("block", uint64(h.BlockNumber()))) + } + if len(loaded) > 0 { + first, last := loaded[0].Number, loaded[len(loaded)-1].Number + logger.Debug("loaded persisted blocks", "lane", entries[0].Msg().Block().Header().Lane().String(), + "first", first, "last", last, "count", len(loaded)) } return loaded, nil } From 2d431fb0e3a2e14073db943c166a8ce786696b70 Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 14:07:12 -0700 Subject: [PATCH 19/31] persist: extract directory name string literals into constants Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 7 +- .../autobahn/consensus/persist/blocks_test.go | 125 ++++++++---------- .../autobahn/consensus/persist/commitqcs.go | 4 +- .../consensus/persist/commitqcs_test.go | 4 +- 4 files changed, 69 insertions(+), 71 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 490cdf921e..6ccdbc4e80 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -13,6 +13,8 @@ import ( var logger = seilog.NewLogger("tendermint", "internal", "autobahn", "consensus", "persist") +const blocksDir = "blocks" + // LoadedBlock is a block loaded from disk during state restoration. 
type LoadedBlock struct { Number types.BlockNumber @@ -68,7 +70,7 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type if !ok { return &BlockPersister{lanes: map[types.LaneID]*laneWAL{}}, nil, nil } - dir := filepath.Join(sd, "blocks") + dir := filepath.Join(sd, blocksDir) if err := os.MkdirAll(dir, 0700); err != nil { return nil, nil, fmt.Errorf("create blocks dir %s: %w", dir, err) } @@ -154,6 +156,9 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu if !ok { return nil } + if len(laneFirsts) == 0 { + panic("DeleteBefore called with empty laneFirsts (empty committee)") + } for lane, first := range laneFirsts { lw, ok := bp.lanes[lane] if !ok { diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index 500a984abb..3c799ab006 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -22,7 +22,7 @@ func TestNewBlockPersisterEmptyDir(t *testing.T) { require.NoError(t, err) require.NotNil(t, bp) require.Equal(t, 0, len(blocks)) - fi, err := os.Stat(filepath.Join(dir, "blocks")) + fi, err := os.Stat(filepath.Join(dir, blocksDir)) require.NoError(t, err) require.True(t, fi.IsDir()) require.NoError(t, bp.Close()) @@ -104,14 +104,16 @@ func TestDeleteBeforeRemovesOldKeepsNew(t *testing.T) { require.Equal(t, types.BlockNumber(4), blocks[lane][1].Number) } -func TestDeleteBeforeMultipleLanes(t *testing.T) { +func TestDeleteBeforeAndRestart(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() key1 := types.GenSecretKey(rng) key2 := types.GenSecretKey(rng) + key3 := types.GenSecretKey(rng) lane1 := key1.Public() lane2 := key2.Public() + lane3 := key3.Public() // never persisted — exercises the "no WAL yet" path bp, _, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -120,38 +122,32 @@ func 
TestDeleteBeforeMultipleLanes(t *testing.T) { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) } - // Delete lane1 < 2, lane2 < 1 — independent per-lane truncation. - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 2, lane2: 1})) + // lane1: truncate old blocks, lane2: delete nothing (first=0), lane3: empty (no WAL). + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 2, lane2: 0, lane3: 0})) require.NoError(t, bp.Close()) - _, blocks, err := NewBlockPersister(utils.Some(dir)) + // Restart — verify varied lane states load correctly. + bp2, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) require.Equal(t, 1, len(blocks[lane1]), "lane1 should have block 2") require.Equal(t, types.BlockNumber(2), blocks[lane1][0].Number) - require.Equal(t, 2, len(blocks[lane2]), "lane2 should have blocks 1,2") - require.Equal(t, types.BlockNumber(1), blocks[lane2][0].Number) - require.Equal(t, types.BlockNumber(2), blocks[lane2][1].Number) -} - -func TestDeleteBeforeEmptyMapRemovesAll(t *testing.T) { - rng := utils.TestRng() - dir := t.TempDir() - - key := types.GenSecretKey(rng) - bp, _, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) + require.Equal(t, 3, len(blocks[lane2]), "lane2 should have all 3 blocks") + require.Equal(t, 0, len(blocks[lane3]), "lane3 never had blocks") - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - - // Empty map = no committee members, so all lanes are stale. - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) - require.NoError(t, bp.Close()) + // Persist more after restart, then restart again to verify continuity. 
+ require.NoError(t, bp2.PersistBlock(testSignedProposal(rng, key1, 3))) + require.NoError(t, bp2.PersistBlock(testSignedProposal(rng, key2, 3))) + require.NoError(t, bp2.Close()) - _, blocks, err := NewBlockPersister(utils.Some(dir)) + _, blocks2, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) - require.Equal(t, 0, len(blocks)) + require.Equal(t, 2, len(blocks2[lane1]), "lane1 should have blocks 2,3") + require.Equal(t, types.BlockNumber(3), blocks2[lane1][1].Number) + require.Equal(t, 4, len(blocks2[lane2]), "lane2 should have blocks 0..3") + require.Equal(t, types.BlockNumber(3), blocks2[lane2][3].Number) } + func TestNoOpBlockPersister(t *testing.T) { bp, blocks, err := NewBlockPersister(utils.None[string]()) require.NoError(t, err) @@ -160,8 +156,9 @@ func TestNoOpBlockPersister(t *testing.T) { rng := utils.TestRng() key := types.GenSecretKey(rng) + lane := key.Public() require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{})) + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 0})) require.NoError(t, bp.Close()) } @@ -190,35 +187,6 @@ func TestDeleteBeforeThenPersistMore(t *testing.T) { require.Equal(t, types.BlockNumber(5), blocks[lane][2].Number) } -func TestPerLaneIndependentTruncation(t *testing.T) { - rng := utils.TestRng() - dir := t.TempDir() - - key1 := types.GenSecretKey(rng) - key2 := types.GenSecretKey(rng) - lane1 := key1.Public() - lane2 := key2.Public() - bp, _, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - - // Lane1: blocks 0..9, Lane2: blocks 0..2 - for i := types.BlockNumber(0); i < 10; i++ { - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, i))) - } - for i := types.BlockNumber(0); i < 3; i++ { - require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) - } - - // Truncate lane1 aggressively, keep lane2 at 0 (both in the map). 
- require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 8, lane2: 0})) - require.NoError(t, bp.Close()) - - _, blocks, err := NewBlockPersister(utils.Some(dir)) - require.NoError(t, err) - require.Equal(t, 2, len(blocks[lane1]), "lane1: blocks 8,9") - require.Equal(t, types.BlockNumber(8), blocks[lane1][0].Number) - require.Equal(t, 3, len(blocks[lane2]), "lane2: all 3 blocks intact") -} func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { rng := utils.TestRng() @@ -245,7 +213,7 @@ func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { require.Equal(t, 3, len(blocks[lane1]), "lane1: all blocks intact") require.Equal(t, 0, len(blocks[lane2]), "lane2: removed as stale") - entries, _ := os.ReadDir(filepath.Join(dir, "blocks")) + entries, _ := os.ReadDir(filepath.Join(dir, blocksDir)) require.Equal(t, 1, len(entries), "only lane1 directory remains") } @@ -258,8 +226,8 @@ func TestEmptyLaneWALSurvivesReopen(t *testing.T) { // Simulate a crash after lazy lane directory creation but before any write: // create the lane subdirectory so NewBlockPersister discovers it on open. - blocksDir := filepath.Join(dir, "blocks") - require.NoError(t, os.MkdirAll(filepath.Join(blocksDir, laneDir(lane)), 0700)) + bd := filepath.Join(dir, blocksDir) + require.NoError(t, os.MkdirAll(filepath.Join(bd, laneDir(lane)), 0700)) // Reopen — empty lane WAL should be loaded and usable. bp, blocks, err := NewBlockPersister(utils.Some(dir)) @@ -279,12 +247,12 @@ func TestEmptyLaneWALSurvivesReopen(t *testing.T) { func TestNewBlockPersisterSkipsNonHexDir(t *testing.T) { dir := t.TempDir() - blocksDir := filepath.Join(dir, "blocks") - require.NoError(t, os.MkdirAll(blocksDir, 0700)) + bd := filepath.Join(dir, blocksDir) + require.NoError(t, os.MkdirAll(bd, 0700)) // Create a non-hex directory and a regular file — both should be skipped. 
- require.NoError(t, os.Mkdir(filepath.Join(blocksDir, "not-valid-hex"), 0700)) - require.NoError(t, os.WriteFile(filepath.Join(blocksDir, "stray-file.txt"), []byte("hi"), 0600)) + require.NoError(t, os.Mkdir(filepath.Join(bd, "not-valid-hex"), 0700)) + require.NoError(t, os.WriteFile(filepath.Join(bd, "stray-file.txt"), []byte("hi"), 0600)) bp, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -294,11 +262,11 @@ func TestNewBlockPersisterSkipsNonHexDir(t *testing.T) { func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { dir := t.TempDir() - blocksDir := filepath.Join(dir, "blocks") - require.NoError(t, os.MkdirAll(blocksDir, 0700)) + bd := filepath.Join(dir, blocksDir) + require.NoError(t, os.MkdirAll(bd, 0700)) // Valid hex but too short to be a valid ed25519 public key. - require.NoError(t, os.Mkdir(filepath.Join(blocksDir, "abcd"), 0700)) + require.NoError(t, os.Mkdir(filepath.Join(bd, "abcd"), 0700)) bp, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -306,6 +274,29 @@ func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { require.NoError(t, bp.Close()) } +func TestPersistBlockOutOfSequence(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key := types.GenSecretKey(rng) + bp, _, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) + + // Gap: skip block 1, try block 2. + err = bp.PersistBlock(testSignedProposal(rng, key, 2)) + require.Error(t, err) + require.Contains(t, err.Error(), "out of sequence") + + // Duplicate: try block 0 again. 
+ err = bp.PersistBlock(testSignedProposal(rng, key, 0)) + require.Error(t, err) + require.Contains(t, err.Error(), "out of sequence") + + require.NoError(t, bp.Close()) +} + func TestLoadAllDetectsBlockGap(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() @@ -314,7 +305,7 @@ func TestLoadAllDetectsBlockGap(t *testing.T) { // Write directly to a lane WAL, bypassing PersistBlock's contiguity check // to simulate on-disk corruption (block 0 then block 2, skipping 1). - ld := filepath.Join(dir, "blocks", laneDir(lane)) + ld := filepath.Join(dir, blocksDir, laneDir(lane)) require.NoError(t, os.MkdirAll(ld, 0700)) lw, err := newLaneWAL(ld) require.NoError(t, err) @@ -335,14 +326,14 @@ func TestLazyLaneCreation(t *testing.T) { require.NoError(t, err) // No lanes exist yet. - entries, _ := os.ReadDir(filepath.Join(dir, "blocks")) + entries, _ := os.ReadDir(filepath.Join(dir, blocksDir)) require.Equal(t, 0, len(entries)) // First persist for a lane creates its directory and WAL. key := types.GenSecretKey(rng) require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - entries, _ = os.ReadDir(filepath.Join(dir, "blocks")) + entries, _ = os.ReadDir(filepath.Join(dir, blocksDir)) require.Equal(t, 1, len(entries), "should have 1 lane directory") require.NoError(t, bp.Close()) } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 7dfbacc2ad..e47bf2ec00 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -8,6 +8,8 @@ import ( "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" ) +const commitqcsDir = "commitqcs" + // LoadedCommitQC is a CommitQC loaded from disk during state restoration. 
type LoadedCommitQC struct { Index types.RoadIndex @@ -33,7 +35,7 @@ func NewCommitQCPersister(stateDir utils.Option[string]) (*CommitQCPersister, [] if !ok { return &CommitQCPersister{}, nil, nil } - dir := filepath.Join(sd, "commitqcs") + dir := filepath.Join(sd, commitqcsDir) iw, err := openIndexedWAL(dir, types.CommitQCConv) if err != nil { return nil, nil, fmt.Errorf("open commitqc WAL in %s: %w", dir, err) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index cf01f7813a..2f825bb991 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -68,7 +68,7 @@ func TestNewCommitQCPersisterEmptyDir(t *testing.T) { require.Equal(t, 0, len(loaded)) require.Equal(t, types.RoadIndex(0), cp.LoadNext()) - fi, err := os.Stat(filepath.Join(dir, "commitqcs")) + fi, err := os.Stat(filepath.Join(dir, commitqcsDir)) require.NoError(t, err) require.True(t, fi.IsDir()) require.NoError(t, cp.Close()) @@ -197,7 +197,7 @@ func TestLoadAllDetectsCommitQCGap(t *testing.T) { // Write directly to the WAL, bypassing PersistCommitQC's contiguity // check to simulate on-disk corruption (index 0 then index 2, skipping 1). - walDir := filepath.Join(dir, "commitqcs") + walDir := filepath.Join(dir, commitqcsDir) require.NoError(t, os.MkdirAll(walDir, 0700)) iw, err := openIndexedWAL(walDir, types.CommitQCConv) require.NoError(t, err) From 3295dab972fb660ea922db801a33add40f8ddf1c Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 15:22:32 -0700 Subject: [PATCH 20/31] persist: handle anchor past all persisted entries, remove ResetNext When a node restarts after being offline for a long time, the prune anchor may have advanced past all locally persisted WAL entries. - Add indexedWAL.Reset() to close, remove, and reopen a fresh WAL. 
- DeleteBefore in both blocks.go and commitqcs.go now calls Reset() when the prune point is at or past the last persisted entry. - CommitQCPersister.DeleteBefore advances the write cursor (cp.next) in the reset branch, making ResetNext unnecessary. - PersistCommitQC now silently ignores duplicates (idx < next) so startup can idempotently re-persist in-memory entries after a reset. - Remove ResetNext; replace call sites with a re-persist loop at startup and rely on DeleteBefore's cursor management at runtime. - Reorder runPersist: prune before writes (WAL needs contiguous indices). - Update runPersist godoc to match new step ordering. - Add tests for Reset, DeleteBefore-past-all, duplicate no-op, and an integration test for NewState with anchor past all persisted QCs. Made-with: Cursor --- .../internal/autobahn/avail/state.go | 52 ++++++++++++------- .../internal/autobahn/avail/state_test.go | 45 ++++++++++++++++ .../autobahn/consensus/persist/blocks.go | 7 +++ .../autobahn/consensus/persist/blocks_test.go | 29 ++++++++++- .../autobahn/consensus/persist/commitqcs.go | 29 ++++++----- .../consensus/persist/commitqcs_test.go | 46 +++++++++++----- .../autobahn/consensus/persist/wal.go | 26 ++++++++-- .../autobahn/consensus/persist/wal_test.go | 38 ++++++++++++++ 8 files changed, 225 insertions(+), 47 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state.go b/sei-tendermint/internal/autobahn/avail/state.go index 139503a7c8..9b2635c982 100644 --- a/sei-tendermint/internal/autobahn/avail/state.go +++ b/sei-tendermint/internal/autobahn/avail/state.go @@ -158,10 +158,8 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin } // Truncate WAL entries below the prune anchor that were filtered out by - // loadPersistedState. Also reset the CommitQC persister's cursor to - // match the post-prune range. - // Must include all current committee members: DeleteBefore removes - // lane WALs not present in the map. + // loadPersistedState. 
Must include all current committee members: + // DeleteBefore removes lane WALs not present in the map. laneFirsts := make(map[types.LaneID]types.BlockNumber, len(inner.blocks)) for lane, q := range inner.blocks { laneFirsts[lane] = q.first @@ -172,7 +170,15 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin if err := pers.commitQCs.DeleteBefore(inner.commitQCs.first); err != nil { return nil, fmt.Errorf("prune stale commitQC WAL entries: %w", err) } - pers.commitQCs.ResetNext(inner.commitQCs.next) + // Re-persist in-memory CommitQCs. In the normal case every entry is + // already on disk (idx < cp.next) so PersistCommitQC is a no-op. After + // a WAL reset (anchor advanced past all entries) this writes back the + // anchor's CommitQC that was lost when the WAL was cleared. + for idx := inner.commitQCs.first; idx < inner.commitQCs.next; idx++ { + if err := pers.commitQCs.PersistCommitQC(inner.commitQCs.q[idx]); err != nil { + return nil, fmt.Errorf("re-persist commitqc %d: %w", idx, err) + } + } return &State{ key: key, @@ -634,10 +640,14 @@ func (s *State) Run(ctx context.Context) error { // runPersist is the main loop for the persist goroutine. // Write order: // 1. Prune anchor (AppQC + CommitQC pair) — the crash-recovery watermark. -// 2. CommitQCs in order, then publish LastCommitQC immediately +// 2. Prune old blocks and CommitQCs (safe because the anchor is already durable). +// 3. CommitQCs in order, then publish LastCommitQC immediately // so consensus can advance without waiting for block writes. -// 3. Blocks per lane in order, markBlockPersisted after each. -// 4. Prune old blocks and CommitQCs. +// 4. Blocks per lane in order, markBlockPersisted after each. +// +// Pruning (step 2) must happen before writes (steps 3-4) because the WAL +// requires contiguous indices — if the anchor advanced past all persisted +// entries, DeleteBefore resets the WAL so new writes start clean. 
// // The prune anchor is a pruning watermark: on restart we resume from it. // @@ -664,8 +674,20 @@ func (s *State) runPersist(ctx context.Context, pers persisters) error { lastPersistedAppQCNext = anchor.CommitQC.Proposal().Index() + 1 } - // 2. Persist new CommitQCs, then publish immediately so consensus - // can advance without waiting for block writes or pruning. + // 2. Prune old data. Safe because the anchor (step 1) is already + // durable and embeds the CommitQC needed for crash recovery. + // Must happen before writes because the WAL requires contiguous + // indices — if the anchor advanced past all persisted entries, + // DeleteBefore resets the WAL so new writes start clean. + if err := pers.blocks.DeleteBefore(batch.laneFirsts); err != nil { + return fmt.Errorf("block deleteBefore: %w", err) + } + if err := pers.commitQCs.DeleteBefore(batch.commitQCFirst); err != nil { + return fmt.Errorf("commitqc deleteBefore: %w", err) + } + + // 3. Persist new CommitQCs, then publish immediately so consensus + // can advance without waiting for block writes. for _, qc := range batch.commitQCs { if err := pers.commitQCs.PersistCommitQC(qc); err != nil { return fmt.Errorf("persist commitqc %d: %w", qc.Index(), err) @@ -675,7 +697,7 @@ func (s *State) runPersist(ctx context.Context, pers persisters) error { s.markCommitQCsPersisted(batch.commitQCs[len(batch.commitQCs)-1]) } - // 3. Persist blocks (mark each individually for vote latency). + // 4. Persist blocks (mark each individually for vote latency). for _, proposal := range batch.blocks { h := proposal.Msg().Block().Header() if err := pers.blocks.PersistBlock(proposal); err != nil { @@ -683,14 +705,6 @@ func (s *State) runPersist(ctx context.Context, pers persisters) error { } s.markBlockPersisted(h.Lane(), h.BlockNumber()+1) } - - // 4. Prune old data. 
- if err := pers.blocks.DeleteBefore(batch.laneFirsts); err != nil { - return fmt.Errorf("block deleteBefore: %w", err) - } - if err := pers.commitQCs.DeleteBefore(batch.commitQCFirst); err != nil { - return fmt.Errorf("commitqc deleteBefore: %w", err) - } } } diff --git a/sei-tendermint/internal/autobahn/avail/state_test.go b/sei-tendermint/internal/autobahn/avail/state_test.go index 62f2a06ae8..f96f520bf8 100644 --- a/sei-tendermint/internal/autobahn/avail/state_test.go +++ b/sei-tendermint/internal/autobahn/avail/state_test.go @@ -662,6 +662,51 @@ func TestNewStateWithPersistence(t *testing.T) { require.NoError(t, cp.Close()) }) + t.Run("anchor past all persisted commitQCs resets WAL", func(t *testing.T) { + dir := t.TempDir() + ds := data.NewState(&data.Config{Committee: committee}, utils.None[data.BlockStore]()) + + // Build a chain of 10 CommitQCs (indices 0-9). + qcs := make([]*types.CommitQC, 10) + prev := utils.None[*types.CommitQC]() + for i := range qcs { + qcs[i] = makeCommitQC(rng, committee, keys, prev, nil, utils.None[*types.AppQC]()) + prev = utils.Some(qcs[i]) + } + + // Persist only indices 0-4 to the CommitQC WAL. + cp, _, err := persist.NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + for i := 0; i < 5; i++ { + require.NoError(t, cp.PersistCommitQC(qcs[i])) + } + require.NoError(t, cp.Close()) + + // Persist a prune anchor at index 9 — well past the persisted range. + appProposal := types.NewAppProposal(50, 9, types.GenAppHash(rng)) + appQC := types.NewAppQC(makeAppVotes(keys, appProposal)) + prunePers, _, err := persist.NewPersister[*pb.PersistedAvailPruneAnchor](utils.Some(dir), innerFile) + require.NoError(t, err) + require.NoError(t, prunePers.Persist(&pb.PersistedAvailPruneAnchor{ + AppQc: types.AppQCConv.Encode(appQC), + CommitQc: types.CommitQCConv.Encode(qcs[9]), + })) + + // NewState should succeed: DeleteBefore resets the stale WAL, + // then the re-persist loop writes the anchor's CommitQC back. 
+ state, err := NewState(keys[0], ds, utils.Some(dir)) + require.NoError(t, err) + + require.Equal(t, types.RoadIndex(9), state.FirstCommitQC()) + latest, ok := state.LastCommitQC().Load().Get() + require.True(t, ok) + require.NoError(t, utils.TestDiff(qcs[9], latest)) + + got, ok := state.LastAppQC().Get() + require.True(t, ok) + require.Equal(t, types.RoadIndex(9), got.Proposal().RoadIndex()) + }) + t.Run("corrupt AppQC data returns error", func(t *testing.T) { dir := t.TempDir() ds := data.NewState(&data.Config{Committee: committee}, utils.None[data.BlockStore]()) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 6ccdbc4e80..ec999b496a 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -168,6 +168,13 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu if !ok || first <= firstBN { continue } + if first >= lw.nextBlockNum { + // Anchor advanced past all persisted blocks for this lane. 
+ if err := lw.Reset(); err != nil { + return fmt.Errorf("reset lane %s WAL: %w", lane, err) + } + continue + } walIdx := lw.firstIdx + uint64(first-firstBN) if err := lw.TruncateBefore(walIdx, func(entry *types.Signed[*types.LaneProposal]) error { if got := entry.Msg().Block().Header().BlockNumber(); got != first { diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index 3c799ab006..5d5a1d66d8 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -147,7 +147,6 @@ func TestDeleteBeforeAndRestart(t *testing.T) { require.Equal(t, types.BlockNumber(3), blocks2[lane2][3].Number) } - func TestNoOpBlockPersister(t *testing.T) { bp, blocks, err := NewBlockPersister(utils.None[string]()) require.NoError(t, err) @@ -187,6 +186,34 @@ func TestDeleteBeforeThenPersistMore(t *testing.T) { require.Equal(t, types.BlockNumber(5), blocks[lane][2].Number) } +func TestDeleteBeforePastAllBlocks(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key := types.GenSecretKey(rng) + lane := key.Public() + bp, _, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + + for i := types.BlockNumber(0); i < 3; i++ { + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, i))) + } + + // Anchor advanced past everything (nextBlockNum is 3, first=10). + require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 10})) + + // Lane WAL is now empty; new writes starting from 10 should work. + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 10))) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 11))) + require.NoError(t, bp.Close()) + + // Reopen — should see only the post-reset blocks. 
+ _, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 2, len(blocks[lane])) + require.Equal(t, types.BlockNumber(10), blocks[lane][0].Number) + require.Equal(t, types.BlockNumber(11), blocks[lane][1].Number) +} func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { rng := utils.TestRng() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index e47bf2ec00..b15c4a0d9a 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -59,20 +59,17 @@ func (cp *CommitQCPersister) LoadNext() types.RoadIndex { return cp.next } -// ResetNext overrides the next-to-persist cursor. Called after newInner -// applies prune(), which may advance commitQCs.next beyond the raw loader's -// cursor. -func (cp *CommitQCPersister) ResetNext(idx types.RoadIndex) { - cp.next = idx -} - // PersistCommitQC writes a CommitQC to the WAL. -// CommitQCs must be persisted in strict sequential order; any gap or -// duplicate breaks the linear RoadIndex-to-WAL-index mapping that -// DeleteBefore relies on. +// Entries must be persisted in sequential order. Duplicates (idx < next) are +// silently ignored — this makes startup idempotent after DeleteBefore resets +// the WAL. Gaps (idx > next) return an error because they break the linear +// RoadIndex-to-WAL-index mapping that DeleteBefore relies on. func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { idx := qc.Index() - if idx != cp.next { + if idx < cp.next { + return nil + } + if idx > cp.next { return fmt.Errorf("commitqc %d out of sequence (next=%d)", idx, cp.next) } if iw, ok := cp.iw.Get(); ok { @@ -85,7 +82,9 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { } // DeleteBefore removes persisted CommitQCs with road index below idx -// by truncating the front of the WAL. 
+// by truncating the front of the WAL. When idx is at or past every +// persisted entry, the WAL is reset and the write cursor is advanced +// to idx so subsequent PersistCommitQC calls start from the right place. // The mapping from RoadIndex to WAL index is linear: entries are written // sequentially, so WAL index = firstIdx + (roadIndex - firstRoadIndex). func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { @@ -93,6 +92,12 @@ func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { if !ok || idx == 0 || iw.Count() == 0 { return nil } + if idx >= cp.next { + // Anchor advanced past all persisted entries; discard everything + // and advance the cursor so the next PersistCommitQC starts at idx. + cp.next = idx + return iw.Reset() + } firstRoadIndex := cp.next - types.RoadIndex(iw.Count()) if idx <= firstRoadIndex { return nil diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index 2f825bb991..31d1e5bb05 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -144,14 +144,7 @@ func TestCommitQCDeleteBeforeZero(t *testing.T) { require.Equal(t, 2, len(loaded)) } -func TestCommitQCResetNext(t *testing.T) { - cp := &CommitQCPersister{} - require.Equal(t, types.RoadIndex(0), cp.LoadNext()) - cp.ResetNext(5) - require.Equal(t, types.RoadIndex(5), cp.LoadNext()) -} - -func TestCommitQCPersistInOrder(t *testing.T) { +func TestCommitQCPersistDuplicateIsNoOp(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) dir := t.TempDir() @@ -162,10 +155,9 @@ func TestCommitQCPersistInOrder(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qcs[0])) require.NoError(t, cp.PersistCommitQC(qcs[1])) - // Persisting qcs[0] again should fail (duplicate). 
- err = cp.PersistCommitQC(qcs[0]) - require.Error(t, err) - require.Contains(t, err.Error(), "out of sequence") + // Persisting qcs[0] again is a no-op (idx < next). + require.NoError(t, cp.PersistCommitQC(qcs[0])) + require.Equal(t, types.RoadIndex(2), cp.LoadNext()) require.NoError(t, cp.Close()) } @@ -225,6 +217,36 @@ func TestNoOpCommitQCPersister(t *testing.T) { require.NoError(t, cp.Close()) } +func TestCommitQCDeleteBeforePastAll(t *testing.T) { + rng := utils.TestRng() + committee, keys := types.GenCommittee(rng, 4) + dir := t.TempDir() + + qcs := makeSequentialCommitQCs(rng, committee, keys, 3) + cp, _, err := NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + for _, qc := range qcs { + require.NoError(t, cp.PersistCommitQC(qc)) + } + // cp.next is 3; prune past everything. DeleteBefore advances the cursor + // to 10 and resets the WAL. + require.NoError(t, cp.DeleteBefore(10)) + require.Equal(t, types.RoadIndex(10), cp.LoadNext()) + + // New writes starting from 10 should work. + moreQCs := makeSequentialCommitQCs(rng, committee, keys, 12) + require.NoError(t, cp.PersistCommitQC(moreQCs[10])) + require.NoError(t, cp.PersistCommitQC(moreQCs[11])) + require.NoError(t, cp.Close()) + + // Reopen — should see only the post-reset entries. 
+ _, loaded, err := NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 2, len(loaded)) + require.Equal(t, types.RoadIndex(10), loaded[0].Index) + require.Equal(t, types.RoadIndex(11), loaded[1].Index) +} + func TestCommitQCDeleteBeforeThenPersistMore(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index 41ed399876..93038a8bae 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -20,8 +20,10 @@ type codec[T any] interface { // indices via Count() and firstIdx. Not safe for concurrent use. type indexedWAL[T any] struct { wal *dbwal.WAL[T] - firstIdx uint64 // WAL index of the oldest entry; 0 when empty - nextIdx uint64 // WAL index that the next Write will be assigned + dir string // directory passed to openIndexedWAL; kept for Reset + codec codec[T] // codec passed to openIndexedWAL; kept for Reset + firstIdx uint64 // WAL index of the oldest entry; 0 when empty + nextIdx uint64 // WAL index that the next Write will be assigned } // openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched, @@ -47,7 +49,7 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { return nil, err } // tidwall/wal uses 1-based indexing; overwritten below if the WAL has data. - iw := &indexedWAL[T]{wal: w, nextIdx: 1} + iw := &indexedWAL[T]{wal: w, dir: dir, codec: codec, nextIdx: 1} first, err := w.FirstOffset() if err != nil { _ = w.Close() @@ -121,6 +123,24 @@ func (w *indexedWAL[T]) Count() uint64 { return w.nextIdx - w.firstIdx } +// Reset closes the WAL, removes its directory, and reopens a fresh empty WAL. +// Used when all entries are stale (e.g. the prune anchor advanced past +// everything persisted). 
+func (w *indexedWAL[T]) Reset() error { + if err := w.wal.Close(); err != nil { + return fmt.Errorf("close WAL for reset: %w", err) + } + if err := os.RemoveAll(w.dir); err != nil { + return fmt.Errorf("remove WAL dir %s for reset: %w", w.dir, err) + } + fresh, err := openIndexedWAL(w.dir, w.codec) + if err != nil { + return fmt.Errorf("reopen WAL after reset: %w", err) + } + *w = *fresh + return nil +} + // Close shuts down the underlying WAL. func (w *indexedWAL[T]) Close() error { return w.wal.Close() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index b788b7624d..2ec889ca5f 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -168,6 +168,44 @@ func TestIndexedWAL_TruncateBeforeVerifiesEntry(t *testing.T) { require.NoError(t, iw.Close()) } +func TestIndexedWAL_Reset(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.NoError(t, iw.Write("c")) + require.Equal(t, uint64(3), iw.Count()) + + require.NoError(t, iw.Reset()) + require.Equal(t, uint64(0), iw.Count()) + require.Equal(t, uint64(0), iw.firstIdx) + require.Equal(t, uint64(1), iw.nextIdx) + + // Can write fresh entries after reset. + require.NoError(t, iw.Write("x")) + require.Equal(t, uint64(1), iw.Count()) + require.Equal(t, uint64(1), iw.firstIdx) + + entries, err := iw.ReadAll() + require.NoError(t, err) + require.Equal(t, 1, len(entries)) + require.Equal(t, "x", entries[0]) + + require.NoError(t, iw.Close()) + + // Reopen — should see only the post-reset entry. 
+ iw2, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.Equal(t, uint64(1), iw2.Count()) + entries, err = iw2.ReadAll() + require.NoError(t, err) + require.Equal(t, 1, len(entries)) + require.Equal(t, "x", entries[0]) + require.NoError(t, iw2.Close()) +} + func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { dir := t.TempDir() From edc8383797f59263b71f2e4146007137f6f3ffd5 Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 15:45:07 -0700 Subject: [PATCH 21/31] persist: add TODO for WAL Clear/TruncateAll method Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/wal.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index 93038a8bae..aa4f4d1231 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -126,6 +126,10 @@ func (w *indexedWAL[T]) Count() uint64 { // Reset closes the WAL, removes its directory, and reopens a fresh empty WAL. // Used when all entries are stale (e.g. the prune anchor advanced past // everything persisted). +// +// TODO: sei-db/wal doesn't expose tidwall/wal's AllowEmpty option, so there's +// no way to truncate all entries in-place. If the WAL library gains a Clear() +// or TruncateAll() method, this can be simplified to a single call. func (w *indexedWAL[T]) Reset() error { if err := w.wal.Close(); err != nil { return fmt.Errorf("close WAL for reset: %w", err) From a29473756bedac55924e96553e9cc601ef008243 Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 16:07:35 -0700 Subject: [PATCH 22/31] persist: fix crash-recovery bug in CommitQC DeleteBefore with empty WAL When the WAL was reset (anchor past all entries) and the process crashed before writing new entries, restart would find an empty WAL with cp.next=0. 
The old guard order (count==0 early return before cursor check) prevented DeleteBefore from advancing cp.next, causing the subsequent PersistCommitQC to fail with "out of sequence". Fix: check idx >= cp.next before the count==0 guard so the cursor is always advanced, even on an already-empty WAL. Add TestCommitQCDeleteBeforePastAllCrashRecovery. Made-with: Cursor --- .../autobahn/consensus/persist/commitqcs.go | 12 ++++-- .../consensus/persist/commitqcs_test.go | 42 +++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index b15c4a0d9a..aaa36f0859 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -89,15 +89,21 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { // sequentially, so WAL index = firstIdx + (roadIndex - firstRoadIndex). func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { iw, ok := cp.iw.Get() - if !ok || idx == 0 || iw.Count() == 0 { + if !ok || idx == 0 { return nil } if idx >= cp.next { - // Anchor advanced past all persisted entries; discard everything - // and advance the cursor so the next PersistCommitQC starts at idx. + // Anchor advanced past all persisted entries; advance the cursor + // so the next PersistCommitQC starts at idx. cp.next = idx + if iw.Count() == 0 { + return nil // already empty (e.g. 
crash after a previous Reset) + } return iw.Reset() } + if iw.Count() == 0 { + return nil + } firstRoadIndex := cp.next - types.RoadIndex(iw.Count()) if idx <= firstRoadIndex { return nil diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index 31d1e5bb05..db0cf90599 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -247,6 +247,48 @@ func TestCommitQCDeleteBeforePastAll(t *testing.T) { require.Equal(t, types.RoadIndex(11), loaded[1].Index) } +// TestCommitQCDeleteBeforePastAllCrashRecovery simulates a crash between WAL +// reset and new write: on restart the WAL is empty but the anchor is far ahead. +// DeleteBefore must still advance the cursor so PersistCommitQC succeeds. +func TestCommitQCDeleteBeforePastAllCrashRecovery(t *testing.T) { + rng := utils.TestRng() + committee, keys := types.GenCommittee(rng, 4) + dir := t.TempDir() + + qcs := makeSequentialCommitQCs(rng, committee, keys, 3) + cp, _, err := NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + for _, qc := range qcs { + require.NoError(t, cp.PersistCommitQC(qc)) + } + + // DeleteBefore resets the WAL (past all), then "crash" before writing. + require.NoError(t, cp.DeleteBefore(10)) + require.NoError(t, cp.Close()) // simulate crash — no new QCs written + + // Restart: WAL is empty, cp.next will be 0. + cp2, loaded, err := NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + require.Empty(t, loaded) + require.Equal(t, types.RoadIndex(0), cp2.LoadNext()) + + // Second DeleteBefore on the empty WAL must advance the cursor. + require.NoError(t, cp2.DeleteBefore(10)) + require.Equal(t, types.RoadIndex(10), cp2.LoadNext()) + + // Writing from index 10 should now succeed. 
+ moreQCs := makeSequentialCommitQCs(rng, committee, keys, 12) + require.NoError(t, cp2.PersistCommitQC(moreQCs[10])) + require.NoError(t, cp2.PersistCommitQC(moreQCs[11])) + require.NoError(t, cp2.Close()) + + _, loaded, err = NewCommitQCPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 2, len(loaded)) + require.Equal(t, types.RoadIndex(10), loaded[0].Index) + require.Equal(t, types.RoadIndex(11), loaded[1].Index) +} + func TestCommitQCDeleteBeforeThenPersistMore(t *testing.T) { rng := utils.TestRng() committee, keys := types.GenCommittee(rng, 4) From c907f70422c2895f19b30dbf7a23c2c67e7fc65a Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 16:28:27 -0700 Subject: [PATCH 23/31] persist: simplify startup re-persist to only anchor's CommitQC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The loop over all in-memory CommitQCs was unnecessary — entries loaded from the WAL are guaranteed to survive DeleteBefore (it only removes entries below the anchor). Only the anchor's CommitQC could be missing after a WAL reset, so persist just that one entry. Made-with: Cursor --- sei-tendermint/internal/autobahn/avail/state.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state.go b/sei-tendermint/internal/autobahn/avail/state.go index 9b2635c982..a584be239d 100644 --- a/sei-tendermint/internal/autobahn/avail/state.go +++ b/sei-tendermint/internal/autobahn/avail/state.go @@ -170,13 +170,16 @@ func NewState(key types.SecretKey, data *data.State, stateDir utils.Option[strin if err := pers.commitQCs.DeleteBefore(inner.commitQCs.first); err != nil { return nil, fmt.Errorf("prune stale commitQC WAL entries: %w", err) } - // Re-persist in-memory CommitQCs. In the normal case every entry is - // already on disk (idx < cp.next) so PersistCommitQC is a no-op. 
After - // a WAL reset (anchor advanced past all entries) this writes back the - // anchor's CommitQC that was lost when the WAL was cleared. - for idx := inner.commitQCs.first; idx < inner.commitQCs.next; idx++ { - if err := pers.commitQCs.PersistCommitQC(inner.commitQCs.q[idx]); err != nil { - return nil, fmt.Errorf("re-persist commitqc %d: %w", idx, err) + // After a WAL reset (anchor advanced past all entries), write back the + // anchor's CommitQC that was lost when the WAL was cleared. In the + // normal case this is a no-op (idx < cp.next, already on disk). + // CommitQCs loaded from the WAL are guaranteed to still be on disk + // because DeleteBefore only removes entries below the anchor. + if ls, ok := loaded.Get(); ok { + if anchor, ok := ls.pruneAnchor.Get(); ok { + if err := pers.commitQCs.PersistCommitQC(anchor.CommitQC); err != nil { + return nil, fmt.Errorf("re-persist anchor commitqc: %w", err) + } } } From 422aa8f52cb81ea950617fa342e9dc4dca21bfe6 Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 10 Mar 2026 21:27:38 -0700 Subject: [PATCH 24/31] =?UTF-8?q?persist:=20review=20feedback=20=E2=80=94?= =?UTF-8?q?=20disable=20fsync,=20harden=20ReadAll,=20fix=20Close?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Disable WAL fsync; the prune anchor (A/B files with fsync) already provides the crash-recovery watermark. - Use Count() == 0 instead of firstIdx == 0 for emptiness checks in Write and ReadAll for robustness. - Add post-replay count check in ReadAll to detect silent data loss. - Use errors.Join in BlockPersister.Close to close all lane WALs. - Rename laneDir → lanePath to avoid shadowing the laneDir() function. - Add TestIndexedWAL_ReadAllDetectsStaleNextIdx. 
Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 14 +++++++----- .../autobahn/consensus/persist/wal.go | 19 +++++++++++----- .../autobahn/consensus/persist/wal_test.go | 22 +++++++++++++++++++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index ec999b496a..cb9e6cb546 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -2,6 +2,7 @@ package persist import ( "encoding/hex" + "errors" "fmt" "os" "path/filepath" @@ -97,17 +98,17 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type logger.Warn("skipping lane dir with invalid key", "name", e.Name(), "err", err) continue } - laneDir := filepath.Join(dir, e.Name()) - lw, err := newLaneWAL(laneDir) + lanePath := filepath.Join(dir, e.Name()) + lw, err := newLaneWAL(lanePath) if err != nil { _ = bp.Close() - return nil, nil, fmt.Errorf("open lane WAL in %s: %w", laneDir, err) + return nil, nil, fmt.Errorf("open lane WAL in %s: %w", lanePath, err) } loaded, err := lw.loadAll() if err != nil { _ = lw.Close() _ = bp.Close() - return nil, nil, fmt.Errorf("load lane WAL in %s: %w", laneDir, err) + return nil, nil, fmt.Errorf("load lane WAL in %s: %w", lanePath, err) } bp.lanes[lane] = lw if len(loaded) > 0 { @@ -201,12 +202,13 @@ func (bp *BlockPersister) Close() error { if _, ok := bp.dir.Get(); !ok { return nil } + var errs []error for _, lw := range bp.lanes { if err := lw.Close(); err != nil { - return err + errs = append(errs, err) } } - return nil + return errors.Join(errs...) } // loadAll reads all entries from the lane WAL and returns the loaded blocks. 
diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index aa4f4d1231..e94043fa47 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -26,9 +26,12 @@ type indexedWAL[T any] struct { nextIdx uint64 // WAL index that the next Write will be assigned } -// openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched, -// fsync-enabled writes. Initializes index tracking from the WAL's stored -// offsets so the caller can immediately Write, ReadAll, or TruncateBefore. +// openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched +// writes. Fsync is disabled because the prune anchor (persisted via A/B files +// with fsync) is the crash-recovery watermark — on power loss we restart from +// the anchor and re-sync any lost WAL entries from peers. +// Initializes index tracking from the WAL's stored offsets so the caller can +// immediately Write, ReadAll, or TruncateBefore. func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { if err := os.MkdirAll(dir, 0700); err != nil { return nil, fmt.Errorf("create dir %s: %w", dir, err) @@ -42,7 +45,6 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { dbwal.Config{ WriteBufferSize: 0, // synchronous writes WriteBatchSize: 1, // no batching - FsyncEnabled: true, }, ) if err != nil { @@ -73,7 +75,7 @@ func (w *indexedWAL[T]) Write(entry T) error { if err := w.wal.Write(entry); err != nil { return err } - if w.firstIdx == 0 { + if w.Count() == 0 { w.firstIdx = w.nextIdx } w.nextIdx++ @@ -101,7 +103,7 @@ func (w *indexedWAL[T]) TruncateBefore(walIdx uint64, verify func(T) error) erro // ReadAll returns all entries in the WAL. Returns nil if empty. 
func (w *indexedWAL[T]) ReadAll() ([]T, error) { - if w.firstIdx == 0 { + if w.Count() == 0 { return nil, nil } entries := make([]T, 0, w.Count()) @@ -112,10 +114,15 @@ func (w *indexedWAL[T]) ReadAll() ([]T, error) { if err != nil { return nil, err } + if uint64(len(entries)) != w.Count() { + return nil, fmt.Errorf("WAL replay returned %d entries, expected %d (possible silent data loss)", len(entries), w.Count()) + } return entries, nil } // Count returns the number of entries in the WAL. +// firstIdx == 0 is the empty sentinel because tidwall/wal returns 0 for both +// FirstIndex and LastIndex on an empty log, and real indices start at 1. func (w *indexedWAL[T]) Count() uint64 { if w.firstIdx == 0 { return 0 diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 2ec889ca5f..748b8a096c 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -206,6 +206,28 @@ func TestIndexedWAL_Reset(t *testing.T) { require.NoError(t, iw2.Close()) } +func TestIndexedWAL_ReadAllDetectsStaleNextIdx(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Write("b")) + require.Equal(t, uint64(2), iw.Count()) + + // Simulate stale internal state: advance nextIdx so Count() reports more + // entries than the WAL actually contains. ReadAll must return an error + // (either from Replay failing to read the missing entry, or from the + // post-replay count check). 
+ iw.nextIdx++ + + _, err = iw.ReadAll() + require.Error(t, err) + + iw.nextIdx-- + require.NoError(t, iw.Close()) +} + func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { dir := t.TempDir() From cd09aafc2949ef8b580f77c752f789a41c88b272 Mon Sep 17 00:00:00 2001 From: Wen Date: Wed, 11 Mar 2026 08:18:41 -0700 Subject: [PATCH 25/31] persist: use TruncateAll instead of close-remove-reopen for Reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that sei-db/wal exposes AllowEmpty and TruncateAll (#3049), use them to clear a WAL in-place instead of the heavier close → remove directory → reopen pattern. - Enable AllowEmpty in WAL config. - Replace Reset() with TruncateAll() — single call, no dir removal. - Remove dir/codec fields from indexedWAL (only needed for reopen). - Eliminate firstIdx == 0 sentinel: Count() is now just nextIdx - firstIdx, empty when equal. Write() no longer needs the first-write bookkeeping branch. - Update openIndexedWAL to handle AllowEmpty's empty-log reporting (first > last) uniformly with the non-empty case. Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 4 +- .../autobahn/consensus/persist/commitqcs.go | 8 +-- .../autobahn/consensus/persist/wal.go | 53 +++++++------------ .../autobahn/consensus/persist/wal_test.go | 21 ++++---- 4 files changed, 36 insertions(+), 50 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index cb9e6cb546..d00ab4f004 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -171,8 +171,8 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu } if first >= lw.nextBlockNum { // Anchor advanced past all persisted blocks for this lane. 
- if err := lw.Reset(); err != nil { - return fmt.Errorf("reset lane %s WAL: %w", lane, err) + if err := lw.TruncateAll(); err != nil { + return fmt.Errorf("truncate all lane %s WAL: %w", lane, err) } continue } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index aaa36f0859..5897b609b5 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -61,7 +61,7 @@ func (cp *CommitQCPersister) LoadNext() types.RoadIndex { // PersistCommitQC writes a CommitQC to the WAL. // Entries must be persisted in sequential order. Duplicates (idx < next) are -// silently ignored — this makes startup idempotent after DeleteBefore resets +// silently ignored — this makes startup idempotent after DeleteBefore truncates // the WAL. Gaps (idx > next) return an error because they break the linear // RoadIndex-to-WAL-index mapping that DeleteBefore relies on. func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { @@ -83,7 +83,7 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { // DeleteBefore removes persisted CommitQCs with road index below idx // by truncating the front of the WAL. When idx is at or past every -// persisted entry, the WAL is reset and the write cursor is advanced +// persisted entry, the WAL is truncated and the write cursor is advanced // to idx so subsequent PersistCommitQC calls start from the right place. // The mapping from RoadIndex to WAL index is linear: entries are written // sequentially, so WAL index = firstIdx + (roadIndex - firstRoadIndex). @@ -97,9 +97,9 @@ func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { // so the next PersistCommitQC starts at idx. cp.next = idx if iw.Count() == 0 { - return nil // already empty (e.g. crash after a previous Reset) + return nil // already empty (e.g. 
crash after a previous TruncateAll) } - return iw.Reset() + return iw.TruncateAll() } if iw.Count() == 0 { return nil diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index e94043fa47..10bf07e42b 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -20,10 +20,8 @@ type codec[T any] interface { // indices via Count() and firstIdx. Not safe for concurrent use. type indexedWAL[T any] struct { wal *dbwal.WAL[T] - dir string // directory passed to openIndexedWAL; kept for Reset - codec codec[T] // codec passed to openIndexedWAL; kept for Reset - firstIdx uint64 // WAL index of the oldest entry; 0 when empty - nextIdx uint64 // WAL index that the next Write will be assigned + firstIdx uint64 // WAL index of the oldest entry; == nextIdx when empty + nextIdx uint64 // WAL index that the next Write will be assigned } // openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched @@ -45,13 +43,13 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { dbwal.Config{ WriteBufferSize: 0, // synchronous writes WriteBatchSize: 1, // no batching + AllowEmpty: true, }, ) if err != nil { return nil, err } - // tidwall/wal uses 1-based indexing; overwritten below if the WAL has data. - iw := &indexedWAL[T]{wal: w, dir: dir, codec: codec, nextIdx: 1} + iw := &indexedWAL[T]{wal: w} first, err := w.FirstOffset() if err != nil { _ = w.Close() @@ -62,22 +60,20 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { _ = w.Close() return nil, fmt.Errorf("last offset: %w", err) } - if first != 0 || last != 0 { - iw.firstIdx = first - iw.nextIdx = last + 1 - } + // With AllowEmpty, an empty log reports first > last (e.g. first=1, last=0 + // for a brand-new log, or first=N+1, last=N after TruncateAll). 
A non-empty + // log always has first <= last with first >= 1. In both cases, setting + // firstIdx = first and nextIdx = last + 1 yields Count() == 0 when empty. + iw.firstIdx = first + iw.nextIdx = last + 1 return iw, nil } // Write appends entry to the WAL, advancing nextIdx. -// On the first write, also records firstIdx. func (w *indexedWAL[T]) Write(entry T) error { if err := w.wal.Write(entry); err != nil { return err } - if w.Count() == 0 { - w.firstIdx = w.nextIdx - } w.nextIdx++ return nil } @@ -121,34 +117,21 @@ func (w *indexedWAL[T]) ReadAll() ([]T, error) { } // Count returns the number of entries in the WAL. -// firstIdx == 0 is the empty sentinel because tidwall/wal returns 0 for both -// FirstIndex and LastIndex on an empty log, and real indices start at 1. +// Empty when firstIdx == nextIdx (both after fresh open and after TruncateAll). func (w *indexedWAL[T]) Count() uint64 { - if w.firstIdx == 0 { - return 0 - } return w.nextIdx - w.firstIdx } -// Reset closes the WAL, removes its directory, and reopens a fresh empty WAL. +// TruncateAll removes all entries from the WAL, leaving it empty for new writes. +// The underlying index counter is preserved (next Write continues from where +// it left off); firstIdx is advanced to nextIdx so Count() == 0. // Used when all entries are stale (e.g. the prune anchor advanced past // everything persisted). -// -// TODO: sei-db/wal doesn't expose tidwall/wal's AllowEmpty option, so there's -// no way to truncate all entries in-place. If the WAL library gains a Clear() -// or TruncateAll() method, this can be simplified to a single call. 
-func (w *indexedWAL[T]) Reset() error { - if err := w.wal.Close(); err != nil { - return fmt.Errorf("close WAL for reset: %w", err) - } - if err := os.RemoveAll(w.dir); err != nil { - return fmt.Errorf("remove WAL dir %s for reset: %w", w.dir, err) - } - fresh, err := openIndexedWAL(w.dir, w.codec) - if err != nil { - return fmt.Errorf("reopen WAL after reset: %w", err) +func (w *indexedWAL[T]) TruncateAll() error { + if err := w.wal.TruncateAll(); err != nil { + return fmt.Errorf("truncate all WAL entries: %w", err) } - *w = *fresh + w.firstIdx = w.nextIdx return nil } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 748b8a096c..81793170c4 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -21,8 +21,7 @@ func TestIndexedWAL_EmptyStart(t *testing.T) { require.NoError(t, err) require.Equal(t, uint64(0), iw.Count()) - require.Equal(t, uint64(0), iw.firstIdx) - require.Equal(t, uint64(1), iw.nextIdx) + require.Equal(t, iw.firstIdx, iw.nextIdx) // empty: firstIdx == nextIdx entries, err := iw.ReadAll() require.NoError(t, err) @@ -168,7 +167,7 @@ func TestIndexedWAL_TruncateBeforeVerifiesEntry(t *testing.T) { require.NoError(t, iw.Close()) } -func TestIndexedWAL_Reset(t *testing.T) { +func TestIndexedWAL_TruncateAll(t *testing.T) { dir := t.TempDir() iw, err := openIndexedWAL(dir, stringCodec{}) require.NoError(t, err) @@ -177,16 +176,18 @@ func TestIndexedWAL_Reset(t *testing.T) { require.NoError(t, iw.Write("b")) require.NoError(t, iw.Write("c")) require.Equal(t, uint64(3), iw.Count()) + require.Equal(t, uint64(4), iw.nextIdx) - require.NoError(t, iw.Reset()) + require.NoError(t, iw.TruncateAll()) require.Equal(t, uint64(0), iw.Count()) - require.Equal(t, uint64(0), iw.firstIdx) - require.Equal(t, uint64(1), iw.nextIdx) + require.Equal(t, uint64(4), iw.firstIdx) // advanced 
to nextIdx + require.Equal(t, uint64(4), iw.nextIdx) // index counter preserved - // Can write fresh entries after reset. + // Can write fresh entries after TruncateAll; indices continue. require.NoError(t, iw.Write("x")) require.Equal(t, uint64(1), iw.Count()) - require.Equal(t, uint64(1), iw.firstIdx) + require.Equal(t, uint64(4), iw.firstIdx) + require.Equal(t, uint64(5), iw.nextIdx) entries, err := iw.ReadAll() require.NoError(t, err) @@ -195,10 +196,12 @@ func TestIndexedWAL_Reset(t *testing.T) { require.NoError(t, iw.Close()) - // Reopen — should see only the post-reset entry. + // Reopen — should see only the post-TruncateAll entry. iw2, err := openIndexedWAL(dir, stringCodec{}) require.NoError(t, err) require.Equal(t, uint64(1), iw2.Count()) + require.Equal(t, uint64(4), iw2.firstIdx) + require.Equal(t, uint64(5), iw2.nextIdx) entries, err = iw2.ReadAll() require.NoError(t, err) require.Equal(t, 1, len(entries)) From 1b3e3c169edd9eb31af2b76a251e520eb4f05b2f Mon Sep 17 00:00:00 2001 From: Wen Date: Wed, 11 Mar 2026 08:23:19 -0700 Subject: [PATCH 26/31] fix stale "reset" references in test comments to say "TruncateAll"/"truncates" Made-with: Cursor --- sei-tendermint/internal/autobahn/avail/state_test.go | 4 ++-- .../internal/autobahn/consensus/persist/blocks_test.go | 2 +- .../internal/autobahn/consensus/persist/commitqcs_test.go | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state_test.go b/sei-tendermint/internal/autobahn/avail/state_test.go index f96f520bf8..f4acffa0be 100644 --- a/sei-tendermint/internal/autobahn/avail/state_test.go +++ b/sei-tendermint/internal/autobahn/avail/state_test.go @@ -662,7 +662,7 @@ func TestNewStateWithPersistence(t *testing.T) { require.NoError(t, cp.Close()) }) - t.Run("anchor past all persisted commitQCs resets WAL", func(t *testing.T) { + t.Run("anchor past all persisted commitQCs truncates WAL", func(t *testing.T) { dir := t.TempDir() ds := 
data.NewState(&data.Config{Committee: committee}, utils.None[data.BlockStore]()) @@ -692,7 +692,7 @@ func TestNewStateWithPersistence(t *testing.T) { CommitQc: types.CommitQCConv.Encode(qcs[9]), })) - // NewState should succeed: DeleteBefore resets the stale WAL, + // NewState should succeed: DeleteBefore truncates the stale WAL, // then the re-persist loop writes the anchor's CommitQC back. state, err := NewState(keys[0], ds, utils.Some(dir)) require.NoError(t, err) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index 5d5a1d66d8..b24d39c87a 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -207,7 +207,7 @@ func TestDeleteBeforePastAllBlocks(t *testing.T) { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 11))) require.NoError(t, bp.Close()) - // Reopen — should see only the post-reset blocks. + // Reopen — should see only the post-TruncateAll blocks. _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) require.Equal(t, 2, len(blocks[lane])) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index db0cf90599..64d9bbfa93 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -229,7 +229,7 @@ func TestCommitQCDeleteBeforePastAll(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qc)) } // cp.next is 3; prune past everything. DeleteBefore advances the cursor - // to 10 and resets the WAL. + // to 10 and truncates the WAL. 
require.NoError(t, cp.DeleteBefore(10)) require.Equal(t, types.RoadIndex(10), cp.LoadNext()) @@ -239,7 +239,7 @@ func TestCommitQCDeleteBeforePastAll(t *testing.T) { require.NoError(t, cp.PersistCommitQC(moreQCs[11])) require.NoError(t, cp.Close()) - // Reopen — should see only the post-reset entries. + // Reopen — should see only the post-TruncateAll entries. _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) require.Equal(t, 2, len(loaded)) @@ -248,7 +248,7 @@ func TestCommitQCDeleteBeforePastAll(t *testing.T) { } // TestCommitQCDeleteBeforePastAllCrashRecovery simulates a crash between WAL -// reset and new write: on restart the WAL is empty but the anchor is far ahead. +// TruncateAll and new write: on restart the WAL is empty but the anchor is far ahead. // DeleteBefore must still advance the cursor so PersistCommitQC succeeds. func TestCommitQCDeleteBeforePastAllCrashRecovery(t *testing.T) { rng := utils.TestRng() @@ -262,7 +262,7 @@ func TestCommitQCDeleteBeforePastAllCrashRecovery(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qc)) } - // DeleteBefore resets the WAL (past all), then "crash" before writing. + // DeleteBefore truncates the WAL (past all), then "crash" before writing. require.NoError(t, cp.DeleteBefore(10)) require.NoError(t, cp.Close()) // simulate crash — no new QCs written From bd94691c76390efb216ead05082342ddbf8db25d Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 12 Mar 2026 14:19:26 -0700 Subject: [PATCH 27/31] Remove logger arg from NewWAL call after upstream API change sei-db/wal.NewWAL no longer accepts a *slog.Logger parameter. 
Made-with: Cursor --- sei-tendermint/internal/autobahn/consensus/persist/wal.go | 1 - 1 file changed, 1 deletion(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index 10bf07e42b..798cbf0c20 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -38,7 +38,6 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { context.Background(), func(entry T) ([]byte, error) { return codec.Marshal(entry), nil }, codec.Unmarshal, - logger, dir, dbwal.Config{ WriteBufferSize: 0, // synchronous writes From 3310313031fe823e689118cff8a8442fbe652e78 Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 13 Mar 2026 11:02:04 -0700 Subject: [PATCH 28/31] Add time-based retention for stale lane WAL deletion Instead of immediately deleting lane WALs not in the current committee, retain them for 30 minutes (defaultStaleRetention) after the last write. This gives catching-up peers time to fetch blocks from lanes that have left the committee. 
- Add lastWriteTime to indexedWAL, initialized to time.Now() on open and updated on every Write() - Add staleRetention field to BlockPersister (default 30m) - DeleteBefore skips stale lane deletion when lastWriteTime is recent - Tests use staleRetention=0 for immediate deletion behavior Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 23 +++++-- .../autobahn/consensus/persist/blocks_test.go | 61 ++++++++++++++++++- .../autobahn/consensus/persist/wal.go | 16 ++++- .../autobahn/consensus/persist/wal_test.go | 53 ++++++++++++++++ 4 files changed, 144 insertions(+), 9 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index d00ab4f004..a6c5f96bf7 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "time" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" @@ -16,6 +17,11 @@ var logger = seilog.NewLogger("tendermint", "internal", "autobahn", "consensus", const blocksDir = "blocks" +// defaultStaleRetention is how long a lane WAL is kept after its last write +// before being deleted when the lane is no longer in the committee. This gives +// catching-up peers time to fetch blocks from the stale lane. +const defaultStaleRetention = 30 * time.Minute + // LoadedBlock is a block loaded from disk during state restoration. type LoadedBlock struct { Number types.BlockNumber @@ -45,8 +51,9 @@ func (lw *laneWAL) firstBlockNum() utils.Option[types.BlockNumber] { // other lanes' entries that follow it. // When dir is None, all disk I/O is skipped (no-op mode). 
type BlockPersister struct { - dir utils.Option[string] - lanes map[types.LaneID]*laneWAL + dir utils.Option[string] + lanes map[types.LaneID]*laneWAL + staleRetention time.Duration } func laneDir(lane types.LaneID) string { @@ -69,14 +76,14 @@ func newLaneWAL(dir string) (*laneWAL, error) { func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[types.LaneID][]LoadedBlock, error) { sd, ok := stateDir.Get() if !ok { - return &BlockPersister{lanes: map[types.LaneID]*laneWAL{}}, nil, nil + return &BlockPersister{lanes: map[types.LaneID]*laneWAL{}, staleRetention: defaultStaleRetention}, nil, nil } dir := filepath.Join(sd, blocksDir) if err := os.MkdirAll(dir, 0700); err != nil { return nil, nil, fmt.Errorf("create blocks dir %s: %w", dir, err) } - bp := &BlockPersister{dir: utils.Some(dir), lanes: map[types.LaneID]*laneWAL{}} + bp := &BlockPersister{dir: utils.Some(dir), lanes: map[types.LaneID]*laneWAL{}, staleRetention: defaultStaleRetention} entries, err := os.ReadDir(dir) if err != nil { @@ -151,7 +158,8 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa // WAL independently. For each lane in the map, blocks below the given // BlockNumber are removed. Lanes NOT in the map are considered stale // (validator no longer in committee): their WALs are closed and directories -// removed. +// removed after staleRetention has elapsed since the last write, giving +// catching-up peers time to fetch the data. func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNumber) error { dir, ok := bp.dir.Get() if !ok { @@ -190,6 +198,11 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu if _, ok := laneFirsts[lane]; ok { continue } + if time.Since(lw.LastWriteTime()) < bp.staleRetention { + // Wait longer before real deletion to give catching-up peers a chance. 
+ continue + } + logger.Info("removing stale lane WAL", "lane", lane, "lastWrite", lw.LastWriteTime()) _ = lw.Close() _ = os.RemoveAll(filepath.Join(dir, laneDir(lane))) delete(bp.lanes, lane) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index b24d39c87a..fc2fecd470 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -231,7 +231,10 @@ func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) } - // Only lane1 in laneFirsts — lane2 is stale and should be removed. + // Disable retention so stale lanes are removed immediately. + bp.staleRetention = 0 + + // Only lane1 in laneFirsts — lane2 is stale and past retention, should be removed. require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) require.NoError(t, bp.Close()) @@ -244,6 +247,62 @@ func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { require.Equal(t, 1, len(entries), "only lane1 directory remains") } +func TestDeleteBeforeRetainsRecentStaleLanes(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key1 := types.GenSecretKey(rng) + key2 := types.GenSecretKey(rng) + lane1 := key1.Public() + lane2 := key2.Public() + bp, _, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + + for i := types.BlockNumber(0); i < 3; i++ { + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key1, i))) + require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key2, i))) + } + + // lane2 was just written to (lastWriteTime is recent). + // DeleteBefore should NOT delete it even though it's not in laneFirsts. 
+ require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) + require.NoError(t, bp.Close()) + + _, blocks, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + require.Equal(t, 3, len(blocks[lane1]), "lane1: all blocks intact") + require.Equal(t, 3, len(blocks[lane2]), "lane2: retained because write was recent") + + entries, _ := os.ReadDir(filepath.Join(dir, blocksDir)) + require.Equal(t, 2, len(entries), "both lane directories remain") +} + +func TestDeleteBeforeRetainsEmptyStaleLaneAtStartup(t *testing.T) { + rng := utils.TestRng() + dir := t.TempDir() + + key1 := types.GenSecretKey(rng) + key2 := types.GenSecretKey(rng) + lane1 := key1.Public() + lane2 := key2.Public() + + // Create lane directories. lane2 is empty (e.g. just joined then left committee). + bd := filepath.Join(dir, blocksDir) + require.NoError(t, os.MkdirAll(filepath.Join(bd, laneDir(lane1)), 0700)) + require.NoError(t, os.MkdirAll(filepath.Join(bd, laneDir(lane2)), 0700)) + + // Open — both lanes discovered. lane2 has never been written to. + bp, _, err := NewBlockPersister(utils.Some(dir)) + require.NoError(t, err) + + // lane2 is not in laneFirsts but was just opened — retention should keep it. 
+ require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) + + entries, _ := os.ReadDir(bd) + require.Equal(t, 2, len(entries), "empty stale lane retained at startup") + require.NoError(t, bp.Close()) +} + func TestEmptyLaneWALSurvivesReopen(t *testing.T) { rng := utils.TestRng() dir := t.TempDir() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal.go b/sei-tendermint/internal/autobahn/consensus/persist/wal.go index 798cbf0c20..d2fd08adf7 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "time" dbwal "github.com/sei-protocol/sei-chain/sei-db/wal" ) @@ -19,9 +20,10 @@ type codec[T any] interface { // Callers map domain-specific indices (BlockNumber, RoadIndex) to WAL // indices via Count() and firstIdx. Not safe for concurrent use. type indexedWAL[T any] struct { - wal *dbwal.WAL[T] - firstIdx uint64 // WAL index of the oldest entry; == nextIdx when empty - nextIdx uint64 // WAL index that the next Write will be assigned + wal *dbwal.WAL[T] + firstIdx uint64 // WAL index of the oldest entry; == nextIdx when empty + nextIdx uint64 // WAL index that the next Write will be assigned + lastWriteTime time.Time // time of last Write(); initialized to open time } // openIndexedWAL creates (or opens) a WAL in dir with synchronous, unbatched @@ -65,6 +67,7 @@ func openIndexedWAL[T any](dir string, codec codec[T]) (*indexedWAL[T], error) { // firstIdx = first and nextIdx = last + 1 yields Count() == 0 when empty. 
iw.firstIdx = first iw.nextIdx = last + 1 + iw.lastWriteTime = time.Now() return iw, nil } @@ -74,6 +77,7 @@ func (w *indexedWAL[T]) Write(entry T) error { return err } w.nextIdx++ + w.lastWriteTime = time.Now() return nil } @@ -134,6 +138,12 @@ func (w *indexedWAL[T]) TruncateAll() error { return nil } +// LastWriteTime returns the time of the last successful Write(), or the time +// the WAL was opened if no writes have occurred since open. +func (w *indexedWAL[T]) LastWriteTime() time.Time { + return w.lastWriteTime +} + // Close shuts down the underlying WAL. func (w *indexedWAL[T]) Close() error { return w.wal.Close() diff --git a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go index 81793170c4..0e6d4c3c01 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/wal_test.go @@ -3,6 +3,7 @@ package persist import ( "fmt" "testing" + "time" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" ) @@ -260,3 +261,55 @@ func TestIndexedWAL_WriteAfterTruncate(t *testing.T) { require.NoError(t, iw.Close()) } + +func TestIndexedWAL_LastWriteTimeSetOnOpen(t *testing.T) { + before := time.Now() + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + after := time.Now() + + lwt := iw.LastWriteTime() + require.False(t, lwt.IsZero(), "empty WAL should have non-zero LastWriteTime from open") + require.False(t, lwt.Before(before)) + require.False(t, lwt.After(after)) + require.NoError(t, iw.Close()) +} + +func TestIndexedWAL_LastWriteTimeAdvancesOnWrite(t *testing.T) { + dir := t.TempDir() + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + + before := time.Now() + require.NoError(t, iw.Write("a")) + after := time.Now() + + lwt := iw.LastWriteTime() + require.False(t, lwt.IsZero()) + require.False(t, lwt.Before(before)) + require.False(t, 
lwt.After(after)) + + require.NoError(t, iw.Close()) +} + +func TestIndexedWAL_LastWriteTimeNonZeroOnReopenWithData(t *testing.T) { + dir := t.TempDir() + + iw, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + require.NoError(t, iw.Write("a")) + require.NoError(t, iw.Close()) + + before := time.Now() + iw2, err := openIndexedWAL(dir, stringCodec{}) + require.NoError(t, err) + after := time.Now() + + lwt := iw2.LastWriteTime() + require.False(t, lwt.IsZero(), "reopened WAL with data should have non-zero LastWriteTime") + require.False(t, lwt.Before(before)) + require.False(t, lwt.After(after)) + + require.NoError(t, iw2.Close()) +} From 963b3561283f23c9ca62db3f4bfe5c717cd349d7 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 15 Mar 2026 12:52:35 -0700 Subject: [PATCH 29/31] Derive commitQC WAL truncation point from anchor Move commitQC DeleteBefore inside the anchor-persist block and derive the truncation index directly from anchor.CommitQC rather than from the in-memory queue's first index. This makes the safety invariant explicit: we only truncate WAL entries that the on-disk anchor covers. Remove the now-unused commitQCFirst field from persistBatch. Made-with: Cursor --- .../internal/autobahn/avail/state.go | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/sei-tendermint/internal/autobahn/avail/state.go b/sei-tendermint/internal/autobahn/avail/state.go index a584be239d..a21b9ad0fe 100644 --- a/sei-tendermint/internal/autobahn/avail/state.go +++ b/sei-tendermint/internal/autobahn/avail/state.go @@ -669,25 +669,29 @@ func (s *State) runPersist(ctx context.Context, pers persisters) error { } // 1. Persist prune anchor first — establishes the crash-recovery watermark. + // CommitQC WAL pruning is co-located here so the truncation point + // is derived directly from the anchor, making the safety invariant + // explicit: we only truncate entries the on-disk anchor covers. 
if anchor, ok := batch.pruneAnchor.Get(); ok { if err := pers.pruneAnchor.Persist(PruneAnchorConv.Encode(anchor)); err != nil { return fmt.Errorf("persist prune anchor: %w", err) } s.advancePersistedBlockStart(anchor.CommitQC) lastPersistedAppQCNext = anchor.CommitQC.Proposal().Index() + 1 + + if err := pers.commitQCs.DeleteBefore(anchor.CommitQC.Proposal().Index()); err != nil { + return fmt.Errorf("commitqc deleteBefore: %w", err) + } } - // 2. Prune old data. Safe because the anchor (step 1) is already - // durable and embeds the CommitQC needed for crash recovery. - // Must happen before writes because the WAL requires contiguous - // indices — if the anchor advanced past all persisted entries, - // DeleteBefore resets the WAL so new writes start clean. + // 2. Prune block WAL entries. Must happen before writes because the + // WAL requires contiguous indices — if the anchor advanced past + // all persisted entries, DeleteBefore truncates the WAL so new writes + // start clean. Runs every cycle (not just on anchor change) so + // that stale lane retention timeouts are evaluated promptly. if err := pers.blocks.DeleteBefore(batch.laneFirsts); err != nil { return fmt.Errorf("block deleteBefore: %w", err) } - if err := pers.commitQCs.DeleteBefore(batch.commitQCFirst); err != nil { - return fmt.Errorf("commitqc deleteBefore: %w", err) - } // 3. Persist new CommitQCs, then publish immediately so consensus // can advance without waiting for block writes. @@ -713,11 +717,10 @@ func (s *State) runPersist(ctx context.Context, pers persisters) error { // persistBatch holds the data collected under lock for one persist iteration. 
type persistBatch struct { - blocks []*types.Signed[*types.LaneProposal] - commitQCs []*types.CommitQC - pruneAnchor utils.Option[*PruneAnchor] - laneFirsts map[types.LaneID]types.BlockNumber - commitQCFirst types.RoadIndex + blocks []*types.Signed[*types.LaneProposal] + commitQCs []*types.CommitQC + pruneAnchor utils.Option[*PruneAnchor] + laneFirsts map[types.LaneID]types.BlockNumber } // advancePersistedBlockStart updates the per-lane block admission watermark @@ -789,7 +792,6 @@ func (s *State) collectPersistBatch(ctx context.Context, lastPersistedAppQCNext b.laneFirsts[lane] = q.first } commitQCNext = max(commitQCNext, inner.commitQCs.first) - b.commitQCFirst = inner.commitQCs.first for n := commitQCNext; n < inner.commitQCs.next; n++ { b.commitQCs = append(b.commitQCs, inner.commitQCs.q[n]) } From 52963c2cfaa1437b16bf2b472b0df79303f08b18 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 15 Mar 2026 15:52:42 -0700 Subject: [PATCH 30/31] Add no-op persister comments to blocks.go and commitqcs.go Annotate all guard sites where persistence is disabled (dir/iw is None) with inline comments so the no-op behavior is immediately obvious. 
Made-with: Cursor --- .../internal/autobahn/consensus/persist/blocks.go | 6 +++--- .../internal/autobahn/consensus/persist/commitqcs.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index a6c5f96bf7..1b1af896b7 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -131,7 +131,7 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposal]) error { dir, ok := bp.dir.Get() if !ok { - return nil + return nil // no-op persister (persistence disabled) } h := proposal.Msg().Block().Header() lane := h.Lane() @@ -163,7 +163,7 @@ func (bp *BlockPersister) PersistBlock(proposal *types.Signed[*types.LaneProposa func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNumber) error { dir, ok := bp.dir.Get() if !ok { - return nil + return nil // no-op persister (persistence disabled) } if len(laneFirsts) == 0 { panic("DeleteBefore called with empty laneFirsts (empty committee)") @@ -213,7 +213,7 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu // Close shuts down all per-lane WALs. 
func (bp *BlockPersister) Close() error { if _, ok := bp.dir.Get(); !ok { - return nil + return nil // no-op persister (persistence disabled) } var errs []error for _, lw := range bp.lanes { diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 5897b609b5..3082a405ba 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -76,7 +76,7 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { if err := iw.Write(qc); err != nil { return fmt.Errorf("persist commitqc %d: %w", idx, err) } - } + } // else: no-op persister (persistence disabled); cursor still advances. cp.next = idx + 1 return nil } @@ -90,7 +90,7 @@ func (cp *CommitQCPersister) PersistCommitQC(qc *types.CommitQC) error { func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { iw, ok := cp.iw.Get() if !ok || idx == 0 { - return nil + return nil // no-op persister (persistence disabled), or nothing to prune } if idx >= cp.next { // Anchor advanced past all persisted entries; advance the cursor @@ -125,13 +125,13 @@ func (cp *CommitQCPersister) Close() error { if iw, ok := cp.iw.Get(); ok { return iw.Close() } - return nil + return nil // no-op persister (persistence disabled) } func (cp *CommitQCPersister) loadAll() ([]LoadedCommitQC, error) { iw, ok := cp.iw.Get() if !ok { - return nil, nil + return nil, nil // no-op persister (persistence disabled) } entries, err := iw.ReadAll() if err != nil { From aeeddbf9501f35097fad683d1245c3cd60363337 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 15 Mar 2026 18:19:36 -0700 Subject: [PATCH 31/31] Unexport Close() on BlockPersister and CommitQCPersister MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make Close() internal (close()) since it's only called within the persist package — by tests and constructors 
for error cleanup. Add no-op comments, TODO for metrics, fix truncation comment, and correct close() godoc. Made-with: Cursor --- .../autobahn/consensus/persist/blocks.go | 14 +++++--- .../autobahn/consensus/persist/blocks_test.go | 36 +++++++++---------- .../autobahn/consensus/persist/commitqcs.go | 6 ++-- .../consensus/persist/commitqcs_test.go | 24 ++++++------- 4 files changed, 44 insertions(+), 36 deletions(-) diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go index 1b1af896b7..edeeb252de 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks.go @@ -1,3 +1,4 @@ +// TODO: add Prometheus metrics for blocks written, truncated, and stale lanes removed. package persist import ( @@ -108,13 +109,13 @@ func NewBlockPersister(stateDir utils.Option[string]) (*BlockPersister, map[type lanePath := filepath.Join(dir, e.Name()) lw, err := newLaneWAL(lanePath) if err != nil { - _ = bp.Close() + _ = bp.close() return nil, nil, fmt.Errorf("open lane WAL in %s: %w", lanePath, err) } loaded, err := lw.loadAll() if err != nil { _ = lw.Close() - _ = bp.Close() + _ = bp.close() return nil, nil, fmt.Errorf("load lane WAL in %s: %w", lanePath, err) } bp.lanes[lane] = lw @@ -184,6 +185,9 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu } continue } + // Truncate entries below 'first'. The verify callback checks that + // the entry at the new front has the expected block number, catching + // any block-number-to-WAL-index mapping corruption. 
walIdx := lw.firstIdx + uint64(first-firstBN) if err := lw.TruncateBefore(walIdx, func(entry *types.Signed[*types.LaneProposal]) error { if got := entry.Msg().Block().Header().BlockNumber(); got != first { @@ -210,8 +214,10 @@ func (bp *BlockPersister) DeleteBefore(laneFirsts map[types.LaneID]types.BlockNu return nil } -// Close shuts down all per-lane WALs. -func (bp *BlockPersister) Close() error { +// close shuts down all per-lane WALs. Internal: only used by tests and +// NewBlockPersister (error cleanup). Production code does not close WALs +// at shutdown — the OS reclaims resources on process exit. +func (bp *BlockPersister) close() error { if _, ok := bp.dir.Get(); !ok { return nil // no-op persister (persistence disabled) } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go index fc2fecd470..829ae8f289 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/blocks_test.go @@ -25,7 +25,7 @@ func TestNewBlockPersisterEmptyDir(t *testing.T) { fi, err := os.Stat(filepath.Join(dir, blocksDir)) require.NoError(t, err) require.True(t, fi.IsDir()) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestPersistBlockAndLoad(t *testing.T) { @@ -41,7 +41,7 @@ func TestPersistBlockAndLoad(t *testing.T) { b1 := testSignedProposal(rng, key, 1) require.NoError(t, bp.PersistBlock(b0)) require.NoError(t, bp.PersistBlock(b1)) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) bp2, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -52,7 +52,7 @@ func TestPersistBlockAndLoad(t *testing.T) { require.Equal(t, types.BlockNumber(1), blocks[lane][1].Number) require.NoError(t, utils.TestDiff(b0, blocks[lane][0].Proposal)) require.NoError(t, utils.TestDiff(b1, blocks[lane][1].Proposal)) - require.NoError(t, bp2.Close()) + require.NoError(t, 
bp2.close()) } func TestPersistBlockMultipleLanes(t *testing.T) { @@ -70,7 +70,7 @@ func TestPersistBlockMultipleLanes(t *testing.T) { b2 := testSignedProposal(rng, key2, 0) require.NoError(t, bp.PersistBlock(b1)) require.NoError(t, bp.PersistBlock(b2)) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -95,7 +95,7 @@ func TestDeleteBeforeRemovesOldKeepsNew(t *testing.T) { } require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 3})) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -124,7 +124,7 @@ func TestDeleteBeforeAndRestart(t *testing.T) { // lane1: truncate old blocks, lane2: delete nothing (first=0), lane3: empty (no WAL). require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 2, lane2: 0, lane3: 0})) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) // Restart — verify varied lane states load correctly. bp2, blocks, err := NewBlockPersister(utils.Some(dir)) @@ -137,7 +137,7 @@ func TestDeleteBeforeAndRestart(t *testing.T) { // Persist more after restart, then restart again to verify continuity. 
require.NoError(t, bp2.PersistBlock(testSignedProposal(rng, key1, 3))) require.NoError(t, bp2.PersistBlock(testSignedProposal(rng, key2, 3))) - require.NoError(t, bp2.Close()) + require.NoError(t, bp2.close()) _, blocks2, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -158,7 +158,7 @@ func TestNoOpBlockPersister(t *testing.T) { lane := key.Public() require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 0})) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestDeleteBeforeThenPersistMore(t *testing.T) { @@ -176,7 +176,7 @@ func TestDeleteBeforeThenPersistMore(t *testing.T) { } require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane: 3})) require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 5))) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -205,7 +205,7 @@ func TestDeleteBeforePastAllBlocks(t *testing.T) { // Lane WAL is now empty; new writes starting from 10 should work. require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 10))) require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 11))) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) // Reopen — should see only the post-TruncateAll blocks. _, blocks, err := NewBlockPersister(utils.Some(dir)) @@ -236,7 +236,7 @@ func TestDeleteBeforeRemovesStaleLanes(t *testing.T) { // Only lane1 in laneFirsts — lane2 is stale and past retention, should be removed. require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -266,7 +266,7 @@ func TestDeleteBeforeRetainsRecentStaleLanes(t *testing.T) { // lane2 was just written to (lastWriteTime is recent). 
// DeleteBefore should NOT delete it even though it's not in laneFirsts. require.NoError(t, bp.DeleteBefore(map[types.LaneID]types.BlockNumber{lane1: 0})) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) _, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) @@ -300,7 +300,7 @@ func TestDeleteBeforeRetainsEmptyStaleLaneAtStartup(t *testing.T) { entries, _ := os.ReadDir(bd) require.Equal(t, 2, len(entries), "empty stale lane retained at startup") - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestEmptyLaneWALSurvivesReopen(t *testing.T) { @@ -322,7 +322,7 @@ func TestEmptyLaneWALSurvivesReopen(t *testing.T) { // Persist a new block into the lane without needing lazy creation. require.NoError(t, bp.PersistBlock(testSignedProposal(rng, key, 0))) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) // Reopen — should see the new block. _, blocks2, err := NewBlockPersister(utils.Some(dir)) @@ -343,7 +343,7 @@ func TestNewBlockPersisterSkipsNonHexDir(t *testing.T) { bp, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) require.Equal(t, 0, len(blocks)) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { @@ -357,7 +357,7 @@ func TestNewBlockPersisterSkipsInvalidKeyDir(t *testing.T) { bp, blocks, err := NewBlockPersister(utils.Some(dir)) require.NoError(t, err) require.Equal(t, 0, len(blocks)) - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestPersistBlockOutOfSequence(t *testing.T) { @@ -380,7 +380,7 @@ func TestPersistBlockOutOfSequence(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "out of sequence") - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } func TestLoadAllDetectsBlockGap(t *testing.T) { @@ -421,5 +421,5 @@ func TestLazyLaneCreation(t *testing.T) { entries, _ = os.ReadDir(filepath.Join(dir, blocksDir)) require.Equal(t, 
1, len(entries), "should have 1 lane directory") - require.NoError(t, bp.Close()) + require.NoError(t, bp.close()) } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go index 3082a405ba..bf4fdd094c 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs.go @@ -1,3 +1,4 @@ +// TODO: add Prometheus metrics for commitQCs written and truncated. package persist import ( @@ -120,8 +121,9 @@ func (cp *CommitQCPersister) DeleteBefore(idx types.RoadIndex) error { return nil } -// Close shuts down the WAL. -func (cp *CommitQCPersister) Close() error { +// close shuts down the WAL. Internal: only used by tests and NewCommitQCPersister +// (error cleanup). Production code does not close WALs at shutdown. +func (cp *CommitQCPersister) close() error { if iw, ok := cp.iw.Get(); ok { return iw.Close() } diff --git a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go index 64d9bbfa93..e874a78bfb 100644 --- a/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go +++ b/sei-tendermint/internal/autobahn/consensus/persist/commitqcs_test.go @@ -71,7 +71,7 @@ func TestNewCommitQCPersisterEmptyDir(t *testing.T) { fi, err := os.Stat(filepath.Join(dir, commitqcsDir)) require.NoError(t, err) require.True(t, fi.IsDir()) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) } func TestPersistCommitQCAndLoad(t *testing.T) { @@ -88,7 +88,7 @@ func TestPersistCommitQCAndLoad(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qc)) } require.Equal(t, types.RoadIndex(3), cp.LoadNext()) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) cp2, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) @@ -99,7 +99,7 @@ func TestPersistCommitQCAndLoad(t *testing.T) { 
require.NoError(t, utils.TestDiff(qcs[i], lqc.QC)) } require.Equal(t, types.RoadIndex(3), cp2.LoadNext()) - require.NoError(t, cp2.Close()) + require.NoError(t, cp2.close()) } func TestCommitQCDeleteBeforeRemovesOldKeepsNew(t *testing.T) { @@ -115,7 +115,7 @@ func TestCommitQCDeleteBeforeRemovesOldKeepsNew(t *testing.T) { } require.NoError(t, cp.DeleteBefore(3)) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) @@ -137,7 +137,7 @@ func TestCommitQCDeleteBeforeZero(t *testing.T) { } require.NoError(t, cp.DeleteBefore(0)) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) @@ -158,7 +158,7 @@ func TestCommitQCPersistDuplicateIsNoOp(t *testing.T) { // Persisting qcs[0] again is a no-op (idx < next). require.NoError(t, cp.PersistCommitQC(qcs[0])) require.Equal(t, types.RoadIndex(2), cp.LoadNext()) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) } func TestCommitQCPersistGapRejected(t *testing.T) { @@ -176,7 +176,7 @@ func TestCommitQCPersistGapRejected(t *testing.T) { err = cp.PersistCommitQC(qcs[3]) require.Error(t, err) require.Contains(t, err.Error(), "out of sequence") - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) } func TestLoadAllDetectsCommitQCGap(t *testing.T) { @@ -214,7 +214,7 @@ func TestNoOpCommitQCPersister(t *testing.T) { require.NoError(t, cp.PersistCommitQC(qcs[0])) require.Equal(t, types.RoadIndex(1), cp.LoadNext()) require.NoError(t, cp.DeleteBefore(0)) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) } func TestCommitQCDeleteBeforePastAll(t *testing.T) { @@ -237,7 +237,7 @@ func TestCommitQCDeleteBeforePastAll(t *testing.T) { moreQCs := makeSequentialCommitQCs(rng, committee, keys, 12) require.NoError(t, cp.PersistCommitQC(moreQCs[10])) require.NoError(t, cp.PersistCommitQC(moreQCs[11])) - require.NoError(t, 
cp.Close()) + require.NoError(t, cp.close()) // Reopen — should see only the post-TruncateAll entries. _, loaded, err := NewCommitQCPersister(utils.Some(dir)) @@ -264,7 +264,7 @@ func TestCommitQCDeleteBeforePastAllCrashRecovery(t *testing.T) { // DeleteBefore truncates the WAL (past all), then "crash" before writing. require.NoError(t, cp.DeleteBefore(10)) - require.NoError(t, cp.Close()) // simulate crash — no new QCs written + require.NoError(t, cp.close()) // simulate crash — no new QCs written // Restart: WAL is empty, cp.next will be 0. cp2, loaded, err := NewCommitQCPersister(utils.Some(dir)) @@ -280,7 +280,7 @@ func TestCommitQCDeleteBeforePastAllCrashRecovery(t *testing.T) { moreQCs := makeSequentialCommitQCs(rng, committee, keys, 12) require.NoError(t, cp2.PersistCommitQC(moreQCs[10])) require.NoError(t, cp2.PersistCommitQC(moreQCs[11])) - require.NoError(t, cp2.Close()) + require.NoError(t, cp2.close()) _, loaded, err = NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err) @@ -304,7 +304,7 @@ func TestCommitQCDeleteBeforeThenPersistMore(t *testing.T) { } require.NoError(t, cp.DeleteBefore(3)) require.NoError(t, cp.PersistCommitQC(qcs[5])) - require.NoError(t, cp.Close()) + require.NoError(t, cp.close()) _, loaded, err := NewCommitQCPersister(utils.Some(dir)) require.NoError(t, err)