diff --git a/sei-db/state_db/sc/hashvault/noop_hashvault.go b/sei-db/state_db/sc/hashvault/noop_hashvault.go new file mode 100644 index 0000000000..65df820cd8 --- /dev/null +++ b/sei-db/state_db/sc/hashvault/noop_hashvault.go @@ -0,0 +1,30 @@ +package hashvault + +import "context" + +var _ HashVault = (*NoopHashVault)(nil) + +// NoopHashVault is a HashVault implementation that does nothing. It provides no equivocation +// protection whatsoever. It exists for two purposes: +// - tests that construct a BlockExecutor but do not exercise the vault, and +// - the explicit, operator-opted-in "hash-vault-disabled-unsafe" escape hatch. +// +// Production code must never substitute this for a real vault without a deliberate human decision. +type NoopHashVault struct{} + +// NewNoopHashVault returns a HashVault whose methods are all no-ops. +func NewNoopHashVault() *NoopHashVault { + return &NoopHashVault{} +} + +func (n *NoopHashVault) CommitToHash(_ context.Context, _ uint64, _ []byte) error { + return nil +} + +func (n *NoopHashVault) Prune(_ context.Context, _ uint64) error { + return nil +} + +func (n *NoopHashVault) Close(_ context.Context) error { + return nil +} diff --git a/sei-db/state_db/sc/hashvault/pebble_hashvault.go b/sei-db/state_db/sc/hashvault/pebble_hashvault.go index e42f9e9caa..410c185dc6 100644 --- a/sei-db/state_db/sc/hashvault/pebble_hashvault.go +++ b/sei-db/state_db/sc/hashvault/pebble_hashvault.go @@ -217,9 +217,14 @@ func (p *PebbleHashVault) Close(_ context.Context) error { func (p *PebbleHashVault) logHashMismatch(blockHeight uint64, existing, incoming []byte) { logger.Error("Hashvault detected hash mismatch; node attempted to change its mind. "+ - "DO NOT RESTART WITHOUT HUMAN INVESTIGATION.", + "DO NOT RESTART WITHOUT HUMAN INVESTIGATION. If you are CERTAIN this is not a real "+ + "equivocation, you can bypass this guard by stopping the node and deleting the HashVault "+ + "data directory (hashVaultDir below), then restarting. WARNING: deleting it removes "+ + "equivocation protection — if the node then commits a conflicting hash for a height it has "+ + "already finalized, the validator may be SLASHED.", "blockHeight", blockHeight, "existingHex", hex.EncodeToString(existing), "incomingHex", hex.EncodeToString(incoming), + "hashVaultDir", p.config.DataDir, ) } diff --git a/sei-tendermint/config/config.go b/sei-tendermint/config/config.go index 71cce25dd9..de0e038371 100644 --- a/sei-tendermint/config/config.go +++ b/sei-tendermint/config/config.go @@ -82,21 +82,27 @@ type Config struct { // routing table and pulls blocks from committee members). A warning is // logged at startup if mode disagrees with committee membership. AutobahnConfigFile string `mapstructure:"autobahn-config-file"` + + // HashVaultDisabledUnsafe disables the block-hash equivocation guard (HashVault). The vault is + // on by default (false). Setting this to true is an explicit, last-resort operator decision to + // run WITHOUT equivocation protection; the node logs loudly that it is unsafe. + HashVaultDisabledUnsafe bool `mapstructure:"hash-vault-disabled-unsafe"` } // DefaultConfig returns a default configuration for a Tendermint node func DefaultConfig() *Config { return &Config{ - BaseConfig: DefaultBaseConfig(), - RPC: DefaultRPCConfig(), - P2P: DefaultP2PConfig(), - Mempool: DefaultMempoolConfig(), - StateSync: DefaultStateSyncConfig(), - Consensus: DefaultConsensusConfig(), - TxIndex: DefaultTxIndexConfig(), - Instrumentation: DefaultInstrumentationConfig(), - PrivValidator: DefaultPrivValidatorConfig(), - SelfRemediation: DefaultSelfRemediationConfig(), + BaseConfig: DefaultBaseConfig(), + RPC: DefaultRPCConfig(), + P2P: DefaultP2PConfig(), + Mempool: DefaultMempoolConfig(), + StateSync: DefaultStateSyncConfig(), + Consensus: DefaultConsensusConfig(), + TxIndex: DefaultTxIndexConfig(), + Instrumentation: DefaultInstrumentationConfig(), + PrivValidator: DefaultPrivValidatorConfig(), + SelfRemediation: DefaultSelfRemediationConfig(), + HashVaultDisabledUnsafe: false, } } diff --git a/sei-tendermint/config/toml.go b/sei-tendermint/config/toml.go index a5951360df..2809be24d0 100644 --- a/sei-tendermint/config/toml.go +++ b/sei-tendermint/config/toml.go @@ -157,6 +157,16 @@ node-key-file = "{{ js .BaseConfig.NodeKey }}" # would otherwise nest it under the immediately preceding section. autobahn-config-file = "{{ .AutobahnConfigFile }}" +# hash-vault-disabled-unsafe disables the block-hash equivocation guard (HashVault). +# DO NOT set this to true unless you are knowingly running an UNSAFE node as a last-resort +# recovery measure. A node with this enabled has NO protection against changing its mind about +# a committed block's hash, and will log error-level warnings on every startup. +# +# Placed here (as a top-level key, before any [section] header) so the TOML parser sees it at +# root scope where mapstructure expects it — viper would otherwise nest it under the +# immediately preceding section. +hash-vault-disabled-unsafe = {{ .HashVaultDisabledUnsafe }} + ####################################################################### ### Advanced Configuration Options ### ####################################################################### @@ -655,7 +665,6 @@ blocks-behind-check-interval = {{ .SelfRemediation.BlocksBehindCheckIntervalSeco # Cooldown between each restart restart-cooldown-seconds = {{ .SelfRemediation.RestartCooldownSeconds }} - ` // defaultConfigTemplate combines manual and auto-managed templates for backward compatibility diff --git a/sei-tendermint/internal/p2p/giga_router.go b/sei-tendermint/internal/p2p/giga_router.go index a66536fc55..ae0a9707f2 100644 --- a/sei-tendermint/internal/p2p/giga_router.go +++ b/sei-tendermint/internal/p2p/giga_router.go @@ -42,6 +42,11 @@ type GigaRouterCommonConfig struct { // MaxInboundFullnodePeers caps inbound block-sync from non-committee // peers. 0 rejects all; positive caps at n, up to maxInboundFullnodePeers. MaxInboundFullnodePeers int + + // HashVaultDisabledUnsafe disables the block-hash equivocation guard (HashVault). The guard is + // on by default (false); the GigaRouter builds and owns it (see runExecute). Setting this to true + // is an explicit, last-resort operator decision to run WITHOUT equivocation protection. + HashVaultDisabledUnsafe bool } // GigaValidatorConfig configures a committee-member GigaRouter. diff --git a/sei-tendermint/internal/p2p/giga_router_common.go b/sei-tendermint/internal/p2p/giga_router_common.go index 9ef33db906..13321767c4 100644 --- a/sei-tendermint/internal/p2p/giga_router_common.go +++ b/sei-tendermint/internal/p2p/giga_router_common.go @@ -6,10 +6,12 @@ import ( "fmt" "maps" "net/url" + "path/filepath" "slices" "sync/atomic" "github.com/ethereum/go-ethereum/common" + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault" abci "github.com/sei-protocol/sei-chain/sei-tendermint/abci/types" atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types" "github.com/sei-protocol/sei-chain/sei-tendermint/crypto" @@ -46,6 +48,12 @@ type gigaRouterCommon struct { // by one or two under contention but never over-accepts. inboundFullnodeCount atomic.Int64 inboundFullnodeCap int64 + + // hashVault is the block-hash equivocation guard. runExecute owns its lifecycle: it builds the + // vault on entry (durable under PersistentStateDir, or a no-op when disabled / in-memory) and + // closes it on exit. Set before executeBlock is ever called, which is the only other reader. See + // buildHashVault and commitHashToVault. + hashVault hashvault.HashVault } // buildDataState validates the common config and constructs the data @@ -238,6 +246,16 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo if err != nil { return nil, fmt.Errorf("app.FinalizeBlock(): %w", err) } + + // Commit this height's app hash to the equivocation guard before persisting app state, so the + // vault always records our commitment to a height before the state it implies is committed (and + // before the hash is proposed for AppQC voting via PushAppHash below). On restart the block is + // re-executed and the identical hash is re-committed idempotently. A returned error is a benign + // shutdown cancellation; genuine faults panic inside the call. See commitHashToVault. + if err := commitHashToVault(ctx, r.hashVault, b.GlobalNumber, resp.AppHash); err != nil { + return nil, err + } + commitResp, err := app.Commit(ctx) if err != nil { return nil, fmt.Errorf("app.Commit(): %w", err) @@ -248,7 +266,82 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo return commitResp, nil } +// buildHashVault constructs the block-hash equivocation guard runExecute owns. By default it +// returns a durable Pebble-backed vault rooted at /hashvault, alongside the +// other Autobahn on-disk state. It returns a no-op vault (no protection) in two cases: the operator +// explicitly set HashVaultDisabledUnsafe — logged loudly as unsafe — or there is no persistent +// state directory (in-memory mode, e.g. tests), where a durable vault would be pointless because +// the data WAL is itself a no-op (see data.NewDataWAL). +func buildHashVault(ctx context.Context, cfg *GigaRouterCommonConfig) (hashvault.HashVault, error) { + if cfg.HashVaultDisabledUnsafe { + logger.Error("################################################################") + logger.Error("# HASHVAULT DISABLED (hash-vault-disabled-unsafe=true). #") + logger.Error("# This node has NO block-hash equivocation protection and is #") + logger.Error("# running in an UNSAFE configuration. Re-enable as soon as the #") + logger.Error("# underlying issue is resolved. #") + logger.Error("################################################################") + return hashvault.NewNoopHashVault(), nil + } + dir, ok := cfg.PersistentStateDir.Get() + if !ok { + logger.Error("HASHVAULT DISABLED: no persistent state dir (in-memory mode); using no-op vault. " + + "This node has NO block-hash equivocation protection. This is expected only for in-memory " + + "runs such as tests; a production node reaching this path is misconfigured.") + return hashvault.NewNoopHashVault(), nil + } + hvCfg := hashvault.DefaultHashVaultConfig() + hvCfg.DataDir = filepath.Join(dir, "hashvault") + return hashvault.NewPebbleHashVault(ctx, hvCfg) +} + +// commitHashToVault records the app hash for the given height in the equivocation guard and halts +// the node on any error. Every executed height is guarded, so a node can never commit to two +// different app hashes for the same height without deliberate human intervention. +func commitHashToVault( + ctx context.Context, + vault hashvault.HashVault, + height atypes.GlobalBlockNumber, + hash []byte, +) error { + err := vault.CommitToHash(ctx, uint64(height), hash) + if err == nil { + return nil + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + logger.Info("HashVault commit aborted by context cancellation during shutdown; not recording hash", + "height", height, "err", err) + return fmt.Errorf("hashvault CommitToHash aborted at height %d: %w", height, err) + } + if errors.Is(err, hashvault.ErrHashMismatch) { + // The HashVault has already logged the conflicting hashes, its data directory, and the + // bypass/slashing guidance immediately before returning this error; don't duplicate it. + logger.Error("FATAL: HashVault detected a block-hash equivocation; halting. See the preceding "+ + "HashVault error for the conflicting hashes and recovery steps. "+ + "DO NOT RESTART WITHOUT HUMAN INTERVENTION.", + "height", height) + } else { + logger.Error("FATAL: HashVault could not commit the block hash (operational error, not a "+ + "confirmed equivocation). Halting.", + "height", height, "hashHex", fmt.Sprintf("%x", hash), "err", err) + } + panic(fmt.Sprintf("hashvault CommitToHash failed at height %d: %v", height, err)) +} + func (r *gigaRouterCommon) runExecute(ctx context.Context) error { + // runExecute is the single block-execution loop spawned by both the validator and fullnode Run + // methods, so it owns the equivocation guard for both roles: build it here (set before the first + // executeBlock, the only other reader) and close it on exit. + hashVault, err := buildHashVault(ctx, r.cfg) + if err != nil { + return fmt.Errorf("buildHashVault(): %w", err) + } + r.hashVault = hashVault + defer func() { + if err := hashVault.Close(context.Background()); err != nil { + logger.Error("failed to close hashvault", "err", err) + } + }() + app := r.app info, err := app.Info(ctx, &version.RequestInfo) @@ -278,6 +371,15 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error { return fmt.Errorf("invalid GenDoc.InitialHeight = %v", r.cfg.GenDoc.InitialHeight) } } else { + // Re-commit the last finalized block's hash to the equivocation guard before re-proposing it + // for AppQC voting (PushAppHash below), mirroring executeBlock's commit-before-PushAppHash + // ordering. On a normal restart this idempotently matches the hash recorded when `last` was + // first executed; if the committed app state has diverged from what the vault recorded (e.g. an + // out-of-band rollback/restore), this halts the node instead of externalizing a conflicting + // hash. A returned error is a benign shutdown cancellation; genuine faults panic inside. + if err := commitHashToVault(ctx, r.hashVault, last, info.LastBlockAppHash); err != nil { + return err + } // Losing a prefix of appHashes on crash is fine: AppQC is reached // once everyone votes on apphashes of a suffix of finalized blocks. if err := r.data.PushAppHash(ctx, last, info.LastBlockAppHash); err != nil { @@ -301,6 +403,17 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error { if err := r.data.PruneBefore(pruneBefore); err != nil { return fmt.Errorf("r.data.PruneBefore(%v): %w", pruneBefore, err) } + // Align the vault's retention with the data layer's prune boundary. + if err := r.hashVault.Prune(ctx, uint64(pruneBefore)); err != nil { + // A canceled context just means we're shutting down between a successful executeBlock + // and this prune; that's benign, not a prune failure, so don't alarm operators. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + logger.Info("hashvault prune aborted by context cancellation during shutdown", + "prune_before", pruneBefore, "err", err) + } else { + logger.Error("failed to prune hashvault", "prune_before", pruneBefore, "err", err) + } + } } } diff --git a/sei-tendermint/internal/p2p/giga_router_common_test.go b/sei-tendermint/internal/p2p/giga_router_common_test.go new file mode 100644 index 0000000000..692492ccc1 --- /dev/null +++ b/sei-tendermint/internal/p2p/giga_router_common_test.go @@ -0,0 +1,58 @@ +package p2p + +import ( + "context" + "testing" + + "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault" + atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types" + "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" +) + +// newSeededVault returns a durable Pebble vault rooted in a temp dir with hash committed at height. +func newSeededVault(t *testing.T, height atypes.GlobalBlockNumber, hash []byte) hashvault.HashVault { + t.Helper() + cfg := hashvault.DefaultHashVaultConfig() + cfg.DataDir = t.TempDir() + v, err := hashvault.NewUnsafePebbleHashVault(context.Background(), cfg) + require.NoError(t, err) + t.Cleanup(func() { _ = v.Close(context.Background()) }) + require.NoError(t, v.CommitToHash(context.Background(), uint64(height), hash)) + return v +} + +// TestCommitHashToVault covers the safety contract the restart path in runExecute relies on: +// an idempotent match returns nil, a divergent hash halts the node (panic), and a canceled +// context returns an error without halting. +func TestCommitHashToVault(t *testing.T) { + const height atypes.GlobalBlockNumber = 42 + h1 := make([]byte, hashvault.BlockHashSize) + for i := range h1 { + h1[i] = 0xAA + } + h2 := make([]byte, hashvault.BlockHashSize) + for i := range h2 { + h2[i] = 0xBB + } + + t.Run("matching hash is idempotent", func(t *testing.T) { + vault := newSeededVault(t, height, h1) + require.NoError(t, commitHashToVault(context.Background(), vault, height, h1)) + }) + + t.Run("divergent hash halts the node", func(t *testing.T) { + vault := newSeededVault(t, height, h1) + require.Panics(t, func() { + _ = commitHashToVault(context.Background(), vault, height, h2) + }) + }) + + t.Run("canceled context returns error without halting", func(t *testing.T) { + vault := newSeededVault(t, height, h1) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + // Must not panic: a canceled context is a benign shutdown, not an equivocation. + err := commitHashToVault(ctx, vault, height, h2) + require.Error(t, err) + }) +} diff --git a/sei-tendermint/node/setup.go b/sei-tendermint/node/setup.go index ca5a429317..71c19ff208 100644 --- a/sei-tendermint/node/setup.go +++ b/sei-tendermint/node/setup.go @@ -312,6 +312,9 @@ func buildGigaRouter( return nil, fmt.Errorf("buildValidatorGigaConfig: %w", err) } rootifyPersistentStateDir(cfg.RootDir, &valCfg.GigaRouterCommonConfig) + // The GigaRouter builds and owns the equivocation guard itself; just pass the operator's + // enable/disable decision through as plain config. + valCfg.HashVaultDisabledUnsafe = cfg.HashVaultDisabledUnsafe logger.Info("Autobahn: starting as validator", "validators", len(valCfg.ValidatorAddrs)) return p2p.NewGigaValidatorRouter(valCfg, p2p.NodeSecretKey(nodeKey)) } @@ -320,6 +323,9 @@ func buildGigaRouter( return nil, fmt.Errorf("buildFullnodeGigaConfig: %w", err) } rootifyPersistentStateDir(cfg.RootDir, fnCfg) + // The GigaRouter builds and owns the equivocation guard itself; just pass the operator's + // enable/disable decision through as plain config. + fnCfg.HashVaultDisabledUnsafe = cfg.HashVaultDisabledUnsafe logger.Info("Autobahn: starting as fullnode", "mode", cfg.Mode, "validators", len(validatorAddrs)) return p2p.NewGigaFullnodeRouter(fnCfg, p2p.NodeSecretKey(nodeKey)) }