Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions sei-db/state_db/sc/hashvault/noop_hashvault.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package hashvault

import "context"

var _ HashVault = (*NoopHashVault)(nil)

// NoopHashVault is a HashVault implementation that does nothing. It provides no equivocation
// protection whatsoever. It exists for two purposes:
// - tests that construct a BlockExecutor but do not exercise the vault, and
// - the explicit, operator-opted-in "hash-vault-disabled-unsafe" escape hatch.
//
// Production code must never substitute this for a real vault without a deliberate human decision.
type NoopHashVault struct{}

// NewNoopHashVault returns a HashVault whose methods are all no-ops.
func NewNoopHashVault() *NoopHashVault {
return &NoopHashVault{}
}

func (n *NoopHashVault) CommitToHash(_ context.Context, _ uint64, _ []byte) error {
return nil
}

func (n *NoopHashVault) Prune(_ context.Context, _ uint64) error {
return nil
}

func (n *NoopHashVault) Close(_ context.Context) error {
return nil
}
7 changes: 6 additions & 1 deletion sei-db/state_db/sc/hashvault/pebble_hashvault.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,14 @@ func (p *PebbleHashVault) Close(_ context.Context) error {

func (p *PebbleHashVault) logHashMismatch(blockHeight uint64, existing, incoming []byte) {
logger.Error("Hashvault detected hash mismatch; node attempted to change its mind. "+

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: make it clear this is AppHash mismatch

"DO NOT RESTART WITHOUT HUMAN INVESTIGATION.",
"DO NOT RESTART WITHOUT HUMAN INVESTIGATION. If you are CERTAIN this is not a real "+
"equivocation, you can bypass this guard by stopping the node and deleting the HashVault "+
"data directory (hashVaultDir below), then restarting. WARNING: deleting it removes "+
"equivocation protection — if the node then commits a conflicting hash for a height it has "+
"already finalized, the validator may be SLASHED.",
"blockHeight", blockHeight,
"existingHex", hex.EncodeToString(existing),
"incomingHex", hex.EncodeToString(incoming),
"hashVaultDir", p.config.DataDir,
)
}
26 changes: 16 additions & 10 deletions sei-tendermint/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,27 @@ type Config struct {
// routing table and pulls blocks from committee members). A warning is
// logged at startup if mode disagrees with committee membership.
AutobahnConfigFile string `mapstructure:"autobahn-config-file"`

// HashVaultDisabledUnsafe disables the block-hash equivocation guard (HashVault). The vault is
// on by default (false). Setting this to true is an explicit, last-resort operator decision to
// run WITHOUT equivocation protection; the node logs loudly that it is unsafe.
HashVaultDisabledUnsafe bool `mapstructure:"hash-vault-disabled-unsafe"`
}

// DefaultConfig returns a default configuration for a Tendermint node
func DefaultConfig() *Config {
return &Config{
BaseConfig: DefaultBaseConfig(),
RPC: DefaultRPCConfig(),
P2P: DefaultP2PConfig(),
Mempool: DefaultMempoolConfig(),
StateSync: DefaultStateSyncConfig(),
Consensus: DefaultConsensusConfig(),
TxIndex: DefaultTxIndexConfig(),
Instrumentation: DefaultInstrumentationConfig(),
PrivValidator: DefaultPrivValidatorConfig(),
SelfRemediation: DefaultSelfRemediationConfig(),
BaseConfig: DefaultBaseConfig(),
RPC: DefaultRPCConfig(),
P2P: DefaultP2PConfig(),
Mempool: DefaultMempoolConfig(),
StateSync: DefaultStateSyncConfig(),
Consensus: DefaultConsensusConfig(),
TxIndex: DefaultTxIndexConfig(),
Instrumentation: DefaultInstrumentationConfig(),
PrivValidator: DefaultPrivValidatorConfig(),
SelfRemediation: DefaultSelfRemediationConfig(),
HashVaultDisabledUnsafe: false,
}
}

Expand Down
11 changes: 10 additions & 1 deletion sei-tendermint/config/toml.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,16 @@ node-key-file = "{{ js .BaseConfig.NodeKey }}"
# would otherwise nest it under the immediately preceding section.
autobahn-config-file = "{{ .AutobahnConfigFile }}"

# hash-vault-disabled-unsafe disables the block-hash equivocation guard (HashVault).
# DO NOT set this to true unless you are knowingly running an UNSAFE node as a last-resort
# recovery measure. A node with this enabled has NO protection against changing its mind about
# a committed block's hash, and will log error-level warnings on every startup.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: we can probably tell them something like:
It's safer to not disable hash vault, just remove hash vault files as instructed and let it run. They should only disable hash vault if they are very sure the hash vault is totally wrong and they will keep running into the same panic on new blocks.

#
# Placed here (as a top-level key, before any [section] header) so the TOML parser sees it at
# root scope where mapstructure expects it — viper would otherwise nest it under the
# immediately preceding section.
hash-vault-disabled-unsafe = {{ .HashVaultDisabledUnsafe }}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is "Advanced Configuration Options"? does this belong there? (I don't know what are Advanced Configuration Options either, this is a genuine question, not suggestion.)


#######################################################################
### Advanced Configuration Options ###
#######################################################################
Expand Down Expand Up @@ -655,7 +665,6 @@ blocks-behind-check-interval = {{ .SelfRemediation.BlocksBehindCheckIntervalSeco

# Cooldown between each restart
restart-cooldown-seconds = {{ .SelfRemediation.RestartCooldownSeconds }}

`

// defaultConfigTemplate combines manual and auto-managed templates for backward compatibility
Expand Down
5 changes: 5 additions & 0 deletions sei-tendermint/internal/p2p/giga_router.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ type GigaRouterCommonConfig struct {
// MaxInboundFullnodePeers caps inbound block-sync from non-committee
// peers. 0 rejects all; positive caps at n, up to maxInboundFullnodePeers.
MaxInboundFullnodePeers int

// HashVaultDisabledUnsafe disables the block-hash equivocation guard (HashVault). The guard is
// on by default (false); the GigaRouter builds and owns it (see runExecute). Setting this to true
// is an explicit, last-resort operator decision to run WITHOUT equivocation protection.
HashVaultDisabledUnsafe bool
}

// GigaValidatorConfig configures a committee-member GigaRouter.
Expand Down
113 changes: 113 additions & 0 deletions sei-tendermint/internal/p2p/giga_router_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ import (
"fmt"
"maps"
"net/url"
"path/filepath"
"slices"
"sync/atomic"

"github.com/ethereum/go-ethereum/common"
"github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault"
abci "github.com/sei-protocol/sei-chain/sei-tendermint/abci/types"
atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types"
"github.com/sei-protocol/sei-chain/sei-tendermint/crypto"
Expand Down Expand Up @@ -46,6 +48,12 @@ type gigaRouterCommon struct {
// by one or two under contention but never over-accepts.
inboundFullnodeCount atomic.Int64
inboundFullnodeCap int64

// hashVault is the block-hash equivocation guard. runExecute owns its lifecycle: it builds the

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: AppHash, we don't check blockHash

// vault on entry (durable under PersistentStateDir, or a no-op when disabled / in-memory) and
// closes it on exit. Set before executeBlock is ever called, which is the only other reader. See

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, are we setting it before executeBlock is ever called?

// buildHashVault and commitHashToVault.
hashVault hashvault.HashVault
}

// buildDataState validates the common config and constructs the data
Expand Down Expand Up @@ -238,6 +246,16 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo
if err != nil {
return nil, fmt.Errorf("app.FinalizeBlock(): %w", err)
}

// Commit this height's app hash to the equivocation guard before persisting app state, so the
// vault always records our commitment to a height before the state it implies is committed (and
// before the hash is proposed for AppQC voting via PushAppHash below). On restart the block is
// re-executed and the identical hash is re-committed idempotently. A returned error is a benign
// shutdown cancellation; genuine faults panic inside the call. See commitHashToVault.
if err := commitHashToVault(ctx, r.hashVault, b.GlobalNumber, resp.AppHash); err != nil {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I'm thinking commitAppHashToVault may make the intent more obvious we never want to commit other Hash.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pompon0 we are now inserting commitHashToVault before commit. I think this is what we said we wanted during the March offsite, but double checking with you.

return nil, err
}

commitResp, err := app.Commit(ctx)
if err != nil {
return nil, fmt.Errorf("app.Commit(): %w", err)
Expand All @@ -248,7 +266,82 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo
return commitResp, nil
}

// buildHashVault constructs the block-hash equivocation guard runExecute owns. By default it

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: AppHash, maybe find and replace?

// returns a durable Pebble-backed vault rooted at <PersistentStateDir>/hashvault, alongside the
// other Autobahn on-disk state. It returns a no-op vault (no protection) in two cases: the operator
// explicitly set HashVaultDisabledUnsafe — logged loudly as unsafe — or there is no persistent
// state directory (in-memory mode, e.g. tests), where a durable vault would be pointless because
// the data WAL is itself a no-op (see data.NewDataWAL).
func buildHashVault(ctx context.Context, cfg *GigaRouterCommonConfig) (hashvault.HashVault, error) {
if cfg.HashVaultDisabledUnsafe {
logger.Error("################################################################")
logger.Error("# HASHVAULT DISABLED (hash-vault-disabled-unsafe=true). #")
logger.Error("# This node has NO block-hash equivocation protection and is #")
logger.Error("# running in an UNSAFE configuration. Re-enable as soon as the #")
logger.Error("# underlying issue is resolved. #")
logger.Error("################################################################")
return hashvault.NewNoopHashVault(), nil
}
dir, ok := cfg.PersistentStateDir.Get()
if !ok {
logger.Error("HASHVAULT DISABLED: no persistent state dir (in-memory mode); using no-op vault. " +
"This node has NO block-hash equivocation protection. This is expected only for in-memory " +
"runs such as tests; a production node reaching this path is misconfigured.")
return hashvault.NewNoopHashVault(), nil
}
hvCfg := hashvault.DefaultHashVaultConfig()
hvCfg.DataDir = filepath.Join(dir, "hashvault")
return hashvault.NewPebbleHashVault(ctx, hvCfg)
}

// commitHashToVault records the app hash for the given height in the equivocation guard and halts
// the node on any error. Every executed height is guarded, so a node can never commit to two
// different app hashes for the same height without deliberate human intervention.
func commitHashToVault(
ctx context.Context,
vault hashvault.HashVault,
height atypes.GlobalBlockNumber,
hash []byte,
) error {
err := vault.CommitToHash(ctx, uint64(height), hash)
if err == nil {
return nil
}
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
logger.Info("HashVault commit aborted by context cancellation during shutdown; not recording hash",
"height", height, "err", err)
return fmt.Errorf("hashvault CommitToHash aborted at height %d: %w", height, err)
}
if errors.Is(err, hashvault.ErrHashMismatch) {
// The HashVault has already logged the conflicting hashes, its data directory, and the
// bypass/slashing guidance immediately before returning this error; don't duplicate it.
logger.Error("FATAL: HashVault detected a block-hash equivocation; halting. See the preceding "+

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a tiny risk the logger doesn't flush before panicing, maybe either flush the log buffer or pass the error message to the panic string?

"HashVault error for the conflicting hashes and recovery steps. "+
"DO NOT RESTART WITHOUT HUMAN INTERVENTION.",
"height", height)
} else {
logger.Error("FATAL: HashVault could not commit the block hash (operational error, not a "+
"confirmed equivocation). Halting.",
"height", height, "hashHex", fmt.Sprintf("%x", hash), "err", err)
}
panic(fmt.Sprintf("hashvault CommitToHash failed at height %d: %v", height, err))
}

func (r *gigaRouterCommon) runExecute(ctx context.Context) error {
// runExecute is the single block-execution loop spawned by both the validator and fullnode Run
// methods, so it owns the equivocation guard for both roles: build it here (set before the first
// executeBlock, the only other reader) and close it on exit.
hashVault, err := buildHashVault(ctx, r.cfg)
if err != nil {
return fmt.Errorf("buildHashVault(): %w", err)
}
r.hashVault = hashVault
defer func() {
if err := hashVault.Close(context.Background()); err != nil {
logger.Error("failed to close hashvault", "err", err)
}
}()

app := r.app

info, err := app.Info(ctx, &version.RequestInfo)
Expand Down Expand Up @@ -278,6 +371,15 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error {
return fmt.Errorf("invalid GenDoc.InitialHeight = %v", r.cfg.GenDoc.InitialHeight)
}
} else {
// Re-commit the last finalized block's hash to the equivocation guard before re-proposing it
// for AppQC voting (PushAppHash below), mirroring executeBlock's commit-before-PushAppHash
// ordering. On a normal restart this idempotently matches the hash recorded when `last` was
// first executed; if the committed app state has diverged from what the vault recorded (e.g. an
// out-of-band rollback/restore), this halts the node instead of externalizing a conflicting
// hash. A returned error is a benign shutdown cancellation; genuine faults panic inside.
if err := commitHashToVault(ctx, r.hashVault, last, info.LastBlockAppHash); err != nil {
return err
}
// Losing a prefix of appHashes on crash is fine: AppQC is reached
// once everyone votes on apphashes of a suffix of finalized blocks.
if err := r.data.PushAppHash(ctx, last, info.LastBlockAppHash); err != nil {
Expand All @@ -301,6 +403,17 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error {
if err := r.data.PruneBefore(pruneBefore); err != nil {
return fmt.Errorf("r.data.PruneBefore(%v): %w", pruneBefore, err)
}
// Align the vault's retention with the data layer's prune boundary.
if err := r.hashVault.Prune(ctx, uint64(pruneBefore)); err != nil {
// A canceled context just means we're shutting down between a successful executeBlock
// and this prune; that's benign, not a prune failure, so don't alarm operators.
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
logger.Info("hashvault prune aborted by context cancellation during shutdown",
"prune_before", pruneBefore, "err", err)
} else {
logger.Error("failed to prune hashvault", "prune_before", pruneBefore, "err", err)
}
}
}
}

Expand Down
58 changes: 58 additions & 0 deletions sei-tendermint/internal/p2p/giga_router_common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package p2p

import (
"context"
"testing"

"github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault"
atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types"
"github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require"
)

// newSeededVault returns a durable Pebble vault rooted in a temp dir with hash committed at height.
func newSeededVault(t *testing.T, height atypes.GlobalBlockNumber, hash []byte) hashvault.HashVault {
t.Helper()
cfg := hashvault.DefaultHashVaultConfig()
cfg.DataDir = t.TempDir()
v, err := hashvault.NewUnsafePebbleHashVault(context.Background(), cfg)
require.NoError(t, err)
t.Cleanup(func() { _ = v.Close(context.Background()) })
require.NoError(t, v.CommitToHash(context.Background(), uint64(height), hash))
return v
}

// TestCommitHashToVault covers the safety contract the restart path in runExecute relies on:
// an idempotent match returns nil, a divergent hash halts the node (panic), and a canceled
// context returns an error without halting.
func TestCommitHashToVault(t *testing.T) {
const height atypes.GlobalBlockNumber = 42
h1 := make([]byte, hashvault.BlockHashSize)
for i := range h1 {
h1[i] = 0xAA
}
h2 := make([]byte, hashvault.BlockHashSize)
for i := range h2 {
h2[i] = 0xBB
}

t.Run("matching hash is idempotent", func(t *testing.T) {
vault := newSeededVault(t, height, h1)
require.NoError(t, commitHashToVault(context.Background(), vault, height, h1))
})

t.Run("divergent hash halts the node", func(t *testing.T) {
vault := newSeededVault(t, height, h1)
require.Panics(t, func() {
_ = commitHashToVault(context.Background(), vault, height, h2)
})
})

t.Run("canceled context returns error without halting", func(t *testing.T) {
vault := newSeededVault(t, height, h1)
ctx, cancel := context.WithCancel(context.Background())
cancel()
// Must not panic: a canceled context is a benign shutdown, not an equivocation.
err := commitHashToVault(ctx, vault, height, h2)
require.Error(t, err)
})
}
6 changes: 6 additions & 0 deletions sei-tendermint/node/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ func buildGigaRouter(
return nil, fmt.Errorf("buildValidatorGigaConfig: %w", err)
}
rootifyPersistentStateDir(cfg.RootDir, &valCfg.GigaRouterCommonConfig)
// The GigaRouter builds and owns the equivocation guard itself; just pass the operator's
// enable/disable decision through as plain config.
valCfg.HashVaultDisabledUnsafe = cfg.HashVaultDisabledUnsafe
logger.Info("Autobahn: starting as validator", "validators", len(valCfg.ValidatorAddrs))
return p2p.NewGigaValidatorRouter(valCfg, p2p.NodeSecretKey(nodeKey))
}
Expand All @@ -320,6 +323,9 @@ func buildGigaRouter(
return nil, fmt.Errorf("buildFullnodeGigaConfig: %w", err)
}
rootifyPersistentStateDir(cfg.RootDir, fnCfg)
// The GigaRouter builds and owns the equivocation guard itself; just pass the operator's
// enable/disable decision through as plain config.
fnCfg.HashVaultDisabledUnsafe = cfg.HashVaultDisabledUnsafe
logger.Info("Autobahn: starting as fullnode", "mode", cfg.Mode, "validators", len(validatorAddrs))
return p2p.NewGigaFullnodeRouter(fnCfg, p2p.NodeSecretKey(nodeKey))
}
Expand Down
Loading