-
Notifications
You must be signed in to change notification settings - Fork 882
Autobahn hashvault integration #3602
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e70706e
203fb1f
166c6cd
ca9bfc7
e74b377
28bd8ef
5429566
f539793
4e49367
9191743
06a959f
76d281a
7812a81
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| package hashvault | ||
|
|
||
| import "context" | ||
|
|
||
| var _ HashVault = (*NoopHashVault)(nil) | ||
|
|
||
| // NoopHashVault is a HashVault implementation that does nothing. It provides no equivocation | ||
| // protection whatsoever. It exists for two purposes: | ||
| // - tests that construct a BlockExecutor but do not exercise the vault, and | ||
| // - the explicit, operator-opted-in "hash-vault-disabled-unsafe" escape hatch. | ||
| // | ||
| // Production code must never substitute this for a real vault without a deliberate human decision. | ||
| type NoopHashVault struct{} | ||
|
|
||
| // NewNoopHashVault returns a HashVault whose methods are all no-ops. | ||
| func NewNoopHashVault() *NoopHashVault { | ||
| return &NoopHashVault{} | ||
| } | ||
|
|
||
| func (n *NoopHashVault) CommitToHash(_ context.Context, _ uint64, _ []byte) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (n *NoopHashVault) Prune(_ context.Context, _ uint64) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (n *NoopHashVault) Close(_ context.Context) error { | ||
| return nil | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -157,6 +157,16 @@ node-key-file = "{{ js .BaseConfig.NodeKey }}" | |
| # would otherwise nest it under the immediately preceding section. | ||
| autobahn-config-file = "{{ .AutobahnConfigFile }}" | ||
|
|
||
| # hash-vault-disabled-unsafe disables the block-hash equivocation guard (HashVault). | ||
| # DO NOT set this to true unless you are knowingly running an UNSAFE node as a last-resort | ||
| # recovery measure. A node with this enabled has NO protection against changing its mind about | ||
| # a committed block's hash, and will log error-level warnings on every startup. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we can probably tell them something like: |
||
| # | ||
| # Placed here (as a top-level key, before any [section] header) so the TOML parser sees it at | ||
| # root scope where mapstructure expects it — viper would otherwise nest it under the | ||
| # immediately preceding section. | ||
| hash-vault-disabled-unsafe = {{ .HashVaultDisabledUnsafe }} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is "Advanced Configuration Options"? does this belong there? (I don't know what are Advanced Configuration Options either, this is a genuine question, not suggestion.) |
||
|
|
||
| ####################################################################### | ||
| ### Advanced Configuration Options ### | ||
| ####################################################################### | ||
|
|
@@ -655,7 +665,6 @@ blocks-behind-check-interval = {{ .SelfRemediation.BlocksBehindCheckIntervalSeco | |
|
|
||
| # Cooldown between each restart | ||
| restart-cooldown-seconds = {{ .SelfRemediation.RestartCooldownSeconds }} | ||
|
|
||
| ` | ||
|
|
||
| // defaultConfigTemplate combines manual and auto-managed templates for backward compatibility | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,10 +6,12 @@ import ( | |
| "fmt" | ||
| "maps" | ||
| "net/url" | ||
| "path/filepath" | ||
| "slices" | ||
| "sync/atomic" | ||
|
|
||
| "github.com/ethereum/go-ethereum/common" | ||
| "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault" | ||
| abci "github.com/sei-protocol/sei-chain/sei-tendermint/abci/types" | ||
| atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types" | ||
| "github.com/sei-protocol/sei-chain/sei-tendermint/crypto" | ||
|
|
@@ -46,6 +48,12 @@ type gigaRouterCommon struct { | |
| // by one or two under contention but never over-accepts. | ||
| inboundFullnodeCount atomic.Int64 | ||
| inboundFullnodeCap int64 | ||
|
|
||
| // hashVault is the block-hash equivocation guard. runExecute owns its lifecycle: it builds the | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: AppHash, we don't check blockHash |
||
| // vault on entry (durable under PersistentStateDir, or a no-op when disabled / in-memory) and | ||
| // closes it on exit. Set before executeBlock is ever called, which is the only other reader. See | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wait, are we setting it before executeBlock is ever called? |
||
| // buildHashVault and commitHashToVault. | ||
| hashVault hashvault.HashVault | ||
| } | ||
|
|
||
| // buildDataState validates the common config and constructs the data | ||
|
|
@@ -238,6 +246,16 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo | |
| if err != nil { | ||
| return nil, fmt.Errorf("app.FinalizeBlock(): %w", err) | ||
| } | ||
|
|
||
| // Commit this height's app hash to the equivocation guard before persisting app state, so the | ||
| // vault always records our commitment to a height before the state it implies is committed (and | ||
| // before the hash is proposed for AppQC voting via PushAppHash below). On restart the block is | ||
| // re-executed and the identical hash is re-committed idempotently. A returned error is a benign | ||
| // shutdown cancellation; genuine faults panic inside the call. See commitHashToVault. | ||
| if err := commitHashToVault(ctx, r.hashVault, b.GlobalNumber, resp.AppHash); err != nil { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I'm thinking commitAppHashToVault may make the intent more obvious we never want to commit other Hash.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @pompon0 we are now inserting commitHashToVault before commit. I think this is what we said we wanted during the March offsite, but double checking with you. |
||
| return nil, err | ||
| } | ||
|
|
||
| commitResp, err := app.Commit(ctx) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("app.Commit(): %w", err) | ||
|
|
@@ -248,7 +266,82 @@ func (r *gigaRouterCommon) executeBlock(ctx context.Context, b *atypes.GlobalBlo | |
| return commitResp, nil | ||
| } | ||
|
|
||
| // buildHashVault constructs the block-hash equivocation guard runExecute owns. By default it | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: AppHash, maybe find and replace? |
||
| // returns a durable Pebble-backed vault rooted at <PersistentStateDir>/hashvault, alongside the | ||
| // other Autobahn on-disk state. It returns a no-op vault (no protection) in two cases: the operator | ||
| // explicitly set HashVaultDisabledUnsafe — logged loudly as unsafe — or there is no persistent | ||
| // state directory (in-memory mode, e.g. tests), where a durable vault would be pointless because | ||
| // the data WAL is itself a no-op (see data.NewDataWAL). | ||
| func buildHashVault(ctx context.Context, cfg *GigaRouterCommonConfig) (hashvault.HashVault, error) { | ||
| if cfg.HashVaultDisabledUnsafe { | ||
| logger.Error("################################################################") | ||
| logger.Error("# HASHVAULT DISABLED (hash-vault-disabled-unsafe=true). #") | ||
| logger.Error("# This node has NO block-hash equivocation protection and is #") | ||
| logger.Error("# running in an UNSAFE configuration. Re-enable as soon as the #") | ||
| logger.Error("# underlying issue is resolved. #") | ||
| logger.Error("################################################################") | ||
| return hashvault.NewNoopHashVault(), nil | ||
| } | ||
| dir, ok := cfg.PersistentStateDir.Get() | ||
| if !ok { | ||
| logger.Error("HASHVAULT DISABLED: no persistent state dir (in-memory mode); using no-op vault. " + | ||
| "This node has NO block-hash equivocation protection. This is expected only for in-memory " + | ||
| "runs such as tests; a production node reaching this path is misconfigured.") | ||
| return hashvault.NewNoopHashVault(), nil | ||
| } | ||
| hvCfg := hashvault.DefaultHashVaultConfig() | ||
| hvCfg.DataDir = filepath.Join(dir, "hashvault") | ||
| return hashvault.NewPebbleHashVault(ctx, hvCfg) | ||
| } | ||
|
|
||
| // commitHashToVault records the app hash for the given height in the equivocation guard and halts | ||
| // the node on any error. Every executed height is guarded, so a node can never commit to two | ||
| // different app hashes for the same height without deliberate human intervention. | ||
| func commitHashToVault( | ||
| ctx context.Context, | ||
| vault hashvault.HashVault, | ||
| height atypes.GlobalBlockNumber, | ||
| hash []byte, | ||
| ) error { | ||
| err := vault.CommitToHash(ctx, uint64(height), hash) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
| if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { | ||
| logger.Info("HashVault commit aborted by context cancellation during shutdown; not recording hash", | ||
| "height", height, "err", err) | ||
| return fmt.Errorf("hashvault CommitToHash aborted at height %d: %w", height, err) | ||
| } | ||
| if errors.Is(err, hashvault.ErrHashMismatch) { | ||
| // The HashVault has already logged the conflicting hashes, its data directory, and the | ||
| // bypass/slashing guidance immediately before returning this error; don't duplicate it. | ||
| logger.Error("FATAL: HashVault detected a block-hash equivocation; halting. See the preceding "+ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a tiny risk the logger doesn't flush before panicing, maybe either flush the log buffer or pass the error message to the panic string? |
||
| "HashVault error for the conflicting hashes and recovery steps. "+ | ||
| "DO NOT RESTART WITHOUT HUMAN INTERVENTION.", | ||
| "height", height) | ||
| } else { | ||
| logger.Error("FATAL: HashVault could not commit the block hash (operational error, not a "+ | ||
| "confirmed equivocation). Halting.", | ||
| "height", height, "hashHex", fmt.Sprintf("%x", hash), "err", err) | ||
| } | ||
| panic(fmt.Sprintf("hashvault CommitToHash failed at height %d: %v", height, err)) | ||
| } | ||
|
|
||
| func (r *gigaRouterCommon) runExecute(ctx context.Context) error { | ||
| // runExecute is the single block-execution loop spawned by both the validator and fullnode Run | ||
| // methods, so it owns the equivocation guard for both roles: build it here (set before the first | ||
| // executeBlock, the only other reader) and close it on exit. | ||
| hashVault, err := buildHashVault(ctx, r.cfg) | ||
| if err != nil { | ||
| return fmt.Errorf("buildHashVault(): %w", err) | ||
| } | ||
| r.hashVault = hashVault | ||
| defer func() { | ||
| if err := hashVault.Close(context.Background()); err != nil { | ||
| logger.Error("failed to close hashvault", "err", err) | ||
| } | ||
| }() | ||
|
|
||
| app := r.app | ||
|
|
||
| info, err := app.Info(ctx, &version.RequestInfo) | ||
|
|
@@ -278,6 +371,15 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error { | |
| return fmt.Errorf("invalid GenDoc.InitialHeight = %v", r.cfg.GenDoc.InitialHeight) | ||
| } | ||
| } else { | ||
| // Re-commit the last finalized block's hash to the equivocation guard before re-proposing it | ||
| // for AppQC voting (PushAppHash below), mirroring executeBlock's commit-before-PushAppHash | ||
| // ordering. On a normal restart this idempotently matches the hash recorded when `last` was | ||
| // first executed; if the committed app state has diverged from what the vault recorded (e.g. an | ||
| // out-of-band rollback/restore), this halts the node instead of externalizing a conflicting | ||
| // hash. A returned error is a benign shutdown cancellation; genuine faults panic inside. | ||
| if err := commitHashToVault(ctx, r.hashVault, last, info.LastBlockAppHash); err != nil { | ||
| return err | ||
| } | ||
| // Losing a prefix of appHashes on crash is fine: AppQC is reached | ||
| // once everyone votes on apphashes of a suffix of finalized blocks. | ||
| if err := r.data.PushAppHash(ctx, last, info.LastBlockAppHash); err != nil { | ||
|
|
@@ -301,6 +403,17 @@ func (r *gigaRouterCommon) runExecute(ctx context.Context) error { | |
| if err := r.data.PruneBefore(pruneBefore); err != nil { | ||
| return fmt.Errorf("r.data.PruneBefore(%v): %w", pruneBefore, err) | ||
| } | ||
| // Align the vault's retention with the data layer's prune boundary. | ||
| if err := r.hashVault.Prune(ctx, uint64(pruneBefore)); err != nil { | ||
| // A canceled context just means we're shutting down between a successful executeBlock | ||
| // and this prune; that's benign, not a prune failure, so don't alarm operators. | ||
| if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { | ||
| logger.Info("hashvault prune aborted by context cancellation during shutdown", | ||
| "prune_before", pruneBefore, "err", err) | ||
| } else { | ||
| logger.Error("failed to prune hashvault", "prune_before", pruneBefore, "err", err) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| package p2p | ||
|
|
||
| import ( | ||
| "context" | ||
| "testing" | ||
|
|
||
| "github.com/sei-protocol/sei-chain/sei-db/state_db/sc/hashvault" | ||
| atypes "github.com/sei-protocol/sei-chain/sei-tendermint/autobahn/types" | ||
| "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils/require" | ||
| ) | ||
|
|
||
| // newSeededVault returns a durable Pebble vault rooted in a temp dir with hash committed at height. | ||
| func newSeededVault(t *testing.T, height atypes.GlobalBlockNumber, hash []byte) hashvault.HashVault { | ||
| t.Helper() | ||
| cfg := hashvault.DefaultHashVaultConfig() | ||
| cfg.DataDir = t.TempDir() | ||
| v, err := hashvault.NewUnsafePebbleHashVault(context.Background(), cfg) | ||
| require.NoError(t, err) | ||
| t.Cleanup(func() { _ = v.Close(context.Background()) }) | ||
| require.NoError(t, v.CommitToHash(context.Background(), uint64(height), hash)) | ||
| return v | ||
| } | ||
|
|
||
| // TestCommitHashToVault covers the safety contract the restart path in runExecute relies on: | ||
| // an idempotent match returns nil, a divergent hash halts the node (panic), and a canceled | ||
| // context returns an error without halting. | ||
| func TestCommitHashToVault(t *testing.T) { | ||
| const height atypes.GlobalBlockNumber = 42 | ||
| h1 := make([]byte, hashvault.BlockHashSize) | ||
| for i := range h1 { | ||
| h1[i] = 0xAA | ||
| } | ||
| h2 := make([]byte, hashvault.BlockHashSize) | ||
| for i := range h2 { | ||
| h2[i] = 0xBB | ||
| } | ||
|
|
||
| t.Run("matching hash is idempotent", func(t *testing.T) { | ||
| vault := newSeededVault(t, height, h1) | ||
| require.NoError(t, commitHashToVault(context.Background(), vault, height, h1)) | ||
| }) | ||
|
|
||
| t.Run("divergent hash halts the node", func(t *testing.T) { | ||
| vault := newSeededVault(t, height, h1) | ||
| require.Panics(t, func() { | ||
| _ = commitHashToVault(context.Background(), vault, height, h2) | ||
| }) | ||
| }) | ||
|
|
||
| t.Run("canceled context returns error without halting", func(t *testing.T) { | ||
| vault := newSeededVault(t, height, h1) | ||
| ctx, cancel := context.WithCancel(context.Background()) | ||
| cancel() | ||
| // Must not panic: a canceled context is a benign shutdown, not an equivocation. | ||
| err := commitHashToVault(ctx, vault, height, h2) | ||
| require.Error(t, err) | ||
| }) | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: make it clear this is AppHash mismatch