From eb5b3ca69df7055dc0a491338e71c729103382c1 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 14:05:35 -0700 Subject: [PATCH 01/10] feat(app): expose EVM listener Stop handles + redirectable serve errors RegisterLocalServices constructs the EVM HTTP/WS listeners in detached goroutines that panic on a bind failure. An in-process host running N apps in one process needs (a) the listener handles to Stop() at teardown and (b) a single node's bind failure to be a reportable error, not a process-wide panic that kills all N. Add evmHTTPServer/evmWSServer handles (EVMHTTPServer/EVMWebSocketServer getters), and SetEVMServeErr to redirect Start() failures to a buffered channel. With no channel set (production seid) behavior is unchanged: the listener still panics on bind failure. Co-Authored-By: Claude Opus 4.8 --- app/app.go | 71 ++++++- inprocess/appoptions.go | 41 ++++ inprocess/doc.go | 49 +++++ inprocess/genesis.go | 173 +++++++++++++++++ inprocess/handle.go | 155 +++++++++++++++ inprocess/harness.go | 391 ++++++++++++++++++++++++++++++++++++++ inprocess/harness_test.go | 124 ++++++++++++ inprocess/readiness.go | 141 ++++++++++++++ inprocess/support.go | 27 +++ 9 files changed, 1170 insertions(+), 2 deletions(-) create mode 100644 inprocess/appoptions.go create mode 100644 inprocess/doc.go create mode 100644 inprocess/genesis.go create mode 100644 inprocess/handle.go create mode 100644 inprocess/harness.go create mode 100644 inprocess/harness_test.go create mode 100644 inprocess/readiness.go create mode 100644 inprocess/support.go diff --git a/app/app.go b/app/app.go index d8818d3844..c653b7846c 100644 --- a/app/app.go +++ b/app/app.go @@ -478,6 +478,21 @@ type App struct { httpServerStartSignalSent bool wsServerStartSignalSent bool + // evmHTTPServer/evmWSServer retain the EVM JSON-RPC HTTP and WebSocket + // listeners constructed in RegisterLocalServices so an embedding orchestrator + // (the in-process harness) can Stop() them at teardown. 
Nil for a node with + // the respective listener disabled. Production seid never reads these — its + // process exit reaps the listeners — but discarding them leaked the only Stop + // handle, which an in-process host running N apps in one process needs. + evmHTTPServer evmrpc.EVMServer + evmWSServer evmrpc.EVMServer + // evmServeErr, when non-nil, diverts an EVM listener Start() failure to the + // channel instead of panicking. Production leaves it nil and keeps the + // fail-loud panic (a bind failure must crash a real node). The harness sets it + // via SetEVMServeErr before the first block so a single node's bind failure is + // a reportable error, not a process-wide panic that kills all N nodes. + evmServeErr chan<- error + txPrioritizer sdk.TxPrioritizer benchmarkManager *benchmark.Manager @@ -2726,10 +2741,12 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T if err != nil { panic(err) } + app.evmHTTPServer = evmHTTPServer go func() { + defer app.recoverEVMServe() <-app.httpServerStartSignal if err := evmHTTPServer.Start(); err != nil { - panic(err) + app.reportEVMServeErr(err) } }() } @@ -2740,10 +2757,12 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T if err != nil { panic(err) } + app.evmWSServer = evmWSServer go func() { + defer app.recoverEVMServe() <-app.wsServerStartSignal if err := evmWSServer.Start(); err != nil { - panic(err) + app.reportEVMServeErr(err) } }() } @@ -2759,6 +2778,54 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T } } +// reportEVMServeErr diverts an EVM listener Start() failure to the registered +// error channel, falling back to the historical panic when no channel is set +// (production seid). The send is non-blocking: the channel is buffered and a +// second listener's failure must not deadlock the goroutine; an error that finds the buffer full is dropped rather than waited on.
+func (app *App) reportEVMServeErr(err error) { + if app.evmServeErr == nil { + panic(err) + } + select { + case app.evmServeErr <- err: + default: + } +} + +// recoverEVMServe is the deferred guard on the EVM listener goroutines. A panic +// inside Start() (beyond the bind error it returns cleanly) is converted to a +// reported error when a channel is registered, so one node's listener panic does +// not crash an in-process host running N nodes. With no channel (production +// seid) it re-panics, preserving the historical fail-loud behavior exactly. +func (app *App) recoverEVMServe() { + if r := recover(); r != nil { + if app.evmServeErr == nil { + panic(r) + } + err, ok := r.(error) + if !ok { + err = fmt.Errorf("evm serve panic: %v", r) + } + app.reportEVMServeErr(err) + } +} + +// SetEVMServeErr registers the channel that EVM listener Start() failures are +// sent to, replacing the default fail-loud panic. An in-process host that runs +// multiple apps in one process calls this before the first block so one node's +// bind failure is a reportable error rather than a process-wide panic. The +// channel should be buffered (>= 2: one HTTP + one WS listener). +func (app *App) SetEVMServeErr(ch chan<- error) { app.evmServeErr = ch } + +// EVMHTTPServer returns the EVM JSON-RPC HTTP listener constructed in +// RegisterLocalServices, or nil if HTTP serving is disabled. An embedding +// orchestrator calls Stop() on it at teardown. +func (app *App) EVMHTTPServer() evmrpc.EVMServer { return app.evmHTTPServer } + +// EVMWebSocketServer returns the EVM JSON-RPC WebSocket listener, or nil if WS +// serving is disabled. 
+func (app *App) EVMWebSocketServer() evmrpc.EVMServer { return app.evmWSServer } + // RegisterSwaggerAPI registers swagger route with API Server func RegisterSwaggerAPI(rtr *mux.Router) { statikFS, err := fs.NewWithNamespace("swagger") diff --git a/inprocess/appoptions.go b/inprocess/appoptions.go new file mode 100644 index 0000000000..c3ade3ee88 --- /dev/null +++ b/inprocess/appoptions.go @@ -0,0 +1,41 @@ +//go:build inprocess + +package inprocess + +import "github.com/sei-protocol/sei-chain/app" + +// appOptions is the per-node servertypes.AppOptions the harness injects into +// app.New. app.TestAppOpts hard-disables the EVM HTTP/WS listeners to avoid port +// clashes in single-app tests; the harness needs the opposite — EVM enabled on +// distinct per-node ports (recipe #3) — plus the chain-id the sei-chain helpers +// hardcode. Unknown keys return nil, matching servertypes.AppOptions semantics +// (callers treat a nil as "unset, use the default"). +type appOptions struct { + chainID string + httpPort int + wsPort int +} + +func (o appOptions) Get(key string) interface{} { + switch key { + case "chain-id": + return o.chainID + case "evm.http_enabled": + return true + case "evm.http_port": + return o.httpPort + case "evm.ws_enabled": + return true + case "evm.ws_port": + return o.wsPort + case app.FlagSCEnable: + return true + case app.FlagSCSnapshotInterval: + return uint32(0) + case app.FlagSSEnable: + return true + case app.FlagSSBackend: + return "pebbledb" + } + return nil +} diff --git a/inprocess/doc.go b/inprocess/doc.go new file mode 100644 index 0000000000..cf15f96395 --- /dev/null +++ b/inprocess/doc.go @@ -0,0 +1,49 @@ +//go:build inprocess + +// Package inprocess stands up N sei-chain validators in a single Go process, +// reaching real CometBFT consensus and each serving its own full RPC stack +// (Tendermint RPC + EVM JSON-RPC HTTP/WS + gRPC), with deterministic teardown. 
+// +// It is the in-process provisioning foundation for the SDK "local" provider +// (design: bdchatham-designs/designs/test-harness/sdk-local-provider-lld.md). +// The package is gated behind the `inprocess` build tag so its heavy +// sei-tendermint/sei-cosmos bring-up never leaks into a normal `seid` build. +// +// # Usage +// +// net, err := inprocess.Start(ctx, inprocess.Options{Validators: 4}) +// if err != nil { ... } +// defer net.Close() +// if err := net.WaitReady(ctx); err != nil { ... } +// rpc := net.Node(0).TendermintRPC() // http://127.0.0.1:PORT +// +// # Why a native API, not the SDK sei.Provider interface +// +// The LLD's target is for Start to back the SDK's sei.Provider so suites written +// against sei.Open(ctx, "local") run unchanged. That wiring is deferred: the SDK +// lives in the github.com/sei-protocol/sei-k8s-controller module, which declares +// `go >= 1.26.0`; sei-chain runs go 1.25.6, so importing the SDK forces a +// chain-wide toolchain bump (and pulls the controller's controller-runtime/AWS +// dep graph into the seid build). The handle methods here intentionally mirror +// sei.NodeHandle / sei.NetworkHandle so a thin adapter can satisfy the SDK +// interface once the toolchain skew is resolved — see Node and Network below. +// +// # Recipe (the gotchas that make N>1 consensus + per-node RPC work) +// +// These are the load-bearing deltas vs sei-cosmos/testutil/network.New, proven +// by the N-RPC spike and preserved here: +// +// 1. genDoc.Validators = nil — let CometBFT derive the valset from the app's +// InitChain response. testutil/network sets it to []{self}, which fails +// consensus replay for N>1. +// 2. Full P2P mesh — persistent-peers wired nodeID@127.0.0.1:p2pPort across all +// N (testutil/network wires zero). +// 3. Injected AppOptions enable EVM HTTP/WS on per-node ports (app.TestAppOpts +// hard-disables them). +// 4. 
tmCfg.Instrumentation.Prometheus = false — avoids the dup-registry panic; +// with metrics off no evmrpc/EVM-keeper de-globalization is needed. +// 5. Listeners scoped to 127.0.0.1 (EVM defaults to 0.0.0.0, TM RPC to [::]). +// 6. MaxIncomingConnectionAttempts raised — loopback collapses all peers onto +// 127.0.0.1, so the router's IP-keyed conn-tracker counts the startup burst +// on one key. +package inprocess diff --git a/inprocess/genesis.go b/inprocess/genesis.go new file mode 100644 index 0000000000..4b18ea8825 --- /dev/null +++ b/inprocess/genesis.go @@ -0,0 +1,173 @@ +//go:build inprocess + +package inprocess + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/sei-protocol/sei-chain/sei-cosmos/client" + "github.com/sei-protocol/sei-chain/sei-cosmos/client/tx" + "github.com/sei-protocol/sei-chain/sei-cosmos/codec" + "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/keyring" + cryptotypes "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/types" + "github.com/sei-protocol/sei-chain/sei-cosmos/testutil" + sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" + authtypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/auth/types" + banktypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/bank/types" + "github.com/sei-protocol/sei-chain/sei-cosmos/x/genutil" + genutiltypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/genutil/types" + stakingtypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/staking/types" + tmtime "github.com/sei-protocol/sei-chain/sei-tendermint/libs/time" + tmtypes "github.com/sei-protocol/sei-chain/sei-tendermint/types" +) + +// genesisBuilder accumulates per-validator accounts, balances, and gentxs across +// the key-generation pass, then assembles a shared genesis whose validator set +// is left EMPTY so every node derives the consensus valset from its InitChain +// response (recipe #1) — the single most important delta from testutil/network. 
+// +// This is a self-contained reimplementation of the unexported initGenFiles / +// collectGenFiles / writeFile helpers in sei-cosmos/testutil/network: lifting +// them verbatim would require exporting them from a production cosmos package. +// They use only exported cosmos APIs, so reimplementing keeps the harness free +// of any sei-cosmos source change. +type genesisBuilder struct { + codec codec.Codec + txConfig client.TxConfig + chainID string + bondDenom string + + accounts []authtypes.GenesisAccount + balances []banktypes.Balance +} + +// fundValidator stores a validator operator key in kb, funds its genesis account +// + balances, and writes its self-delegation gentx to gentxsDir keyed by moniker. +// It returns the operator address for downstream client wiring. +func (b *genesisBuilder) fundValidator( + kb keyring.Keyring, + moniker string, + pubKey cryptotypes.PubKey, + algo keyring.SignatureAlgo, + accountTokens, stakingTokens, bondedTokens sdk.Int, + p2pHost, p2pPort, nodeID, gentxsDir string, +) (sdk.AccAddress, error) { + addr, _, err := testutil.GenerateSaveCoinKey(kb, moniker, "", true, algo) + if err != nil { + return nil, fmt.Errorf("generate key for %s: %w", moniker, err) + } + + balances := sdk.NewCoins( + sdk.NewCoin(fmt.Sprintf("%stoken", moniker), accountTokens), + sdk.NewCoin(b.bondDenom, stakingTokens), + ) + b.balances = append(b.balances, banktypes.Balance{Address: addr.String(), Coins: balances.Sort()}) + b.accounts = append(b.accounts, authtypes.NewBaseAccount(addr, nil, 0, 0)) + + commission, err := sdk.NewDecFromStr("0.5") + if err != nil { + return nil, err + } + createValMsg, err := stakingtypes.NewMsgCreateValidator( + sdk.ValAddress(addr), pubKey, + sdk.NewCoin(b.bondDenom, bondedTokens), + stakingtypes.NewDescription(moniker, "", "", "", ""), + stakingtypes.NewCommissionRates(commission, sdk.OneDec(), sdk.OneDec()), + sdk.OneInt(), + ) + if err != nil { + return nil, err + } + + memo := fmt.Sprintf("%s@%s:%s", nodeID, p2pHost, 
p2pPort) + txb := b.txConfig.NewTxBuilder() + if err := txb.SetMsgs(createValMsg); err != nil { + return nil, err + } + txb.SetFeeAmount(sdk.NewCoins(sdk.NewCoin(fmt.Sprintf("%stoken", moniker), sdk.NewInt(0)))) + txb.SetGasLimit(1_000_000) + txb.SetMemo(memo) + txf := tx.Factory{}.WithChainID(b.chainID).WithMemo(memo).WithKeybase(kb).WithTxConfig(b.txConfig) + if err := tx.Sign(txf, moniker, txb, true); err != nil { + return nil, err + } + txBz, err := b.txConfig.TxJSONEncoder()(txb.GetTx()) + if err != nil { + return nil, err + } + if err := writeFile(moniker+".json", gentxsDir, txBz); err != nil { + return nil, err + } + return addr, nil +} + +// writeBaseGenesis writes a base genesis file (accounts + balances, empty +// validator set) to every validator's genesis path. Mirrors initGenFiles. +func (b *genesisBuilder) writeBaseGenesis(baseState map[string]json.RawMessage, genFiles []string) error { + var authGenState authtypes.GenesisState + b.codec.MustUnmarshalJSON(baseState[authtypes.ModuleName], &authGenState) + packed, err := authtypes.PackAccounts(b.accounts) + if err != nil { + return err + } + authGenState.Accounts = append(authGenState.Accounts, packed...) + baseState[authtypes.ModuleName] = b.codec.MustMarshalJSON(&authGenState) + + var bankGenState banktypes.GenesisState + b.codec.MustUnmarshalJSON(baseState[banktypes.ModuleName], &bankGenState) + bankGenState.Balances = append(bankGenState.Balances, b.balances...) + baseState[banktypes.ModuleName] = b.codec.MustMarshalJSON(&bankGenState) + + appStateJSON, err := json.MarshalIndent(baseState, "", " ") + if err != nil { + return err + } + genDoc := tmtypes.GenesisDoc{ + ChainID: b.chainID, + AppState: appStateJSON, + Validators: nil, // recipe #1: derive valset from InitChain. 
+ } + for _, gf := range genFiles { + if err := genDoc.SaveAs(gf); err != nil { + return err + } + } + return nil +} + +// collectGentxs folds every validator's gentx into each node's genesis app state +// under one canonical genesis time (consensus timestamp validation diverges if +// the nodes disagree on GenesisTime). Mirrors collectGenFiles. +func (b *genesisBuilder) collectGentxs(nodes []*node, gentxsDir string) error { + genTime := tmtime.Now() + for _, n := range nodes { + initCfg := genutiltypes.NewInitConfig(b.chainID, gentxsDir, n.nodeID, n.pubKey) + genFile := n.tmCfg.GenesisFile() + genDoc, err := tmtypes.GenesisDocFromFile(genFile) + if err != nil { + return err + } + appState, err := genutil.GenAppStateFromConfig( + b.codec, b.txConfig, n.tmCfg, initCfg, *genDoc, banktypes.GenesisBalancesIterator{}, + ) + if err != nil { + return err + } + if err := genutil.ExportGenesisFileWithTime(genFile, b.chainID, nil, appState, genTime); err != nil { + return err + } + } + return nil +} + +// writeFile writes contents under dir/name, creating dir. Mirrors the network +// package's unexported writeFile. +func writeFile(name, dir string, contents []byte) error { + if err := os.MkdirAll(dir, 0o750); err != nil { + return err + } + return os.WriteFile(filepath.Join(dir, name), contents, 0o600) +} diff --git a/inprocess/handle.go b/inprocess/handle.go new file mode 100644 index 0000000000..efa24b741b --- /dev/null +++ b/inprocess/handle.go @@ -0,0 +1,155 @@ +//go:build inprocess + +package inprocess + +import ( + "context" + "fmt" + "net/http" + "os" + "time" + + "github.com/sei-protocol/sei-chain/evmrpc" +) + +// Node is a handle to one running in-process validator. Its method set mirrors +// the SDK's sei.NodeHandle (EVMRPC/TendermintRPC/REST/WaitReady/Object) so a thin +// adapter can satisfy that interface once the SDK toolchain skew is resolved +// (see doc.go). 
Endpoint getters return loopback URLs that are valid as soon as +// the node is started; WaitReady gates on the listeners actually serving. +type Node struct{ n *node } + +// Name is the node's moniker (node0, node1, ...). +func (h Node) Name() string { return h.n.moniker } + +// Namespace is "" for in-process nodes (no k8s namespace); present for SDK +// handle parity. +func (h Node) Namespace() string { return "" } + +// TendermintRPC is the node's CometBFT RPC base URL (http://127.0.0.1:PORT). +func (h Node) TendermintRPC() string { return "http://" + stripScheme(h.n.rpcAddr) } + +// EVMRPC is the node's EVM JSON-RPC HTTP URL. +func (h Node) EVMRPC() string { return fmt.Sprintf("http://127.0.0.1:%d", h.n.httpPort) } + +// EVMWS is the node's EVM JSON-RPC WebSocket URL. Not part of the SDK +// NodeHandle surface, but the in-process harness binds it, so it is exposed. +func (h Node) EVMWS() string { return fmt.Sprintf("ws://127.0.0.1:%d", h.n.wsPort) } + +// REST is "" — the harness does not enable the Cosmos LCD listener (validators +// serve none by default; present for SDK handle parity). +func (h Node) REST() string { return "" } + +// GRPC is the node's Cosmos gRPC address (host:port). Not in the SDK NodeHandle +// surface (gRPC is not a published status endpoint); exposed for in-process dials. +func (h Node) GRPC() string { return h.n.grpcAddr } + +// Object returns the live *node.Node behind the handle (SDK escape hatch: the +// dynamic value behind any). Read-oriented — driving it is an in-process-only +// capability k8s mode never offers. +func (h Node) Object() any { return h.n.tmNode } + +// ServeErr returns the channel EVM listener Start() failures are reported on +// (instead of the process-wide panic the production path uses). A non-nil +// receive means that node's EVM listener failed to bind; consensus may still be +// healthy. Buffered (cap 2: HTTP + WS). 
+func (h Node) ServeErr() <-chan error { return h.n.serveErr } + +// WaitReady blocks until this node has joined consensus (height advancing) and +// its EVM listener is serving, or ctx fires. +func (h Node) WaitReady(ctx context.Context, hc *http.Client) error { + if err := waitHeightAdvances(ctx, hc, h.TendermintRPC(), 1); err != nil { + return fmt.Errorf("%s tendermint: %w", h.n.moniker, err) + } + if err := waitEVMServing(ctx, hc, h.EVMRPC()); err != nil { + return fmt.Errorf("%s evm: %w", h.n.moniker, err) + } + return nil +} + +// Node returns a handle to the i-th validator (0-based). It panics on an +// out-of-range index — a programming error, not a runtime condition. +func (net *Network) Node(i int) Node { return Node{n: net.nodes[i]} } + +// Nodes returns handles to every validator in index order. +func (net *Network) Nodes() []Node { + out := make([]Node, len(net.nodes)) + for i := range net.nodes { + out[i] = Node{n: net.nodes[i]} + } + return out +} + +// Len is the validator count. +func (net *Network) Len() int { return len(net.nodes) } + +// WaitReady blocks until every node has joined consensus and is serving EVM, or +// ctx fires. It is the heavy readiness gate (per-node height-advance + EVM +// probe), distinct from Start (which only constructs + starts the nodes). Nodes are probed one at a time in index order and the first error is returned. +func (net *Network) WaitReady(ctx context.Context) error { + hc := &http.Client{Timeout: 5 * time.Second} + for i := range net.nodes { + if err := net.Node(i).WaitReady(ctx, hc); err != nil { + return err + } + } + return nil +} + +// Close tears every node down deterministically and is idempotent. Order: +// node by node, stop the tendermint service (halts consensus + block production) and then that +// node's EVM HTTP/WS listeners; afterwards drain the EVM worker pool and remove the temp dir the +// harness owns. Safe to call from a defer on both the success and partial-start +// paths; nodes that never started are skipped.
+func (net *Network) Close() { + if net.closed { + return + } + net.closed = true + + for _, n := range net.nodes { + stopNode(n) + } + // The EVM worker pool is a process global shared by all N nodes (the LLD's + // known cross-node coupling, accepted for the metrics-off MVP). Drain it once + // here, not per node, so its goroutines do not leak across test runs. + if pool := evmrpc.GetGlobalWorkerPool(); pool != nil { + pool.Close() + } + + if net.ownBaseDir && net.baseDir != "" { + _ = os.RemoveAll(net.baseDir) + } +} + +// stopNode shuts one node's tendermint service and EVM listeners. Each step is +// guarded so a nil (never-started) field on a partial-start path is a no-op. +func stopNode(n *node) { + if n.tmNode != nil && n.tmNode.IsRunning() { + n.tmNode.Stop() + n.tmNode.Wait() + } + if n.app != nil { + stopEVMServer(n.app.EVMHTTPServer()) + stopEVMServer(n.app.EVMWebSocketServer()) + } +} + +// stopEVMServer stops an EVM listener if it was constructed (nil when the +// listener was disabled). +func stopEVMServer(s evmrpc.EVMServer) { + if s != nil { + s.Stop() + } +} + +// stripScheme drops a leading scheme:// from a listen address so it can be +// recomposed with a concrete scheme (TM RPC config carries tcp://). 
+func stripScheme(addr string) string { + for _, p := range []string{"tcp://", "http://"} { + if len(addr) >= len(p) && addr[:len(p)] == p { + return addr[len(p):] + } + } + return addr +} diff --git a/inprocess/harness.go b/inprocess/harness.go new file mode 100644 index 0000000000..59fc874d46 --- /dev/null +++ b/inprocess/harness.go @@ -0,0 +1,391 @@ +//go:build inprocess + +package inprocess + +import ( + "context" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + dbm "github.com/tendermint/tm-db" + "go.opentelemetry.io/otel/sdk/trace" + + "github.com/sei-protocol/sei-chain/app" + "github.com/sei-protocol/sei-chain/sei-cosmos/client" + "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/keyring" + cryptotypes "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/types" + "github.com/sei-protocol/sei-chain/sei-cosmos/server" + srvconfig "github.com/sei-protocol/sei-chain/sei-cosmos/server/config" + sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" + "github.com/sei-protocol/sei-chain/sei-cosmos/x/genutil" + "github.com/sei-protocol/sei-chain/sei-tendermint/config" + tmnode "github.com/sei-protocol/sei-chain/sei-tendermint/node" + rpclocal "github.com/sei-protocol/sei-chain/sei-tendermint/rpc/client/local" + tmtypes "github.com/sei-protocol/sei-chain/sei-tendermint/types" + "github.com/sei-protocol/sei-chain/sei-wasmd/x/wasm" +) + +// defaultChainID is the chain-id the sei-chain integration helpers and CLI +// hardcode; a different value fails every tx, so it is the default and not +// merely a placeholder. +const defaultChainID = "sei" + +// Options configures a Start. The zero value is invalid (Validators must be +// >= 1); use sensible explicit values. +type Options struct { + // Validators is the number of in-process validators (>= 1). Each is a full + // (app, node.New) pair serving its own RPC stack. + Validators int + + // ChainID is the genesis chain id; "" defaults to "sei". 
+ ChainID string + + // BaseDir is the parent dir for per-node homes; "" creates a temp dir the + // harness owns and removes at Close. A caller-supplied BaseDir is NOT removed. + BaseDir string + + // TimeoutCommit is the consensus commit timeout; 0 defaults to 2s. The + // dominant cadence lever — lower it (e.g. 500ms) for faster tests. + TimeoutCommit time.Duration +} + +func (o Options) withDefaults() Options { + if o.ChainID == "" { + o.ChainID = defaultChainID + } + if o.TimeoutCommit == 0 { + o.TimeoutCommit = 2 * time.Second + } + return o +} + +// node is one in-process validator: its identity, listener addresses, app, and +// running tendermint service. Exported access is via the Node value handle (handle.go) +// so the running internals stay encapsulated. +type node struct { + moniker string + nodeID string + pubKey cryptotypes.PubKey + addr sdk.AccAddress + home string + tmCfg *config.Config + clientCx client.Context + + p2pHost string + p2pPort string + rpcAddr string // tcp://127.0.0.1:PORT (TM RPC listen address) + grpcAddr string // 127.0.0.1:PORT + httpPort int // EVM JSON-RPC HTTP + wsPort int // EVM JSON-RPC WS + + app *app.App + tmNode rpclocal.NodeService + rpc *rpclocal.Local + serveErr chan error // EVM listener Start() failures (recipe: no process-wide panic) +} + +// Network is a handle to a running in-process mesh. It owns the lifecycle: Close +// tears every node down deterministically. Not goroutine-safe across calls. +type Network struct { + opts Options + baseDir string + ownBaseDir bool // true => Close removes baseDir + nodes []*node + closed bool +} + +// Start stands up opts.Validators in-process validators, wires a full P2P mesh, +// starts each node and registers its EVM listeners (which wait on the app's start signal before serving), and returns once every node is +// constructed and started (NOT once consensus is live — call WaitReady for that). +// +// On any error mid-bring-up, every already-started node is torn down before +// returning, so a partial failure leaks nothing.
The caller still must Close the +// returned Network on the success path; Start does not register cleanup. +func Start(ctx context.Context, opts Options) (_ *Network, retErr error) { + opts = opts.withDefaults() + if opts.Validators < 1 { + return nil, fmt.Errorf("inprocess: Options.Validators must be >= 1, got %d", opts.Validators) + } + + baseDir, ownBaseDir, err := resolveBaseDir(opts.BaseDir) + if err != nil { + return nil, err + } + net := &Network{opts: opts, baseDir: baseDir, ownBaseDir: ownBaseDir} + // Any error past this point tears down whatever came up (including the temp + // dir we own) so the caller never holds a half-built Network. + defer func() { + if retErr != nil { + net.Close() + } + }() + + enc := app.MakeEncodingConfig() + gb := &genesisBuilder{ + codec: enc.Marshaler, + txConfig: enc.TxConfig, + chainID: opts.ChainID, + bondDenom: sdk.DefaultBondDenom, + } + + if err := net.provisionNodes(enc, gb); err != nil { + return nil, err + } + wireMesh(net.nodes) + + baseState := app.ModuleBasics.DefaultGenesis(enc.Marshaler) + genFiles := make([]string, len(net.nodes)) + for i, n := range net.nodes { + genFiles[i] = n.tmCfg.GenesisFile() + } + if err := gb.writeBaseGenesis(baseState, genFiles); err != nil { + return nil, fmt.Errorf("write base genesis: %w", err) + } + if err := gb.collectGentxs(net.nodes, filepath.Join(baseDir, "gentxs")); err != nil { + return nil, fmt.Errorf("collect gentxs: %w", err) + } + + for _, n := range net.nodes { + if err := net.startNode(ctx, n, enc); err != nil { + return nil, fmt.Errorf("start %s: %w", n.moniker, err) + } + } + return net, nil +} + +// provisionNodes runs the first pass: per-node keys, node IDs, gentxs, isolated +// tendermint config, and loopback port allocation. It populates net.nodes. 
+func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { + algoStr := string(hdSecp256k1()) + for i := 0; i < net.opts.Validators; i++ { + moniker := fmt.Sprintf("node%d", i) + nodeDir := filepath.Join(net.baseDir, moniker, "simd") + clientDir := filepath.Join(net.baseDir, moniker, "simcli") + if err := os.MkdirAll(filepath.Join(nodeDir, "config"), 0o750); err != nil { + return err + } + if err := os.MkdirAll(clientDir, 0o750); err != nil { + return err + } + + tmCfg, addrs, err := buildNodeConfig(nodeDir, moniker, net.opts.TimeoutCommit) + if err != nil { + return err + } + + nodeID, pubKey, err := genutil.InitializeNodeValidatorFiles(tmCfg) + if err != nil { + return fmt.Errorf("init validator files for %s: %w", moniker, err) + } + + kb, err := keyring.New(sdk.KeyringServiceName(), keyring.BackendTest, clientDir, nil) + if err != nil { + return err + } + algos, _ := kb.SupportedAlgorithms() + algo, err := keyring.NewSigningAlgoFromString(algoStr, algos) + if err != nil { + return err + } + + addr, err := gb.fundValidator( + kb, moniker, pubKey, algo, + consensusTokens(1000), consensusTokens(500), consensusTokens(100), + addrs.p2pHost, addrs.p2pPort, nodeID, filepath.Join(net.baseDir, "gentxs"), + ) + if err != nil { + return err + } + + writeAppConfig(filepath.Join(nodeDir, "config/app.toml"), addrs.grpcAddr, net.opts) + + clientCx := client.Context{}. + WithKeyringDir(clientDir).WithKeyring(kb).WithHomeDir(tmCfg.RootDir). + WithChainID(net.opts.ChainID).WithInterfaceRegistry(enc.InterfaceRegistry). + WithCodec(enc.Marshaler).WithLegacyAmino(enc.Amino). 
+ WithTxConfig(enc.TxConfig).WithAccountRetriever(accountRetriever()) + + net.nodes = append(net.nodes, &node{ + moniker: moniker, nodeID: nodeID, pubKey: pubKey, addr: addr, + home: nodeDir, tmCfg: tmCfg, clientCx: clientCx, + p2pHost: addrs.p2pHost, p2pPort: addrs.p2pPort, + rpcAddr: addrs.rpcAddr, grpcAddr: addrs.grpcAddr, + httpPort: addrs.httpPort, wsPort: addrs.wsPort, + serveErr: make(chan error, 2), // one HTTP + one WS listener + }) + } + return nil +} + +// startNode builds the app, constructs + starts the tendermint node with an +// EMPTY-valset genesis (recipe #1), wires the local RPC client, and registers +// the EVM listeners. The node's EVM Start() failures land on n.serveErr instead +// of panicking (recipe: a single bind failure must not kill all N nodes). +func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error { + theApp := newNodeApp(n, enc) + theApp.SetEVMServeErr(n.serveErr) + n.app = theApp + + // recipe #1: zero the validator set so CometBFT derives it from InitChain. + genDoc, err := tmtypes.GenesisDocFromFile(n.tmCfg.GenesisFile()) + if err != nil { + return err + } + genDoc.Validators = nil + + tmNode, err := tmnode.New( + ctx, n.tmCfg, func() {}, theApp, genDoc, + []trace.TracerProviderOption{}, tmnode.NoOpMetricsProvider(), + tmtypes.DefaultConsensusPolicy(), + ) + if err != nil { + return fmt.Errorf("node.New: %w", err) + } + n.tmNode = tmNode + if err := tmNode.Start(ctx); err != nil { + return fmt.Errorf("node.Start: %w", err) + } + + lc, err := rpclocal.New(tmNode) + if err != nil { + return err + } + n.rpc = lc + n.clientCx = n.clientCx.WithClient(lc) + // RegisterLocalServices builds the EVM HTTP/WS listeners (their goroutines + // block on the first-block start signal) and the gRPC tx service. + theApp.RegisterLocalServices(lc, n.clientCx.TxConfig) + return nil +} + +// resolveBaseDir returns the base dir for node homes and whether the harness owns +// it (and so must remove it at Close). 
+func resolveBaseDir(dir string) (string, bool, error) { + if dir != "" { + return dir, false, nil + } + tmp, err := os.MkdirTemp("", "sei-inprocess-") + if err != nil { + return "", false, fmt.Errorf("create base dir: %w", err) + } + return tmp, true, nil +} + +// nodeAddrs holds one node's loopback listener addresses. +type nodeAddrs struct { + p2pHost, p2pPort string + rpcAddr string + grpcAddr string + httpPort, wsPort int +} + +// buildNodeConfig builds an isolated per-node tendermint config with loopback +// listeners and the conn-tracker ceiling raised (recipes #4, #5, #6). +func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*config.Config, nodeAddrs, error) { + sctx := server.NewDefaultContext() + tmCfg := sctx.Config + tmCfg.Mode = config.ModeValidator + tmCfg.Moniker = moniker + tmCfg.SetRoot(nodeDir) + tmCfg.Consensus.UnsafeCommitTimeoutOverride = timeoutCommit + tmCfg.TxIndex = config.TestTxIndexConfig() + // recipe #6: loopback collapses every peer onto 127.0.0.1, so the router's + // IP-keyed conn-tracker counts all N-1 inbound on one key. AllowDuplicateIP + // is a peer-manager flag and does NOT touch the router conn-tracker. + tmCfg.P2P.MaxIncomingConnectionAttempts = 10000 + tmCfg.P2P.AllowDuplicateIP = true + // recipe #4: metrics-off avoids the prometheus.DefaultRegisterer dup panic + // and lets the evmrpc/EVM-keeper metrics globals commingle harmlessly. + tmCfg.Instrumentation.Prometheus = false + + // recipe #5: server.FreeTCPAddr composes tcp://0.0.0.0:PORT — a publicly-bound + // listener. An in-process harness must scope every listener to loopback, so we + // take only the free port and compose the 127.0.0.1 address ourselves. 
+ var a nodeAddrs + rpcPort, err := freePort() + if err != nil { + return nil, a, err + } + a.rpcAddr = fmt.Sprintf("tcp://127.0.0.1:%d", rpcPort) + tmCfg.RPC.ListenAddress = a.rpcAddr + + grpcPort, err := freePort() + if err != nil { + return nil, a, err + } + a.grpcAddr = fmt.Sprintf("127.0.0.1:%d", grpcPort) + + p2pPort, err := freePort() + if err != nil { + return nil, a, err + } + a.p2pHost = "127.0.0.1" + a.p2pPort = strconv.Itoa(p2pPort) + tmCfg.P2P.ListenAddress = fmt.Sprintf("tcp://127.0.0.1:%d", p2pPort) + + if a.httpPort, err = freePort(); err != nil { + return nil, a, err + } + if a.wsPort, err = freePort(); err != nil { + return nil, a, err + } + return tmCfg, a, nil +} + +// wireMesh wires a full persistent-peer mesh: every node lists all others as +// nodeID@127.0.0.1:p2pPort (recipe #2 — testutil/network wires zero peers). +func wireMesh(nodes []*node) { + for i, n := range nodes { + var peers []string + for j, peer := range nodes { + if j == i { + continue + } + peers = append(peers, fmt.Sprintf("%s@127.0.0.1:%s", peer.nodeID, peer.p2pPort)) + } + n.tmCfg.P2P.PersistentPeers = strings.Join(peers, ",") + } +} + +// newNodeApp builds a real sei-chain app for one node with EVM serving on its +// per-node ports against an in-memory DB and on-disk home. +func newNodeApp(n *node, enc encoding) *app.App { + return app.New( + dbm.NewMemDB(), + io.Discard, + true, + map[int64]bool{}, + n.home, + 1, + false, + n.tmCfg, + enc, + wasm.EnableAllProposals, + appOptions{chainID: n.clientCx.ChainID, httpPort: n.httpPort, wsPort: n.wsPort}, + app.EmptyWasmOpts, + nil, + ) +} + +// writeAppConfig writes a minimal per-node app.toml enabling gRPC on grpcAddr. +func writeAppConfig(path, grpcAddr string, opts Options) { + appCfg := srvconfig.DefaultConfig() + appCfg.Telemetry.Enabled = false + appCfg.GRPC.Enable = true + appCfg.GRPC.Address = grpcAddr + srvconfig.WriteConfigFile(path, appCfg) +} + +// freePort allocates a free loopback TCP port via server.FreeTCPAddr. 
+func freePort() (int, error) { + _, portStr, err := server.FreeTCPAddr() + if err != nil { + return 0, err + } + return strconv.Atoi(portStr) +} diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go new file mode 100644 index 0000000000..1dbd5eea70 --- /dev/null +++ b/inprocess/harness_test.go @@ -0,0 +1,124 @@ +//go:build inprocess + +package inprocess + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/sei-protocol/sei-chain/sei-cosmos/client/tx" + sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" + banktypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/bank/types" +) + +// TestInProcessNetwork productionizes the N-RPC spike: it stands up N=4 +// validators in one process, asserts every node serves Tendermint RPC + EVM +// JSON-RPC, and round-trips a tx (broadcast on node 0, observed on node 1's +// independent RPC) — proving real consensus + N independent RPC stacks. +// +// Run: +// +// go test -tags inprocess -run TestInProcessNetwork -v -timeout 300s ./inprocess/ +func TestInProcessNetwork(t *testing.T) { + const n = 4 + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) + defer cancel() + + net, err := Start(ctx, Options{ + Validators: n, + TimeoutCommit: time.Second, // tighten the cadence for a faster test. + }) + if err != nil { + t.Fatalf("Start: %v", err) + } + defer net.Close() + + if net.Len() != n { + t.Fatalf("Len = %d, want %d", net.Len(), n) + } + + // VERIFY 1+2: every node reaches consensus and serves EVM (WaitReady gates on + // height-advance + eth_blockNumber per node). + if err := net.WaitReady(ctx); err != nil { + t.Fatalf("WaitReady: %v", err) + } + for i := 0; i < n; i++ { + nd := net.Node(i) + t.Logf("node %s: tm=%s evm=%s ws=%s grpc=%s", nd.Name(), nd.TendermintRPC(), nd.EVMRPC(), nd.EVMWS(), nd.GRPC()) + } + + // No EVM listener reported a bind failure. 
+ for i := 0; i < n; i++ { + select { + case err := <-net.Node(i).ServeErr(): + t.Fatalf("node %s EVM serve error: %v", net.Node(i).Name(), err) + default: + } + } + + // VERIFY 3: tx broadcast on node 0 is observable on node 1's independent RPC. + assertCrossNodeTxRoundTrip(t, ctx, net) +} + +// assertCrossNodeTxRoundTrip broadcasts a bank send from node 0's validator key +// via node 0's RPC, then polls node 1's RPC until the tx is queryable by hash — +// the load-bearing proof that the two nodes share consensus and each serves an +// independent RPC stack. +func assertCrossNodeTxRoundTrip(t *testing.T, ctx context.Context, net *Network) { + t.Helper() + n0, n1 := net.nodes[0], net.nodes[1] + bondDenom := sdk.DefaultBondDenom + + to := sdk.AccAddress(make([]byte, 20)) + msg := banktypes.NewMsgSend(n0.addr, to, sdk.NewCoins(sdk.NewCoin(bondDenom, sdk.NewInt(1)))) + + num, seq, err := n0.clientCx.AccountRetriever.GetAccountNumberSequence(n0.clientCx, n0.addr) + if err != nil { + t.Fatalf("fetch account for node0: %v", err) + } + txf := tx.Factory{}. + WithChainID(net.opts.ChainID).WithKeybase(n0.clientCx.Keyring). + WithTxConfig(n0.clientCx.TxConfig).WithGas(300000). + WithFees(fmt.Sprintf("200000%s", bondDenom)). + WithAccountRetriever(n0.clientCx.AccountRetriever). 
+ WithAccountNumber(num).WithSequence(seq) + + txb, err := tx.BuildUnsignedTx(txf, msg) + if err != nil { + t.Fatalf("build tx: %v", err) + } + if err := tx.Sign(txf, n0.moniker, txb, true); err != nil { + t.Fatalf("sign tx: %v", err) + } + txBz, err := n0.clientCx.TxConfig.TxEncoder()(txb.GetTx()) + if err != nil { + t.Fatalf("encode tx: %v", err) + } + + res, err := n0.rpc.BroadcastTxSync(ctx, txBz) + if err != nil { + t.Fatalf("broadcast via node0: %v", err) + } + t.Logf("broadcast via node0: code=%d hash=%X", res.Code, res.Hash) + + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + q, err := n1.rpc.Tx(ctx, res.Hash, false) + if err == nil && q != nil { + t.Logf("PASS: tx %X broadcast on node0 found on node1 at height %d (code=%d)", res.Hash, q.Height, q.TxResult.Code) + return + } + time.Sleep(500 * time.Millisecond) + } + t.Fatalf("tx %X not observed on node1 within deadline", res.Hash) +} + +// TestStartRejectsZeroValidators guards the input validation. +func TestStartRejectsZeroValidators(t *testing.T) { + _, err := Start(context.Background(), Options{Validators: 0}) + if err == nil { + t.Fatal("Start with 0 validators: want error, got nil") + } +} diff --git a/inprocess/readiness.go b/inprocess/readiness.go new file mode 100644 index 0000000000..4306d9754d --- /dev/null +++ b/inprocess/readiness.go @@ -0,0 +1,141 @@ +//go:build inprocess + +package inprocess + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" +) + +// Readiness probes mirror the SDK's sei.WaitHeightAdvances / sei.WaitEVMServing +// (sdk/sei/readiness.go). They are duplicated rather than imported because the +// SDK module declares a newer go toolchain than sei-chain builds with (see +// doc.go); when that skew is resolved the harness should delegate to the SDK +// helpers and drop these. Kept stdlib-only and behavior-compatible so the swap +// is mechanical. + +// probeInterval is the readiness poll cadence. 
+var probeInterval = 500 * time.Millisecond + +// waitHeightAdvances blocks until tmRPC's committed height rises by >= delta +// from the first successful read — proof the chain is producing blocks, not +// merely reachable (a stalled node reports catching_up == false at a frozen +// height). ctx bounds the wait. +func waitHeightAdvances(ctx context.Context, hc *http.Client, tmRPC string, delta int64) error { + tick := time.NewTicker(probeInterval) + defer tick.Stop() + var start, last int64 = -1, -1 + for { + if h, ok := latestHeight(ctx, hc, tmRPC); ok { + if start < 0 { + start = h + } + last = h + if h >= start+delta { + return nil + } + } + select { + case <-ctx.Done(): + return fmt.Errorf("%s height did not advance +%d (start=%d last=%d): %w", tmRPC, delta, start, last, ctx.Err()) + case <-tick.C: + } + } +} + +// waitEVMServing blocks until evmRPC answers eth_blockNumber with a non-empty, +// error-free result — proof the EVM JSON-RPC listener is bound and serving. +func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string) error { + const body = `{"jsonrpc":"2.0","id":1,"method":"eth_blockNumber","params":[]}` + tick := time.NewTicker(probeInterval) + defer tick.Stop() + for { + if raw, ok := getJSON(ctx, hc, http.MethodPost, evmRPC, body); ok { + var r struct { + Result string `json:"result"` + Error *struct { + Message string `json:"message"` + } `json:"error,omitempty"` + } + if json.Unmarshal(raw, &r) == nil && r.Error == nil && r.Result != "" { + return nil + } + } + select { + case <-ctx.Done(): + return fmt.Errorf("%s eth_blockNumber not serving before deadline: %w", evmRPC, ctx.Err()) + case <-tick.C: + } + } +} + +// latestHeight reads tmRPC's committed block height from /status. ok=false on an +// unreachable endpoint or unparseable body. Accepts both the enveloped and +// unwrapped /status shapes the Sei fork emits. 
+func latestHeight(ctx context.Context, hc *http.Client, tmRPC string) (int64, bool) { + body, ok := getJSON(ctx, hc, http.MethodGet, tmRPC+"/status", "") + if !ok { + return 0, false + } + var s struct { + Result *struct { + SyncInfo syncInfo `json:"sync_info"` + } `json:"result,omitempty"` + SyncInfo syncInfo `json:"sync_info"` + } + if json.Unmarshal(body, &s) != nil { + return 0, false + } + si := s.SyncInfo + if s.Result != nil && s.Result.SyncInfo.LatestBlockHeight != "" { + si = s.Result.SyncInfo + } + h, err := strconv.ParseInt(si.LatestBlockHeight, 10, 64) + if err != nil { + return 0, false + } + return h, true +} + +type syncInfo struct { + LatestBlockHeight string `json:"latest_block_height"` + CatchingUp bool `json:"catching_up"` +} + +// getJSON performs one request and returns the body on HTTP 200, else ok=false +// (a connection error or non-200 just means "not ready yet"). +func getJSON(ctx context.Context, hc *http.Client, method, url, body string) ([]byte, bool) { + if hc == nil { + hc = http.DefaultClient + } + var rdr io.Reader + if body != "" { + rdr = strings.NewReader(body) + } + req, err := http.NewRequestWithContext(ctx, method, url, rdr) + if err != nil { + return nil, false + } + if body != "" { + req.Header.Set("Content-Type", "application/json") + } + resp, err := hc.Do(req) + if err != nil { + return nil, false + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return nil, false + } + out, err := io.ReadAll(resp.Body) + if err != nil { + return nil, false + } + return out, true +} diff --git a/inprocess/support.go b/inprocess/support.go new file mode 100644 index 0000000000..811576a467 --- /dev/null +++ b/inprocess/support.go @@ -0,0 +1,27 @@ +//go:build inprocess + +package inprocess + +import ( + appparams "github.com/sei-protocol/sei-chain/app/params" + "github.com/sei-protocol/sei-chain/sei-cosmos/client" + "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/hd" + sdk 
"github.com/sei-protocol/sei-chain/sei-cosmos/types" + authtypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/auth/types" +) + +// encoding is the codec/tx-config bundle threaded through app.New and genesis +// assembly. Aliased so the call sites read without the full package path. +type encoding = appparams.EncodingConfig + +// hdSecp256k1 is the default key-signing algorithm (matches testutil/network). +func hdSecp256k1() hd.PubKeyType { return hd.Secp256k1Type } + +// consensusTokens converts a consensus power to a token amount at the default +// power reduction — the per-validator funding/staking unit. +func consensusTokens(power int64) sdk.Int { + return sdk.TokensFromConsensusPower(power, sdk.DefaultPowerReduction) +} + +// accountRetriever is the client-side account/sequence reader used to build txs. +func accountRetriever() client.AccountRetriever { return authtypes.AccountRetriever{} } From a869e158056771d95d896b4666d34d30b4b2d4a3 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 14:05:47 -0700 Subject: [PATCH 02/10] feat(inprocess): in-process N-validator harness (C1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stands up N sei-chain validators in one Go process reaching real CometBFT consensus, each serving its own RPC stack (Tendermint RPC + EVM JSON-RPC HTTP/WS + gRPC), with deterministic teardown. Gated behind the inprocess build tag so the heavy bring-up never enters a normal seid build. The load-bearing recipe (vs testutil/network): empty genesis valset (derive from InitChain), full P2P mesh, EVM enabled on per-node loopback ports, metrics off, raised conn-tracker ceiling for the loopback burst. Productionization: fresh per-run chain-id (no cross-run genesis collision), partial-startup cleanup, per-node EVM serve-error channel, idempotent Close. 
Handle methods mirror the SDK sei.NodeHandle signatures by name so a future adapter satisfies the interface structurally — without importing the SDK (its module graph + grpc replace conflict would break the seid build). Test: TestInProcessNetwork stands up N=4, asserts each node serves its RPC stack, and round-trips a tx (broadcast on node0, observed on node1). go test -tags inprocess -run TestInProcessNetwork -v -timeout 300s ./inprocess/ Co-Authored-By: Claude Opus 4.8 --- inprocess/doc.go | 6 +++++- inprocess/handle.go | 32 ++++++++++++++++++++------------ inprocess/harness.go | 34 ++++++++++++++++++++++++++-------- inprocess/harness_test.go | 19 +++++++++++++++++++ 4 files changed, 70 insertions(+), 21 deletions(-) diff --git a/inprocess/doc.go b/inprocess/doc.go index cf15f96395..6807b43087 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -42,7 +42,11 @@ // hard-disables them). // 4. tmCfg.Instrumentation.Prometheus = false — avoids the dup-registry panic; // with metrics off no evmrpc/EVM-keeper de-globalization is needed. -// 5. Listeners scoped to 127.0.0.1 (EVM defaults to 0.0.0.0, TM RPC to [::]). +// 5. TM RPC / gRPC / P2P listeners scoped to 127.0.0.1 (they default to [::] / +// 0.0.0.0). The EVM HTTP/WS listeners bind 0.0.0.0 — evmrpc hardcodes the +// bind host (server.LocalAddress) with no config override — but on a free +// ephemeral port, dialed via 127.0.0.1. They are not loopback-scoped; a +// bind-host option in evmrpc would be the only way to tighten that. // 6. MaxIncomingConnectionAttempts raised — loopback collapses all peers onto // 127.0.0.1, so the router's IP-keyed conn-tracker counts the startup burst // on one key. diff --git a/inprocess/handle.go b/inprocess/handle.go index efa24b741b..cc54f4b2a9 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -12,6 +12,13 @@ import ( "github.com/sei-protocol/sei-chain/evmrpc" ) +// probeClient is the default HTTP client the readiness probes dial with. 
It is a +// package-level default so WaitReady takes only a ctx (mirroring the SDK's +// sei.NodeHandle.WaitReady), keeping the http client an internal detail. The +// short timeout bounds a single /status or eth_blockNumber probe — the overall +// wait is governed by the caller's ctx, not this. +var probeClient = &http.Client{Timeout: 5 * time.Second} + // Node is a handle to one running in-process validator. Its method set mirrors // the SDK's sei.NodeHandle (EVMRPC/TendermintRPC/REST/WaitReady/Object) so a thin // adapter can satisfy that interface once the SDK toolchain skew is resolved @@ -56,12 +63,14 @@ func (h Node) Object() any { return h.n.tmNode } func (h Node) ServeErr() <-chan error { return h.n.serveErr } // WaitReady blocks until this node has joined consensus (height advancing) and -// its EVM listener is serving, or ctx fires. -func (h Node) WaitReady(ctx context.Context, hc *http.Client) error { - if err := waitHeightAdvances(ctx, hc, h.TendermintRPC(), 1); err != nil { +// its EVM listener is serving, or ctx fires. Its single-ctx signature mirrors +// the SDK's sei.NodeHandle.WaitReady; the probe HTTP client is an internal +// default (probeClient). +func (h Node) WaitReady(ctx context.Context) error { + if err := waitHeightAdvances(ctx, probeClient, h.TendermintRPC(), 1); err != nil { return fmt.Errorf("%s tendermint: %w", h.n.moniker, err) } - if err := waitEVMServing(ctx, hc, h.EVMRPC()); err != nil { + if err := waitEVMServing(ctx, probeClient, h.EVMRPC()); err != nil { return fmt.Errorf("%s evm: %w", h.n.moniker, err) } return nil @@ -87,9 +96,8 @@ func (net *Network) Len() int { return len(net.nodes) } // ctx fires. It is the heavy readiness gate (per-node height-advance + EVM // probe), distinct from Start (which only constructs + starts the nodes). 
func (net *Network) WaitReady(ctx context.Context) error { - hc := &http.Client{Timeout: 5 * time.Second} for i := range net.nodes { - if err := net.Node(i).WaitReady(ctx, hc); err != nil { + if err := net.Node(i).WaitReady(ctx); err != nil { return err } } @@ -110,12 +118,12 @@ func (net *Network) Close() { for _, n := range net.nodes { stopNode(n) } - // The EVM worker pool is a process global shared by all N nodes (the LLD's - // known cross-node coupling, accepted for the metrics-off MVP). Drain it once - // here, not per node, so its goroutines do not leak across test runs. - if pool := evmrpc.GetGlobalWorkerPool(); pool != nil { - pool.Close() - } + // The EVM worker pool (evmrpc.GetGlobalWorkerPool) is a process-wide + // sync.Once singleton, NOT Network-owned. Deliberately not Closed here: + // Close is permanent (the Once never re-fires), so a second Start in the + // same process would inherit a closed pool and every EVM request would fail. + // Its goroutines are reaped at process exit. De-globalizing the pool in + // evmrpc is the proper fix for repeated Start/Close in one process. if net.ownBaseDir && net.baseDir != "" { _ = os.RemoveAll(net.baseDir) diff --git a/inprocess/harness.go b/inprocess/harness.go index 59fc874d46..b130cce6f4 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -4,6 +4,7 @@ package inprocess import ( "context" + "crypto/rand" "fmt" "io" "os" @@ -30,10 +31,23 @@ import ( "github.com/sei-protocol/sei-chain/sei-wasmd/x/wasm" ) -// defaultChainID is the chain-id the sei-chain integration helpers and CLI -// hardcode; a different value fails every tx, so it is the default and not -// merely a placeholder. -const defaultChainID = "sei" +// chainIDPrefix prefixes every harness-generated chain-id. The value is free — +// the harness signs its own txs with Options.ChainID, and it is NOT the EVM +// chain ID (the keeper derives that). 
A fresh token per Start mirrors the +// controller harness's runChainID discipline: a static id reused across runs +// collides with a prior run's persisted genesis and halts at height 1. +const chainIDPrefix = "sei-inprocess" + +// freshChainID returns a unique chain-id token (chainIDPrefix-<8 hex>). Falls +// back to a nanosecond timestamp if crypto/rand is unavailable, which still +// yields a distinct id per Start. +func freshChainID() string { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + return fmt.Sprintf("%s-%d", chainIDPrefix, time.Now().UnixNano()) + } + return fmt.Sprintf("%s-%x", chainIDPrefix, b[:]) +} // Options configures a Start. The zero value is invalid (Validators must be // >= 1); use sensible explicit values. @@ -42,7 +56,9 @@ type Options struct { // (app, node.New) pair serving its own RPC stack. Validators int - // ChainID is the genesis chain id; "" defaults to "sei". + // ChainID is the genesis chain id; "" generates a fresh per-run id + // (chainIDPrefix-) so a run never collides with a prior run's genesis. + // Set it explicitly only when a test pins a specific chain id. ChainID string // BaseDir is the parent dir for per-node homes; "" creates a temp dir the @@ -56,7 +72,7 @@ type Options struct { func (o Options) withDefaults() Options { if o.ChainID == "" { - o.ChainID = defaultChainID + o.ChainID = freshChainID() } if o.TimeoutCommit == 0 { o.TimeoutCommit = 2 * time.Second @@ -284,8 +300,10 @@ type nodeAddrs struct { httpPort, wsPort int } -// buildNodeConfig builds an isolated per-node tendermint config with loopback -// listeners and the conn-tracker ceiling raised (recipes #4, #5, #6). +// buildNodeConfig builds an isolated per-node tendermint config with loopback TM +// RPC / gRPC / P2P listeners and the conn-tracker ceiling raised (recipes #4, #5, +// #6). EVM bind-host is not config-scopable (evmrpc hardcodes 0.0.0.0); the EVM +// ports are allocated free here and dialed via loopback. 
func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*config.Config, nodeAddrs, error) { sctx := server.NewDefaultContext() tmCfg := sctx.Config diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index 1dbd5eea70..cf60edddc5 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -5,6 +5,7 @@ package inprocess import ( "context" "fmt" + "strings" "testing" "time" @@ -122,3 +123,21 @@ func TestStartRejectsZeroValidators(t *testing.T) { t.Fatal("Start with 0 validators: want error, got nil") } } + +// TestFreshChainIDPerRun pins the per-run unique chain-id discipline: an empty +// Options.ChainID must yield a distinct id each time, so a run never collides +// with a prior run's persisted genesis. Pure-function check — no bring-up. +func TestFreshChainIDPerRun(t *testing.T) { + a := Options{}.withDefaults().ChainID + b := Options{}.withDefaults().ChainID + if a == b { + t.Fatalf("fresh chain-id not unique across runs: %q == %q", a, b) + } + if !strings.HasPrefix(a, chainIDPrefix) { + t.Fatalf("chain-id %q lacks prefix %q", a, chainIDPrefix) + } + // An explicit ChainID is honored verbatim. + if got := (Options{ChainID: "pinned"}).withDefaults().ChainID; got != "pinned" { + t.Fatalf("explicit ChainID not honored: got %q", got) + } +} From 04a53c17b0f7851498a691c4903d6bec3d845c9f Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 14:23:43 -0700 Subject: [PATCH 03/10] fix(inprocess): drop unserved gRPC surface; tighten serve-error + docs The harness never starts a cosmos gRPC listener (servergrpc.StartGRPCServer is only on the seid start path), so enabling GRPC in app.toml and exposing Node.GRPC() advertised a port nothing binds. Remove the gRPC surface entirely: harness serves TM RPC + EVM (HTTP/WS) only. REST stays an honest "" parity stub. 
Also: move the harness-only app.App accessors (SetEVMServeErr, EVMHTTPServer, EVMWebSocketServer) behind //go:build inprocess in app/app_inprocess.go so production App's public surface stays unchanged; remove the dead wireMesh path (collectGentxs is authoritative for persistent-peers); correct serve-error wording to listener-start (construct-time bind is still fail-fast); state the metrics-off and 0.0.0.0-bind invariants as standing conditions; stripScheme via strings.CutPrefix. Co-Authored-By: Claude Opus 4.8 --- app/app.go | 53 +++++++++++++----------------------- app/app_inprocess.go | 27 +++++++++++++++++++ inprocess/doc.go | 31 ++++++++++++--------- inprocess/handle.go | 19 +++++++------ inprocess/harness.go | 57 ++++++++++++++------------------------- inprocess/harness_test.go | 2 +- 6 files changed, 94 insertions(+), 95 deletions(-) create mode 100644 app/app_inprocess.go diff --git a/app/app.go b/app/app.go index c653b7846c..579094b8a0 100644 --- a/app/app.go +++ b/app/app.go @@ -478,6 +478,21 @@ type App struct { httpServerStartSignalSent bool wsServerStartSignalSent bool - // evmHTTPServer/evmWSServer retain the EVM JSON-RPC HTTP and WebSocket - // listeners constructed in RegisterLocalServices so an embedding orchestrator - // (the in-process harness) can Stop() them at teardown. Nil for a node with - // the respective listener disabled. Production seid never reads these — its - // process exit reaps the listeners — but discarding them leaked the only Stop - // handle, which an in-process host running N apps in one process needs. + // evmHTTPServer/evmWSServer hold the EVM JSON-RPC HTTP and WebSocket listeners + // constructed in RegisterLocalServices so an embedding orchestrator (the + // in-process harness) can Stop() them at teardown. Nil when the respective + // listener is disabled. Production seid does not read these; its process exit + // reaps the listeners.
evmHTTPServer evmrpc.EVMServer evmWSServer evmrpc.EVMServer - // evmServeErr, when non-nil, diverts an EVM listener Start() failure to the - // channel instead of panicking. Production leaves it nil and keeps the - // fail-loud panic (a bind failure must crash a real node). The harness sets it - // via SetEVMServeErr before the first block so a single node's bind failure is - // a reportable error, not a process-wide panic that kills all N nodes. + // evmServeErr, when non-nil, diverts an EVM listener Start() (listener-start) + // failure to the channel instead of panicking. Production leaves it nil: a + // listener-start failure panics and crashes the node. An in-process host sets + // it via SetEVMServeErr before the first block so one node's listener-start + // failure is a reportable error rather than a process-wide panic killing all N. + // Construct-time bind failures (NewEVM*Server below) are not covered — those + // stay fail-fast. evmServeErr chan<- error txPrioritizer sdk.TxPrioritizer @@ -2779,9 +2780,9 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T } // reportEVMServeErr diverts an EVM listener Start() failure to the registered -// error channel, falling back to the historical panic when no channel is set -// (production seid). The send is non-blocking: the channel is buffered and a -// second listener's failure must not deadlock the goroutine. +// error channel, or panics when no channel is set (production seid). The send is +// non-blocking: the channel is buffered and a second listener's failure must not +// deadlock the goroutine. func (app *App) reportEVMServeErr(err error) { if app.evmServeErr == nil { panic(err) @@ -2793,10 +2794,10 @@ func (app *App) reportEVMServeErr(err error) { } // recoverEVMServe is the deferred guard on the EVM listener goroutines. 
A panic -// inside Start() (beyond the bind error it returns cleanly) is converted to a -// reported error when a channel is registered, so one node's listener panic does -// not crash an in-process host running N nodes. With no channel (production -// seid) it re-panics, preserving the historical fail-loud behavior exactly. +// inside Start() (beyond the error it returns cleanly) is converted to a reported +// error when a channel is registered, so one node's listener panic does not crash +// an in-process host running N nodes. With no channel (production seid) it +// re-panics, keeping the fail-loud contract. func (app *App) recoverEVMServe() { if r := recover(); r != nil { if app.evmServeErr == nil { @@ -2810,22 +2811,6 @@ func (app *App) recoverEVMServe() { } } -// SetEVMServeErr registers the channel that EVM listener Start() failures are -// sent to, replacing the default fail-loud panic. An in-process host that runs -// multiple apps in one process calls this before the first block so one node's -// bind failure is a reportable error rather than a process-wide panic. The -// channel should be buffered (>= 2: one HTTP + one WS listener). -func (app *App) SetEVMServeErr(ch chan<- error) { app.evmServeErr = ch } - -// EVMHTTPServer returns the EVM JSON-RPC HTTP listener constructed in -// RegisterLocalServices, or nil if HTTP serving is disabled. An embedding -// orchestrator calls Stop() on it at teardown. -func (app *App) EVMHTTPServer() evmrpc.EVMServer { return app.evmHTTPServer } - -// EVMWebSocketServer returns the EVM JSON-RPC WebSocket listener, or nil if WS -// serving is disabled. 
-func (app *App) EVMWebSocketServer() evmrpc.EVMServer { return app.evmWSServer } - // RegisterSwaggerAPI registers swagger route with API Server func RegisterSwaggerAPI(rtr *mux.Router) { statikFS, err := fs.NewWithNamespace("swagger") diff --git a/app/app_inprocess.go b/app/app_inprocess.go new file mode 100644 index 0000000000..5023eb4261 --- /dev/null +++ b/app/app_inprocess.go @@ -0,0 +1,27 @@ +//go:build inprocess + +package app + +import "github.com/sei-protocol/sei-chain/evmrpc" + +// This file holds the harness-only accessors for App's EVM serve plumbing. They +// are gated behind the `inprocess` build tag so production App's public surface +// does not widen — only the in-process harness (which builds with that tag) sees +// them. The backing fields and the reportEVMServeErr/recoverEVMServe helpers stay +// in untagged app.go because the production serve goroutines use them. + +// SetEVMServeErr registers the channel that EVM listener Start() (listener-start) +// failures are sent to, replacing the default fail-loud panic. An in-process host +// that runs multiple apps in one process calls this before the first block so one +// node's listener-start failure is a reportable error rather than a process-wide +// panic. The channel should be buffered (>= 2: one HTTP + one WS listener). +func (app *App) SetEVMServeErr(ch chan<- error) { app.evmServeErr = ch } + +// EVMHTTPServer returns the EVM JSON-RPC HTTP listener constructed in +// RegisterLocalServices, or nil if HTTP serving is disabled. An embedding +// orchestrator calls Stop() on it at teardown. +func (app *App) EVMHTTPServer() evmrpc.EVMServer { return app.evmHTTPServer } + +// EVMWebSocketServer returns the EVM JSON-RPC WebSocket listener, or nil if WS +// serving is disabled. 
+func (app *App) EVMWebSocketServer() evmrpc.EVMServer { return app.evmWSServer } diff --git a/inprocess/doc.go b/inprocess/doc.go index 6807b43087..261ce28165 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -1,8 +1,8 @@ //go:build inprocess // Package inprocess stands up N sei-chain validators in a single Go process, -// reaching real CometBFT consensus and each serving its own full RPC stack -// (Tendermint RPC + EVM JSON-RPC HTTP/WS + gRPC), with deterministic teardown. +// reaching real CometBFT consensus and each serving its own RPC stack +// (Tendermint RPC + EVM JSON-RPC HTTP/WS), with deterministic teardown. // // It is the in-process provisioning foundation for the SDK "local" provider // (design: bdchatham-designs/designs/test-harness/sdk-local-provider-lld.md). @@ -37,17 +37,22 @@ // InitChain response. testutil/network sets it to []{self}, which fails // consensus replay for N>1. // 2. Full P2P mesh — persistent-peers wired nodeID@127.0.0.1:p2pPort across all -// N (testutil/network wires zero). -// 3. Injected AppOptions enable EVM HTTP/WS on per-node ports (app.TestAppOpts -// hard-disables them). -// 4. tmCfg.Instrumentation.Prometheus = false — avoids the dup-registry panic; -// with metrics off no evmrpc/EVM-keeper de-globalization is needed. -// 5. TM RPC / gRPC / P2P listeners scoped to 127.0.0.1 (they default to [::] / -// 0.0.0.0). The EVM HTTP/WS listeners bind 0.0.0.0 — evmrpc hardcodes the -// bind host (server.LocalAddress) with no config override — but on a free -// ephemeral port, dialed via 127.0.0.1. They are not loopback-scoped; a -// bind-host option in evmrpc would be the only way to tighten that. +// N (testutil/network wires zero) — without the mesh nodes never gossip and +// consensus never forms for N>1. +// 3. Injected AppOptions enable EVM HTTP/WS on per-node ports — without them +// app.TestAppOpts hard-disables the listeners and no node serves EVM. +// 4. 
tmCfg.Instrumentation.Prometheus = false — metrics off avoids the +// dup-registry panic from the process-wide registries. Invariant: metrics +// must stay off until the evmrpc/EVM-keeper metrics are de-globalized — +// re-enabling Prometheus without that reintroduces the panic. +// 5. TM RPC / P2P listeners scoped to 127.0.0.1 (they default to [::] / +// 0.0.0.0) — without scoping an in-process harness publishes externally +// reachable listeners. Caveat (accepted): the EVM HTTP/WS listeners bind all +// interfaces (0.0.0.0) for the harness lifetime; only TM RPC/P2P are +// loopback-scoped. They run on free ephemeral ports, dialed via 127.0.0.1. +// Tightening requires a bind-host option in evmrpc (not yet present). // 6. MaxIncomingConnectionAttempts raised — loopback collapses all peers onto // 127.0.0.1, so the router's IP-keyed conn-tracker counts the startup burst -// on one key. +// on one key — without the raise the burst trips the per-IP cap and peers +// are rejected. package inprocess diff --git a/inprocess/handle.go b/inprocess/handle.go index cc54f4b2a9..49f326a83e 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -7,6 +7,7 @@ import ( "fmt" "net/http" "os" + "strings" "time" "github.com/sei-protocol/sei-chain/evmrpc" @@ -43,14 +44,11 @@ func (h Node) EVMRPC() string { return fmt.Sprintf("http://127.0.0.1:%d", h.n.ht // NodeHandle surface, but the in-process harness binds it, so it is exposed. func (h Node) EVMWS() string { return fmt.Sprintf("ws://127.0.0.1:%d", h.n.wsPort) } -// REST is "" — the harness does not enable the Cosmos LCD listener (validators -// serve none by default; present for SDK handle parity). +// REST is "" — the harness does not start the Cosmos LCD listener (reserved: +// REST is part of the SDK NodeHandle shape, so it is present as an honest parity +// stub; validators serve none by default). func (h Node) REST() string { return "" } -// GRPC is the node's Cosmos gRPC address (host:port). 
Not in the SDK NodeHandle -// surface (gRPC is not a published status endpoint); exposed for in-process dials. -func (h Node) GRPC() string { return h.n.grpcAddr } - // Object returns the live *node.Node behind the handle (SDK escape hatch: the // dynamic value behind any). Read-oriented — driving it is an in-process-only // capability k8s mode never offers. @@ -154,10 +152,11 @@ func stopEVMServer(s evmrpc.EVMServer) { // stripScheme drops a leading scheme:// from a listen address so it can be // recomposed with a concrete scheme (TM RPC config carries tcp://). func stripScheme(addr string) string { - for _, p := range []string{"tcp://", "http://"} { - if len(addr) >= len(p) && addr[:len(p)] == p { - return addr[len(p):] - } + if rest, ok := strings.CutPrefix(addr, "tcp://"); ok { + return rest + } + if rest, ok := strings.CutPrefix(addr, "http://"); ok { + return rest } return addr } diff --git a/inprocess/harness.go b/inprocess/harness.go index b130cce6f4..6893fef371 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -10,7 +10,6 @@ import ( "os" "path/filepath" "strconv" - "strings" "time" dbm "github.com/tendermint/tm-db" @@ -95,7 +94,6 @@ type node struct { p2pHost string p2pPort string rpcAddr string // tcp://127.0.0.1:PORT (TM RPC listen address) - grpcAddr string // 127.0.0.1:PORT httpPort int // EVM JSON-RPC HTTP wsPort int // EVM JSON-RPC WS @@ -152,7 +150,6 @@ func Start(ctx context.Context, opts Options) (_ *Network, retErr error) { if err := net.provisionNodes(enc, gb); err != nil { return nil, err } - wireMesh(net.nodes) baseState := app.ModuleBasics.DefaultGenesis(enc.Marshaler) genFiles := make([]string, len(net.nodes)) @@ -218,7 +215,7 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { return err } - writeAppConfig(filepath.Join(nodeDir, "config/app.toml"), addrs.grpcAddr, net.opts) + writeAppConfig(filepath.Join(nodeDir, "config/app.toml")) clientCx := client.Context{}. 
WithKeyringDir(clientDir).WithKeyring(kb).WithHomeDir(tmCfg.RootDir). @@ -230,7 +227,7 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { moniker: moniker, nodeID: nodeID, pubKey: pubKey, addr: addr, home: nodeDir, tmCfg: tmCfg, clientCx: clientCx, p2pHost: addrs.p2pHost, p2pPort: addrs.p2pPort, - rpcAddr: addrs.rpcAddr, grpcAddr: addrs.grpcAddr, + rpcAddr: addrs.rpcAddr, httpPort: addrs.httpPort, wsPort: addrs.wsPort, serveErr: make(chan error, 2), // one HTTP + one WS listener }) @@ -248,6 +245,9 @@ func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error n.app = theApp // recipe #1: zero the validator set so CometBFT derives it from InitChain. + // genesis.go writes Validators=nil at genesis-build time; this re-asserts the + // invariant against the file round-trip here (collectGentxs rewrites the + // genesis via ExportGenesisFileWithTime, so re-read it defensively). genDoc, err := tmtypes.GenesisDocFromFile(n.tmCfg.GenesisFile()) if err != nil { return err @@ -273,8 +273,10 @@ func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error } n.rpc = lc n.clientCx = n.clientCx.WithClient(lc) - // RegisterLocalServices builds the EVM HTTP/WS listeners (their goroutines - // block on the first-block start signal) and the gRPC tx service. + // RegisterLocalServices builds the EVM HTTP/WS listeners; their goroutines + // block on the first-block start signal. (It also registers query/tx services + // on the in-process gRPC query router, but the harness starts no standalone + // cosmos gRPC listener — TM RPC + EVM are the served surface.) 
theApp.RegisterLocalServices(lc, n.clientCx.TxConfig) return nil } @@ -296,14 +298,13 @@ func resolveBaseDir(dir string) (string, bool, error) { type nodeAddrs struct { p2pHost, p2pPort string rpcAddr string - grpcAddr string httpPort, wsPort int } // buildNodeConfig builds an isolated per-node tendermint config with loopback TM -// RPC / gRPC / P2P listeners and the conn-tracker ceiling raised (recipes #4, #5, -// #6). EVM bind-host is not config-scopable (evmrpc hardcodes 0.0.0.0); the EVM -// ports are allocated free here and dialed via loopback. +// RPC / P2P listeners and the conn-tracker ceiling raised (recipes #4, #5, #6). +// EVM bind-host is not config-scopable (evmrpc hardcodes 0.0.0.0); the EVM ports +// are allocated free here and dialed via loopback. func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*config.Config, nodeAddrs, error) { sctx := server.NewDefaultContext() tmCfg := sctx.Config @@ -318,7 +319,9 @@ func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*con tmCfg.P2P.MaxIncomingConnectionAttempts = 10000 tmCfg.P2P.AllowDuplicateIP = true // recipe #4: metrics-off avoids the prometheus.DefaultRegisterer dup panic - // and lets the evmrpc/EVM-keeper metrics globals commingle harmlessly. + // from the process-wide registries. Invariant: this must stay off until the + // evmrpc/EVM-keeper metrics are de-globalized — re-enabling Prometheus + // without that reintroduces the panic. 
tmCfg.Instrumentation.Prometheus = false // recipe #5: server.FreeTCPAddr composes tcp://0.0.0.0:PORT — a publicly-bound @@ -332,12 +335,6 @@ func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*con a.rpcAddr = fmt.Sprintf("tcp://127.0.0.1:%d", rpcPort) tmCfg.RPC.ListenAddress = a.rpcAddr - grpcPort, err := freePort() - if err != nil { - return nil, a, err - } - a.grpcAddr = fmt.Sprintf("127.0.0.1:%d", grpcPort) - p2pPort, err := freePort() if err != nil { return nil, a, err @@ -355,21 +352,6 @@ func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*con return tmCfg, a, nil } -// wireMesh wires a full persistent-peer mesh: every node lists all others as -// nodeID@127.0.0.1:p2pPort (recipe #2 — testutil/network wires zero peers). -func wireMesh(nodes []*node) { - for i, n := range nodes { - var peers []string - for j, peer := range nodes { - if j == i { - continue - } - peers = append(peers, fmt.Sprintf("%s@127.0.0.1:%s", peer.nodeID, peer.p2pPort)) - } - n.tmCfg.P2P.PersistentPeers = strings.Join(peers, ",") - } -} - // newNodeApp builds a real sei-chain app for one node with EVM serving on its // per-node ports against an in-memory DB and on-disk home. func newNodeApp(n *node, enc encoding) *app.App { @@ -390,12 +372,13 @@ func newNodeApp(n *node, enc encoding) *app.App { ) } -// writeAppConfig writes a minimal per-node app.toml enabling gRPC on grpcAddr. -func writeAppConfig(path, grpcAddr string, opts Options) { +// writeAppConfig writes a minimal per-node app.toml. The harness serves TM RPC + +// EVM (HTTP/WS) only; the cosmos gRPC server stays off (nothing in the harness +// path calls servergrpc.StartGRPCServer, so enabling it would advertise a port +// no listener binds). 
+func writeAppConfig(path string) { appCfg := srvconfig.DefaultConfig() appCfg.Telemetry.Enabled = false - appCfg.GRPC.Enable = true - appCfg.GRPC.Address = grpcAddr srvconfig.WriteConfigFile(path, appCfg) } diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index cf60edddc5..1c022516b5 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -47,7 +47,7 @@ func TestInProcessNetwork(t *testing.T) { } for i := 0; i < n; i++ { nd := net.Node(i) - t.Logf("node %s: tm=%s evm=%s ws=%s grpc=%s", nd.Name(), nd.TendermintRPC(), nd.EVMRPC(), nd.EVMWS(), nd.GRPC()) + t.Logf("node %s: tm=%s evm=%s ws=%s", nd.Name(), nd.TendermintRPC(), nd.EVMRPC(), nd.EVMWS()) } // No EVM listener reported a bind failure. From a1ae06c286060c40b594eff100609d57d88972ed Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 14:33:20 -0700 Subject: [PATCH 04/10] inprocess: disable cosmos gRPC in written app.toml; scope listener docs Explicitly set GRPC.Enable/GRPCWeb.Enable=false so app.toml matches the "gRPC stays off" comment and can't collide on the fixed default port if the standard start path is ever wired. Scope doc.go recipe #5's bare "listeners" to consensus/RPC, and note on EVMRPC/EVMWS that the URL dials loopback while the listener binds 0.0.0.0. Co-Authored-By: Claude Opus 4.8 --- inprocess/doc.go | 2 +- inprocess/handle.go | 5 ++++- inprocess/harness.go | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/inprocess/doc.go b/inprocess/doc.go index 261ce28165..90d2acef69 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -47,7 +47,7 @@ // re-enabling Prometheus without that reintroduces the panic. // 5. TM RPC / P2P listeners scoped to 127.0.0.1 (they default to [::] / // 0.0.0.0) — without scoping an in-process harness publishes externally -// reachable listeners. Caveat (accepted): the EVM HTTP/WS listeners bind all +// reachable consensus/RPC listeners. 
Caveat (accepted): the EVM HTTP/WS listeners bind all // interfaces (0.0.0.0) for the harness lifetime; only TM RPC/P2P are // loopback-scoped. They run on free ephemeral ports, dialed via 127.0.0.1. // Tightening requires a bind-host option in evmrpc (not yet present). diff --git a/inprocess/handle.go b/inprocess/handle.go index 49f326a83e..20820febea 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -37,11 +37,14 @@ func (h Node) Namespace() string { return "" } // TendermintRPC is the node's CometBFT RPC base URL (http://127.0.0.1:PORT). func (h Node) TendermintRPC() string { return "http://" + stripScheme(h.n.rpcAddr) } -// EVMRPC is the node's EVM JSON-RPC HTTP URL. +// EVMRPC is the node's EVM JSON-RPC HTTP URL. The URL dials loopback, but the +// listener itself binds 0.0.0.0 (see doc.go recipe #5's accepted caveat). func (h Node) EVMRPC() string { return fmt.Sprintf("http://127.0.0.1:%d", h.n.httpPort) } // EVMWS is the node's EVM JSON-RPC WebSocket URL. Not part of the SDK // NodeHandle surface, but the in-process harness binds it, so it is exposed. +// The URL dials loopback, but the listener itself binds 0.0.0.0 (see doc.go +// recipe #5's accepted caveat). func (h Node) EVMWS() string { return fmt.Sprintf("ws://127.0.0.1:%d", h.n.wsPort) } // REST is "" — the harness does not start the Cosmos LCD listener (reserved: diff --git a/inprocess/harness.go b/inprocess/harness.go index 6893fef371..328b7ed7fc 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -378,6 +378,10 @@ func newNodeApp(n *node, enc encoding) *app.App { // no listener binds). func writeAppConfig(path string) { appCfg := srvconfig.DefaultConfig() + // No gRPC listener is started; keep the written config consistent with that + // and avoid an N>1 fixed-port collision if the standard start path is ever wired. 
+ appCfg.GRPC.Enable = false + appCfg.GRPCWeb.Enable = false appCfg.Telemetry.Enabled = false srvconfig.WriteConfigFile(path, appCfg) } From 3a6be997aefcb83753ceb8eb441a5f784b2d9724 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 15:10:46 -0700 Subject: [PATCH 05/10] feat(inprocess): run the YAML bank suite in-process via a pluggable runner execer (C2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the integration_test/runner to drive a real bank query/tx suite against the C1 inprocess.Network — no docker. Runner seam: extract execCmd into an `execer` interface. The docker-exec arm stays the zero-value default (existing yaml_integration runs unaffected). A new build-tagged in-process arm (runner_inprocess.go, tag `inprocess`) runs each command on the host against a `seid` it builds once, redirected to a node via a PATH shim that prepends `--home "$SEID_HOME"` — so opaque sourced helpers that call bare `seid` land on the right node without rewriting the commands. Harness bridge: keyring moves into the node home (so host `seid --home` resolves it), each home gets a client.toml pinning test keyring + chain-id + that node's loopback RPC, and Options.ExtraKeys genesis-funds non-validator signing keys (admin on node 0) mirroring the docker localnode topology the suites sign as. bank_module/send_funds_test.yaml is GREEN in-memory (N=3, the min topology that leaves block-sync and forms consensus): a real admin->bank-test send plus historical balance queries at distinct heights, all four verifiers passing. go test -tags inprocess -run TestInProcessBankModule ./integration_test/runner/ Out of scope (process/binary boundary): upgrade + statesync suites. 
Co-Authored-By: Claude Opus 4.8 --- inprocess/genesis.go | 20 +++ inprocess/handle.go | 12 ++ inprocess/harness.go | 85 ++++++++- integration_test/runner/runner.go | 43 ++++- integration_test/runner/runner_inprocess.go | 167 ++++++++++++++++++ .../runner/runner_inprocess_test.go | 75 ++++++++ 6 files changed, 392 insertions(+), 10 deletions(-) create mode 100644 integration_test/runner/runner_inprocess.go create mode 100644 integration_test/runner/runner_inprocess_test.go diff --git a/inprocess/genesis.go b/inprocess/genesis.go index 4b18ea8825..9359db228a 100644 --- a/inprocess/genesis.go +++ b/inprocess/genesis.go @@ -104,6 +104,26 @@ func (b *genesisBuilder) fundValidator( return addr, nil } +// fundAccount stores a non-validator key in kb and funds its genesis account + +// balance. Unlike fundValidator it writes no gentx (the account never stakes) — +// it is the genesis-funded signing account a suite spends from (e.g. `admin`). +func (b *genesisBuilder) fundAccount( + kb keyring.Keyring, + name string, + algo keyring.SignatureAlgo, + coins sdk.Coins, +) error { + addr, _, err := testutil.GenerateSaveCoinKey(kb, name, "", true, algo) + if err != nil { + return fmt.Errorf("generate key for %s: %w", name, err) + } + b.accounts = append(b.accounts, authtypes.NewBaseAccount(addr, nil, 0, 0)) + if !coins.Empty() { + b.balances = append(b.balances, banktypes.Balance{Address: addr.String(), Coins: coins.Sort()}) + } + return nil +} + // writeBaseGenesis writes a base genesis file (accounts + balances, empty // validator set) to every validator's genesis path. Mirrors initGenFiles. func (b *genesisBuilder) writeBaseGenesis(baseState map[string]json.RawMessage, genFiles []string) error { diff --git a/inprocess/handle.go b/inprocess/handle.go index 20820febea..59cfe514ad 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -34,6 +34,18 @@ func (h Node) Name() string { return h.n.moniker } // handle parity. 
func (h Node) Namespace() string { return "" } +// Home is the node's on-disk home dir (the seid --home target). It holds the +// node's config/, data/, and the `test` keyring this node's genesis keys were +// written into — what the YAML runner's in-process arm points a host `seid` at. +// Not part of the SDK NodeHandle surface (a home dir is in-process-only); exposed +// because the host-binary runner arm needs it. +func (h Node) Home() string { return h.n.home } + +// RPCNodeAddr is the node's CometBFT RPC dial address in tcp:// form +// (tcp://127.0.0.1:PORT) — the value a host `seid --node` flag wants, distinct +// from TendermintRPC's http:// form used by the readiness probes. +func (h Node) RPCNodeAddr() string { return h.n.rpcAddr } + // TendermintRPC is the node's CometBFT RPC base URL (http://127.0.0.1:PORT). func (h Node) TendermintRPC() string { return "http://" + stripScheme(h.n.rpcAddr) } diff --git a/inprocess/harness.go b/inprocess/harness.go index 328b7ed7fc..c94a295296 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -67,6 +67,27 @@ type Options struct { // TimeoutCommit is the consensus commit timeout; 0 defaults to 2s. The // dominant cadence lever — lower it (e.g. 500ms) for faster tests. TimeoutCommit time.Duration + + // ExtraKeys are non-validator genesis accounts to create + fund. Each key is + // written into its target node's home `test` keyring (so a host `seid --home + // <home> --keyring-backend test` resolves it) and funded at genesis. This is + // the bridge the YAML runner's in-process arm needs: the bank suite signs as + // `admin` (node 0) and the docker topology also seeds `node_admin` per node. + ExtraKeys []ExtraKey +} + +// ExtraKey is a non-validator genesis account the harness creates and funds. It +// mirrors the docker localnode topology where `admin` lives on node 0 only and +// `node_admin` exists per node, so suites that sign as those names run unchanged +// against the in-process arm.
+type ExtraKey struct { + // Name is the keyring key name (e.g. "admin", "node_admin"). + Name string + // Node is the 0-based validator index whose home keyring receives the key. + Node int + // Coins is the genesis balance for the key's account. Empty funds nothing + // (the account still exists), which is rarely what a signing key wants. + Coins sdk.Coins } func (o Options) withDefaults() Options { @@ -150,6 +171,9 @@ func Start(ctx context.Context, opts Options) (_ *Network, retErr error) { if err := net.provisionNodes(enc, gb); err != nil { return nil, err } + if err := net.provisionExtraKeys(gb); err != nil { + return nil, err + } baseState := app.ModuleBasics.DefaultGenesis(enc.Marshaler) genFiles := make([]string, len(net.nodes)) @@ -178,13 +202,14 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { for i := 0; i < net.opts.Validators; i++ { moniker := fmt.Sprintf("node%d", i) nodeDir := filepath.Join(net.baseDir, moniker, "simd") - clientDir := filepath.Join(net.baseDir, moniker, "simcli") + // The keyring lives in the node home (not a separate simcli dir) so a host + // `seid --home <home> --keyring-backend test` — how the YAML runner's + // in-process arm targets a node — resolves the same keys this harness wrote + // (keyring dir falls back to --home; see client/cmd.go). + clientDir := nodeDir if err := os.MkdirAll(filepath.Join(nodeDir, "config"), 0o750); err != nil { return err } - if err := os.MkdirAll(clientDir, 0o750); err != nil { - return err - } tmCfg, addrs, err := buildNodeConfig(nodeDir, moniker, net.opts.TimeoutCommit) if err != nil { @@ -216,6 +241,12 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { } writeAppConfig(filepath.Join(nodeDir, "config/app.toml")) + // Seed a client.toml so a bare host `seid --home <home>` (no per-command + // flags) already targets this node: test keyring, the harness chain-id, and + //
The in-process runner arm still injects the + // same values as flags defensively, but pinning them here keeps opaque + // sourced helper scripts (which call bare `seid`) on the right node. + writeClientConfig(filepath.Join(nodeDir, "config/client.toml"), net.opts.ChainID, addrs.rpcAddr) clientCx := client.Context{}. WithKeyringDir(clientDir).WithKeyring(kb).WithHomeDir(tmCfg.RootDir). @@ -235,6 +266,31 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { return nil } +// provisionExtraKeys creates each Options.ExtraKey in its target node's home +// `test` keyring and funds its genesis account. It runs after provisionNodes (so +// every node's keyring exists) and before genesis assembly (so the balances fold +// into the base genesis). This is the keyring/home bridge the YAML runner's +// in-process arm relies on — `admin` on node 0, `node_admin` per node — matching +// the docker localnode topology so bank suites sign unchanged. +func (net *Network) provisionExtraKeys(gb *genesisBuilder) error { + algoStr := string(hdSecp256k1()) + for _, ek := range net.opts.ExtraKeys { + if ek.Node < 0 || ek.Node >= len(net.nodes) { + return fmt.Errorf("extra key %q targets node %d, out of range [0,%d)", ek.Name, ek.Node, len(net.nodes)) + } + kb := net.nodes[ek.Node].clientCx.Keyring + algos, _ := kb.SupportedAlgorithms() + algo, err := keyring.NewSigningAlgoFromString(algoStr, algos) + if err != nil { + return err + } + if err := gb.fundAccount(kb, ek.Name, algo, ek.Coins); err != nil { + return fmt.Errorf("provision extra key %q on node%d: %w", ek.Name, ek.Node, err) + } + } + return nil +} + // startNode builds the app, constructs + starts the tendermint node with an // EMPTY-valset genesis (recipe #1), wires the local RPC client, and registers // the EVM listeners. 
The node's EVM Start() failures land on n.serveErr instead @@ -386,6 +442,27 @@ func writeAppConfig(path string) { srvconfig.WriteConfigFile(path, appCfg) } +// clientConfigTemplate matches sei-cosmos client/config's client.toml schema. It +// is reproduced here (not imported) because that package's writer + config +// struct are unexported — the same reason genesis.go reimplements the network +// package's unexported helpers rather than forcing a cosmos source change. +const clientConfigTemplate = `chain-id = "%s" +keyring-backend = "test" +output = "json" +node = "%s" +broadcast-mode = "sync" +` + +// writeClientConfig writes a client.toml pinning the test keyring, chain-id, and +// this node's loopback TM RPC so a bare host `seid --home <home>` already +// targets the node without per-command flags (client/config.ReadFromClientConfig +// reads <home>/config/client.toml). broadcast-mode stays sync — the suites +// broadcast with -b sync and poll on-chain side effects. Best-effort: a failed +// write leaves the in-process arm's explicit per-command flags as the fallback. +func writeClientConfig(path, chainID, rpcAddr string) { + _ = os.WriteFile(path, []byte(fmt.Sprintf(clientConfigTemplate, chainID, rpcAddr)), 0o600) +} + // freePort allocates a free loopback TCP port via server.FreeTCPAddr. func freePort() (int, error) { _, portStr, err := server.FreeTCPAddr() diff --git a/integration_test/runner/runner.go b/integration_test/runner/runner.go index 94717d2811..d0570c8d54 100644 --- a/integration_test/runner/runner.go +++ b/integration_test/runner/runner.go @@ -50,6 +50,17 @@ type Verifier struct { Result string `yaml:"result,omitempty"` // env var to match (regex only) } +// execer runs one command step against a target node and returns its trimmed +// stdout. It is the seam between the two backends: the docker arm runs the +// command via `docker exec`, the in-process arm runs it on the host against an +// inprocess.Network.
node is the Input's resolved target (a docker container +// name / a "sei-node-N" moniker); env is the accumulated capture map. A non-zero +// command exit is reported via the returned string (the captured code), not err +// — err is reserved for harness-level failures (mirrors the runner.py contract). +type execer interface { + run(t *testing.T, cmd, node string, env map[string]string, opts Options) (string, error) +} + // Options controls how RunFile executes commands. type Options struct { // DefaultContainer is the docker container used when an Input has no Node set. @@ -59,6 +70,11 @@ type Options struct { // Shell is the shell used to execute commands (e.g. "sh", "bash"). // Resolved via PATH at runtime. Defaults to "sh". Shell string + // exec is the backend. nil selects the docker arm (the default), so existing + // docker runs are unaffected. The in-process arm is installed via + // WithInProcessNetwork (build-tagged `inprocess`); it never enters a normal + // runner build. + exec execer } // Option is a functional option for Options. @@ -79,6 +95,13 @@ func WithShell(shell string) Option { return func(o *Options) { o.Shell = shell } } +// withExecer installs a backend execer. Unexported: callers select the +// in-process arm via WithInProcessNetwork (build-tagged), and the docker arm is +// the zero-value default. 
+func withExecer(e execer) Option { + return func(o *Options) { o.exec = e } +} + func newOptions(opts []Option) Options { var o Options //applying default options @@ -88,6 +111,9 @@ func newOptions(opts []Option) Options { for _, opt := range opts { opt(&o) } + if o.exec == nil { + o.exec = dockerExecer{} + } return o } @@ -111,11 +137,11 @@ func runCase(t *testing.T, tc TestCase, opts Options) { envMap := make(map[string]string) for i, inp := range tc.Inputs { - container := inp.Node - if container == "" { - container = opts.DefaultContainer + node := inp.Node + if node == "" { + node = opts.DefaultContainer } - out, err := execCmd(t, inp.Cmd, container, envMap, opts) + out, err := opts.exec.run(t, inp.Cmd, node, envMap, opts) t.Logf("[%d] $ %s\n => %s", i, inp.Cmd, out) require.NoError(t, err, "input[%d] failed: %v", i, err) if inp.Env != "" { @@ -130,11 +156,16 @@ func runCase(t *testing.T, tc TestCase, opts Options) { } } -// execCmd runs cmd in the given docker container (or locally if container is empty), +// dockerExecer is the default backend: it runs each command via `docker exec` +// in the target container (the existing behavior). It is selected whenever no +// other execer is installed, so docker runs are unaffected by the seam. +type dockerExecer struct{} + +// run runs cmd in the given docker container (or locally if container is empty), // injecting the accumulated envMap. Non-zero exit is logged but not fatal — this // matches runner.py behaviour where commands that echo error codes exit 0 from // bash but the captured output is the code. 
-func execCmd(t *testing.T, cmd, container string, envMap map[string]string, opts Options) (string, error) { +func (dockerExecer) run(t *testing.T, cmd, container string, envMap map[string]string, opts Options) (string, error) { t.Helper() var c *exec.Cmd diff --git a/integration_test/runner/runner_inprocess.go b/integration_test/runner/runner_inprocess.go new file mode 100644 index 0000000000..4d09e428ff --- /dev/null +++ b/integration_test/runner/runner_inprocess.go @@ -0,0 +1,167 @@ +//go:build inprocess + +// This file installs the runner's in-process backend. It is gated behind the +// `inprocess` build tag so the heavy inprocess.Network bring-up (and its +// sei-tendermint/sei-cosmos graph) never enters a normal runner build — the +// docker arm in runner.go stays the only backend without the tag. +package runner + +import ( + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "testing" + + "github.com/sei-protocol/sei-chain/inprocess" +) + +// WithInProcessNetwork selects the in-process backend: commands run on the HOST +// against a real `seid` binary pointed at one of net's in-process nodes, with no +// docker. The Input `node:` field ("sei-node-N", default "sei-node-0") selects +// the node; the command's `seid` invocations are redirected to that node's home +// (and its loopback TM RPC / EVM endpoints) so suites written for the docker +// cluster run unchanged. +// +// The build tag means this option only exists in an `inprocess` build; docker +// runs (built without the tag) cannot reference it, and so cannot regress. +func WithInProcessNetwork(net *inprocess.Network) Option { + return withExecer(newInProcessExecer(net)) +} + +// inProcessExecer runs commands on the host against an inprocess.Network. 
It +// shims `seid` so opaque sourced helper scripts (which call bare `seid` / +// `$seidbin`) land on the right node: the shim prepends `--home "$SEID_HOME"` +// to every real seid call, and the per-node client.toml the harness wrote under +// that home supplies chain-id, the test keyring, and the node's RPC address. +type inProcessExecer struct { + net *inprocess.Network + + once sync.Once + binDir string // dir holding the seid shim + real binary, prepended to PATH + setup error // first-build error, returned to every run after +} + +func newInProcessExecer(net *inprocess.Network) *inProcessExecer { + return &inProcessExecer{net: net} +} + +// run resolves node → harness node, sets the per-node targeting env (SEID_HOME +// for the shim, SEI_EVM_RPC/WS for curl/EVM commands) plus the accumulated +// capture env, and runs the command on the host. Non-zero command exit is +// reported via stdout (the captured code), matching the docker arm + runner.py +// contract; err is reserved for harness-level failures. +func (e *inProcessExecer) run(t *testing.T, cmd, node string, envMap map[string]string, opts Options) (string, error) { + t.Helper() + if err := e.ensureBin(); err != nil { + return "", fmt.Errorf("prepare seid: %w", err) + } + h, err := e.nodeFor(node) + if err != nil { + return "", err + } + + c := exec.Command(opts.Shell, "-c", cmd) //nolint:gosec + // Run from the repo root so the suites' relative `source + // integration_test/utils/_tx_helpers.sh` resolves (docker runs with the repo + // mounted at the container CWD; `go test` runs with CWD = the package dir). + c.Dir = repoRoot() + c.Env = append(os.Environ(), envMapSlice(envMap)...) + c.Env = append(c.Env, + "PATH="+e.binDir+string(os.PathListSeparator)+os.Getenv("PATH"), + "SEID_HOME="+h.Home(), + "SEI_EVM_RPC="+h.EVMRPC(), + "SEI_EVM_WS="+h.EVMWS(), + // Some EVM suites read EVM_RPC; keep parity with SEI_EVM_RPC. 
+ "EVM_RPC="+h.EVMRPC(), + ) + + out, err := c.Output() + stdout := strings.TrimSpace(string(out)) + if err != nil { + var exit *exec.ExitError + if errors.As(err, &exit) { + t.Logf(" (exit %d) stderr: %s", exit.ExitCode(), strings.TrimSpace(string(exit.Stderr))) + return stdout, nil + } + return stdout, err + } + return stdout, nil +} + +// nodeFor maps a "sei-node-N" moniker (the docker container naming the suites +// use) to the harness node at index N. An empty string defaults to node 0, the +// suite default (admin's home). +func (e *inProcessExecer) nodeFor(node string) (inprocess.Node, error) { + idx := 0 + if node != "" { + const prefix = "sei-node-" + s, ok := strings.CutPrefix(node, prefix) + if !ok { + return inprocess.Node{}, fmt.Errorf("in-process arm: node %q is not %sN", node, prefix) + } + n, err := strconv.Atoi(s) + if err != nil { + return inprocess.Node{}, fmt.Errorf("in-process arm: node %q has non-numeric index: %w", node, err) + } + idx = n + } + if idx < 0 || idx >= e.net.Len() { + return inprocess.Node{}, fmt.Errorf("in-process arm: node index %d out of range [0,%d)", idx, e.net.Len()) + } + return e.net.Node(idx), nil +} + +// ensureBin builds the seid binary once and writes a `seid` shim alongside it, +// in a dir prepended to PATH. The shim execs the real binary with `--home +// "$SEID_HOME"` prepended: --home is a global persistent flag every seid +// subcommand accepts, so a single shim redirects bare `seid` calls (inside +// opaque sourced helpers) to the per-command node home without rewriting the +// commands. The build is on the same branch as the harness, so the CLI and the +// in-process app are the same code. +func (e *inProcessExecer) ensureBin() error { + e.once.Do(func() { + dir, err := os.MkdirTemp("", "sei-inprocess-bin-") + if err != nil { + e.setup = err + return + } + e.binDir = dir + + realBin := filepath.Join(dir, "seid.real") + // Build from this branch's source so the CLI matches the in-process app. 
+ build := exec.Command("go", "build", "-tags", "inprocess", "-o", realBin, "./cmd/seid") + build.Dir = repoRoot() + if out, berr := build.CombinedOutput(); berr != nil { + e.setup = fmt.Errorf("go build seid: %w\n%s", berr, out) + return + } + + shim := filepath.Join(dir, "seid") + // --home is global; prepending it is valid for every subcommand. exec + // replaces the shim process so signals/exit codes pass through cleanly. + script := "#!/bin/sh\nexec \"" + realBin + "\" --home \"$SEID_HOME\" \"$@\"\n" + if werr := os.WriteFile(shim, []byte(script), 0o700); werr != nil { //nolint:gosec + e.setup = werr + return + } + }) + return e.setup +} + +// repoRoot returns the sei-chain repo root by climbing two levels from the +// working directory, which `go test` sets to the package dir +// (integration_test/runner), so `go build ./cmd/seid` resolves at the module root. +func repoRoot() string { + // runner package lives at <repo>/integration_test/runner; climb two levels. + wd, err := os.Getwd() + if err != nil { + return "." + } + // `go test` runs with CWD = the package dir. + return filepath.Clean(filepath.Join(wd, "..", "..")) +} diff --git a/integration_test/runner/runner_inprocess_test.go b/integration_test/runner/runner_inprocess_test.go new file mode 100644 index 0000000000..a119db0419 --- /dev/null +++ b/integration_test/runner/runner_inprocess_test.go @@ -0,0 +1,75 @@ +//go:build inprocess + +// Package runner_test's in-process arm runs the YAML suites against an +// inprocess.Network (no docker). It is tagged `inprocess` so it never enters a +// normal runner build; the docker-backed runner_test.go (tag `yaml_integration`) +// is unaffected.
+// +// Run the bank send suite in-memory: +// +// go test -tags inprocess -run TestInProcessBankModule -v -timeout 600s ./integration_test/runner/ +package runner_test + +import ( + "context" + "testing" + "time" + + sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" + + "github.com/sei-protocol/sei-chain/inprocess" + "github.com/sei-protocol/sei-chain/integration_test/runner" +) + +// chainID is the chain-id the bank suite signs with (`--chain-id=sei` in the tx +// helpers). The in-process harness must use the same id, and the per-node +// client.toml it writes carries it so bare `seid` calls match. +const chainID = "sei" + +// adminFunding mirrors the docker step2_genesis admin grant +// (1000000000000000000000usei). Large enough to cover the suite's sends + fees +// with room to spare. +func adminFunding() sdk.Coins { + amt, ok := sdk.NewIntFromString("1000000000000000000000") + if !ok { + panic("bad admin funding literal") + } + return sdk.NewCoins(sdk.NewCoin("usei", amt)) +} + +// TestInProcessBankModule is the C2 end-to-end proof: it stands up an in-process +// network with a genesis-funded `admin` on node 0 (the suite's signing key) and +// runs bank_module/send_funds_test.yaml through the runner's in-process arm — a +// real bank tx + historical balance queries, in-memory, no docker. +func TestInProcessBankModule(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) + defer cancel() + + net, err := inprocess.Start(ctx, inprocess.Options{ + // Three validators: send_funds asserts only node-0 state, but fewer than + // three in-process validators stay in block-sync and never enter consensus + // (the block-sync→consensus transition needs a >2/3 voting-power quorum of + // reachable peers, which N<3 can't form). N=3 is the minimum live topology. + // admin lives on node 0 (the suite default); nodes 1-2 provide the quorum. 
+ Validators: 3, + ChainID: chainID, + TimeoutCommit: time.Second, + ExtraKeys: []inprocess.ExtraKey{ + // admin lives on node 0 only and is genesis-funded — the docker + // localnode topology the suite signs against. + {Name: "admin", Node: 0, Coins: adminFunding()}, + }, + }) + if err != nil { + t.Fatalf("inprocess.Start: %v", err) + } + defer net.Close() + + if err := net.WaitReady(ctx); err != nil { + t.Fatalf("WaitReady: %v", err) + } + + runner.RunFile(t, "../bank_module/send_funds_test.yaml", + runner.WithInProcessNetwork(net), + ) +} From 8403b1e6ca1c94056128cef1f93044c0ed6a3110 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 15:36:13 -0700 Subject: [PATCH 06/10] fix(inprocess): correct N-floor mechanism; reject N=2; harden RPC targeting + cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The N-floor was documented as a >2/3 voting-power quorum (N<3 stay in block-sync). That is wrong. The real constraint is CometBFT's block-sync handoff, verified against sei-tendermint and empirically: - N=1 produces blocks as solo proposer IF onlyValidatorIsUs fires — which needs state.Validators.Size()==1 at the blockSync decision. Recipe #1's empty genesis valset leaves size 0 there (decision precedes InitChain), so the solo node fell into block-sync and hung at height 1. Fixed by pinning the single validator into genesis for N=1. - N=2 deadlocks: each node has exactly 1 peer and BlockPool.IsCaughtUp requires >1. Start now rejects N=2 loudly instead of hanging. - N>=3 works (>=2 peers each). Bank suite stays at N=3. Corrected the false call-site comment + Options doc; added a doc.go recipe entry. Guard test now asserts N=2 rejected. Hardening: - F2: shim injects --node (client subcommands only; --node is not root-persistent, so keys/* would break) so RPC targeting is explicit, not client.toml-only. writeClientConfig returns its error (keyring-backend=test resolves only from it). 
Fixed the stale "injects same values defensively" comment. - F5: t.Cleanup removes the temp build dir holding seid.real + shim. - repoRoot surfaces the Getwd error instead of degrading to ".". Co-Authored-By: Claude Opus 4.8 --- inprocess/doc.go | 20 +++++ inprocess/harness.go | 82 ++++++++++++++----- inprocess/harness_test.go | 11 ++- integration_test/runner/runner_inprocess.go | 72 +++++++++++----- .../runner/runner_inprocess_test.go | 15 ++-- 5 files changed, 150 insertions(+), 50 deletions(-) diff --git a/inprocess/doc.go b/inprocess/doc.go index 90d2acef69..7bc834da0d 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -55,4 +55,24 @@ // 127.0.0.1, so the router's IP-keyed conn-tracker counts the startup burst // on one key — without the raise the burst trips the per-IP cap and peers // are rejected. +// +// # Validator count: 1 or >= 3 (2 is the trap) +// +// When wiring a suite, pick Validators = 1 or Validators >= 3. Start rejects 2. +// The constraint is CometBFT's block-sync→consensus handoff, NOT a voting-power +// quorum: +// +// - N=1 works. A sole validator skips block-sync and proposes blocks solo +// (sei-tendermint onlyValidatorIsUs, node/setup.go, gating +// `blockSync := !onlyValidatorIsUs` in node/node.go). That decision reads the +// genesis-derived valset BEFORE InitChain, so the harness pins the single +// validator into genesis for N=1 (recipe #1's empty valset would leave size 0, +// defeat onlyValidatorIsUs, and the solo node would hang in block-sync — see +// startNode). +// - N=2 hangs. Each node has exactly one peer, and BlockPool.IsCaughtUp +// (internal/blocksync/pool.go) hard-requires len(peers) > 1 to ever report +// caught-up, so neither node leaves block-sync. This is a peer-count deadlock, +// not a stake threshold. Start rejects N=2 loudly rather than let it hang. +// - N>=3 works. Every node has >= 2 peers, so IsCaughtUp can fire and hand off +// to consensus. N=3 is the smallest real multi-node topology. 
package inprocess diff --git a/inprocess/harness.go b/inprocess/harness.go index c94a295296..78868072c0 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -17,6 +17,7 @@ import ( "github.com/sei-protocol/sei-chain/app" "github.com/sei-protocol/sei-chain/sei-cosmos/client" + cryptocodec "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/codec" "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/keyring" cryptotypes "github.com/sei-protocol/sei-chain/sei-cosmos/crypto/types" "github.com/sei-protocol/sei-chain/sei-cosmos/server" @@ -48,10 +49,14 @@ func freshChainID() string { return fmt.Sprintf("%s-%x", chainIDPrefix, b[:]) } -// Options configures a Start. The zero value is invalid (Validators must be -// >= 1); use sensible explicit values. +// Options configures a Start. The zero value is invalid (Validators must be 1 +// or >= 3; 2 is rejected — see the Validators doc); use explicit values. type Options struct { - // Validators is the number of in-process validators (>= 1). Each is a full + // Validators is the number of in-process validators. Valid: 1 or >= 3. 2 is + // REJECTED — two validators each have exactly one peer, and CometBFT's + // BlockPool.IsCaughtUp requires >1 peer, so an N=2 mesh deadlocks in + // block-sync. N=1 runs as a solo proposer (onlyValidatorIsUs skips + // block-sync); N>=3 gives every node >=2 peers. Each validator is a full // (app, node.New) pair serving its own RPC stack. 
Validators int @@ -144,7 +149,15 @@ type Network struct { func Start(ctx context.Context, opts Options) (_ *Network, retErr error) { opts = opts.withDefaults() if opts.Validators < 1 { - return nil, fmt.Errorf("inprocess: Options.Validators must be >= 1, got %d", opts.Validators) + return nil, fmt.Errorf("inprocess: Options.Validators must be 1 or >= 3, got %d", opts.Validators) + } + // N=2 deadlocks in CometBFT block-sync: each node has exactly 1 peer, and + // BlockPool.IsCaughtUp (sei-tendermint internal/blocksync/pool.go) hard-requires + // >1 peer to ever report caught-up, so neither node leaves block-sync. Reject it + // loudly rather than hang. N=1 (solo proposer via onlyValidatorIsUs) and N>=3 + // (>=2 peers each) both work — see startNode and doc.go. + if opts.Validators == 2 { + return nil, fmt.Errorf("inprocess: Options.Validators == 2 deadlocks in CometBFT block-sync (BlockPool.IsCaughtUp requires >1 peer); use 1 or >= 3") } baseDir, ownBaseDir, err := resolveBaseDir(opts.BaseDir) @@ -243,10 +256,14 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { writeAppConfig(filepath.Join(nodeDir, "config/app.toml")) // Seed a client.toml so a bare host `seid --home ` (no per-command // flags) already targets this node: test keyring, the harness chain-id, and - // this node's loopback TM RPC. The in-process runner arm still injects the - // same values as flags defensively, but pinning them here keeps opaque - // sourced helper scripts (which call bare `seid`) on the right node. - writeClientConfig(filepath.Join(nodeDir, "config/client.toml"), net.opts.ChainID, addrs.rpcAddr) + // this node's loopback TM RPC. The runner arm's shim also injects --home and + // --node explicitly (so RPC targeting does not rest on this file alone), but + // keyring-backend=test is resolved ONLY from here — the sourced helpers pass + // no --keyring-backend flag — so this write is load-bearing and its failure + // must surface. 
+ if err := writeClientConfig(filepath.Join(nodeDir, "config/client.toml"), net.opts.ChainID, addrs.rpcAddr); err != nil { + return fmt.Errorf("write client.toml for %s: %w", moniker, err) + } clientCx := client.Context{}. WithKeyringDir(clientDir).WithKeyring(kb).WithHomeDir(tmCfg.RootDir). @@ -291,24 +308,44 @@ func (net *Network) provisionExtraKeys(gb *genesisBuilder) error { return nil } -// startNode builds the app, constructs + starts the tendermint node with an -// EMPTY-valset genesis (recipe #1), wires the local RPC client, and registers -// the EVM listeners. The node's EVM Start() failures land on n.serveErr instead -// of panicking (recipe: a single bind failure must not kill all N nodes). +// startNode builds the app, constructs + starts the tendermint node, wires the +// local RPC client, and registers the EVM listeners. The node's EVM Start() +// failures land on n.serveErr instead of panicking (recipe: a single bind +// failure must not kill all N nodes). The genesis valset is N-dependent — see +// the recipe #1 / N=1 block below. func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error { theApp := newNodeApp(n, enc) theApp.SetEVMServeErr(n.serveErr) n.app = theApp - // recipe #1: zero the validator set so CometBFT derives it from InitChain. - // genesis.go writes Validators=nil at genesis-build time; this re-asserts the - // invariant against the file round-trip here (collectGentxs rewrites the - // genesis via ExportGenesisFileWithTime, so re-read it defensively). + // recipe #1 (N>=2): zero the validator set so every node derives the valset + // from its own InitChain response — without this, multi-node consensus replay + // fails. genesis.go writes Validators=nil at build time; re-assert it here + // against the collectGentxs file round-trip (ExportGenesisFileWithTime). 
+ // + // N=1 EXCEPTION: a sole validator must skip block-sync and produce blocks as + // solo proposer, which only happens when sei-tendermint's onlyValidatorIsUs + // (node/setup.go) sees state.Validators.Size()==1 with our consensus key at + // the blockSync decision (node/node.go: `blockSync := !onlyValidatorIsUs`). + // That decision reads the genesis-derived state (MakeGenesisState) BEFORE + // InitChain runs, so an empty valset leaves size 0, onlyValidatorIsUs returns + // false, and the node enters block-sync — where BlockPool.IsCaughtUp requires + // >1 peer (pool.go) and a 0-peer solo node hangs forever at height 1. Pinning + // the single validator into genesis here makes onlyValidatorIsUs fire. genDoc, err := tmtypes.GenesisDocFromFile(n.tmCfg.GenesisFile()) if err != nil { return err } genDoc.Validators = nil + if len(net.nodes) == 1 { + tmPub, perr := cryptocodec.ToTmPubKeyInterface(n.pubKey) + if perr != nil { + return fmt.Errorf("convert consensus pubkey for %s: %w", n.moniker, perr) + } + genDoc.Validators = []tmtypes.GenesisValidator{ + {PubKey: tmPub, Address: tmPub.Address(), Name: n.moniker, Power: 100}, + } + } tmNode, err := tmnode.New( ctx, n.tmCfg, func() {}, theApp, genDoc, @@ -457,10 +494,15 @@ broadcast-mode = "sync" // this node's loopback TM RPC so a bare host `seid --home ` already // targets the node without per-command flags (client/config.ReadFromClientConfig // reads /config/client.toml). broadcast-mode stays sync — the suites -// broadcast with -b sync and poll on-chain side effects. Best-effort: a failed -// write leaves the in-process arm's explicit per-command flags as the fallback. -func writeClientConfig(path, chainID, rpcAddr string) { - _ = os.WriteFile(path, []byte(fmt.Sprintf(clientConfigTemplate, chainID, rpcAddr)), 0o600) +// broadcast with -b sync and poll on-chain side effects. 
+// +// This write is load-bearing, not best-effort: the sourced _tx_helpers.sh call +// bare `seid` with no --keyring-backend flag, so keyring-backend=test is resolved +// from this file (the shim only injects --home and --node). A failed write would +// silently fall the keyring back to the OS default and break signing — so the +// error is returned, not swallowed. +func writeClientConfig(path, chainID, rpcAddr string) error { + return os.WriteFile(path, []byte(fmt.Sprintf(clientConfigTemplate, chainID, rpcAddr)), 0o600) } // freePort allocates a free loopback TCP port via server.FreeTCPAddr. diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index 1c022516b5..06451e480e 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -116,11 +116,14 @@ func assertCrossNodeTxRoundTrip(t *testing.T, ctx context.Context, net *Network) t.Fatalf("tx %X not observed on node1 within deadline", res.Hash) } -// TestStartRejectsZeroValidators guards the input validation. +// TestStartRejectsZeroValidators guards the input validation: 0 (too few) and 2 +// (the block-sync deadlock) are rejected without bring-up. N=1 and N>=3 are the +// valid topologies (proven live by TestInProcessNetwork at N=4). 
func TestStartRejectsZeroValidators(t *testing.T) { - _, err := Start(context.Background(), Options{Validators: 0}) - if err == nil { - t.Fatal("Start with 0 validators: want error, got nil") + for _, n := range []int{0, 2} { + if _, err := Start(context.Background(), Options{Validators: n}); err == nil { + t.Fatalf("Start with %d validators: want error, got nil", n) + } } } diff --git a/integration_test/runner/runner_inprocess.go b/integration_test/runner/runner_inprocess.go index 4d09e428ff..6327aecc5c 100644 --- a/integration_test/runner/runner_inprocess.go +++ b/integration_test/runner/runner_inprocess.go @@ -36,8 +36,9 @@ func WithInProcessNetwork(net *inprocess.Network) Option { // inProcessExecer runs commands on the host against an inprocess.Network. It // shims `seid` so opaque sourced helper scripts (which call bare `seid` / // `$seidbin`) land on the right node: the shim prepends `--home "$SEID_HOME"` -// to every real seid call, and the per-node client.toml the harness wrote under -// that home supplies chain-id, the test keyring, and the node's RPC address. +// and `--node "$SEID_NODE"` to every real seid call (explicit home + RPC +// targeting), and the per-node client.toml the harness wrote under that home +// supplies chain-id and the test keyring. type inProcessExecer struct { net *inprocess.Network @@ -57,7 +58,7 @@ func newInProcessExecer(net *inprocess.Network) *inProcessExecer { // contract; err is reserved for harness-level failures. 
func (e *inProcessExecer) run(t *testing.T, cmd, node string, envMap map[string]string, opts Options) (string, error) { t.Helper() - if err := e.ensureBin(); err != nil { + if err := e.ensureBin(t); err != nil { return "", fmt.Errorf("prepare seid: %w", err) } h, err := e.nodeFor(node) @@ -69,11 +70,19 @@ func (e *inProcessExecer) run(t *testing.T, cmd, node string, envMap map[string] // Run from the repo root so the suites' relative `source // integration_test/utils/_tx_helpers.sh` resolves (docker runs with the repo // mounted at the container CWD; `go test` runs with CWD = the package dir). - c.Dir = repoRoot() + root, err := repoRoot() + if err != nil { + return "", err + } + c.Dir = root c.Env = append(os.Environ(), envMapSlice(envMap)...) c.Env = append(c.Env, "PATH="+e.binDir+string(os.PathListSeparator)+os.Getenv("PATH"), "SEID_HOME="+h.Home(), + // SEID_NODE makes TM RPC targeting explicit via the shim's --node flag + // rather than resting solely on the per-node client.toml. RPCNodeAddr is the + // tcp:// form --node wants. + "SEID_NODE="+h.RPCNodeAddr(), "SEI_EVM_RPC="+h.EVMRPC(), "SEI_EVM_WS="+h.EVMWS(), // Some EVM suites read EVM_RPC; keep parity with SEI_EVM_RPC. @@ -117,13 +126,12 @@ func (e *inProcessExecer) nodeFor(node string) (inprocess.Node, error) { } // ensureBin builds the seid binary once and writes a `seid` shim alongside it, -// in a dir prepended to PATH. The shim execs the real binary with `--home -// "$SEID_HOME"` prepended: --home is a global persistent flag every seid -// subcommand accepts, so a single shim redirects bare `seid` calls (inside -// opaque sourced helpers) to the per-command node home without rewriting the -// commands. The build is on the same branch as the harness, so the CLI and the -// in-process app are the same code. -func (e *inProcessExecer) ensureBin() error { +// in a dir prepended to PATH. 
The shim redirects bare `seid` calls (inside +// opaque sourced helpers) to the per-command node home + RPC without rewriting +// the commands — see shimScript for the --home/--node split. The build is on the +// same branch as the harness, so the CLI and the in-process app are the same +// code. The temp build dir is removed via t.Cleanup so each run leaves none. +func (e *inProcessExecer) ensureBin(t *testing.T) error { e.once.Do(func() { dir, err := os.MkdirTemp("", "sei-inprocess-bin-") if err != nil { @@ -131,20 +139,26 @@ func (e *inProcessExecer) ensureBin() error { return } e.binDir = dir + // F5: the build dir holds a freshly-built seid + shim; remove it at test end + // so repeated runs don't accrue a binary per run. + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + root, err := repoRoot() + if err != nil { + e.setup = err + return + } realBin := filepath.Join(dir, "seid.real") // Build from this branch's source so the CLI matches the in-process app. build := exec.Command("go", "build", "-tags", "inprocess", "-o", realBin, "./cmd/seid") - build.Dir = repoRoot() + build.Dir = root if out, berr := build.CombinedOutput(); berr != nil { e.setup = fmt.Errorf("go build seid: %w\n%s", berr, out) return } shim := filepath.Join(dir, "seid") - // --home is global; prepending it is valid for every subcommand. exec - // replaces the shim process so signals/exit codes pass through cleanly. - script := "#!/bin/sh\nexec \"" + realBin + "\" --home \"$SEID_HOME\" \"$@\"\n" + script := shimScript(realBin) if werr := os.WriteFile(shim, []byte(script), 0o700); werr != nil { //nolint:gosec e.setup = werr return @@ -153,15 +167,31 @@ func (e *inProcessExecer) ensureBin() error { return e.setup } +// shimScript builds the `seid` shim. It always prepends --home (a root +// persistent flag) and appends --node only for client subcommands (query/q/tx/ +// status), where --node is registered — appending it to `keys` or other +// non-client subcommands would fail cobra flag parsing. 
+func shimScript(realBin string) string { + return "#!/bin/sh\n" + + "home_args=\"--home $SEID_HOME\"\n" + + "case \"$1\" in\n" + + " q|query|tx|status) node_args=\"--node $SEID_NODE\" ;;\n" + + " *) node_args=\"\" ;;\n" + + "esac\n" + + "exec \"" + realBin + "\" $home_args $node_args \"$@\"\n" +} + // repoRoot returns the sei-chain repo root by walking up from this source file's // package dir (integration_test/runner) to the module root, so `go build -// ./cmd/seid` resolves regardless of the test's working directory. -func repoRoot() string { - // runner package lives at /integration_test/runner; climb two levels. +// ./cmd/seid` resolves regardless of the test's working directory. It surfaces a +// Getwd failure rather than silently degrading to "." (a wrong build/run dir), +// which would fail confusingly downstream. +func repoRoot() (string, error) { + // `go test` runs with CWD = the package dir; runner lives at + // /integration_test/runner, so climb two levels. wd, err := os.Getwd() if err != nil { - return "." + return "", fmt.Errorf("resolve repo root: %w", err) } - // `go test` runs with CWD = the package dir. - return filepath.Clean(filepath.Join(wd, "..", "..")) + return filepath.Clean(filepath.Join(wd, "..", "..")), nil } diff --git a/integration_test/runner/runner_inprocess_test.go b/integration_test/runner/runner_inprocess_test.go index a119db0419..9fa1fcccf8 100644 --- a/integration_test/runner/runner_inprocess_test.go +++ b/integration_test/runner/runner_inprocess_test.go @@ -46,11 +46,16 @@ func TestInProcessBankModule(t *testing.T) { defer cancel() net, err := inprocess.Start(ctx, inprocess.Options{ - // Three validators: send_funds asserts only node-0 state, but fewer than - // three in-process validators stay in block-sync and never enter consensus - // (the block-sync→consensus transition needs a >2/3 voting-power quorum of - // reachable peers, which N<3 can't form). N=3 is the minimum live topology. 
- // admin lives on node 0 (the suite default); nodes 1-2 provide the quorum. + // Three validators. send_funds asserts only node-0 state, but the N choice + // is constrained by CometBFT's block-sync handoff, NOT a voting-power quorum: + // N=1 works — a sole validator skips block-sync and proposes solo + // (sei-tendermint onlyValidatorIsUs). + // N=2 HANGS — each node has exactly 1 peer; BlockPool.IsCaughtUp requires + // >1 peer, so neither leaves block-sync (Start rejects N=2). + // N>=3 works — every node has >=2 peers, so IsCaughtUp can fire and hand + // off to consensus. + // N=3 is the smallest MULTI-NODE topology, the point of this end-to-end demo: + // admin lives on node 0 (the suite default); nodes 1-2 are real consensus peers. Validators: 3, ChainID: chainID, TimeoutCommit: time.Second, From a3a398c9f47160263be1a8733c633b5d505307b3 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 16:04:09 -0700 Subject: [PATCH 07/10] fix(inprocess): surface EVM serve errors through WaitReady waitEVMServing now selects on the node's serveErr channel alongside the poll tick, so a reported EVM listener-start failure short-circuits with the real error instead of polling eth_blockNumber until the ctx deadline and masking it as a generic timeout. Consumption is non-destructive: the received error is re-sent (non-blocking, slot just freed) so Node.ServeErr() still observes it after WaitReady returns. Production seid (nil channel -> panic in app.go) is untouched. 
Co-Authored-By: Claude Opus 4.8 --- inprocess/handle.go | 2 +- inprocess/readiness.go | 22 +++++++++++++-- inprocess/readiness_test.go | 55 +++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 inprocess/readiness_test.go diff --git a/inprocess/handle.go b/inprocess/handle.go index 59cfe514ad..3abc3384bb 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -83,7 +83,7 @@ func (h Node) WaitReady(ctx context.Context) error { if err := waitHeightAdvances(ctx, probeClient, h.TendermintRPC(), 1); err != nil { return fmt.Errorf("%s tendermint: %w", h.n.moniker, err) } - if err := waitEVMServing(ctx, probeClient, h.EVMRPC()); err != nil { + if err := waitEVMServing(ctx, probeClient, h.EVMRPC(), h.n.serveErr); err != nil { return fmt.Errorf("%s evm: %w", h.n.moniker, err) } return nil diff --git a/inprocess/readiness.go b/inprocess/readiness.go index 4306d9754d..aa6b6d43fd 100644 --- a/inprocess/readiness.go +++ b/inprocess/readiness.go @@ -50,8 +50,18 @@ func waitHeightAdvances(ctx context.Context, hc *http.Client, tmRPC string, delt } // waitEVMServing blocks until evmRPC answers eth_blockNumber with a non-empty, -// error-free result — proof the EVM JSON-RPC listener is bound and serving. -func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string) error { +// error-free result — proof the EVM JSON-RPC listener is bound and serving — or +// until a listener-start failure arrives on serveErr, short-circuiting the poll +// with the actual reported error rather than letting it time out generically. +// +// serveErr is the node's buffered (cap 2: HTTP+WS) EVM-serve channel; the app's +// reportEVMServeErr diverts a listener Start() failure here instead of panicking. +// Consumption is non-destructive: a received error is re-sent (non-blocking — we +// just freed a slot, so the buffer has room) so the public Node.ServeErr() +// accessor still observes it after WaitReady returns. 
It is passed bidirectional +// (not <-chan) precisely so it can be re-sent. serveErr may be nil (no EVM +// listeners registered), in which case the receive arm is never ready. +func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string, serveErr chan error) error { const body = `{"jsonrpc":"2.0","id":1,"method":"eth_blockNumber","params":[]}` tick := time.NewTicker(probeInterval) defer tick.Stop() @@ -70,6 +80,14 @@ func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string) error { select { case <-ctx.Done(): return fmt.Errorf("%s eth_blockNumber not serving before deadline: %w", evmRPC, ctx.Err()) + case err := <-serveErr: + // Re-send so a later Node.ServeErr() read still sees it; non-blocking, + // and the slot we just drained guarantees room. + select { + case serveErr <- err: + default: + } + return fmt.Errorf("%s EVM serve failed: %w", evmRPC, err) case <-tick.C: } } diff --git a/inprocess/readiness_test.go b/inprocess/readiness_test.go new file mode 100644 index 0000000000..730b87f482 --- /dev/null +++ b/inprocess/readiness_test.go @@ -0,0 +1,55 @@ +//go:build inprocess + +package inprocess + +import ( + "context" + "errors" + "net/http" + "strings" + "testing" + "time" +) + +// TestWaitEVMServingSurfacesServeErr proves a reported EVM listener-start failure +// short-circuits waitEVMServing with the actual error, rather than polling the +// unreachable endpoint until the ctx deadline and returning a generic timeout. +// The EVM URL points at a closed loopback port so the poll never succeeds; the +// pre-seeded serveErr channel stands in for app.reportEVMServeErr's divert. 
+func TestWaitEVMServingSurfacesServeErr(t *testing.T) { + serveErr := make(chan error, 2) // matches the node's HTTP+WS buffer + bindErr := errors.New("listen tcp 0.0.0.0:8545: bind: address already in use") + serveErr <- bindErr + + // Generous ctx: if the short-circuit failed we'd block on the poll loop until + // this fires, so the test would catch a regression as a timeout-shaped error. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + hc := &http.Client{Timeout: time.Second} + start := time.Now() + err := waitEVMServing(ctx, hc, "http://127.0.0.1:1", serveErr) + if err == nil { + t.Fatal("waitEVMServing returned nil; want the reported serve error") + } + if !errors.Is(err, bindErr) { + t.Fatalf("error does not wrap the reported serve error: %v", err) + } + if errors.Is(err, context.DeadlineExceeded) || strings.Contains(err.Error(), "not serving before deadline") { + t.Fatalf("got a generic timeout, want the real serve error: %v", err) + } + if elapsed := time.Since(start); elapsed > 5*time.Second { + t.Fatalf("short-circuit took %v; expected near-immediate", elapsed) + } + + // Non-destructive contract: the error is re-sent, so a later ServeErr()-style + // read still observes it. + select { + case got := <-serveErr: + if !errors.Is(got, bindErr) { + t.Fatalf("re-sent error = %v, want %v", got, bindErr) + } + default: + t.Fatal("serveErr drained: readiness consumption was destructive") + } +} From 30459acd0f28122a37aed4ff2b7bdebe216729bf Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 16:20:55 -0700 Subject: [PATCH 08/10] refactor(inprocess): guard the implicit P2P mesh; named invariants; cleanup The harness's P2P mesh is derived implicitly: collectGentxs mutates each node's tmCfg.P2P.PersistentPeers in place. 
Correct the doc to describe that mechanism (it never set PersistentPeers itself) and add a post-collectGentxs guard that fails loudly for N>=2 if the wiring didn't land, turning a fragile silent dependency into a fast failure. Replace the rot-prone "recipe #N" taxonomy with self-describing named invariants referenced at point-of-use (empty-valset, gentx-derived peer mesh, EVM-enable injection, metrics-off constraint, loopback bind scope / 0.0.0.0 EVM caveat, loopback conn-tracker ceiling, validator-count rule). Also: nolint:gosec on the seid build exec (consistency with siblings); drop the F5 step-tag comment; probeInterval var -> const; document that ServeErr() must be read after WaitReady, not concurrently; add a test asserting metrics stay off. Co-Authored-By: Claude Opus 4.8 --- inprocess/appoptions.go | 2 +- inprocess/doc.go | 62 +++++++++++-------- inprocess/genesis.go | 5 +- inprocess/handle.go | 11 +++- inprocess/harness.go | 68 ++++++++++++++------- inprocess/harness_test.go | 15 +++++ inprocess/readiness.go | 2 +- integration_test/runner/runner_inprocess.go | 6 +- 8 files changed, 114 insertions(+), 57 deletions(-) diff --git a/inprocess/appoptions.go b/inprocess/appoptions.go index c3ade3ee88..8f3a6cee3f 100644 --- a/inprocess/appoptions.go +++ b/inprocess/appoptions.go @@ -7,7 +7,7 @@ import "github.com/sei-protocol/sei-chain/app" // appOptions is the per-node servertypes.AppOptions the harness injects into // app.New. app.TestAppOpts hard-disables the EVM HTTP/WS listeners to avoid port // clashes in single-app tests; the harness needs the opposite — EVM enabled on -// distinct per-node ports (recipe #3) — plus the chain-id the sei-chain helpers +// distinct per-node ports (the EVM-enable injection invariant) — plus the chain-id the sei-chain helpers // hardcode. Unknown keys return nil, matching servertypes.AppOptions semantics // (callers treat a nil as "unset, use the default"). 
type appOptions struct { diff --git a/inprocess/doc.go b/inprocess/doc.go index 7bc834da0d..6f1d4c9458 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -28,35 +28,47 @@ // sei.NodeHandle / sei.NetworkHandle so a thin adapter can satisfy the SDK // interface once the toolchain skew is resolved — see Node and Network below. // -// # Recipe (the gotchas that make N>1 consensus + per-node RPC work) +// # Invariants (the gotchas that make N>1 consensus + per-node RPC work) // // These are the load-bearing deltas vs sei-cosmos/testutil/network.New, proven -// by the N-RPC spike and preserved here: +// by the N-RPC spike and preserved here. Each is named and referenced by name at +// its point-of-use in the code (there is no central numbered list to drift): // -// 1. genDoc.Validators = nil — let CometBFT derive the valset from the app's -// InitChain response. testutil/network sets it to []{self}, which fails -// consensus replay for N>1. -// 2. Full P2P mesh — persistent-peers wired nodeID@127.0.0.1:p2pPort across all -// N (testutil/network wires zero) — without the mesh nodes never gossip and -// consensus never forms for N>1. -// 3. Injected AppOptions enable EVM HTTP/WS on per-node ports — without them -// app.TestAppOpts hard-disables the listeners and no node serves EVM. -// 4. tmCfg.Instrumentation.Prometheus = false — metrics off avoids the -// dup-registry panic from the process-wide registries. Invariant: metrics +// - empty-valset invariant: genDoc.Validators = nil — let CometBFT derive the +// valset from the app's InitChain response. testutil/network sets it to +// []{self}, which fails consensus replay for N>1. (N=1 is the documented +// exception under the validator-count rule below.) +// - gentx-derived peer mesh: the P2P mesh is NOT wired explicitly by the +// harness. 
Each validator's gentx memo carries nodeID@127.0.0.1:p2pPort, and +// collectGentxs → genutil.GenAppStateFromConfig (sei-cosmos x/genutil) mutates +// P2P.PersistentPeers IN PLACE on the same *config.Config the harness holds in +// node.tmCfg and later hands to tmnode.New. So the mesh is derived from the +// gentxs, not set by harness code — without it nodes never gossip and +// consensus never forms for N>1. This in-place mutation is invisible at the +// harness layer and fragile: a refactor that clones tmCfg before collectGentxs, +// or builds nodes before collecting, silently breaks consensus for all N. Start +// guards it — after collectGentxs it asserts PersistentPeers is non-empty for +// N>=2 and fails loudly otherwise. +// - EVM-enable injection: injected AppOptions enable EVM HTTP/WS on per-node +// ports — without them app.TestAppOpts hard-disables the listeners and no node +// serves EVM. +// - metrics-off constraint: tmCfg.Instrumentation.Prometheus = false — metrics +// off avoids the dup-registry panic from the process-wide registries. Metrics // must stay off until the evmrpc/EVM-keeper metrics are de-globalized — // re-enabling Prometheus without that reintroduces the panic. -// 5. TM RPC / P2P listeners scoped to 127.0.0.1 (they default to [::] / -// 0.0.0.0) — without scoping an in-process harness publishes externally -// reachable consensus/RPC listeners. Caveat (accepted): the EVM HTTP/WS listeners bind all -// interfaces (0.0.0.0) for the harness lifetime; only TM RPC/P2P are -// loopback-scoped. They run on free ephemeral ports, dialed via 127.0.0.1. -// Tightening requires a bind-host option in evmrpc (not yet present). -// 6. MaxIncomingConnectionAttempts raised — loopback collapses all peers onto -// 127.0.0.1, so the router's IP-keyed conn-tracker counts the startup burst -// on one key — without the raise the burst trips the per-IP cap and peers -// are rejected. 
+// - loopback bind scope: TM RPC / P2P listeners scoped to 127.0.0.1 (they +// default to [::] / 0.0.0.0) — without scoping an in-process harness publishes +// externally reachable consensus/RPC listeners. 0.0.0.0 EVM caveat (accepted): +// the EVM HTTP/WS listeners bind all interfaces (0.0.0.0) for the harness +// lifetime; only TM RPC/P2P are loopback-scoped. They run on free ephemeral +// ports, dialed via 127.0.0.1. Tightening requires a bind-host option in +// evmrpc (not yet present). +// - loopback conn-tracker ceiling: MaxIncomingConnectionAttempts raised — +// loopback collapses all peers onto 127.0.0.1, so the router's IP-keyed +// conn-tracker counts the startup burst on one key — without the raise the +// burst trips the per-IP cap and peers are rejected. // -// # Validator count: 1 or >= 3 (2 is the trap) +// # Validator-count rule: 1 or >= 3 (2 is the trap) // // When wiring a suite, pick Validators = 1 or Validators >= 3. Start rejects 2. // The constraint is CometBFT's block-sync→consensus handoff, NOT a voting-power @@ -66,8 +78,8 @@ // (sei-tendermint onlyValidatorIsUs, node/setup.go, gating // `blockSync := !onlyValidatorIsUs` in node/node.go). That decision reads the // genesis-derived valset BEFORE InitChain, so the harness pins the single -// validator into genesis for N=1 (recipe #1's empty valset would leave size 0, -// defeat onlyValidatorIsUs, and the solo node would hang in block-sync — see +// validator into genesis for N=1 (the empty-valset invariant would leave size +// 0, defeat onlyValidatorIsUs, and the solo node would hang in block-sync — see // startNode). // - N=2 hangs. 
Each node has exactly one peer, and BlockPool.IsCaughtUp // (internal/blocksync/pool.go) hard-requires len(peers) > 1 to ever report diff --git a/inprocess/genesis.go b/inprocess/genesis.go index 9359db228a..6c91644b2b 100644 --- a/inprocess/genesis.go +++ b/inprocess/genesis.go @@ -27,7 +27,8 @@ import ( // genesisBuilder accumulates per-validator accounts, balances, and gentxs across // the key-generation pass, then assembles a shared genesis whose validator set // is left EMPTY so every node derives the consensus valset from its InitChain -// response (recipe #1) — the single most important delta from testutil/network. +// response (the empty-valset invariant) — the single most important delta from +// testutil/network. // // This is a self-contained reimplementation of the unexported initGenFiles / // collectGenFiles / writeFile helpers in sei-cosmos/testutil/network: lifting @@ -148,7 +149,7 @@ func (b *genesisBuilder) writeBaseGenesis(baseState map[string]json.RawMessage, genDoc := tmtypes.GenesisDoc{ ChainID: b.chainID, AppState: appStateJSON, - Validators: nil, // recipe #1: derive valset from InitChain. + Validators: nil, // empty-valset invariant: derive valset from InitChain. } for _, gf := range genFiles { if err := genDoc.SaveAs(gf); err != nil { diff --git a/inprocess/handle.go b/inprocess/handle.go index 3abc3384bb..47b74f4142 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -50,13 +50,13 @@ func (h Node) RPCNodeAddr() string { return h.n.rpcAddr } func (h Node) TendermintRPC() string { return "http://" + stripScheme(h.n.rpcAddr) } // EVMRPC is the node's EVM JSON-RPC HTTP URL. The URL dials loopback, but the -// listener itself binds 0.0.0.0 (see doc.go recipe #5's accepted caveat). +// listener itself binds 0.0.0.0 (see doc.go's 0.0.0.0 EVM caveat). func (h Node) EVMRPC() string { return fmt.Sprintf("http://127.0.0.1:%d", h.n.httpPort) } // EVMWS is the node's EVM JSON-RPC WebSocket URL. 
Not part of the SDK // NodeHandle surface, but the in-process harness binds it, so it is exposed. -// The URL dials loopback, but the listener itself binds 0.0.0.0 (see doc.go -// recipe #5's accepted caveat). +// The URL dials loopback, but the listener itself binds 0.0.0.0 (see doc.go's +// 0.0.0.0 EVM caveat). func (h Node) EVMWS() string { return fmt.Sprintf("ws://127.0.0.1:%d", h.n.wsPort) } // REST is "" — the harness does not start the Cosmos LCD listener (reserved: @@ -73,6 +73,11 @@ func (h Node) Object() any { return h.n.tmNode } // (instead of the process-wide panic the production path uses). A non-nil // receive means that node's EVM listener failed to bind; consensus may still be // healthy. Buffered (cap 2: HTTP + WS). +// +// Read it AFTER WaitReady returns, not concurrently with it. WaitReady's probe +// drains and re-sends on this same channel, and the re-send is single-receiver- +// at-a-time — a concurrent read can race WaitReady for the re-sent error. This is +// within the documented "not goroutine-safe across calls" contract. func (h Node) ServeErr() <-chan error { return h.n.serveErr } // WaitReady blocks until this node has joined consensus (height advancing) and diff --git a/inprocess/harness.go b/inprocess/harness.go index 78868072c0..0b7ec19e00 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -126,7 +126,7 @@ type node struct { app *app.App tmNode rpclocal.NodeService rpc *rpclocal.Local - serveErr chan error // EVM listener Start() failures (recipe: no process-wide panic) + serveErr chan error // EVM listener Start() failures, diverted here instead of a process-wide panic } // Network is a handle to a running in-process mesh. 
It owns the lifecycle: Close @@ -139,9 +139,16 @@ type Network struct { closed bool } -// Start stands up opts.Validators in-process validators, wires a full P2P mesh, -// starts each node's RPC + EVM listeners, and returns once every node is -// constructed and started (NOT once consensus is live — call WaitReady for that). +// Start stands up opts.Validators in-process validators, starts each node's RPC +// + EVM listeners, and returns once every node is constructed and started (NOT +// once consensus is live — call WaitReady for that). +// +// The P2P mesh is not wired by Start directly. It is derived per the +// gentx-derived peer mesh invariant (see doc.go): collectGentxs → +// genutil.GenAppStateFromConfig mutates each node's tmCfg.P2P.PersistentPeers in +// place from the gentx memos. Start guards that this implicit wiring actually +// happened (see the assertion after collectGentxs) so a refactor that breaks it +// fails loudly instead of silently dropping consensus. // // On any error mid-bring-up, every already-started node is torn down before // returning, so a partial failure leaks nothing. The caller still must Close the @@ -199,6 +206,20 @@ func Start(ctx context.Context, opts Options) (_ *Network, retErr error) { if err := gb.collectGentxs(net.nodes, filepath.Join(baseDir, "gentxs")); err != nil { return nil, fmt.Errorf("collect gentxs: %w", err) } + // gentx-derived peer mesh guard: collectGentxs is what populates each node's + // PersistentPeers (in place, via GenAppStateFromConfig — see doc.go). For N>=2 + // an empty PersistentPeers means the implicit wiring did not land and consensus + // will never form; fail loudly here rather than hang in WaitReady. 
+ if len(net.nodes) >= 2 { + for _, n := range net.nodes { + if n.tmCfg.P2P.PersistentPeers == "" { + return nil, fmt.Errorf( + "inprocess: gentx-derived peer mesh not wired: collectGentxs did not populate PersistentPeers for %s — did a refactor clone or reorder the config?", + n.moniker, + ) + } + } + } for _, n := range net.nodes { if err := net.startNode(ctx, n, enc); err != nil { @@ -310,18 +331,19 @@ func (net *Network) provisionExtraKeys(gb *genesisBuilder) error { // startNode builds the app, constructs + starts the tendermint node, wires the // local RPC client, and registers the EVM listeners. The node's EVM Start() -// failures land on n.serveErr instead of panicking (recipe: a single bind -// failure must not kill all N nodes). The genesis valset is N-dependent — see -// the recipe #1 / N=1 block below. +// failures land on n.serveErr instead of panicking (so a single bind failure +// must not kill all N nodes). The genesis valset is N-dependent per the +// empty-valset invariant — see the N=1 exception below. func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error { theApp := newNodeApp(n, enc) theApp.SetEVMServeErr(n.serveErr) n.app = theApp - // recipe #1 (N>=2): zero the validator set so every node derives the valset - // from its own InitChain response — without this, multi-node consensus replay - // fails. genesis.go writes Validators=nil at build time; re-assert it here - // against the collectGentxs file round-trip (ExportGenesisFileWithTime). + // empty-valset invariant (N>=2): zero the validator set so every node derives + // the valset from its own InitChain response — without this, multi-node + // consensus replay fails. genesis.go writes Validators=nil at build time; + // re-assert it here against the collectGentxs file round-trip + // (ExportGenesisFileWithTime). 
// // N=1 EXCEPTION: a sole validator must skip block-sync and produce blocks as // solo proposer, which only happens when sei-tendermint's onlyValidatorIsUs @@ -394,10 +416,11 @@ type nodeAddrs struct { httpPort, wsPort int } -// buildNodeConfig builds an isolated per-node tendermint config with loopback TM -// RPC / P2P listeners and the conn-tracker ceiling raised (recipes #4, #5, #6). +// buildNodeConfig builds an isolated per-node tendermint config: metrics off +// (metrics-off constraint), loopback TM RPC / P2P listeners (loopback bind +// scope), and the conn-tracker ceiling raised (loopback conn-tracker ceiling). // EVM bind-host is not config-scopable (evmrpc hardcodes 0.0.0.0); the EVM ports -// are allocated free here and dialed via loopback. +// are allocated free here and dialed via loopback (the 0.0.0.0 EVM caveat). func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*config.Config, nodeAddrs, error) { sctx := server.NewDefaultContext() tmCfg := sctx.Config @@ -406,18 +429,19 @@ func buildNodeConfig(nodeDir, moniker string, timeoutCommit time.Duration) (*con tmCfg.SetRoot(nodeDir) tmCfg.Consensus.UnsafeCommitTimeoutOverride = timeoutCommit tmCfg.TxIndex = config.TestTxIndexConfig() - // recipe #6: loopback collapses every peer onto 127.0.0.1, so the router's - // IP-keyed conn-tracker counts all N-1 inbound on one key. AllowDuplicateIP - // is a peer-manager flag and does NOT touch the router conn-tracker. + // loopback conn-tracker ceiling: loopback collapses every peer onto 127.0.0.1, + // so the router's IP-keyed conn-tracker counts all N-1 inbound on one key. + // AllowDuplicateIP is a peer-manager flag and does NOT touch the router + // conn-tracker. tmCfg.P2P.MaxIncomingConnectionAttempts = 10000 tmCfg.P2P.AllowDuplicateIP = true - // recipe #4: metrics-off avoids the prometheus.DefaultRegisterer dup panic - // from the process-wide registries. 
Invariant: this must stay off until the - // evmrpc/EVM-keeper metrics are de-globalized — re-enabling Prometheus - // without that reintroduces the panic. + // metrics-off constraint: metrics-off avoids the prometheus.DefaultRegisterer + // dup panic from the process-wide registries. This must stay off until the + // evmrpc/EVM-keeper metrics are de-globalized — re-enabling Prometheus without + // that reintroduces the panic. tmCfg.Instrumentation.Prometheus = false - // recipe #5: server.FreeTCPAddr composes tcp://0.0.0.0:PORT — a publicly-bound + // loopback bind scope: server.FreeTCPAddr composes tcp://0.0.0.0:PORT — a publicly-bound // listener. An in-process harness must scope every listener to loopback, so we // take only the free port and compose the 127.0.0.1 address ourselves. var a nodeAddrs diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index 06451e480e..3ef88ab452 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -127,6 +127,21 @@ func TestStartRejectsZeroValidators(t *testing.T) { } } +// TestBuildNodeConfigMetricsOff mechanically guards the metrics-off constraint +// (see doc.go): a built node config must keep Instrumentation.Prometheus = false. +// Re-enabling it reintroduces the process-wide dup-registry panic, so this catches +// a regression in CI rather than relying on reviewer memory. +func TestBuildNodeConfigMetricsOff(t *testing.T) { + dir := t.TempDir() + tmCfg, _, err := buildNodeConfig(dir, "node0", time.Second) + if err != nil { + t.Fatalf("buildNodeConfig: %v", err) + } + if tmCfg.Instrumentation.Prometheus { + t.Fatal("Instrumentation.Prometheus = true; metrics-off constraint violated — this reintroduces the dup-registry panic") + } +} + // TestFreshChainIDPerRun pins the per-run unique chain-id discipline: an empty // Options.ChainID must yield a distinct id each time, so a run never collides // with a prior run's persisted genesis. Pure-function check — no bring-up. 
diff --git a/inprocess/readiness.go b/inprocess/readiness.go index aa6b6d43fd..e8fce4800c 100644 --- a/inprocess/readiness.go +++ b/inprocess/readiness.go @@ -21,7 +21,7 @@ import ( // is mechanical. // probeInterval is the readiness poll cadence. -var probeInterval = 500 * time.Millisecond +const probeInterval = 500 * time.Millisecond // waitHeightAdvances blocks until tmRPC's committed height rises by >= delta // from the first successful read — proof the chain is producing blocks, not diff --git a/integration_test/runner/runner_inprocess.go b/integration_test/runner/runner_inprocess.go index 6327aecc5c..da19b439d9 100644 --- a/integration_test/runner/runner_inprocess.go +++ b/integration_test/runner/runner_inprocess.go @@ -139,8 +139,8 @@ func (e *inProcessExecer) ensureBin(t *testing.T) error { return } e.binDir = dir - // F5: the build dir holds a freshly-built seid + shim; remove it at test end - // so repeated runs don't accrue a binary per run. + // Remove the build dir at test end so repeated runs don't accrue a binary + // per run. t.Cleanup(func() { _ = os.RemoveAll(dir) }) root, err := repoRoot() @@ -150,7 +150,7 @@ func (e *inProcessExecer) ensureBin(t *testing.T) error { } realBin := filepath.Join(dir, "seid.real") // Build from this branch's source so the CLI matches the in-process app. - build := exec.Command("go", "build", "-tags", "inprocess", "-o", realBin, "./cmd/seid") + build := exec.Command("go", "build", "-tags", "inprocess", "-o", realBin, "./cmd/seid") //nolint:gosec build.Dir = root if out, berr := build.CombinedOutput(); berr != nil { e.setup = fmt.Errorf("go build seid: %w\n%s", berr, out) From 4e143fcb987938aa74167742ca64ddb7ea8b1b08 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 17:19:27 -0700 Subject: [PATCH 09/10] drop EVM serve-error diversion; keep handle exposure for teardown The diversion (route a listener Start() failure to a channel instead of panicking) only softened a rare EVM port-bind collision. 
For a test harness a loud panic on a rare event is fine, and the diversion's production footprint is not worth it. Production app.go reverts to the original bare panic(err) serve goroutines; the sole retained production change is keeping the constructed EVM HTTP/WS handles so the harness can Stop() them at teardown. Co-Authored-By: Claude Opus 4.8 --- app/app.go | 46 ++----------------------------- app/app_inprocess.go | 11 ++------ inprocess/doc.go | 6 +++- inprocess/handle.go | 13 +-------- inprocess/harness.go | 15 ++++------ inprocess/harness_test.go | 9 ------ inprocess/readiness.go | 23 +++------------- inprocess/readiness_test.go | 55 ------------------------------------- 8 files changed, 19 insertions(+), 159 deletions(-) delete mode 100644 inprocess/readiness_test.go diff --git a/app/app.go b/app/app.go index 579094b8a0..4ce2d3f1f2 100644 --- a/app/app.go +++ b/app/app.go @@ -485,14 +485,6 @@ type App struct { // reaps the listeners. evmHTTPServer evmrpc.EVMServer evmWSServer evmrpc.EVMServer - // evmServeErr, when non-nil, diverts an EVM listener Start() (listener-start) - // failure to the channel instead of panicking. Production leaves it nil: a - // listener-start failure panics and crashes the node. An in-process host sets - // it via SetEVMServeErr before the first block so one node's listener-start - // failure is a reportable error rather than a process-wide panic killing all N. - // Construct-time bind failures (NewEVM*Server below) are not covered — those - // stay fail-fast. 
- evmServeErr chan<- error txPrioritizer sdk.TxPrioritizer @@ -2744,10 +2736,9 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T } app.evmHTTPServer = evmHTTPServer go func() { - defer app.recoverEVMServe() <-app.httpServerStartSignal if err := evmHTTPServer.Start(); err != nil { - app.reportEVMServeErr(err) + panic(err) } }() } @@ -2760,10 +2751,9 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T } app.evmWSServer = evmWSServer go func() { - defer app.recoverEVMServe() <-app.wsServerStartSignal if err := evmWSServer.Start(); err != nil { - app.reportEVMServeErr(err) + panic(err) } }() } @@ -2779,38 +2769,6 @@ func (app *App) RegisterLocalServices(node client.LocalClient, txConfig client.T } } -// reportEVMServeErr diverts an EVM listener Start() failure to the registered -// error channel, or panics when no channel is set (production seid). The send is -// non-blocking: the channel is buffered and a second listener's failure must not -// deadlock the goroutine. -func (app *App) reportEVMServeErr(err error) { - if app.evmServeErr == nil { - panic(err) - } - select { - case app.evmServeErr <- err: - default: - } -} - -// recoverEVMServe is the deferred guard on the EVM listener goroutines. A panic -// inside Start() (beyond the error it returns cleanly) is converted to a reported -// error when a channel is registered, so one node's listener panic does not crash -// an in-process host running N nodes. With no channel (production seid) it -// re-panics, keeping the fail-loud contract. 
-func (app *App) recoverEVMServe() { - if r := recover(); r != nil { - if app.evmServeErr == nil { - panic(r) - } - err, ok := r.(error) - if !ok { - err = fmt.Errorf("evm serve panic: %v", r) - } - app.reportEVMServeErr(err) - } -} - // RegisterSwaggerAPI registers swagger route with API Server func RegisterSwaggerAPI(rtr *mux.Router) { statikFS, err := fs.NewWithNamespace("swagger") diff --git a/app/app_inprocess.go b/app/app_inprocess.go index 5023eb4261..c46a33c2af 100644 --- a/app/app_inprocess.go +++ b/app/app_inprocess.go @@ -7,15 +7,8 @@ import "github.com/sei-protocol/sei-chain/evmrpc" // This file holds the harness-only accessors for App's EVM serve plumbing. They // are gated behind the `inprocess` build tag so production App's public surface // does not widen — only the in-process harness (which builds with that tag) sees -// them. The backing fields and the reportEVMServeErr/recoverEVMServe helpers stay -// in untagged app.go because the production serve goroutines use them. - -// SetEVMServeErr registers the channel that EVM listener Start() (listener-start) -// failures are sent to, replacing the default fail-loud panic. An in-process host -// that runs multiple apps in one process calls this before the first block so one -// node's listener-start failure is a reportable error rather than a process-wide -// panic. The channel should be buffered (>= 2: one HTTP + one WS listener). -func (app *App) SetEVMServeErr(ch chan<- error) { app.evmServeErr = ch } +// them. The backing handle fields stay in untagged app.go because the production +// serve goroutines construct them. // EVMHTTPServer returns the EVM JSON-RPC HTTP listener constructed in // RegisterLocalServices, or nil if HTTP serving is disabled. 
An embedding diff --git a/inprocess/doc.go b/inprocess/doc.go index 6f1d4c9458..b908d6913e 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -62,7 +62,11 @@ // the EVM HTTP/WS listeners bind all interfaces (0.0.0.0) for the harness // lifetime; only TM RPC/P2P are loopback-scoped. They run on free ephemeral // ports, dialed via 127.0.0.1. Tightening requires a bind-host option in -// evmrpc (not yet present). +// evmrpc (not yet present). A rare EVM port-bind collision (the free port is +// taken between FreeTCPAddr's probe-close and the listener's bind) panics the +// node's serve goroutine — the production fail-loud path, intentionally not +// diverted here. If it ever flakes, the fix is hardening the FreeTCPAddr +// bind-close-rebind TOCTOU window, NOT re-adding a serve-error diversion. // - loopback conn-tracker ceiling: MaxIncomingConnectionAttempts raised — // loopback collapses all peers onto 127.0.0.1, so the router's IP-keyed // conn-tracker counts the startup burst on one key — without the raise the diff --git a/inprocess/handle.go b/inprocess/handle.go index 47b74f4142..039aed126a 100644 --- a/inprocess/handle.go +++ b/inprocess/handle.go @@ -69,17 +69,6 @@ func (h Node) REST() string { return "" } // capability k8s mode never offers. func (h Node) Object() any { return h.n.tmNode } -// ServeErr returns the channel EVM listener Start() failures are reported on -// (instead of the process-wide panic the production path uses). A non-nil -// receive means that node's EVM listener failed to bind; consensus may still be -// healthy. Buffered (cap 2: HTTP + WS). -// -// Read it AFTER WaitReady returns, not concurrently with it. WaitReady's probe -// drains and re-sends on this same channel, and the re-send is single-receiver- -// at-a-time — a concurrent read can race WaitReady for the re-sent error. This is -// within the documented "not goroutine-safe across calls" contract. 
-func (h Node) ServeErr() <-chan error { return h.n.serveErr } - // WaitReady blocks until this node has joined consensus (height advancing) and // its EVM listener is serving, or ctx fires. Its single-ctx signature mirrors // the SDK's sei.NodeHandle.WaitReady; the probe HTTP client is an internal @@ -88,7 +77,7 @@ func (h Node) WaitReady(ctx context.Context) error { if err := waitHeightAdvances(ctx, probeClient, h.TendermintRPC(), 1); err != nil { return fmt.Errorf("%s tendermint: %w", h.n.moniker, err) } - if err := waitEVMServing(ctx, probeClient, h.EVMRPC(), h.n.serveErr); err != nil { + if err := waitEVMServing(ctx, probeClient, h.EVMRPC()); err != nil { return fmt.Errorf("%s evm: %w", h.n.moniker, err) } return nil diff --git a/inprocess/harness.go b/inprocess/harness.go index 0b7ec19e00..eadaaff38e 100644 --- a/inprocess/harness.go +++ b/inprocess/harness.go @@ -123,10 +123,9 @@ type node struct { httpPort int // EVM JSON-RPC HTTP wsPort int // EVM JSON-RPC WS - app *app.App - tmNode rpclocal.NodeService - rpc *rpclocal.Local - serveErr chan error // EVM listener Start() failures, diverted here instead of a process-wide panic + app *app.App + tmNode rpclocal.NodeService + rpc *rpclocal.Local } // Network is a handle to a running in-process mesh. It owns the lifecycle: Close @@ -298,7 +297,6 @@ func (net *Network) provisionNodes(enc encoding, gb *genesisBuilder) error { p2pHost: addrs.p2pHost, p2pPort: addrs.p2pPort, rpcAddr: addrs.rpcAddr, httpPort: addrs.httpPort, wsPort: addrs.wsPort, - serveErr: make(chan error, 2), // one HTTP + one WS listener }) } return nil @@ -330,13 +328,10 @@ func (net *Network) provisionExtraKeys(gb *genesisBuilder) error { } // startNode builds the app, constructs + starts the tendermint node, wires the -// local RPC client, and registers the EVM listeners. The node's EVM Start() -// failures land on n.serveErr instead of panicking (so a single bind failure -// must not kill all N nodes). 
The genesis valset is N-dependent per the -// empty-valset invariant — see the N=1 exception below. +// local RPC client, and registers the EVM listeners. The genesis valset is +// N-dependent per the empty-valset invariant — see the N=1 exception below. func (net *Network) startNode(ctx context.Context, n *node, enc encoding) error { theApp := newNodeApp(n, enc) - theApp.SetEVMServeErr(n.serveErr) n.app = theApp // empty-valset invariant (N>=2): zero the validator set so every node derives diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index 3ef88ab452..723be30770 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -50,15 +50,6 @@ func TestInProcessNetwork(t *testing.T) { t.Logf("node %s: tm=%s evm=%s ws=%s", nd.Name(), nd.TendermintRPC(), nd.EVMRPC(), nd.EVMWS()) } - // No EVM listener reported a bind failure. - for i := 0; i < n; i++ { - select { - case err := <-net.Node(i).ServeErr(): - t.Fatalf("node %s EVM serve error: %v", net.Node(i).Name(), err) - default: - } - } - // VERIFY 3: tx broadcast on node 0 is observable on node 1's independent RPC. assertCrossNodeTxRoundTrip(t, ctx, net) } diff --git a/inprocess/readiness.go b/inprocess/readiness.go index e8fce4800c..3470a5d2b1 100644 --- a/inprocess/readiness.go +++ b/inprocess/readiness.go @@ -51,17 +51,10 @@ func waitHeightAdvances(ctx context.Context, hc *http.Client, tmRPC string, delt // waitEVMServing blocks until evmRPC answers eth_blockNumber with a non-empty, // error-free result — proof the EVM JSON-RPC listener is bound and serving — or -// until a listener-start failure arrives on serveErr, short-circuiting the poll -// with the actual reported error rather than letting it time out generically. -// -// serveErr is the node's buffered (cap 2: HTTP+WS) EVM-serve channel; the app's -// reportEVMServeErr diverts a listener Start() failure here instead of panicking. 
-// Consumption is non-destructive: a received error is re-sent (non-blocking — we -// just freed a slot, so the buffer has room) so the public Node.ServeErr() -// accessor still observes it after WaitReady returns. It is passed bidirectional -// (not <-chan) precisely so it can be re-sent. serveErr may be nil (no EVM -// listeners registered), in which case the receive arm is never ready. -func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string, serveErr chan error) error { +// until ctx fires. A rare EVM port-bind collision panics the node's serve +// goroutine (the production fail-loud path); the harness does not divert it, so +// here it surfaces only as a poll that never succeeds before the deadline. +func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string) error { const body = `{"jsonrpc":"2.0","id":1,"method":"eth_blockNumber","params":[]}` tick := time.NewTicker(probeInterval) defer tick.Stop() @@ -80,14 +73,6 @@ func waitEVMServing(ctx context.Context, hc *http.Client, evmRPC string, serveEr select { case <-ctx.Done(): return fmt.Errorf("%s eth_blockNumber not serving before deadline: %w", evmRPC, ctx.Err()) - case err := <-serveErr: - // Re-send so a later Node.ServeErr() read still sees it; non-blocking, - // and the slot we just drained guarantees room. 
- select { - case serveErr <- err: - default: - } - return fmt.Errorf("%s EVM serve failed: %w", evmRPC, err) case <-tick.C: } } diff --git a/inprocess/readiness_test.go b/inprocess/readiness_test.go deleted file mode 100644 index 730b87f482..0000000000 --- a/inprocess/readiness_test.go +++ /dev/null @@ -1,55 +0,0 @@ -//go:build inprocess - -package inprocess - -import ( - "context" - "errors" - "net/http" - "strings" - "testing" - "time" -) - -// TestWaitEVMServingSurfacesServeErr proves a reported EVM listener-start failure -// short-circuits waitEVMServing with the actual error, rather than polling the -// unreachable endpoint until the ctx deadline and returning a generic timeout. -// The EVM URL points at a closed loopback port so the poll never succeeds; the -// pre-seeded serveErr channel stands in for app.reportEVMServeErr's divert. -func TestWaitEVMServingSurfacesServeErr(t *testing.T) { - serveErr := make(chan error, 2) // matches the node's HTTP+WS buffer - bindErr := errors.New("listen tcp 0.0.0.0:8545: bind: address already in use") - serveErr <- bindErr - - // Generous ctx: if the short-circuit failed we'd block on the poll loop until - // this fires, so the test would catch a regression as a timeout-shaped error. 
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - hc := &http.Client{Timeout: time.Second} - start := time.Now() - err := waitEVMServing(ctx, hc, "http://127.0.0.1:1", serveErr) - if err == nil { - t.Fatal("waitEVMServing returned nil; want the reported serve error") - } - if !errors.Is(err, bindErr) { - t.Fatalf("error does not wrap the reported serve error: %v", err) - } - if errors.Is(err, context.DeadlineExceeded) || strings.Contains(err.Error(), "not serving before deadline") { - t.Fatalf("got a generic timeout, want the real serve error: %v", err) - } - if elapsed := time.Since(start); elapsed > 5*time.Second { - t.Fatalf("short-circuit took %v; expected near-immediate", elapsed) - } - - // Non-destructive contract: the error is re-sent, so a later ServeErr()-style - // read still observes it. - select { - case got := <-serveErr: - if !errors.Is(got, bindErr) { - t.Fatalf("re-sent error = %v, want %v", got, bindErr) - } - default: - t.Fatal("serveErr drained: readiness consumption was destructive") - } -} From 6320c19a967f0f7c0094627f6bb6cdd87802c90b Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 24 Jun 2026 18:11:57 -0700 Subject: [PATCH 10/10] docs(inprocess): tighten comments to human-audience register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidate the package doc (lead with the validator-count rule, centralize the N=1 mechanism, distill the invariant prose) and strip work-item provenance from code comments — "productionizes the spike", "C2 end-to-end proof", "proven live by", "the point of this demo". Collapse the N-count re-derivation in the runner test to a cross-reference; the canonical statement lives in the package doc. No constraint dropped. 
Co-Authored-By: Claude Opus 4.8 --- inprocess/doc.go | 145 +++++++++--------- inprocess/genesis.go | 2 +- inprocess/harness_test.go | 10 +- .../runner/runner_inprocess_test.go | 22 +-- 4 files changed, 86 insertions(+), 93 deletions(-) diff --git a/inprocess/doc.go b/inprocess/doc.go index b908d6913e..bbb96030ab 100644 --- a/inprocess/doc.go +++ b/inprocess/doc.go @@ -1,13 +1,15 @@ //go:build inprocess -// Package inprocess stands up N sei-chain validators in a single Go process, -// reaching real CometBFT consensus and each serving its own RPC stack -// (Tendermint RPC + EVM JSON-RPC HTTP/WS), with deterministic teardown. +// Package inprocess stands up N sei-chain validators in a single Go process — +// real CometBFT consensus, each node serving its own Tendermint RPC + EVM +// JSON-RPC (HTTP/WS), with deterministic teardown. It is the in-process +// provisioning foundation for the SDK "local" provider (design: +// bdchatham-designs/designs/test-harness/sdk-local-provider-lld.md). // -// It is the in-process provisioning foundation for the SDK "local" provider -// (design: bdchatham-designs/designs/test-harness/sdk-local-provider-lld.md). -// The package is gated behind the `inprocess` build tag so its heavy -// sei-tendermint/sei-cosmos bring-up never leaks into a normal `seid` build. +// Use Validators = 1 or Validators >= 3; Start rejects 2 (see "Choosing the +// validator count"). The package is gated behind the `inprocess` build tag so +// its heavy sei-tendermint/sei-cosmos bring-up never leaks into a normal `seid` +// build. // // # Usage // @@ -17,78 +19,75 @@ // if err := net.WaitReady(ctx); err != nil { ... } // rpc := net.Node(0).TendermintRPC() // http://127.0.0.1:PORT // -// # Why a native API, not the SDK sei.Provider interface +// # Choosing the validator count +// +// Pick 1 or >= 3 — never 2. 
The constraint is CometBFT's block-sync→consensus +// handoff, not a voting-power quorum: // -// The LLD's target is for Start to back the SDK's sei.Provider so suites written -// against sei.Open(ctx, "local") run unchanged. That wiring is deferred: the SDK -// lives in the github.com/sei-protocol/sei-k8s-controller module, which declares -// `go >= 1.26.0`; sei-chain runs go 1.25.6, so importing the SDK forces a -// chain-wide toolchain bump (and pulls the controller's controller-runtime/AWS -// dep graph into the seid build). The handle methods here intentionally mirror -// sei.NodeHandle / sei.NetworkHandle so a thin adapter can satisfy the SDK -// interface once the toolchain skew is resolved — see Node and Network below. +// - N=1: the sole validator skips block-sync and proposes blocks solo +// (sei-tendermint onlyValidatorIsUs in node/setup.go gates +// `blockSync := !onlyValidatorIsUs` in node/node.go). That decision reads +// the genesis-derived valset before InitChain, so the harness pins the +// single validator into genesis for N=1 — an empty valset would leave size +// 0, defeat onlyValidatorIsUs, and hang the solo node in block-sync (see +// startNode). +// - N=2 deadlocks: each node has exactly one peer, but BlockPool.IsCaughtUp +// (internal/blocksync/pool.go) requires len(peers) > 1 to ever report +// caught-up, so neither node leaves block-sync. It is a peer-count +// deadlock, not a stake threshold — Start rejects 2 loudly rather than hang. +// - N>=3: every node has >= 2 peers, so IsCaughtUp can fire and hand off to +// consensus. N=3 is the smallest real multi-node topology. // -// # Invariants (the gotchas that make N>1 consensus + per-node RPC work) +// # Bring-up invariants // -// These are the load-bearing deltas vs sei-cosmos/testutil/network.New, proven -// by the N-RPC spike and preserved here. 
Each is named and referenced by name at -// its point-of-use in the code (there is no central numbered list to drift): +// These are the load-bearing deltas vs sei-cosmos/testutil/network.New. Each is +// named and referenced by name at its point of use in the code — there is no +// central numbered list to drift: // -// - empty-valset invariant: genDoc.Validators = nil — let CometBFT derive the +// - empty-valset: set genDoc.Validators = nil and let CometBFT derive the // valset from the app's InitChain response. testutil/network sets it to -// []{self}, which fails consensus replay for N>1. (N=1 is the documented -// exception under the validator-count rule below.) -// - gentx-derived peer mesh: the P2P mesh is NOT wired explicitly by the -// harness. Each validator's gentx memo carries nodeID@127.0.0.1:p2pPort, and -// collectGentxs → genutil.GenAppStateFromConfig (sei-cosmos x/genutil) mutates -// P2P.PersistentPeers IN PLACE on the same *config.Config the harness holds in -// node.tmCfg and later hands to tmnode.New. So the mesh is derived from the -// gentxs, not set by harness code — without it nodes never gossip and -// consensus never forms for N>1. This in-place mutation is invisible at the -// harness layer and fragile: a refactor that clones tmCfg before collectGentxs, -// or builds nodes before collecting, silently breaks consensus for all N. Start -// guards it — after collectGentxs it asserts PersistentPeers is non-empty for -// N>=2 and fails loudly otherwise. +// []{self}, which fails consensus replay for N>1. (N=1 is the exception — +// it pins the validator into genesis; see "Choosing the validator count".) +// - gentx-derived peer mesh: the harness never wires the P2P mesh. 
Each +// validator's gentx memo carries nodeID@127.0.0.1:p2pPort, and +// collectGentxs → genutil.GenAppStateFromConfig (sei-cosmos x/genutil) +// mutates P2P.PersistentPeers in place on the same *config.Config the +// harness holds in node.tmCfg and later hands to tmnode.New. Without it +// nodes never gossip and consensus never forms for N>1. The in-place +// mutation is invisible at the harness layer and fragile — cloning tmCfg +// before collectGentxs, or building nodes before collecting, silently +// breaks consensus for all N — so Start asserts PersistentPeers is +// non-empty (N>=2) right after collectGentxs and fails loudly otherwise. // - EVM-enable injection: injected AppOptions enable EVM HTTP/WS on per-node -// ports — without them app.TestAppOpts hard-disables the listeners and no node -// serves EVM. -// - metrics-off constraint: tmCfg.Instrumentation.Prometheus = false — metrics -// off avoids the dup-registry panic from the process-wide registries. Metrics -// must stay off until the evmrpc/EVM-keeper metrics are de-globalized — -// re-enabling Prometheus without that reintroduces the panic. -// - loopback bind scope: TM RPC / P2P listeners scoped to 127.0.0.1 (they -// default to [::] / 0.0.0.0) — without scoping an in-process harness publishes -// externally reachable consensus/RPC listeners. 0.0.0.0 EVM caveat (accepted): -// the EVM HTTP/WS listeners bind all interfaces (0.0.0.0) for the harness -// lifetime; only TM RPC/P2P are loopback-scoped. They run on free ephemeral -// ports, dialed via 127.0.0.1. Tightening requires a bind-host option in -// evmrpc (not yet present). A rare EVM port-bind collision (the free port is -// taken between FreeTCPAddr's probe-close and the listener's bind) panics the -// node's serve goroutine — the production fail-loud path, intentionally not -// diverted here. If it ever flakes, the fix is hardening the FreeTCPAddr -// bind-close-rebind TOCTOU window, NOT re-adding a serve-error diversion. 
-// - loopback conn-tracker ceiling: MaxIncomingConnectionAttempts raised — -// loopback collapses all peers onto 127.0.0.1, so the router's IP-keyed -// conn-tracker counts the startup burst on one key — without the raise the -// burst trips the per-IP cap and peers are rejected. +// ports. Without them app.TestAppOpts hard-disables the listeners and no +// node serves EVM. +// - metrics-off: set tmCfg.Instrumentation.Prometheus = false to avoid the +// dup-registry panic from the process-wide registries. Metrics must stay +// off until the evmrpc/EVM-keeper metrics are de-globalized — re-enabling +// Prometheus before then reintroduces the panic. +// - loopback bind scope: scope TM RPC and P2P to 127.0.0.1 (they default to +// [::]/0.0.0.0), or the harness publishes externally reachable +// consensus/RPC listeners. The EVM HTTP/WS listeners are the accepted +// exception: they bind all interfaces (0.0.0.0) because evmrpc has no +// bind-host option yet, but run on free ephemeral ports dialed via +// 127.0.0.1. A rare port-bind collision — the free port is taken between +// FreeTCPAddr's probe-close and the listener's bind — panics the node's +// serve goroutine (the production fail-loud path, intentionally not +// diverted). If that ever flakes, harden the FreeTCPAddr TOCTOU window +// rather than re-add a serve-error diversion. +// - loopback conn-tracker ceiling: raise MaxIncomingConnectionAttempts. +// Loopback collapses every peer onto 127.0.0.1, so the router's IP-keyed +// conn-tracker counts the whole startup burst against one key; without the +// raise the burst trips the per-IP cap and peers are rejected. // -// # Validator-count rule: 1 or >= 3 (2 is the trap) -// -// When wiring a suite, pick Validators = 1 or Validators >= 3. Start rejects 2. -// The constraint is CometBFT's block-sync→consensus handoff, NOT a voting-power -// quorum: +// # Why a native API, not the SDK sei.Provider interface // -// - N=1 works. 
A sole validator skips block-sync and proposes blocks solo -// (sei-tendermint onlyValidatorIsUs, node/setup.go, gating -// `blockSync := !onlyValidatorIsUs` in node/node.go). That decision reads the -// genesis-derived valset BEFORE InitChain, so the harness pins the single -// validator into genesis for N=1 (the empty-valset invariant would leave size -// 0, defeat onlyValidatorIsUs, and the solo node would hang in block-sync — see -// startNode). -// - N=2 hangs. Each node has exactly one peer, and BlockPool.IsCaughtUp -// (internal/blocksync/pool.go) hard-requires len(peers) > 1 to ever report -// caught-up, so neither node leaves block-sync. This is a peer-count deadlock, -// not a stake threshold. Start rejects N=2 loudly rather than let it hang. -// - N>=3 works. Every node has >= 2 peers, so IsCaughtUp can fire and hand off -// to consensus. N=3 is the smallest real multi-node topology. +// The LLD's eventual target is for Start to back the SDK's sei.Provider so +// suites written against sei.Open(ctx, "local") run unchanged. That wiring is +// deferred: the SDK lives in the github.com/sei-protocol/sei-k8s-controller +// module, which declares `go >= 1.26.0`, while sei-chain runs go 1.25.6 — so +// importing it would force a chain-wide toolchain bump and pull the controller's +// controller-runtime/AWS dep graph into the seid build. The handle methods here +// intentionally mirror sei.NodeHandle / sei.NetworkHandle so a thin adapter can +// satisfy the SDK interface once the skew is resolved — see Node and Network. 
package inprocess diff --git a/inprocess/genesis.go b/inprocess/genesis.go index 6c91644b2b..bdd77ad7a2 100644 --- a/inprocess/genesis.go +++ b/inprocess/genesis.go @@ -27,7 +27,7 @@ import ( // genesisBuilder accumulates per-validator accounts, balances, and gentxs across // the key-generation pass, then assembles a shared genesis whose validator set // is left EMPTY so every node derives the consensus valset from its InitChain -// response (the empty-valset invariant) — the single most important delta from +// response (the empty-valset invariant), the load-bearing delta from // testutil/network. // // This is a self-contained reimplementation of the unexported initGenFiles / diff --git a/inprocess/harness_test.go b/inprocess/harness_test.go index 723be30770..482452dd3e 100644 --- a/inprocess/harness_test.go +++ b/inprocess/harness_test.go @@ -14,10 +14,10 @@ import ( banktypes "github.com/sei-protocol/sei-chain/sei-cosmos/x/bank/types" ) -// TestInProcessNetwork productionizes the N-RPC spike: it stands up N=4 -// validators in one process, asserts every node serves Tendermint RPC + EVM -// JSON-RPC, and round-trips a tx (broadcast on node 0, observed on node 1's -// independent RPC) — proving real consensus + N independent RPC stacks. +// TestInProcessNetwork stands up N=4 validators in one process and asserts +// every node serves Tendermint RPC + EVM JSON-RPC, then round-trips a tx +// (broadcast on node 0, observed on node 1's independent RPC) — exercising +// real consensus across N independent RPC stacks. // // Run: // @@ -109,7 +109,7 @@ func assertCrossNodeTxRoundTrip(t *testing.T, ctx context.Context, net *Network) // TestStartRejectsZeroValidators guards the input validation: 0 (too few) and 2 // (the block-sync deadlock) are rejected without bring-up. N=1 and N>=3 are the -// valid topologies (proven live by TestInProcessNetwork at N=4). +// valid topologies. 
func TestStartRejectsZeroValidators(t *testing.T) { for _, n := range []int{0, 2} { if _, err := Start(context.Background(), Options{Validators: n}); err == nil { diff --git a/integration_test/runner/runner_inprocess_test.go b/integration_test/runner/runner_inprocess_test.go index 9fa1fcccf8..49eb166389 100644 --- a/integration_test/runner/runner_inprocess_test.go +++ b/integration_test/runner/runner_inprocess_test.go @@ -37,25 +37,19 @@ func adminFunding() sdk.Coins { return sdk.NewCoins(sdk.NewCoin("usei", amt)) } -// TestInProcessBankModule is the C2 end-to-end proof: it stands up an in-process -// network with a genesis-funded `admin` on node 0 (the suite's signing key) and -// runs bank_module/send_funds_test.yaml through the runner's in-process arm — a -// real bank tx + historical balance queries, in-memory, no docker. +// TestInProcessBankModule runs bank_module/send_funds_test.yaml end-to-end +// through the runner's in-process arm: a genesis-funded `admin` on node 0 +// (the suite's signing key) drives a real bank tx + historical balance +// queries, in-memory, no docker. func TestInProcessBankModule(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) defer cancel() net, err := inprocess.Start(ctx, inprocess.Options{ - // Three validators. send_funds asserts only node-0 state, but the N choice - // is constrained by CometBFT's block-sync handoff, NOT a voting-power quorum: - // N=1 works — a sole validator skips block-sync and proposes solo - // (sei-tendermint onlyValidatorIsUs). - // N=2 HANGS — each node has exactly 1 peer; BlockPool.IsCaughtUp requires - // >1 peer, so neither leaves block-sync (Start rejects N=2). - // N>=3 works — every node has >=2 peers, so IsCaughtUp can fire and hand - // off to consensus. - // N=3 is the smallest MULTI-NODE topology, the point of this end-to-end demo: - // admin lives on node 0 (the suite default); nodes 1-2 are real consensus peers. 
+ // Three validators — the smallest real multi-node topology. send_funds + // asserts only node-0 state, but a single-node run wouldn't exercise + // cross-peer consensus. N is constrained to 1 or >=3 (never 2) by CometBFT's + // block-sync handoff; see inprocess.Options.Validators and the package doc. Validators: 3, ChainID: chainID, TimeoutCommit: time.Second,