From defc4b22a799dc7947af6525cf2ce1c500ff9a2b Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Tue, 5 May 2026 15:12:40 -0700 Subject: [PATCH 01/11] Added otel metrics for app and app/ante --- app/abci.go | 57 +++++++++--- app/ante/evm_checktx.go | 15 +++- app/ante/metrics.go | 27 ++++++ app/app.go | 60 ++++++++++--- app/invariance.go | 102 ++++++---------------- app/metrics.go | 189 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 344 insertions(+), 106 deletions(-) create mode 100644 app/ante/metrics.go create mode 100644 app/metrics.go diff --git a/app/abci.go b/app/abci.go index bcc8c3cbb8..225cafcd46 100644 --- a/app/abci.go +++ b/app/abci.go @@ -5,6 +5,9 @@ import ( "crypto/sha256" "time" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" + "github.com/sei-protocol/sei-chain/app/legacyabci" "github.com/sei-protocol/sei-chain/sei-cosmos/tasks" "github.com/sei-protocol/sei-chain/sei-cosmos/telemetry" @@ -13,7 +16,7 @@ import ( "github.com/sei-protocol/sei-chain/sei-cosmos/types/legacytm" abci "github.com/sei-protocol/sei-chain/sei-tendermint/abci/types" "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" - "github.com/sei-protocol/sei-chain/utils/metrics" + utilmetrics "github.com/sei-protocol/sei-chain/utils/metrics" ) func (app *App) BeginBlock( @@ -27,14 +30,18 @@ func (app *App) BeginBlock( defer beginBlockSpan.End() ctx = ctx.WithTraceSpanContext(spanCtx) ctx = ctx.WithEventManager(sdk.NewEventManager()) - defer telemetry.MeasureSince(time.Now(), "abci", "begin_block") + beginBlockStart := time.Now() + defer func() { + telemetry.MeasureSince(beginBlockStart, "abci", "begin_block") // TODO(PLT-327): remove once app_abci_begin_block_duration_seconds verified + appMetrics.beginBlockDuration.Record(ctx.Context(), time.Since(beginBlockStart).Seconds()) + }() // inline begin block if checkHeight { if err := app.ValidateHeight(height); err != nil { panic(err) } } - metrics.GaugeSeidVersionAndCommit(app.versionInfo.Version, app.versionInfo.GitCommit) + utilmetrics.GaugeSeidVersionAndCommit(app.versionInfo.Version, app.versionInfo.GitCommit) // TODO(PLT-327): remove once app_build_info observable gauge verified // check if we've reached a target height, if so, execute any applicable handlers if app.forkInitializer != nil { app.forkInitializer(ctx) @@ -59,9 +66,17 @@ func (app *App) EndBlock(ctx sdk.Context, height int64, blockGasUsed int64) (res spanCtx, span := app.GetBaseApp().TracingInfo.StartWithContext("EndBlock", ctx.TraceSpanContext()) defer span.End() ctx = ctx.WithTraceSpanContext(spanCtx) - defer telemetry.MeasureSince(time.Now(), "abci", "end_block") + endBlockStart := time.Now() + defer func() { + telemetry.MeasureSince(endBlockStart, "abci", "end_block") // TODO(PLT-327): remove once app_abci_end_block_duration_seconds verified + appMetrics.endBlockDuration.Record(ctx.Context(), time.Since(endBlockStart).Seconds()) + }() ctx = ctx.WithEventManager(sdk.NewEventManager()) - defer telemetry.MeasureSince(time.Now(), "module", "total_end_block") + moduleEndBlockStart := time.Now() + defer func() { + telemetry.MeasureSince(moduleEndBlockStart, "module", "total_end_block") // TODO(PLT-327): remove once app_abci_module_end_block_duration_seconds verified + appMetrics.moduleEndBlockDuration.Record(ctx.Context(), time.Since(moduleEndBlockStart).Seconds()) + }() res.ValidatorUpdates = legacyabci.EndBlock(ctx, height, blockGasUsed, app.EndBlockKeepers) res.Events = sdk.MarkEventsToIndex(ctx.EventManager().ABCIEvents(), app.IndexEvents) if cp := app.GetConsensusParams(ctx); cp != nil { @@ -83,7 +98,11 @@ func (app *App) CheckTx(ctx context.Context, req *abci.RequestCheckTxV2) *abci.R } _, span := app.GetBaseApp().TracingInfo.StartWithContext("CheckTx", ctx) defer span.End() - defer telemetry.MeasureSince(time.Now(), "abci", "check_tx") + checkTxStart := time.Now() + defer func() { + telemetry.MeasureSince(checkTxStart, "abci", "check_tx") // TODO(PLT-327): remove once app_abci_check_tx_duration_seconds verified + appMetrics.checkTxDuration.Record(ctx, time.Since(checkTxStart).Seconds()) + }() sdkCtx := app.GetCheckTxContext(req.Tx, req.Type == abci.CheckTxTypeV2Recheck) tx, err := app.txDecoder(req.Tx) if err != nil { @@ -121,22 +140,30 @@ func (app *App) CheckTx(ctx context.Context, req *abci.RequestCheckTxV2) *abci.R } func (app *App) DeliverTx(ctx sdk.Context, req abci.RequestDeliverTxV2, tx sdk.Tx, checksum [32]byte) abci.ResponseDeliverTx { - defer metrics.MeasureDeliverTxDuration(time.Now()) + deliverTxStart := time.Now() // ensure we carry the initial context from tracer here spanCtx, span := app.GetBaseApp().TracingInfo.StartWithContext("DeliverTx", ctx.TraceSpanContext()) defer span.End() // update context with trace span new context ctx = ctx.WithTraceSpanContext(spanCtx) - defer telemetry.MeasureSince(time.Now(), "abci", "deliver_tx") + defer func() { + utilmetrics.MeasureDeliverTxDuration(deliverTxStart) // TODO(PLT-327): remove once app_abci_deliver_tx_duration_seconds verified + telemetry.MeasureSince(deliverTxStart, "abci", "deliver_tx") // TODO(PLT-327): remove once app_abci_deliver_tx_duration_seconds verified + appMetrics.deliverTxDuration.Record(ctx.Context(), time.Since(deliverTxStart).Seconds()) + }() gInfo := sdk.GasInfo{} resultStr := "successful" defer func() { - telemetry.IncrCounter(1, "tx", "count") - telemetry.IncrCounter(1, "tx", resultStr) - telemetry.SetGauge(float32(gInfo.GasUsed), "tx", "gas", "used") - telemetry.SetGauge(float32(gInfo.GasWanted), "tx", "gas", "wanted") + telemetry.IncrCounter(1, "tx", "count") // TODO(PLT-327): remove once app_tx_count_total verified + telemetry.IncrCounter(1, "tx", resultStr) // TODO(PLT-327): remove once app_tx_count_total verified + telemetry.SetGauge(float32(gInfo.GasUsed), "tx", "gas", "used") // TODO(PLT-327): remove once app_tx_gas_used verified + telemetry.SetGauge(float32(gInfo.GasWanted), "tx", "gas", "wanted") // TODO(PLT-327): remove once app_tx_gas_wanted verified + appMetrics.txCount.Add(ctx.Context(), 1) + appMetrics.txCount.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("result", resultStr))) + appMetrics.txGasUsed.Record(ctx.Context(), int64(gInfo.GasUsed)) //nolint:gosec + appMetrics.txGasWanted.Record(ctx.Context(), int64(gInfo.GasWanted)) //nolint:gosec }() gInfo, result, anteEvents, resCtx, err := legacyabci.DeliverTx(ctx.WithTxBytes(req.Tx).WithTxSum(checksum), tx, app.GetTxConfig(), &app.DeliverTxKeepers, checksum, func(ctx sdk.Context) (sdk.Context, sdk.CacheMultiStore) { return app.CacheTxContext(ctx, checksum) @@ -177,7 +204,11 @@ func (app *App) DeliverTx(ctx sdk.Context, req abci.RequestDeliverTxV2, tx sdk.T // DeliverTxBatch is not part of the ABCI specification, but this is here for code convention func (app *App) DeliverTxBatch(ctx sdk.Context, req sdk.DeliverTxBatchRequest) (res sdk.DeliverTxBatchResponse) { - defer metrics.MeasureDeliverBatchTxDuration(time.Now()) + deliverBatchStart := time.Now() + defer func() { + utilmetrics.MeasureDeliverBatchTxDuration(deliverBatchStart) // TODO(PLT-327): remove once app_abci_deliver_batch_tx_duration_seconds verified + appMetrics.deliverBatchTxDuration.Record(ctx.Context(), time.Since(deliverBatchStart).Seconds()) + }() spanCtx, span := app.GetBaseApp().TracingInfo.StartWithContext("DeliverTxBatch", ctx.TraceSpanContext()) defer span.End() // update context with trace span new context diff --git a/app/ante/evm_checktx.go b/app/ante/evm_checktx.go index 02a0e53dc8..ebece757ed 100644 --- a/app/ante/evm_checktx.go +++ b/app/ante/evm_checktx.go @@ -21,6 +21,9 @@ import ( upgradekeeper "github.com/sei-protocol/sei-chain/sei-cosmos/x/upgrade/keeper" abci "github.com/sei-protocol/sei-chain/sei-tendermint/abci/types" tmtypes "github.com/sei-protocol/sei-chain/sei-tendermint/types" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" + "github.com/sei-protocol/sei-chain/utils" "github.com/sei-protocol/sei-chain/utils/helpers" "github.com/sei-protocol/sei-chain/utils/metrics" @@ -297,14 +300,16 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep ctx = ctx.WithCheckTxCallback(func(priority int64) { txHash := tmtypes.Tx(ctx.TxBytes()).Hash() ek.AddPendingNonce(txHash, evmAddr, etx.Nonce(), priority) - metrics.IncrementPendingNonce("added") + metrics.IncrementPendingNonce("added") // TODO(PLT-327): remove once app_pending_nonce_total verified + getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "added"))) }) // if the mempool expires a transaction, this handler is invoked ctx = ctx.WithExpireTxHandler(func() { txHash := tmtypes.Tx(ctx.TxBytes()).Hash() ek.RemovePendingNonce(txHash) - metrics.IncrementPendingNonce("expired") + metrics.IncrementPendingNonce("expired") // TODO(PLT-327): remove once app_pending_nonce_total verified + getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "expired"))) }) if txNonce > nextNonce { @@ -323,7 +328,8 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep if txNonce < nextNonceToBeMined { // this nonce has already been mined, we cannot accept it again - metrics.IncrementPendingNonce("rejected") + metrics.IncrementPendingNonce("rejected") // TODO(PLT-327): remove once app_pending_nonce_total verified + getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "rejected"))) return abci.Rejected } else if txNonce < nextPendingNonce { // check if the sender still has enough funds to pay for gas @@ -335,7 +341,8 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep // this nonce is allowed to process as it is part of the // consecutive nonces from nextNonceToBeMined to nextPendingNonce // This logic allows multiple nonces from an account to be processed in a block. - metrics.IncrementPendingNonce("accepted") + metrics.IncrementPendingNonce("accepted") // TODO(PLT-327): remove once app_pending_nonce_total verified + getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "accepted"))) return abci.Accepted } return abci.Pending diff --git a/app/ante/metrics.go b/app/ante/metrics.go new file mode 100644 index 0000000000..a7c727aa71 --- /dev/null +++ b/app/ante/metrics.go @@ -0,0 +1,27 @@ +package ante + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +type anteMetrics struct { + pendingNonce metric.Int64Counter +} + +// getAnteMetrics returns the package-level OTel instruments, initializing them +// lazily on first call (after the global MeterProvider is set in NewApp). +var getAnteMetrics = sync.OnceValue(func() *anteMetrics { + m := &anteMetrics{} + meter := otel.Meter("app_ante") + var err error + if m.pendingNonce, err = meter.Int64Counter( + "app_pending_nonce_total", + metric.WithDescription("Pending nonce events by type (added, expired, rejected, accepted)"), + ); err != nil { + panic("ante anteMetrics: " + err.Error()) + } + return m +}) diff --git a/app/app.go b/app/app.go index 6bcef5afab..5b1e30db26 100644 --- a/app/app.go +++ b/app/app.go @@ -138,6 +138,7 @@ import ( tmproto "github.com/sei-protocol/sei-chain/sei-tendermint/proto/tendermint/types" tmtypes "github.com/sei-protocol/sei-chain/sei-tendermint/types" wasmkeeper "github.com/sei-protocol/sei-chain/sei-wasmd/x/wasm/keeper" + "github.com/sei-protocol/sei-chain/utils" "github.com/sei-protocol/sei-chain/utils/metrics" "github.com/sei-protocol/sei-chain/wasmbinding" @@ -165,6 +166,7 @@ import ( "github.com/spf13/cast" dbm "github.com/tendermint/tm-db" "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" // this line is used by starport scaffolding # stargate/app/moduleImport @@ -491,6 +493,7 @@ func New( if err := metrics.SetupOtelMetricsProvider(); err != nil { logger.Error(err.Error()) } + initAppMetrics() keys := sdk.NewKVStoreKeys( authtypes.StoreKey, authzkeeper.StoreKey, banktypes.StoreKey, stakingtypes.StoreKey, @@ -1217,7 +1220,9 @@ func (app *App) ProcessProposalHandler(ctx sdk.Context, req *abci.RequestProcess // Invariant: at this point nil entries in typedTxs are proto decode failures only. // EVM preprocessing runs later inside ProcessBlock; do not reorder that before this check. if !app.checkTotalBlockGas(checkCtx, typedTxs) { - metrics.IncrFailedTotalGasWantedCheck(string(req.Header.ProposerAddress)) + metrics.IncrFailedTotalGasWantedCheck(string(req.Header.ProposerAddress)) // TODO(PLT-327): remove once app_failed_total_gas_wanted_check_total verified + appMetrics.failedGasWantedCheck.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("proposer", string(req.Header.ProposerAddress)))) return &abci.ResponseProcessProposal{ Status: abci.ResponseProcessProposal_REJECT, }, nil @@ -1318,7 +1323,9 @@ func (app *App) FinalizeBlocker(ctx sdk.Context, req *abci.RequestFinalizeBlock) app.optimisticProcessingInfoMutex.RUnlock() if !aborted && bytes.Equal(finalHash, req.Hash) { - metrics.IncrementOptimisticProcessingCounter(true) + metrics.IncrementOptimisticProcessingCounter(true) // TODO(PLT-327): remove once app_optimistic_processing_total verified + appMetrics.optimisticProcessing.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("enabled", "true"))) app.SetProcessProposalStateToCommit() if app.EvmKeeper.EthReplayConfig.Enabled || app.EvmKeeper.EthBlockTestConfig.Enabled { return &abci.ResponseFinalizeBlock{}, nil @@ -1330,7 +1337,9 @@ func (app *App) FinalizeBlocker(ctx sdk.Context, req *abci.RequestFinalizeBlock) return &resp, nil } } - metrics.IncrementOptimisticProcessingCounter(false) + metrics.IncrementOptimisticProcessingCounter(false) // TODO(PLT-327): remove once app_optimistic_processing_total verified + appMetrics.optimisticProcessing.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("enabled", "false"))) logger.Info("optimistic processing ineligible") bpreq := &BlockProcessRequest{ Hash: req.Hash, @@ -1384,8 +1393,12 @@ func (app *App) DeliverTxWithResult(ctx sdk.Context, tx []byte, typedTx sdk.Tx) if !skipMetrics { // Record metrics for non-gasless transactions - metrics.IncrGasCounter("gas_used", deliverTxResp.GasUsed) - metrics.IncrGasCounter("gas_wanted", deliverTxResp.GasWanted) + metrics.IncrGasCounter("gas_used", deliverTxResp.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified + metrics.IncrGasCounter("gas_wanted", deliverTxResp.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified + appMetrics.txGas.Add(ctx.Context(), deliverTxResp.GasUsed, + otelmetric.WithAttributes(attribute.String("type", "gas_used"))) + appMetrics.txGas.Add(ctx.Context(), deliverTxResp.GasWanted, + otelmetric.WithAttributes(attribute.String("type", "gas_wanted"))) } return &abci.ExecTxResult{ @@ -1402,20 +1415,32 @@ func (app *App) DeliverTxWithResult(ctx sdk.Context, tx []byte, typedTx sdk.Tx) } func (app *App) ProcessTxsSynchronousV2(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) []*abci.ExecTxResult { - defer metrics.BlockProcessLatency(time.Now(), metrics.SYNCHRONOUS) + blockProcessStart := time.Now() + defer func() { + metrics.BlockProcessLatency(blockProcessStart, metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified + appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessStart).Seconds(), + otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + }() txResults := make([]*abci.ExecTxResult, 0, len(txs)) for i, tx := range txs { ctx = ctx.WithTxIndex(i) res := app.DeliverTxWithResult(ctx, tx, typedTxs[i]) txResults = append(txResults, res) - metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) + metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified + appMetrics.txProcessType.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) } return txResults } func (app *App) ProcessTxsSynchronousGiga(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) []*abci.ExecTxResult { - defer metrics.BlockProcessLatency(time.Now(), metrics.SYNCHRONOUS) + blockProcessGigaStart := time.Now() + defer func() { + metrics.BlockProcessLatency(blockProcessGigaStart, metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified + appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessGigaStart).Seconds(), + otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + }() ms := ctx.MultiStore().CacheMultiStore() defer ms.Write() @@ -1500,7 +1525,9 @@ func (app *App) ProcessTxsSynchronousGiga(ctx sdk.Context, txs [][]byte, typedTx txResults[i] = result ctx.GigaMultiStore().WriteGiga() - metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) + metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified + appMetrics.txProcessType.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) } return txResults @@ -1549,7 +1576,9 @@ func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sd execResults := make([]*abci.ExecTxResult, 0, len(batchResult.Results)) for i, r := range batchResult.Results { - metrics.IncrTxProcessTypeCounter(metrics.OCC_CONCURRENT) + metrics.IncrTxProcessTypeCounter(metrics.OCC_CONCURRENT) // TODO(PLT-327): remove once app_tx_process_type_total verified + appMetrics.txProcessType.Add(ctx.Context(), 1, + otelmetric.WithAttributes(attribute.String("type", metrics.OCC_CONCURRENT))) // Check if transaction is gasless before recording gas metrics var recordGasMetrics = true @@ -1575,8 +1604,12 @@ func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sd } if recordGasMetrics { - metrics.IncrGasCounter("gas_used", r.Response.GasUsed) - metrics.IncrGasCounter("gas_wanted", r.Response.GasWanted) + metrics.IncrGasCounter("gas_used", r.Response.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified + metrics.IncrGasCounter("gas_wanted", r.Response.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified + appMetrics.txGas.Add(ctx.Context(), r.Response.GasUsed, + otelmetric.WithAttributes(attribute.String("type", "gas_used"))) + appMetrics.txGas.Add(ctx.Context(), r.Response.GasWanted, + otelmetric.WithAttributes(attribute.String("type", "gas_wanted"))) } execResults = append(execResults, &abci.ExecTxResult{ @@ -1655,7 +1688,8 @@ func (app *App) ProcessTXsWithOCCGiga(ctx sdk.Context, txs [][]byte, typedTxs [] } if fallbackToV2 { - metrics.IncrGigaFallbackToV2Counter() + metrics.IncrGigaFallbackToV2Counter() // TODO(PLT-327): remove once app_giga_fallback_to_v2_total verified + appMetrics.gigaFallback.Add(ctx.Context(), 1) // Discard all EVM changes by skipping cache writes, then re-run all txs via DeliverTx. evmBatchResult = nil v2Entries = make([]*sdk.DeliverTxEntry, len(txs)) diff --git a/app/invariance.go b/app/invariance.go index c8def4baf8..263f42f98e 100644 --- a/app/invariance.go +++ b/app/invariance.go @@ -1,6 +1,7 @@ package app import ( + "context" "fmt" "time" @@ -8,6 +9,8 @@ import ( servertypes "github.com/sei-protocol/sei-chain/sei-cosmos/server/types" "github.com/sei-protocol/sei-chain/sei-cosmos/storev2/commitment" "github.com/spf13/cast" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" @@ -52,10 +55,11 @@ func (app *App) LightInvarianceChecks(cms sdk.CommitMultiStore, config LightInva } func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { - defer metrics.MeasureSince( - []string{"sei", "lightinvariance_supply", "milliseconds"}, - time.Now().UTC(), - ) + invarianceStart := time.Now() + defer func() { + metrics.MeasureSince([]string{"sei", "lightinvariance_supply", "milliseconds"}, invarianceStart.UTC()) // TODO(PLT-327): remove once app_lightinvariance_supply_duration_seconds verified + appMetrics.invarianceDuration.Record(context.Background(), time.Since(invarianceStart).Seconds()) + }() ckv, ok := cms.GetStore(app.BankKeeper.GetStoreKey()).(*commitment.Store) if !ok { logger.Error("bank store is not a memiavl store; cannot run light invariance check") @@ -67,24 +71,16 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { for _, p := range balanceChangePairs { if len(p.Key) < 2 { // invalid key; ignore - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{ - { - Name: "type", - Value: "sei", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "sei"))) logger.Error("invalid changed pair key for usei", "key", fmt.Sprintf("%X", p.Key)) continue } addrLen := int(p.Key[1]) if len(p.Key) < addrLen+2 { // invalid key length; ignore - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{ - { - Name: "type", - Value: "sei", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "sei"))) logger.Error("invalid changed pair key for usei", "key", fmt.Sprintf("%X", p.Key)) continue } @@ -96,15 +92,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { if !p.Delete { var balance sdk.Coin if err := balance.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "usei", - }, { - Name: "step", - Value: "post_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal balance", "err", err) continue } @@ -124,15 +113,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } var balance sdk.Coin if err := balance.Unmarshal(val); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "usei", - }, { - Name: "step", - Value: "pre_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal preblock balance", "err", err) continue } @@ -144,26 +126,15 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { for _, p := range weiChangePairs { var amt sdk.Int if len(p.Key) < 1 { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{ - { - Name: "type", - Value: "wei", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "wei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"))) logger.Error("invalid changed pair key", "key", fmt.Sprintf("%X", p.Key)) continue } if !p.Delete { if err := amt.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "wei", - }, { - Name: "step", - Value: "post_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal wei balance", "err", err) continue } @@ -183,15 +154,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } var amt sdk.Int if err := amt.Unmarshal(val); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "wei", - }, { - Name: "step", - Value: "pre_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal preblock wei balance", "err", err) continue } @@ -203,15 +167,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { if bz := ckv.Get(append(banktypes.SupplyKey, []byte(sdk.MustGetBaseDenom())...)); bz != nil { var amt sdk.Int if err := amt.Unmarshal(bz); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "total_supply", - }, { - Name: "step", - Value: "pre_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal pre total supply", "err", err) return } @@ -224,15 +181,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } else { var amt sdk.Int if err := amt.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{ - { - Name: "type", - Value: "total_supply", - }, { - Name: "step", - Value: "post_block", - }, - }) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal total supply", "err", err) } else { supplyChanged = amt.Sub(preTotalSupply) diff --git a/app/metrics.go b/app/metrics.go new file mode 100644 index 0000000000..577115af44 --- /dev/null +++ b/app/metrics.go @@ -0,0 +1,189 @@ +package app + +import ( + "context" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "github.com/sei-protocol/sei-chain/sei-cosmos/version" +) + +const appMeterName = "app" + +// histogramBuckets aligns with block-time SLO thresholds +// (p50 ≤ 500ms, p95 ≤ 1.5s, p99 ≤ 2.5s) expressed in seconds. +var histogramBuckets = metric.WithExplicitBucketBoundaries( + 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, +) + +type metrics struct { + // ABCI phase durations + beginBlockDuration metric.Float64Histogram + endBlockDuration metric.Float64Histogram + moduleEndBlockDuration metric.Float64Histogram + checkTxDuration metric.Float64Histogram + deliverTxDuration metric.Float64Histogram + deliverBatchTxDuration metric.Float64Histogram + + // Block processing duration by execution type + blockProcessDuration metric.Float64Histogram + + // Per-tx counts and gas + txCount metric.Int64Counter + txProcessType metric.Int64Counter + txGas metric.Int64Counter + txGasUsed metric.Int64Gauge + txGasWanted metric.Int64Gauge + + // App-level flow counters + optimisticProcessing metric.Int64Counter + failedGasWantedCheck metric.Int64Counter + gigaFallback metric.Int64Counter + + // Light invariance check + invarianceDuration metric.Float64Histogram + invarianceInvalidKey metric.Int64Counter + invarianceUnmarshalFail metric.Int64Counter +} + +// appMetrics is the package-level OTel instrument set, initialized once in NewApp. +var appMetrics metrics + +func must[V any](v V, err error) V { + if err != nil { + panic(err) + } + return v +} + +// initAppMetrics registers all OTel instruments for the app package. +// Must be called after metrics.SetupOtelMetricsProvider() in NewApp. +func initAppMetrics() { + meter := otel.Meter(appMeterName) + + appMetrics.beginBlockDuration = must(meter.Float64Histogram( + "app_abci_begin_block_duration_seconds", + metric.WithDescription("Duration of ABCI BeginBlock"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.endBlockDuration = must(meter.Float64Histogram( + "app_abci_end_block_duration_seconds", + metric.WithDescription("Duration of ABCI EndBlock"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.moduleEndBlockDuration = must(meter.Float64Histogram( + "app_abci_module_end_block_duration_seconds", + metric.WithDescription("Duration of module EndBlock calls within ABCI EndBlock"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.checkTxDuration = must(meter.Float64Histogram( + "app_abci_check_tx_duration_seconds", + metric.WithDescription("Duration of ABCI CheckTx"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.deliverTxDuration = must(meter.Float64Histogram( + "app_abci_deliver_tx_duration_seconds", + metric.WithDescription("Duration of ABCI DeliverTx"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.deliverBatchTxDuration = must(meter.Float64Histogram( + "app_abci_deliver_batch_tx_duration_seconds", + metric.WithDescription("Duration of ABCI DeliverTxBatch"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.blockProcessDuration = must(meter.Float64Histogram( + "app_block_process_duration_seconds", + metric.WithDescription("Duration of block tx processing by execution type"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.txCount = must(meter.Int64Counter( + "app_tx_count_total", + metric.WithDescription("Number of transactions delivered"), + )) + + appMetrics.txProcessType = must(meter.Int64Counter( + "app_tx_process_type_total", + metric.WithDescription("Transactions processed by execution type"), + )) + + appMetrics.txGas = must(meter.Int64Counter( + "app_tx_gas_total", + metric.WithDescription("Cumulative transaction gas by type (gas_used, gas_wanted)"), + )) + + appMetrics.txGasUsed = must(meter.Int64Gauge( + "app_tx_gas_used", + metric.WithDescription("Gas used by the most recently delivered transaction"), + )) + + appMetrics.txGasWanted = must(meter.Int64Gauge( + "app_tx_gas_wanted", + metric.WithDescription("Gas wanted by the most recently delivered transaction"), + )) + + appMetrics.optimisticProcessing = must(meter.Int64Counter( + "app_optimistic_processing_total", + metric.WithDescription("Optimistic processing attempts; enabled=true means cache hit, false means miss"), + )) + + appMetrics.failedGasWantedCheck = must(meter.Int64Counter( + "app_failed_total_gas_wanted_check_total", + metric.WithDescription("Proposals rejected because total block gas wanted exceeded max"), + )) + + appMetrics.gigaFallback = must(meter.Int64Counter( + "app_giga_fallback_to_v2_total", + metric.WithDescription("Times giga executor fell back to V2 processing"), + )) + + appMetrics.invarianceDuration = must(meter.Float64Histogram( + "app_lightinvariance_supply_duration_seconds", + metric.WithDescription("Duration of light invariance total supply check"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.invarianceInvalidKey = must(meter.Int64Counter( + "app_lightinvariance_supply_invalid_key_total", + metric.WithDescription("Invalid changed-pair keys detected during invariance check"), + )) + + appMetrics.invarianceUnmarshalFail = must(meter.Int64Counter( + "app_lightinvariance_supply_unmarshal_failure_total", + metric.WithDescription("Unmarshal failures during invariance supply check"), + )) + + // Build-info observable gauge replaces GaugeSeidVersionAndCommit (called per BeginBlock). + // The callback fires on every Prometheus scrape; no per-block call site is needed. + // TODO(PLT-327): remove metrics.GaugeSeidVersionAndCommit call in abci.go once app_build_info verified + vi := version.NewInfo() + _ = must(meter.Int64ObservableGauge( + "app_build_info", + metric.WithDescription("Running binary build info; value is always 1"), + metric.WithInt64Callback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(1, + metric.WithAttributes( + attribute.String("seid_version", vi.Version), + attribute.String("commit", vi.GitCommit), + ), + ) + return nil + }), + )) +} From 55a79e18c0abcb0a437a07c4b32f988ce04b3957 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Wed, 6 May 2026 12:10:45 -0700 Subject: [PATCH 02/11] Fixed build errors, passed correct context --- app/abci.go | 4 ++-- app/app.go | 44 ++++++++++++++++++++--------------------- app/invariance.go | 50 +++++++++++++++++++++++------------------------ 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/app/abci.go b/app/abci.go index 225cafcd46..ccb9d2f2ac 100644 --- a/app/abci.go +++ b/app/abci.go @@ -6,7 +6,7 @@ import ( "time" "go.opentelemetry.io/otel/attribute" - otelmetric "go.opentelemetry.io/otel/metric" + otelmetrics "go.opentelemetry.io/otel/metric" "github.com/sei-protocol/sei-chain/app/legacyabci" "github.com/sei-protocol/sei-chain/sei-cosmos/tasks" @@ -161,7 +161,7 @@ func (app *App) DeliverTx(ctx sdk.Context, req abci.RequestDeliverTxV2, tx sdk.T telemetry.SetGauge(float32(gInfo.GasUsed), "tx", "gas", "used") // TODO(PLT-327): remove once app_tx_gas_used verified telemetry.SetGauge(float32(gInfo.GasWanted), "tx", "gas", "wanted") // TODO(PLT-327): remove once app_tx_gas_wanted verified appMetrics.txCount.Add(ctx.Context(), 1) - appMetrics.txCount.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("result", resultStr))) + appMetrics.txCount.Add(ctx.Context(), 1, otelmetrics.WithAttributes(attribute.String("result", resultStr))) appMetrics.txGasUsed.Record(ctx.Context(), int64(gInfo.GasUsed)) //nolint:gosec appMetrics.txGasWanted.Record(ctx.Context(), int64(gInfo.GasWanted)) //nolint:gosec }() diff --git a/app/app.go b/app/app.go index 5b1e30db26..d68b93a53a 100644 --- a/app/app.go +++ b/app/app.go @@ -140,7 +140,7 @@ import ( wasmkeeper "github.com/sei-protocol/sei-chain/sei-wasmd/x/wasm/keeper" "github.com/sei-protocol/sei-chain/utils" - "github.com/sei-protocol/sei-chain/utils/metrics" + utilmetrics "github.com/sei-protocol/sei-chain/utils/metrics" "github.com/sei-protocol/sei-chain/wasmbinding" epochmodule "github.com/sei-protocol/sei-chain/x/epoch" epochmodulekeeper "github.com/sei-protocol/sei-chain/x/epoch/keeper" @@ -490,7 +490,7 @@ func New( bApp.SetInterfaceRegistry(interfaceRegistry) // Bind OTEL metrics provider once at application construction - if err := metrics.SetupOtelMetricsProvider(); err != nil { + if err := utilmetrics.SetupOtelMetricsProvider(); err != nil { logger.Error(err.Error()) } initAppMetrics() @@ -1220,7 +1220,7 @@ func (app *App) ProcessProposalHandler(ctx sdk.Context, req *abci.RequestProcess // Invariant: at this point nil entries in typedTxs are proto decode failures only. // EVM preprocessing runs later inside ProcessBlock; do not reorder that before this check. if !app.checkTotalBlockGas(checkCtx, typedTxs) { - metrics.IncrFailedTotalGasWantedCheck(string(req.Header.ProposerAddress)) // TODO(PLT-327): remove once app_failed_total_gas_wanted_check_total verified + utilmetrics.IncrFailedTotalGasWantedCheck(string(req.Header.ProposerAddress)) // TODO(PLT-327): remove once app_failed_total_gas_wanted_check_total verified appMetrics.failedGasWantedCheck.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("proposer", string(req.Header.ProposerAddress)))) return &abci.ResponseProcessProposal{ @@ -1323,7 +1323,7 @@ func (app *App) FinalizeBlocker(ctx sdk.Context, req *abci.RequestFinalizeBlock) app.optimisticProcessingInfoMutex.RUnlock() if !aborted && bytes.Equal(finalHash, req.Hash) { - metrics.IncrementOptimisticProcessingCounter(true) // TODO(PLT-327): remove once app_optimistic_processing_total verified + utilmetrics.IncrementOptimisticProcessingCounter(true) // TODO(PLT-327): remove once app_optimistic_processing_total verified appMetrics.optimisticProcessing.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("enabled", "true"))) app.SetProcessProposalStateToCommit() @@ -1331,13 +1331,13 @@ func (app *App) FinalizeBlocker(ctx sdk.Context, req *abci.RequestFinalizeBlock) return &abci.ResponseFinalizeBlock{}, nil } cms := app.WriteState() - app.LightInvarianceChecks(cms, app.lightInvarianceConfig) + app.LightInvarianceChecks(ctx.Context(), cms, app.lightInvarianceConfig) appHash := app.GetWorkingHash() resp := app.getFinalizeBlockResponse(appHash, events, txRes, endBlockResp) return &resp, nil } } - metrics.IncrementOptimisticProcessingCounter(false) // TODO(PLT-327): remove once app_optimistic_processing_total verified + utilmetrics.IncrementOptimisticProcessingCounter(false) // TODO(PLT-327): remove once app_optimistic_processing_total verified appMetrics.optimisticProcessing.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("enabled", "false"))) logger.Info("optimistic processing ineligible") @@ -1358,7 +1358,7 @@ func (app *App) FinalizeBlocker(ctx sdk.Context, req *abci.RequestFinalizeBlock) return &abci.ResponseFinalizeBlock{}, nil } cms := app.WriteState() - app.LightInvarianceChecks(cms, app.lightInvarianceConfig) + app.LightInvarianceChecks(ctx.Context(), cms, app.lightInvarianceConfig) appHash := app.GetWorkingHash() resp := app.getFinalizeBlockResponse(appHash, events, txResults, endBlockResp) return &resp, nil @@ -1393,8 +1393,8 @@ func (app *App) DeliverTxWithResult(ctx sdk.Context, tx []byte, typedTx sdk.Tx) if !skipMetrics { // Record metrics for non-gasless transactions - metrics.IncrGasCounter("gas_used", deliverTxResp.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified - metrics.IncrGasCounter("gas_wanted", deliverTxResp.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified + utilmetrics.IncrGasCounter("gas_used", deliverTxResp.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified + utilmetrics.IncrGasCounter("gas_wanted", deliverTxResp.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified appMetrics.txGas.Add(ctx.Context(), deliverTxResp.GasUsed, otelmetric.WithAttributes(attribute.String("type", "gas_used"))) appMetrics.txGas.Add(ctx.Context(), deliverTxResp.GasWanted, @@ -1417,9 +1417,9 @@ func (app *App) DeliverTxWithResult(ctx sdk.Context, tx []byte, typedTx sdk.Tx) func (app *App) ProcessTxsSynchronousV2(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) []*abci.ExecTxResult { blockProcessStart := time.Now() defer func() { - metrics.BlockProcessLatency(blockProcessStart, metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified + utilmetrics.BlockProcessLatency(blockProcessStart, utilmetrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessStart).Seconds(), - otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS))) }() txResults := make([]*abci.ExecTxResult, 0, len(txs)) @@ -1427,9 +1427,9 @@ func (app *App) ProcessTxsSynchronousV2(ctx sdk.Context, txs [][]byte, typedTxs ctx = ctx.WithTxIndex(i) res := app.DeliverTxWithResult(ctx, tx, typedTxs[i]) txResults = append(txResults, res) - metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified + utilmetrics.IncrTxProcessTypeCounter(utilmetrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified appMetrics.txProcessType.Add(ctx.Context(), 1, - otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS))) } return txResults } @@ -1437,9 +1437,9 @@ func (app *App) ProcessTxsSynchronousV2(ctx sdk.Context, txs [][]byte, typedTxs func (app *App) ProcessTxsSynchronousGiga(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) []*abci.ExecTxResult { blockProcessGigaStart := time.Now() defer func() { - metrics.BlockProcessLatency(blockProcessGigaStart, metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified + utilmetrics.BlockProcessLatency(blockProcessGigaStart, utilmetrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessGigaStart).Seconds(), - otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS))) }() ms := ctx.MultiStore().CacheMultiStore() @@ -1525,9 +1525,9 @@ func (app *App) ProcessTxsSynchronousGiga(ctx sdk.Context, txs [][]byte, typedTx txResults[i] = result ctx.GigaMultiStore().WriteGiga() - metrics.IncrTxProcessTypeCounter(metrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified + utilmetrics.IncrTxProcessTypeCounter(utilmetrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_tx_process_type_total verified appMetrics.txProcessType.Add(ctx.Context(), 1, - otelmetric.WithAttributes(attribute.String("type", metrics.SYNCHRONOUS))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS))) } return txResults @@ -1576,9 +1576,9 @@ func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sd execResults := make([]*abci.ExecTxResult, 0, len(batchResult.Results)) for i, r := range batchResult.Results { - metrics.IncrTxProcessTypeCounter(metrics.OCC_CONCURRENT) // TODO(PLT-327): remove once app_tx_process_type_total verified + utilmetrics.IncrTxProcessTypeCounter(utilmetrics.OCC_CONCURRENT) // TODO(PLT-327): remove once app_tx_process_type_total verified appMetrics.txProcessType.Add(ctx.Context(), 1, - otelmetric.WithAttributes(attribute.String("type", metrics.OCC_CONCURRENT))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.OCC_CONCURRENT))) // Check if transaction is gasless before recording gas metrics var recordGasMetrics = true @@ -1604,8 +1604,8 @@ func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sd } if recordGasMetrics { - metrics.IncrGasCounter("gas_used", r.Response.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified - metrics.IncrGasCounter("gas_wanted", r.Response.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified + utilmetrics.IncrGasCounter("gas_used", r.Response.GasUsed) // TODO(PLT-327): remove once app_tx_gas_total verified + utilmetrics.IncrGasCounter("gas_wanted", r.Response.GasWanted) // TODO(PLT-327): remove once app_tx_gas_total verified appMetrics.txGas.Add(ctx.Context(), r.Response.GasUsed, otelmetric.WithAttributes(attribute.String("type", "gas_used"))) appMetrics.txGas.Add(ctx.Context(), r.Response.GasWanted, @@ -1688,7 +1688,7 @@ func (app *App) ProcessTXsWithOCCGiga(ctx sdk.Context, txs [][]byte, typedTxs [] } if fallbackToV2 { - metrics.IncrGigaFallbackToV2Counter() // TODO(PLT-327): remove once app_giga_fallback_to_v2_total verified + utilmetrics.IncrGigaFallbackToV2Counter() // TODO(PLT-327): remove once app_giga_fallback_to_v2_total verified appMetrics.gigaFallback.Add(ctx.Context(), 1) // Discard all EVM changes by skipping cache writes, then re-run all txs via DeliverTx. evmBatchResult = nil diff --git a/app/invariance.go b/app/invariance.go index 263f42f98e..b7ea172465 100644 --- a/app/invariance.go +++ b/app/invariance.go @@ -5,12 +5,12 @@ import ( "fmt" "time" - "github.com/armon/go-metrics" + armonmetrics "github.com/armon/go-metrics" servertypes "github.com/sei-protocol/sei-chain/sei-cosmos/server/types" "github.com/sei-protocol/sei-chain/sei-cosmos/storev2/commitment" "github.com/spf13/cast" "go.opentelemetry.io/otel/attribute" - otelmetric "go.opentelemetry.io/otel/metric" + otelmetrics "go.opentelemetry.io/otel/metric" sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" @@ -43,22 +43,22 @@ func ReadLightInvarianceConfig(opts servertypes.AppOptions) (LightInvarianceConf return cfg, nil } -func (app *App) LightInvarianceChecks(cms sdk.CommitMultiStore, config LightInvarianceConfig) { +func (app *App) LightInvarianceChecks(ctx context.Context, cms sdk.CommitMultiStore, config LightInvarianceConfig) { // Skip invariance checks when mock_balances is enabled since we fake balances // without updating the actual store, which would fail the supply check. if MockBalancesEnabled { return } if config.SupplyEnabled { - app.LightInvarianceTotalSupply(cms) + app.LightInvarianceTotalSupply(ctx, cms) } } -func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { +func (app *App) LightInvarianceTotalSupply(ctx context.Context, cms sdk.CommitMultiStore) { invarianceStart := time.Now() defer func() { - metrics.MeasureSince([]string{"sei", "lightinvariance_supply", "milliseconds"}, invarianceStart.UTC()) // TODO(PLT-327): remove once app_lightinvariance_supply_duration_seconds verified - appMetrics.invarianceDuration.Record(context.Background(), time.Since(invarianceStart).Seconds()) + armonmetrics.MeasureSince([]string{"sei", "lightinvariance_supply", "milliseconds"}, invarianceStart.UTC()) // TODO(PLT-327): remove once app_lightinvariance_supply_duration_seconds verified + appMetrics.invarianceDuration.Record(ctx, time.Since(invarianceStart).Seconds()) }() ckv, ok := cms.GetStore(app.BankKeeper.GetStoreKey()).(*commitment.Store) if !ok { @@ -71,16 +71,16 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { for _, p := range balanceChangePairs { if len(p.Key) < 2 { // invalid key; ignore - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified - appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "sei"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []armonmetrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "sei"))) logger.Error("invalid changed pair key for usei", "key", fmt.Sprintf("%X", p.Key)) continue } addrLen := int(p.Key[1]) if len(p.Key) < addrLen+2 { // invalid key length; ignore - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified - appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "sei"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []armonmetrics.Label{{Name: "type", Value: "sei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "sei"))) logger.Error("invalid changed pair key for usei", "key", fmt.Sprintf("%X", p.Key)) continue } @@ -92,8 +92,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { if !p.Delete { var balance sdk.Coin if err := balance.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "post_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal balance", "err", err) continue } @@ -113,8 +113,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } var balance sdk.Coin if err := balance.Unmarshal(val); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "pre_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "usei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "usei"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal preblock balance", "err", err) continue } @@ -126,15 +126,15 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { for _, p := range weiChangePairs { var amt sdk.Int if len(p.Key) < 1 { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []metrics.Label{{Name: "type", Value: "wei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified - appMetrics.invarianceInvalidKey.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "invalid_changed_key"}, 1, []armonmetrics.Label{{Name: "type", Value: "wei"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_invalid_key_total verified + appMetrics.invarianceInvalidKey.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "wei"))) logger.Error("invalid changed pair key", "key", fmt.Sprintf("%X", p.Key)) continue } if !p.Delete { if err := amt.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "post_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal wei balance", "err", err) continue } @@ -154,8 +154,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } var amt sdk.Int if err := amt.Unmarshal(val); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "pre_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "wei"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "wei"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal preblock wei balance", "err", err) continue } @@ -167,8 +167,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { if bz := ckv.Get(append(banktypes.SupplyKey, []byte(sdk.MustGetBaseDenom())...)); bz != nil { var amt sdk.Int if err := amt.Unmarshal(bz); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "pre_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "pre_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "pre_block"))) logger.Error("failed to unmarshal pre total supply", "err", err) return } @@ -181,8 +181,8 @@ func (app *App) LightInvarianceTotalSupply(cms sdk.CommitMultiStore) { } else { var amt sdk.Int if err := amt.Unmarshal(p.Value); err != nil { - seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []metrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified - appMetrics.invarianceUnmarshalFail.Add(context.Background(), 1, otelmetric.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "post_block"))) + seimetrics.SafeMetricsIncrCounterWithLabels([]string{"sei", "lightinvariance_supply", "unmarshal_failure"}, 1, []armonmetrics.Label{{Name: "type", Value: "total_supply"}, {Name: "step", Value: "post_block"}}) // TODO(PLT-327): remove once app_lightinvariance_supply_unmarshal_failure_total verified + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetrics.WithAttributes(attribute.String("type", "total_supply"), attribute.String("step", "post_block"))) logger.Error("failed to unmarshal total supply", "err", err) } else { supplyChanged = amt.Sub(preTotalSupply) From 391a48982af1970063829b386dfebed36cd95ba7 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Wed, 6 May 2026 13:45:50 -0700 Subject: [PATCH 03/11] Added metric for commit duration --- app/abci.go | 7 +++++-- app/metrics.go | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/app/abci.go b/app/abci.go index ccb9d2f2ac..af75430e7e 100644 --- a/app/abci.go +++ b/app/abci.go @@ -75,9 +75,9 @@ func (app *App) EndBlock(ctx sdk.Context, height int64, blockGasUsed int64) (res moduleEndBlockStart := time.Now() defer func() { telemetry.MeasureSince(moduleEndBlockStart, "module", "total_end_block") // TODO(PLT-327): remove once app_abci_module_end_block_duration_seconds verified - appMetrics.moduleEndBlockDuration.Record(ctx.Context(), time.Since(moduleEndBlockStart).Seconds()) }() res.ValidatorUpdates = legacyabci.EndBlock(ctx, height, blockGasUsed, app.EndBlockKeepers) + appMetrics.moduleEndBlockDuration.Record(ctx.Context(), time.Since(moduleEndBlockStart).Seconds()) res.Events = sdk.MarkEventsToIndex(ctx.EventManager().ABCIEvents(), app.IndexEvents) if cp := app.GetConsensusParams(ctx); cp != nil { res.ConsensusParamUpdates = legacytm.ABCIToLegacyConsensusParams(cp) @@ -238,6 +238,9 @@ func (app *App) Commit(ctx context.Context) (res *abci.ResponseCommit, err error defer span.End() start := time.Now() res, err = app.BaseApp.Commit(ctx) - app.RecordBenchmarkCommitTime(time.Since(start)) + elapsed := time.Since(start) + // legacy: telemetry.MeasureSince in sei-cosmos/baseapp/abci.go TODO(PLT-327) + appMetrics.commitDuration.Record(ctx, elapsed.Seconds()) + app.RecordBenchmarkCommitTime(elapsed) return res, err } diff --git a/app/metrics.go b/app/metrics.go index 577115af44..359c05af4b 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -27,6 +27,9 @@ type metrics struct { deliverTxDuration metric.Float64Histogram deliverBatchTxDuration metric.Float64Histogram + // Commit duration + commitDuration metric.Float64Histogram + // Block processing duration by execution type blockProcessDuration metric.Float64Histogram @@ -105,6 +108,13 @@ func initAppMetrics() { histogramBuckets, )) + appMetrics.commitDuration = must(meter.Float64Histogram( + "app_abci_commit_duration_seconds", + metric.WithDescription("Duration of ABCI Commit (state write to disk)"), + metric.WithUnit("s"), + histogramBuckets, + )) + appMetrics.blockProcessDuration = must(meter.Float64Histogram( "app_block_process_duration_seconds", metric.WithDescription("Duration of block tx processing by execution type"), From b7b4868391433a0b63481ef946a3d95d0102b271 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Wed, 6 May 2026 14:00:43 -0700 Subject: [PATCH 04/11] Added test coverage --- app/ante/metrics_test.go | 28 +++++++++++++++++++++++ app/invariance_test.go | 4 +++- app/metrics_test.go | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 app/ante/metrics_test.go create mode 100644 app/metrics_test.go diff --git a/app/ante/metrics_test.go b/app/ante/metrics_test.go new file mode 100644 index 0000000000..bc9806c1fb --- /dev/null +++ b/app/ante/metrics_test.go @@ -0,0 +1,28 @@ +package ante + +import ( + "context" + "testing" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" +) + +func TestGetAnteMetricsNoPanic(t *testing.T) { + otel.SetMeterProvider(noop.NewMeterProvider()) + m := getAnteMetrics() + if m == nil { + t.Fatal("getAnteMetrics() returned nil") + } +} + +func TestAnteMetricsPendingNonceAllEvents(t *testing.T) { + otel.SetMeterProvider(noop.NewMeterProvider()) + ctx := context.Background() + m := getAnteMetrics() + for _, event := range []string{"added", "expired", "rejected", "accepted"} { + m.pendingNonce.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("event", event))) + } +} diff --git a/app/invariance_test.go b/app/invariance_test.go index 3886f19bc6..9891ae0302 100644 --- a/app/invariance_test.go +++ b/app/invariance_test.go @@ -172,7 +172,9 @@ func TestLightInvarianceChecks(t *testing.T) { } a.BankKeeper.SetSupply(ctx, useiCoin(tt.postSupply)) a.SetDeliverStateToCommit() - f := func() { a.LightInvarianceChecks(a.WriteState(), app.LightInvarianceConfig{SupplyEnabled: true}) } + f := func() { + a.LightInvarianceChecks(t.Context(), a.WriteState(), app.LightInvarianceConfig{SupplyEnabled: true}) + } if tt.success { require.NotPanics(t, f) } else { diff --git a/app/metrics_test.go b/app/metrics_test.go new file mode 100644 index 0000000000..a1f416cec7 --- /dev/null +++ b/app/metrics_test.go @@ -0,0 +1,48 @@ +package app + +import ( + "testing" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" +) + +func TestInitAppMetricsNoPanic(t *testing.T) { + otel.SetMeterProvider(noop.NewMeterProvider()) + initAppMetrics() +} + +func TestAppMetricsAllInstrumentsUsable(t *testing.T) { + otel.SetMeterProvider(noop.NewMeterProvider()) + initAppMetrics() + + ctx := t.Context() + + appMetrics.beginBlockDuration.Record(ctx, 0.1) + appMetrics.endBlockDuration.Record(ctx, 0.2) + appMetrics.moduleEndBlockDuration.Record(ctx, 0.15) + appMetrics.checkTxDuration.Record(ctx, 0.05) + appMetrics.deliverTxDuration.Record(ctx, 0.01) + appMetrics.deliverBatchTxDuration.Record(ctx, 0.1) + appMetrics.commitDuration.Record(ctx, 0.3) + appMetrics.blockProcessDuration.Record(ctx, 0.5) + + appMetrics.txCount.Add(ctx, 1) + appMetrics.txProcessType.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("type", "sequential"))) + appMetrics.txGas.Add(ctx, 100, otelmetric.WithAttributes(attribute.String("type", "gas_used"))) + appMetrics.txGasUsed.Record(ctx, 1000) + appMetrics.txGasWanted.Record(ctx, 2000) + + appMetrics.optimisticProcessing.Add(ctx, 1, otelmetric.WithAttributes(attribute.Bool("enabled", true))) + appMetrics.failedGasWantedCheck.Add(ctx, 1) + appMetrics.gigaFallback.Add(ctx, 1) + + appMetrics.invarianceDuration.Record(ctx, 0.01) + appMetrics.invarianceInvalidKey.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("type", "sei"))) + appMetrics.invarianceUnmarshalFail.Add(ctx, 1, otelmetric.WithAttributes( + attribute.String("type", "usei"), + attribute.String("step", "post_block"), + )) +} From 9aa174c301602eb4c0640539ebbe0039e90fc5b2 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Wed, 6 May 2026 17:46:52 -0700 Subject: [PATCH 05/11] Fixing the failed race detection tests --- app/metrics.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/app/metrics.go b/app/metrics.go index 359c05af4b..909e4380c3 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -2,6 +2,7 @@ package app import ( "context" + "sync" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -19,6 +20,9 @@ var histogramBuckets = metric.WithExplicitBucketBoundaries( ) type metrics struct { + mu sync.Mutex + initialized bool + // ABCI phase durations beginBlockDuration metric.Float64Histogram endBlockDuration metric.Float64Histogram @@ -62,8 +66,13 @@ func must[V any](v V, err error) V { } // initAppMetrics registers all OTel instruments for the app package. -// Must be called after metrics.SetupOtelMetricsProvider() in NewApp. +// Safe to call concurrently; instruments are registered exactly once. func initAppMetrics() { + appMetrics.mu.Lock() + defer appMetrics.mu.Unlock() + if appMetrics.initialized { + return + } meter := otel.Meter(appMeterName) appMetrics.beginBlockDuration = must(meter.Float64Histogram( @@ -196,4 +205,5 @@ func initAppMetrics() { return nil }), )) + appMetrics.initialized = true } From 352089b48463320df018eeead3c322eab33f7fb3 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Wed, 6 May 2026 18:19:56 -0700 Subject: [PATCH 06/11] Refactored ante metrics to use package level variable --- app/ante/evm_checktx.go | 8 ++++---- app/ante/metrics.go | 25 +++++++++++++++++-------- app/ante/metrics_test.go | 11 ++++------- app/app.go | 1 + 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/app/ante/evm_checktx.go b/app/ante/evm_checktx.go index ebece757ed..960479571c 100644 --- a/app/ante/evm_checktx.go +++ b/app/ante/evm_checktx.go @@ -301,7 +301,7 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep txHash := tmtypes.Tx(ctx.TxBytes()).Hash() ek.AddPendingNonce(txHash, evmAddr, etx.Nonce(), priority) metrics.IncrementPendingNonce("added") // TODO(PLT-327): remove once app_pending_nonce_total verified - getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "added"))) + appAnteMetrics.pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "added"))) }) // if the mempool expires a transaction, this handler is invoked @@ -309,7 +309,7 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep txHash := tmtypes.Tx(ctx.TxBytes()).Hash() ek.RemovePendingNonce(txHash) metrics.IncrementPendingNonce("expired") // TODO(PLT-327): remove once app_pending_nonce_total verified - getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "expired"))) + appAnteMetrics.pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "expired"))) }) if txNonce > nextNonce { @@ -329,7 +329,7 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep if txNonce < nextNonceToBeMined { // this nonce has already been mined, we cannot accept it again metrics.IncrementPendingNonce("rejected") // TODO(PLT-327): remove once app_pending_nonce_total verified - getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "rejected"))) + appAnteMetrics.pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "rejected"))) return abci.Rejected } else if txNonce < nextPendingNonce { // check if the sender still has enough funds to pay for gas @@ -342,7 +342,7 @@ func CheckNonce(ctx sdk.Context, latestCtxGetter func() sdk.Context, ek *evmkeep // consecutive nonces from nextNonceToBeMined to nextPendingNonce // This logic allows multiple nonces from an account to be processed in a block. metrics.IncrementPendingNonce("accepted") // TODO(PLT-327): remove once app_pending_nonce_total verified - getAnteMetrics().pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "accepted"))) + appAnteMetrics.pendingNonce.Add(ctx.Context(), 1, otelmetric.WithAttributes(attribute.String("event", "accepted"))) return abci.Accepted } return abci.Pending diff --git a/app/ante/metrics.go b/app/ante/metrics.go index a7c727aa71..44c1a0edd9 100644 --- a/app/ante/metrics.go +++ b/app/ante/metrics.go @@ -8,20 +8,29 @@ import ( ) type anteMetrics struct { + mu sync.Mutex + initialized bool + pendingNonce metric.Int64Counter } -// getAnteMetrics returns the package-level OTel instruments, initializing them -// lazily on first call (after the global MeterProvider is set in NewApp). -var getAnteMetrics = sync.OnceValue(func() *anteMetrics { - m := &anteMetrics{} +var appAnteMetrics anteMetrics + +// InitAnteMetrics registers all OTel instruments for the ante package. +// Safe to call concurrently; instruments are registered exactly once. +func InitAnteMetrics() { + appAnteMetrics.mu.Lock() + defer appAnteMetrics.mu.Unlock() + if appAnteMetrics.initialized { + return + } meter := otel.Meter("app_ante") var err error - if m.pendingNonce, err = meter.Int64Counter( + if appAnteMetrics.pendingNonce, err = meter.Int64Counter( "app_pending_nonce_total", metric.WithDescription("Pending nonce events by type (added, expired, rejected, accepted)"), ); err != nil { - panic("ante anteMetrics: " + err.Error()) + panic("ante metrics: " + err.Error()) } - return m -}) + appAnteMetrics.initialized = true +} diff --git a/app/ante/metrics_test.go b/app/ante/metrics_test.go index bc9806c1fb..5124033f9a 100644 --- a/app/ante/metrics_test.go +++ b/app/ante/metrics_test.go @@ -10,19 +10,16 @@ import ( "go.opentelemetry.io/otel/metric/noop" ) -func TestGetAnteMetricsNoPanic(t *testing.T) { +func TestInitAnteMetricsNoPanic(t *testing.T) { otel.SetMeterProvider(noop.NewMeterProvider()) - m := getAnteMetrics() - if m == nil { - t.Fatal("getAnteMetrics() returned nil") - } + InitAnteMetrics() } func TestAnteMetricsPendingNonceAllEvents(t *testing.T) { otel.SetMeterProvider(noop.NewMeterProvider()) + InitAnteMetrics() ctx := context.Background() - m := getAnteMetrics() for _, event := range []string{"added", "expired", "rejected", "accepted"} { - m.pendingNonce.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("event", event))) + appAnteMetrics.pendingNonce.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("event", event))) } } diff --git a/app/app.go b/app/app.go index d68b93a53a..633da2e129 100644 --- a/app/app.go +++ b/app/app.go @@ -494,6 +494,7 @@ func New( logger.Error(err.Error()) } initAppMetrics() + appante.InitAnteMetrics() keys := sdk.NewKVStoreKeys( authtypes.StoreKey, authzkeeper.StoreKey, banktypes.StoreKey, stakingtypes.StoreKey, From c1bcbefa3695750210828c7b6baf5b225595b80c Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Thu, 7 May 2026 14:59:52 -0700 Subject: [PATCH 07/11] Addressed some pr feedback --- app/abci.go | 1 - app/app.go | 3 ++- app/metrics.go | 26 ++++++++++++++++---------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/app/abci.go b/app/abci.go index af75430e7e..084861ab8a 100644 --- a/app/abci.go +++ b/app/abci.go @@ -160,7 +160,6 @@ func (app *App) DeliverTx(ctx sdk.Context, req abci.RequestDeliverTxV2, tx sdk.T telemetry.IncrCounter(1, "tx", resultStr) // TODO(PLT-327): remove once app_tx_count_total verified telemetry.SetGauge(float32(gInfo.GasUsed), "tx", "gas", "used") // TODO(PLT-327): remove once app_tx_gas_used verified telemetry.SetGauge(float32(gInfo.GasWanted), "tx", "gas", "wanted") // TODO(PLT-327): remove once app_tx_gas_wanted verified - appMetrics.txCount.Add(ctx.Context(), 1) appMetrics.txCount.Add(ctx.Context(), 1, otelmetrics.WithAttributes(attribute.String("result", resultStr))) appMetrics.txGasUsed.Record(ctx.Context(), int64(gInfo.GasUsed)) //nolint:gosec appMetrics.txGasWanted.Record(ctx.Context(), int64(gInfo.GasWanted)) //nolint:gosec diff --git a/app/app.go b/app/app.go index 633da2e129..1045338458 100644 --- a/app/app.go +++ b/app/app.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/sha256" + "encoding/hex" "encoding/json" "errors" "fmt" @@ -1223,7 +1224,7 @@ func (app *App) ProcessProposalHandler(ctx sdk.Context, req *abci.RequestProcess if !app.checkTotalBlockGas(checkCtx, typedTxs) { utilmetrics.IncrFailedTotalGasWantedCheck(string(req.Header.ProposerAddress)) // TODO(PLT-327): remove once app_failed_total_gas_wanted_check_total verified appMetrics.failedGasWantedCheck.Add(ctx.Context(), 1, - otelmetric.WithAttributes(attribute.String("proposer", string(req.Header.ProposerAddress)))) + otelmetric.WithAttributes(attribute.String("proposer", hex.EncodeToString(req.Header.ProposerAddress)))) return &abci.ResponseProcessProposal{ Status: abci.ResponseProcessProposal_REJECT, }, nil diff --git a/app/metrics.go b/app/metrics.go index 909e4380c3..abb14b4ce1 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -14,9 +14,15 @@ import ( const appMeterName = "app" // histogramBuckets aligns with block-time SLO thresholds -// (p50 ≤ 500ms, p95 ≤ 1.5s, p99 ≤ 2.5s) expressed in seconds. +// (p50 ≤ 500ms, p95 ≤ 1.5s, p99 ≤ 2.5s) expressed in seconds; 3s/4s refine quantiles just above the p99 line. var histogramBuckets = metric.WithExplicitBucketBoundaries( - 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, + 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0, +) + +// millisecondBuckets is for metrics that typically complete in under 100ms, expressed in seconds. +// Covers µs-range fast paths (25µs–1ms) and occasional slower outliers up to 100ms. +var millisecondBuckets = metric.WithExplicitBucketBoundaries( + 0.000025, 0.000050, 0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.010, 0.020, 0.050, 0.075, 0.1, ) type metrics struct { @@ -79,49 +85,49 @@ func initAppMetrics() { "app_abci_begin_block_duration_seconds", metric.WithDescription("Duration of ABCI BeginBlock"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.endBlockDuration = must(meter.Float64Histogram( "app_abci_end_block_duration_seconds", metric.WithDescription("Duration of ABCI EndBlock"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.moduleEndBlockDuration = must(meter.Float64Histogram( "app_abci_module_end_block_duration_seconds", metric.WithDescription("Duration of module EndBlock calls within ABCI EndBlock"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.checkTxDuration = must(meter.Float64Histogram( "app_abci_check_tx_duration_seconds", metric.WithDescription("Duration of ABCI CheckTx"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.deliverTxDuration = must(meter.Float64Histogram( "app_abci_deliver_tx_duration_seconds", metric.WithDescription("Duration of ABCI DeliverTx"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.deliverBatchTxDuration = must(meter.Float64Histogram( "app_abci_deliver_batch_tx_duration_seconds", metric.WithDescription("Duration of ABCI DeliverTxBatch"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.commitDuration = must(meter.Float64Histogram( "app_abci_commit_duration_seconds", metric.WithDescription("Duration of ABCI Commit (state write to disk)"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.blockProcessDuration = must(meter.Float64Histogram( @@ -175,7 +181,7 @@ func initAppMetrics() { "app_lightinvariance_supply_duration_seconds", metric.WithDescription("Duration of light invariance total supply check"), metric.WithUnit("s"), - histogramBuckets, + millisecondBuckets, )) appMetrics.invarianceInvalidKey = must(meter.Int64Counter( From ccd7515c78db68e9c1954f7e39e8f584baa9893e Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Thu, 7 May 2026 15:00:33 -0700 Subject: [PATCH 08/11] Added Giga vs OCC attributes for block process duration --- app/app.go | 18 +++++++++++++++++- utils/metrics/labels.go | 12 +++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/app/app.go b/app/app.go index 1045338458..43f3b460cf 100644 --- a/app/app.go +++ b/app/app.go @@ -1441,7 +1441,7 @@ func (app *App) ProcessTxsSynchronousGiga(ctx sdk.Context, txs [][]byte, typedTx defer func() { utilmetrics.BlockProcessLatency(blockProcessGigaStart, utilmetrics.SYNCHRONOUS) // TODO(PLT-327): remove once app_block_process_duration_seconds verified appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessGigaStart).Seconds(), - otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS))) + otelmetric.WithAttributes(attribute.String("type", utilmetrics.SYNCHRONOUS_GIGA))) }() ms := ctx.MultiStore().CacheMultiStore() @@ -1569,6 +1569,12 @@ func (app *App) GetDeliverTxEntry(ctx sdk.Context, txIndex int, bz []byte, tx sd // ProcessTXsWithOCCV2 runs the transactions concurrently via OCC, using the V2 executor func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) ([]*abci.ExecTxResult, sdk.Context) { + blockProcessStart := time.Now() + defer func() { + appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessStart).Seconds(), + otelmetric.WithAttributes(attribute.String("type", utilmetrics.OCC_CONCURRENT))) + }() + entries := make([]*sdk.DeliverTxEntry, len(txs)) for txIndex, tx := range txs { entries[txIndex] = app.GetDeliverTxEntry(ctx, txIndex, tx, typedTxs[txIndex]) @@ -1633,6 +1639,15 @@ func (app *App) ProcessTXsWithOCCV2(ctx sdk.Context, txs [][]byte, typedTxs []sd // ProcessTXsWithOCCGiga runs the transactions concurrently via OCC, using the Giga executor func (app *App) ProcessTXsWithOCCGiga(ctx sdk.Context, txs [][]byte, typedTxs []sdk.Tx) ([]*abci.ExecTxResult, sdk.Context) { + blockProcessStart := time.Now() + delegatedToV2 := false + defer func() { + if !delegatedToV2 { + appMetrics.blockProcessDuration.Record(ctx.Context(), time.Since(blockProcessStart).Seconds(), + otelmetric.WithAttributes(attribute.String("type", utilmetrics.OCC_GIGA))) + } + }() + evmEntries := make([]*sdk.DeliverTxEntry, 0, len(txs)) v2Entries := make([]*sdk.DeliverTxEntry, 0, len(txs)) firstCosmosSeen := false @@ -1641,6 +1656,7 @@ func (app *App) ProcessTXsWithOCCGiga(ctx sdk.Context, txs [][]byte, typedTxs [] if firstCosmosSeen { logger.Error("Giga OCC cannot execute block due to tx ordering, falling back to V2") // Oops! This isn't "all EVM txs, then all Cosmos txs" - we need to fallback to V2. + delegatedToV2 = true return app.ProcessTXsWithOCCV2(ctx, txs, typedTxs) } diff --git a/utils/metrics/labels.go b/utils/metrics/labels.go index 81808cf532..56d1339a26 100644 --- a/utils/metrics/labels.go +++ b/utils/metrics/labels.go @@ -1,9 +1,11 @@ package metrics const ( - CONCURRENT = "concurrent" - SYNCHRONOUS = "synchronous" - OCC_CONCURRENT = "optimistic_concurrency" - GovMsgInBlock = "gov-msg-in-block" - FailedToBuild = "failed-to-build" + CONCURRENT = "concurrent" + SYNCHRONOUS = "synchronous" + SYNCHRONOUS_GIGA = "synchronous_giga" + OCC_CONCURRENT = "optimistic_concurrency" + OCC_GIGA = "occ_giga" + GovMsgInBlock = "gov-msg-in-block" + FailedToBuild = "failed-to-build" ) From 5d61836a43f45cbc8092a069cb3869c2352e9db7 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Thu, 7 May 2026 15:53:22 -0700 Subject: [PATCH 09/11] Addressing more pr feedback --- app/abci.go | 2 -- app/metrics.go | 12 ------------ app/metrics_test.go | 2 -- 3 files changed, 16 deletions(-) diff --git a/app/abci.go b/app/abci.go index 084861ab8a..dac041b635 100644 --- a/app/abci.go +++ b/app/abci.go @@ -161,8 +161,6 @@ func (app *App) DeliverTx(ctx sdk.Context, req abci.RequestDeliverTxV2, tx sdk.T telemetry.SetGauge(float32(gInfo.GasUsed), "tx", "gas", "used") // TODO(PLT-327): remove once app_tx_gas_used verified telemetry.SetGauge(float32(gInfo.GasWanted), "tx", "gas", "wanted") // TODO(PLT-327): remove once app_tx_gas_wanted verified appMetrics.txCount.Add(ctx.Context(), 1, otelmetrics.WithAttributes(attribute.String("result", resultStr))) - appMetrics.txGasUsed.Record(ctx.Context(), int64(gInfo.GasUsed)) //nolint:gosec - appMetrics.txGasWanted.Record(ctx.Context(), int64(gInfo.GasWanted)) //nolint:gosec }() gInfo, result, anteEvents, resCtx, err := legacyabci.DeliverTx(ctx.WithTxBytes(req.Tx).WithTxSum(checksum), tx, app.GetTxConfig(), &app.DeliverTxKeepers, checksum, func(ctx sdk.Context) (sdk.Context, sdk.CacheMultiStore) { return app.CacheTxContext(ctx, checksum) diff --git a/app/metrics.go b/app/metrics.go index abb14b4ce1..5dcbb07a23 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -47,8 +47,6 @@ type metrics struct { txCount metric.Int64Counter txProcessType metric.Int64Counter txGas metric.Int64Counter - txGasUsed metric.Int64Gauge - txGasWanted metric.Int64Gauge // App-level flow counters optimisticProcessing metric.Int64Counter @@ -152,16 +150,6 @@ func initAppMetrics() { metric.WithDescription("Cumulative transaction gas by type (gas_used, gas_wanted)"), )) - appMetrics.txGasUsed = must(meter.Int64Gauge( - "app_tx_gas_used", - metric.WithDescription("Gas used by the most recently delivered transaction"), - )) - - appMetrics.txGasWanted = must(meter.Int64Gauge( - "app_tx_gas_wanted", - metric.WithDescription("Gas wanted by the most recently delivered transaction"), - )) - appMetrics.optimisticProcessing = must(meter.Int64Counter( "app_optimistic_processing_total", metric.WithDescription("Optimistic processing attempts; enabled=true means cache hit, false means miss"), diff --git a/app/metrics_test.go b/app/metrics_test.go index a1f416cec7..0d426044aa 100644 --- a/app/metrics_test.go +++ b/app/metrics_test.go @@ -32,8 +32,6 @@ func TestAppMetricsAllInstrumentsUsable(t *testing.T) { appMetrics.txCount.Add(ctx, 1) appMetrics.txProcessType.Add(ctx, 1, otelmetric.WithAttributes(attribute.String("type", "sequential"))) appMetrics.txGas.Add(ctx, 100, otelmetric.WithAttributes(attribute.String("type", "gas_used"))) - appMetrics.txGasUsed.Record(ctx, 1000) - appMetrics.txGasWanted.Record(ctx, 2000) appMetrics.optimisticProcessing.Add(ctx, 1, otelmetric.WithAttributes(attribute.Bool("enabled", true))) appMetrics.failedGasWantedCheck.Add(ctx, 1) From 6903eef4664225a22fd924d3c6519b21428af3b6 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Thu, 7 May 2026 16:11:09 -0700 Subject: [PATCH 10/11] Used sync.once instead of mutex --- app/ante/metrics.go | 27 ++--- app/metrics.go | 255 ++++++++++++++++++++++---------------------- 2 files changed, 136 insertions(+), 146 deletions(-) diff --git a/app/ante/metrics.go b/app/ante/metrics.go index 44c1a0edd9..a072d50f3b 100644 --- a/app/ante/metrics.go +++ b/app/ante/metrics.go @@ -8,8 +8,7 @@ import ( ) type anteMetrics struct { - mu sync.Mutex - initialized bool + once sync.Once pendingNonce metric.Int64Counter } @@ -19,18 +18,14 @@ var appAnteMetrics anteMetrics // InitAnteMetrics registers all OTel instruments for the ante package. // Safe to call concurrently; instruments are registered exactly once. func InitAnteMetrics() { - appAnteMetrics.mu.Lock() - defer appAnteMetrics.mu.Unlock() - if appAnteMetrics.initialized { - return - } - meter := otel.Meter("app_ante") - var err error - if appAnteMetrics.pendingNonce, err = meter.Int64Counter( - "app_pending_nonce_total", - metric.WithDescription("Pending nonce events by type (added, expired, rejected, accepted)"), - ); err != nil { - panic("ante metrics: " + err.Error()) - } - appAnteMetrics.initialized = true + appAnteMetrics.once.Do(func() { + meter := otel.Meter("app_ante") + var err error + if appAnteMetrics.pendingNonce, err = meter.Int64Counter( + "app_pending_nonce_total", + metric.WithDescription("Pending nonce events by type (added, expired, rejected, accepted)"), + ); err != nil { + panic("ante metrics: " + err.Error()) + } + }) } diff --git a/app/metrics.go b/app/metrics.go index 5dcbb07a23..cd264b212c 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -26,8 +26,7 @@ var millisecondBuckets = metric.WithExplicitBucketBoundaries( ) type metrics struct { - mu sync.Mutex - initialized bool + once sync.Once // ABCI phase durations beginBlockDuration metric.Float64Histogram @@ -72,132 +71,128 @@ func must[V any](v V, err error) V { // initAppMetrics registers all OTel instruments for the app package. // Safe to call concurrently; instruments are registered exactly once. func initAppMetrics() { - appMetrics.mu.Lock() - defer appMetrics.mu.Unlock() - if appMetrics.initialized { - return - } - meter := otel.Meter(appMeterName) - - appMetrics.beginBlockDuration = must(meter.Float64Histogram( - "app_abci_begin_block_duration_seconds", - metric.WithDescription("Duration of ABCI BeginBlock"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.endBlockDuration = must(meter.Float64Histogram( - "app_abci_end_block_duration_seconds", - metric.WithDescription("Duration of ABCI EndBlock"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.moduleEndBlockDuration = must(meter.Float64Histogram( - "app_abci_module_end_block_duration_seconds", - metric.WithDescription("Duration of module EndBlock calls within ABCI EndBlock"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.checkTxDuration = must(meter.Float64Histogram( - "app_abci_check_tx_duration_seconds", - metric.WithDescription("Duration of ABCI CheckTx"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.deliverTxDuration = must(meter.Float64Histogram( - "app_abci_deliver_tx_duration_seconds", - metric.WithDescription("Duration of ABCI DeliverTx"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.deliverBatchTxDuration = must(meter.Float64Histogram( - "app_abci_deliver_batch_tx_duration_seconds", - metric.WithDescription("Duration of ABCI DeliverTxBatch"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.commitDuration = must(meter.Float64Histogram( - "app_abci_commit_duration_seconds", - metric.WithDescription("Duration of ABCI Commit (state write to disk)"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.blockProcessDuration = must(meter.Float64Histogram( - "app_block_process_duration_seconds", - metric.WithDescription("Duration of block tx processing by execution type"), - metric.WithUnit("s"), - histogramBuckets, - )) - - appMetrics.txCount = must(meter.Int64Counter( - "app_tx_count_total", - metric.WithDescription("Number of transactions delivered"), - )) - - appMetrics.txProcessType = must(meter.Int64Counter( - "app_tx_process_type_total", - metric.WithDescription("Transactions processed by execution type"), - )) - - appMetrics.txGas = must(meter.Int64Counter( - "app_tx_gas_total", - metric.WithDescription("Cumulative transaction gas by type (gas_used, gas_wanted)"), - )) - - appMetrics.optimisticProcessing = must(meter.Int64Counter( - "app_optimistic_processing_total", - metric.WithDescription("Optimistic processing attempts; enabled=true means cache hit, false means miss"), - )) - - appMetrics.failedGasWantedCheck = must(meter.Int64Counter( - "app_failed_total_gas_wanted_check_total", - metric.WithDescription("Proposals rejected because total block gas wanted exceeded max"), - )) - - appMetrics.gigaFallback = must(meter.Int64Counter( - "app_giga_fallback_to_v2_total", - metric.WithDescription("Times giga executor fell back to V2 processing"), - )) - - appMetrics.invarianceDuration = must(meter.Float64Histogram( - "app_lightinvariance_supply_duration_seconds", - metric.WithDescription("Duration of light invariance total supply check"), - metric.WithUnit("s"), - millisecondBuckets, - )) - - appMetrics.invarianceInvalidKey = must(meter.Int64Counter( - "app_lightinvariance_supply_invalid_key_total", - metric.WithDescription("Invalid changed-pair keys detected during invariance check"), - )) - - appMetrics.invarianceUnmarshalFail = must(meter.Int64Counter( - "app_lightinvariance_supply_unmarshal_failure_total", - metric.WithDescription("Unmarshal failures during invariance supply check"), - )) - - // Build-info observable gauge replaces GaugeSeidVersionAndCommit (called per BeginBlock). - // The callback fires on every Prometheus scrape; no per-block call site is needed. - // TODO(PLT-327): remove metrics.GaugeSeidVersionAndCommit call in abci.go once app_build_info verified - vi := version.NewInfo() - _ = must(meter.Int64ObservableGauge( - "app_build_info", - metric.WithDescription("Running binary build info; value is always 1"), - metric.WithInt64Callback(func(_ context.Context, o metric.Int64Observer) error { - o.Observe(1, - metric.WithAttributes( - attribute.String("seid_version", vi.Version), - attribute.String("commit", vi.GitCommit), - ), - ) - return nil - }), - )) - appMetrics.initialized = true + appMetrics.once.Do(func() { + meter := otel.Meter(appMeterName) + + appMetrics.beginBlockDuration = must(meter.Float64Histogram( + "app_abci_begin_block_duration_seconds", + metric.WithDescription("Duration of ABCI BeginBlock"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.endBlockDuration = must(meter.Float64Histogram( + "app_abci_end_block_duration_seconds", + metric.WithDescription("Duration of ABCI EndBlock"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.moduleEndBlockDuration = must(meter.Float64Histogram( + "app_abci_module_end_block_duration_seconds", + metric.WithDescription("Duration of module EndBlock calls within ABCI EndBlock"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.checkTxDuration = must(meter.Float64Histogram( + "app_abci_check_tx_duration_seconds", + metric.WithDescription("Duration of ABCI CheckTx"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.deliverTxDuration = must(meter.Float64Histogram( + "app_abci_deliver_tx_duration_seconds", + metric.WithDescription("Duration of ABCI DeliverTx"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.deliverBatchTxDuration = must(meter.Float64Histogram( + "app_abci_deliver_batch_tx_duration_seconds", + metric.WithDescription("Duration of ABCI DeliverTxBatch"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.commitDuration = must(meter.Float64Histogram( + "app_abci_commit_duration_seconds", + metric.WithDescription("Duration of ABCI Commit (state write to disk)"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.blockProcessDuration = must(meter.Float64Histogram( + "app_block_process_duration_seconds", + metric.WithDescription("Duration of block tx processing by execution type"), + metric.WithUnit("s"), + histogramBuckets, + )) + + appMetrics.txCount = must(meter.Int64Counter( + "app_tx_count_total", + metric.WithDescription("Number of transactions delivered"), + )) + + appMetrics.txProcessType = must(meter.Int64Counter( + "app_tx_process_type_total", + metric.WithDescription("Transactions processed by execution type"), + )) + + appMetrics.txGas = must(meter.Int64Counter( + "app_tx_gas_total", + metric.WithDescription("Cumulative transaction gas by type (gas_used, gas_wanted)"), + )) + + appMetrics.optimisticProcessing = must(meter.Int64Counter( + "app_optimistic_processing_total", + metric.WithDescription("Optimistic processing attempts; enabled=true means cache hit, false means miss"), + )) + + appMetrics.failedGasWantedCheck = must(meter.Int64Counter( + "app_failed_total_gas_wanted_check_total", + metric.WithDescription("Proposals rejected because total block gas wanted exceeded max"), + )) + + appMetrics.gigaFallback = must(meter.Int64Counter( + "app_giga_fallback_to_v2_total", + metric.WithDescription("Times giga executor fell back to V2 processing"), + )) + + appMetrics.invarianceDuration = must(meter.Float64Histogram( + "app_lightinvariance_supply_duration_seconds", + metric.WithDescription("Duration of light invariance total supply check"), + metric.WithUnit("s"), + millisecondBuckets, + )) + + appMetrics.invarianceInvalidKey = must(meter.Int64Counter( + "app_lightinvariance_supply_invalid_key_total", + metric.WithDescription("Invalid changed-pair keys detected during invariance check"), + )) + + appMetrics.invarianceUnmarshalFail = must(meter.Int64Counter( + "app_lightinvariance_supply_unmarshal_failure_total", + metric.WithDescription("Unmarshal failures during invariance supply check"), + )) + + // Build-info observable gauge replaces GaugeSeidVersionAndCommit (called per BeginBlock). + // The callback fires on every Prometheus scrape; no per-block call site is needed. + // TODO(PLT-327): remove metrics.GaugeSeidVersionAndCommit call in abci.go once app_build_info verified + vi := version.NewInfo() + _ = must(meter.Int64ObservableGauge( + "app_build_info", + metric.WithDescription("Running binary build info; value is always 1"), + metric.WithInt64Callback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(1, + metric.WithAttributes( + attribute.String("seid_version", vi.Version), + attribute.String("commit", vi.GitCommit), + ), + ) + return nil + }), + )) + }) } From 773902676d5c12969c039f3a2db5004415219328 Mon Sep 17 00:00:00 2001 From: Amir Deris Date: Thu, 7 May 2026 16:52:28 -0700 Subject: [PATCH 11/11] Updated histogram comment and added new boundary to bucket --- app/metrics.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/metrics.go b/app/metrics.go index cd264b212c..db4b4d1c44 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -13,8 +13,8 @@ import ( const appMeterName = "app" -// histogramBuckets aligns with block-time SLO thresholds -// (p50 ≤ 500ms, p95 ≤ 1.5s, p99 ≤ 2.5s) expressed in seconds; 3s/4s refine quantiles just above the p99 line. +// histogramBuckets aligns with block-processing +// latency SLO thresholds (p50 ≤ 500ms, p95 ≤ 1.5s, p99 ≤ 2.5s) in seconds; 3s, 4s refine quantiles just above the p99 line. var histogramBuckets = metric.WithExplicitBucketBoundaries( 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0, ) @@ -22,7 +22,7 @@ var histogramBuckets = metric.WithExplicitBucketBoundaries( // millisecondBuckets is for metrics that typically complete in under 100ms, expressed in seconds. // Covers µs-range fast paths (25µs–1ms) and occasional slower outliers up to 100ms. var millisecondBuckets = metric.WithExplicitBucketBoundaries( - 0.000025, 0.000050, 0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.010, 0.020, 0.050, 0.075, 0.1, + 0.000025, 0.000050, 0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.010, 0.020, 0.050, 0.075, 0.1, 0.25, ) type metrics struct {