Skip to content

Commit d9c4ae2

Browse files
authored
feat(metrics): add comprehensive submission debugging metrics with env control (#29)
1 parent e20a6b2 commit d9c4ae2

4 files changed

Lines changed: 264 additions & 2 deletions

File tree

README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ Metrics will be available at `http://localhost:2112/metrics`
7777
- `--balance.scrape-interval`: Balance check scrape interval in seconds (default: 30)
7878
- `--verifier.workers`: Number of concurrent workers for block verification (default: 50)
7979
- `--verbose`: Enable verbose logging (default: false)
80+
- `--debug`: Enable debug logging for submission details (default: false, can also be set via `EVMETRICS_DEBUG=true` environment variable)
8081

8182
### Example with Custom Endpoints
8283

@@ -150,6 +151,26 @@ When metrics are enabled, the following metrics are exposed:
150151
- **Labels**: `chain_id`, `type`
151152
- **Description**: Latest DA height for header and data submissions
152153

154+
### `ev_metrics_submission_attempts_total`
155+
- **Type**: Counter
156+
- **Labels**: `chain_id`, `type`
157+
- **Description**: Total number of DA submission attempts (both successful and failed)
158+
159+
### `ev_metrics_submission_failures_total`
160+
- **Type**: Counter
161+
- **Labels**: `chain_id`, `type`
162+
- **Description**: Total number of failed DA submission attempts
163+
164+
### `ev_metrics_last_submission_attempt_time`
165+
- **Type**: Gauge
166+
- **Labels**: `chain_id`, `type`
167+
- **Description**: Timestamp of the last DA submission attempt (Unix timestamp)
168+
169+
### `ev_metrics_last_successful_submission_time`
170+
- **Type**: Gauge
171+
- **Labels**: `chain_id`, `type`
172+
- **Description**: Timestamp of the last successful DA submission (Unix timestamp)
173+
153174
### Block Time Metrics
154175

155176
### `ev_metrics_block_time_seconds`
@@ -253,3 +274,18 @@ When `--balance.addresses` and `--balance.consensus-rpc-urls` are provided:
253274
- **Type**: Counter
254275
- **Labels**: `chain_id`, `endpoint`, `error_type`
255276
- **Description**: Total number of consensus RPC endpoint errors by type
277+
278+
## Debug Logging
279+
280+
Debug logging provides detailed visibility into DA submission process. Enable with `--debug` flag or `EVMETRICS_DEBUG=true` environment variable.
281+
282+
**Use cases**: Troubleshoot submission failures, verify submission flow, diagnose Celestia RPC issues.
283+
284+
**Logs include**: Successful/failed submissions, DA height updates, submission timing.
285+
286+
**Example**:
287+
```bash
288+
# Enable debug logging
289+
export EVMETRICS_DEBUG=true
290+
./ev-metrics monitor --header-namespace testnet_header --data-namespace testnet_data
291+
```

pkg/exporters/verifier/verifier.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,12 @@ func (e *exporter) processBlocks(ctx context.Context, m *metrics.Metrics, worker
143143

144144
func (e *exporter) onVerified(m *metrics.Metrics, namespace string, blockHeight, daHeight uint64, verified bool, submissionDuration time.Duration) {
145145
if verified {
146+
m.RecordSubmissionAttempt(e.chainID, namespace, true)
146147
m.RecordSubmissionDaHeight(e.chainID, namespace, daHeight)
147148
m.RemoveVerifiedBlock(e.chainID, namespace, blockHeight)
148149
m.RecordSubmissionDuration(e.chainID, namespace, submissionDuration)
149150
} else {
151+
m.RecordSubmissionAttempt(e.chainID, namespace, false)
150152
m.RecordMissingBlock(e.chainID, namespace, blockHeight)
151153
}
152154
}

pkg/metrics/metrics.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package metrics
22

33
import (
44
"fmt"
5+
"log"
6+
"os"
57
"sort"
68
"strings"
79
"sync"
@@ -29,6 +31,14 @@ type Metrics struct {
2931
SubmissionDuration *prometheus.SummaryVec
3032
// SubmissionDaHeight tracks the DA height at which blocks were submitted.
3133
SubmissionDaHeight *prometheus.GaugeVec
34+
// SubmissionAttemptsTotal tracks the total number of submission attempts.
35+
SubmissionAttemptsTotal *prometheus.CounterVec
36+
// SubmissionFailuresTotal tracks the total number of failed submission attempts.
37+
SubmissionFailuresTotal *prometheus.CounterVec
38+
// LastSubmissionAttemptTime tracks the timestamp of the last submission attempt.
39+
LastSubmissionAttemptTime *prometheus.GaugeVec
40+
// LastSuccessfulSubmissionTime tracks the timestamp of the last successful submission.
41+
LastSuccessfulSubmissionTime *prometheus.GaugeVec
3242
// BlockTime tracks the time between consecutive blocks with histogram buckets for accurate SLO calculations.
3343
BlockTime *prometheus.HistogramVec
3444
// BlockTimeSummary tracks block time with percentiles over a rolling window.
@@ -72,6 +82,11 @@ type Metrics struct {
7282
ranges map[string][]*blockRange // key: blobType -> sorted slice of ranges
7383
}
7484

85+
// isDebugEnabled returns true if debug logging is enabled via environment variable
86+
func isDebugEnabled() bool {
87+
return strings.ToLower(os.Getenv("EVMETRICS_DEBUG")) == "true"
88+
}
89+
7590
type blockRange struct {
7691
start uint64
7792
end uint64
@@ -289,6 +304,38 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
289304
},
290305
[]string{"chain_id", "endpoint", "error_type"},
291306
),
307+
SubmissionAttemptsTotal: factory.NewCounterVec(
308+
prometheus.CounterOpts{
309+
Namespace: namespace,
310+
Name: "submission_attempts_total",
311+
Help: "total number of DA submission attempts",
312+
},
313+
[]string{"chain_id", "type"},
314+
),
315+
SubmissionFailuresTotal: factory.NewCounterVec(
316+
prometheus.CounterOpts{
317+
Namespace: namespace,
318+
Name: "submission_failures_total",
319+
Help: "total number of failed DA submission attempts",
320+
},
321+
[]string{"chain_id", "type"},
322+
),
323+
LastSubmissionAttemptTime: factory.NewGaugeVec(
324+
prometheus.GaugeOpts{
325+
Namespace: namespace,
326+
Name: "last_submission_attempt_time",
327+
Help: "timestamp of the last DA submission attempt",
328+
},
329+
[]string{"chain_id", "type"},
330+
),
331+
LastSuccessfulSubmissionTime: factory.NewGaugeVec(
332+
prometheus.GaugeOpts{
333+
Namespace: namespace,
334+
Name: "last_successful_submission_time",
335+
Help: "timestamp of the last successful DA submission",
336+
},
337+
[]string{"chain_id", "type"},
338+
),
292339
ranges: make(map[string][]*blockRange),
293340
lastBlockArrivalTime: make(map[string]time.Time),
294341
lastSubmissionDurations: make(map[string]time.Duration),
@@ -297,6 +344,34 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
297344
return m
298345
}
299346

347+
// RecordSubmissionAttempt records a submission attempt and updates related metrics
348+
func (m *Metrics) RecordSubmissionAttempt(chainID, submissionType string, success bool) {
349+
m.mu.Lock()
350+
defer m.mu.Unlock()
351+
352+
// Always record the attempt
353+
m.SubmissionAttemptsTotal.WithLabelValues(chainID, submissionType).Inc()
354+
355+
if !success {
356+
m.SubmissionFailuresTotal.WithLabelValues(chainID, submissionType).Inc()
357+
}
358+
359+
// Record timestamp of this attempt
360+
now := time.Now()
361+
m.LastSubmissionAttemptTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
362+
363+
if success {
364+
m.LastSuccessfulSubmissionTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
365+
if isDebugEnabled() {
366+
log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
367+
}
368+
} else {
369+
if isDebugEnabled() {
370+
log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
371+
}
372+
}
373+
}
374+
300375
// RecordSubmissionDaHeight records the DA height only if it's higher than previously recorded
301376
func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHeight uint64) {
302377
m.mu.Lock()
@@ -306,6 +381,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
306381
if daHeight > m.latestHeaderDaHeight {
307382
m.latestHeaderDaHeight = daHeight
308383
m.SubmissionDaHeight.WithLabelValues(chainID, "header").Set(float64(daHeight))
384+
// Debug log when submission DA height is recorded
385+
if isDebugEnabled() {
386+
log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
387+
}
388+
} else {
389+
// Debug log when DA height is not higher than previous
390+
if isDebugEnabled() {
391+
log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
392+
}
309393
}
310394
return
311395
}
@@ -314,6 +398,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
314398
if daHeight > m.latestDataDaHeight {
315399
m.latestDataDaHeight = daHeight
316400
m.SubmissionDaHeight.WithLabelValues(chainID, "data").Set(float64(daHeight))
401+
// Debug log when submission DA height is recorded
402+
if isDebugEnabled() {
403+
log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
404+
}
405+
} else {
406+
// Debug log when DA height is not higher than previous
407+
if isDebugEnabled() {
408+
log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
409+
}
317410
}
318411
}
319412
}
@@ -530,6 +623,7 @@ func (m *Metrics) RecordSubmissionDuration(chainID, submissionType string, durat
530623

531624
key := fmt.Sprintf("%s:%s", chainID, submissionType)
532625
m.lastSubmissionDurations[key] = duration
626+
533627
}
534628

535629
// RefreshSubmissionDuration re-observes the last known submission duration to keep the metric alive.

pkg/metrics/metrics_test.go

Lines changed: 132 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,130 @@ func TestMetrics_RefreshSubmissionDuration_Empty(t *testing.T) {
563563
})
564564
}
565565

566+
func TestMetrics_RecordSubmissionAttempt(t *testing.T) {
567+
reg := prometheus.NewRegistry()
568+
m := NewWithRegistry("test", reg)
569+
570+
// Test successful submission
571+
beforeAttempt := time.Now()
572+
m.RecordSubmissionAttempt("testchain", "header", true)
573+
afterAttempt := time.Now()
574+
575+
// Verify counters
576+
attempts := getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
577+
"chain_id": "testchain",
578+
"type": "header",
579+
})
580+
require.Equal(t, float64(1), attempts, "should have 1 attempt")
581+
582+
// For successful submission, failures should be 0 (metric may not be exported if 0)
583+
// We'll check if the metric exists and has value 0, or doesn't exist (both are valid)
584+
failuresMetricFound := false
585+
failures := float64(0)
586+
metrics, err := reg.Gather()
587+
require.NoError(t, err)
588+
for _, mf := range metrics {
589+
if mf.GetName() == "test_submission_failures_total" {
590+
for _, m := range mf.GetMetric() {
591+
// check if labels match
592+
match := true
593+
for _, label := range m.GetLabel() {
594+
if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "header"}[label.GetName()]; ok {
595+
if label.GetValue() != expectedVal {
596+
match = false
597+
break
598+
}
599+
}
600+
}
601+
if match && len(m.GetLabel()) == 2 {
602+
failuresMetricFound = true
603+
if m.GetCounter() != nil {
604+
failures = m.GetCounter().GetValue()
605+
}
606+
break
607+
}
608+
}
609+
}
610+
}
611+
if failuresMetricFound {
612+
require.Equal(t, float64(0), failures, "should have 0 failures")
613+
} // else: metric not exported because value is 0, which is expected behavior
614+
615+
// Verify timestamps are within expected range
616+
lastAttemptTime := getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
617+
"chain_id": "testchain",
618+
"type": "header",
619+
})
620+
require.GreaterOrEqual(t, lastAttemptTime, float64(beforeAttempt.Unix()))
621+
require.LessOrEqual(t, lastAttemptTime, float64(afterAttempt.Unix()))
622+
623+
lastSuccessTime := getMetricValue(t, reg, "test_last_successful_submission_time", map[string]string{
624+
"chain_id": "testchain",
625+
"type": "header",
626+
})
627+
require.GreaterOrEqual(t, lastSuccessTime, float64(beforeAttempt.Unix()))
628+
require.LessOrEqual(t, lastSuccessTime, float64(afterAttempt.Unix()))
629+
630+
// Test failed submission
631+
beforeFailure := time.Now()
632+
m.RecordSubmissionAttempt("testchain", "data", false)
633+
afterFailure := time.Now()
634+
635+
// Verify counters
636+
attempts = getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
637+
"chain_id": "testchain",
638+
"type": "data",
639+
})
640+
require.Equal(t, float64(1), attempts, "should have 1 attempt for data")
641+
642+
failures = getMetricValue(t, reg, "test_submission_failures_total", map[string]string{
643+
"chain_id": "testchain",
644+
"type": "data",
645+
})
646+
require.Equal(t, float64(1), failures, "should have 1 failure for data")
647+
648+
// Verify timestamps - should have attempt time but not success time for failed submission
649+
lastAttemptTime = getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
650+
"chain_id": "testchain",
651+
"type": "data",
652+
})
653+
require.GreaterOrEqual(t, lastAttemptTime, float64(beforeFailure.Unix()))
654+
require.LessOrEqual(t, lastAttemptTime, float64(afterFailure.Unix()))
655+
656+
// Last successful submission time should still be 0 for data type (never succeeded)
657+
// Gauge metrics with 0 values may not be exported, so we need to check if it exists
658+
var lastSuccessTimeData float64
659+
var successMetricFoundData bool
660+
metrics, err = reg.Gather()
661+
require.NoError(t, err)
662+
for _, mf := range metrics {
663+
if mf.GetName() == "test_last_successful_submission_time" {
664+
for _, m := range mf.GetMetric() {
665+
// check if labels match
666+
match := true
667+
for _, label := range m.GetLabel() {
668+
if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "data"}[label.GetName()]; ok {
669+
if label.GetValue() != expectedVal {
670+
match = false
671+
break
672+
}
673+
}
674+
}
675+
if match && len(m.GetLabel()) == 2 {
676+
successMetricFoundData = true
677+
if m.GetGauge() != nil {
678+
lastSuccessTimeData = m.GetGauge().GetValue()
679+
}
680+
break
681+
}
682+
}
683+
}
684+
}
685+
if successMetricFoundData {
686+
require.Equal(t, float64(0), lastSuccessTimeData, "should have no successful submission time for data")
687+
} // else: metric not exported because value is 0, which is expected behavior
688+
}
689+
566690
// helper types for table tests
567691
type blockToRecord struct {
568692
chain string
@@ -593,7 +717,7 @@ func calculateExpectedTotal(ranges []expectedRange, blobType string) uint64 {
593717
return total
594718
}
595719

596-
// getMetricValue retrieves the current value of a gauge metric
720+
// getMetricValue retrieves the current value of a metric (gauge or counter)
597721
func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) float64 {
598722
t.Helper()
599723
metrics, err := reg.Gather()
@@ -613,7 +737,13 @@ func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, l
613737
}
614738
}
615739
if match && len(m.GetLabel()) == len(labels) {
616-
return m.GetGauge().GetValue()
740+
// Try gauge first, then counter
741+
if m.GetGauge() != nil {
742+
return m.GetGauge().GetValue()
743+
}
744+
if m.GetCounter() != nil {
745+
return m.GetCounter().GetValue()
746+
}
617747
}
618748
}
619749
}

0 commit comments

Comments
 (0)