feat(metrics): add comprehensive submission debugging metrics with env control (#29)

auricom · web-flow · commit d9c4ae20b086 · 2025-12-22T19:36:39.000+01:00
diff --git a/README.md b/README.md
@@ -77,6 +77,7 @@ Metrics will be available at `http://localhost:2112/metrics`
 - `--balance.scrape-interval`: Balance check scrape interval in seconds (default: 30)
 - `--verifier.workers`: Number of concurrent workers for block verification (default: 50)
 - `--verbose`: Enable verbose logging (default: false)
+- `--debug`: Enable debug logging for submission details (default: false, can also be set via `EVMETRICS_DEBUG=true` environment variable)
 
 ### Example with Custom Endpoints
 
@@ -150,6 +151,26 @@ When metrics are enabled, the following metrics are exposed:
 - **Labels**: `chain_id`, `type`
 - **Description**: Latest DA height for header and data submissions
 
+### `ev_metrics_submission_attempts_total`
+- **Type**: Counter
+- **Labels**: `chain_id`, `type`
+- **Description**: Total number of DA submission attempts (both successful and failed)
+
+### `ev_metrics_submission_failures_total`
+- **Type**: Counter
+- **Labels**: `chain_id`, `type`
+- **Description**: Total number of failed DA submission attempts
+
+### `ev_metrics_last_submission_attempt_time`
+- **Type**: Gauge
+- **Labels**: `chain_id`, `type`
+- **Description**: Timestamp of the last DA submission attempt (Unix timestamp)
+
+### `ev_metrics_last_successful_submission_time`
+- **Type**: Gauge
+- **Labels**: `chain_id`, `type`
+- **Description**: Timestamp of the last successful DA submission (Unix timestamp)
+
 ### Block Time Metrics
 
 ### `ev_metrics_block_time_seconds`
@@ -253,3 +274,18 @@ When `--balance.addresses` and `--balance.consensus-rpc-urls` are provided:
 - **Type**: Counter
 - **Labels**: `chain_id`, `endpoint`, `error_type`
 - **Description**: Total number of consensus RPC endpoint errors by type
+
+## Debug Logging
+
+Debug logging provides detailed visibility into DA submission process. Enable with `--debug` flag or `EVMETRICS_DEBUG=true` environment variable.
+
+**Use cases**: Troubleshoot submission failures, verify submission flow, diagnose Celestia RPC issues.
+
+**Logs include**: Successful/failed submissions, DA height updates, submission timing.
+
+**Example**:
+```bash
+# Enable debug logging
+export EVMETRICS_DEBUG=true
+./ev-metrics monitor --header-namespace testnet_header --data-namespace testnet_data
+```
diff --git a/pkg/exporters/verifier/verifier.go b/pkg/exporters/verifier/verifier.go
@@ -143,10 +143,12 @@ func (e *exporter) processBlocks(ctx context.Context, m *metrics.Metrics, worker
 
 func (e *exporter) onVerified(m *metrics.Metrics, namespace string, blockHeight, daHeight uint64, verified bool, submissionDuration time.Duration) {
 	if verified {
+		m.RecordSubmissionAttempt(e.chainID, namespace, true)
 		m.RecordSubmissionDaHeight(e.chainID, namespace, daHeight)
 		m.RemoveVerifiedBlock(e.chainID, namespace, blockHeight)
 		m.RecordSubmissionDuration(e.chainID, namespace, submissionDuration)
 	} else {
+		m.RecordSubmissionAttempt(e.chainID, namespace, false)
 		m.RecordMissingBlock(e.chainID, namespace, blockHeight)
 	}
 }
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
@@ -2,6 +2,8 @@ package metrics
 
 import (
 	"fmt"
+	"log"
+	"os"
 	"sort"
 	"strings"
 	"sync"
@@ -29,6 +31,14 @@ type Metrics struct {
 	SubmissionDuration *prometheus.SummaryVec
 	// SubmissionDaHeight tracks the DA height at which blocks were submitted.
 	SubmissionDaHeight *prometheus.GaugeVec
+	// SubmissionAttemptsTotal tracks the total number of submission attempts.
+	SubmissionAttemptsTotal *prometheus.CounterVec
+	// SubmissionFailuresTotal tracks the total number of failed submission attempts.
+	SubmissionFailuresTotal *prometheus.CounterVec
+	// LastSubmissionAttemptTime tracks the timestamp of the last submission attempt.
+	LastSubmissionAttemptTime *prometheus.GaugeVec
+	// LastSuccessfulSubmissionTime tracks the timestamp of the last successful submission.
+	LastSuccessfulSubmissionTime *prometheus.GaugeVec
 	// BlockTime tracks the time between consecutive blocks with histogram buckets for accurate SLO calculations.
 	BlockTime *prometheus.HistogramVec
 	// BlockTimeSummary tracks block time with percentiles over a rolling window.
@@ -72,6 +82,11 @@ type Metrics struct {
 	ranges map[string][]*blockRange // key: blobType -> sorted slice of ranges
 }
 
+// isDebugEnabled returns true if debug logging is enabled via environment variable
+func isDebugEnabled() bool {
+	return strings.ToLower(os.Getenv("EVMETRICS_DEBUG")) == "true"
+}
+
 type blockRange struct {
 	start uint64
 	end   uint64
@@ -289,6 +304,38 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
 			},
 			[]string{"chain_id", "endpoint", "error_type"},
 		),
+		SubmissionAttemptsTotal: factory.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "submission_attempts_total",
+				Help:      "total number of DA submission attempts",
+			},
+			[]string{"chain_id", "type"},
+		),
+		SubmissionFailuresTotal: factory.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "submission_failures_total",
+				Help:      "total number of failed DA submission attempts",
+			},
+			[]string{"chain_id", "type"},
+		),
+		LastSubmissionAttemptTime: factory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "last_submission_attempt_time",
+				Help:      "timestamp of the last DA submission attempt",
+			},
+			[]string{"chain_id", "type"},
+		),
+		LastSuccessfulSubmissionTime: factory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "last_successful_submission_time",
+				Help:      "timestamp of the last successful DA submission",
+			},
+			[]string{"chain_id", "type"},
+		),
 		ranges:                  make(map[string][]*blockRange),
 		lastBlockArrivalTime:    make(map[string]time.Time),
 		lastSubmissionDurations: make(map[string]time.Duration),
@@ -297,6 +344,34 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
 	return m
 }
 
+// RecordSubmissionAttempt records a submission attempt and updates related metrics
+func (m *Metrics) RecordSubmissionAttempt(chainID, submissionType string, success bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Always record the attempt
+	m.SubmissionAttemptsTotal.WithLabelValues(chainID, submissionType).Inc()
+
+	if !success {
+		m.SubmissionFailuresTotal.WithLabelValues(chainID, submissionType).Inc()
+	}
+
+	// Record timestamp of this attempt
+	now := time.Now()
+	m.LastSubmissionAttemptTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
+
+	if success {
+		m.LastSuccessfulSubmissionTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
+		if isDebugEnabled() {
+			log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		}
+	} else {
+		if isDebugEnabled() {
+			log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		}
+	}
+}
+
 // RecordSubmissionDaHeight records the DA height only if it's higher than previously recorded
 func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHeight uint64) {
 	m.mu.Lock()
@@ -306,6 +381,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
 		if daHeight > m.latestHeaderDaHeight {
 			m.latestHeaderDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "header").Set(float64(daHeight))
+			// Debug log when submission DA height is recorded
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
+			}
+		} else {
+			// Debug log when DA height is not higher than previous
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
+			}
 		}
 		return
 	}
@@ -314,6 +398,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
 		if daHeight > m.latestDataDaHeight {
 			m.latestDataDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "data").Set(float64(daHeight))
+			// Debug log when submission DA height is recorded
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
+			}
+		} else {
+			// Debug log when DA height is not higher than previous
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
+			}
 		}
 	}
 }
@@ -530,6 +623,7 @@ func (m *Metrics) RecordSubmissionDuration(chainID, submissionType string, durat
 
 	key := fmt.Sprintf("%s:%s", chainID, submissionType)
 	m.lastSubmissionDurations[key] = duration
+
 }
 
 // RefreshSubmissionDuration re-observes the last known submission duration to keep the metric alive.
diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go
@@ -563,6 +563,130 @@ func TestMetrics_RefreshSubmissionDuration_Empty(t *testing.T) {
 	})
 }
 
+func TestMetrics_RecordSubmissionAttempt(t *testing.T) {
+	reg := prometheus.NewRegistry()
+	m := NewWithRegistry("test", reg)
+
+	// Test successful submission
+	beforeAttempt := time.Now()
+	m.RecordSubmissionAttempt("testchain", "header", true)
+	afterAttempt := time.Now()
+
+	// Verify counters
+	attempts := getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.Equal(t, float64(1), attempts, "should have 1 attempt")
+
+	// For successful submission, failures should be 0 (metric may not be exported if 0)
+	// We'll check if the metric exists and has value 0, or doesn't exist (both are valid)
+	failuresMetricFound := false
+	failures := float64(0)
+	metrics, err := reg.Gather()
+	require.NoError(t, err)
+	for _, mf := range metrics {
+		if mf.GetName() == "test_submission_failures_total" {
+			for _, m := range mf.GetMetric() {
+				// check if labels match
+				match := true
+				for _, label := range m.GetLabel() {
+					if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "header"}[label.GetName()]; ok {
+						if label.GetValue() != expectedVal {
+							match = false
+							break
+						}
+					}
+				}
+				if match && len(m.GetLabel()) == 2 {
+					failuresMetricFound = true
+					if m.GetCounter() != nil {
+						failures = m.GetCounter().GetValue()
+					}
+					break
+				}
+			}
+		}
+	}
+	if failuresMetricFound {
+		require.Equal(t, float64(0), failures, "should have 0 failures")
+	} // else: metric not exported because value is 0, which is expected behavior
+
+	// Verify timestamps are within expected range
+	lastAttemptTime := getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.GreaterOrEqual(t, lastAttemptTime, float64(beforeAttempt.Unix()))
+	require.LessOrEqual(t, lastAttemptTime, float64(afterAttempt.Unix()))
+
+	lastSuccessTime := getMetricValue(t, reg, "test_last_successful_submission_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.GreaterOrEqual(t, lastSuccessTime, float64(beforeAttempt.Unix()))
+	require.LessOrEqual(t, lastSuccessTime, float64(afterAttempt.Unix()))
+
+	// Test failed submission
+	beforeFailure := time.Now()
+	m.RecordSubmissionAttempt("testchain", "data", false)
+	afterFailure := time.Now()
+
+	// Verify counters
+	attempts = getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.Equal(t, float64(1), attempts, "should have 1 attempt for data")
+
+	failures = getMetricValue(t, reg, "test_submission_failures_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.Equal(t, float64(1), failures, "should have 1 failure for data")
+
+	// Verify timestamps - should have attempt time but not success time for failed submission
+	lastAttemptTime = getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.GreaterOrEqual(t, lastAttemptTime, float64(beforeFailure.Unix()))
+	require.LessOrEqual(t, lastAttemptTime, float64(afterFailure.Unix()))
+
+	// Last successful submission time should still be 0 for data type (never succeeded)
+	// Gauge metrics with 0 values may not be exported, so we need to check if it exists
+	var lastSuccessTimeData float64
+	var successMetricFoundData bool
+	metrics, err = reg.Gather()
+	require.NoError(t, err)
+	for _, mf := range metrics {
+		if mf.GetName() == "test_last_successful_submission_time" {
+			for _, m := range mf.GetMetric() {
+				// check if labels match
+				match := true
+				for _, label := range m.GetLabel() {
+					if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "data"}[label.GetName()]; ok {
+						if label.GetValue() != expectedVal {
+							match = false
+							break
+						}
+					}
+				}
+					if match && len(m.GetLabel()) == 2 {
+					successMetricFoundData = true
+					if m.GetGauge() != nil {
+						lastSuccessTimeData = m.GetGauge().GetValue()
+					}
+					break
+				}
+			}
+		}
+	}
+	if successMetricFoundData {
+		require.Equal(t, float64(0), lastSuccessTimeData, "should have no successful submission time for data")
+	} // else: metric not exported because value is 0, which is expected behavior
+}
+
 // helper types for table tests
 type blockToRecord struct {
 	chain    string
@@ -593,7 +717,7 @@ func calculateExpectedTotal(ranges []expectedRange, blobType string) uint64 {
 	return total
 }
 
-// getMetricValue retrieves the current value of a gauge metric
+// getMetricValue retrieves the current value of a metric (gauge or counter)
 func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) float64 {
 	t.Helper()
 	metrics, err := reg.Gather()
@@ -613,7 +737,13 @@ func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, l
 					}
 				}
 				if match && len(m.GetLabel()) == len(labels) {
-					return m.GetGauge().GetValue()
+					// Try gauge first, then counter
+					if m.GetGauge() != nil {
+						return m.GetGauge().GetValue()
+					}
+					if m.GetCounter() != nil {
+						return m.GetCounter().GetValue()
+					}
 				}
 			}
 		}

Original file line number	Diff line number	Diff line change
`@@ -143,10 +143,12 @@ func (e exporter) processBlocks(ctx context.Context, m metrics.Metrics, worker`
`143`	`143`
`144`	`144`	`func (e exporter) onVerified(m metrics.Metrics, namespace string, blockHeight, daHeight uint64, verified bool, submissionDuration time.Duration) {`
`145`	`145`	`if verified {`
	`146`	`+ m.RecordSubmissionAttempt(e.chainID, namespace, true)`
`146`	`147`	`m.RecordSubmissionDaHeight(e.chainID, namespace, daHeight)`
`147`	`148`	`m.RemoveVerifiedBlock(e.chainID, namespace, blockHeight)`
`148`	`149`	`m.RecordSubmissionDuration(e.chainID, namespace, submissionDuration)`
`149`	`150`	`} else {`
	`151`	`+ m.RecordSubmissionAttempt(e.chainID, namespace, false)`
`150`	`152`	`m.RecordMissingBlock(e.chainID, namespace, blockHeight)`
`151`	`153`	`}`
`152`	`154`	`}`