Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public static class Counter {
public static final String TXS = "tron:txs";
public static final String MINER = "tron:miner";
public static final String BLOCK_FORK = "tron:block_fork";
public static final String SR_SET_CHANGE = "tron:sr_set_change";
public static final String P2P_ERROR = "tron:p2p_error";
public static final String P2P_DISCONNECT = "tron:p2p_disconnect";
public static final String INTERNAL_SERVICE_FAIL = "tron:internal_service_fail";
Expand Down Expand Up @@ -62,6 +63,7 @@ public static class Histogram {
public static final String MESSAGE_PROCESS_LATENCY = "tron:message_process_latency_seconds";
public static final String BLOCK_FETCH_LATENCY = "tron:block_fetch_latency_seconds";
public static final String BLOCK_RECEIVE_DELAY = "tron:block_receive_delay_seconds";
public static final String BLOCK_TRANSACTION_COUNT = "tron:block_transaction_count";

private Histogram() {
throw new IllegalStateException("Histogram");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public static class Counter {
public static final String TXS_FAIL_SIG = "sig";
public static final String TXS_FAIL_TAPOS = "tapos";
public static final String TXS_FAIL_DUP = "dup";
public static final String SR_ADD = "add";
public static final String SR_REMOVE = "remove";

private Counter() {
throw new IllegalStateException("Counter");
Expand Down Expand Up @@ -66,6 +68,7 @@ private Gauge() {

// Histogram
public static class Histogram {
public static final String MINER = "miner";
public static final String TRAFFIC_IN = "in";
public static final String TRAFFIC_OUT = "out";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class MetricsCounter {
init(MetricKeys.Counter.TXS, "tron txs info .", "type", "detail");
init(MetricKeys.Counter.MINER, "tron miner info .", "miner", "type");
init(MetricKeys.Counter.BLOCK_FORK, "tron block fork info .", "type");
init(MetricKeys.Counter.SR_SET_CHANGE, "tron sr set change .", "action", "witness");
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The witness label uses addresses. Although the Active SR pool is relatively limited (27 slots), as SRs rotate in and out over time, label values will accumulate monotonically — they only grow, never shrink. This is a known Prometheus anti-pattern.

Practical impact assessment:

  • TRON's SR candidate pool is not infinite, and rotation frequency is low, so it won't explode in the short term

  • However, as a long-running node metric, months or years of operation could still accumulate a significant number of time series

Possible approaches:

  • If the address dimension is truly needed, keep the current design, but document this characteristic

  • Or remove the witness label entirely, keeping only action (add/remove), and output address details via logs instead. Follow the principle that Prometheus should be used for alerting and awareness, while logs are for precise investigation and root-cause analysis.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for raising the high-cardinality concern — it's a valid Prometheus best practice to watch for.

However, I believe the witness label is acceptable here given TRON's specific constraints:

  1. Bounded candidate pool: The SR candidate pool is finite. Even over years of operation, the total number of distinct addresses that have ever held an SR slot has a practical ceiling — far from the
    unbounded growth typical of high-cardinality anti-patterns.
  2. Low rotation frequency: SR set changes only occur at maintenance intervals (every 6 hours), so new label values are introduced very infrequently.
  3. Diagnostic value: The primary purpose of this metric is to identify which SR was added or removed. Dropping the witness label would reduce the metric to just "something changed", requiring operators to
    cross-reference logs — losing the at-a-glance visibility in Grafana dashboards that makes this metric useful in the first place.

In summary, the worst-case time series count is roughly candidate_pool_size × 2 (add/remove), which stays well within Prometheus's comfortable operating range.

That said, I agree it's worth documenting this characteristic. Could you suggest what form you'd prefer — a code comment at the metric registration site, a note in the PR description, or something else?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer a code comment at the metric registration site.

init(MetricKeys.Counter.P2P_ERROR, "tron p2p error info .", "type");
init(MetricKeys.Counter.P2P_DISCONNECT, "tron p2p disconnect .", "type");
init(MetricKeys.Counter.INTERNAL_SERVICE_FAIL, "internal Service fail.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.JSONRPC_SERVICE_LATENCY, "JsonRpc Service latency.",
"method");
init(MetricKeys.Histogram.MINER_LATENCY, "miner latency.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.PING_PONG_LATENCY, "node ping pong latency.");
init(MetricKeys.Histogram.VERIFY_SIGN_LATENCY, "verify sign latency for trx , block.",
"type");
Expand All @@ -36,7 +36,7 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.PROCESS_TRANSACTION_LATENCY, "process transaction latency.",
"type", "contract");
init(MetricKeys.Histogram.MINER_DELAY, "miner delay time, actualTime - planTime.",
"miner");
MetricLabels.Histogram.MINER);
init(MetricKeys.Histogram.UDP_BYTES, "udp_bytes traffic.",
"type");
init(MetricKeys.Histogram.TCP_BYTES, "tcp_bytes traffic.",
Expand All @@ -48,6 +48,11 @@ public class MetricsHistogram {
init(MetricKeys.Histogram.BLOCK_FETCH_LATENCY, "fetch block latency.");
init(MetricKeys.Histogram.BLOCK_RECEIVE_DELAY,
"receive block delay time, receiveTime - blockTime.");

init(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT,
"Distribution of transaction counts per block.",
new double[]{0, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000},
MetricLabels.Histogram.MINER);
}

private MetricsHistogram() {
Expand All @@ -62,6 +67,17 @@ private static void init(String name, String help, String... labels) {
.register());
}

// Registers a histogram under {@code name} with optional custom bucket
// boundaries; when {@code buckets} is null or empty the Prometheus client's
// default buckets are used. The registered collector is cached in the
// container keyed by metric name.
private static void init(String name, String help, double[] buckets, String... labels) {
  Histogram.Builder histogramBuilder = Histogram.build()
      .name(name)
      .help(help)
      .labelNames(labels);
  boolean hasCustomBuckets = buckets != null && buckets.length > 0;
  if (hasCustomBuckets) {
    histogramBuilder = histogramBuilder.buckets(buckets);
  }
  container.put(name, histogramBuilder.register());
}

static Histogram.Timer startTimer(String key, String... labels) {
if (Metrics.enabled()) {
Histogram histogram = container.get(key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import com.codahale.metrics.Counter;
import com.google.protobuf.ByteString;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.Setter;
import org.bouncycastle.util.encoders.Hex;
Expand Down Expand Up @@ -42,6 +45,9 @@ public class BlockChainMetricManager {
private long failProcessBlockNum = 0;
@Setter
private String failProcessBlockReason = "";
private final Set<String> lastActiveWitnesses = ConcurrentHashMap.newKeySet();
// To control SR set change metric update logic, -1 means not initialized
private long lastNextMaintenanceTime = -1;

public BlockChainInfo getBlockChainInfo() {
BlockChainInfo blockChainInfo = new BlockChainInfo();
Expand Down Expand Up @@ -164,11 +170,52 @@ public void applyBlock(BlockCapsule block) {
}

//TPS
if (block.getTransactions().size() > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, block.getTransactions().size());
Metrics.counterInc(MetricKeys.Counter.TXS, block.getTransactions().size(),
int txCount = block.getTransactions().size();
if (txCount > 0) {
MetricsUtil.meterMark(MetricsKey.BLOCKCHAIN_TPS, txCount);
Metrics.counterInc(MetricKeys.Counter.TXS, txCount,
MetricLabels.Counter.TXS_SUCCESS, MetricLabels.Counter.TXS_SUCCESS);
}
// Record transaction count distribution for all blocks (including empty blocks)
Metrics.histogramObserve(MetricKeys.Histogram.BLOCK_TRANSACTION_COUNT, txCount,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

metricsService.applyBlock(block) runs before dbManager.pushBlock(block) in TronNetDelegate.processBlock(), so this histogram is updated for blocks that are later rejected or only live on a fork. For an empty-block monitoring metric, that produces false positives per witness and lets invalid peer traffic skew the distribution. This should be recorded only after the block has been applied successfully.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I actually noticed this ordering issue during my investigation — metricsService.applyBlock() is called before dbManager.pushBlock(), meaning all metrics in BlockChainMetricManager.applyBlock() are recorded for blocks that may later be rejected.

However, this is not specific to the histogram I added — the existing metrics in the same method (e.g., MINER_LATENCY, TXS, NET_LATENCY, duplicate witness detection) all share the same problem. I chose to keep the same pattern to maintain architectural consistency and avoid expanding the scope of this PR.

If you agree, I'd like to open a separate issue to move metricsService.applyBlock() after dbManager.pushBlock() succeeds in TronNetDelegate.processBlock(), which would fix the timing problem for all metrics at once.

StringUtil.encode58Check(address));

// SR set change detection
long nextMaintenanceTime = dbManager.getDynamicPropertiesStore().getNextMaintenanceTime();
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SR-set change detection is reading nextMaintenanceTime and getActiveWitnesses() before the current block is applied. In the real pipeline those values are updated during dbManager.pushBlock() / consensus.applyBlock(), so on the maintenance block this branch still sees the old schedule and old maintenance timestamp. That makes tron:sr_set_change show up one block late, and the added test masks the problem by mutating both stores manually before generating the next block.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same root cause as above — since metricsService.applyBlock() runs before dbManager.pushBlock(), the maintenance time and active witness list haven't been updated yet at the point of reading.

In fact, this is precisely why the SR-set change detection relies on nextMaintenanceTime change as the trigger — because getActiveWitnesses() still returns the old schedule at this point, comparing against the previous nextMaintenanceTime is the only way to detect that a maintenance cycle has occurred and capture the witness list transition.

This would be addressed together in that follow-up issue: once metricsService.applyBlock() is moved after a successful pushBlock(), the SR-set change detection will naturally read the up-to-date state without needing this workaround.

if (lastNextMaintenanceTime == -1) {
lastNextMaintenanceTime = nextMaintenanceTime;
lastActiveWitnesses.addAll(chainBaseManager.getWitnessScheduleStore().getActiveWitnesses()
.stream().map(w -> StringUtil.encode58Check(w.toByteArray()))
.collect(Collectors.toSet()));
} else if (nextMaintenanceTime != lastNextMaintenanceTime) {
Set<String> currentWitnesses = chainBaseManager.getWitnessScheduleStore().getActiveWitnesses()
.stream()
.map(w -> StringUtil.encode58Check(w.toByteArray()))
.collect(Collectors.toSet());
recordSrSetChange(currentWitnesses);
lastNextMaintenanceTime = nextMaintenanceTime;
}
}

/**
 * Emits one SR_SET_CHANGE counter sample per witness that joined or left the
 * active SR set (compared against the last snapshot), then refreshes the
 * snapshot for the next maintenance-cycle comparison.
 *
 * @param currentWitnesses base58check-encoded addresses of the current active SRs
 */
private void recordSrSetChange(Set<String> currentWitnesses) {
  // Addresses present now but missing from the previous snapshot.
  Set<String> joined = new HashSet<>(currentWitnesses);
  joined.removeAll(lastActiveWitnesses);

  // Addresses present in the previous snapshot but missing now.
  Set<String> left = new HashSet<>(lastActiveWitnesses);
  left.removeAll(currentWitnesses);

  joined.forEach(addr -> Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
      MetricLabels.Counter.SR_ADD, addr));
  left.forEach(addr -> Metrics.counterInc(MetricKeys.Counter.SR_SET_CHANGE, 1,
      MetricLabels.Counter.SR_REMOVE, addr));

  // Only refresh the snapshot when the set actually changed.
  if (!(joined.isEmpty() && left.isEmpty())) {
    lastActiveWitnesses.clear();
    lastActiveWitnesses.addAll(currentWitnesses);
  }
}

private List<WitnessInfo> getSrList() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
Expand All @@ -25,6 +26,7 @@
import org.tron.common.utils.ByteArray;
import org.tron.common.utils.PublicMethod;
import org.tron.common.utils.Sha256Hash;
import org.tron.common.utils.StringUtil;
import org.tron.common.utils.Utils;
import org.tron.consensus.dpos.DposSlot;
import org.tron.core.ChainBaseManager;
Expand All @@ -38,6 +40,8 @@

@Slf4j(topic = "metric")
public class PrometheusApiServiceTest extends BaseTest {


static LocalDateTime localDateTime = LocalDateTime.now();
@Resource
private DposSlot dposSlot;
Expand Down Expand Up @@ -65,7 +69,7 @@ protected static void initParameter(CommonParameter parameter) {
parameter.setMetricsPrometheusEnable(true);
}

protected void check() throws Exception {
protected void check(byte[] address, Map<ByteString, String> witnessAndAccount) throws Exception {
Double memoryBytes = CollectorRegistry.defaultRegistry.getSampleValue(
"system_total_physical_memory_bytes");
Assert.assertNotNull(memoryBytes);
Expand All @@ -80,6 +84,55 @@ protected void check() throws Exception {
new String[] {"sync"}, new String[] {"false"});
Assert.assertNotNull(pushBlock);
Assert.assertEquals(pushBlock.intValue(), blocks + 1);

String minerBase58 = StringUtil.encode58Check(address);
// Query histogram bucket le="0.0" for empty blocks
Double emptyBlock = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:block_transaction_count_bucket",
new String[] {MetricLabels.Histogram.MINER, "le"}, new String[] {minerBase58, "0.0"});

Assert.assertNotNull("Empty block bucket should exist for miner: " + minerBase58, emptyBlock);
Assert.assertEquals("Should have 1 empty block", 1, emptyBlock.intValue());

// Check SR_REMOVE for initial address (removed when addTestWitnessAndAccount() is called)
Double srRemoveCount = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:sr_set_change_total",
new String[] {"action", "witness"},
new String[] {MetricLabels.Counter.SR_REMOVE, minerBase58}
);
Assert.assertNotNull(srRemoveCount);
Assert.assertEquals(1, srRemoveCount.intValue());

// Check SR_ADD and empty blocks for each new witness in witnessAndAccount
// (excluding initial address)
ByteString addressByteString = ByteString.copyFrom(address);
int totalNewWitnessEmptyBlocks = 0;
for (ByteString witnessAddress : witnessAndAccount.keySet()) {
if (witnessAddress.equals(addressByteString)) {
continue; // Skip initial address
}
String witnessBase58 = StringUtil.encode58Check(witnessAddress.toByteArray());

// Check SR_ADD
Double srAddCount = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:sr_set_change_total",
new String[] {"action", "witness"},
new String[] {MetricLabels.Counter.SR_ADD, witnessBase58}
);
Assert.assertNotNull("SR_ADD should be recorded for witness: " + witnessBase58,
srAddCount);
Assert.assertEquals("Each new witness should have 1 SR_ADD record", 1,
srAddCount.intValue());

// Collect empty blocks count from histogram bucket
int witnessEmptyBlock = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:block_transaction_count_bucket",
new String[] {MetricLabels.Histogram.MINER, "le"}, new String[] {witnessBase58, "0.0"})
.intValue();
totalNewWitnessEmptyBlocks += witnessEmptyBlock;
}
Assert.assertEquals(blocks, totalNewWitnessEmptyBlocks);

Double errorLogs = CollectorRegistry.defaultRegistry.getSampleValue(
"tron:error_info_total", new String[] {"net"}, new String[] {MetricLabels.UNDEFINED});
Assert.assertNull(errorLogs);
Expand Down Expand Up @@ -130,10 +183,23 @@ public void testMetric() throws Exception {

Map<ByteString, String> witnessAndAccount = addTestWitnessAndAccount();
witnessAndAccount.put(ByteString.copyFrom(address), key);

// Explicitly update WitnessScheduleStore to remove initial address,
// triggering SR_REMOVE metric
List<ByteString> newActiveWitnesses = new ArrayList<>(witnessAndAccount.keySet());
newActiveWitnesses.remove(ByteString.copyFrom(address));
chainBaseManager.getWitnessScheduleStore().saveActiveWitnesses(newActiveWitnesses);

// Update nextMaintenanceTime to trigger SR set change detection
long nextMaintenanceTime =
chainBaseManager.getDynamicPropertiesStore().getNextMaintenanceTime();
chainBaseManager.getDynamicPropertiesStore().updateNextMaintenanceTime(
nextMaintenanceTime + 3600_000L);

for (int i = 0; i < blocks; i++) {
generateBlock(witnessAndAccount);
}
check();
check(address, witnessAndAccount);
}

private Map<ByteString, String> addTestWitnessAndAccount() {
Expand Down
Loading