Skip to content

Commit 4d02703

Browse files
committed
fix(validator): add BlockSync reconnection + startup weight submission
- BlockSync now reconnects on health check failure (not just subtensor)
- Weights are submitted immediately at startup (not just hourly)
- Prevents complete block event starvation after WebSocket death
- A fresh BlockSync ensures blocks resume flowing after reconnect

Both issues could leave the validator in a zombie state:
1. Old: BlockSync never reconnected; only the metagraph client did
2. Old: No weights were submitted at startup; the validator waited up to 60 min for the first submit
1 parent 6eb287a commit 4d02703

2 files changed

Lines changed: 43 additions & 1 deletion

File tree

bins/validator-node/src/main.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ async fn main() -> Result<()> {
543543
let mut subtensor_client: Option<SubtensorClient>;
544544
let mut bittensor_client_for_metagraph: Option<Arc<BittensorClient>>;
545545
let mut block_rx: Option<tokio::sync::mpsc::Receiver<BlockSyncEvent>> = None;
546+
let block_sync_netuid: u16 = args.netuid;
546547
let mut weight_task_handle: Option<WeightTaskHandle> = None;
547548
let mut background_weight_handler: Option<BackgroundWeightHandler> = None;
548549
let mut standalone_weight_submitter: Option<Arc<StandaloneWeightSubmitter>> = None;
@@ -2326,6 +2327,33 @@ async fn main() -> Result<()> {
23262327
error!("Failed to recreate Bittensor client during health-check: {}", e);
23272328
}
23282329
}
2330+
// Recreate BlockSync to get fresh block events
2331+
info!("Recreating BlockSync with fresh connection...");
2332+
let mut sync = BlockSync::new(BlockSyncConfig {
2333+
netuid: block_sync_netuid,
2334+
..Default::default()
2335+
});
2336+
let rx = sync.take_event_receiver();
2337+
2338+
match bittensor_client_for_metagraph.as_ref() {
2339+
Some(client) => {
2340+
if let Err(e) = sync.connect(client.clone()).await {
2341+
error!("Failed to reconnect BlockSync: {}", e);
2342+
} else {
2343+
tokio::spawn(async move {
2344+
if let Err(e) = sync.start().await {
2345+
error!("Block sync error after reconnect: {}", e);
2346+
}
2347+
});
2348+
block_rx = rx;
2349+
last_block_event_time = std::time::Instant::now();
2350+
info!("BlockSync reconnected successfully");
2351+
}
2352+
}
2353+
None => {
2354+
warn!("No bittensor_client_for_metagraph available for BlockSync reconnection");
2355+
}
2356+
}
23292357
}
23302358
}
23312359
}

bins/validator-node/src/standalone_weight_submitter.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,23 @@ impl StandaloneWeightSubmitter {
104104

105105
info!(
106106
url = WEIGHT_RPC_URL,
107-
"Standalone weight submitter started (hourly at :00)"
107+
"Standalone weight submitter started (hourly at :00 + startup)"
108108
);
109109

110+
info!("Submitting weights at startup...");
111+
match self.submit_with_retry().await {
112+
Ok(()) => {
113+
let now = Utc::now();
114+
let hour_epoch = now.timestamp() / 3600;
115+
let mut last = self.last_submission_hour.lock().await;
116+
*last = Some(hour_epoch);
117+
info!(hour = hour_epoch, "Startup weights submitted successfully");
118+
}
119+
Err(e) => {
120+
error!(error = %e, "Failed to submit weights at startup (will retry at next hourly checkpoint)");
121+
}
122+
}
123+
110124
loop {
111125
interval.tick().await;
112126

0 commit comments

Comments
 (0)