
Commit d2a9183

AIQnetLab and claude committed
fix: sync deadlock — sequential window + priority eviction for pending queue
Sync was stuck because parallel_download_microblocks requested ALL missing ranges simultaneously, flooding PENDING_SYNC_BLOCKS (max 1000) with far-away blocks and rejecting the near-height blocks the node needs to advance.

- Limit parallel sync to the nearest 500 blocks (MAX_SYNC_WINDOW); the caller re-invokes as local_height advances — a sliding window
- Smart eviction: when the queue overflows, evict the farthest blocks first so near-height blocks always have priority
- Lower the snapshot threshold from 10K to 2K blocks for faster catch-up

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ea67c68 commit d2a9183

2 files changed: 49 additions & 22 deletions
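
For a sense of scale behind the fix: PENDING_SYNC_BLOCKS caps at 1,000 entries, so a node tens of thousands of blocks behind can have every slot claimed by far-away ranges before the block it must apply next is ever admitted. A toy Rust sketch of that pre-fix admission pattern (the BTreeSet queue, try_admit, and all numbers here are hypothetical stand-ins, not the node's real code):

```rust
use std::collections::BTreeSet;

const MAX_PENDING: usize = 1_000; // mirrors MAX_PENDING_SYNC_BLOCKS

// Pre-fix admission: reject outright when the queue is full.
fn try_admit(queue: &mut BTreeSet<u64>, height: u64) -> bool {
    if queue.len() >= MAX_PENDING {
        return false; // whatever arrives next is dropped, however urgent
    }
    queue.insert(height)
}

fn main() {
    let local_height: u64 = 100;
    let mut queue = BTreeSet::new();

    // Parallel workers deliver far-away blocks first and fill every slot...
    for h in 40_000..41_000 {
        assert!(try_admit(&mut queue, h));
    }
    // ...so the very next block the node needs is rejected: deadlock.
    assert!(!try_admit(&mut queue, local_height + 1));
}
```

The two diffs below attack this from both ends: request fewer far blocks in the first place (sliding window) and, when the queue still fills, evict far blocks rather than reject near ones (priority eviction).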


development/qnet-integration/src/node.rs

Lines changed: 7 additions & 6 deletions
```diff
@@ -7354,18 +7354,19 @@ impl BlockchainNode {
         // Skip snapshot for Light nodes - they sync only recent blocks
         let is_light_node = blockchain_for_sync.node_type == NodeType::Light;
 
-        // v3.3: IMPROVED SNAPSHOT SYNC for returning nodes
+        // v10.3: IMPROVED SNAPSHOT SYNC for returning nodes
         // Try snapshot if:
         // 1. Not a Light node (they don't need full history)
         // 2. EITHER: New node (local_height < 100)
-        //    OR: Large gap (>10,000 blocks = ~2.7 hours offline)
+        //    OR: Large gap (>2,000 blocks = ~33 min offline)
         // 3. Network has at least one full snapshot (43,200+ blocks)
         //
-        // RATIONALE: Node offline for days should use snapshot, not sync 500K blocks!
-        // Example: 500K blocks × 100 batch = 5,000 requests × 25 sec = ~35 hours to sync
-        // With snapshot: download snapshot + sync remaining = minutes
+        // RATIONALE: Block-by-block sync for large gaps floods the
+        // pending queue with parallel requests, causing backpressure
+        // deadlocks. Snapshot avoids this entirely.
+        // Threshold lowered from 10K to 2K to use snapshot more aggressively.
         let gap = network_height.saturating_sub(local_height);
-        let large_gap_threshold = 10_000; // ~2.7 hours worth of blocks
+        let large_gap_threshold = 2_000; // ~33 minutes worth of blocks
         let should_use_snapshot = !is_light_node
             && (local_height < 100 || gap > large_gap_threshold)
             && network_height > SNAPSHOT_FULL_INTERVAL; // Need at least 1 full snapshot
```
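
For intuition on the numbers in this hunk: at the roughly one block per second implied by the old "10,000 blocks = ~2.7 hours" comment, 2,000 blocks is about 33 minutes offline and a full snapshot interval of 43,200 blocks about half a day. A minimal sketch of the decision predicate, pulled out into a free function for illustration (the function itself is hypothetical; the constants and logic mirror the diff):

```rust
const SNAPSHOT_FULL_INTERVAL: u64 = 43_200; // ~12 hours at ~1 block/sec
const LARGE_GAP_THRESHOLD: u64 = 2_000;     // ~33 minutes at ~1 block/sec

/// Hypothetical extraction of the snapshot decision shown in the hunk above.
fn should_use_snapshot(is_light_node: bool, local_height: u64, network_height: u64) -> bool {
    let gap = network_height.saturating_sub(local_height);
    !is_light_node
        && (local_height < 100 || gap > LARGE_GAP_THRESHOLD)
        && network_height > SNAPSHOT_FULL_INTERVAL // need at least one full snapshot
}

fn main() {
    // A full node 3,000 blocks behind now snapshots (the old 10K threshold would not).
    assert!(should_use_snapshot(false, 50_000, 53_000));
    // A node 1,500 blocks behind still syncs block-by-block.
    assert!(!should_use_snapshot(false, 50_000, 51_500));
    // Light nodes never take the snapshot path.
    assert!(!should_use_snapshot(true, 0, 100_000));
}
```

The lower threshold means a node barely half an hour behind already prefers the snapshot path, sidestepping the pending-queue pressure the commit message describes.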

development/qnet-integration/src/unified_p2p.rs

Lines changed: 42 additions & 16 deletions
```diff
@@ -301,28 +301,46 @@ pub fn mark_block_pending_sync(height: u64) -> bool {
         cleanup_pending_sync_blocks();
     }
 
-    // Hard limit: emergency cleanup
+    // Hard limit: emergency cleanup with PRIORITY EVICTION
     if PENDING_SYNC_BLOCKS.len() >= MAX_PENDING_SYNC_BLOCKS {
         let local_height = LOCAL_BLOCKCHAIN_HEIGHT.load(std::sync::atomic::Ordering::Relaxed);
         let mut entries_to_remove: Vec<u64> = Vec::new();
-
-        // Remove entries below local height (already processed)
+
+        // Phase 1: Remove entries below local height (already processed)
         for entry in PENDING_SYNC_BLOCKS.iter() {
             if *entry.key() < local_height.saturating_sub(5) {
                 entries_to_remove.push(*entry.key());
             }
-            if entries_to_remove.len() >= 100 {
-                break;
-            }
         }
-
-        for h in entries_to_remove {
-            PENDING_SYNC_BLOCKS.remove(&h);
+
+        for h in &entries_to_remove {
+            PENDING_SYNC_BLOCKS.remove(h);
         }
-
+
+        // Phase 2: If still full, EVICT FARTHEST blocks from local_height.
+        // Near-height blocks are needed next; far blocks can be re-requested later.
+        // This prevents the deadlock where far blocks occupy all slots and
+        // near blocks (that the node needs to advance) get rejected.
+        if PENDING_SYNC_BLOCKS.len() >= MAX_PENDING_SYNC_BLOCKS {
+            let mut all_heights: Vec<u64> = PENDING_SYNC_BLOCKS.iter()
+                .map(|entry| *entry.key())
+                .collect();
+            // Sort by distance from local_height (farthest first)
+            all_heights.sort_by_key(|h| std::cmp::Reverse(h.abs_diff(local_height)));
+            // Evict top 50% (farthest blocks)
+            let evict_count = all_heights.len() / 2;
+            for h in all_heights.iter().take(evict_count) {
+                PENDING_SYNC_BLOCKS.remove(h);
+            }
+            if crate::node::is_info() {
+                println!("[INFO][SYNC] priority_eviction local_h={} evicted={} remaining={}",
+                    local_height, evict_count, PENDING_SYNC_BLOCKS.len());
+            }
+        }
+
         if PENDING_SYNC_BLOCKS.len() >= MAX_PENDING_SYNC_BLOCKS {
             if crate::node::is_warn() {
-                println!("[WARN][SYNC] queue_full_after_cleanup size={} rejecting={}",
+                println!("[WARN][SYNC] queue_full_after_cleanup size={} rejecting={}",
                     PENDING_SYNC_BLOCKS.len(), height);
             }
             return false;
```
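
The eviction order deserves a sanity check: sorting by std::cmp::Reverse(h.abs_diff(local_height)) puts the farthest heights first, so take(evict_count) drops exactly the blocks that can safely be re-requested later. A self-contained sketch over a plain Vec (hypothetical heights, standing in for the real PENDING_SYNC_BLOCKS map):

```rust
fn main() {
    let local_height: u64 = 1_000;
    // A full queue mixing near-height and far-away pending blocks.
    let mut all_heights: Vec<u64> = vec![1_001, 1_002, 5_000, 9_000, 1_003, 7_500];

    // Farthest-first ordering, as in the hunk above.
    all_heights.sort_by_key(|h| std::cmp::Reverse(h.abs_diff(local_height)));
    assert_eq!(all_heights, vec![9_000, 7_500, 5_000, 1_003, 1_002, 1_001]);

    // Evicting the top 50% removes only the far blocks...
    let evict_count = all_heights.len() / 2;
    let survivors = all_heights.split_off(evict_count);

    // ...so the near-height blocks the node needs next always survive.
    assert_eq!(survivors, vec![1_003, 1_002, 1_001]);
}
```

This flips the failure mode: the old code rejected whatever arrived after overflow, including the one block the node needed, while overflow now always sacrifices the least urgent half instead.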
```diff
@@ -10468,11 +10486,19 @@ impl SimplifiedP2P {
         let parallel_workers: usize = workers;
         let chunk_size_blocks: u64 = chunk_size;
 
-        // v10.1: Download ALL missing blocks in one call.
-        // Wave-based limiting (old WAVE_SIZE=100) was causing our fast sync loop
-        // to need many iterations. Semaphore already limits concurrency.
-        // The caller (fast sync loop) handles re-invocation if network advances.
-        let actual_target = target_height;
+        // v10.3: Download only NEAREST missing blocks, not all at once.
+        // Downloading ALL ranges simultaneously floods PENDING_SYNC_BLOCKS (max 1000)
+        // with far-away blocks, causing backpressure that rejects near-height blocks
+        // the node actually needs next. This creates a sync deadlock where the node
+        // advances ~100 blocks per 60s TTL cycle instead of continuously.
+        //
+        // Solution: limit to nearest 500 blocks. The caller (fast sync loop) re-invokes
+        // as local_height advances, naturally sliding the window forward.
+        const MAX_SYNC_WINDOW: u64 = 500;
+        let actual_target = std::cmp::min(target_height, current_height + MAX_SYNC_WINDOW);
+
+        // Filter missing_blocks to only include blocks within the window
+        missing_blocks.retain(|h| *h <= actual_target);
 
         if crate::node::is_info() {
             println!("[SYNC] ⚡ Starting parallel sync: {} blocks (target: {}) with {} workers",
```
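
The 500-block window is only safe because of the caller contract the comment names: the fast sync loop re-invokes the download as local_height advances. A hypothetical sketch of that interaction (the loop shape and every signature here are assumptions, not code from this commit):

```rust
const MAX_SYNC_WINDOW: u64 = 500;

// Hypothetical stand-ins for the node's real sync primitives.
fn local_height() -> u64 { 0 /* read from chain state */ }
fn network_height() -> u64 { 0 /* read from peer consensus */ }
fn parallel_download_microblocks(_from: u64, _to: u64) { /* request blocks from peers */ }

/// Sketch of a fast sync loop that slides the window forward.
fn fast_sync_loop() {
    loop {
        let current = local_height();
        let target = network_height();
        if current >= target {
            break; // caught up
        }
        // Each call requests at most MAX_SYNC_WINDOW blocks past the current
        // height; as blocks apply and the height advances, the next iteration
        // requests the next window, so the window slides without flooding.
        let window_target = std::cmp::min(target, current + MAX_SYNC_WINDOW);
        parallel_download_microblocks(current + 1, window_target);
    }
}

fn main() {
    fast_sync_loop();
}
```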
