From 8f876ba29cc65bd696fc3fb863aa85e148fa2c75 Mon Sep 17 00:00:00 2001
From: Stuart Inglis <stuart.inglis@exaba.com>
Date: Wed, 1 Jul 2026 07:29:16 +1200
Subject: [PATCH] match: bound the hash_search() chain walk (issue #217)

hash_search() walks the entire hash-table chain for the current rolling
checksum at every byte offset of the source file. Disk and VM images
contain large runs of identical blocks, so a single weak checksum
(get_checksum1) can collide thousands of times and pile every one of
those blocks onto one chain. When the sender then rolls across a region
whose weak checksum keeps landing on that chain without ever producing a
strong-checksum match, it re-walks the whole chain for every byte, giving
O(file_size * chain_length) behaviour. The result is rsync sitting at
100% CPU for hours with no apparent progress -- the long-standing "rsync
hangs on large files" reports.

Cap the number of same-weak-checksum candidates examined per offset at
MAX_CHAIN_LEN. Once the cap is hit we treat the offset as a non-match and
roll forward a byte; any block skipped this way is simply sent as literal
data, so the transferred result is always correct -- only the transfer
size is marginally affected. This is purely a sender-side search limit:
it changes no checksum, emitted byte, or protocol field, so a capped
sender interoperates with an unmodified receiver and vice versa.

On a synthetic 40000-block basis sharing one weak checksum, syncing a
60KB source dropped from ~18.4s to ~0.7s; the unbounded cost grows with
the square of the file size.

testsuite/hashsearch-chain_test.py reproduces the pathology with a tiny
basis of weak-checksum-colliding decoy blocks and asserts, via the
existing false_alarms counter (--debug=deltasum1), that the per-hash-hit
chain walk stays bounded. The assertion is exact and machine-independent
rather than timing-based.
---
 match.c                            |  32 +++++++++
 testsuite/hashsearch-chain_test.py | 108 +++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 testsuite/hashsearch-chain_test.py

diff --git a/match.c b/match.c
index dfd6af2c9..82a80a08b 100644
--- a/match.c
+++ b/match.c
@@ -44,6 +44,29 @@ extern struct stats stats;
 
 #define TRADITIONAL_TABLESIZE (1<<16)
 
+/* The maximum number of same-weak-checksum candidates we will compare
+ * against at a single file offset before giving up and rolling forward a
+ * byte.  A weak checksum that collides thousands of times (very common in
+ * disk/VM images, which contain large runs of identical blocks) would
+ * otherwise turn hash_search()'s inner loop into an O(file_size *
+ * chain_length) scan, pegging a CPU at 100% for hours with no apparent
+ * progress (issue #217).
+ *
+ * Concretely, a synthetic 40000-block basis whose blocks all share one weak
+ * checksum took ~18.4s to sync a 60KB source on a modern x86_64 box before
+ * this cap and ~0.7s after it -- and the unbounded cost grows with the
+ * square of the file size, which is what produced the multi-hour "hangs"
+ * reported against real multi-GB images.
+ *
+ * Capping the per-offset work keeps the search bounded; any block we skip
+ * over is simply sent as literal data, so the result is always correct --
+ * only the transfer size is (slightly) affected.  This is purely a
+ * sender-side search limit: it changes no checksum, emitted byte, or
+ * protocol field, so a capped sender interoperates with any receiver. */
+#ifndef MAX_CHAIN_LEN
+#define MAX_CHAIN_LEN 1024
+#endif
+
 static uint32 tablesize;
 static int32 *hash_table;
 
@@ -182,6 +205,7 @@ static void hash_search(int f,struct sum_struct *s,
 		int done_csum2 = 0;
 		uint32 hash_entry;
 		int32 i, *prev;
+		int32 chain_len = 0;
 
 		if (DEBUG_GTE(DELTASUM, 4)) {
 			rprintf(FINFO, "offset=%s sum=%04x%04x\n",
@@ -218,6 +242,14 @@ static void hash_search(int f,struct sum_struct *s,
 			if (sum != s->sums[i].sum1)
 				continue;
 
+			/* Bound the work spent on a single pathological hash
+			 * bucket.  If this weak checksum matches more than
+			 * MAX_CHAIN_LEN records, stop scanning and treat this
+			 * offset as a non-match (issue #217).  The skipped data
+			 * is sent literally, never corrupted. */
+			if (++chain_len > MAX_CHAIN_LEN)
+				break;
+
 			/* also make sure the two blocks are the same length */
 			l = (int32)MIN((OFF_T)s->blength, len-offset);
 			if (l != s->sums[i].len)
diff --git a/testsuite/hashsearch-chain_test.py b/testsuite/hashsearch-chain_test.py
new file mode 100644
index 000000000..9dcb45b45
--- /dev/null
+++ b/testsuite/hashsearch-chain_test.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Regression test for issue #217: hash_search() must not blow up to O(N^2).
+
+A disk/VM image can contain a huge number of blocks that share the same weak
+rolling checksum (e.g. long runs of identical bytes).  All such blocks land on
+a single hash-table chain.  When the sender then rolls across a region whose
+weak checksum keeps landing on that chain but never produces a strong-checksum
+match, the inner loop of hash_search() used to walk the *entire* chain for
+every single byte offset.  The transfer would peg one CPU at 100% for hours
+with no visible progress -- "rsync hangs at 100% CPU".
+
+We reproduce a small, deterministic version of that pathology:
+
+  * Destination/basis file: many identical "decoy" blocks.  Each decoy is an
+    all-C block whose first three bytes are perturbed by (+1,-2,+1).  That
+    perturbation leaves rsync's weak checksum (get_checksum1) unchanged but
+    changes the strong checksum, so every decoy shares one weak checksum yet
+    never strong-matches the source -> one very long chain, all false alarms.
+
+  * Source file: a short run of the constant byte C.  Every window has that
+    same weak checksum, so without a bound the sender walks the whole chain at
+    every offset.
+
+Rather than measure wall-clock time (which is hopelessly machine dependent),
+we count the work directly.  rsync's existing `false_alarms` counter -- shown
+by `--debug=deltasum1` -- is incremented once per strong-checksum comparison
+that fails, i.e. exactly once per chain entry examined and rejected.  With the
+per-offset cap in match.c the sender examines at most MAX_CHAIN_LEN entries per
+hash hit, so false_alarms / hash_hits stays bounded no matter how long the
+chain is; without the cap that ratio equals the full chain length.  We assert
+the bound, which is the same integer on every machine.
+
+The fix is sender-only (it changes no checksum, byte, or protocol field), so
+the transferred result must also still be byte-for-byte correct.
+"""
+
+import re
+
+from rsyncfns import (
+    FROMDIR, TODIR, assert_same, rmtree, run_rsync, test_fail,
+)
+
+BLOCK = 256
+NBLOCKS = 8000           # chain length -- much longer than match.c's cap (1024)
+RUN = 3000               # length of the constant-byte region in the source
+C = 100                  # the constant source byte
+
+# A correct fix bounds the per-hash-hit chain walk to match.c's MAX_CHAIN_LEN
+# (1024).  Pick an assertion bound comfortably between that cap and the full
+# chain length so the test is insensitive to modest cap retuning but still
+# fails hard for the unbounded (pre-fix) walk of NBLOCKS per hit.
+MAX_FALSE_ALARMS_PER_HIT = NBLOCKS // 4   # 2000: > cap(1024), << chain(8000)
+
+
+def make_decoy_block() -> bytes:
+    b = bytearray([C]) * BLOCK
+    # (+1,-2,+1) at positions 0,1,2 preserves get_checksum1 (both s1 and s2)
+    # while changing the block's content and thus its strong checksum.
+    b[0] = C + 1
+    b[1] = C - 2
+    b[2] = C + 1
+    return bytes(b)
+
+
+rmtree(FROMDIR)
+rmtree(TODIR)
+FROMDIR.mkdir(parents=True, exist_ok=True)
+TODIR.mkdir(parents=True, exist_ok=True)
+
+src = FROMDIR / 'image.bin'
+dst = TODIR / 'image.bin'
+
+# Basis: NBLOCKS identical decoys -> one giant weak-checksum chain.
+decoy = make_decoy_block()
+with open(dst, 'wb') as f:
+    for _ in range(NBLOCKS):
+        f.write(decoy)
+
+# Source: a constant-byte run (constant weak checksum, no strong match).
+with open(src, 'wb') as f:
+    f.write(bytes([C]) * RUN)
+
+proc = run_rsync('-a', '--no-whole-file', f'--block-size={BLOCK}',
+                 '--no-compress', '--debug=deltasum1',
+                 f'{src}', f'{dst}', capture_output=True)
+
+out = proc.stdout + proc.stderr
+m = re.search(r'hash_hits=(\d+)\s+false_alarms=(\d+)', out)
+if not m:
+    test_fail(f"could not find deltasum stats in rsync output:\n{out}")
+hash_hits = int(m.group(1))
+false_alarms = int(m.group(2))
+
+if hash_hits == 0:
+    test_fail("expected the source to hit the decoy chain but hash_hits=0")
+
+ratio = false_alarms / hash_hits
+if ratio > MAX_FALSE_ALARMS_PER_HIT:
+    test_fail(
+        f"hash_search() walked ~{ratio:.0f} chain entries per hash hit "
+        f"(false_alarms={false_alarms}, hash_hits={hash_hits}); the chain of "
+        f"{NBLOCKS} entries is not being capped -- issue #217 regression")
+
+# Correctness is non-negotiable: the cap only skips matches, never data.
+assert_same(dst, src, label='issue #217 chain-cap transfer')
+
+print(f"issue #217: bounded at {ratio:.0f} false alarms/hit "
+      f"(chain={NBLOCKS}, cap keeps it under {MAX_FALSE_ALARMS_PER_HIT})")