From b1000bdcf30dd70d663d28245d7f80e2f20cf98b Mon Sep 17 00:00:00 2001
From: Stuart Inglis <stuart.inglis@exaba.com>
Date: Wed, 1 Jul 2026 08:03:13 +1200
Subject: [PATCH] io: use poll() instead of select() to avoid an FD_SETSIZE
 hang (issue #231)

rsync's I/O loops (safe_read, safe_write, and the main perform_io
multiplexer) waited for readiness with select() and fd_set bitmaps. An
fd_set can only represent descriptors below FD_SETSIZE (1024 with glibc).

When rsync is started with many descriptors already open -- e.g. inherited
from a parent process that leaked fds, a high "ulimit -n", or a busy daemon
-- its own socket and pipe fds get allocated at or above 1024. FD_SET() and
FD_ISSET() then index past the end of the fixed-size fd_set, which is
undefined behavior: select() reports the fd as ready, but FD_ISSET() reads
the out-of-bounds bit as 0, so the read or write never happens and rsync
spins at 100% CPU forever with no progress. This is the long-standing
"rsync hangs at 100% CPU on large systems" report, and it matches the
MemorySanitizer use-of-uninitialized-value seen in perform_io.

Convert the three loops to poll(), which identifies descriptors by value in
a small array and has no FD_SETSIZE ceiling, so a high-numbered fd works
fine. rsync only ever waits on a handful of fds (at most three in
perform_io: in_fd, out_fd, and the files-from forward fd), so poll() is as
fast as -- or faster than -- select() here; the select()-vs-poll() cost gap
only appears when watching thousands of descriptors, which rsync never
does. The remaining select(0, ...) call is a pure timed sleep with no fds
and is unaffected.

The conversion is behavior-preserving: the same max_fd bookkeeping decides
when there is nothing to wait on, the per-fd readiness checks map to the
matching pollfd revents, and the timeout is the same (now expressed in
milliseconds).

testsuite/highfd-hang_test.py reproduces the hang deterministically by
opening enough inheritable dummy fds to push rsync's descriptors past
FD_SETSIZE before an ordinary transfer; it hangs (caught by a timeout) on
the select() code and passes instantly with poll().
---
 io.c                          | 94 +++++++++++++++++++----------------
 testsuite/highfd-hang_test.py | 84 +++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+), 44 deletions(-)
 create mode 100644 testsuite/highfd-hang_test.py

diff --git a/io.c b/io.c
index 2dfeeaa34..e9de0b713 100644
--- a/io.c
+++ b/io.c
@@ -31,7 +31,9 @@
 #include "ifuncs.h"
 #include "inums.h"
 
-/** If no timeout is specified then use a 60 second select timeout */
+#include <poll.h>
+
+/** If no timeout is specified then use a 60 second I/O timeout */
 #define SELECT_TIMEOUT 60
 
 extern int bwlimit;
@@ -243,31 +245,26 @@ static size_t safe_read(int fd, char *buf, size_t len)
 	assert(fd != iobuf.in_fd);
 
 	while (1) {
-		struct timeval tv;
-		fd_set r_fds, e_fds;
+		struct pollfd pfd;
 		int cnt;
 
-		FD_ZERO(&r_fds);
-		FD_SET(fd, &r_fds);
-		FD_ZERO(&e_fds);
-		FD_SET(fd, &e_fds);
-		tv.tv_sec = select_timeout;
-		tv.tv_usec = 0;
+		/* We use poll() rather than select() so that a high-numbered fd
+		 * (>= FD_SETSIZE) cannot overflow an fd_set bitmap. */
+		pfd.fd = fd;
+		pfd.events = POLLIN;
+		pfd.revents = 0;
 
-		cnt = select(fd+1, &r_fds, NULL, &e_fds, &tv);
+		cnt = poll(&pfd, 1, select_timeout * 1000);
 		if (cnt <= 0) {
 			if (cnt < 0 && errno == EBADF) {
-				rsyserr(FERROR, errno, "safe_read select failed");
+				rsyserr(FERROR, errno, "safe_read poll failed");
 				exit_cleanup(RERR_FILEIO);
 			}
 			check_timeout(1, MSK_ALLOW_FLUSH);
 			continue;
 		}
 
-		/*if (FD_ISSET(fd, &e_fds))
-			rprintf(FINFO, "select exception on fd %d\n", fd); */
-
-		if (FD_ISSET(fd, &r_fds)) {
+		if (pfd.revents) {
 			ssize_t n = read(fd, buf + got, len - got);
 			if (DEBUG_GTE(IO, 2)) {
 				rprintf(FINFO, "[%s] safe_read(%d)=%" SIZE_T_FMT_MOD "d\n",
@@ -332,19 +329,18 @@ static void safe_write(int fd, const char *buf, size_t len)
 	}
 
 	while (len) {
-		struct timeval tv;
-		fd_set w_fds;
+		struct pollfd pfd;
 		int cnt;
 
-		FD_ZERO(&w_fds);
-		FD_SET(fd, &w_fds);
-		tv.tv_sec = select_timeout;
-		tv.tv_usec = 0;
+		/* poll() avoids the FD_SETSIZE limit that select() imposes. */
+		pfd.fd = fd;
+		pfd.events = POLLOUT;
+		pfd.revents = 0;
 
-		cnt = select(fd + 1, NULL, &w_fds, NULL, &tv);
+		cnt = poll(&pfd, 1, select_timeout * 1000);
 		if (cnt <= 0) {
 			if (cnt < 0 && errno == EBADF) {
-				rsyserr(FERROR, errno, "safe_write select failed on %s", what_fd_is(fd));
+				rsyserr(FERROR, errno, "safe_write poll failed on %s", what_fd_is(fd));
 				exit_cleanup(RERR_FILEIO);
 			}
 			if (io_timeout)
@@ -352,7 +348,7 @@ static void safe_write(int fd, const char *buf, size_t len)
 			continue;
 		}
 
-		if (FD_ISSET(fd, &w_fds)) {
+		if (pfd.revents) {
 			n = write(fd, buf, len);
 			if (n < 0) {
 				if (errno == EINTR)
@@ -561,9 +557,8 @@ static void handle_kill_signal(BOOL flush_ok)
  * unused raw data in the buf would prevent the reading of socket data. */
 static char *perform_io(size_t needed, int flags)
 {
-	fd_set r_fds, e_fds, w_fds;
-	struct timeval tv;
-	int cnt, max_fd;
+	struct pollfd pfds[3];
+	int cnt, max_fd, npfds, poll_timeout, in_pollpos, out_pollpos, ff_pollpos;
 	size_t empty_buf_len = 0;
 	xbuf *out;
 	char *data;
@@ -656,13 +651,15 @@ static char *perform_io(size_t needed, int flags)
 		}
 
 		max_fd = -1;
+		npfds = 0;
+		in_pollpos = out_pollpos = ff_pollpos = -1;
 
-		FD_ZERO(&r_fds);
-		FD_ZERO(&e_fds);
 		if (iobuf.in_fd >= 0 && iobuf.in.size - iobuf.in.len) {
 			if (!read_batch || batch_fd >= 0) {
-				FD_SET(iobuf.in_fd, &r_fds);
-				FD_SET(iobuf.in_fd, &e_fds);
+				pfds[npfds].fd = iobuf.in_fd;
+				pfds[npfds].events = POLLIN;
+				pfds[npfds].revents = 0;
+				in_pollpos = npfds++;
 			}
 			if (iobuf.in_fd > max_fd)
 				max_fd = iobuf.in_fd;
@@ -670,12 +667,14 @@ static char *perform_io(size_t needed, int flags)
 
 		/* Only do more filesfrom processing if there is enough room in the out buffer. */
 		if (ff_forward_fd >= 0 && iobuf.out.size - iobuf.out.len > FILESFROM_BUFLEN*2) {
-			FD_SET(ff_forward_fd, &r_fds);
+			pfds[npfds].fd = ff_forward_fd;
+			pfds[npfds].events = POLLIN;
+			pfds[npfds].revents = 0;
+			ff_pollpos = npfds++;
 			if (ff_forward_fd > max_fd)
 				max_fd = ff_forward_fd;
 		}
 
-		FD_ZERO(&w_fds);
 		if (iobuf.out_fd >= 0) {
 			if (iobuf.raw_flushing_ends_before
 			 || (!iobuf.msg.len && iobuf.out.len > iobuf.out_empty_len && !(flags & PIO_NEED_MSGROOM))) {
@@ -715,7 +714,10 @@ static char *perform_io(size_t needed, int flags)
 			} else
 				out = NULL;
 			if (out) {
-				FD_SET(iobuf.out_fd, &w_fds);
+				pfds[npfds].fd = iobuf.out_fd;
+				pfds[npfds].events = POLLOUT;
+				pfds[npfds].revents = 0;
+				out_pollpos = npfds++;
 				if (iobuf.out_fd > max_fd)
 					max_fd = iobuf.out_fd;
 			}
@@ -752,16 +754,15 @@ static char *perform_io(size_t needed, int flags)
 
 		if (extra_flist_sending_enabled) {
 			if (file_total - file_old_total < MAX_FILECNT_LOOKAHEAD && IN_MULTIPLEXED_AND_READY)
-				tv.tv_sec = 0;
+				poll_timeout = 0;
 			else {
 				extra_flist_sending_enabled = False;
-				tv.tv_sec = select_timeout;
+				poll_timeout = select_timeout * 1000;
 			}
 		} else
-			tv.tv_sec = select_timeout;
-		tv.tv_usec = 0;
+			poll_timeout = select_timeout * 1000;
 
-		cnt = select(max_fd + 1, &r_fds, &w_fds, &e_fds, &tv);
+		cnt = poll(pfds, npfds, poll_timeout);
 
 		if (cnt <= 0) {
 			if (cnt < 0 && errno == EBADF) {
@@ -774,11 +775,16 @@ static char *perform_io(size_t needed, int flags)
 				extra_flist_sending_enabled = !flist_eof;
 			} else
 				check_timeout((flags & PIO_NEED_INPUT) != 0, 0);
-			FD_ZERO(&r_fds); /* Just in case... */
-			FD_ZERO(&w_fds);
+			/* Just in case... */
+			if (in_pollpos >= 0)
+				pfds[in_pollpos].revents = 0;
+			if (ff_pollpos >= 0)
+				pfds[ff_pollpos].revents = 0;
+			if (out_pollpos >= 0)
+				pfds[out_pollpos].revents = 0;
 		}
 
-		if (iobuf.in_fd >= 0 && FD_ISSET(iobuf.in_fd, &r_fds)) {
+		if (iobuf.in_fd >= 0 && in_pollpos >= 0 && pfds[in_pollpos].revents) {
 			size_t len, pos = iobuf.in.pos + iobuf.in.len;
 			ssize_t n;
 			if (pos >= iobuf.in.size) {
@@ -827,7 +833,7 @@ static char *perform_io(size_t needed, int flags)
 			exit_cleanup(RERR_TIMEOUT);
 		}
 
-		if (out && FD_ISSET(iobuf.out_fd, &w_fds)) {
+		if (out && out_pollpos >= 0 && pfds[out_pollpos].revents) {
 			size_t len = iobuf.raw_flushing_ends_before ? iobuf.raw_flushing_ends_before - out->pos : out->len;
 			ssize_t n;
 
@@ -888,7 +894,7 @@ static char *perform_io(size_t needed, int flags)
 				wait_for_receiver(); /* generator only */
 		}
 
-		if (ff_forward_fd >= 0 && FD_ISSET(ff_forward_fd, &r_fds)) {
+		if (ff_forward_fd >= 0 && ff_pollpos >= 0 && pfds[ff_pollpos].revents) {
 			/* This can potentially flush all output and enable
 			 * multiplexed output, so keep this last in the loop
 			 * and be sure to not cache anything that would break
diff --git a/testsuite/highfd-hang_test.py b/testsuite/highfd-hang_test.py
new file mode 100644
index 000000000..d86837e2b
--- /dev/null
+++ b/testsuite/highfd-hang_test.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Regression test for issue #231: rsync must not hang when its I/O file
+descriptors are numbered at/above FD_SETSIZE.
+
+rsync's I/O loops in io.c used to call select() with fd_set bitmaps, which can
+only represent descriptors below FD_SETSIZE (1024 with glibc).  If rsync is
+started with many descriptors already open -- e.g. inherited from a parent that
+leaked fds -- its own socket/pipe fds get allocated above 1024.  FD_SET() and
+FD_ISSET() then index past the end of the fixed-size fd_set, which is undefined
+behavior: select() reports the fd ready, but FD_ISSET() reads the out-of-bounds
+bit as 0, so the read/write never happens and rsync spins at 100% CPU forever
+("rsync hangs, 100% of one CPU, no progress").
+
+We reproduce that deterministically by opening enough inheritable dummy fds to
+push rsync's descriptors past FD_SETSIZE, then running an ordinary transfer
+with close_fds=False so the child inherits them.  With the poll()-based io.c
+the transfer finishes instantly; with the old select()-based code it never
+finishes and is caught by the timeout.  The cross-over is binary (instant vs.
+infinite), so the timeout is a robust signal rather than a timing race.
+"""
+
+import os
+import resource
+import subprocess
+
+from rsyncfns import (
+    FROMDIR, TODIR, rmtree, rsync_argv, test_fail, test_skipped,
+)
+
+# select()'s fd_set holds FD_SETSIZE bits; glibc and most libcs use 1024.
+FD_SETSIZE = 1024
+NDUMMY = FD_SETSIZE + 80      # force rsync's fds comfortably past the limit
+TIMEOUT = 30                  # poll build: ~instant; select build: hangs forever
+
+# We must be able to open enough descriptors to cross FD_SETSIZE.
+soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+want = NDUMMY + 64
+if soft < want:
+    if hard != resource.RLIM_INFINITY and hard < want:
+        test_skipped(f"RLIMIT_NOFILE hard cap {hard} < {want}; cannot place "
+                     "fds above FD_SETSIZE to exercise issue #231")
+    resource.setrlimit(resource.RLIMIT_NOFILE, (want, hard))
+
+# A small tree to transfer.
+rmtree(FROMDIR)
+rmtree(TODIR)
+FROMDIR.mkdir(parents=True, exist_ok=True)
+payload = {f"f{i}": os.urandom(1000) for i in range(20)}
+for name, data in payload.items():
+    (FROMDIR / name).write_bytes(data)
+
+# Occupy the low fd numbers with inheritable dummy descriptors so that the
+# rsync child's socket/pipe fds are forced above FD_SETSIZE.  Python opens fds
+# O_CLOEXEC by default, so mark each inheritable and use close_fds=False.
+dummies = []
+try:
+    while True:
+        fd = os.open(os.devnull, os.O_RDONLY)
+        os.set_inheritable(fd, True)
+        dummies.append(fd)
+        if fd >= NDUMMY:
+            break
+
+    argv = rsync_argv('-a', f'{FROMDIR}/', f'{TODIR}/')
+    try:
+        proc = subprocess.run(argv, timeout=TIMEOUT, close_fds=False)
+    except subprocess.TimeoutExpired:
+        test_fail("rsync hung with high-numbered fds -- select()/fd_set "
+                  "overflow (issue #231 regression)")
+finally:
+    for fd in dummies:
+        os.close(fd)
+
+if proc.returncode != 0:
+    test_fail(f"rsync exited {proc.returncode}: {' '.join(argv)}")
+
+# The cap only matters if it stayed correct: verify every file transferred.
+for name, data in payload.items():
+    got = (TODIR / name).read_bytes()
+    if got != data:
+        test_fail(f"{name} differs after a high-fd transfer")
+
+print(f"issue #231: transfer with fds >= {NDUMMY} (above FD_SETSIZE) "
+      "completed correctly")