From 8e5b2b2f9663f0254240e6978ad61167ddc333ce Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Tue, 30 Jun 2026 00:14:56 +0200 Subject: [PATCH] nfs: restart pool selection process on timeout for reads Motivation: If the mover start message gets lost, then the NFS door will fall into an infinite retry loop as it will wait for the mover to start. Modification: On read, if pool selection or mover start fails with a timeout, then restart the selection process as for a new open. Result: recover from cases when the mover start times out or is lost. Acked-by: Lennart Sack Target: master, 12.0, 11.2 Require-book: no Require-notes: yes (cherry picked from commit 6288b6c75fd2160ffbc37a7359b1a324b5ac22d6) Signed-off-by: Tigran Mkrtchyan --- .../org/dcache/chimera/nfsv41/door/NFSv41Door.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/dcache-nfs/src/main/java/org/dcache/chimera/nfsv41/door/NFSv41Door.java b/modules/dcache-nfs/src/main/java/org/dcache/chimera/nfsv41/door/NFSv41Door.java index e51f3ce906e..ac0420ddaaa 100644 --- a/modules/dcache-nfs/src/main/java/org/dcache/chimera/nfsv41/door/NFSv41Door.java +++ b/modules/dcache-nfs/src/main/java/org/dcache/chimera/nfsv41/door/NFSv41Door.java @@ -1463,7 +1463,17 @@ deviceid4[] selectDataServers(long timeout) throws } } - _redirectFuture.get(NFS_REQUEST_BLOCKING, TimeUnit.MILLISECONDS); + try { + _redirectFuture.get(NFS_REQUEST_BLOCKING, TimeUnit.MILLISECONDS); + } catch (Exception e) { + // No mover is started. Let retry the full selection process on next attempt. + if (!hasMover()) { + _redirectFuture.cancel(true); + _redirectFuture = null; + } + throw e; + } + _log.debug("mover ready: pool={} moverid={}", getPool(), getMoverId()); deviceid4 ds = waitForRedirect(NFS_REQUEST_BLOCKING).getDeviceId();