From 19f3f038c2ab0a3a4a2cefeb40b3ea39c1c15064 Mon Sep 17 00:00:00 2001 From: Matt Toohey Date: Mon, 11 May 2026 14:41:02 +1000 Subject: [PATCH 1/4] fix: use session-based kill for graceful action stop Interactive shells use job control, placing child commands in separate process groups. The previous approach sent SIGTERM to only the shell's process group via kill(-pid), which missed child process groups entirely. Additionally, interactive shells ignore SIGTERM by default, so the graceful stop always fell through to the 5-second SIGKILL escalation. Switch to `pkill -s ` which targets every process in the session regardless of process group, reliably reaching the shell, its child commands, and all their descendants. The SIGKILL escalation also uses session-based kill to avoid leaving orphaned processes. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matt Toohey --- crates/builderbot-actions/src/executor.rs | 48 ++++++++++++----------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/crates/builderbot-actions/src/executor.rs b/crates/builderbot-actions/src/executor.rs index 5934c779..25197ce0 100644 --- a/crates/builderbot-actions/src/executor.rs +++ b/crates/builderbot-actions/src/executor.rs @@ -663,35 +663,37 @@ impl ActionExecutor { if let Some(pid) = pid { #[cfg(unix)] { - // Send SIGTERM to the entire process group (negative PID). + // Send SIGTERM to every process in the session. // - // Because we used `setsid()` in pre_exec, the shell and all its - // children share a process group whose PGID equals the shell's PID. - // Sending the signal to `-pid` reaches every process in the group, - // ensuring child processes (npm, cargo, etc.) are also terminated. + // Because we used `setsid()` in pre_exec, the shell is the + // session leader and all descendant processes share its SID. + // However, the interactive shell (`-i`) uses job control and + // places child commands in their own process groups. A simple + // `kill(-pid, SIGTERM)` only reaches the shell's process group, + // missing child process groups entirely. Additionally, + // interactive shells ignore SIGTERM by default. // - // SAFETY: Calling `libc::kill` with a negative PID targets a process - // group. The PID came from `Child::id()` at spawn time. If the group - // no longer exists, kill() returns ESRCH which we safely ignore. - // After SIGTERM, we spawn a background thread that waits briefly and - // escalates to SIGKILL if the group is still alive. - unsafe { - libc::kill(-(pid as i32), libc::SIGTERM); - } + // `pkill -s ` targets every process in the session + // regardless of process group, which reliably reaches the + // shell, its child commands, and their descendants. + let pid_str = pid.to_string(); + let _ = Command::new("pkill") + .args(["-TERM", "-s", &pid_str]) + .status(); if let Some(force_kill_after) = options.force_kill_after { - // Escalate to SIGKILL after a short grace period in case the - // process group ignores SIGTERM (e.g. a process traps the signal). + // Escalate to SIGKILL after a short grace period in case + // processes ignore SIGTERM (e.g. a process traps the signal). thread::spawn(move || { thread::sleep(force_kill_after); - // SAFETY: Same considerations as above. If the process group - // already exited, kill() harmlessly returns ESRCH. - unsafe { - // Check if the process group still exists before sending SIGKILL - let ret = libc::kill(-(pid as i32), 0); - if ret == 0 { - libc::kill(-(pid as i32), libc::SIGKILL); - } + // Check if any process in the session still exists + // before escalating. pkill exits 0 when it matches + // at least one process. + let probe = Command::new("pkill").args(["-0", "-s", &pid_str]).status(); + if probe.map(|s| s.success()).unwrap_or(false) { + let _ = Command::new("pkill") + .args(["-KILL", "-s", &pid_str]) + .status(); } }); } From da3b6e859e3b46d8200e0d7108ca93c975e6cc53 Mon Sep 17 00:00:00 2001 From: Matt Toohey Date: Mon, 11 May 2026 16:39:20 +1000 Subject: [PATCH 2/4] fix: use SIGHUP for graceful action stop instead of pkill -s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit macOS `pkill` does not support the `-s` (session) flag — it's Linux-only. The previous commit silently failed to send any signal, causing "Stopping…" to hang forever. Replace `pkill -s` with `libc::kill(-pgid, SIGHUP)`. Interactive shells handle SIGHUP by forwarding it to all managed jobs before exiting, which is POSIX-standard across bash/zsh/fish. The SIGKILL escalation also switches to direct `libc::kill(-pgid, SIGKILL)`, removing the platform-specific `pkill` dependency entirely. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matt Toohey --- crates/builderbot-actions/src/executor.rs | 47 +++++++++++------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/crates/builderbot-actions/src/executor.rs b/crates/builderbot-actions/src/executor.rs index 25197ce0..e552f45d 100644 --- a/crates/builderbot-actions/src/executor.rs +++ b/crates/builderbot-actions/src/executor.rs @@ -663,37 +663,36 @@ impl ActionExecutor { if let Some(pid) = pid { #[cfg(unix)] { - // Send SIGTERM to every process in the session. + // Send SIGHUP to the shell's process group. // - // Because we used `setsid()` in pre_exec, the shell is the - // session leader and all descendant processes share its SID. - // However, the interactive shell (`-i`) uses job control and - // places child commands in their own process groups. A simple - // `kill(-pid, SIGTERM)` only reaches the shell's process group, - // missing child process groups entirely. Additionally, - // interactive shells ignore SIGTERM by default. + // We used `setsid()` in pre_exec, so the shell is the session + // leader and its PID equals the PGID of its initial process + // group. Interactive shells (`-i`) use job control, placing + // child commands in separate process groups, so + // `kill(-pid, SIGTERM)` misses them. Interactive shells also + // ignore SIGTERM by default. // - // `pkill -s ` targets every process in the session - // regardless of process group, which reliably reaches the - // shell, its child commands, and their descendants. - let pid_str = pid.to_string(); - let _ = Command::new("pkill") - .args(["-TERM", "-s", &pid_str]) - .status(); + // SIGHUP is the correct signal here: interactive shells handle + // it by forwarding SIGHUP+SIGCONT to every job they manage, + // then exiting. This is POSIX-standard behavior across + // bash/zsh/fish and works on both macOS and Linux. + let pgid = pid as i32; + unsafe { + libc::kill(-pgid, libc::SIGHUP); + } if let Some(force_kill_after) = options.force_kill_after { // Escalate to SIGKILL after a short grace period in case - // processes ignore SIGTERM (e.g. a process traps the signal). + // processes ignore SIGHUP or the shell fails to clean up. thread::spawn(move || { thread::sleep(force_kill_after); - // Check if any process in the session still exists - // before escalating. pkill exits 0 when it matches - // at least one process. - let probe = Command::new("pkill").args(["-0", "-s", &pid_str]).status(); - if probe.map(|s| s.success()).unwrap_or(false) { - let _ = Command::new("pkill") - .args(["-KILL", "-s", &pid_str]) - .status(); + // kill(-pgid, 0) checks if any process in the group + // still exists (returns 0 on success). + let alive = unsafe { libc::kill(-pgid, 0) } == 0; + if alive { + unsafe { + libc::kill(-pgid, libc::SIGKILL); + } } }); } From f976737990dfa1c03e3ae623cb240993b70d5688 Mon Sep 17 00:00:00 2001 From: Matt Toohey Date: Mon, 11 May 2026 17:10:37 +1000 Subject: [PATCH 3/4] fix: use session-based SIGKILL escalation for orphaned process groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous SIGKILL escalation only targeted the shell's initial process group via kill(-pgid, SIGKILL). Interactive shells use job control, placing child commands in separate process groups. If the shell exits from SIGHUP but a child in a different process group survives (e.g. a backgrounded process that trapped SIGHUP), kill(-pgid, 0) returns ESRCH and SIGKILL is never sent — leaving orphans. Add platform-specific kill_session() that enumerates all PIDs in the session and sends SIGKILL to each: - macOS: sysctl(KERN_PROC_SESSION) to query processes by session ID - Linux: /proc/*/stat field 6 (session ID) enumeration Also check the SIGHUP return value — if it fails with ESRCH, the process group is already gone and the SIGKILL escalation thread is skipped. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matt Toohey --- crates/builderbot-actions/src/executor.rs | 170 +++++++++++++++++++--- 1 file changed, 153 insertions(+), 17 deletions(-) diff --git a/crates/builderbot-actions/src/executor.rs b/crates/builderbot-actions/src/executor.rs index e552f45d..a622f8ef 100644 --- a/crates/builderbot-actions/src/executor.rs +++ b/crates/builderbot-actions/src/executor.rs @@ -676,25 +676,28 @@ impl ActionExecutor { // it by forwarding SIGHUP+SIGCONT to every job they manage, // then exiting. This is POSIX-standard behavior across // bash/zsh/fish and works on both macOS and Linux. - let pgid = pid as i32; - unsafe { - libc::kill(-pgid, libc::SIGHUP); - } + // The child's PID equals its session ID (we used setsid()). + let sid = pid as i32; + // SAFETY: kill(2) does not dereference pointers. A negative + // pid targets the process group. + let hup_result = unsafe { libc::kill(-sid, libc::SIGHUP) }; if let Some(force_kill_after) = options.force_kill_after { - // Escalate to SIGKILL after a short grace period in case - // processes ignore SIGHUP or the shell fails to clean up. - thread::spawn(move || { - thread::sleep(force_kill_after); - // kill(-pgid, 0) checks if any process in the group - // still exists (returns 0 on success). - let alive = unsafe { libc::kill(-pgid, 0) } == 0; - if alive { - unsafe { - libc::kill(-pgid, libc::SIGKILL); - } - } - }); + // If SIGHUP failed with ESRCH, the process group is + // already gone — no need to escalate. + let already_dead = hup_result == -1 + && std::io::Error::last_os_error().raw_os_error() == Some(libc::ESRCH); + + if !already_dead { + // Escalate to SIGKILL after a grace period. Use + // session-based kill to reach child commands that + // the shell may have placed in separate process + // groups via job control. + thread::spawn(move || { + thread::sleep(force_kill_after); + kill_session(sid, libc::SIGKILL); + }); + } } } @@ -778,3 +781,136 @@ fn now_timestamp() -> i64 { .unwrap() .as_millis() as i64 } + +// ============================================================================= +// Session-based kill (unix only) +// ============================================================================= + +/// Send `signal` to every process in session `sid`. +/// +/// The child was started with `setsid()`, so its PID equals the session ID. +/// Interactive shells may place child commands in separate process groups, +/// so `kill(-pgid, sig)` can miss them. This function enumerates all PIDs +/// in the session and signals each one individually. +/// +/// Returns the number of processes successfully signalled. +#[cfg(target_os = "macos")] +fn kill_session(sid: i32, signal: i32) -> usize { + // On macOS, sysctl(KERN_PROC_SESSION) returns an array of kinfo_proc + // structs. The libc crate doesn't expose kinfo_proc for macOS, so we + // work with raw bytes. On 64-bit macOS (both arm64 and x86_64): + // sizeof(struct kinfo_proc) = 648 + // offsetof(kinfo_proc, kp_proc.p_pid) = 40 + const KINFO_PROC_SIZE: usize = 648; + const P_PID_OFFSET: usize = 40; + + let mut mib: [libc::c_int; 4] = [ + libc::CTL_KERN, + libc::KERN_PROC, + libc::KERN_PROC_SESSION, + sid, + ]; + let mut size: libc::size_t = 0; + + // First call: get the buffer size needed. + // SAFETY: sysctl with a null buffer just returns the needed size. + let ret = unsafe { + libc::sysctl( + mib.as_mut_ptr(), + 4, + std::ptr::null_mut(), + &mut size, + std::ptr::null_mut(), + 0, + ) + }; + if ret != 0 || size < KINFO_PROC_SIZE { + return 0; + } + + let mut buf: Vec = vec![0u8; size]; + let mut actual_size = size; + + // Second call: fill the buffer. + // SAFETY: buf is a properly sized byte buffer. + let ret = unsafe { + libc::sysctl( + mib.as_mut_ptr(), + 4, + buf.as_mut_ptr().cast(), + &mut actual_size, + std::ptr::null_mut(), + 0, + ) + }; + if ret != 0 { + return 0; + } + + let count = actual_size / KINFO_PROC_SIZE; + let mut signalled = 0usize; + for i in 0..count { + let offset = i * KINFO_PROC_SIZE + P_PID_OFFSET; + if offset + 4 > actual_size { + break; + } + let pid = i32::from_ne_bytes([ + buf[offset], + buf[offset + 1], + buf[offset + 2], + buf[offset + 3], + ]); + // SAFETY: kill(2) does not dereference pointers. + if unsafe { libc::kill(pid, signal) } == 0 { + signalled += 1; + } + } + signalled +} + +/// Send `signal` to every process in session `sid`. +/// +/// On Linux, enumerates `/proc/*/stat` and parses field 6 (session ID) +/// to find matching processes. +/// +/// Returns the number of processes successfully signalled. +#[cfg(target_os = "linux")] +fn kill_session(sid: i32, signal: i32) -> usize { + let mut signalled = 0usize; + let Ok(entries) = std::fs::read_dir("/proc") else { + return 0; + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let Some(name_str) = name.to_str() else { + continue; + }; + let Ok(pid) = name_str.parse::() else { + continue; + }; + let stat_path = format!("/proc/{pid}/stat"); + let Ok(stat) = std::fs::read_to_string(&stat_path) else { + continue; + }; + // The stat format is: pid (comm) state ppid pgrp session ... + // comm can contain spaces and parens, so find the last ')' first. + let Some(close_paren) = stat.rfind(')') else { + continue; + }; + let fields_after_comm = &stat[close_paren + 2..]; // skip ") " + // fields_after_comm: state ppid pgrp session ... + // [0] [1] [2] [3] + let mut fields = fields_after_comm.split_whitespace(); + if let Some(session_str) = fields.nth(3) { + if let Ok(proc_sid) = session_str.parse::() { + if proc_sid == sid { + // SAFETY: kill(2) does not dereference pointers. + if unsafe { libc::kill(pid, signal) } == 0 { + signalled += 1; + } + } + } + } + } + signalled +} From 8ca70c14746e2379db97065bfd4bdd15925b6289 Mon Sep 17 00:00:00 2001 From: Matt Toohey Date: Mon, 11 May 2026 17:19:52 +1000 Subject: [PATCH 4/4] fix: add kill_session fallback for non-macOS/Linux unix targets The kill_session function was only defined for macOS and Linux via target_os cfg gates, but called from a #[cfg(unix)] block. This would fail to compile on other unix targets like FreeBSD. Add a fallback implementation that uses kill(-sid, signal) for any other unix platform. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matt Toohey --- crates/builderbot-actions/src/executor.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/builderbot-actions/src/executor.rs b/crates/builderbot-actions/src/executor.rs index a622f8ef..b3b6f6f2 100644 --- a/crates/builderbot-actions/src/executor.rs +++ b/crates/builderbot-actions/src/executor.rs @@ -914,3 +914,20 @@ fn kill_session(sid: i32, signal: i32) -> usize { } signalled } + +/// Send `signal` to every process in session `sid`. +/// +/// Fallback for unix platforms other than macOS and Linux: sends the signal +/// to the session leader's process group via `kill(-sid, signal)`, which may +/// miss child commands in separate process groups under job control. +/// +/// Returns 1 if the signal was delivered, 0 otherwise. +#[cfg(all(unix, not(any(target_os = "macos", target_os = "linux"))))] +fn kill_session(sid: i32, signal: i32) -> usize { + // SAFETY: kill(2) does not dereference pointers. + if unsafe { libc::kill(-sid, signal) } == 0 { + 1 + } else { + 0 + } +}