From 835f1f92a15ab3e8c9b47ae2f9cb1c62b9434eee Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Sat, 23 May 2026 11:31:47 -0400 Subject: [PATCH] feat(sandbox): opt-in best-effort bootstrap via OPENSHELL_BEST_EFFORT_FAILURES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the OPENSHELL_BEST_EFFORT_FAILURES env var is set, failures from the three subsystems an outer sandbox typically degrades — network namespace creation, the supervisor seccomp prelude, and the workload seccomp filter — are logged and skipped instead of aborting startup. The default remains strict. The gVisor runtime, when invoked with --network=host on Kubernetes, returns EPERM from unshare(CLONE_NEWNET), EINVAL from seccomp(2) on filters it does not yet model, and EPERM from setresuid/setresgid when the container entrypoint has already dropped to a non-root uid. These hardening layers are defense-in-depth on a bare-metal host but duplicative when the workload already runs inside a strong outer sandbox. The env-var gate keeps the strict default for standalone deployments while letting outer-sandbox integrations (gVisor, Firecracker, Kata) opt in. Also make drop_privileges idempotent: when the process is already at the resolved target uid/gid, skip initgroups/setresgid/setresuid instead of failing with EPERM. This lets a container entrypoint pre-drop privileges before exec'ing the sandbox without breaking the verification path. 
Signed-off-by: Davanum Srinivas --- crates/openshell-sandbox/src/lib.rs | 46 ++++++++++++++++--- crates/openshell-sandbox/src/process.rs | 8 ++++ .../src/sandbox/linux/mod.rs | 4 +- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index b83125f12..1bbd0469a 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -111,6 +111,34 @@ pub(crate) fn agent_proposals_enabled() -> bool { .is_some_and(|flag| flag.load(Ordering::Relaxed)) } +/// Operator-opt-in to best-effort bootstrap. When the +/// `OPENSHELL_BEST_EFFORT_FAILURES` environment variable is set (to any +/// value), the sandbox tolerates failures from optional hardening +/// subsystems (network namespace, seccomp) instead of aborting. Intended +/// for gVisor-on-Kubernetes deployments where the runtime degrades these +/// syscalls. Default is strict (any failure aborts). +fn best_effort_failures() -> bool { + static FLAG: OnceLock<bool> = OnceLock::new(); + *FLAG.get_or_init(|| std::env::var_os("OPENSHELL_BEST_EFFORT_FAILURES").is_some()) +} + +/// Dispatch a sandbox bootstrap failure: abort by default, warn-and-continue +/// when the operator has opted into best-effort mode via +/// [`best_effort_failures`]. +pub(crate) fn handle_bootstrap_failure(subsystem: &str, err: miette::Report) -> Result<()> { + if best_effort_failures() { + warn!( + subsystem, + error = %err, + "Sandbox bootstrap subsystem unavailable; continuing in best-effort mode \ + (operator opted in via OPENSHELL_BEST_EFFORT_FAILURES)" + ); + Ok(()) + } else { + Err(err) + } +} + /// Test-only helpers shared across sibling test modules. #[cfg(test)] pub(crate) mod test_helpers { @@ -547,11 +575,15 @@ pub async fn run_sandbox( Some(ns) } Err(e) => { - return Err(miette::miette!( - "Network namespace creation failed and proxy mode requires isolation. 
\ - Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ - Error: {e}" - )); + handle_bootstrap_failure( + "network-namespace", + miette::miette!( + "Network namespace creation failed and proxy mode requires isolation. \ + Ensure CAP_NET_ADMIN and CAP_SYS_ADMIN are available and iproute2 is installed. \ + Error: {e}" + ), + )?; + None } } } else { @@ -566,7 +598,9 @@ pub async fn run_sandbox( // Install the supervisor seccomp prelude after privileged startup helpers // (network namespace setup, nftables probes) complete, but before the SSH // listener and workload process are exposed. - apply_supervisor_startup_hardening()?; + if let Err(e) = apply_supervisor_startup_hardening() { + handle_bootstrap_failure("supervisor-seccomp", e)?; + } // Shared PID: set after process spawn so the proxy can look up // the entrypoint process's /proc/net/tcp for identity binding. diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 9bbcfe66c..18e25283c 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -477,6 +477,14 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { .ok_or_else(|| miette::miette!("Failed to resolve user primary group"))? }; + // Idempotent fast-path: if the effective uid/gid already match the target + // (e.g. the container entrypoint dropped privileges before exec), skip the + // drop. This avoids initgroups(3), which requires CAP_SETGID and would fail + // otherwise. Real/saved ids and supplementary groups are not checked here. 
+ if nix::unistd::geteuid() == user.uid && nix::unistd::getegid() == group.gid { + return Ok(()); + } + if user_name.is_some() { let user_cstr = CString::new(user.name.clone()).map_err(|_| miette::miette!("Invalid user name"))?; diff --git a/crates/openshell-sandbox/src/sandbox/linux/mod.rs b/crates/openshell-sandbox/src/sandbox/linux/mod.rs index a3a32c77a..3c461df32 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/mod.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/mod.rs @@ -40,7 +40,9 @@ pub fn enforce(prepared: PreparedSandbox) -> Result<()> { if let Some(ruleset) = prepared.landlock { landlock::enforce(ruleset)?; } - seccomp::apply(&prepared.policy)?; + if let Err(e) = seccomp::apply(&prepared.policy) { + crate::handle_bootstrap_failure("workload-seccomp", e)?; + } Ok(()) }