From 7a7e017c638de64947b2218b4c44864a2e5503cf Mon Sep 17 00:00:00 2001 From: mjamiv <142179942+mjamiv@users.noreply.github.com> Date: Sat, 23 May 2026 19:04:45 +0000 Subject: [PATCH] fix(sandbox): delegate PID limits to runtimes Signed-off-by: mjamiv <142179942+mjamiv@users.noreply.github.com> --- crates/openshell-driver-docker/README.md | 1 + crates/openshell-driver-docker/src/lib.rs | 33 +++++ crates/openshell-driver-docker/src/tests.rs | 18 +++ crates/openshell-driver-podman/README.md | 1 + crates/openshell-driver-podman/src/config.rs | 34 +++++ .../openshell-driver-podman/src/container.rs | 25 +++- crates/openshell-driver-podman/src/driver.rs | 1 + crates/openshell-driver-podman/src/main.rs | 14 +- crates/openshell-sandbox/src/lib.rs | 10 ++ crates/openshell-sandbox/src/process.rs | 139 ++++++++++++++++-- 10 files changed, 259 insertions(+), 17 deletions(-) diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 96caa7b12..ea57f44e4 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -31,6 +31,7 @@ contract: | `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. | | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. | | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. | +| `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. | | CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. | The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index c74b07437..8c81b1a98 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -61,6 +61,7 @@ const SUPERVISOR_PATH: &str = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin const HOST_OPENSHELL_INTERNAL: &str = "host.openshell.internal"; const HOST_DOCKER_INTERNAL: &str = "host.docker.internal"; const DOCKER_NETWORK_DRIVER: &str = "bridge"; +const DEFAULT_SANDBOX_PIDS_LIMIT: i64 = 2048; /// Default image holding the Linux `openshell-sandbox` binary. The gateway /// pulls this image and extracts the binary to a host-side cache when no @@ -163,6 +164,11 @@ pub struct DockerComputeConfig { /// Unix socket path the in-container supervisor bridges relay traffic to. pub ssh_socket_path: String, + + /// Container cgroup PID limit for Docker-managed sandboxes. + /// + /// Set to `0` to leave Docker's runtime/default PID limit unchanged. + pub sandbox_pids_limit: i64, } impl Default for DockerComputeConfig { @@ -180,6 +186,7 @@ impl Default for DockerComputeConfig { network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(), host_gateway_ip: String::new(), ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT, } } } @@ -206,6 +213,7 @@ struct DockerDriverRuntimeConfig { guest_tls: Option, daemon_version: String, supports_gpu: bool, + sandbox_pids_limit: i64, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -252,6 +260,7 @@ impl DockerComputeDriver { .cdi_spec_dirs .as_ref() .is_some_and(|dirs| !dirs.is_empty()); + validate_sandbox_pids_limit(docker_config.sandbox_pids_limit)?; let gateway_port = config.bind_address.port(); if gateway_port == 0 { return Err(Error::config( @@ -298,6 +307,7 @@ impl DockerComputeDriver { guest_tls, daemon_version: version.version.unwrap_or_else(|| "unknown".to_string()), supports_gpu, + sandbox_pids_limit: docker_config.sandbox_pids_limit, }, events: broadcast::channel(WATCH_BUFFER).0, supervisor_readiness, @@ -1163,6 +1173,7 @@ fn build_container_create_body( host_config: Some(HostConfig { nano_cpus: resource_limits.nano_cpus, memory: resource_limits.memory_bytes, + pids_limit: docker_pids_limit(config.sandbox_pids_limit)?, device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device), binds: Some(build_binds(sandbox, config)?), restart_policy: Some(RestartPolicy { @@ -1454,6 +1465,28 @@ fn docker_resource_limits( }) } +fn validate_sandbox_pids_limit(value: i64) -> CoreResult<()> { + if value < 0 { + return Err(Error::config( + "docker sandbox_pids_limit must be zero or greater", + )); + } + Ok(()) +} + +fn docker_pids_limit(value: i64) -> Result, Status> { + if value < 0 { + return Err(Status::failed_precondition( + "docker sandbox_pids_limit must be zero or greater", + )); + } + if value == 0 { + Ok(None) + } else { + Ok(Some(value)) + } +} + #[allow(clippy::cast_possible_truncation)] fn parse_cpu_limit(value: &str) -> Result, Status> { let value = value.trim(); diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 9afec4be4..9fa743516 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -68,6 +68,7 @@ fn runtime_config() -> DockerDriverRuntimeConfig { }), daemon_version: "28.0.0".to_string(), supports_gpu: false, + sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT, } } @@ -411,6 +412,23 @@ fn docker_resource_limits_applies_cpu_and_memory_limits() { assert_eq!(limits.memory_bytes, Some(2_147_483_648)); } +#[test] +fn docker_pids_limit_uses_driver_default_and_allows_runtime_inherit() { + assert_eq!( + docker_pids_limit(DEFAULT_SANDBOX_PIDS_LIMIT).unwrap(), + Some(DEFAULT_SANDBOX_PIDS_LIMIT) + ); + assert_eq!(docker_pids_limit(0).unwrap(), None); + assert!(docker_pids_limit(-1).is_err()); +} + +#[test] +fn container_create_body_sets_driver_owned_pids_limit() { + let body = build_container_create_body(&test_sandbox(), &runtime_config()).unwrap(); + let host_config = body.host_config.expect("host config"); + assert_eq!(host_config.pids_limit, Some(DEFAULT_SANDBOX_PIDS_LIMIT)); +} + #[test] fn build_environment_sets_docker_tls_paths() { let env = build_environment(&test_sandbox(), &runtime_config()); diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index dbf508c03..defb27b20 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -293,6 +293,7 @@ Podman resources after out-of-band container removal or label drift. | `OPENSHELL_NETWORK_NAME` | `--network-name` | `openshell` | Podman bridge network name. | | `OPENSHELL_SANDBOX_SSH_SOCKET_PATH` | `--sandbox-ssh-socket-path` | `/run/openshell/ssh.sock` | Supervisor Unix socket path in `PodmanComputeConfig`. | | `OPENSHELL_STOP_TIMEOUT` | `--stop-timeout` | `10` | Container stop timeout in seconds. | +| `OPENSHELL_SANDBOX_PIDS_LIMIT` | `--sandbox-pids-limit` | `2048` | Podman cgroup PID limit for sandbox containers. Set `0` to inherit Podman's runtime/default PID limit. | | `OPENSHELL_SUPERVISOR_IMAGE` | `--supervisor-image` | `ghcr.io/nvidia/openshell/supervisor:latest` through the gateway, required standalone | OCI image containing the supervisor binary. | | `OPENSHELL_PODMAN_TLS_CA` | `--podman-tls-ca` | unset | Host path to the CA certificate mounted for sandbox mTLS. | | `OPENSHELL_PODMAN_TLS_CERT` | `--podman-tls-cert` | unset | Host path to the client certificate mounted for sandbox mTLS. | diff --git a/crates/openshell-driver-podman/src/config.rs b/crates/openshell-driver-podman/src/config.rs index 270804882..49b161a9f 100644 --- a/crates/openshell-driver-podman/src/config.rs +++ b/crates/openshell-driver-podman/src/config.rs @@ -7,6 +7,7 @@ use std::str::FromStr; /// Default Podman bridge network name. pub const DEFAULT_NETWORK_NAME: &str = "openshell"; +pub const DEFAULT_SANDBOX_PIDS_LIMIT: i64 = 2048; /// Image pull policy for sandbox and supervisor images. /// @@ -106,6 +107,10 @@ pub struct PodmanComputeConfig { pub guest_tls_cert: Option, /// Host path to the client private key for sandbox mTLS. pub guest_tls_key: Option, + /// Container cgroup PID limit for Podman-managed sandboxes. + /// + /// Set to `0` to leave Podman's runtime/default PID limit unchanged. + pub sandbox_pids_limit: i64, } impl PodmanComputeConfig { @@ -149,6 +154,16 @@ impl PodmanComputeConfig { ))) } + /// Validate runtime resource-limit configuration. + pub fn validate_runtime_limits(&self) -> Result<(), crate::client::PodmanApiError> { + if self.sandbox_pids_limit < 0 { + return Err(crate::client::PodmanApiError::InvalidInput( + "sandbox_pids_limit must be zero or greater".to_string(), + )); + } + Ok(()) + } + /// Resolve the default socket path from the environment. /// /// - **macOS**: `$HOME/.local/share/containers/podman/machine/podman.sock` @@ -191,6 +206,7 @@ impl Default for PodmanComputeConfig { guest_tls_ca: None, guest_tls_cert: None, guest_tls_key: None, + sandbox_pids_limit: DEFAULT_SANDBOX_PIDS_LIMIT, } } } @@ -210,6 +226,7 @@ impl std::fmt::Debug for PodmanComputeConfig { .field("guest_tls_ca", &self.guest_tls_ca) .field("guest_tls_cert", &self.guest_tls_cert) .field("guest_tls_key", &self.guest_tls_key) + .field("sandbox_pids_limit", &self.sandbox_pids_limit) .finish() } } @@ -251,6 +268,23 @@ mod tests { }); } + #[test] + fn default_config_sets_driver_owned_pids_limit() { + let cfg = PodmanComputeConfig::default(); + assert_eq!(cfg.sandbox_pids_limit, DEFAULT_SANDBOX_PIDS_LIMIT); + assert!(cfg.validate_runtime_limits().is_ok()); + } + + #[test] + fn runtime_limit_validation_rejects_negative_pids_limit() { + let cfg = PodmanComputeConfig { + sandbox_pids_limit: -1, + ..PodmanComputeConfig::default() + }; + let err = cfg.validate_runtime_limits().unwrap_err(); + assert!(err.to_string().contains("sandbox_pids_limit")); + } + #[test] #[cfg(target_os = "macos")] fn default_socket_path_uses_podman_machine_on_macos() { diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index e73619756..c6b969e40 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -183,6 +183,8 @@ struct HealthConfig { struct ResourceLimits { cpu: CpuLimits, memory: MemoryLimits, + #[serde(rename = "PidsLimit", skip_serializing_if = "Option::is_none")] + pids_limit: Option, } #[derive(Serialize)] @@ -340,7 +342,7 @@ fn build_labels(sandbox: &DriverSandbox) -> BTreeMap { } /// Parse resource limits from the sandbox template, falling back to defaults. -fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits { +fn build_resource_limits(sandbox: &DriverSandbox, config: &PodmanComputeConfig) -> ResourceLimits { let resources = sandbox .spec .as_ref() @@ -363,9 +365,14 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits { period: DEFAULT_CPU_PERIOD, }, memory: MemoryLimits { limit: mem_bytes }, + pids_limit: podman_pids_limit(config.sandbox_pids_limit), } } +fn podman_pids_limit(value: i64) -> Option { + if value > 0 { Some(value) } else { None } +} + /// Build CDI GPU device list if GPU is requested. fn build_devices(sandbox: &DriverSandbox) -> Option> { let spec = sandbox.spec.as_ref()?; @@ -396,7 +403,7 @@ pub fn build_container_spec_with_token( let env = build_env(sandbox, config, image); let labels = build_labels(sandbox); - let resource_limits = build_resource_limits(sandbox); + let resource_limits = build_resource_limits(sandbox, config); let devices = build_devices(sandbox); // Network configuration -- always bridge mode. @@ -735,6 +742,20 @@ mod tests { spec["resource_limits"]["memory"]["limit"].as_u64(), Some(2 * 1024 * 1024 * 1024) ); + assert_eq!( + spec["resource_limits"]["PidsLimit"].as_i64(), + Some(crate::config::DEFAULT_SANDBOX_PIDS_LIMIT) + ); + } + + #[test] + fn container_spec_can_inherit_runtime_pids_limit() { + let sandbox = test_sandbox("test-id", "test-name"); + let mut config = test_config(); + config.sandbox_pids_limit = 0; + let spec = build_container_spec(&sandbox, &config); + + assert!(spec["resource_limits"].get("PidsLimit").is_none()); } #[test] diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index a8ac6d424..d3955b27f 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -146,6 +146,7 @@ impl PodmanComputeDriver { // (e.g. CA set but cert/key missing) are rejected early so operators // get a clear error instead of a silent fallback to plaintext HTTP. config.validate_tls_config()?; + config.validate_runtime_limits()?; let client = PodmanClient::new(config.socket_path.clone()); diff --git a/crates/openshell-driver-podman/src/main.rs b/crates/openshell-driver-podman/src/main.rs index 5a0227ef6..e275b2101 100644 --- a/crates/openshell-driver-podman/src/main.rs +++ b/crates/openshell-driver-podman/src/main.rs @@ -11,7 +11,9 @@ use tracing_subscriber::EnvFilter; use openshell_core::VERSION; use openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; -use openshell_driver_podman::config::{DEFAULT_NETWORK_NAME, ImagePullPolicy}; +use openshell_driver_podman::config::{ + DEFAULT_NETWORK_NAME, DEFAULT_SANDBOX_PIDS_LIMIT, ImagePullPolicy, +}; use openshell_driver_podman::{ComputeDriverService, PodmanComputeConfig, PodmanComputeDriver}; #[derive(Parser)] @@ -71,6 +73,15 @@ struct Args { #[arg(long, env = "OPENSHELL_STOP_TIMEOUT", default_value_t = DEFAULT_STOP_TIMEOUT_SECS)] stop_timeout: u32, + /// Container cgroup PID limit for sandbox containers. Set 0 to inherit + /// Podman's runtime/default PID limit. + #[arg( + long, + env = "OPENSHELL_SANDBOX_PIDS_LIMIT", + default_value_t = DEFAULT_SANDBOX_PIDS_LIMIT + )] + sandbox_pids_limit: i64, + /// OCI image containing the openshell-sandbox supervisor binary. #[arg(long, env = "OPENSHELL_SUPERVISOR_IMAGE")] supervisor_image: String, @@ -114,6 +125,7 @@ async fn main() -> Result<()> { guest_tls_ca: args.podman_tls_ca, guest_tls_cert: args.podman_tls_cert, guest_tls_key: args.podman_tls_key, + sandbox_pids_limit: args.sandbox_pids_limit, }) .await .into_diagnostic()?; diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index b83125f12..124d2b6ad 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -414,6 +414,16 @@ pub async fn run_sandbox( // Prepare filesystem: create and chown read_write directories prepare_filesystem(&policy)?; + #[cfg(target_os = "linux")] + { + let pid_limit_mode = if std::env::var_os("OPENSHELL_REQUIRE_RUNTIME_PID_LIMIT").is_some() { + process::RuntimePidLimitMode::Require + } else { + process::RuntimePidLimitMode::Warn + }; + process::check_runtime_pid_limit(pid_limit_mode)?; + } + // Initialize the agent-proposals feature flag. Default false until the // initial settings fetch (or the poll loop) tells us otherwise. The flag // gates the skill install, the policy.local route handler, and the L7 diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 9bbcfe66c..0fc657007 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -20,7 +20,7 @@ use std::os::unix::io::RawFd; use std::path::PathBuf; use std::process::Stdio; use tokio::process::{Child, Command}; -use tracing::debug; +use tracing::{debug, warn}; fn inject_provider_env(cmd: &mut Command, provider_env: &HashMap) { for (key, value) in provider_env { @@ -41,19 +41,6 @@ pub fn harden_child_process() -> Result<()> { ) .map_err(|e| miette::miette!("Failed to disable core dumps: {e}"))?; - // Limit process creation to prevent fork bombs. 512 processes per UID is - // sufficient for typical agent workloads (shell, compilers, language servers) - // while preventing runaway forking. Set as a hard limit so the sandbox user - // cannot raise it after privilege drop. - setrlimit( - Resource::Nproc, - Rlimit { - current: Some(512), - maximum: Some(512), - }, - ) - .map_err(|e| miette::miette!("Failed to set RLIMIT_NPROC: {e}"))?; - #[cfg(target_os = "linux")] { use rustix::process::{DumpableBehavior, set_dumpable_behavior}; @@ -64,6 +51,84 @@ pub fn harden_child_process() -> Result<()> { Ok(()) } +#[cfg(target_os = "linux")] +const CGROUP_PIDS_MAX_PATH: &str = "/sys/fs/cgroup/pids.max"; + +#[cfg(target_os = "linux")] +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RuntimePidLimitStatus { + Limited(u64), + Unlimited, + Unavailable(String), +} + +#[cfg(target_os = "linux")] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RuntimePidLimitMode { + Warn, + Require, +} + +#[cfg(target_os = "linux")] +pub fn check_runtime_pid_limit(mode: RuntimePidLimitMode) -> Result<()> { + check_runtime_pid_limit_status(runtime_pid_limit_status(), mode) +} + +#[cfg(target_os = "linux")] +fn check_runtime_pid_limit_status( + status: RuntimePidLimitStatus, + mode: RuntimePidLimitMode, +) -> Result<()> { + match status { + RuntimePidLimitStatus::Limited(limit) => { + debug!(pids_max = limit, "runtime PID limit detected"); + Ok(()) + } + RuntimePidLimitStatus::Unlimited => { + let message = "runtime cgroup pids.max is unlimited; configure the compute driver or container runtime to enforce a PID limit"; + if matches!(mode, RuntimePidLimitMode::Require) { + Err(miette::miette!(message)) + } else { + warn!("{message}"); + Ok(()) + } + } + RuntimePidLimitStatus::Unavailable(reason) => { + let message = format!( + "runtime cgroup pids.max is unavailable ({reason}); configure the compute driver or container runtime to enforce a PID limit" + ); + if matches!(mode, RuntimePidLimitMode::Require) { + Err(miette::miette!(message)) + } else { + warn!("{message}"); + Ok(()) + } + } + } +} + +#[cfg(target_os = "linux")] +fn runtime_pid_limit_status() -> RuntimePidLimitStatus { + match std::fs::read_to_string(CGROUP_PIDS_MAX_PATH) { + Ok(contents) => parse_pids_max(&contents), + Err(err) => RuntimePidLimitStatus::Unavailable(err.to_string()), + } +} + +#[cfg(target_os = "linux")] +fn parse_pids_max(contents: &str) -> RuntimePidLimitStatus { + let raw = contents.trim(); + if raw.eq_ignore_ascii_case("max") { + return RuntimePidLimitStatus::Unlimited; + } + match raw.parse::() { + Ok(limit) => RuntimePidLimitStatus::Limited(limit), + Err(err) => { + RuntimePidLimitStatus::Unavailable(format!("invalid pids.max value {raw:?}: {err}")) + } + } +} + /// Handle to a running process. pub struct ProcessHandle { child: Child, @@ -805,6 +870,52 @@ mod tests { assert_eq!(probe_hardened_child(dumpable_flag_probe), 0); } + #[test] + #[cfg(target_os = "linux")] + fn parse_pids_max_detects_limited_runtime() { + assert_eq!( + parse_pids_max("2048\n"), + RuntimePidLimitStatus::Limited(2048) + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn parse_pids_max_detects_unlimited_runtime() { + assert_eq!(parse_pids_max("max\n"), RuntimePidLimitStatus::Unlimited); + } + + #[test] + #[cfg(target_os = "linux")] + fn parse_pids_max_reports_invalid_values() { + let status = parse_pids_max("not-a-number\n"); + assert!(matches!(status, RuntimePidLimitStatus::Unavailable(_))); + } + + #[test] + #[cfg(target_os = "linux")] + fn pid_limit_require_mode_rejects_missing_guardrail_statuses() { + for status in [ + RuntimePidLimitStatus::Unlimited, + RuntimePidLimitStatus::Unavailable("missing".to_string()), + ] { + let result = check_runtime_pid_limit_status(status, RuntimePidLimitMode::Require); + assert!(result.is_err()); + } + } + + #[test] + #[cfg(target_os = "linux")] + fn pid_limit_warn_mode_accepts_missing_guardrail_statuses() { + for status in [ + RuntimePidLimitStatus::Unlimited, + RuntimePidLimitStatus::Unavailable("missing".to_string()), + ] { + let result = check_runtime_pid_limit_status(status, RuntimePidLimitMode::Warn); + assert!(result.is_ok()); + } + } + #[tokio::test] async fn inject_provider_env_sets_placeholder_values() { let mut cmd = Command::new("/usr/bin/env");