From b5b5bb1bfff7aa5ea7c280f6848e5809535b29ca Mon Sep 17 00:00:00 2001 From: Dmitry Kovba Date: Fri, 13 Mar 2026 21:01:24 -0700 Subject: [PATCH 1/2] Set `no_new_privs` for container processes --- Sources/Containerization/LinuxProcessConfiguration.swift | 3 +++ vminitd/Sources/LCShim/include/syscall.h | 2 ++ vminitd/Sources/LCShim/syscall.c | 2 ++ vminitd/Sources/vmexec/ExecCommand.swift | 5 +++++ vminitd/Sources/vmexec/RunCommand.swift | 5 +++++ vminitd/Sources/vmexec/vmexec.swift | 6 ++++++ 6 files changed, 23 insertions(+) diff --git a/Sources/Containerization/LinuxProcessConfiguration.swift b/Sources/Containerization/LinuxProcessConfiguration.swift index 2644ccde..8ca9a2e4 100644 --- a/Sources/Containerization/LinuxProcessConfiguration.swift +++ b/Sources/Containerization/LinuxProcessConfiguration.swift @@ -372,6 +372,8 @@ public struct LinuxProcessConfiguration: Sendable { public var user: ContainerizationOCI.User = .init() /// The rlimits for the container process. public var rlimits: [LinuxRLimit] = [] + /// Prevents the process from gaining additional privileges. + public var noNewPrivileges: Bool = false /// The Linux capabilities for the container process. public var capabilities: LinuxCapabilities = .allCapabilities /// Whether to allocate a pseudo terminal for the process. If you'd like interactive @@ -437,6 +439,7 @@ public struct LinuxProcessConfiguration: Sendable { args: self.arguments, cwd: self.workingDirectory, env: self.environmentVariables, + noNewPrivileges: self.noNewPrivileges, capabilities: self.capabilities.toOCI(), user: self.user, rlimits: self.rlimits.map { $0.toOCI() }, diff --git a/vminitd/Sources/LCShim/include/syscall.h b/vminitd/Sources/LCShim/include/syscall.h index 52f60da3..5e9f8a70 100644 --- a/vminitd/Sources/LCShim/include/syscall.h +++ b/vminitd/Sources/LCShim/include/syscall.h @@ -23,6 +23,8 @@ int CZ_pivot_root(const char *new_root, const char *put_old); int CZ_set_sub_reaper(); +int CZ_set_no_new_privs(); + #ifndef SYS_pidfd_open #define SYS_pidfd_open 434 #endif diff --git a/vminitd/Sources/LCShim/syscall.c b/vminitd/Sources/LCShim/syscall.c index 4137aa84..b289456f 100644 --- a/vminitd/Sources/LCShim/syscall.c +++ b/vminitd/Sources/LCShim/syscall.c @@ -26,6 +26,8 @@ int CZ_pivot_root(const char *new_root, const char *put_old) { int CZ_set_sub_reaper() { return prctl(PR_SET_CHILD_SUBREAPER, 1); } +int CZ_set_no_new_privs() { return prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); } + int CZ_pidfd_open(pid_t pid, unsigned int flags) { // Musl doesn't have pidfd_open. return syscall(SYS_pidfd_open, pid, flags); diff --git a/vminitd/Sources/vmexec/ExecCommand.swift b/vminitd/Sources/vmexec/ExecCommand.swift index e31b62e0..8dc97f13 100644 --- a/vminitd/Sources/vmexec/ExecCommand.swift +++ b/vminitd/Sources/vmexec/ExecCommand.swift @@ -145,6 +145,11 @@ struct ExecCommand: ParsableCommand { // Finish capabilities (after user change) try App.finishCapabilities(preparedCaps) + // Set no_new_privs (after user/capability changes). + if process.noNewPrivileges { + try App.setNoNewPrivs() + } + try App.exec(process: process, currentEnv: process.env) } else { // parent process // Send our child's pid to our parent before we exit. diff --git a/vminitd/Sources/vmexec/RunCommand.swift b/vminitd/Sources/vmexec/RunCommand.swift index 7759a802..9023ff13 100644 --- a/vminitd/Sources/vmexec/RunCommand.swift +++ b/vminitd/Sources/vmexec/RunCommand.swift @@ -179,6 +179,11 @@ struct RunCommand: ParsableCommand { // Finish capabilities (after user change) try App.finishCapabilities(preparedCaps) + // Set no_new_privs (after user/capability changes). + if process.noNewPrivileges { + try App.setNoNewPrivs() + } + // Finally execve the container process. try App.exec(process: process, currentEnv: process.env) } diff --git a/vminitd/Sources/vmexec/vmexec.swift b/vminitd/Sources/vmexec/vmexec.swift index 5643cf70..89b01877 100644 --- a/vminitd/Sources/vmexec/vmexec.swift +++ b/vminitd/Sources/vmexec/vmexec.swift @@ -106,6 +106,12 @@ extension App { fatalError("execvpe failed") } + static func setNoNewPrivs() throws { + guard CZ_set_no_new_privs() == 0 else { + throw App.Errno(stage: "prctl(PR_SET_NO_NEW_PRIVS)") + } + } + static func setPermissions(user: ContainerizationOCI.User) throws { if user.additionalGids.count > 0 { guard setgroups(user.additionalGids.count, user.additionalGids) == 0 else { From e3d8a4ef71e2796d323db96440a82394b3e9fca7 Mon Sep 17 00:00:00 2001 From: Dmitry Kovba Date: Fri, 13 Mar 2026 21:01:53 -0700 Subject: [PATCH 2/2] Use consistent comments --- vminitd/Sources/vmexec/ExecCommand.swift | 6 +++--- vminitd/Sources/vmexec/RunCommand.swift | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vminitd/Sources/vmexec/ExecCommand.swift b/vminitd/Sources/vmexec/ExecCommand.swift index 8dc97f13..b1b2feef 100644 --- a/vminitd/Sources/vmexec/ExecCommand.swift +++ b/vminitd/Sources/vmexec/ExecCommand.swift @@ -133,16 +133,16 @@ struct ExecCommand: ParsableCommand { try App.applyCloseExecOnFDs() try App.setRLimits(rlimits: process.rlimits) - // Prepare capabilities (before user change) + // Prepare capabilities (before user change). let preparedCaps = try App.prepareCapabilities(capabilities: process.capabilities ?? ContainerizationOCI.LinuxCapabilities()) // Change stdio to be owned by the requested user. try App.fixStdioPerms(user: process.user) - // Set uid, gid, and supplementary groups + // Set uid, gid, and supplementary groups. try App.setPermissions(user: process.user) - // Finish capabilities (after user change) + // Finish capabilities (after user change). try App.finishCapabilities(preparedCaps) // Set no_new_privs (after user/capability changes). diff --git a/vminitd/Sources/vmexec/RunCommand.swift b/vminitd/Sources/vmexec/RunCommand.swift index 9023ff13..1e83da5c 100644 --- a/vminitd/Sources/vmexec/RunCommand.swift +++ b/vminitd/Sources/vmexec/RunCommand.swift @@ -167,7 +167,7 @@ struct RunCommand: ParsableCommand { try App.setRLimits(rlimits: process.rlimits) - // Prepare capabilities (before user change) + // Prepare capabilities (before user change). let preparedCaps = try App.prepareCapabilities(capabilities: process.capabilities ?? ContainerizationOCI.LinuxCapabilities()) // Change stdio to be owned by the requested user. @@ -176,7 +176,7 @@ struct RunCommand: ParsableCommand { // Set uid, gid, and supplementary groups. try App.setPermissions(user: process.user) - // Finish capabilities (after user change) + // Finish capabilities (after user change). try App.finishCapabilities(preparedCaps) // Set no_new_privs (after user/capability changes).