From 88cee8930f41af4d5251f6a8feb6418dd87e15b4 Mon Sep 17 00:00:00 2001
From: Danny Canter
Date: Fri, 13 Mar 2026 05:51:14 -0700
Subject: [PATCH] LinuxContainer/LinuxPod: Add support for NoNewPrivileges

Basically just a direct mapping to the prctl/OCI spec field.
---
Review notes (this section is ignored by `git am`):
- CZ_prctl_set_no_new_privs is now declared and defined with an explicit
  (void) prototype instead of an empty, non-prototype parameter list.
- The doc comment on `noNewPrivileges` uses the kernel's attribute name,
  no_new_privs.
- Doc comments were added to the C shim and to App.setNoNewPrivileges; the
  diffstat and hunk line counts below were updated to match.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch, and the blob hashes on the `index` lines predate these
  edits. Apply with `git apply --ignore-whitespace` if context does not match.

 .../LinuxProcessConfiguration.swift           |   7 ++
 Sources/Integration/ContainerTests.swift      | 110 ++++++++++++++++++
 Sources/Integration/Suite.swift               |   3 +
 vminitd/Sources/LCShim/include/syscall.h      |   3 +
 vminitd/Sources/LCShim/syscall.c              |   5 +
 vminitd/Sources/vmexec/ExecCommand.swift      |   3 +
 vminitd/Sources/vmexec/RunCommand.swift       |   3 +
 vminitd/Sources/vmexec/vmexec.swift           |  11 ++
 8 files changed, 145 insertions(+)

diff --git a/Sources/Containerization/LinuxProcessConfiguration.swift b/Sources/Containerization/LinuxProcessConfiguration.swift
index 2644ccde..1eef8b0a 100644
--- a/Sources/Containerization/LinuxProcessConfiguration.swift
+++ b/Sources/Containerization/LinuxProcessConfiguration.swift
@@ -372,6 +372,10 @@ public struct LinuxProcessConfiguration: Sendable {
     public var user: ContainerizationOCI.User = .init()
     /// The rlimits for the container process.
     public var rlimits: [LinuxRLimit] = []
+    /// Whether to set the no_new_privs bit on the container process. When true, the
+    /// process and its children cannot gain additional privileges via setuid/setgid binaries
+    /// or file capabilities.
+    public var noNewPrivileges: Bool = false
     /// The Linux capabilities for the container process.
     public var capabilities: LinuxCapabilities = .allCapabilities
     /// Whether to allocate a pseudo terminal for the process. If you'd like interactive
@@ -393,6 +397,7 @@ public struct LinuxProcessConfiguration: Sendable {
         workingDirectory: String = "/",
         user: ContainerizationOCI.User = .init(),
         rlimits: [LinuxRLimit] = [],
+        noNewPrivileges: Bool = false,
         capabilities: LinuxCapabilities = .allCapabilities,
         terminal: Bool = false,
         stdin: ReaderStream? = nil,
@@ -404,6 +409,7 @@ public struct LinuxProcessConfiguration: Sendable {
         self.workingDirectory = workingDirectory
         self.user = user
         self.rlimits = rlimits
+        self.noNewPrivileges = noNewPrivileges
         self.capabilities = capabilities
         self.terminal = terminal
         self.stdin = stdin
@@ -437,6 +443,7 @@ public struct LinuxProcessConfiguration: Sendable {
             args: self.arguments,
             cwd: self.workingDirectory,
             env: self.environmentVariables,
+            noNewPrivileges: self.noNewPrivileges,
             capabilities: self.capabilities.toOCI(),
             user: self.user,
             rlimits: self.rlimits.map { $0.toOCI() },
diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift
index 302d4507..2dd978b1 100644
--- a/Sources/Integration/ContainerTests.swift
+++ b/Sources/Integration/ContainerTests.swift
@@ -4153,4 +4153,114 @@ extension IntegrationSuite {
             throw error
         }
     }
+
+    func testNoNewPrivileges() async throws {
+        let id = "test-no-new-privileges"
+
+        let bs = try await bootstrap(id)
+        let buffer = BufferWriter()
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["cat", "/proc/self/status"]
+            config.process.noNewPrivileges = true
+            config.process.stdout = buffer
+            config.bootLog = bs.bootLog
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        guard status.exitCode == 0 else {
+            throw IntegrationError.assert(msg: "process status \(status) != 0")
+        }
+
+        guard let output = String(data: buffer.data, encoding: .utf8) else {
+            throw IntegrationError.assert(msg: "failed to convert stdout to UTF8")
+        }
+
+        // /proc/self/status contains "NoNewPrivs:\t1" when the bit is set
+        guard output.contains("NoNewPrivs:\t1") else {
+            throw IntegrationError.assert(msg: "expected NoNewPrivs to be 1, got: \(output)")
+        }
+    }
+
+    func testNoNewPrivilegesDisabled() async throws {
+        let id = "test-no-new-privileges-disabled"
+
+        let bs = try await bootstrap(id)
+        let buffer = BufferWriter()
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["cat", "/proc/self/status"]
+            // noNewPrivileges defaults to false
+            config.process.stdout = buffer
+            config.bootLog = bs.bootLog
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        guard status.exitCode == 0 else {
+            throw IntegrationError.assert(msg: "process status \(status) != 0")
+        }
+
+        guard let output = String(data: buffer.data, encoding: .utf8) else {
+            throw IntegrationError.assert(msg: "failed to convert stdout to UTF8")
+        }
+
+        // When noNewPrivileges is not set, NoNewPrivs should be 0
+        guard output.contains("NoNewPrivs:\t0") else {
+            throw IntegrationError.assert(msg: "expected NoNewPrivs to be 0, got: \(output)")
+        }
+    }
+
+    func testNoNewPrivilegesExec() async throws {
+        let id = "test-no-new-privileges-exec"
+
+        let bs = try await bootstrap(id)
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["sleep", "100"]
+            config.bootLog = bs.bootLog
+        }
+
+        do {
+            try await container.create()
+            try await container.start()
+
+            // Exec a process with noNewPrivileges set
+            let buffer = BufferWriter()
+            let exec = try await container.exec("nnp-exec") { config in
+                config.arguments = ["cat", "/proc/self/status"]
+                config.noNewPrivileges = true
+                config.stdout = buffer
+            }
+
+            try await exec.start()
+            let status = try await exec.wait()
+            try await exec.delete()
+
+            guard status.exitCode == 0 else {
+                throw IntegrationError.assert(msg: "exec status \(status) != 0")
+            }
+
+            guard let output = String(data: buffer.data, encoding: .utf8) else {
+                throw IntegrationError.assert(msg: "failed to convert stdout to UTF8")
+            }
+
+            guard output.contains("NoNewPrivs:\t1") else {
+                throw IntegrationError.assert(msg: "expected NoNewPrivs to be 1 in exec, got: \(output)")
+            }
+
+            try await container.kill(SIGKILL)
+            try await container.wait()
+            try await container.stop()
+        } catch {
+            try? await container.stop()
+            throw error
+        }
+    }
 }
diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift
index 670f14b3..9e18c8ae 100644
--- a/Sources/Integration/Suite.swift
+++ b/Sources/Integration/Suite.swift
@@ -371,6 +371,9 @@ struct IntegrationSuite: AsyncParsableCommand {
             Test("container useInit with stdin", testUseInitWithStdin),
             Test("container sysctl", testSysctl),
             Test("container sysctl multiple", testSysctlMultiple),
+            Test("container noNewPrivileges", testNoNewPrivileges),
+            Test("container noNewPrivileges disabled", testNoNewPrivilegesDisabled),
+            Test("container noNewPrivileges exec", testNoNewPrivilegesExec),
 
             // Pods
             Test("pod single container", testPodSingleContainer),
diff --git a/vminitd/Sources/LCShim/include/syscall.h b/vminitd/Sources/LCShim/include/syscall.h
index 52f60da3..f30e3ab2 100644
--- a/vminitd/Sources/LCShim/include/syscall.h
+++ b/vminitd/Sources/LCShim/include/syscall.h
@@ -35,4 +35,7 @@ int CZ_pidfd_open(pid_t pid, unsigned int flags);
 
 int CZ_pidfd_getfd(int pidfd, int targetfd, unsigned int flags);
 
+// Sets the calling thread's no_new_privs attribute; returns prctl's result (0 on success, -1 with errno set).
+int CZ_prctl_set_no_new_privs(void);
+
 #endif
diff --git a/vminitd/Sources/LCShim/syscall.c b/vminitd/Sources/LCShim/syscall.c
index 4137aa84..4070196c 100644
--- a/vminitd/Sources/LCShim/syscall.c
+++ b/vminitd/Sources/LCShim/syscall.c
@@ -35,3 +35,8 @@ int CZ_pidfd_getfd(int pidfd, int targetfd, unsigned int flags) {
   // Musl doesn't have pidfd_getfd.
   return syscall(SYS_pidfd_getfd, pidfd, targetfd, flags);
 }
+
+// PR_SET_NO_NEW_PRIVS requires arg2 == 1 and the remaining arguments to be 0.
+int CZ_prctl_set_no_new_privs(void) {
+  return prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+}
diff --git a/vminitd/Sources/vmexec/ExecCommand.swift b/vminitd/Sources/vmexec/ExecCommand.swift
index e31b62e0..a3f0dd23 100644
--- a/vminitd/Sources/vmexec/ExecCommand.swift
+++ b/vminitd/Sources/vmexec/ExecCommand.swift
@@ -145,6 +145,9 @@ struct ExecCommand: ParsableCommand {
             // Finish capabilities (after user change)
             try App.finishCapabilities(preparedCaps)
 
+            // Set no_new_privs if requested by the OCI spec.
+            try App.setNoNewPrivileges(process: process)
+
             try App.exec(process: process, currentEnv: process.env)
         } else {  // parent process
             // Send our child's pid to our parent before we exit.
diff --git a/vminitd/Sources/vmexec/RunCommand.swift b/vminitd/Sources/vmexec/RunCommand.swift
index a96f9f06..ba070340 100644
--- a/vminitd/Sources/vmexec/RunCommand.swift
+++ b/vminitd/Sources/vmexec/RunCommand.swift
@@ -196,6 +196,9 @@ struct RunCommand: ParsableCommand {
         // Finish capabilities (after user change)
         try App.finishCapabilities(preparedCaps)
 
+        // Set no_new_privs if requested by the OCI spec.
+        try App.setNoNewPrivileges(process: process)
+
         // Finally execve the container process.
         try App.exec(process: process, currentEnv: process.env)
     }
diff --git a/vminitd/Sources/vmexec/vmexec.swift b/vminitd/Sources/vmexec/vmexec.swift
index 5643cf70..dd716d26 100644
--- a/vminitd/Sources/vmexec/vmexec.swift
+++ b/vminitd/Sources/vmexec/vmexec.swift
@@ -233,6 +233,17 @@ extension App {
         try? caps.apply(kind: [.ambs])
     }
 
+    /// Sets the calling thread's no_new_privs attribute when the OCI process spec requests it.
+    ///
+    /// - Parameter process: The OCI process spec; nothing is done unless `noNewPrivileges` is true.
+    /// - Throws: The error produced by `App.Errno` if the underlying prctl call fails.
+    static func setNoNewPrivileges(process: ContainerizationOCI.Process) throws {
+        guard process.noNewPrivileges else { return }
+        guard CZ_prctl_set_no_new_privs() == 0 else {
+            throw App.Errno(stage: "prctl(PR_SET_NO_NEW_PRIVS)")
+        }
+    }
+
     static func Errno(stage: String, info: String = "") -> ContainerizationError {
         let posix = POSIXError(.init(rawValue: errno)!, userInfo: ["stage": stage])
         return ContainerizationError(.internalError, message: "\(info) \(String(describing: posix))")