diff --git a/Sources/Containerization/VZVirtualMachine+Helpers.swift b/Sources/Containerization/VZVirtualMachine+Helpers.swift index 2cbadb1c..816dad49 100644 --- a/Sources/Containerization/VZVirtualMachine+Helpers.swift +++ b/Sources/Containerization/VZVirtualMachine+Helpers.swift @@ -122,13 +122,15 @@ extension VZVirtualMachine { } extension VZVirtualMachine { - func waitForAgent(queue: DispatchQueue) async throws -> FileHandle { + func waitForAgent(queue: DispatchQueue) async throws -> (FileHandle, VsockTransport) { let agentConnectionRetryCount: Int = 200 let agentConnectionSleepDuration: Duration = .milliseconds(20) for _ in 0...agentConnectionRetryCount { do { - return try await self.connect(queue: queue, port: Vminitd.port).dupHandle() + let conn = try await self.connect(queue: queue, port: Vminitd.port) + let handle = try conn.dupFileDescriptor() + return (handle, VsockTransport(conn)) } catch { try await Task.sleep(for: agentConnectionSleepDuration) continue @@ -139,6 +141,12 @@ extension VZVirtualMachine { } extension VZVirtioSocketConnection { + /// Duplicates the file descriptor and immediately closes the connection. + /// + /// Only safe when the returned fd is used synchronously before any + /// suspension point. For deferred use (e.g., gRPC/NIO), use + /// ``dupFileDescriptor()`` and keep the connection alive via + /// ``VsockTransport``. func dupHandle() throws -> FileHandle { let fd = dup(self.fileDescriptor) if fd == -1 { @@ -147,6 +155,20 @@ extension VZVirtioSocketConnection { self.close() return FileHandle(fileDescriptor: fd, closeOnDealloc: false) } + + /// Duplicates the connection's file descriptor without closing the connection. + /// + /// The caller must keep the `VZVirtioSocketConnection` alive until the dup'd + /// descriptor is no longer needed. The Virtualization framework tears down the + /// vsock endpoint when the connection is closed, which invalidates dup'd + /// descriptors. + func dupFileDescriptor() throws -> FileHandle { + let fd = dup(self.fileDescriptor) + if fd == -1 { + throw POSIXError.fromErrno() + } + return FileHandle(fileDescriptor: fd, closeOnDealloc: false) + } } #endif diff --git a/Sources/Containerization/VZVirtualMachineInstance.swift b/Sources/Containerization/VZVirtualMachineInstance.swift index 9e429c5a..fe6d93b5 100644 --- a/Sources/Containerization/VZVirtualMachineInstance.swift +++ b/Sources/Containerization/VZVirtualMachineInstance.swift @@ -125,10 +125,8 @@ extension VZVirtualMachineInstance: VirtualMachineInstance { try await self.vm.start(queue: self.queue) - let agent = Vminitd( - connection: try await self.vm.waitForAgent(queue: self.queue), - group: self.group - ) + let (handle, transport) = try await self.vm.waitForAgent(queue: self.queue) + let agent = Vminitd(connection: handle, transport: transport, group: self.group) do { if self.config.rosetta { @@ -189,9 +187,8 @@ extension VZVirtualMachineInstance: VirtualMachineInstance { queue: queue, port: Vminitd.port ) - let handle = try conn.dupHandle() - let agent = Vminitd(connection: handle, group: self.group) - return agent + let handle = try conn.dupFileDescriptor() + return Vminitd(connection: handle, transport: VsockTransport(conn), group: self.group) } catch { if let err = error as? ContainerizationError { throw err diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index 2a177d66..62eec79c 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -33,16 +33,33 @@ public struct Vminitd: Sendable { let client: Client + /// Retains the underlying vsock connection to keep the file descriptor + /// valid for the gRPC client's lifetime. The Virtualization framework + /// tears down the vsock endpoint when the connection is closed, which + /// invalidates dup'd descriptors. Must remain open until the gRPC + /// channel is shut down. + private let transport: VsockTransport? + public init(client: Client) { self.client = client + self.transport = nil } public init(connection: FileHandle, group: EventLoopGroup) { self.client = .init(connection: connection, group: group) + self.transport = nil + } + + init(connection: FileHandle, transport: VsockTransport, group: EventLoopGroup) { + self.client = .init(connection: connection, group: group) + self.transport = transport } /// Close the connection to the guest agent. public func close() async throws { + // Shut down the gRPC channel first (NIO closes the dup'd fd), + // then close the vsock endpoint so the guest sees EOF immediately. + defer { transport?.close() } try await client.close() } } diff --git a/Sources/Containerization/VsockListener.swift b/Sources/Containerization/VsockListener.swift index 7a7b36fa..a7b44eb6 100644 --- a/Sources/Containerization/VsockListener.swift +++ b/Sources/Containerization/VsockListener.swift @@ -52,6 +52,13 @@ public final class VsockListener: NSObject, Sendable, AsyncSequence { #if os(macOS) extension VsockListener: VZVirtioSocketListenerDelegate { + /// Accepts a new vsock connection by dup'ing its fd and closing the original. + /// + /// The dup'd fd is yielded into the `AsyncStream` for immediate consumption. + /// Callers must use the `FileHandle` before any suspension point — the + /// Virtualization framework tears down the vsock endpoint when the connection + /// is closed, which can invalidate dup'd descriptors if the underlying kernel + /// object is reclaimed. For deferred use (e.g., gRPC/NIO), see `VsockTransport`. public func listener( _: VZVirtioSocketListener, shouldAcceptNewConnection conn: VZVirtioSocketConnection, from _: VZVirtioSocketDevice diff --git a/Sources/Containerization/VsockTransport.swift b/Sources/Containerization/VsockTransport.swift new file mode 100644 index 00000000..78520485 --- /dev/null +++ b/Sources/Containerization/VsockTransport.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2025-2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(macOS) +import Foundation +import Virtualization + +/// Manages the lifecycle of a VZVirtioSocketConnection for use as a gRPC transport. +/// +/// When a vsock connection's file descriptor is dup'd and handed to gRPC/NIO, +/// the original VZVirtioSocketConnection must remain open. The Virtualization +/// framework tears down the host-to-guest vsock mapping when the connection is +/// closed, which invalidates dup'd descriptors. This wrapper keeps the +/// connection alive and provides explicit close semantics. +/// +/// Uses `@unchecked Sendable` because VZVirtioSocketConnection is not Sendable, +/// which also prevents using Mutex (its init requires a `sending` parameter that +/// conflicts with the non-Sendable connection at async call sites). +final class VsockTransport: @unchecked Sendable { + private var connection: VZVirtioSocketConnection? + private let lock = NSLock() + + init(_ connection: VZVirtioSocketConnection) { + self.connection = connection + } + + /// Closes the underlying vsock connection, tearing down the host-side endpoint. + func close() { + lock.lock() + defer { lock.unlock() } + connection?.close() + connection = nil + } + + deinit { + // No lock needed: deinit runs only after all strong references are + // released, so no concurrent close() call is possible. + connection?.close() + } +} + +#endif diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index cf6af0dd..4882b861 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -3969,6 +3969,76 @@ extension IntegrationSuite { } } + /// Exercises the dialAgent() → gRPC RPC path that previously crashed with + /// EBADF when the VZVirtioSocketConnection was closed before the gRPC + /// client made its first call. + /// + /// Each exec() call creates a new vsock connection via dialAgent(). The + /// gRPC ClientConnection defers NIO channel creation until the first RPC + /// (createProcess). A delay between exec() and start() widens the window + /// where the fd must remain valid — if the VZVirtioSocketConnection is + /// closed prematurely, the fd may be invalidated by the time NIO tries + /// fcntl(F_SETNOSIGPIPE), causing a precondition failure. + /// + /// The same VsockTransport fix also applies to the waitForAgent() startup + /// path (where the first RPC is setTime via TimeSyncer). That path is + /// implicitly exercised by every integration test that boots a container, + /// but isn't stress-tested with an artificial delay here because the timing + /// depends on VM boot and Rosetta setup, which aren't controllable. + func testExecDeferredConnectionStability() async throws { + let id = "test-exec-deferred-connection-stability" + + let bs = try await bootstrap(id) + let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in + config.process.arguments = ["/bin/sleep", "1000"] + config.bootLog = bs.bootLog + } + + do { + try await container.create() + try await container.start() + + // Run multiple sequential exec calls with delays between creating the + // gRPC connection (exec) and making the first RPC (start). This is the + // pattern that triggered the EBADF crash: the fd was dup'd, the + // VZVirtioSocketConnection was closed, and by the time NIO tried to + // create the channel the fd was invalid. + for i in 0..<10 { + let buffer = BufferWriter() + let exec = try await container.exec("deferred-\(i)") { config in + config.arguments = ["/bin/echo", "exec-\(i)"] + config.stdout = buffer + } + + // Delay between exec() (which calls dialAgent/creates gRPC connection) + // and start() (which triggers the first RPC/NIO channel creation). + try await Task.sleep(for: .milliseconds(100)) + + try await exec.start() + let status = try await exec.wait() + try await exec.delete() + + guard status.exitCode == 0 else { + throw IntegrationError.assert(msg: "exec deferred-\(i) status \(status) != 0") + } + + guard let output = String(data: buffer.data, encoding: .utf8) else { + throw IntegrationError.assert(msg: "failed to read output from deferred-\(i)") + } + guard output.trimmingCharacters(in: .whitespacesAndNewlines) == "exec-\(i)" else { + throw IntegrationError.assert(msg: "deferred-\(i) output mismatch: \(output)") + } + } + + try await container.kill(SIGKILL) + try await container.wait() + try await container.stop() + } catch { + try? await container.stop() + throw error + } + } + @available(macOS 26.0, *) func testNetworkingDisabled() async throws { let id = "test-networking-disabled" diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 02f26354..8b07747d 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -369,6 +369,7 @@ struct IntegrationSuite: AsyncParsableCommand { Test("container useInit zombie reaping", testUseInitZombieReaping), Test("container useInit with terminal", testUseInitWithTerminal), Test("container useInit with stdin", testUseInitWithStdin), + Test("exec deferred connection stability", testExecDeferredConnectionStability), // Pods Test("pod single container", testPodSingleContainer), diff --git a/Tests/ContainerizationTests/VsockTransportTests.swift b/Tests/ContainerizationTests/VsockTransportTests.swift new file mode 100644 index 00000000..7f2ffaef --- /dev/null +++ b/Tests/ContainerizationTests/VsockTransportTests.swift @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2025-2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(macOS) + +import Darwin +import Foundation +import Testing + +@testable import Containerization + +/// Tests for the VsockTransport fd lifecycle fix. +/// +/// The Virtualization framework tears down the vsock endpoint when a +/// VZVirtioSocketConnection is closed, invalidating dup'd descriptors. +/// The fix keeps the connection alive via VsockTransport until the gRPC +/// channel is shut down. +/// +/// These tests use Unix socket pairs to verify: +/// 1. A dup'd fd is fully functional when the original is kept alive. +/// 2. The specific fcntl call that triggers the NIO crash (F_SETNOSIGPIPE) +/// works on the dup'd fd. +/// 3. The correct teardown order (close dup'd fd first, then original) +/// preserves the connection for the peer until the original is closed. +@Suite("VsockTransport tests") +struct VsockTransportTests { + + /// Creates a connected Unix socket pair. Returns (fd0, fd1). + private func makeSocketPair() throws -> (Int32, Int32) { + var fds: [Int32] = [0, 0] + let result = socketpair(AF_UNIX, SOCK_STREAM, 0, &fds) + try #require(result == 0, "socketpair should succeed") + return (fds[0], fds[1]) + } + + // MARK: - fd lifecycle tests + + /// Verifies that F_SETNOSIGPIPE (the exact fcntl call where NIO crashes) + /// succeeds on a dup'd fd when the original is kept alive. + @Test func dupdDescriptorSupportsFcntlWhenOriginalAlive() throws { + let (fd0, fd1) = try makeSocketPair() + defer { + close(fd0) + close(fd1) + } + + let dupdFd = dup(fd0) + try #require(dupdFd != -1) + defer { close(dupdFd) } + + // This is the exact operation that triggers the NIO EBADF crash + // when the underlying vsock endpoint has been torn down. + let result = fcntl(dupdFd, F_SETNOSIGPIPE, 1) + #expect(result == 0, "F_SETNOSIGPIPE should succeed on dup'd fd when original is alive") + } + + /// Verifies that a dup'd fd can read data written by the peer when the + /// original fd is kept alive. + @Test func dupdDescriptorCanReadWhenOriginalAlive() throws { + let (fd0, fd1) = try makeSocketPair() + defer { + close(fd0) + close(fd1) + } + + let dupdFd = dup(fd0) + try #require(dupdFd != -1) + defer { close(dupdFd) } + + // Peer writes data. + let message: [UInt8] = [1, 2, 3] + let writeResult = message.withUnsafeBufferPointer { buf in + write(fd1, buf.baseAddress, buf.count) + } + try #require(writeResult == 3) + + // Dup'd fd can read because the original keeps the connection alive. + var readBuf = [UInt8](repeating: 0, count: 3) + let readResult = readBuf.withUnsafeMutableBufferPointer { buf in + read(dupdFd, buf.baseAddress, buf.count) + } + #expect(readResult == 3) + #expect(readBuf == [1, 2, 3]) + } + + /// Verifies the correct teardown order: closing the dup'd fd first (gRPC + /// channel shutdown) does not break the connection for the peer, because + /// the original fd (transport) is still alive. + @Test func peerCanWriteAfterDupdFdClosedWhileOriginalAlive() throws { + let (fd0, fd1) = try makeSocketPair() + defer { + close(fd0) + close(fd1) + } + + let dupdFd = dup(fd0) + try #require(dupdFd != -1) + + // Close the dup'd fd (simulates gRPC channel shutdown). + close(dupdFd) + + // The peer can still write because the original fd keeps the + // connection alive. This matters for orderly shutdown: the guest + // doesn't see an unexpected EOF while the host is still tearing + // down the gRPC channel. + let message: [UInt8] = [42] + let writeResult = message.withUnsafeBufferPointer { buf in + write(fd1, buf.baseAddress, buf.count) + } + #expect(writeResult == 1, "Peer can still write after dup'd fd is closed") + + // Read from the original to confirm data arrived. + var readBuf = [UInt8](repeating: 0, count: 1) + let readResult = readBuf.withUnsafeMutableBufferPointer { buf in + read(fd0, buf.baseAddress, buf.count) + } + #expect(readResult == 1) + #expect(readBuf == [42]) + } + + /// Verifies that after both the dup'd fd and the original are closed, + /// the peer sees EOF (read returns 0). + @Test func peerSeesEOFAfterBothDescriptorsClosed() throws { + let (fd0, fd1) = try makeSocketPair() + defer { close(fd1) } + + let dupdFd = dup(fd0) + try #require(dupdFd != -1) + + // Close dup'd fd first (gRPC shutdown), then original (transport.close()). + close(dupdFd) + close(fd0) + + // Peer should see EOF. + var readBuf = [UInt8](repeating: 0, count: 1) + let readResult = readBuf.withUnsafeMutableBufferPointer { buf in + read(fd1, buf.baseAddress, buf.count) + } + #expect(readResult == 0, "Peer should see EOF after both descriptors are closed") + } +} + +#endif