diff --git a/src/core/guest.c b/src/core/guest.c
index 01422b8..d4e455e 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -441,12 +441,13 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
 int guest_init_from_shm(guest_t *g,
                         int shm_fd,
                         uint64_t size,
-                        uint32_t ipa_bits)
+                        uint32_t ipa_bits,
+                        bool retain_shared)
 {
     uint64_t t0;
 
     memset(g, 0, sizeof(*g));
-    g->shm_fd = -1; /* Child does not own the shm */
+    g->shm_fd = -1; /* Child does not own the shm unless retain_shared */
     g->ipa_base = GUEST_IPA_BASE;
     g->elf_load_min = ELF_DEFAULT_BASE;
     g->brk_base = BRK_BASE_DEFAULT;
@@ -471,13 +472,21 @@ int guest_init_from_shm(guest_t *g,
     }
     g->pt_pool_next = g->pt_pool_base;
 
-    /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see the
-     * parent's frozen snapshot; writes are private to this process. macOS CoW
-     * is page-granular: only modified pages are duplicated.
+    /* Two mapping modes:
+     *   retain_shared: shm_fd is an independent APFS clone of the parent's
+     *     memory (already isolated from the parent). Map MAP_SHARED so the
+     *     child's writes land in the clone file, then keep the fd so the child
+     *     can fclonefileat it for its own nested CoW fork. guest_destroy closes
+     *     it.
+     *   otherwise: shm_fd may be the parent's live fd (clonefile fallback). Map
+     *     MAP_PRIVATE so writes stay private to this process, then close the
+     *     fd. macOS CoW is page-granular either way: only modified pages are
+     *     duplicated.
      */
+    int map_flags = retain_shared ? MAP_SHARED : MAP_PRIVATE;
     t0 = startup_trace_now_ns();
     g->host_base =
-        mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
+        mmap(NULL, size, PROT_READ | PROT_WRITE, map_flags, shm_fd, 0);
     startup_trace_step("shm_mmap", t0);
     if (g->host_base == MAP_FAILED) {
         perror("guest: mmap shm");
@@ -486,8 +495,10 @@ int guest_init_from_shm(guest_t *g,
         return -1;
     }
 
-    /* Close the shm fd; the mapping keeps the pages alive */
-    close(shm_fd);
+    if (retain_shared)
+        g->shm_fd = shm_fd; /* Child owns the clone; guest_destroy closes it */
+    else
+        close(shm_fd); /* MAP_PRIVATE mapping keeps the pages alive */
 
     /* Create HVF VM with the same IPA width as the parent */
     hv_return_t ret = HV_ERROR;
@@ -506,6 +517,10 @@ int guest_init_from_shm(guest_t *g,
         log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
         munmap(g->host_base, size);
         g->host_base = NULL;
+        if (g->shm_fd >= 0) {
+            close(g->shm_fd);
+            g->shm_fd = -1;
+        }
         return -1;
     }
 
@@ -518,6 +533,10 @@ int guest_init_from_shm(guest_t *g,
         hv_vm_destroy();
         munmap(g->host_base, size);
         g->host_base = NULL;
+        if (g->shm_fd >= 0) {
+            close(g->shm_fd);
+            g->shm_fd = -1;
+        }
         return -1;
     }
 
diff --git a/src/core/guest.h b/src/core/guest.h
index 133088e..58ca8b1 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -774,16 +774,22 @@ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr)
  */
 int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits);
 
-/* Initialize guest from a POSIX shared memory fd (CoW fork path). Maps shm_fd
- * MAP_PRIVATE (copy-on-write), creates HVF VM, maps to hypervisor. The child
- * gets an instant CoW snapshot of parent's guest memory without copying. shm_fd
- * is closed after mapping.
+/* Initialize guest from a shared memory fd (CoW fork path). Creates the HVF VM
+ * and maps the fd to the hypervisor. The child gets an instant CoW snapshot of
+ * the parent's guest memory without copying.
+ *
+ * retain_shared selects the mapping: when true, shm_fd is an independent APFS
+ * clone, so it is mapped MAP_SHARED and retained in g->shm_fd (guest_destroy
+ * closes it) so the child can fclonefileat it for nested CoW fork. When false,
+ * shm_fd may be the parent's live fd, so it is mapped MAP_PRIVATE and closed
+ * after mapping. This function takes ownership of shm_fd on every path.
  * Returns 0 on success, -1 on failure.
  */
 int guest_init_from_shm(guest_t *g,
                         int shm_fd,
                         uint64_t size,
-                        uint32_t ipa_bits);
+                        uint32_t ipa_bits,
+                        bool retain_shared);
 
 /* Tear down VM and free guest memory. */
 void guest_destroy(guest_t *g);
diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h
index 18f3443..84e73a5 100644
--- a/src/runtime/fork-state.h
+++ b/src/runtime/fork-state.h
@@ -60,6 +60,22 @@ typedef struct {
     uint64_t rosetta_entry;
     uint64_t kbuf_gpa;
     uint64_t ttbr1;
+    /* Clone TID-sync state for the fork path. glibc's fork wrapper passes
+     * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child writes its new TID
+     * into the TCB and clears it on exit. The posix_spawn child has no access
+     * to the original clone() arguments, so the parent forwards them here:
+     * clone_flags carries the CHILD_SETTID / CHILD_CLEARTID bits and ctid_gva
+     * the guest address. Zero for callers (e.g. raw fork(2)) that pass neither.
+     */
+    uint64_t clone_flags;
+    uint64_t ctid_gva;
+    /* Nonzero when the shm fd sent below is an independent fclonefileat clone
+     * (not the parent's live fd). Only then may the child map it MAP_SHARED and
+     * retain it for its own nested CoW fork; the live-fd fallback must stay
+     * MAP_PRIVATE so the child does not share writes with the parent.
+     */
+    uint32_t shm_is_clone;
+    uint32_t _pad2;
 } ipc_header_t;
 
 typedef struct {
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 19de841..a4224e9 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -48,6 +48,18 @@
 #include "debug/log.h"
 #include "debug/syscall-hist.h"
 
+/* Linux clone flags. Shared by the fork-child TID-sync emulation below and
+ * sys_clone further down.
+ */
+#define LINUX_CLONE_VM 0x00000100
+#define LINUX_CLONE_VFORK 0x00004000
+#define LINUX_CLONE_THREAD 0x00010000
+#define LINUX_CLONE_SETTLS 0x00080000
+#define LINUX_CLONE_PARENT_SETTID 0x00100000
+#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
+#define LINUX_CLONE_CHILD_SETTID 0x01000000
+/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
+
 /* fork_child_main. */
 
 static int fork_child_vfork_notify_fd = -1;
@@ -166,7 +178,8 @@ int fork_child_main(int ipc_fd,
         close(ipc_fd);
         return 1;
     }
-    if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits) < 0) {
+    if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits,
+                            hdr.shm_is_clone != 0) < 0) {
         log_error("fork-child: guest_init_from_shm failed");
         close(ipc_fd);
         return 1;
@@ -363,6 +376,30 @@ int fork_child_main(int ipc_fd,
      */
     thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
 
+    /* Emulate CLONE_CHILD_SETTID for the fork child. glibc's fork wrapper
+     * passes CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child's TCB
+     * caches its own TID; without the SETTID write the child keeps the parent's
+     * cached TID and modern glibc trips stack-canary / TLS checks ("stack
+     * smashing detected"). The write goes through guest memory, valid for both
+     * the CoW and region-copy paths. A faulting ctid_gva is the guest's own bad
+     * pointer: warn and continue, matching how the kernel ignores a
+     * child_tidptr fault.
+     *
+     * CLONE_CHILD_CLEARTID is deliberately not honored here. The clear-and-wake
+     * on exit only matters to an in-process joiner waiting on the futex (that
+     * is how the worker-thread exit path serves pthread_join). A fork child is
+     * a separate process with its own address space, so its ctid lives in
+     * memory no other process can observe -- the parent reaps it via
+     * wait4/SIGCHLD, not a cross-process futex. Registering clear_child_tid
+     * would be inert.
+     */
+    if (hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) {
+        int32_t tid32 = (int32_t) hdr.child_pid;
+        if (guest_write_small(&g, hdr.ctid_gva, &tid32, sizeof(tid32)) < 0)
+            log_warn("fork-child: CHILD_SETTID write to 0x%llx failed",
+                     (unsigned long long) hdr.ctid_gva);
+    }
+
     /* Re-publish identity into the child's shim-globals cache: the CoW / region
      * copy inherits the parent's pid/uid values, and the shim's identity fast
      * path would otherwise return the parent's pid to the child. Identity is
@@ -420,16 +457,6 @@ int fork_child_main(int ipc_fd,
 
 /* sys_clone. */
 
-/* Linux clone flags */
-#define LINUX_CLONE_VM 0x00000100
-#define LINUX_CLONE_VFORK 0x00004000
-#define LINUX_CLONE_THREAD 0x00010000
-#define LINUX_CLONE_SETTLS 0x00080000
-#define LINUX_CLONE_PARENT_SETTID 0x00100000
-#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
-#define LINUX_CLONE_CHILD_SETTID 0x01000000
-/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
-
 /* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and
  * sys_clone3 reject them.
  */
@@ -1528,6 +1555,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         .rosetta_entry = g->rosetta_entry,
         .kbuf_gpa = g->kbuf_gpa,
         .ttbr1 = g->ttbr1,
+        .clone_flags =
+            flags & (LINUX_CLONE_CHILD_SETTID | LINUX_CLONE_CHILD_CLEARTID),
+        .ctid_gva = ctid_gva,
+        .shm_is_clone = (snapshot_shm_fd >= 0) ? 1 : 0,
     };
     if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) {
         log_error("clone: failed to send header");
diff --git a/tests/manifest.txt b/tests/manifest.txt
index b8e779f..42391f1 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -95,6 +95,7 @@ test-signal-thread
 
 [section] Fork edge cases
 test-clone3 # diff=skip
+test-clone-childtid
 test-fork-exec $TESTDIR/echo-test
 test-fork-lowbase
 
diff --git a/tests/test-clone-childtid.c b/tests/test-clone-childtid.c
new file mode 100644
index 0000000..bfe3b45
--- /dev/null
+++ b/tests/test-clone-childtid.c
@@ -0,0 +1,107 @@
+/* Test CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID on the fork (posix_spawn) path
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Issue #99: glibc's fork wrapper clones with CLONE_CHILD_SETTID |
+ * CLONE_CHILD_CLEARTID | SIGCHLD. The child's TID must be written into the
+ * ctid address so glibc's TCB caches the right value. This calls clone()
+ * directly with those exact flags (no CLONE_VM/THREAD/VFORK, so elfuse takes
+ * the fork helper-process path) and checks the child observes its own TID at
+ * the ctid slot -- glibc-version-independent, unlike the canary symptom.
+ *
+ * Raw syscall throughout: glibc's own clone wrapper does not expose the ctid
+ * arg, and we want to exercise elfuse's handling rather than libc's.
+ */
+
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#ifndef CLONE_CHILD_CLEARTID
+#define CLONE_CHILD_CLEARTID 0x00200000
+#endif
+#ifndef CLONE_CHILD_SETTID
+#define CLONE_CHILD_SETTID 0x01000000
+#endif
+
+static volatile int child_tid_slot;
+
+int main(void)
+{
+    /* aarch64 clone(2): clone(flags, stack, parent_tid, tls, child_tid). */
+    unsigned long flags = CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD;
+    long rc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
+                      (void *) &child_tid_slot);
+    if (rc < 0) {
+        printf("test-clone-childtid: clone failed -- FAIL\n");
+        return 1;
+    }
+
+    if (rc == 0) {
+        /* Child: the kernel (here, elfuse) must have written our TID into the
+         * ctid slot before we resumed.
+         */
+        pid_t tid = (pid_t) syscall(SYS_gettid);
+        if (child_tid_slot != tid) {
+            /* Cannot printf reliably from a possibly-confused child; encode the
+             * result in the exit status instead.
+             */
+            _exit(child_tid_slot == 0 ? 2 : 3);
+        }
+
+        /* Nested clone: the child forks a grandchild with the same flags. This
+         * exercises the child-side CoW shm retention (issue #99 part 2): the
+         * child must be able to clone its own memory again, and the grandchild
+         * must likewise see a fresh TID at its ctid slot.
+         */
+        static volatile int grand_tid_slot;
+        long grc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
+                           (void *) &grand_tid_slot);
+        if (grc < 0)
+            _exit(4);
+        if (grc == 0) {
+            pid_t gtid = (pid_t) syscall(SYS_gettid);
+            _exit(grand_tid_slot == gtid ? 0 : 5);
+        }
+        int gstatus;
+        if (waitpid((pid_t) grc, &gstatus, 0) < 0)
+            _exit(6);
+        if (!WIFEXITED(gstatus) || WEXITSTATUS(gstatus) != 0)
+            _exit(7);
+        _exit(0);
+    }
+
+    int status;
+    if (waitpid((pid_t) rc, &status, 0) < 0) {
+        printf("test-clone-childtid: waitpid failed -- FAIL\n");
+        return 1;
+    }
+    if (!WIFEXITED(status)) {
+        printf(
+            "test-clone-childtid: child did not exit cleanly (0x%x) -- FAIL\n",
+            status);
+        return 1;
+    }
+    switch (WEXITSTATUS(status)) {
+    case 0:
+        printf("test-clone-childtid: child saw its TID at ctid -- PASS\n");
+        return 0;
+    case 2:
+        printf(
+            "test-clone-childtid: ctid slot still 0 (SETTID ignored) -- "
+            "FAIL\n");
+        return 1;
+    case 3:
+        printf("test-clone-childtid: ctid slot holds wrong TID -- FAIL\n");
+        return 1;
+    default:
+        printf("test-clone-childtid: unexpected child exit %d -- FAIL\n",
+               WEXITSTATUS(status));
+        return 1;
+    }
+}
diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh
index 9b0d8b6..2611d04 100755
--- a/tests/test-matrix.sh
+++ b/tests/test-matrix.sh
@@ -453,6 +453,7 @@ run_unit_tests()
 
     printf "\nProcess tests\n"
     test_check "$runner" "test-fork" "PASS" "$bindir/test-fork"
+    test_check "$runner" "test-clone-childtid" "PASS" "$bindir/test-clone-childtid"
     test_check "$runner" "test-exec" "exec-works" "$bindir/test-exec" "$bindir/echo-test" exec-works
     test_check "$runner" "test-fork-exec" "PASS" "$bindir/test-fork-exec" "$bindir/echo-test"
     test_check "$runner" "test-cloexec" "PASS" "$bindir/test-cloexec"