From b8e578c2a2dd3deae512ca6d6bcde1be5df31e7a Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 17 Jun 2026 22:54:50 +0800 Subject: [PATCH] Sync child TID and retain CoW across nested fork glibc's fork wrapper clones with CLONE_CHILD_{SETTID | CLEARTID} | SIGCHLD, but the posix_spawn fork path could not see the original clone arguments, so the child never wrote its new TID into the guest ctid address. The child kept the parent's cached TID and modern glibc tripped its stack-canary / TLS checks ("stack smashing detected"), which surfaced on nested forks. Forward the relevant clone flags and the ctid address through ipc_header_t. In fork_child_main, after the main thread is registered, honor CLONE_CHILD_SETTID by writing the child TID into ctid_gva. A faulting address is the guest's own bad pointer, so warn and continue, matching how the kernel ignores a child_tidptr fault. CLONE_CHILD_CLEARTID is intentionally not honored: a fork child is a separate process whose ctid no other process can observe, and the parent reaps it via wait4/SIGCHLD rather than a cross-process futex. A fork child also closed its inherited shm fd and mapped it MAP_PRIVATE, so any nested grandchild fork dropped off the copy-on-write fast path into the slow region-copy path. When the inherited fd is an independent fclonefileat clone (the new shm_is_clone header flag), map it MAP_SHARED and retain it in g->shm_fd so the child can clone it again for its own nested fork; guest_destroy closes it. The live-fd fallback keeps the MAP_PRIVATE behavior so the child does not share writes with the parent. guest_init_from_shm gains a retain_shared parameter and closes the inherited fd on every error path so the ownership contract holds. 
Close #99 --- src/core/guest.c | 35 +++++++++--- src/core/guest.h | 16 ++++-- src/runtime/fork-state.h | 16 ++++++ src/runtime/forkipc.c | 53 ++++++++++++++---- tests/manifest.txt | 1 + tests/test-clone-childtid.c | 107 ++++++++++++++++++++++++++++++++++++ tests/test-matrix.sh | 1 + 7 files changed, 205 insertions(+), 24 deletions(-) create mode 100644 tests/test-clone-childtid.c diff --git a/src/core/guest.c b/src/core/guest.c index 01422b8..d4e455e 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -441,12 +441,13 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) int guest_init_from_shm(guest_t *g, int shm_fd, uint64_t size, - uint32_t ipa_bits) + uint32_t ipa_bits, + bool retain_shared) { uint64_t t0; memset(g, 0, sizeof(*g)); - g->shm_fd = -1; /* Child does not own the shm */ + g->shm_fd = -1; /* Child does not own the shm unless retain_shared */ g->ipa_base = GUEST_IPA_BASE; g->elf_load_min = ELF_DEFAULT_BASE; g->brk_base = BRK_BASE_DEFAULT; @@ -471,13 +472,21 @@ int guest_init_from_shm(guest_t *g, } g->pt_pool_next = g->pt_pool_base; - /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see the - * parent's frozen snapshot; writes are private to this process. macOS CoW - * is page-granular: only modified pages are duplicated. + /* Two mapping modes: + * retain_shared: shm_fd is an independent APFS clone of the parent's + * memory (already isolated from the parent). Map MAP_SHARED so the + * child's writes land in the clone file, then keep the fd so the child + * can fclonefileat it for its own nested CoW fork. guest_destroy closes + * it. + * otherwise: shm_fd may be the parent's live fd (clonefile fallback). Map + * MAP_PRIVATE so writes stay private to this process, then close the + * fd. macOS CoW is page-granular either way: only modified pages are + * duplicated. */ + int map_flags = retain_shared ? 
MAP_SHARED : MAP_PRIVATE; t0 = startup_trace_now_ns(); g->host_base = - mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0); + mmap(NULL, size, PROT_READ | PROT_WRITE, map_flags, shm_fd, 0); startup_trace_step("shm_mmap", t0); if (g->host_base == MAP_FAILED) { perror("guest: mmap shm"); @@ -486,8 +495,10 @@ int guest_init_from_shm(guest_t *g, return -1; } - /* Close the shm fd; the mapping keeps the pages alive */ - close(shm_fd); + if (retain_shared) + g->shm_fd = shm_fd; /* Child owns the clone; guest_destroy closes it */ + else + close(shm_fd); /* MAP_PRIVATE mapping keeps the pages alive */ /* Create HVF VM with the same IPA width as the parent */ hv_return_t ret = HV_ERROR; @@ -506,6 +517,10 @@ int guest_init_from_shm(guest_t *g, log_error("guest: hv_vm_create (shm) failed: %d", (int) ret); munmap(g->host_base, size); g->host_base = NULL; + if (g->shm_fd >= 0) { + close(g->shm_fd); + g->shm_fd = -1; + } return -1; } @@ -518,6 +533,10 @@ int guest_init_from_shm(guest_t *g, hv_vm_destroy(); munmap(g->host_base, size); g->host_base = NULL; + if (g->shm_fd >= 0) { + close(g->shm_fd); + g->shm_fd = -1; + } return -1; } diff --git a/src/core/guest.h b/src/core/guest.h index 133088e..58ca8b1 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -774,16 +774,22 @@ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr) */ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits); -/* Initialize guest from a POSIX shared memory fd (CoW fork path). Maps shm_fd - * MAP_PRIVATE (copy-on-write), creates HVF VM, maps to hypervisor. The child - * gets an instant CoW snapshot of parent's guest memory without copying. shm_fd - * is closed after mapping. +/* Initialize guest from a shared memory fd (CoW fork path). Creates the HVF VM + * and maps the fd to the hypervisor. The child gets an instant CoW snapshot of + * the parent's guest memory without copying. 
+ * + * retain_shared selects the mapping: when true, shm_fd is an independent APFS + * clone, so it is mapped MAP_SHARED and retained in g->shm_fd (guest_destroy + * closes it) so the child can fclonefileat it for nested CoW fork. When false, + * shm_fd may be the parent's live fd, so it is mapped MAP_PRIVATE and closed + * after mapping. This function takes ownership of shm_fd on every path. * Returns 0 on success, -1 on failure. */ int guest_init_from_shm(guest_t *g, int shm_fd, uint64_t size, - uint32_t ipa_bits); + uint32_t ipa_bits, + bool retain_shared); /* Tear down VM and free guest memory. */ void guest_destroy(guest_t *g); diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index 18f3443..84e73a5 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -60,6 +60,22 @@ typedef struct { uint64_t rosetta_entry; uint64_t kbuf_gpa; uint64_t ttbr1; + /* Clone TID-sync state for the fork path. glibc's fork wrapper passes + * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child writes its new TID + * into the TCB and clears it on exit. The posix_spawn child has no access + * to the original clone() arguments, so the parent forwards them here: + * clone_flags carries the CHILD_SETTID / CHILD_CLEARTID bits and ctid_gva + * the guest address. Zero for callers (e.g. raw fork(2)) that pass neither. + */ + uint64_t clone_flags; + uint64_t ctid_gva; + /* Nonzero when the shm fd sent below is an independent fclonefileat clone + * (not the parent's live fd). Only then may the child map it MAP_SHARED and + * retain it for its own nested CoW fork; the live-fd fallback must stay + * MAP_PRIVATE so the child does not share writes with the parent. 
+ */ + uint32_t shm_is_clone; + uint32_t _pad2; } ipc_header_t; typedef struct { diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 19de841..a4224e9 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -48,6 +48,18 @@ #include "debug/log.h" #include "debug/syscall-hist.h" +/* Linux clone flags. Shared by the fork-child TID-sync emulation below and + * sys_clone further down. + */ +#define LINUX_CLONE_VM 0x00000100 +#define LINUX_CLONE_VFORK 0x00004000 +#define LINUX_CLONE_THREAD 0x00010000 +#define LINUX_CLONE_SETTLS 0x00080000 +#define LINUX_CLONE_PARENT_SETTID 0x00100000 +#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 +#define LINUX_CLONE_CHILD_SETTID 0x01000000 +/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ + /* fork_child_main. */ static int fork_child_vfork_notify_fd = -1; @@ -166,7 +178,8 @@ int fork_child_main(int ipc_fd, close(ipc_fd); return 1; } - if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits) < 0) { + if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits, + hdr.shm_is_clone != 0) < 0) { log_error("fork-child: guest_init_from_shm failed"); close(ipc_fd); return 1; @@ -363,6 +376,30 @@ int fork_child_main(int ipc_fd, */ thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1); + /* Emulate CLONE_CHILD_SETTID for the fork child. glibc's fork wrapper + * passes CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child's TCB + * caches its own TID; without the SETTID write the child keeps the parent's + * cached TID and modern glibc trips stack-canary / TLS checks ("stack + * smashing detected"). The write goes through guest memory, valid for both + * the CoW and region-copy paths. A faulting ctid_gva is the guest's own bad + * pointer: warn and continue, matching how the kernel ignores a + * child_tidptr fault. + * + * CLONE_CHILD_CLEARTID is deliberately not honored here. 
The clear-and-wake + * on exit only matters to an in-process joiner waiting on the futex (that + * is how the worker-thread exit path serves pthread_join). A fork child is + * a separate process with its own address space, so its ctid lives in + * memory no other process can observe -- the parent reaps it via + * wait4/SIGCHLD, not a cross-process futex. Registering clear_child_tid + * would be inert. + */ + if (hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) { + int32_t tid32 = (int32_t) hdr.child_pid; + if (guest_write_small(&g, hdr.ctid_gva, &tid32, sizeof(tid32)) < 0) + log_warn("fork-child: CHILD_SETTID write to 0x%llx failed", + (unsigned long long) hdr.ctid_gva); + } + /* Re-publish identity into the child's shim-globals cache: the CoW / region * copy inherits the parent's pid/uid values, and the shim's identity fast * path would otherwise return the parent's pid to the child. Identity is @@ -420,16 +457,6 @@ int fork_child_main(int ipc_fd, /* sys_clone. */ -/* Linux clone flags */ -#define LINUX_CLONE_VM 0x00000100 -#define LINUX_CLONE_VFORK 0x00004000 -#define LINUX_CLONE_THREAD 0x00010000 -#define LINUX_CLONE_SETTLS 0x00080000 -#define LINUX_CLONE_PARENT_SETTID 0x00100000 -#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 -#define LINUX_CLONE_CHILD_SETTID 0x01000000 -/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ - /* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and * sys_clone3 reject them. */ @@ -1528,6 +1555,10 @@ int64_t sys_clone(hv_vcpu_t vcpu, .rosetta_entry = g->rosetta_entry, .kbuf_gpa = g->kbuf_gpa, .ttbr1 = g->ttbr1, + .clone_flags = + flags & (LINUX_CLONE_CHILD_SETTID | LINUX_CLONE_CHILD_CLEARTID), + .ctid_gva = ctid_gva, + .shm_is_clone = (snapshot_shm_fd >= 0) ? 
1 : 0, }; if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) { log_error("clone: failed to send header"); diff --git a/tests/manifest.txt b/tests/manifest.txt index b8e779f..42391f1 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -95,6 +95,7 @@ test-signal-thread [section] Fork edge cases test-clone3 # diff=skip +test-clone-childtid test-fork-exec $TESTDIR/echo-test test-fork-lowbase diff --git a/tests/test-clone-childtid.c b/tests/test-clone-childtid.c new file mode 100644 index 0000000..bfe3b45 --- /dev/null +++ b/tests/test-clone-childtid.c @@ -0,0 +1,107 @@ +/* Test CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID on the fork (posix_spawn) path + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Issue #99: glibc's fork wrapper clones with CLONE_CHILD_SETTID | + * CLONE_CHILD_CLEARTID | SIGCHLD. The child's TID must be written into the + * ctid address so glibc's TCB caches the right value. This calls clone() + * directly with those exact flags (no CLONE_VM/THREAD/VFORK, so elfuse takes + * the fork helper-process path) and checks the child observes its own TID at + * the ctid slot -- glibc-version-independent, unlike the canary symptom. + * + * Raw syscall throughout: glibc's clone() wrapper needs a child fn and a stack, + * so it cannot make a fork-style clone; we exercise elfuse's path, not libc's. + */ + +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef CLONE_CHILD_CLEARTID +#define CLONE_CHILD_CLEARTID 0x00200000 +#endif +#ifndef CLONE_CHILD_SETTID +#define CLONE_CHILD_SETTID 0x01000000 +#endif + +static volatile int child_tid_slot; + +int main(void) +{ + /* aarch64 clone(2): clone(flags, stack, parent_tid, tls, child_tid).
*/ + unsigned long flags = CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD; + long rc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0, + (void *) &child_tid_slot); + if (rc < 0) { + printf("test-clone-childtid: clone failed -- FAIL\n"); + return 1; + } + + if (rc == 0) { + /* Child: the kernel (here, elfuse) must have written our TID into the + * ctid slot before we resumed. + */ + pid_t tid = (pid_t) syscall(SYS_gettid); + if (child_tid_slot != tid) { + /* Cannot printf reliably from a possibly-confused child; encode the + * result in the exit status instead. + */ + _exit(child_tid_slot == 0 ? 2 : 3); + } + + /* Nested clone: the child forks a grandchild with the same flags. This + * exercises the child-side CoW shm retention (issue #99 part 2): the + * child must be able to clone its own memory again, and the grandchild + * must likewise see a fresh TID at its ctid slot. + */ + static volatile int grand_tid_slot; + long grc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0, + (void *) &grand_tid_slot); + if (grc < 0) + _exit(4); + if (grc == 0) { + pid_t gtid = (pid_t) syscall(SYS_gettid); + _exit(grand_tid_slot == gtid ? 
0 : 5); + } + int gstatus; + if (waitpid((pid_t) grc, &gstatus, 0) < 0) + _exit(6); + if (!WIFEXITED(gstatus) || WEXITSTATUS(gstatus) != 0) + _exit(7); + _exit(0); + } + + int status; + if (waitpid((pid_t) rc, &status, 0) < 0) { + printf("test-clone-childtid: waitpid failed -- FAIL\n"); + return 1; + } + if (!WIFEXITED(status)) { + printf( + "test-clone-childtid: child did not exit cleanly (0x%x) -- FAIL\n", + status); + return 1; + } + switch (WEXITSTATUS(status)) { + case 0: + printf("test-clone-childtid: child saw its TID at ctid -- PASS\n"); + return 0; + case 2: + printf( + "test-clone-childtid: ctid slot still 0 (SETTID ignored) -- " + "FAIL\n"); + return 1; + case 3: + printf("test-clone-childtid: ctid slot holds wrong TID -- FAIL\n"); + return 1; + default: + printf("test-clone-childtid: unexpected child exit %d -- FAIL\n", + WEXITSTATUS(status)); + return 1; + } +} diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index 9b0d8b6..2611d04 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -453,6 +453,7 @@ run_unit_tests() printf "\nProcess tests\n" test_check "$runner" "test-fork" "PASS" "$bindir/test-fork" + test_check "$runner" "test-clone-childtid" "PASS" "$bindir/test-clone-childtid" test_check "$runner" "test-exec" "exec-works" "$bindir/test-exec" "$bindir/echo-test" exec-works test_check "$runner" "test-fork-exec" "PASS" "$bindir/test-fork-exec" "$bindir/echo-test" test_check "$runner" "test-cloexec" "PASS" "$bindir/test-cloexec"