From c5aa82e909b3a0f44d158fba592908a8701e5fde Mon Sep 17 00:00:00 2001 From: James Toyer Date: Mon, 27 Apr 2026 12:54:31 +0100 Subject: [PATCH 1/2] Place inner container cgroups under the pod's cgroup tree --- cli/docker.go | 58 ++++++++++++++++++++++++++++++++++++++++++++++++--- xunix/sys.go | 12 ++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/cli/docker.go b/cli/docker.go index 1573612..7162fc0 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -256,7 +256,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "starting dockerd", slog.F("args", args)) blog.Info("Waiting for sysbox processes to startup...") - dockerd := background.New(ctx, log, "dockerd", dargs...) + wrapCmd, wrapArgs := wrapDockerdCmd(dargs) + dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -289,7 +290,8 @@ func dockerCmd() *cobra.Command { log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) } - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) //nolint @@ -357,7 +359,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "restarting dockerd", slog.F("args", args)) - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } @@ -881,6 +884,55 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } +// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup` + +// cgroup2 remount + cgroupv2 delegation setup. This creates a new cgroup +// namespace for dockerd and moves existing processes into a sibling `/init` +// cgroup so the root cgroup can enable subtree_control. 
Without this, cgroupv2 +// "no internal processes" rule prevents Docker from creating child cgroups +// with processes under a cgroup that already has processes. +// +// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into +// envbox's mount namespace because we don't use `--mount`. Using `--mount` +// would break sysbox-fs propagation (its per-container mounts under +// /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd NS). +// The side effect of the leak is a non-fatal "Unable to read CPU quota" +// warning when envbox later tries to read /sys/fs/cgroup//cpu.max +// — that path no longer exists under the remounted view. The inner +// container's CPU limits still flow through the parent pod's cgroup hierarchy, +// but cgroup-aware tools inside the workspace may see the host CPU count. +// +// The cgroup delegation logic mirrors moby's `hack/dind` wrapper: +// https://github.com/moby/moby/blob/master/hack/dind +// +// After this setup, inner container cgroups are placed under the envbox +// container's cgroup on the host. Tetragon's cgtracker can then associate +// these child cgroups with the parent pod for attribution. +// +// See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 +func wrapDockerdCmd(dargs []string) (string, []string) { + shellCmd := `set -e +umount /sys/fs/cgroup +mount -t cgroup2 cgroup /sys/fs/cgroup +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + mkdir -p /sys/fs/cgroup/init + while ! { + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control + }; do :; done +fi +exec "$0" "$@" +` + wrapperArgs := []string{ + "--cgroup", + "/bin/sh", + "-c", + shellCmd, + "dockerd", + } + wrapperArgs = append(wrapperArgs, dargs...) + return "unshare", wrapperArgs +} + // TODO This is bad code. 
func filterElements(ss []string, filters ...string) []string { filtered := make([]string, 0, len(ss)) diff --git a/xunix/sys.go b/xunix/sys.go index eef07cd..5dd5fe6 100644 --- a/xunix/sys.go +++ b/xunix/sys.go @@ -63,9 +63,19 @@ func readCPUQuotaCGroupV2(ctx context.Context) (CPUQuota, error) { return CPUQuota{}, xerrors.Errorf("determine own cgroup: %w", err) } + // Try the standard path (self-relative) first. maxStr, err := afero.ReadFile(fs, filepath.Join("/sys/fs/cgroup/", self, "cpu.max")) if err != nil { - return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err) + // Fallback: read cpu.max at the mount root. This handles the case where + // `/sys/fs/cgroup/` has been remounted to be rooted at the current + // cgroup (e.g. after `unshare --cgroup` + `mount -t cgroup2`), so the + // self-relative path no longer exists — but the mount root IS the + // current cgroup and its cpu.max reflects the same limits. + rootMaxStr, rootErr := afero.ReadFile(fs, "/sys/fs/cgroup/cpu.max") + if rootErr != nil { + return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err) + } + maxStr = rootMaxStr } list := strings.Split(string(bytes.TrimSpace(maxStr)), " ") From 29ad3ef5c8d7dc9fbe6cc5449b23d1482c5976f4 Mon Sep 17 00:00:00 2001 From: James Toyer Date: Wed, 6 May 2026 17:20:03 +0100 Subject: [PATCH 2/2] Address PR review feedback --- background/process.go | 18 +++++++ cli/docker.go | 108 ++++++++++++++++++++++++++++++------------ cli/docker_test.go | 59 +++++++++++++++++++---- cli/export_test.go | 8 ++++ xunix/sys.go | 8 ++-- xunix/sys_test.go | 12 +++++ 6 files changed, 171 insertions(+), 42 deletions(-) create mode 100644 cli/export_test.go diff --git a/background/process.go b/background/process.go index dc4ddaa..6d8bd37 100644 --- a/background/process.go +++ b/background/process.go @@ -56,6 +56,24 @@ func (d *Process) Start() error { return d.startProcess() } +// SetBinName overrides the binary name used to detect whether the running 
+// process has exited (via the contents of /proc/<pid>/cmdline). By default +// the binary name is the `cmd` argument passed to New or Restart. When the +// command is a wrapper that exec's into a different binary (e.g. +// `unshare ... -- /bin/sh -c 'exec dockerd ...'`), the post-exec cmdline +// reflects the final binary rather than the wrapper, so the default +// binary name will not match. Call SetBinName with the post-exec binary +// name to keep exit detection accurate. +// +// SetBinName must be called again after each Restart since Restart resets +// the binary name to its `cmd` argument. +func (d *Process) SetBinName(name string) { + d.mu.Lock() + defer d.mu.Unlock() + + d.binName = name +} + // Wait waits for the process to exit, returning the error on the provided // channel. func (d *Process) Wait() <-chan error { diff --git a/cli/docker.go b/cli/docker.go index 7162fc0..2e0f0a0 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -258,6 +258,10 @@ func dockerCmd() *cobra.Command { blog.Info("Waiting for sysbox processes to startup...") wrapCmd, wrapArgs := wrapDockerdCmd(dargs) dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) + // The unshare wrapper exec's into dockerd, so /proc/<pid>/cmdline + // reflects "dockerd" not "unshare". Track exits against the + // post-exec name to keep Restart/kill detection accurate. + dockerd.SetBinName(dockerdBinName) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -297,6 +301,9 @@ func dockerCmd() *cobra.Command { //nolint log.Fatal(ctx, "restart dockerd", slog.Error(err)) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. + dockerd.SetBinName(dockerdBinName) err = <-dockerd.Wait() } @@ -364,6 +371,9 @@ func dockerCmd() *cobra.Command { if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. 
+ dockerd.SetBinName(dockerdBinName) go func() { err = <-dockerd.Wait() blog.Errorf("restarted dockerd exited: %v", err) @@ -884,50 +894,88 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } -// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup` + -// cgroup2 remount + cgroupv2 delegation setup. This creates a new cgroup -// namespace for dockerd and moves existing processes into a sibling `/init` -// cgroup so the root cgroup can enable subtree_control. Without this, cgroupv2 -// "no internal processes" rule prevents Docker from creating child cgroups -// with processes under a cgroup that already has processes. +// dockerdBinName is the binary name reported by /proc//cmdline once the +// `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has +// completed. background.Process tracking must use this name (not the literal +// "unshare" command we invoke) so that exit detection compares against the +// running cmdline correctly. +const dockerdBinName = "dockerd" + +// dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop +// so a stuck race can't block envbox startup indefinitely. +const dockerdSubtreeControlMaxAttempts = 100 + +// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a +// cgroup2 remount and cgroupv2 delegation setup, so that inner container +// cgroups become descendants of the envbox container's own cgroup on the +// host. Without this the inner Docker daemon places container cgroups at +// /sys/fs/cgroup/docker/, which is a sibling of the pod's cgroup tree +// rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco, +// custom eBPF agents) then cannot attribute processes inside a workspace +// container back to the pod that's running them. // -// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into -// envbox's mount namespace because we don't use `--mount`. 
Using `--mount` -// would break sysbox-fs propagation (its per-container mounts under -// /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd NS). -// The side effect of the leak is a non-fatal "Unable to read CPU quota" -// warning when envbox later tries to read /sys/fs/cgroup//cpu.max -// — that path no longer exists under the remounted view. The inner -// container's CPU limits still flow through the parent pod's cgroup hierarchy, -// but cgroup-aware tools inside the workspace may see the host CPU count. +// On cgroupv2 the delegation block (move all current processes into +// /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is +// required: cgroupv2's "no internal processes" rule otherwise prevents +// dockerd from creating child cgroups with processes under a parent that +// already has its own processes. The block is gated on +// /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the +// wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`. // -// The cgroup delegation logic mirrors moby's `hack/dind` wrapper: -// https://github.com/moby/moby/blob/master/hack/dind +// The delegation logic mirrors moby's hack/dind wrapper (lines 61-79): +// https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79 // -// After this setup, inner container cgroups are placed under the envbox -// container's cgroup on the host. Tetragon's cgtracker can then associate -// these child cgroups with the parent pod for attribution. +// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into +// envbox's mount namespace because we don't use `--mount` on unshare. Using +// `--mount` would break sysbox-fs propagation (its per-container mounts +// under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd +// namespace). 
The observable side effect of the leak is that envbox's later +// cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2 +// has a fallback that handles this. // // See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 func wrapDockerdCmd(dargs []string) (string, []string) { - shellCmd := `set -e -umount /sys/fs/cgroup -mount -t cgroup2 cgroup /sys/fs/cgroup + shellCmd := fmt.Sprintf(`set -e +# cgroup v2: enable nesting if [ -f /sys/fs/cgroup/cgroup.controllers ]; then - mkdir -p /sys/fs/cgroup/init - while ! { - xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : - sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control - }; do :; done + # Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the + # fs root. Inner container cgroups will then be created under the envbox + # container's cgroup on the host. + umount /sys/fs/cgroup + mount -t cgroup2 cgroup /sys/fs/cgroup + + # Create the /init leaf group that the root group's processes will be + # moved into; cgroupv2 refuses to enable subtree_control on a cgroup + # that still has member processes of its own. + mkdir -p /sys/fs/cgroup/init + # this happens in a loop because things like "docker exec" on our dind + # container will create new processes, which creates a race between our + # moving everything to "init" and enabling subtree_control + envbox_attempts=0 + while ! { + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. 
+ xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do + envbox_attempts=$((envbox_attempts + 1)) + if [ "$envbox_attempts" -ge %d ]; then + echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2 + exit 1 + fi + done fi exec "$0" "$@" -` +`, dockerdSubtreeControlMaxAttempts) wrapperArgs := []string{ "--cgroup", "/bin/sh", "-c", shellCmd, - "dockerd", + dockerdBinName, } wrapperArgs = append(wrapperArgs, dargs...) return "unshare", wrapperArgs diff --git a/cli/docker_test.go b/cli/docker_test.go index 64ca338..3b07f82 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -178,18 +178,23 @@ func TestDocker(t *testing.T) { fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR), ) + dockerdArgs := []string{ + "--debug", + "--log-level=debug", + fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), + "--userns-remap=coder", + "--storage-driver=overlay2", + fmt.Sprintf("--bip=%s", bridgeCIDR), + } + // dockerd is launched via an unshare wrapper that exec's into + // dockerd with these args; assert against the full wrapped argv. + wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs) + expectedArgv := append([]string{wrapCmd}, wrapArgs...) + execer := clitest.Execer(ctx) execer.AddCommands(&xunixfake.FakeCmd{ FakeCmd: &testingexec.FakeCmd{ - Argv: []string{ - "dockerd", - "--debug", - "--log-level=debug", - fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), - "--userns-remap=coder", - "--storage-driver=overlay2", - fmt.Sprintf("--bip=%s", bridgeCIDR), - }, + Argv: expectedArgv, }, }) @@ -741,6 +746,42 @@ func TestDocker(t *testing.T) { }) } +func TestWrapDockerdCmd(t *testing.T) { + t.Parallel() + + dargs := []string{"--debug", "--mtu=1500"} + cmd, args := cli.WrapDockerdCmd(dargs) + + // The wrapper invokes `unshare`, exec'ing through `/bin/sh -c