diff --git a/background/process.go b/background/process.go index dc4ddaa..6d8bd37 100644 --- a/background/process.go +++ b/background/process.go @@ -56,6 +56,24 @@ func (d *Process) Start() error { return d.startProcess() } +// SetBinName overrides the binary name used to detect whether the running +// process has exited (via the contents of /proc//cmdline). By default +// the binary name is the `cmd` argument passed to New or Restart. When the +// command is a wrapper that exec's into a different binary (e.g. +// `unshare ... -- /bin/sh -c 'exec dockerd ...'`), the post-exec cmdline +// reflects the final binary rather than the wrapper, so the default +// binary name will not match. Call SetBinName with the post-exec binary +// name to keep exit detection accurate. +// +// SetBinName must be called again after each Restart since Restart resets +// the binary name to its `cmd` argument. +func (d *Process) SetBinName(name string) { + d.mu.Lock() + defer d.mu.Unlock() + + d.binName = name +} + // Wait waits for the process to exit, returning the error on the provided // channel. func (d *Process) Wait() <-chan error { diff --git a/cli/docker.go b/cli/docker.go index 1573612..2e0f0a0 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -256,7 +256,12 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "starting dockerd", slog.F("args", args)) blog.Info("Waiting for sysbox processes to startup...") - dockerd := background.New(ctx, log, "dockerd", dargs...) + wrapCmd, wrapArgs := wrapDockerdCmd(dargs) + dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) + // The unshare wrapper exec's into dockerd, so /proc//cmdline + // reflects "dockerd" not "unshare". Track exits against the + // post-exec name to keep Restart/kill detection accurate. + dockerd.SetBinName(dockerdBinName) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -289,12 +294,16 @@ func dockerCmd() *cobra.Command { log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) } - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) //nolint log.Fatal(ctx, "restart dockerd", slog.Error(err)) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. + dockerd.SetBinName(dockerdBinName) err = <-dockerd.Wait() } @@ -357,10 +366,14 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "restarting dockerd", slog.F("args", args)) - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. + dockerd.SetBinName(dockerdBinName) go func() { err = <-dockerd.Wait() blog.Errorf("restarted dockerd exited: %v", err) @@ -881,6 +894,93 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } +// dockerdBinName is the binary name reported by /proc//cmdline once the +// `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has +// completed. background.Process tracking must use this name (not the literal +// "unshare" command we invoke) so that exit detection compares against the +// running cmdline correctly. +const dockerdBinName = "dockerd" + +// dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop +// so a stuck race can't block envbox startup indefinitely. +const dockerdSubtreeControlMaxAttempts = 100 + +// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a +// cgroup2 remount and cgroupv2 delegation setup, so that inner container +// cgroups become descendants of the envbox container's own cgroup on the +// host. Without this the inner Docker daemon places container cgroups at +// /sys/fs/cgroup/docker/, which is a sibling of the pod's cgroup tree +// rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco, +// custom eBPF agents) then cannot attribute processes inside a workspace +// container back to the pod that's running them. +// +// On cgroupv2 the delegation block (move all current processes into +// /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is +// required: cgroupv2's "no internal processes" rule otherwise prevents +// dockerd from creating child cgroups with processes under a parent that +// already has its own processes. The block is gated on +// /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the +// wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`. +// +// The delegation logic mirrors moby's hack/dind wrapper (lines 61-79): +// https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79 +// +// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into +// envbox's mount namespace because we don't use `--mount` on unshare. Using +// `--mount` would break sysbox-fs propagation (its per-container mounts +// under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd +// namespace). The observable side effect of the leak is that envbox's later +// cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2 +// has a fallback that handles this. +// +// See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 +func wrapDockerdCmd(dargs []string) (string, []string) { + shellCmd := fmt.Sprintf(`set -e +# cgroup v2: enable nesting +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the + # fs root. Inner container cgroups will then be created under the envbox + # container's cgroup on the host. + umount /sys/fs/cgroup + mount -t cgroup2 cgroup /sys/fs/cgroup + + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + mkdir -p /sys/fs/cgroup/init + # this happens in a loop because things like "docker exec" on our dind + # container will create new processes, which creates a race between our + # moving everything to "init" and enabling subtree_control + envbox_attempts=0 + while ! { + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do + envbox_attempts=$((envbox_attempts + 1)) + if [ "$envbox_attempts" -ge %d ]; then + echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2 + exit 1 + fi + done +fi +exec "$0" "$@" +`, dockerdSubtreeControlMaxAttempts) + wrapperArgs := []string{ + "--cgroup", + "/bin/sh", + "-c", + shellCmd, + dockerdBinName, + } + wrapperArgs = append(wrapperArgs, dargs...) + return "unshare", wrapperArgs +} + // TODO This is bad code. func filterElements(ss []string, filters ...string) []string { filtered := make([]string, 0, len(ss)) diff --git a/cli/docker_test.go b/cli/docker_test.go index 64ca338..3b07f82 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -178,18 +178,23 @@ func TestDocker(t *testing.T) { fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR), ) + dockerdArgs := []string{ + "--debug", + "--log-level=debug", + fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), + "--userns-remap=coder", + "--storage-driver=overlay2", + fmt.Sprintf("--bip=%s", bridgeCIDR), + } + // dockerd is launched via an unshare wrapper that exec's into + // dockerd with these args; assert against the full wrapped argv. + wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs) + expectedArgv := append([]string{wrapCmd}, wrapArgs...) + execer := clitest.Execer(ctx) execer.AddCommands(&xunixfake.FakeCmd{ FakeCmd: &testingexec.FakeCmd{ - Argv: []string{ - "dockerd", - "--debug", - "--log-level=debug", - fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), - "--userns-remap=coder", - "--storage-driver=overlay2", - fmt.Sprintf("--bip=%s", bridgeCIDR), - }, + Argv: expectedArgv, }, }) @@ -741,6 +746,42 @@ func TestDocker(t *testing.T) { }) } +func TestWrapDockerdCmd(t *testing.T) { + t.Parallel() + + dargs := []string{"--debug", "--mtu=1500"} + cmd, args := cli.WrapDockerdCmd(dargs) + + // The wrapper invokes `unshare`, exec'ing through `/bin/sh -c