From c5aa82e909b3a0f44d158fba592908a8701e5fde Mon Sep 17 00:00:00 2001 From: James Toyer Date: Mon, 27 Apr 2026 12:54:31 +0100 Subject: [PATCH 1/2] Place inner container cgroups under the pod's cgroup tree --- cli/docker.go | 58 ++++++++++++++++++++++++++++++++++++++++++++++++--- xunix/sys.go | 12 ++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/cli/docker.go b/cli/docker.go index 1573612..7162fc0 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -256,7 +256,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "starting dockerd", slog.F("args", args)) blog.Info("Waiting for sysbox processes to startup...") - dockerd := background.New(ctx, log, "dockerd", dargs...) + wrapCmd, wrapArgs := wrapDockerdCmd(dargs) + dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -289,7 +290,8 @@ func dockerCmd() *cobra.Command { log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) } - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) //nolint @@ -357,7 +359,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "restarting dockerd", slog.F("args", args)) - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } @@ -881,6 +884,55 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } +// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup` + +// cgroup2 remount + cgroupv2 delegation setup. This creates a new cgroup +// namespace for dockerd and moves existing processes into a sibling `/init` +// cgroup so the root cgroup can enable subtree_control. 
Without this, cgroupv2 +// "no internal processes" rule prevents Docker from creating child cgroups +// with processes under a cgroup that already has processes. +// +// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into +// envbox's mount namespace because we don't use `--mount`. Using `--mount` +// would break sysbox-fs propagation (its per-container mounts under +// /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd NS). +// The side effect of the leak is a non-fatal "Unable to read CPU quota" +// warning when envbox later tries to read /sys/fs/cgroup//cpu.max +// — that path no longer exists under the remounted view. The inner +// container's CPU limits still flow through the parent pod's cgroup hierarchy, +// but cgroup-aware tools inside the workspace may see the host CPU count. +// +// The cgroup delegation logic mirrors moby's `hack/dind` wrapper: +// https://github.com/moby/moby/blob/master/hack/dind +// +// After this setup, inner container cgroups are placed under the envbox +// container's cgroup on the host. Tetragon's cgtracker can then associate +// these child cgroups with the parent pod for attribution. +// +// See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 +func wrapDockerdCmd(dargs []string) (string, []string) { + shellCmd := `set -e +umount /sys/fs/cgroup +mount -t cgroup2 cgroup /sys/fs/cgroup +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + mkdir -p /sys/fs/cgroup/init + while ! { + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control + }; do :; done +fi +exec "$0" "$@" +` + wrapperArgs := []string{ + "--cgroup", + "/bin/sh", + "-c", + shellCmd, + "dockerd", + } + wrapperArgs = append(wrapperArgs, dargs...) + return "unshare", wrapperArgs +} + // TODO This is bad code. 
func filterElements(ss []string, filters ...string) []string { filtered := make([]string, 0, len(ss)) diff --git a/xunix/sys.go b/xunix/sys.go index eef07cd..5dd5fe6 100644 --- a/xunix/sys.go +++ b/xunix/sys.go @@ -63,9 +63,19 @@ func readCPUQuotaCGroupV2(ctx context.Context) (CPUQuota, error) { return CPUQuota{}, xerrors.Errorf("determine own cgroup: %w", err) } + // Try the standard path (self-relative) first. maxStr, err := afero.ReadFile(fs, filepath.Join("/sys/fs/cgroup/", self, "cpu.max")) if err != nil { - return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err) + // Fallback: read cpu.max at the mount root. This handles the case where + // `/sys/fs/cgroup/` has been remounted to be rooted at the current + // cgroup (e.g. after `unshare --cgroup` + `mount -t cgroup2`), so the + // self-relative path no longer exists — but the mount root IS the + // current cgroup and its cpu.max reflects the same limits. + rootMaxStr, rootErr := afero.ReadFile(fs, "/sys/fs/cgroup/cpu.max") + if rootErr != nil { + return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err) + } + maxStr = rootMaxStr } list := strings.Split(string(bytes.TrimSpace(maxStr)), " ") From 29ad3ef5c8d7dc9fbe6cc5449b23d1482c5976f4 Mon Sep 17 00:00:00 2001 From: James Toyer Date: Wed, 6 May 2026 17:20:03 +0100 Subject: [PATCH 2/2] Address PR review feedback --- background/process.go | 18 +++++++ cli/docker.go | 108 ++++++++++++++++++++++++++++++------------ cli/docker_test.go | 59 +++++++++++++++++++---- cli/export_test.go | 8 ++++ xunix/sys.go | 8 ++-- xunix/sys_test.go | 12 +++++ 6 files changed, 171 insertions(+), 42 deletions(-) create mode 100644 cli/export_test.go diff --git a/background/process.go b/background/process.go index dc4ddaa..6d8bd37 100644 --- a/background/process.go +++ b/background/process.go @@ -56,6 +56,24 @@ func (d *Process) Start() error { return d.startProcess() } +// SetBinName overrides the binary name used to detect whether the running 
+// process has exited (via the contents of /proc/<pid>/cmdline). By default +// the binary name is the `cmd` argument passed to New or Restart. When the +// command is a wrapper that exec's into a different binary (e.g. +// `unshare ... -- /bin/sh -c 'exec dockerd ...'`), the post-exec cmdline +// reflects the final binary rather than the wrapper, so the default +// binary name will not match. Call SetBinName with the post-exec binary +// name to keep exit detection accurate. +// +// SetBinName must be called again after each Restart since Restart resets +// the binary name to its `cmd` argument. +func (d *Process) SetBinName(name string) { + d.mu.Lock() + defer d.mu.Unlock() + + d.binName = name +} + // Wait waits for the process to exit, returning the error on the provided // channel. func (d *Process) Wait() <-chan error { diff --git a/cli/docker.go b/cli/docker.go index 7162fc0..2e0f0a0 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -258,6 +258,10 @@ func dockerCmd() *cobra.Command { blog.Info("Waiting for sysbox processes to startup...") wrapCmd, wrapArgs := wrapDockerdCmd(dargs) dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) + // The unshare wrapper exec's into dockerd, so /proc/<pid>/cmdline + // reflects "dockerd" not "unshare". Track exits against the + // post-exec name to keep Restart/kill detection accurate. + dockerd.SetBinName(dockerdBinName) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -297,6 +301,9 @@ func dockerCmd() *cobra.Command { //nolint log.Fatal(ctx, "restart dockerd", slog.Error(err)) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. + dockerd.SetBinName(dockerdBinName) err = <-dockerd.Wait() } @@ -364,6 +371,9 @@ func dockerCmd() *cobra.Command { if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } + // Restart resets the tracked binary name to its `cmd` + // argument; re-set it for the wrapped invocation. 
+ dockerd.SetBinName(dockerdBinName) go func() { err = <-dockerd.Wait() blog.Errorf("restarted dockerd exited: %v", err) @@ -884,50 +894,88 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } -// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup` + -// cgroup2 remount + cgroupv2 delegation setup. This creates a new cgroup -// namespace for dockerd and moves existing processes into a sibling `/init` -// cgroup so the root cgroup can enable subtree_control. Without this, cgroupv2 -// "no internal processes" rule prevents Docker from creating child cgroups -// with processes under a cgroup that already has processes. +// dockerdBinName is the binary name reported by /proc//cmdline once the +// `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has +// completed. background.Process tracking must use this name (not the literal +// "unshare" command we invoke) so that exit detection compares against the +// running cmdline correctly. +const dockerdBinName = "dockerd" + +// dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop +// so a stuck race can't block envbox startup indefinitely. +const dockerdSubtreeControlMaxAttempts = 100 + +// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a +// cgroup2 remount and cgroupv2 delegation setup, so that inner container +// cgroups become descendants of the envbox container's own cgroup on the +// host. Without this the inner Docker daemon places container cgroups at +// /sys/fs/cgroup/docker/, which is a sibling of the pod's cgroup tree +// rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco, +// custom eBPF agents) then cannot attribute processes inside a workspace +// container back to the pod that's running them. // -// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into -// envbox's mount namespace because we don't use `--mount`. 
Using `--mount` -// would break sysbox-fs propagation (its per-container mounts under -// /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd NS). -// The side effect of the leak is a non-fatal "Unable to read CPU quota" -// warning when envbox later tries to read /sys/fs/cgroup//cpu.max -// — that path no longer exists under the remounted view. The inner -// container's CPU limits still flow through the parent pod's cgroup hierarchy, -// but cgroup-aware tools inside the workspace may see the host CPU count. +// On cgroupv2 the delegation block (move all current processes into +// /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is +// required: cgroupv2's "no internal processes" rule otherwise prevents +// dockerd from creating child cgroups with processes under a parent that +// already has its own processes. The block is gated on +// /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the +// wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`. // -// The cgroup delegation logic mirrors moby's `hack/dind` wrapper: -// https://github.com/moby/moby/blob/master/hack/dind +// The delegation logic mirrors moby's hack/dind wrapper (lines 61-79): +// https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79 // -// After this setup, inner container cgroups are placed under the envbox -// container's cgroup on the host. Tetragon's cgtracker can then associate -// these child cgroups with the parent pod for attribution. +// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into +// envbox's mount namespace because we don't use `--mount` on unshare. Using +// `--mount` would break sysbox-fs propagation (its per-container mounts +// under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd +// namespace). 
The observable side effect of the leak is that envbox's later +// cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2 +// has a fallback that handles this. // // See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 func wrapDockerdCmd(dargs []string) (string, []string) { - shellCmd := `set -e -umount /sys/fs/cgroup -mount -t cgroup2 cgroup /sys/fs/cgroup + shellCmd := fmt.Sprintf(`set -e +# cgroup v2: enable nesting if [ -f /sys/fs/cgroup/cgroup.controllers ]; then - mkdir -p /sys/fs/cgroup/init - while ! { - xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : - sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control - }; do :; done + # Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the + # fs root. Inner container cgroups will then be created under the envbox + # container's cgroup on the host. + umount /sys/fs/cgroup + mount -t cgroup2 cgroup /sys/fs/cgroup + + # Create the /init leaf group that the root group's processes will be + # moved into; cgroupv2 refuses to enable subtree_control on a cgroup + # that still has member processes of its own. + mkdir -p /sys/fs/cgroup/init + # this happens in a loop because things like "docker exec" on our dind + # container will create new processes, which creates a race between our + # moving everything to "init" and enabling subtree_control + envbox_attempts=0 + while ! { + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. 
+ xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do + envbox_attempts=$((envbox_attempts + 1)) + if [ "$envbox_attempts" -ge %d ]; then + echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2 + exit 1 + fi + done fi exec "$0" "$@" -` +`, dockerdSubtreeControlMaxAttempts) wrapperArgs := []string{ "--cgroup", "/bin/sh", "-c", shellCmd, - "dockerd", + dockerdBinName, } wrapperArgs = append(wrapperArgs, dargs...) return "unshare", wrapperArgs diff --git a/cli/docker_test.go b/cli/docker_test.go index 64ca338..3b07f82 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -178,18 +178,23 @@ func TestDocker(t *testing.T) { fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR), ) + dockerdArgs := []string{ + "--debug", + "--log-level=debug", + fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), + "--userns-remap=coder", + "--storage-driver=overlay2", + fmt.Sprintf("--bip=%s", bridgeCIDR), + } + // dockerd is launched via an unshare wrapper that exec's into + // dockerd with these args; assert against the full wrapped argv. + wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs) + expectedArgv := append([]string{wrapCmd}, wrapArgs...) + execer := clitest.Execer(ctx) execer.AddCommands(&xunixfake.FakeCmd{ FakeCmd: &testingexec.FakeCmd{ - Argv: []string{ - "dockerd", - "--debug", - "--log-level=debug", - fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), - "--userns-remap=coder", - "--storage-driver=overlay2", - fmt.Sprintf("--bip=%s", bridgeCIDR), - }, + Argv: expectedArgv, }, }) @@ -741,6 +746,42 @@ func TestDocker(t *testing.T) { }) } +func TestWrapDockerdCmd(t *testing.T) { + t.Parallel() + + dargs := []string{"--debug", "--mtu=1500"} + cmd, args := cli.WrapDockerdCmd(dargs) + + // The wrapper invokes `unshare`, exec'ing through `/bin/sh -c