diff --git a/cmd/nerdctl/container/container_health_check_linux_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go index 1217045a3ed..ebdf150fb26 100644 --- a/cmd/nerdctl/container/container_health_check_linux_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -1155,3 +1155,57 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { } testCase.Run(t) } + +func TestStartHealthcheckedContainerAfterExited(t *testing.T) { + testCase := nerdtest.Setup() + testCase.Require = require.All( + require.Not(nerdtest.Rootless), + // Docker CLI does not provide a standalone healthcheck command. + require.Not(nerdtest.Docker), + ) + + testCase.Cleanup = func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + } + + testCase.Setup = func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "true", "--health-interval", "1s", + testutil.CommonImage, "sleep", "1", + ) + nerdtest.EnsureContainerExited(helpers, data.Identifier(), expect.ExitCodeSuccess) + + data.Labels().Set("containerName", data.Identifier()) + } + + testCase.SubTests = []*test.Case{ + { + Description: "transient service unit is garbage-collected from systemd after container exit", + NoParallel: true, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + containerID := nerdtest.InspectContainer(helpers, data.Labels().Get("containerName")).ID + return helpers.Custom("systemctl", "list-units", "--all", containerID+".service", containerID+".timer") + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + containerID := nerdtest.InspectContainer(helpers, data.Labels().Get("containerName")).ID + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected transient service unit to be cleaned up, but got: %s", stdout) + }, + } + }, + }, + { + Description: "exited container with healthcheck can be started again", + NoParallel: true, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("start", data.Labels().Get("containerName")) + }, + Expected: test.Expects(expect.ExitCodeSuccess, nil, nil), + }, + } + + testCase.Run(t) +} diff --git a/pkg/cmd/container/health_check.go b/pkg/cmd/container/health_check.go index e2646497c3f..1a96028eb50 100644 --- a/pkg/cmd/container/health_check.go +++ b/pkg/cmd/container/health_check.go @@ -29,10 +29,21 @@ import ( // HealthCheck executes the health check command for a container func HealthCheck(ctx context.Context, client *containerd.Client, container containerd.Container) error { - // verify container status and get task - task, err := isContainerRunning(ctx, container) + task, err := container.Task(ctx, nil) + if err != nil { + return fmt.Errorf("failed to get container task: %w", err) + } + // Check if container is running + status, err := task.Status(ctx) if err != nil { - return err + return fmt.Errorf("failed to get container status: %w", err) + } + s := status.Status + if s != containerd.Running { + if s == containerd.Stopped { + healthcheck.CleanupStaleHealthcheckTimer(ctx, container.ID()) + } + return fmt.Errorf("container is not running (status: %s)", status.Status) } // Check if container has health check configured @@ -67,25 +78,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta return healthcheck.ExecuteHealthCheck(ctx, task, container, hcConfig) } -func isContainerRunning(ctx context.Context, container containerd.Container) (containerd.Task, error) { - // Get container task to check status - task, err := container.Task(ctx, nil) - if err != nil { - return nil, fmt.Errorf("failed to get container task: %w", err) - } - - // Check if container is running - status, err := task.Status(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get container status: %w", err) - } - if status.Status != containerd.Running { - return nil, fmt.Errorf("container is not running (status: %s)", status.Status) - } - - return task, nil -} - // If configuredValue is zero, use defaultValue instead. func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { if configuredValue == 0 { diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go index 289d0c16704..f48e5df6a38 100644 --- a/pkg/healthcheck/healthcheck_manager_darwin.go +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C return nil } +// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container. +func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {} + // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging // on systemd operations. On Darwin, this is a no-op since systemd is not available. diff --git a/pkg/healthcheck/healthcheck_manager_freebsd.go b/pkg/healthcheck/healthcheck_manager_freebsd.go index 289d0c16704..f48e5df6a38 100644 --- a/pkg/healthcheck/healthcheck_manager_freebsd.go +++ b/pkg/healthcheck/healthcheck_manager_freebsd.go @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C return nil } +// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container. +func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {} + // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging // on systemd operations. On Darwin, this is a no-op since systemd is not available. diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index 92b49bd0cc4..f3a992c2bc4 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -63,12 +63,23 @@ func CreateTimer(ctx context.Context, container containerd.Container, cfg *confi } // Always use health-interval for timer frequency - cmdOpts = append(cmdOpts, "--unit", containerID, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s") + // + // --collect: + // Even when the healthcheck fails with the error "container is not running" after the container has + // stopped, and the transient service unit enters a failed state, it will still be subject to garbage + // collection due to the --collect option. Without this option, `systemctl reset-failed` would explicitly be needed. + // See: https://www.freedesktop.org/software/systemd/man/latest/systemd-run.html#-G + cmdOpts = append(cmdOpts, "--unit", containerID, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s", "--collect") cmdOpts = append(cmdOpts, nerdctlCmd) cmdOpts = append(cmdOpts, nerdctlArgs...) cmdOpts = append(cmdOpts, "container", "healthcheck", containerID) + // Defensively remove any pre-existing transient timer unit that may have leaked from a previous run + // (e.g. when restarted before the self-cleanup in HealthCheck has a chance to run). Without this, + // the systemd-run below would fail with "Unit .timer was already loaded". + CleanupStaleHealthcheckTimer(ctx, containerID) + log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmdOpts, " ")) run := exec.Command("systemd-run", cmdOpts...) if out, err := run.CombinedOutput(); err != nil { @@ -78,6 +89,22 @@ func CreateTimer(ctx context.Context, container containerd.Container, cfg *confi return nil } +func createDbusConn(ctx context.Context) (*dbus.Conn, error) { + var conn *dbus.Conn + var err error + + if rootlessutil.IsRootless() { + conn, err = dbus.NewUserConnectionContext(ctx) + } else { + conn, err = dbus.NewSystemConnectionContext(ctx) + } + if err != nil { + return nil, fmt.Errorf("systemd DBUS connect error: %w", err) + } + + return conn, nil +} + // StartTimer starts the healthcheck timer unit. func StartTimer(ctx context.Context, container containerd.Container, cfg *config.Config) error { hc := extractHealthcheck(ctx, container) @@ -89,13 +116,7 @@ func StartTimer(ctx context.Context, container containerd.Container, cfg *config } containerID := container.ID() - var conn *dbus.Conn - var err error - if rootlessutil.IsRootless() { - conn, err = dbus.NewUserConnectionContext(ctx) - } else { - conn, err = dbus.NewSystemConnectionContext(ctx) - } + conn, err := createDbusConn(ctx) if err != nil { return fmt.Errorf("systemd DBUS connect error: %w", err) } @@ -122,6 +143,54 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C return ForceRemoveTransientHealthCheckFiles(ctx, container.ID()) } +func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) { + conn, err := createDbusConn(ctx) + if err != nil { + log.G(ctx).Warnf("dbus connect failed during stale cleanup: %v", err) + return + } + defer conn.Close() + + timer := containerID + ".timer" + units, err := conn.ListUnitsByNamesContext(ctx, []string{timer}) + if err != nil { + log.G(ctx).Warnf("list units failed during stale cleanup: %v", err) + return + } + // The status of a systemd unit is described below: + // See: https://github.com/systemd/systemd/blob/v260.1/src/basic/unit-def.c + if len(units) == 0 || units[0].LoadState == "not-found" { + return + } + u := units[0] + + log.G(ctx).Warnf("found stale healthcheck timer %s (load=%s, active=%s, sub=%s), cleaning up", + timer, u.LoadState, u.ActiveState, u.SubState) + timeoutCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + stopSystemdUnit(ctx, timeoutCtx, conn, timer) +} + +func stopSystemdUnit(ctx context.Context, timeoutCtx context.Context, conn *dbus.Conn, unit string) { + if err := timeoutCtx.Err(); err != nil { + log.G(ctx).Warnf("context already done before stopping unit %s: %v", unit, err) + return + } + ch := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, unit, "ignore-dependencies", ch); err != nil { + log.G(ctx).Warnf("failed to stop unit %s: %v", unit, err) + return + } + select { + case msg := <-ch: + if msg != "done" { + log.G(ctx).Warnf("stop unit %s: unexpected result %s", unit, msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for stop confirmation of unit %s", unit) + } +} + // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging // on systemd operations. It logs errors as warnings but continues cleanup attempts. @@ -142,13 +211,7 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin go func() { defer close(errChan) - var conn *dbus.Conn - var err error - if rootlessutil.IsRootless() { - conn, err = dbus.NewUserConnectionContext(ctx) - } else { - conn, err = dbus.NewSystemConnectionContext(ctx) - } + conn, err := createDbusConn(ctx) if err != nil { log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err) errChan <- fmt.Errorf("systemd DBUS connect error: %w", err) @@ -158,61 +221,12 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin // Stop timer with timeout go func() { - select { - case <-timeoutCtx.Done(): - log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer) - return - default: - tChan := make(chan string, 1) - if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil { - select { - case msg := <-tChan: - if msg != "done" { - log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg) - } - case <-timeoutCtx.Done(): - log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer) - } - } else { - log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err) - } - } + stopSystemdUnit(ctx, timeoutCtx, conn, timer) }() // Stop service with timeout go func() { - select { - case <-timeoutCtx.Done(): - log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service) - return - default: - sChan := make(chan string, 1) - if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil { - select { - case msg := <-sChan: - if msg != "done" { - log.G(ctx).Warnf("service stop message during force cleanup: %s", msg) - } - case <-timeoutCtx.Done(): - log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service) - } - } else { - log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err) - } - } - }() - - // Reset failed units (best effort, non-blocking) - go func() { - select { - case <-timeoutCtx.Done(): - log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service) - return - default: - if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil { - log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err) - } - } + stopSystemdUnit(ctx, timeoutCtx, conn, service) }() // Wait a short time for operations to complete, but don't block indefinitely diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go index e5fa58a4a08..efd606e7da1 100644 --- a/pkg/healthcheck/healthcheck_manager_windows.go +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C return nil } +// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container. +func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {} + // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging // on systemd operations. On Windows, this is a no-op since systemd is not available.