Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions cmd/nerdctl/container/container_health_check_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1155,3 +1155,57 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
}
testCase.Run(t)
}

func TestStartHealthcheckedContainerAfterExited(t *testing.T) {
testCase := nerdtest.Setup()
testCase.Require = require.All(
require.Not(nerdtest.Rootless),
// Docker CLI does not provide a standalone healthcheck command.
require.Not(nerdtest.Docker),
)

testCase.Cleanup = func(data test.Data, helpers test.Helpers) {
helpers.Anyhow("rm", "-f", data.Identifier())
}

testCase.Setup = func(data test.Data, helpers test.Helpers) {
helpers.Ensure("run", "-d", "--name", data.Identifier(),
"--health-cmd", "true", "--health-interval", "1s",
testutil.CommonImage, "sleep", "1",
)
nerdtest.EnsureContainerExited(helpers, data.Identifier(), expect.ExitCodeSuccess)

data.Labels().Set("containerName", data.Identifier())
}

testCase.SubTests = []*test.Case{
{
Description: "transient service unit is garbage-collected from systemd after container exit",
NoParallel: true,
Command: func(data test.Data, helpers test.Helpers) test.TestableCommand {
containerID := nerdtest.InspectContainer(helpers, data.Labels().Get("containerName")).ID
return helpers.Custom("systemctl", "list-units", "--all", containerID+".service", containerID+".timer")
},
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
return &test.Expected{
ExitCode: expect.ExitCodeNoCheck,
Output: func(stdout string, t tig.T) {
containerID := nerdtest.InspectContainer(helpers, data.Labels().Get("containerName")).ID
assert.Assert(t, !strings.Contains(stdout, containerID),
"expected transient service unit to be cleaned up, but got: %s", stdout)
},
}
},
},
{
Description: "exited container with healthcheck can be started again",
NoParallel: true,
Command: func(data test.Data, helpers test.Helpers) test.TestableCommand {
return helpers.Command("start", data.Labels().Get("containerName"))
},
Expected: test.Expects(expect.ExitCodeSuccess, nil, nil),
},
}

testCase.Run(t)
}
36 changes: 14 additions & 22 deletions pkg/cmd/container/health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,21 @@ import (

// HealthCheck executes the health check command for a container
func HealthCheck(ctx context.Context, client *containerd.Client, container containerd.Container) error {
// verify container status and get task
task, err := isContainerRunning(ctx, container)
task, err := container.Task(ctx, nil)
if err != nil {
return fmt.Errorf("failed to get container task: %w", err)
}
// Check if container is running
status, err := task.Status(ctx)
if err != nil {
return err
return fmt.Errorf("failed to get container status: %w", err)
}
s := status.Status
if s != containerd.Running {
if s == containerd.Stopped {
healthcheck.CleanupStaleHealthcheckTimer(ctx, container.ID())
}
return fmt.Errorf("container is not running (status: %s)", status.Status)
}

// Check if container has health check configured
Expand Down Expand Up @@ -67,25 +78,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta
return healthcheck.ExecuteHealthCheck(ctx, task, container, hcConfig)
}

func isContainerRunning(ctx context.Context, container containerd.Container) (containerd.Task, error) {
// Get container task to check status
task, err := container.Task(ctx, nil)
if err != nil {
return nil, fmt.Errorf("failed to get container task: %w", err)
}

// Check if container is running
status, err := task.Status(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get container status: %w", err)
}
if status.Status != containerd.Running {
return nil, fmt.Errorf("container is not running (status: %s)", status.Status)
}

return task, nil
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
if configuredValue == 0 {
Expand Down
3 changes: 3 additions & 0 deletions pkg/healthcheck/healthcheck_manager_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
return nil
}

// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container.
func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {}

// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
// on systemd operations. On Darwin, this is a no-op since systemd is not available.
Expand Down
3 changes: 3 additions & 0 deletions pkg/healthcheck/healthcheck_manager_freebsd.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
return nil
}

// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container.
func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {}

// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
// on systemd operations. On Darwin, this is a no-op since systemd is not available.
Expand Down
146 changes: 80 additions & 66 deletions pkg/healthcheck/healthcheck_manager_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,23 @@ func CreateTimer(ctx context.Context, container containerd.Container, cfg *confi
}

// Always use health-interval for timer frequency
cmdOpts = append(cmdOpts, "--unit", containerID, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s")
//
// --collect:
// Even when the healthcheck fails with the error "container is not running" after the container has
// stopped, and the transient service unit enters a failed state, it will still be subject to garbage
// collection due to the --collect option. Without this option, `systemctl reset-failed` would explicitly be needed.
// See: https://www.freedesktop.org/software/systemd/man/latest/systemd-run.html#-G
cmdOpts = append(cmdOpts, "--unit", containerID, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s", "--collect")

cmdOpts = append(cmdOpts, nerdctlCmd)
cmdOpts = append(cmdOpts, nerdctlArgs...)
cmdOpts = append(cmdOpts, "container", "healthcheck", containerID)

// Defensively remove any pre-existing transient timer unit that may have leaked from a previous run
// (e.g. when restarted before the self-cleanup in HealthCheck has a chance to run). Without this,
// the systemd-run below would fail with "Unit <container-ID>.timer was already loaded".
CleanupStaleHealthcheckTimer(ctx, containerID)

log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmdOpts, " "))
run := exec.Command("systemd-run", cmdOpts...)
if out, err := run.CombinedOutput(); err != nil {
Expand All @@ -78,6 +89,22 @@ func CreateTimer(ctx context.Context, container containerd.Container, cfg *confi
return nil
}

func createDbusConn(ctx context.Context) (*dbus.Conn, error) {
var conn *dbus.Conn
var err error

if rootlessutil.IsRootless() {
conn, err = dbus.NewUserConnectionContext(ctx)
} else {
conn, err = dbus.NewSystemConnectionContext(ctx)
}
if err != nil {
return nil, fmt.Errorf("systemd DBUS connect error: %w", err)
}

return conn, nil
}

// StartTimer starts the healthcheck timer unit.
func StartTimer(ctx context.Context, container containerd.Container, cfg *config.Config) error {
hc := extractHealthcheck(ctx, container)
Expand All @@ -89,13 +116,7 @@ func StartTimer(ctx context.Context, container containerd.Container, cfg *config
}

containerID := container.ID()
var conn *dbus.Conn
var err error
if rootlessutil.IsRootless() {
conn, err = dbus.NewUserConnectionContext(ctx)
} else {
conn, err = dbus.NewSystemConnectionContext(ctx)
}
conn, err := createDbusConn(ctx)
if err != nil {
return fmt.Errorf("systemd DBUS connect error: %w", err)
}
Expand All @@ -122,6 +143,54 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
return ForceRemoveTransientHealthCheckFiles(ctx, container.ID())
}

func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {
conn, err := createDbusConn(ctx)
if err != nil {
log.G(ctx).Warnf("dbus connect failed during stale cleanup: %v", err)
return
}
defer conn.Close()

timer := containerID + ".timer"
units, err := conn.ListUnitsByNamesContext(ctx, []string{timer})
if err != nil {
log.G(ctx).Warnf("list units failed during stale cleanup: %v", err)
return
}
// The status of a systemd unit is described below:
// See: https://github.com/systemd/systemd/blob/v260.1/src/basic/unit-def.c
if len(units) == 0 || units[0].LoadState == "not-found" {
return
}
u := units[0]

log.G(ctx).Warnf("found stale healthcheck timer %s (load=%s, active=%s, sub=%s), cleaning up",
timer, u.LoadState, u.ActiveState, u.SubState)
timeoutCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
stopSystemdUnit(ctx, timeoutCtx, conn, timer)
}

func stopSystemdUnit(ctx context.Context, timeoutCtx context.Context, conn *dbus.Conn, unit string) {
if err := timeoutCtx.Err(); err != nil {
log.G(ctx).Warnf("context already done before stopping unit %s: %v", unit, err)
return
}
ch := make(chan string, 1)
if _, err := conn.StopUnitContext(timeoutCtx, unit, "ignore-dependencies", ch); err != nil {
log.G(ctx).Warnf("failed to stop unit %s: %v", unit, err)
return
}
select {
case msg := <-ch:
if msg != "done" {
log.G(ctx).Warnf("stop unit %s: unexpected result %s", unit, msg)
}
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout waiting for stop confirmation of unit %s", unit)
}
}

// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
// on systemd operations. It logs errors as warnings but continues cleanup attempts.
Expand All @@ -142,13 +211,7 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin
go func() {
defer close(errChan)

var conn *dbus.Conn
var err error
if rootlessutil.IsRootless() {
conn, err = dbus.NewUserConnectionContext(ctx)
} else {
conn, err = dbus.NewSystemConnectionContext(ctx)
}
conn, err := createDbusConn(ctx)
if err != nil {
log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err)
errChan <- fmt.Errorf("systemd DBUS connect error: %w", err)
Expand All @@ -158,61 +221,12 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin

// Stop timer with timeout
go func() {
select {
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer)
return
default:
tChan := make(chan string, 1)
if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil {
select {
case msg := <-tChan:
if msg != "done" {
log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg)
}
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer)
}
} else {
log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err)
}
}
stopSystemdUnit(ctx, timeoutCtx, conn, timer)
}()

// Stop service with timeout
go func() {
select {
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service)
return
default:
sChan := make(chan string, 1)
if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil {
select {
case msg := <-sChan:
if msg != "done" {
log.G(ctx).Warnf("service stop message during force cleanup: %s", msg)
}
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service)
}
} else {
log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err)
}
}
}()

// Reset failed units (best effort, non-blocking)
go func() {
select {
case <-timeoutCtx.Done():
log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service)
return
default:
if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil {
log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err)
}
}
stopSystemdUnit(ctx, timeoutCtx, conn, service)
}()

// Wait a short time for operations to complete, but don't block indefinitely
Expand Down
3 changes: 3 additions & 0 deletions pkg/healthcheck/healthcheck_manager_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
return nil
}

// CleanupStaleHealthcheckTimer removes any pre-existing transient timer unit for the given container.
func CleanupStaleHealthcheckTimer(ctx context.Context, containerID string) {}

// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
// on systemd operations. On Windows, this is a no-op since systemd is not available.
Expand Down
Loading