diff --git a/coreapi/lifecycle.go b/coreapi/lifecycle.go index 5482acc..90f61b1 100644 --- a/coreapi/lifecycle.go +++ b/coreapi/lifecycle.go @@ -121,9 +121,33 @@ func startWithPanicRecovery(ctx context.Context, s Service, deps Deps) (err erro return s.Start(ctx, deps) } -// StopAll stops every started service in reverse order. Errors from -// individual Stop calls are collected; the first one is returned but -// every service still gets its Stop call invoked. +// stopWithPanicRecovery calls s.Stop(ctx) inside a defer recover() +// so a buggy plugin panicking during shutdown (channel-send on closed +// channel, nil dereference, os.Remove of a path whose parent dir +// vanished) surfaces as a normal Stop error rather than crashing the +// entire daemon process mid-teardown. +// +// Without this wrapper, a plugin that panics in Stop propagates the +// panic up through StopAll, terminating the daemon before remaining +// plugins get their Stop calls — leaving orphaned goroutines, half- +// written on-disk state, and no tear-down log. +func stopWithPanicRecovery(ctx context.Context, s Service) (err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("plugin %q Stop panicked: %v", s.Name(), r) + } + }() + return s.Stop(ctx) +} + +// StopAll stops every started service in reverse order. Each Stop is +// wrapped in stopWithPanicRecovery so a buggy plugin panicking during +// shutdown cannot crash the daemon; the panic is converted to an error +// and all remaining services still get their Stop call. +// +// Errors from individual Stop calls (including recovered panics) are +// collected; the first one is returned but every service still gets +// its Stop call invoked. func (sr *ServiceRegistry) StopAll(ctx context.Context) error { sr.mu.Lock() queue := append([]Service(nil), sr.started...) @@ -132,7 +156,7 @@ func (sr *ServiceRegistry) StopAll(ctx context.Context) error { var firstErr error for i := len(queue) - 1; i >= 0; i-- { - if err := queue[i].Stop(ctx); err != nil && firstErr == nil { + if err := stopWithPanicRecovery(ctx, queue[i]); err != nil && firstErr == nil { firstErr = err } } diff --git a/coreapi/zz_panic_recovery_test.go b/coreapi/zz_panic_recovery_test.go index 3918548..1a9b853 100644 --- a/coreapi/zz_panic_recovery_test.go +++ b/coreapi/zz_panic_recovery_test.go @@ -29,6 +29,88 @@ func (p *panickingService) Order() int { retu func (p *panickingService) Start(_ context.Context, _ coreapi.Deps) error { panic(p.msg) } func (p *panickingService) Stop(_ context.Context) error { return nil } +// panickingStop panics during Stop with the given message. +type panickingStop struct{ msg string } + +func (p *panickingStop) Name() string { return "panicker-stop" } +func (p *panickingStop) Order() int { return 100 } +func (p *panickingStop) Start(_ context.Context, _ coreapi.Deps) error { return nil } +func (p *panickingStop) Stop(_ context.Context) error { panic(p.msg) } + +func TestServiceRegistry_StopAllRecoversFromPluginPanic(t *testing.T) { + t.Parallel() + + sr := &coreapi.ServiceRegistry{} + if err := sr.Register(&panickingStop{msg: "boom during shutdown"}); err != nil { + t.Fatalf("Register: %v", err) + } + + // Start first so the service is in the started queue. + if err := sr.StartAll(context.Background(), coreapi.Deps{}); err != nil { + t.Fatalf("StartAll: %v", err) + } + + // Without the recover wrapper in StopAll, this CRASHES the test process. + err := sr.StopAll(context.Background()) + + if err == nil { + t.Fatal("StopAll returned nil for panicking plugin — recover wrapper missing") + } + if !strings.Contains(err.Error(), "panic") { + t.Errorf("expected error to mention 'panic'; got %q", err.Error()) + } + if !strings.Contains(err.Error(), "boom during shutdown") { + t.Errorf("expected error to include the panic message; got %q", err.Error()) + } +} + +// stopThenPanic panics during Stop AFTER calling an observer so the +// test can verify downstream services still get their Stop calls. +type stopThenPanic struct { + name string + order int + called *bool + msg string +} + +func (s *stopThenPanic) Name() string { return s.name } +func (s *stopThenPanic) Order() int { return s.order } +func (s *stopThenPanic) Start(_ context.Context, _ coreapi.Deps) error { return nil } +func (s *stopThenPanic) Stop(_ context.Context) error { + *s.called = true + panic(s.msg) +} + +func TestServiceRegistry_StopAllContinuesAfterPanic(t *testing.T) { + t.Parallel() + + sr := &coreapi.ServiceRegistry{} + + laterCalled := false + panicker := &stopThenPanic{name: "panicker", order: 50, called: new(bool), msg: "crash"} + later := &stopThenPanic{name: "later", order: 100, called: &laterCalled, msg: ""} + + // later.Order > panicker.Order → later stops first (reverse order). + // panicker panics in Stop; later should already have been stopped. + for _, s := range []coreapi.Service{panicker, later} { + if err := sr.Register(s); err != nil { + t.Fatalf("Register: %v", err) + } + } + + if err := sr.StartAll(context.Background(), coreapi.Deps{}); err != nil { + t.Fatalf("StartAll: %v", err) + } + + _ = sr.StopAll(context.Background()) + + // later must have been stopped despite panicker panicking earlier + // (later starts first, so it stops first — reverse order). + if !laterCalled { + t.Error("downstream service was not stopped after panicker; StopAll aborted early") + } +} + func TestServiceRegistry_StartAllRecoversFromPluginPanic(t *testing.T) { t.Parallel()