From 672947f95a1057d1e1eecf8d7b408428df8aff77 Mon Sep 17 00:00:00 2001 From: Eric Pickard Date: Thu, 9 Apr 2026 13:46:29 -0400 Subject: [PATCH 1/5] add informer sync timeout Signed-off-by: Eric Pickard --- internal/controller/controller.go | 16 +++++++-- internal/controller/controller_test.go | 47 ++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 52febfb..d1a8adf 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -38,6 +38,10 @@ const ( // deployment record API. Once an artifact is known to be missing, // we suppress further API calls for this duration. unknownArtifactTTL = 1 * time.Hour + + // informerSyncTimeoutDuration is the maximum duration of time allowed + // for the informers to sync to prevent the controller from hanging indefinitely. + informerSyncTimeoutDuration = 60 * time.Second ) type ttlCache interface { @@ -92,6 +96,9 @@ type Controller struct { // best effort cache to suppress API calls for artifacts that // returned a 404 (no artifact found). Keyed by image digest. unknownArtifacts ttlCache + // informerSyncTimeout is the maximum time allowed for all informers to sync + // and prevents sync from hanging indefinitely. + informerSyncTimeout time.Duration } // New creates a new deployment tracker controller. @@ -160,6 +167,7 @@ func New(clientset kubernetes.Interface, metadataAggregator podMetadataAggregato cfg: cfg, observedDeployments: amcache.NewExpiring(), unknownArtifacts: amcache.NewExpiring(), + informerSyncTimeout: informerSyncTimeoutDuration, } // Add event handlers to the informer @@ -320,7 +328,9 @@ func (c *Controller) Run(ctx context.Context, workers int) error { // Wait for the caches to be synced slog.Info("Waiting for informer caches to sync") - if !cache.WaitForCacheSync(ctx.Done(), + informerSyncCxt, cancel := context.WithTimeout(ctx, c.informerSyncTimeout) + + if !cache.WaitForCacheSync(informerSyncCxt.Done(), c.podInformer.HasSynced, c.deploymentInformer.HasSynced, c.daemonSetInformer.HasSynced, @@ -328,8 +338,10 @@ func (c *Controller) Run(ctx context.Context, workers int) error { c.jobInformer.HasSynced, c.cronJobInformer.HasSynced, ) { - return errors.New("timed out waiting for caches to sync") + cancel() + return errors.New("timed out waiting for caches to sync - please ensure deployment tracker has the correct kubernetes permissions") } + cancel() slog.Info("Starting workers", "count", workers, diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index f758a4d..5c3122c 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -12,7 +12,11 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" amcache "k8s.io/apimachinery/pkg/util/cache" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" + "k8s.io/client-go/util/workqueue" ) // mockPoster records all PostOne calls and returns a configurable error. @@ -533,3 +537,46 @@ func TestIsTerminalPhase(t *testing.T) { }) } } + +func TestRun_InformerSyncTimeout(t *testing.T) { + t.Parallel() + fakeClient := fake.NewSimpleClientset() + fakeClient.PrependReactor("list", "*", func(action k8stesting.Action) (bool, runtime.Object, error) { + // Block until the test context is cancelled. + <-make(chan struct{}) + return true, nil, nil + }) + + factory := createInformerFactory(fakeClient, "", "") + + ctrl := &Controller{ + clientset: fakeClient, + podInformer: factory.Core().V1().Pods().Informer(), + deploymentInformer: factory.Apps().V1().Deployments().Informer(), + daemonSetInformer: factory.Apps().V1().DaemonSets().Informer(), + statefulSetInformer: factory.Apps().V1().StatefulSets().Informer(), + jobInformer: factory.Batch().V1().Jobs().Informer(), + cronJobInformer: factory.Batch().V1().CronJobs().Informer(), + workqueue: workqueue.NewTypedRateLimitingQueue( + workqueue.DefaultTypedControllerRateLimiter[PodEvent](), + ), + apiClient: &mockPoster{}, + cfg: &Config{}, + observedDeployments: amcache.NewExpiring(), + unknownArtifacts: amcache.NewExpiring(), + informerSyncTimeout: 2 * time.Second, + } + + errCh := make(chan error, 1) + go func() { + errCh <- ctrl.Run(context.Background(), 1) + }() + + select { + case err := <-errCh: + require.Error(t, err) + assert.Contains(t, err.Error(), "timed out waiting for caches to sync") + case <-time.After(5 * time.Second): + t.Fatal("Run did not return within 5 seconds — informer sync timeout was 2 seconds") + } +} From d2ee730940e5d79035092e53dea7ded90bfb758b Mon Sep 17 00:00:00 2001 From: Eric Pickard Date: Thu, 9 Apr 2026 15:47:44 -0400 Subject: [PATCH 2/5] fix linting issue Signed-off-by: Eric Pickard --- internal/controller/controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index 5c3122c..39137a0 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -541,7 +541,7 @@ func TestIsTerminalPhase(t *testing.T) { func TestRun_InformerSyncTimeout(t *testing.T) { t.Parallel() fakeClient := fake.NewSimpleClientset() - fakeClient.PrependReactor("list", "*", func(action k8stesting.Action) (bool, runtime.Object, error) { + fakeClient.PrependReactor("list", "*", func(_ k8stesting.Action) (bool, runtime.Object, error) { // Block until the test context is cancelled. <-make(chan struct{}) return true, nil, nil From 32d1f4d8c9bfe4451e41666e54d4d2a1395d998f Mon Sep 17 00:00:00 2001 From: Eric Pickard Date: Thu, 9 Apr 2026 16:39:05 -0400 Subject: [PATCH 3/5] minor fixes Signed-off-by: Eric Pickard --- internal/controller/controller.go | 4 ++-- internal/controller/controller_test.go | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/internal/controller/controller.go b/internal/controller/controller.go index d1a8adf..7a078da 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -328,9 +328,9 @@ func (c *Controller) Run(ctx context.Context, workers int) error { // Wait for the caches to be synced slog.Info("Waiting for informer caches to sync") - informerSyncCxt, cancel := context.WithTimeout(ctx, c.informerSyncTimeout) + informerSyncCtx, cancel := context.WithTimeout(ctx, c.informerSyncTimeout) - if !cache.WaitForCacheSync(informerSyncCxt.Done(), + if !cache.WaitForCacheSync(informerSyncCtx.Done(), c.podInformer.HasSynced, c.deploymentInformer.HasSynced, c.daemonSetInformer.HasSynced, diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index 39137a0..a35937c 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -541,11 +541,13 @@ func TestIsTerminalPhase(t *testing.T) { func TestRun_InformerSyncTimeout(t *testing.T) { t.Parallel() fakeClient := fake.NewSimpleClientset() + blocker := make(chan struct{}) fakeClient.PrependReactor("list", "*", func(_ k8stesting.Action) (bool, runtime.Object, error) { - // Block until the test context is cancelled. - <-make(chan struct{}) + // Block until the test completes. + <-blocker return true, nil, nil }) + defer close(blocker) factory := createInformerFactory(fakeClient, "", "") From c58ea295f067522cb5c16c3f1c8e65557c32bff8 Mon Sep 17 00:00:00 2001 From: Eric Pickard Date: Thu, 9 Apr 2026 17:41:39 -0400 Subject: [PATCH 4/5] address comments Signed-off-by: Eric Pickard --- internal/controller/controller.go | 3 +++ internal/controller/controller_test.go | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 7a078da..b722d3b 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -339,6 +339,9 @@ func (c *Controller) Run(ctx context.Context, workers int) error { c.cronJobInformer.HasSynced, ) { cancel() + if ctx.Err() != nil { + return fmt.Errorf("cache sync interrupted: %w", ctx.Err()) + } return errors.New("timed out waiting for caches to sync - please ensure deployment tracker has the correct kubernetes permissions") } cancel() diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index a35937c..7c797fd 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -545,7 +545,7 @@ func TestRun_InformerSyncTimeout(t *testing.T) { fakeClient.PrependReactor("list", "*", func(_ k8stesting.Action) (bool, runtime.Object, error) { // Block until the test completes. <-blocker - return true, nil, nil + return true, nil, fmt.Errorf("fail") }) defer close(blocker) @@ -569,9 +569,12 @@ func TestRun_InformerSyncTimeout(t *testing.T) { informerSyncTimeout: 2 * time.Second, } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + errCh := make(chan error, 1) go func() { - errCh <- ctrl.Run(context.Background(), 1) + errCh <- ctrl.Run(ctx, 1) }() select { From f967a993afba96bf268057124b0a2d16bb8ef040 Mon Sep 17 00:00:00 2001 From: Eric Pickard Date: Thu, 9 Apr 2026 17:45:29 -0400 Subject: [PATCH 5/5] fix linting Signed-off-by: Eric Pickard --- internal/controller/controller_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index 7c797fd..e66a61b 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -2,6 +2,7 @@ package controller import ( "context" + "errors" "fmt" "sync" "testing" @@ -545,7 +546,7 @@ func TestRun_InformerSyncTimeout(t *testing.T) { fakeClient.PrependReactor("list", "*", func(_ k8stesting.Action) (bool, runtime.Object, error) { // Block until the test completes. <-blocker - return true, nil, fmt.Errorf("fail") + return true, nil, errors.New("fail") }) defer close(blocker)