diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 1a2215c0..71bb75b4 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -226,9 +226,21 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan logger.FromContext(context.Background()).WarnContext(context.Background(), "failed to recover pending standby compression jobs", "error", err) } + // Heal any drift between the templates registry and on-disk + // instances after a crash or out-of-band fork delete. + if err := m.reconcileTemplateState(context.Background()); err != nil { + logger.FromContext(context.Background()).WarnContext(context.Background(), "failed to reconcile template state at boot", "error", err) + } + return m } +// ReconcileTemplateState heals registry/instances drift on demand. Useful +// for tests and for periodic GC tickers driven by the host process. +func (m *manager) ReconcileTemplateState(ctx context.Context) error { + return m.reconcileTemplateState(ctx) +} + // SetResourceValidator sets the resource validator for aggregate limit checking. // This is called after initialization to avoid circular dependencies. func (m *manager) SetResourceValidator(v ResourceValidator) { diff --git a/lib/instances/storage.go b/lib/instances/storage.go index f290298c..73eaf6d6 100644 --- a/lib/instances/storage.go +++ b/lib/instances/storage.go @@ -77,6 +77,21 @@ func (m *manager) loadMetadata(id string) (*metadata, error) { return &meta, nil } +// loadMetadataFromFile reads a metadata file by path. Used by sweepers +// that already have the path from listMetadataFiles and don't want to +// reverse-derive an instance id. +func loadMetadataFromFile(path string) (*metadata, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read metadata: %w", err) + } + var meta metadata + if err := json.Unmarshal(data, &meta); err != nil { + return nil, fmt.Errorf("unmarshal metadata: %w", err) + } + return &meta, nil +} + // saveMetadata saves instance metadata to disk func (m *manager) saveMetadata(meta *metadata) error { metaPath := m.paths.InstanceMetadata(meta.Id) diff --git a/lib/instances/templates.go b/lib/instances/templates.go index 91383615..e5f00a42 100644 --- a/lib/instances/templates.go +++ b/lib/instances/templates.go @@ -316,3 +316,68 @@ func (m *manager) dropTemplateForkRefcount(ctx context.Context, templateID strin "template_id", templateID, "error", err) } } + +// reconcileTemplateState heals drift between the templates registry and +// the instances directory. It rewrites ForkCount on every template using +// the actual ForkOfTemplate population on disk, and removes registry +// entries whose source instance no longer exists. Safe to call at boot +// and on a periodic GC tick. Bestow the work on a background goroutine +// when the cost matters; this method is synchronous and modest. +func (m *manager) reconcileTemplateState(ctx context.Context) error { + if m.templateRegistry == nil { + return nil + } + log := logger.FromContext(ctx) + + metaFiles, err := m.listMetadataFiles() + if err != nil { + return fmt.Errorf("list instance metadata for template GC: %w", err) + } + + observedForks := make(map[string]int) + templateSourceInstances := make(map[string]struct{}) + for _, path := range metaFiles { + meta, err := loadMetadataFromFile(path) + if err != nil { + log.WarnContext(ctx, "skip metadata during template GC", "path", path, "error", err) + continue + } + stored := meta.StoredMetadata + if stored.ForkOfTemplate != "" { + observedForks[stored.ForkOfTemplate]++ + } + if stored.IsTemplate { + templateSourceInstances[stored.Id] = struct{}{} + } + } + + if err := m.templateRegistry.Reconcile(ctx, observedForks); err != nil { + return fmt.Errorf("reconcile template fork counts: %w", err) + } + + all, err := m.templateRegistry.List(ctx, nil) + if err != nil { + return fmt.Errorf("list templates for orphan sweep: %w", err) + } + for _, tpl := range all { + if _, ok := templateSourceInstances[tpl.SourceInstanceID]; ok { + continue + } + if tpl.ForkCount > 0 { + // Source gone but forks still reference the template; leave + // the registry entry so future fork GC can find it. The + // underlying mem-file is still alive on disk because forks + // hold open file descriptors via the symlink. + log.WarnContext(ctx, "template has live forks but no source instance", + "template_id", tpl.ID, "fork_count", tpl.ForkCount) + continue + } + log.InfoContext(ctx, "deleting orphaned template registry entry", + "template_id", tpl.ID, "source_instance_id", tpl.SourceInstanceID) + if err := m.templateRegistry.Delete(ctx, tpl.ID); err != nil { + log.WarnContext(ctx, "failed to delete orphaned template", + "template_id", tpl.ID, "error", err) + } + } + return nil +} diff --git a/lib/templates/registry.go b/lib/templates/registry.go index fed3f169..0cb2cb9c 100644 --- a/lib/templates/registry.go +++ b/lib/templates/registry.go @@ -45,6 +45,13 @@ type Registry interface { // template (floor 0). Used when a fork is deleted. Touching // templates that were already deleted is a no-op. DecrementForkCount(ctx context.Context, id string) (*Template, error) + + // Reconcile walks the registry and rewrites ForkCount on every + // template using observedForks: the count of live forks per + // template id. Templates not present in observedForks fall to + // zero. Used to heal drift after a crash, an out-of-band fork + // delete, or any other path that bypassed Increment/Decrement. + Reconcile(ctx context.Context, observedForks map[string]int) error } // ListFilter narrows the templates returned by Registry.List. @@ -231,6 +238,34 @@ func (r *FileRegistry) IncrementForkCount(ctx context.Context, id string) (*Temp return t, nil } +// Reconcile rewrites ForkCount on every persisted template using +// observedForks as the authority. Templates not present in observedForks +// are treated as having zero live forks. Errors on individual templates +// are returned as a wrapped multi-error so the caller can decide whether +// to treat partial reconciliation as fatal; reconciliation is best-effort +// and never deletes templates by itself. +func (r *FileRegistry) Reconcile(ctx context.Context, observedForks map[string]int) error { + _ = ctx + r.mu.Lock() + defer r.mu.Unlock() + all, err := r.listLocked() + if err != nil { + return err + } + var firstErr error + for _, t := range all { + want := observedForks[t.ID] + if t.ForkCount == want { + continue + } + t.ForkCount = want + if err := r.writeLocked(t); err != nil && firstErr == nil { + firstErr = fmt.Errorf("reconcile template %s: %w", t.ID, err) + } + } + return firstErr +} + func (r *FileRegistry) DecrementForkCount(ctx context.Context, id string) (*Template, error) { _ = ctx r.mu.Lock() diff --git a/lib/templates/registry_test.go b/lib/templates/registry_test.go index 79a93031..164fe13d 100644 --- a/lib/templates/registry_test.go +++ b/lib/templates/registry_test.go @@ -123,3 +123,32 @@ func TestFileRegistry_SaveValidates(t *testing.T) { err := r.Save(context.Background(), &Template{Name: "x"}) assert.True(t, errors.Is(err, ErrInvalid)) } + +func TestFileRegistry_Reconcile(t *testing.T) { + r := newTestRegistry(t) + a := sampleTemplate("a", "alpha") + a.ForkCount = 5 + b := sampleTemplate("b", "beta") + b.ForkCount = 0 + c := sampleTemplate("c", "gamma") + c.ForkCount = 7 + require.NoError(t, r.Save(context.Background(), a)) + require.NoError(t, r.Save(context.Background(), b)) + require.NoError(t, r.Save(context.Background(), c)) + + require.NoError(t, r.Reconcile(context.Background(), map[string]int{ + "a": 2, + "b": 3, + // c omitted -> should fall to 0 + })) + + got, err := r.Get(context.Background(), "a") + require.NoError(t, err) + assert.Equal(t, 2, got.ForkCount) + got, err = r.Get(context.Background(), "b") + require.NoError(t, err) + assert.Equal(t, 3, got.ForkCount) + got, err = r.Get(context.Background(), "c") + require.NoError(t, err) + assert.Equal(t, 0, got.ForkCount) +}