Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,21 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan
logger.FromContext(context.Background()).WarnContext(context.Background(), "failed to recover pending standby compression jobs", "error", err)
}

// Heal any drift between the templates registry and on-disk
// instances after a crash or out-of-band fork delete.
if err := m.reconcileTemplateState(context.Background()); err != nil {
logger.FromContext(context.Background()).WarnContext(context.Background(), "failed to reconcile template state at boot", "error", err)
}

return m
}

// ReconcileTemplateState heals registry/instances drift on demand. Useful
// for tests and for periodic GC tickers driven by the host process.
func (m *manager) ReconcileTemplateState(ctx context.Context) error {
return m.reconcileTemplateState(ctx)
}

// SetResourceValidator sets the resource validator for aggregate limit checking.
// This is called after initialization to avoid circular dependencies.
func (m *manager) SetResourceValidator(v ResourceValidator) {
Expand Down
15 changes: 15 additions & 0 deletions lib/instances/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,21 @@ func (m *manager) loadMetadata(id string) (*metadata, error) {
return &meta, nil
}

// loadMetadataFromFile reads a metadata file by path. Used by sweepers
// that already have the path from listMetadataFiles and don't want to
// reverse-derive an instance id.
func loadMetadataFromFile(path string) (*metadata, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("read metadata: %w", err)
}
var meta metadata
if err := json.Unmarshal(data, &meta); err != nil {
return nil, fmt.Errorf("unmarshal metadata: %w", err)
}
return &meta, nil
}

// saveMetadata saves instance metadata to disk
func (m *manager) saveMetadata(meta *metadata) error {
metaPath := m.paths.InstanceMetadata(meta.Id)
Expand Down
65 changes: 65 additions & 0 deletions lib/instances/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,3 +316,68 @@ func (m *manager) dropTemplateForkRefcount(ctx context.Context, templateID strin
"template_id", templateID, "error", err)
}
}

// reconcileTemplateState heals drift between the templates registry and
// the instances directory. It rewrites ForkCount on every template using
// the actual ForkOfTemplate population on disk, and removes registry
// entries whose source instance no longer exists. Safe to call at boot
// and on a periodic GC tick. Bestow the work on a background goroutine
// when the cost matters; this method is synchronous and modest.
func (m *manager) reconcileTemplateState(ctx context.Context) error {
if m.templateRegistry == nil {
return nil
}
log := logger.FromContext(ctx)

metaFiles, err := m.listMetadataFiles()
if err != nil {
return fmt.Errorf("list instance metadata for template GC: %w", err)
}

observedForks := make(map[string]int)
templateSourceInstances := make(map[string]struct{})
for _, path := range metaFiles {
meta, err := loadMetadataFromFile(path)
if err != nil {
log.WarnContext(ctx, "skip metadata during template GC", "path", path, "error", err)
continue
}
stored := meta.StoredMetadata
if stored.ForkOfTemplate != "" {
observedForks[stored.ForkOfTemplate]++
}
if stored.IsTemplate {
templateSourceInstances[stored.Id] = struct{}{}
}
}

if err := m.templateRegistry.Reconcile(ctx, observedForks); err != nil {
return fmt.Errorf("reconcile template fork counts: %w", err)
}

all, err := m.templateRegistry.List(ctx, nil)
if err != nil {
return fmt.Errorf("list templates for orphan sweep: %w", err)
}
for _, tpl := range all {
if _, ok := templateSourceInstances[tpl.SourceInstanceID]; ok {
continue
}
if tpl.ForkCount > 0 {
// Source gone but forks still reference the template; leave
// the registry entry so future fork GC can find it. The
// underlying mem-file is still alive on disk because forks
// hold open file descriptors via the symlink.
log.WarnContext(ctx, "template has live forks but no source instance",
"template_id", tpl.ID, "fork_count", tpl.ForkCount)
continue
}
log.InfoContext(ctx, "deleting orphaned template registry entry",
"template_id", tpl.ID, "source_instance_id", tpl.SourceInstanceID)
if err := m.templateRegistry.Delete(ctx, tpl.ID); err != nil {
log.WarnContext(ctx, "failed to delete orphaned template",
"template_id", tpl.ID, "error", err)
}
}
return nil
}
35 changes: 35 additions & 0 deletions lib/templates/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ type Registry interface {
// template (floor 0). Used when a fork is deleted. Touching
// templates that were already deleted is a no-op.
DecrementForkCount(ctx context.Context, id string) (*Template, error)

// Reconcile walks the registry and rewrites ForkCount on every
// template using observedForks: the count of live forks per
// template id. Templates not present in observedForks fall to
// zero. Used to heal drift after a crash, an out-of-band fork
// delete, or any other path that bypassed Increment/Decrement.
Reconcile(ctx context.Context, observedForks map[string]int) error
}

// ListFilter narrows the templates returned by Registry.List.
Expand Down Expand Up @@ -231,6 +238,34 @@ func (r *FileRegistry) IncrementForkCount(ctx context.Context, id string) (*Temp
return t, nil
}

// Reconcile rewrites ForkCount on every persisted template using
// observedForks as the authority. Templates not present in observedForks
// are treated as having zero live forks. Errors on individual templates
// are returned as a wrapped multi-error so the caller can decide whether
// to treat partial reconciliation as fatal; reconciliation is best-effort
// and never deletes templates by itself.
func (r *FileRegistry) Reconcile(ctx context.Context, observedForks map[string]int) error {
_ = ctx
r.mu.Lock()
defer r.mu.Unlock()
all, err := r.listLocked()
if err != nil {
return err
}
var firstErr error
for _, t := range all {
want := observedForks[t.ID]
if t.ForkCount == want {
continue
}
t.ForkCount = want
if err := r.writeLocked(t); err != nil && firstErr == nil {
firstErr = fmt.Errorf("reconcile template %s: %w", t.ID, err)
}
}
return firstErr
}

func (r *FileRegistry) DecrementForkCount(ctx context.Context, id string) (*Template, error) {
_ = ctx
r.mu.Lock()
Expand Down
29 changes: 29 additions & 0 deletions lib/templates/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,32 @@ func TestFileRegistry_SaveValidates(t *testing.T) {
err := r.Save(context.Background(), &Template{Name: "x"})
assert.True(t, errors.Is(err, ErrInvalid))
}

func TestFileRegistry_Reconcile(t *testing.T) {
r := newTestRegistry(t)
a := sampleTemplate("a", "alpha")
a.ForkCount = 5
b := sampleTemplate("b", "beta")
b.ForkCount = 0
c := sampleTemplate("c", "gamma")
c.ForkCount = 7
require.NoError(t, r.Save(context.Background(), a))
require.NoError(t, r.Save(context.Background(), b))
require.NoError(t, r.Save(context.Background(), c))

require.NoError(t, r.Reconcile(context.Background(), map[string]int{
"a": 2,
"b": 3,
// c omitted -> should fall to 0
}))

got, err := r.Get(context.Background(), "a")
require.NoError(t, err)
assert.Equal(t, 2, got.ForkCount)
got, err = r.Get(context.Background(), "b")
require.NoError(t, err)
assert.Equal(t, 3, got.ForkCount)
got, err = r.Get(context.Background(), "c")
require.NoError(t, err)
assert.Equal(t, 0, got.ForkCount)
}
Loading