From c088b2522e64f2a51ea6d606bf3e3508fe39ba5f Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Fri, 8 May 2026 15:09:58 +0000 Subject: [PATCH 1/4] firecracker: serve fork mem-file via per-template uffd page server Wires firecracker's snapshot restore through a userfaultfd-backed page server when the fork descends from a template. Each template gets one uffd.Server lazily started on the first fork and torn down once the last fork is released, so non-template forks (the symlink-only path) are unaffected. Adds an E2E test that boots a firecracker source, standbys it, promotes it to a template, forks against the template, and asserts: fork reaches Running, mem-file is a symlink to the template's, refcount bumps to 1, the per-template uffd tracker registers the fork, and on fork delete the refcount drops and the tracker detaches. Also adds unit coverage for the uffd tracker lifecycle (lazy start, multi-fork share, last-release teardown, closeAll, empty-acquire rejection). --- lib/hypervisor/firecracker/config.go | 31 +++- lib/hypervisor/firecracker/config_test.go | 9 +- lib/hypervisor/firecracker/firecracker.go | 4 +- lib/hypervisor/firecracker/fork.go | 4 + lib/hypervisor/firecracker/process.go | 2 +- lib/hypervisor/hypervisor.go | 6 + lib/instances/delete.go | 8 +- lib/instances/firecracker_test.go | 98 ++++++++++++ lib/instances/fork.go | 10 ++ lib/instances/manager.go | 6 + lib/instances/uffd.go | 184 ++++++++++++++++++++++ lib/instances/uffd_test.go | 93 +++++++++++ lib/paths/paths.go | 8 + 13 files changed, 455 insertions(+), 8 deletions(-) create mode 100644 lib/instances/uffd.go create mode 100644 lib/instances/uffd_test.go diff --git a/lib/hypervisor/firecracker/config.go b/lib/hypervisor/firecracker/config.go index 9576f2ca..192103ea 100644 --- a/lib/hypervisor/firecracker/config.go +++ b/lib/hypervisor/firecracker/config.go @@ -75,12 +75,21 @@ type snapshotCreateParams struct { type snapshotLoadParams struct { MemFilePath string `json:"mem_file_path,omitempty"` + MemBackend *memBackend `json:"mem_backend,omitempty"` SnapshotPath string `json:"snapshot_path"` EnableDiffSnapshots bool `json:"enable_diff_snapshots,omitempty"` ResumeVM bool `json:"resume_vm,omitempty"` NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` } +// memBackend selects how firecracker materializes guest memory during +// restore. backend_type "Uffd" hands page-fault handling off to a +// userfaultfd page server reachable at backend_path (Unix domain socket). +type memBackend struct { + BackendType string `json:"backend_type"` + BackendPath string `json:"backend_path"` +} + type networkOverride struct { IfaceID string `json:"iface_id"` HostDevName string `json:"host_dev_name"` @@ -103,6 +112,11 @@ type instanceInfo struct { type restoreMetadata struct { NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` SnapshotSourceDataDir string `json:"snapshot_source_data_dir,omitempty"` + // UffdSocketPath, when non-empty, makes loadSnapshot send a Uffd + // mem_backend pointing at the page server instead of letting + // firecracker mmap the mem-file directly. PrepareFork records it + // per fork so RestoreVM can pick it up after a hypeman restart. + UffdSocketPath string `json:"uffd_socket_path,omitempty"` } func toBootSource(cfg hypervisor.VMConfig) bootSource { @@ -212,14 +226,25 @@ func toSnapshotCreateParams(snapshotDir string) snapshotCreateParams { } } -func toSnapshotLoadParams(snapshotDir string, networkOverrides []networkOverride) snapshotLoadParams { - return snapshotLoadParams{ - MemFilePath: snapshotMemoryPath(snapshotDir), +func toSnapshotLoadParams(snapshotDir string, networkOverrides []networkOverride, uffdSocketPath string) snapshotLoadParams { + params := snapshotLoadParams{ SnapshotPath: snapshotStatePath(snapshotDir), EnableDiffSnapshots: true, ResumeVM: false, NetworkOverrides: networkOverrides, } + if uffdSocketPath != "" { + // Firecracker rejects load requests that set both mem_file_path + // and a uffd backend. The page server takes the file path through + // its own configuration, so we drop it from the request. + params.MemBackend = &memBackend{ + BackendType: "Uffd", + BackendPath: uffdSocketPath, + } + } else { + params.MemFilePath = snapshotMemoryPath(snapshotDir) + } + return params } func snapshotStatePath(snapshotDir string) string { diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index 6e912ee7..7e49b4f2 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -82,12 +82,19 @@ func TestSnapshotParamPaths(t *testing.T) { load := toSnapshotLoadParams("/tmp/snapshot-latest", []networkOverride{ {IfaceID: "eth0", HostDevName: "hype-abc123"}, - }) + }, "") assert.Equal(t, "/tmp/snapshot-latest/state", load.SnapshotPath) assert.Equal(t, "/tmp/snapshot-latest/memory", load.MemFilePath) + assert.Nil(t, load.MemBackend) assert.True(t, load.EnableDiffSnapshots) assert.False(t, load.ResumeVM) require.Len(t, load.NetworkOverrides, 1) + + loadUffd := toSnapshotLoadParams("/tmp/snapshot-latest", nil, "/run/uffd/abc.sock") + assert.Equal(t, "", loadUffd.MemFilePath, "mem_file_path must be empty when a uffd backend is set") + require.NotNil(t, loadUffd.MemBackend) + assert.Equal(t, "Uffd", loadUffd.MemBackend.BackendType) + assert.Equal(t, "/run/uffd/abc.sock", loadUffd.MemBackend.BackendPath) } func TestToBalloonConfig(t *testing.T) { diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index e22e42f3..e3bf435e 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -223,8 +223,8 @@ func (f *Firecracker) instanceStart(ctx context.Context) error { return f.postAction(ctx, "InstanceStart") } -func (f *Firecracker) loadSnapshot(ctx context.Context, snapshotDir string, networkOverrides []networkOverride) error { - params := toSnapshotLoadParams(snapshotDir, networkOverrides) +func (f *Firecracker) loadSnapshot(ctx context.Context, snapshotDir string, networkOverrides []networkOverride, uffdSocketPath string) error { + params := toSnapshotLoadParams(snapshotDir, networkOverrides, uffdSocketPath) if _, err := f.do(ctx, http.MethodPut, "/snapshot/load", params, http.StatusNoContent); err != nil { return err } diff --git a/lib/hypervisor/firecracker/fork.go b/lib/hypervisor/firecracker/fork.go index 81936929..3cb8049e 100644 --- a/lib/hypervisor/firecracker/fork.go +++ b/lib/hypervisor/firecracker/fork.go @@ -53,6 +53,10 @@ func (s *Starter) PrepareFork(ctx context.Context, req hypervisor.ForkPrepareReq changed = true } } + if meta.UffdSocketPath != req.UffdSocketPath { + meta.UffdSocketPath = req.UffdSocketPath + changed = true + } if changed { if err := saveRestoreMetadataState(instanceDir, meta); err != nil { diff --git a/lib/hypervisor/firecracker/process.go b/lib/hypervisor/firecracker/process.go index 371e1f2e..fbe6cbea 100644 --- a/lib/hypervisor/firecracker/process.go +++ b/lib/hypervisor/firecracker/process.go @@ -119,7 +119,7 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, snapshotSourceAliasMu.Lock() defer snapshotSourceAliasMu.Unlock() return withSnapshotSourceDirAlias(meta, filepath.Dir(socketPath), func() error { - return hv.loadSnapshot(ctx, snapshotPath, meta.NetworkOverrides) + return hv.loadSnapshot(ctx, snapshotPath, meta.NetworkOverrides, meta.UffdSocketPath) }) }() if err != nil { diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index a3e5d01f..e593a9bf 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -146,6 +146,12 @@ type ForkPrepareRequest struct { SerialLogPath string Network *ForkNetworkConfig + + // UffdSocketPath is set when the fork should restore from a userfaultfd + // page-server socket instead of mmap'ing its mem-file directly. The + // hypervisor records this so RestoreVM can attach a uffd memory backend + // in the snapshot/load request. Empty means use the default mmap path. + UffdSocketPath string } // ForkPrepareResult describes which optional fork rewrites were actually applied. diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 79b679ef..6011c057 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -149,9 +149,15 @@ func (m *manager) deleteInstance( } // 9. If this instance was a fork of a template, drop the template's - // fork refcount so the template can eventually be deleted. + // fork refcount so the template can eventually be deleted, and + // detach it from the uffd page server if one is running. if stored.ForkOfTemplate != "" { m.dropTemplateForkRefcount(ctx, stored.ForkOfTemplate) + if m.uffd != nil { + if err := m.uffd.releaseUffdForFork(stored.ForkOfTemplate, id); err != nil { + log.WarnContext(ctx, "failed to release uffd page server for fork", "instance_id", id, "template_id", stored.ForkOfTemplate, "error", err) + } + } } log.InfoContext(ctx, "instance deleted successfully", "instance_id", id) diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 25973930..d1d855bf 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -544,3 +544,101 @@ func TestFirecrackerSnapshotFeature(t *testing.T) { forkName: "fc-snapshot-fork", }) } + +// TestFirecrackerForkFromTemplate exercises the full template-driven fork +// path: standby a firecracker source, promote it to a template, fork off it, +// and assert the fork (a) reaches Running, (b) has its mem-file as a symlink +// into the template's snapshot dir (the fan-out optimisation), (c) bumped +// the template's fork refcount, (d) registered with the per-template uffd +// page server, and (e) on delete, drops the refcount and detaches from uffd. +func TestFirecrackerForkFromTemplate(t *testing.T) { + t.Parallel() + requireFirecrackerIntegrationPrereqs(t) + + mgr, tmpDir := setupTestManagerForFirecracker(t) + ctx := context.Background() + p := paths.New(tmpDir) + + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + createNginxImageAndWait(t, ctx, imageManager) + + systemManager := system.NewManager(p) + require.NoError(t, systemManager.EnsureSystemFiles(ctx)) + require.NoError(t, mgr.networkManager.Initialize(ctx, nil)) + + source, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "fc-tpl-src", + Image: integrationTestImageRef(t, "docker.io/library/nginx:alpine"), + Size: 1024 * 1024 * 1024, + HotplugSize: 256 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: true, + Hypervisor: hypervisor.TypeFirecracker, + }) + require.NoError(t, err) + source, err = waitForInstanceState(ctx, mgr, source.Id, StateRunning, integrationTestTimeout(20*time.Second)) + require.NoError(t, err) + sourceID := source.Id + t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) }) + + // Standby is a precondition for promotion. + source, err = mgr.StandbyInstance(ctx, sourceID, StandbyInstanceRequest{}) + require.NoError(t, err) + require.Equal(t, StateStandby, source.State) + require.True(t, source.HasSnapshot) + + tpl, err := mgr.promoteToTemplate(ctx, sourceID, PromoteToTemplateRequest{Name: "fc-tpl-e2e"}) + require.NoError(t, err) + require.NotNil(t, tpl) + require.Equal(t, sourceID, tpl.SourceInstanceID) + require.Equal(t, hypervisor.TypeFirecracker, tpl.HypervisorType) + require.Equal(t, 0, tpl.ForkCount) + + // Fork from the template (no source instance id passed). + forked, err := mgr.ForkInstance(ctx, "", ForkInstanceRequest{ + Name: "fc-tpl-fork", + TemplateID: tpl.ID, + TargetState: StateRunning, + }) + require.NoError(t, err) + forked, err = waitForInstanceState(ctx, mgr, forked.Id, StateRunning, integrationTestTimeout(30*time.Second)) + require.NoError(t, err) + require.Equal(t, StateRunning, forked.State) + forkID := forked.Id + deletedFork := false + t.Cleanup(func() { + if !deletedFork { + _ = mgr.DeleteInstance(context.Background(), forkID) + } + }) + + // (b) The fork's mem-file must be a symlink into the template snapshot. + forkMemPath := filepath.Join(p.InstanceSnapshotLatest(forkID), templateSharedMemFileName) + info, err := os.Lstat(forkMemPath) + require.NoError(t, err, "fork mem-file should exist at snapshot-latest/memory") + assert.Equal(t, os.ModeSymlink, info.Mode()&os.ModeSymlink, "fork mem-file should be a symlink, not a copy") + target, err := os.Readlink(forkMemPath) + require.NoError(t, err) + expectedTarget := filepath.Join(p.InstanceSnapshotLatest(sourceID), templateSharedMemFileName) + assert.Equal(t, expectedTarget, target, "fork mem-file symlink should point at the template's mem-file") + + // (c) Refcount on the template must be bumped to 1. + tplAfterFork, err := mgr.getTemplate(ctx, tpl.ID) + require.NoError(t, err) + assert.Equal(t, 1, tplAfterFork.ForkCount, "template fork refcount should be 1 after one fork") + + // (d) The per-template uffd page server should be tracking this fork. + require.NotNil(t, mgr.uffd) + assert.True(t, mgr.uffd.hasFork(tpl.ID, forkID), "uffd tracker should report fork as registered against its template") + + // Deleting the fork drops the refcount and detaches from uffd. + require.NoError(t, mgr.DeleteInstance(ctx, forkID)) + deletedFork = true + + tplAfterDelete, err := mgr.getTemplate(ctx, tpl.ID) + require.NoError(t, err) + assert.Equal(t, 0, tplAfterDelete.ForkCount, "template fork refcount should drop back to 0") + assert.False(t, mgr.uffd.hasFork(tpl.ID, forkID), "uffd tracker should no longer track the deleted fork") +} diff --git a/lib/instances/fork.go b/lib/instances/fork.go index bae4182b..9446960f 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -336,6 +336,15 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin if forkMeta.NetworkEnabled { netCfg = &hypervisor.ForkNetworkConfig{TAPDevice: network.GenerateTAPName(forkID)} } + uffdSocketPath, err := m.acquireForkUffdIfApplicable(ctx, tpl, forkID, stored.HypervisorType) + if err != nil { + return nil, fmt.Errorf("attach uffd page server: %w", err) + } + if uffdSocketPath != "" { + cu.Add(func() { + _ = m.uffd.releaseUffdForFork(tpl.ID, forkID) + }) + } if _, err := starter.PrepareFork(ctx, hypervisor.ForkPrepareRequest{ SnapshotConfigPath: snapshotConfigPath, SourceDataDir: stored.DataDir, @@ -344,6 +353,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin VsockSocket: forkMeta.VsockSocket, SerialLogPath: m.paths.InstanceAppLog(forkID), Network: netCfg, + UffdSocketPath: uffdSocketPath, }); err != nil { if errors.Is(err, hypervisor.ErrNotSupported) { return nil, fmt.Errorf("%w: fork is not supported for hypervisor %s", ErrNotSupported, stored.HypervisorType) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 1a2215c0..629adb2e 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -154,6 +154,11 @@ type manager struct { // fork/delete). Constructed lazily so existing managers without // template support keep working unchanged. templateRegistry templates.Registry + + // uffd is the per-template userfaultfd page-server tracker. nil on + // non-Linux hosts; on Linux it is started lazily for forks that + // resolve to a template and torn down once no forks remain. + uffd *uffdTracker } // platformStarters is populated by platform-specific init functions. @@ -209,6 +214,7 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan nativeCodecPaths: make(map[string]string), lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize), templateRegistry: templates.NewFileRegistry(p.TemplatesDir()), + uffd: newUffdTracker(), } m.deleteSnapshotFn = m.deleteSnapshot diff --git a/lib/instances/uffd.go b/lib/instances/uffd.go new file mode 100644 index 00000000..cf057f62 --- /dev/null +++ b/lib/instances/uffd.go @@ -0,0 +1,184 @@ +package instances + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "sync" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/templates" + "github.com/kernel/hypeman/lib/uffd" +) + +// uffdTracker owns one uffd.Server per template mem-file and tracks which +// forks are currently restored against each one. The first +// acquireUffdForFork for a template lazily starts the server; the last +// releaseUffdForFork tears it down. This keeps the server out of the +// critical path for non-template forks (the symlink-only path from +// PR 3) and avoids leaking page-server goroutines once a template +// becomes idle. +// +// Lifecycle assumption: this PR scopes uffd to the *active* fork-create +// path. After a hypeman restart, any firecracker process previously +// backed by a uffd server has no one to handle its faults until those +// forks are themselves restarted; that gap is documented in +// recoverTemplateForkRefcounts and is the next follow-up. +type uffdTracker struct { + mu sync.Mutex + entries map[string]*uffdEntry +} + +type uffdEntry struct { + server *uffd.Server + forks map[string]struct{} +} + +func newUffdTracker() *uffdTracker { + return &uffdTracker{entries: map[string]*uffdEntry{}} +} + +// acquireUffdForFork ensures a uffd.Server is running for the template, +// registers forkID with it, and returns the per-fork UDS path. Callers +// must call releaseUffdForFork(templateID, forkID) once the fork no +// longer exists, even if firecracker never connected. +func (t *uffdTracker) acquireUffdForFork(ctx context.Context, tpl *templates.Template, memFilePath, socketDir, forkID string) (string, error) { + if tpl == nil { + return "", errors.New("uffd: template is required") + } + if forkID == "" { + return "", errors.New("uffd: fork id is required") + } + t.mu.Lock() + entry, ok := t.entries[tpl.ID] + if !ok { + srv, err := uffd.NewServer(uffd.Config{ + MemFilePath: memFilePath, + SocketDir: socketDir, + }) + if err != nil { + t.mu.Unlock() + return "", fmt.Errorf("uffd: start server for template %s: %w", tpl.ID, err) + } + entry = &uffdEntry{server: srv, forks: map[string]struct{}{}} + t.entries[tpl.ID] = entry + } + t.mu.Unlock() + + socketPath, err := entry.server.RegisterFork(ctx, forkID) + if err != nil { + t.maybeCloseEmpty(tpl.ID) + return "", fmt.Errorf("uffd: register fork %s with template %s: %w", forkID, tpl.ID, err) + } + + t.mu.Lock() + entry.forks[forkID] = struct{}{} + t.mu.Unlock() + return socketPath, nil +} + +// releaseUffdForFork unregisters the fork from its template's server +// and shuts the server down once no forks remain. It is safe to call +// for templates that never had a server (returns nil). +func (t *uffdTracker) releaseUffdForFork(templateID, forkID string) error { + if templateID == "" || forkID == "" { + return nil + } + t.mu.Lock() + entry, ok := t.entries[templateID] + if !ok { + t.mu.Unlock() + return nil + } + if _, present := entry.forks[forkID]; !present { + t.mu.Unlock() + return nil + } + delete(entry.forks, forkID) + srv := entry.server + empty := len(entry.forks) == 0 + if empty { + delete(t.entries, templateID) + } + t.mu.Unlock() + + var firstErr error + if err := srv.UnregisterFork(forkID); err != nil { + firstErr = err + } + if empty { + if err := srv.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +// maybeCloseEmpty drops a template's entry when no forks ever attached +// to a freshly-created server (acquire-then-RegisterFork failure). +func (t *uffdTracker) maybeCloseEmpty(templateID string) { + t.mu.Lock() + entry, ok := t.entries[templateID] + if !ok || len(entry.forks) > 0 { + t.mu.Unlock() + return + } + delete(t.entries, templateID) + srv := entry.server + t.mu.Unlock() + _ = srv.Close() +} + +// closeAll tears down every server. Called by the manager during +// shutdown so the uffd goroutines and mem-file fds don't leak. +func (t *uffdTracker) closeAll() error { + t.mu.Lock() + entries := t.entries + t.entries = map[string]*uffdEntry{} + t.mu.Unlock() + + var firstErr error + for _, entry := range entries { + for forkID := range entry.forks { + _ = entry.server.UnregisterFork(forkID) + } + if err := entry.server.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +// hasFork is a test-only helper that reports whether forkID is currently +// tracked under templateID. +func (t *uffdTracker) hasFork(templateID, forkID string) bool { + t.mu.Lock() + defer t.mu.Unlock() + entry, ok := t.entries[templateID] + if !ok { + return false + } + _, present := entry.forks[forkID] + return present +} + +// uffdSupportedForFork reports whether the manager will try to serve a +// fork's mem-file via uffd. Only firecracker is wired to consume the +// backend; other hypervisors fall back to the symlink-only path. +func uffdSupportedForFork(hvType hypervisor.Type) bool { + return hvType == hypervisor.TypeFirecracker +} + +// acquireForkUffdIfApplicable returns a per-fork uffd UDS path when the +// fork should be backed by a userfaultfd page server, or "" when it +// should fall back to the symlink-only path. Failure to start the +// server is bubbled up as an error so the caller can abort the fork. +func (m *manager) acquireForkUffdIfApplicable(ctx context.Context, tpl *templates.Template, forkID string, hvType hypervisor.Type) (string, error) { + if tpl == nil || !uffdSupportedForFork(hvType) || m.uffd == nil { + return "", nil + } + memFilePath := filepath.Join(m.paths.InstanceSnapshotLatest(tpl.SourceInstanceID), templateSharedMemFileName) + socketDir := m.paths.TemplateUffdDir(tpl.ID) + return m.uffd.acquireUffdForFork(ctx, tpl, memFilePath, socketDir, forkID) +} diff --git a/lib/instances/uffd_test.go b/lib/instances/uffd_test.go new file mode 100644 index 00000000..604b946c --- /dev/null +++ b/lib/instances/uffd_test.go @@ -0,0 +1,93 @@ +//go:build linux + +package instances + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/kernel/hypeman/lib/templates" +) + +func writeUffdTrackerMemFile(t *testing.T, dir string) string { + t.Helper() + p := filepath.Join(dir, "memory") + require.NoError(t, os.WriteFile(p, make([]byte, 4096), 0o644)) + return p +} + +func TestUffdTracker_AcquireAndReleaseLifecycle(t *testing.T) { + tracker := newUffdTracker() + t.Cleanup(func() { _ = tracker.closeAll() }) + + tplDir := t.TempDir() + memPath := writeUffdTrackerMemFile(t, tplDir) + tpl := &templates.Template{ID: "tpl-1", SourceInstanceID: "src-1"} + + socketDir := filepath.Join(tplDir, "uffd") + socketA, err := tracker.acquireUffdForFork(context.Background(), tpl, memPath, socketDir, "fork-a") + require.NoError(t, err) + require.NotEmpty(t, socketA) + require.True(t, tracker.hasFork(tpl.ID, "fork-a")) + + // Second fork against the same template reuses the existing server and + // returns a distinct UDS path. + socketB, err := tracker.acquireUffdForFork(context.Background(), tpl, memPath, socketDir, "fork-b") + require.NoError(t, err) + require.NotEmpty(t, socketB) + require.NotEqual(t, socketA, socketB) + require.True(t, tracker.hasFork(tpl.ID, "fork-b")) + + // Releasing one fork keeps the server alive for the remaining fork. + require.NoError(t, tracker.releaseUffdForFork(tpl.ID, "fork-a")) + require.False(t, tracker.hasFork(tpl.ID, "fork-a")) + require.True(t, tracker.hasFork(tpl.ID, "fork-b")) + + // Releasing the last fork tears the server down. + require.NoError(t, tracker.releaseUffdForFork(tpl.ID, "fork-b")) + require.False(t, tracker.hasFork(tpl.ID, "fork-b")) + + // A subsequent acquire should be able to start a fresh server. + socketC, err := tracker.acquireUffdForFork(context.Background(), tpl, memPath, socketDir, "fork-c") + require.NoError(t, err) + require.NotEmpty(t, socketC) + require.True(t, tracker.hasFork(tpl.ID, "fork-c")) +} + +func TestUffdTracker_ReleaseUnknownFork_NoError(t *testing.T) { + tracker := newUffdTracker() + require.NoError(t, tracker.releaseUffdForFork("missing-template", "missing-fork")) +} + +func TestUffdTracker_AcquireRejectsEmpty(t *testing.T) { + tracker := newUffdTracker() + _, err := tracker.acquireUffdForFork(context.Background(), nil, "/tmp/x", "/tmp/y", "fork") + assert.Error(t, err) + + tplDir := t.TempDir() + memPath := writeUffdTrackerMemFile(t, tplDir) + tpl := &templates.Template{ID: "tpl-1", SourceInstanceID: "src-1"} + _, err = tracker.acquireUffdForFork(context.Background(), tpl, memPath, filepath.Join(tplDir, "uffd"), "") + assert.Error(t, err) +} + +func TestUffdTracker_CloseAll(t *testing.T) { + tracker := newUffdTracker() + tplDir := t.TempDir() + memPath := writeUffdTrackerMemFile(t, tplDir) + tpl := &templates.Template{ID: "tpl-1", SourceInstanceID: "src-1"} + + _, err := tracker.acquireUffdForFork(context.Background(), tpl, memPath, filepath.Join(tplDir, "uffd"), "fork-a") + require.NoError(t, err) + _, err = tracker.acquireUffdForFork(context.Background(), tpl, memPath, filepath.Join(tplDir, "uffd"), "fork-b") + require.NoError(t, err) + + require.NoError(t, tracker.closeAll()) + assert.False(t, tracker.hasFork(tpl.ID, "fork-a")) + assert.False(t, tracker.hasFork(tpl.ID, "fork-b")) +} diff --git a/lib/paths/paths.go b/lib/paths/paths.go index 26662644..c678d507 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -279,6 +279,14 @@ func (p *Paths) TemplateMetadata(id string) string { return filepath.Join(p.TemplateDir(id), "template.json") } +// TemplateUffdDir returns the directory under a template that holds the +// per-fork userfaultfd page-server sockets. Sockets live here (rather than +// inside each fork's data dir) because Unix domain socket paths are tightly +// length-limited; the template id keeps the prefix short and stable. +func (p *Paths) TemplateUffdDir(id string) string { + return filepath.Join(p.TemplateDir(id), "uffd") +} + // Device path methods // DevicesDir returns the root devices directory. From 6e6b6eff4a515cd3d4b0e2615b3e3646e1721298 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Fri, 8 May 2026 15:21:22 +0000 Subject: [PATCH 2/4] uffd: anchor sockets at short runtime root, not under data dir Unix domain socket paths cap at 108 bytes (sun_path). Putting the per- fork socket under /templates/<25-char-cuid>/uffd/<25-char- cuid>.uffd blew that limit on CI runners where t.TempDir() returns long prefixes, surfacing as "bind: invalid argument" in TestFirecrackerForkFromTemplate. Sockets are ephemeral, so anchoring them at /tmp/h-uffd// keeps the path well under the limit regardless of how deep DataDir is. The tracker now also rm -rfs the per-template socket dir on the last release so we don't leak stale entries. --- lib/instances/uffd.go | 19 ++++++++++++++++--- lib/paths/paths.go | 23 ++++++++++++++++++----- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/lib/instances/uffd.go b/lib/instances/uffd.go index cf057f62..c9b8c9d0 100644 --- a/lib/instances/uffd.go +++ b/lib/instances/uffd.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "os" "path/filepath" "sync" @@ -31,8 +32,9 @@ type uffdTracker struct { } type uffdEntry struct { - server *uffd.Server - forks map[string]struct{} + server *uffd.Server + socketDir string + forks map[string]struct{} } func newUffdTracker() *uffdTracker { @@ -61,7 +63,7 @@ func (t *uffdTracker) acquireUffdForFork(ctx context.Context, tpl *templates.Tem t.mu.Unlock() return "", fmt.Errorf("uffd: start server for template %s: %w", tpl.ID, err) } - entry = &uffdEntry{server: srv, forks: map[string]struct{}{}} + entry = &uffdEntry{server: srv, socketDir: socketDir, forks: map[string]struct{}{}} t.entries[tpl.ID] = entry } t.mu.Unlock() @@ -97,6 +99,7 @@ func (t *uffdTracker) releaseUffdForFork(templateID, forkID string) error { } delete(entry.forks, forkID) srv := entry.server + socketDir := entry.socketDir empty := len(entry.forks) == 0 if empty { delete(t.entries, templateID) @@ -111,6 +114,9 @@ func (t *uffdTracker) releaseUffdForFork(templateID, forkID string) error { if err := srv.Close(); err != nil && firstErr == nil { firstErr = err } + if socketDir != "" { + _ = os.RemoveAll(socketDir) + } } return firstErr } @@ -126,8 +132,12 @@ func (t *uffdTracker) maybeCloseEmpty(templateID string) { } delete(t.entries, templateID) srv := entry.server + socketDir := entry.socketDir t.mu.Unlock() _ = srv.Close() + if socketDir != "" { + _ = os.RemoveAll(socketDir) + } } // closeAll tears down every server. Called by the manager during @@ -146,6 +156,9 @@ func (t *uffdTracker) closeAll() error { if err := entry.server.Close(); err != nil && firstErr == nil { firstErr = err } + if entry.socketDir != "" { + _ = os.RemoveAll(entry.socketDir) + } } return firstErr } diff --git a/lib/paths/paths.go b/lib/paths/paths.go index c678d507..8fe78603 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -279,12 +279,25 @@ func (p *Paths) TemplateMetadata(id string) string { return filepath.Join(p.TemplateDir(id), "template.json") } -// TemplateUffdDir returns the directory under a template that holds the -// per-fork userfaultfd page-server sockets. Sockets live here (rather than -// inside each fork's data dir) because Unix domain socket paths are tightly -// length-limited; the template id keeps the prefix short and stable. +// TemplateUffdDir returns the directory that holds a template's per-fork +// userfaultfd page-server sockets. Sockets cannot live under the data dir +// because Unix domain socket paths are limited to 108 bytes (sun_path); a +// deep DataDir + the cuid2 template+fork ids easily blow that. Anchoring +// at a short, fixed runtime root keeps the absolute path well under the +// limit (~30 chars for typical ids) regardless of where the data dir lives. +// +// Sockets are ephemeral — losing them on reboot is fine; firecracker +// reconnects on restore. func (p *Paths) TemplateUffdDir(id string) string { - return filepath.Join(p.TemplateDir(id), "uffd") + return filepath.Join(uffdRuntimeRoot(), id) +} + +// uffdRuntimeRoot returns the short root used to hold uffd sockets. It is +// not configurable on purpose: the only requirement is that it stay short +// and writable, and /tmp meets both. Tests get isolation from the cuid +// segment in TemplateUffdDir. +func uffdRuntimeRoot() string { + return filepath.Join("/tmp", "h-uffd") } // Device path methods From 8ebb61c2185adc572e84f6785015cc13b9b56ecd Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Fri, 8 May 2026 15:36:26 +0000 Subject: [PATCH 3/4] fork: hardlink shared mem-file to dodge restore-alias ELOOP The previous fork-mem-file symlink looped through withSnapshotSourceDirAlias during firecracker restore: fork/.../memory -> source/.../memory, but the alias dance temporarily symlinks source dir -> fork dir, so resolution ping-pongs back to fork/.../memory and trips ELOOP. Switching to a hardlink makes the fork's mem-file resolve by inode so the temporary directory alias no longer affects it. Co-Authored-By: Claude Opus 4.7 --- lib/instances/firecracker_test.go | 22 ++++++++++++++-------- lib/instances/templates.go | 20 ++++++++++++++------ lib/instances/types.go | 6 +++--- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index d1d855bf..809bd953 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "strings" + "syscall" "testing" "time" @@ -547,8 +548,8 @@ func TestFirecrackerSnapshotFeature(t *testing.T) { // TestFirecrackerForkFromTemplate exercises the full template-driven fork // path: standby a firecracker source, promote it to a template, fork off it, -// and assert the fork (a) reaches Running, (b) has its mem-file as a symlink -// into the template's snapshot dir (the fan-out optimisation), (c) bumped +// and assert the fork (a) reaches Running, (b) has its mem-file hardlinked +// to the template's snapshot mem-file (the fan-out optimisation), (c) bumped // the template's fork refcount, (d) registered with the per-template uffd // page server, and (e) on delete, drops the refcount and detaches from uffd. func TestFirecrackerForkFromTemplate(t *testing.T) { @@ -614,15 +615,20 @@ func TestFirecrackerForkFromTemplate(t *testing.T) { } }) - // (b) The fork's mem-file must be a symlink into the template snapshot. + // (b) The fork's mem-file must share the source's inode (hardlink), not + // be a copy. We can't compare paths because the link is by inode; we + // compare st_ino + st_dev between the two instances' mem-files. forkMemPath := filepath.Join(p.InstanceSnapshotLatest(forkID), templateSharedMemFileName) - info, err := os.Lstat(forkMemPath) + srcMemPath := filepath.Join(p.InstanceSnapshotLatest(sourceID), templateSharedMemFileName) + forkInfo, err := os.Stat(forkMemPath) require.NoError(t, err, "fork mem-file should exist at snapshot-latest/memory") - assert.Equal(t, os.ModeSymlink, info.Mode()&os.ModeSymlink, "fork mem-file should be a symlink, not a copy") - target, err := os.Readlink(forkMemPath) + assert.True(t, forkInfo.Mode().IsRegular(), "fork mem-file should be a regular file (hardlink), not a symlink") + srcInfo, err := os.Stat(srcMemPath) require.NoError(t, err) - expectedTarget := filepath.Join(p.InstanceSnapshotLatest(sourceID), templateSharedMemFileName) - assert.Equal(t, expectedTarget, target, "fork mem-file symlink should point at the template's mem-file") + forkSys := forkInfo.Sys().(*syscall.Stat_t) + srcSys := srcInfo.Sys().(*syscall.Stat_t) + assert.Equal(t, srcSys.Ino, forkSys.Ino, "fork mem-file should share the source's inode (hardlink, not copy)") + assert.Equal(t, srcSys.Dev, forkSys.Dev, "fork mem-file should be on the same filesystem as source") // (c) Refcount on the template must be bumped to 1. tplAfterFork, err := mgr.getTemplate(ctx, tpl.ID) diff --git a/lib/instances/templates.go b/lib/instances/templates.go index 91383615..f6607a99 100644 --- a/lib/instances/templates.go +++ b/lib/instances/templates.go @@ -252,14 +252,22 @@ func (m *manager) resolveForkFromTemplateRequest(ctx context.Context, instanceID } // installForkSharedMemFile arranges the fork's snapshot directory so the -// guest mem-file is a symlink into the template's snapshot directory +// guest mem-file is a hardlink to the template's snapshot mem-file // instead of a per-fork copy. firecracker mmaps the mem-file MAP_PRIVATE -// during restore, so all forks COW from the same backing file. +// during restore, so all forks COW from the same backing inode. // // Layout: dst is the fork's data dir. The snapshot dir is at // /snapshots/snapshot-latest, and the mem-file lives at -// /memory. The symlink target is the template's source -// instance's standby snapshot mem-file. +// /memory. The hardlink shares the inode with the +// template's source instance's standby snapshot mem-file. +// +// We use a hardlink rather than a symlink because RestoreVM temporarily +// aliases the source data dir to the fork data dir while firecracker +// loads the snapshot (see withSnapshotSourceDirAlias). A symlink whose +// target traverses the source dir would resolve back into the fork dir +// during that window and trip ELOOP; a hardlink resolves by inode so +// the alias has no effect on it. Hardlinks require both paths on the +// same filesystem, which holds for our standard data-dir layout. func (m *manager) installForkSharedMemFile(forkDataDir string, tpl *templates.Template) error { if tpl == nil { return nil @@ -276,8 +284,8 @@ func (m *manager) installForkSharedMemFile(forkDataDir string, tpl *templates.Te // Tolerate a leftover entry (e.g. from a partial copy that wasn't fully // skipped on a different filesystem layout). _ = os.Remove(dstMem) - if err := os.Symlink(srcMem, dstMem); err != nil { - return fmt.Errorf("symlink shared mem-file: %w", err) + if err := os.Link(srcMem, dstMem); err != nil { + return fmt.Errorf("hardlink shared mem-file: %w", err) } return nil } diff --git a/lib/instances/types.go b/lib/instances/types.go index 0f5cda9f..bc9ee2cc 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -260,9 +260,9 @@ type ForkInstanceRequest struct { // TemplateID resolves the source instance from the template registry by // id-or-name. When set, the source instance id passed to ForkInstance is - // ignored (must be empty). The fork's mem-file is shared with the - // template's mem-file via symlink instead of being copied per-fork, so - // many forks fan out from the same warm guest memory. + // ignored (must be empty). The fork's mem-file is hardlinked to the + // template's mem-file instead of being copied per-fork, so many forks + // fan out from the same warm guest memory. TemplateID string } From 030b69e79f0db2f4b5fcf1ef1df723998e4d8cd9 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Fri, 8 May 2026 15:56:35 +0000 Subject: [PATCH 4/4] fork test: assert mem-file at snapshot-base after firecracker restore Firecracker enables snapshot base reuse, which renames the post-restore snapshot dir from snapshot-latest to snapshot-base. The hardlink survives the rename (same inode), so the test just needs the right path. --- lib/instances/firecracker_test.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 809bd953..3b115d67 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -618,10 +618,15 @@ func TestFirecrackerForkFromTemplate(t *testing.T) { // (b) The fork's mem-file must share the source's inode (hardlink), not // be a copy. We can't compare paths because the link is by inode; we // compare st_ino + st_dev between the two instances' mem-files. - forkMemPath := filepath.Join(p.InstanceSnapshotLatest(forkID), templateSharedMemFileName) + // + // Firecracker retains the post-restore snapshot dir as snapshot-base + // (see restoreRetainedSnapshotBase), so after the Standby -> Running + // transition the hardlink lives under snapshot-base/, not snapshot-latest/. + // Hardlinks survive the rename because they bind to the inode. + forkMemPath := filepath.Join(p.InstanceSnapshotBase(forkID), templateSharedMemFileName) srcMemPath := filepath.Join(p.InstanceSnapshotLatest(sourceID), templateSharedMemFileName) forkInfo, err := os.Stat(forkMemPath) - require.NoError(t, err, "fork mem-file should exist at snapshot-latest/memory") + require.NoError(t, err, "fork mem-file should exist at snapshot-base/memory after restore") assert.True(t, forkInfo.Mode().IsRegular(), "fork mem-file should be a regular file (hardlink), not a symlink") srcInfo, err := os.Stat(srcMemPath) require.NoError(t, err)