Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions lib/forkvm/copy.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,28 @@ import (

var ErrSparseCopyUnsupported = errors.New("sparse copy unsupported")

// CopyOptions tunes CopyGuestDirectory behavior. The zero value reproduces
// the original full-copy semantics; callers can opt into skipping specific
// paths when the consumer arranges its own substitute (e.g. a symlink to a
// template-shared mem-file).
type CopyOptions struct {
// SkipRelPaths lists relative paths under srcDir that should not be
// materialized in dstDir. Comparison is exact and uses forward-slash
// separators on all platforms.
SkipRelPaths []string
}

// CopyGuestDirectory recursively copies a guest directory to a new destination.
// Regular files are copied using sparse extent copy only (SEEK_DATA/SEEK_HOLE).
// Runtime sockets and logs are skipped because they are host-runtime artifacts.
func CopyGuestDirectory(srcDir, dstDir string) error {
return CopyGuestDirectoryWithOptions(srcDir, dstDir, CopyOptions{})
}

// CopyGuestDirectoryWithOptions is the option-taking variant of
// CopyGuestDirectory. Use this when forking with template-shared assets, so
// the caller can install a symlink in place of a heavy copied file.
func CopyGuestDirectoryWithOptions(srcDir, dstDir string, opts CopyOptions) error {
srcInfo, err := os.Stat(srcDir)
if err != nil {
return fmt.Errorf("stat source directory: %w", err)
Expand All @@ -27,6 +45,11 @@ func CopyGuestDirectory(srcDir, dstDir string) error {
return fmt.Errorf("create destination directory: %w", err)
}

skipSet := make(map[string]struct{}, len(opts.SkipRelPaths))
for _, p := range opts.SkipRelPaths {
skipSet[filepath.ToSlash(p)] = struct{}{}
}

return filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
Expand All @@ -39,6 +62,12 @@ func CopyGuestDirectory(srcDir, dstDir string) error {
if relPath == "." {
return nil
}
if _, skip := skipSet[filepath.ToSlash(relPath)]; skip {
if d.IsDir() {
return filepath.SkipDir
}
return nil
}
if d.IsDir() && shouldSkipDirectory(relPath) {
return filepath.SkipDir
}
Expand Down
19 changes: 19 additions & 0 deletions lib/forkvm/copy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,25 @@ func TestCopyGuestDirectory(t *testing.T) {
assert.Equal(t, "metadata.json", linkTarget)
}

func TestCopyGuestDirectory_SkipRelPaths(t *testing.T) {
src := filepath.Join(t.TempDir(), "src")
dst := filepath.Join(t.TempDir(), "dst")

require.NoError(t, os.MkdirAll(filepath.Join(src, "snapshots", "snapshot-latest"), 0755))
require.NoError(t, os.WriteFile(filepath.Join(src, "snapshots", "snapshot-latest", "config.json"), []byte(`{}`), 0644))
require.NoError(t, os.WriteFile(filepath.Join(src, "snapshots", "snapshot-latest", "memory"), []byte("the heavy mem-file"), 0644))
require.NoError(t, os.WriteFile(filepath.Join(src, "snapshots", "snapshot-latest", "state"), []byte("device state"), 0644))

err := CopyGuestDirectoryWithOptions(src, dst, CopyOptions{
SkipRelPaths: []string{"snapshots/snapshot-latest/memory"},
})
require.NoError(t, err)

assert.NoFileExists(t, filepath.Join(dst, "snapshots", "snapshot-latest", "memory"))
assert.FileExists(t, filepath.Join(dst, "snapshots", "snapshot-latest", "config.json"))
assert.FileExists(t, filepath.Join(dst, "snapshots", "snapshot-latest", "state"))
}

func TestCopyGuestDirectory_DoesNotSkipTmpSuffixedDirectories(t *testing.T) {
src := filepath.Join(t.TempDir(), "src")
dst := filepath.Join(t.TempDir(), "dst")
Expand Down
18 changes: 18 additions & 0 deletions lib/instances/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ func (m *manager) deleteInstance(
stored := &meta.StoredMetadata
log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State)

// If this instance was promoted to a template parent, refuse to delete
// it while live forks reference it. Removing the registry entry now
// (instead of after the data wipe) gives us a single transactional
// "in-use" check via templates.ErrInUse.
if stored.IsTemplate && stored.TemplateID != "" && m.templateRegistry != nil {
if err := m.templateRegistry.Delete(ctx, stored.TemplateID); err != nil {
return fmt.Errorf("delete template registry entry for instance %s: %w", id, err)
}
stored.IsTemplate = false
stored.TemplateID = ""
}

target, err := m.cancelAndWaitCompressionJob(ctx, m.snapshotJobKeyForInstance(id))
if err != nil {
return fmt.Errorf("wait for instance compression to stop: %w", err)
Expand Down Expand Up @@ -136,6 +148,12 @@ func (m *manager) deleteInstance(
return fmt.Errorf("delete instance data: %w", err)
}

// 9. If this instance was a fork of a template, drop the template's
// fork refcount so the template can eventually be deleted.
if stored.ForkOfTemplate != "" {
m.dropTemplateForkRefcount(ctx, stored.ForkOfTemplate)
}

log.InfoContext(ctx, "instance deleted successfully", "instance_id", id)
return nil
}
Expand Down
49 changes: 45 additions & 4 deletions lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/network"
"github.com/kernel/hypeman/lib/templates"
"github.com/nrednav/cuid2"
"go.opentelemetry.io/otel/attribute"
"gvisor.dev/gvisor/pkg/cleanup"
Expand All @@ -36,11 +37,22 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
return nil, "", err
}

resolvedID, tpl, err := m.resolveForkFromTemplateRequest(ctx, id, req)
if err != nil {
return nil, "", err
}
id = resolvedID

meta, err := m.loadMetadata(id)
if err != nil {
return nil, "", err
}
source := m.toInstance(ctx, meta)
if tpl != nil {
if err := validateForkResolvedFromTemplate(tpl, source.HypervisorType); err != nil {
return nil, "", err
}
}
targetState, err := resolveForkTargetState(req.TargetState, source.State)
if err != nil {
return nil, "", err
Expand All @@ -65,7 +77,7 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
return nil, "", fmt.Errorf("standby source instance: %w", err)
}

forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true)
forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true, tpl)
if forkErr == nil {
if err := m.rotateSourceVsockForRestore(ctx, id, forked.Id); err != nil {
forkErr = fmt.Errorf("prepare source snapshot for restore: %w", err)
Expand Down Expand Up @@ -104,7 +116,7 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
}
return forked, targetState, nil
case StateStopped, StateStandby:
forked, err := m.forkInstanceFromStoppedOrStandby(ctx, id, req, false)
forked, err := m.forkInstanceFromStoppedOrStandby(ctx, id, req, false, tpl)
if err != nil {
return nil, "", err
}
Expand Down Expand Up @@ -192,7 +204,7 @@ func generateForkSourceVsockCID(sourceID, forkID string, current int64) int64 {
return cid
}

func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated bool) (*Instance, error) {
func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated bool, tpl *templates.Template) (*Instance, error) {
log := logger.FromContext(ctx)

meta, err := m.loadMetadata(id)
Expand All @@ -202,6 +214,9 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin

source := m.toInstance(ctx, meta)
stored := &meta.StoredMetadata
if tpl != nil && !stored.IsTemplate {
return nil, fmt.Errorf("%w: template %s source instance %s is not flagged as a template parent", ErrInvalidState, tpl.ID, id)
}

switch source.State {
case StateStopped, StateStandby:
Expand Down Expand Up @@ -255,12 +270,21 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
}
}

if err := forkvm.CopyGuestDirectory(srcDir, dstDir); err != nil {
copyOpts := forkvm.CopyOptions{}
if tpl != nil {
copyOpts.SkipRelPaths = []string{templateSharedMemFileRelPath}
}
if err := forkvm.CopyGuestDirectoryWithOptions(srcDir, dstDir, copyOpts); err != nil {
if errors.Is(err, forkvm.ErrSparseCopyUnsupported) {
return nil, fmt.Errorf("fork requires sparse-capable filesystem (SEEK_DATA/SEEK_HOLE unsupported): %w", err)
}
return nil, fmt.Errorf("clone guest directory: %w", err)
}
if tpl != nil {
if err := m.installForkSharedMemFile(dstDir, tpl); err != nil {
return nil, fmt.Errorf("install shared mem-file: %w", err)
}
}

starter, err := m.getVMStarter(stored.HypervisorType)
if err != nil {
Expand All @@ -280,6 +304,15 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
forkMeta.VsockSocket = m.paths.InstanceSocket(forkID, hypervisor.VsockSocketNameForType(forkMeta.HypervisorType))
forkMeta.ExitCode = nil
forkMeta.ExitMessage = ""
// Forks of a template carry the template id but never inherit the
// IsTemplate flag — they are working copies.
forkMeta.IsTemplate = false
forkMeta.TemplateID = ""
if tpl != nil {
forkMeta.ForkOfTemplate = tpl.ID
} else {
forkMeta.ForkOfTemplate = stored.ForkOfTemplate
}

// Keep the original CID for snapshot-based forks.
// Rewriting CID in restored memory snapshots is not reliable across
Expand Down Expand Up @@ -324,6 +357,14 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
return nil, fmt.Errorf("save fork metadata: %w", err)
}

if tpl != nil {
// Bumped before cu.Release so a refcount failure leaves no orphan
// fork directory (deferred cu.Clean removes the data dir + metadata).
if err := m.bumpTemplateForkRefcount(ctx, tpl); err != nil {
return nil, fmt.Errorf("record template fork refcount: %w", err)
}
}

cu.Release()
forked := m.toInstance(ctx, newMeta)
log.InfoContext(ctx, "instance forked successfully",
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/fork_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ func TestForkInstanceFromStandbyCancelsCompressionJobAndCopiesRawMemory(t *testi
forked, err := manager.forkInstanceFromStoppedOrStandby(ctx, sourceID, ForkInstanceRequest{
Name: "fork-standby-compressed-copy",
TargetState: StateStopped,
}, true)
}, true, nil)
require.NoError(t, err)
require.NotNil(t, forked)

Expand Down
8 changes: 7 additions & 1 deletion lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,13 @@ func (m *manager) DeleteSnapshot(ctx context.Context, snapshotID string) error {

// ForkInstance creates a forked copy of an instance.
func (m *manager) ForkInstance(ctx context.Context, id string, req ForkInstanceRequest) (*Instance, error) {
lock := m.getInstanceLock(id)
// Resolve TemplateID outside the lock so we hold the source instance
// lock — not an empty string lock — when forking from a template.
resolvedID, _, err := m.resolveForkFromTemplateRequest(ctx, id, req)
if err != nil {
return nil, err
}
lock := m.getInstanceLock(resolvedID)
lock.Lock()
forked, targetState, err := m.forkInstance(ctx, id, req)
lock.Unlock()
Expand Down
4 changes: 4 additions & 0 deletions lib/instances/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ func (m *manager) restoreInstance(
log.ErrorContext(ctx, "no snapshot available", "instance_id", id)
return nil, fmt.Errorf("no snapshot available for instance %s", id)
}
if err := m.templateGuard(stored, "restore"); err != nil {
log.ErrorContext(ctx, "refusing to restore template instance", "instance_id", id, "template_id", stored.TemplateID)
return nil, err
}

// 2b. Validate aggregate resource limits before allocating resources (if configured)
reservedResources := false
Expand Down
4 changes: 4 additions & 0 deletions lib/instances/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ func (m *manager) startInstance(
log.ErrorContext(ctx, "invalid state for start", "instance_id", id, "state", inst.State)
return nil, fmt.Errorf("%w: cannot start from state %s, must be Stopped", ErrInvalidState, inst.State)
}
if err := m.templateGuard(stored, "start"); err != nil {
log.ErrorContext(ctx, "refusing to start template instance", "instance_id", id, "template_id", stored.TemplateID)
return nil, err
}

// 2a. Clear stale exit info from previous run and apply command overrides
stored.ExitCode = nil
Expand Down
97 changes: 97 additions & 0 deletions lib/instances/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
Expand Down Expand Up @@ -219,3 +221,98 @@ func (m *manager) nowOrDefault() time.Time {
}
return m.now()
}

// resolveForkFromTemplateRequest expands a ForkInstanceRequest with a
// non-empty TemplateID into (sourceInstanceID, *Template). Returns
// (instanceID, nil, nil) when TemplateID is empty so callers fall through
// to the ordinary fork path. Returns an error when the caller passed both
// instanceID and TemplateID, when the registry is unconfigured, or when
// the template cannot be resolved.
func (m *manager) resolveForkFromTemplateRequest(ctx context.Context, instanceID string, req ForkInstanceRequest) (string, *templates.Template, error) {
if req.TemplateID == "" {
return instanceID, nil, nil
}
if instanceID != "" {
return "", nil, fmt.Errorf("%w: pass either an instance id or a template id, not both", ErrInvalidRequest)
}
if m.templateRegistry == nil {
return "", nil, fmt.Errorf("%w: template registry not configured", ErrNotSupported)
}
tpl, err := m.templateForFork(ctx, req.TemplateID)
if err != nil {
return "", nil, fmt.Errorf("resolve template %q: %w", req.TemplateID, err)
}
if tpl == nil {
return "", nil, fmt.Errorf("%w: template %q not found", ErrNotFound, req.TemplateID)
}
if tpl.SourceInstanceID == "" {
return "", nil, fmt.Errorf("%w: template %s has no source instance", ErrInvalidState, tpl.ID)
}
return tpl.SourceInstanceID, tpl, nil
}

// installForkSharedMemFile arranges the fork's snapshot directory so the
// guest mem-file is a symlink into the template's snapshot directory
// instead of a per-fork copy. firecracker mmaps the mem-file MAP_PRIVATE
// during restore, so all forks COW from the same backing file.
//
// Layout: dst is the fork's data dir. The snapshot dir is at
// <dst>/snapshots/snapshot-latest, and the mem-file lives at
// <snapshot dir>/memory. The symlink target is the template's source
// instance's standby snapshot mem-file.
func (m *manager) installForkSharedMemFile(forkDataDir string, tpl *templates.Template) error {
if tpl == nil {
return nil
}
srcMem := filepath.Join(m.paths.InstanceSnapshotLatest(tpl.SourceInstanceID), templateSharedMemFileName)
if _, err := os.Stat(srcMem); err != nil {
return fmt.Errorf("stat template mem-file: %w", err)
}
dstSnapshotDir := filepath.Join(forkDataDir, "snapshots", "snapshot-latest")
if err := os.MkdirAll(dstSnapshotDir, 0o755); err != nil {
return fmt.Errorf("ensure fork snapshot dir: %w", err)
}
dstMem := filepath.Join(dstSnapshotDir, templateSharedMemFileName)
// Tolerate a leftover entry (e.g. from a partial copy that wasn't fully
// skipped on a different filesystem layout).
_ = os.Remove(dstMem)
if err := os.Symlink(srcMem, dstMem); err != nil {
return fmt.Errorf("symlink shared mem-file: %w", err)
}
return nil
}

// templateSharedMemFileRelPath is the relative path under the source data
// dir that points at the snapshotted guest mem-file. Encoded here so the
// fork copy step can skip it without importing firecracker internals.
const (
templateSharedMemFileName = "memory"
templateSharedMemFileRelPath = "snapshots/snapshot-latest/memory"
)

// bumpTemplateForkRefcount records that a fork now depends on a template.
// Best-effort touch of LastUsedAt happens alongside.
func (m *manager) bumpTemplateForkRefcount(ctx context.Context, tpl *templates.Template) error {
if tpl == nil || m.templateRegistry == nil {
return nil
}
if _, err := m.templateRegistry.IncrementForkCount(ctx, tpl.ID); err != nil {
return fmt.Errorf("increment template fork count: %w", err)
}
m.touchTemplateUsage(ctx, tpl.ID)
return nil
}

// dropTemplateForkRefcount mirrors bumpTemplateForkRefcount and is called
// when a fork instance is deleted. Missing templates are tolerated so
// orphaned forks don't block delete.
func (m *manager) dropTemplateForkRefcount(ctx context.Context, templateID string) {
if templateID == "" || m.templateRegistry == nil {
return
}
if _, err := m.templateRegistry.DecrementForkCount(ctx, templateID); err != nil {
log := logger.FromContext(ctx)
log.WarnContext(ctx, "failed to decrement template fork refcount",
"template_id", templateID, "error", err)
}
}
Loading
Loading