Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,7 @@ func (s *ApiService) ForkInstance(ctx context.Context, request oapi.ForkInstance
Name: request.Body.Name,
FromRunning: request.Body.FromRunning != nil && *request.Body.FromRunning,
TargetState: targetState,
ShareMemory: request.Body.ShareMemory != nil && *request.Body.ShareMemory,
})
if err != nil {
switch {
Expand Down Expand Up @@ -1062,6 +1063,8 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
ExitCode: inst.ExitCode,
HasSnapshot: lo.ToPtr(inst.HasSnapshot),
Hypervisor: &hvType,
ForkCount: lo.ToPtr(inst.ForkCount),
MemLocked: lo.ToPtr(inst.MemLocked),
}

if b, err := json.Marshal(networkPayload); err == nil {
Expand Down
16 changes: 16 additions & 0 deletions lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
return nil, "", err
}

if req.ShareMemory {
shared, err := m.ensureShareMemoryTemplate(ctx, id)
if err != nil {
return nil, "", err
}
req.TemplateID = shared.ID
id = ""
}
resolvedID, tpl, err := m.resolveForkFromTemplateRequest(ctx, id, req)
if err != nil {
return nil, "", err
Expand Down Expand Up @@ -406,6 +414,14 @@ func validateForkRequest(req ForkInstanceRequest) error {
if req.TargetState != "" && req.TargetState != StateStopped && req.TargetState != StateStandby && req.TargetState != StateRunning {
return fmt.Errorf("%w: invalid fork target state %q (must be one of %s, %s, %s)", ErrInvalidRequest, req.TargetState, StateStopped, StateStandby, StateRunning)
}
if req.ShareMemory {
if req.TemplateID != "" {
return fmt.Errorf("%w: share_memory cannot be combined with template_id", ErrInvalidRequest)
}
if req.FromRunning {
return fmt.Errorf("%w: share_memory requires the source to already be in Standby; from_running=true would re-restore the source after locking", ErrInvalidRequest)
}
}
return nil
}

Expand Down
1 change: 1 addition & 0 deletions lib/instances/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ func (m *manager) toInstanceWithStateDerivation(ctx context.Context, meta *metad
BootMarkersHydrated: result.BootMarkersHydrated,
}
refreshHypervisorPID(&inst.StoredMetadata, result.State)
hydrateForkLockState(ctx, m.templateRegistry, &inst)

// If VM is stopped and exit info isn't persisted yet, populate in-memory
// from the serial console log. This is read-only -- no metadata writes.
Expand Down
169 changes: 169 additions & 0 deletions lib/instances/share_memory_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package instances

import (
"context"
"os"
"path/filepath"
"testing"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestValidateForkRequest_ShareMemoryConflicts(t *testing.T) {
t.Parallel()

t.Run("share_memory with template_id is rejected", func(t *testing.T) {
err := validateForkRequest(ForkInstanceRequest{
Name: "fork-bad-combo",
ShareMemory: true,
TemplateID: "tpl-123",
})
require.Error(t, err)
assert.ErrorIs(t, err, ErrInvalidRequest)
})

t.Run("share_memory with from_running is rejected", func(t *testing.T) {
err := validateForkRequest(ForkInstanceRequest{
Name: "fork-bad-combo",
ShareMemory: true,
FromRunning: true,
})
require.Error(t, err)
assert.ErrorIs(t, err, ErrInvalidRequest)
})

t.Run("share_memory alone is allowed", func(t *testing.T) {
err := validateForkRequest(ForkInstanceRequest{
Name: "fork-ok",
ShareMemory: true,
})
require.NoError(t, err)
})
}

// stagedStandbySource fabricates metadata plus a fake snapshot directory
// for an instance so toInstance derives State=Standby without touching a
// real hypervisor. Returns the source instance ID.
func stagedStandbySource(t *testing.T, mgr *manager, name string) string {
	t.Helper()
	instID := name
	require.NoError(t, mgr.ensureDirectories(instID))

	instDir := mgr.paths.InstanceDir(instID)
	snapshotDir := filepath.Join(instDir, "snapshots", "snapshot-latest")
	require.NoError(t, os.MkdirAll(snapshotDir, 0o755))
	for file, contents := range map[string]string{
		"memory":      "fake-mem",
		"config.json": "{}",
	} {
		require.NoError(t, os.WriteFile(filepath.Join(snapshotDir, file), []byte(contents), 0o644))
	}

	meta := &metadata{StoredMetadata: StoredMetadata{
		Id:                instID,
		Name:              instID,
		Image:             "docker.io/library/alpine:latest",
		CreatedAt:         time.Now(),
		HypervisorType:    hypervisor.TypeFirecracker,
		HypervisorVersion: "test",
		// SocketPath is deliberately left empty so deriveState falls
		// through to the snapshot check and reports Standby.
		DataDir:     instDir,
		VsockCID:    44,
		VsockSocket: mgr.paths.InstanceVsockSocket(instID),
	}}
	require.NoError(t, mgr.saveMetadata(meta))
	return instID
}

func TestEnsureShareMemoryTemplate_AutoPromoteAndReuse(t *testing.T) {
	t.Parallel()
	mgr, _ := setupTestManager(t)
	ctx := context.Background()

	srcID := stagedStandbySource(t, mgr, "share-mem-source")

	first, err := mgr.ensureShareMemoryTemplate(ctx, srcID)
	require.NoError(t, err)
	require.NotNil(t, first)
	assert.Equal(t, shareMemoryTemplateName(srcID), first.Name)
	assert.Equal(t, srcID, first.SourceInstanceID)

	// The promotion must have flagged the source as a template parent.
	stored, err := mgr.loadMetadata(srcID)
	require.NoError(t, err)
	assert.True(t, stored.StoredMetadata.IsTemplate)
	assert.Equal(t, first.ID, stored.StoredMetadata.TemplateID)

	// Calling again resolves the same entry — no duplicate promotion.
	second, err := mgr.ensureShareMemoryTemplate(ctx, srcID)
	require.NoError(t, err)
	assert.Equal(t, first.ID, second.ID)
}

func TestEnsureShareMemoryTemplate_RejectsNonStandby(t *testing.T) {
	t.Parallel()
	mgr, _ := setupTestManager(t)
	ctx := context.Background()

	// Same staged layout as stagedStandbySource, minus the snapshot dir,
	// so state derivation lands on Stopped instead of Standby.
	const instID = "share-mem-stopped-source"
	require.NoError(t, mgr.ensureDirectories(instID))
	meta := &metadata{StoredMetadata: StoredMetadata{
		Id:                instID,
		Name:              instID,
		Image:             "docker.io/library/alpine:latest",
		CreatedAt:         time.Now(),
		HypervisorType:    hypervisor.TypeFirecracker,
		HypervisorVersion: "test",
		DataDir:           mgr.paths.InstanceDir(instID),
		VsockCID:          45,
		VsockSocket:       mgr.paths.InstanceVsockSocket(instID),
	}}
	require.NoError(t, mgr.saveMetadata(meta))

	_, err := mgr.ensureShareMemoryTemplate(ctx, instID)
	require.Error(t, err)
	assert.ErrorIs(t, err, ErrInvalidState)
}

func TestTemplateGuard_ReturnsInvalidStateNotUnsupported(t *testing.T) {
	t.Parallel()
	mgr, _ := setupTestManager(t)

	// A template parent is mem-locked: the guard must surface that as
	// ErrInvalidState (409, transient — clears once forks are deleted),
	// never as ErrNotSupported (501, hypervisor capability gap).
	stored := &StoredMetadata{Id: "src", IsTemplate: true, TemplateID: "tpl-xyz"}
	err := mgr.templateGuard(stored, "start")
	require.Error(t, err)
	assert.ErrorIs(t, err, ErrInvalidState)
	assert.NotErrorIs(t, err, ErrNotSupported)

	// Clearing the template flag clears the guard.
	stored.IsTemplate = false
	require.NoError(t, mgr.templateGuard(stored, "start"))
}

func TestHydrateForkLockState(t *testing.T) {
	t.Parallel()
	mgr, _ := setupTestManager(t)
	ctx := context.Background()

	srcID := stagedStandbySource(t, mgr, "share-mem-hydrate-source")
	tpl, err := mgr.ensureShareMemoryTemplate(ctx, srcID)
	require.NoError(t, err)

	// Before any fork exists the source must read as unlocked.
	before, err := mgr.GetInstance(ctx, srcID)
	require.NoError(t, err)
	assert.Equal(t, 0, before.ForkCount)
	assert.False(t, before.MemLocked)

	// After a refcount bump, a fresh read reflects the lock.
	require.NoError(t, mgr.bumpTemplateForkRefcount(ctx, tpl))
	after, err := mgr.GetInstance(ctx, srcID)
	require.NoError(t, err)
	assert.Equal(t, 1, after.ForkCount)
	assert.True(t, after.MemLocked)
}
73 changes: 70 additions & 3 deletions lib/instances/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,16 @@ func (m *manager) touchTemplateUsage(ctx context.Context, templateID string) {

// templateGuard rejects operations on an instance that is currently a
// template parent. A template's snapshot is shared with live forks, so
// Starting or Restoring it would corrupt them.
//
// The error wraps ErrInvalidState (409): callers should treat this as a
// transient state conflict that clears once the forks are deleted, not as
// a hypervisor capability gap (501).
func (m *manager) templateGuard(stored *StoredMetadata, op string) error {
	if stored != nil && stored.IsTemplate {
		return fmt.Errorf("%w: cannot %s instance %s while it is mem-locked by live forks; delete the forks (or wait for them to exit) first", ErrInvalidState, op, stored.Id)
	}
	return nil
}

// validateForkResolvedFromTemplate confirms a fork-from-template request
Expand Down Expand Up @@ -324,3 +327,67 @@ func (m *manager) dropTemplateForkRefcount(ctx context.Context, templateID strin
"template_id", templateID, "error", err)
}
}

// ensureShareMemoryTemplate resolves (or creates) the template entry that
// backs ShareMemory=true forks against the given source instance. A source
// that is already a template parent simply yields its existing registry
// entry; otherwise it is auto-promoted under a deterministic, internal
// name derived from the instance ID. The public API never exposes the
// templates resource, so that name is purely a registry detail.
//
// The source must be in Standby; checking that here gives callers a clear
// error before the fork machinery starts allocating fork state.
func (m *manager) ensureShareMemoryTemplate(ctx context.Context, instanceID string) (*templates.Template, error) {
	switch {
	case instanceID == "":
		return nil, fmt.Errorf("%w: share_memory requires a source instance id", ErrInvalidRequest)
	case m.templateRegistry == nil:
		return nil, fmt.Errorf("%w: template registry not configured", ErrNotSupported)
	}

	meta, err := m.loadMetadata(instanceID)
	if err != nil {
		return nil, err
	}

	// Already promoted: reuse the registry entry instead of promoting again.
	if sm := &meta.StoredMetadata; sm.IsTemplate && sm.TemplateID != "" {
		existing, getErr := m.templateRegistry.Get(ctx, sm.TemplateID)
		if getErr != nil {
			return nil, fmt.Errorf("load existing share-memory template: %w", getErr)
		}
		return existing, nil
	}

	if cur := m.toInstance(ctx, meta); cur.State != StateStandby {
		return nil, fmt.Errorf("%w: share_memory requires the source to be in Standby (got %s)", ErrInvalidState, cur.State)
	}

	return m.promoteToTemplate(ctx, instanceID, PromoteToTemplateRequest{
		Name: shareMemoryTemplateName(instanceID),
	})
}

// shareMemoryTemplateName returns the registry name used for auto-promoted
// share-memory templates. It is a standalone function so tests can assert
// that repeated ShareMemory forks against the same source resolve to the
// same registry entry.
func shareMemoryTemplateName(instanceID string) string {
	const prefix = "share-mem-"
	return prefix + instanceID
}

// hydrateForkLockState derives ForkCount/MemLocked on inst from its
// template registry entry. Best-effort: when the instance is not a
// template parent, or the registry lookup fails, the fields keep their
// zero values so readers observe "not locked" rather than a hard failure.
func hydrateForkLockState(ctx context.Context, registry templates.Registry, inst *Instance) {
	if inst == nil || registry == nil || !inst.IsTemplate || inst.TemplateID == "" {
		return
	}
	entry, err := registry.Get(ctx, inst.TemplateID)
	if err != nil || entry == nil {
		return
	}
	inst.ForkCount = entry.ForkCount
	inst.MemLocked = entry.ForkCount > 0
}
19 changes: 19 additions & 0 deletions lib/instances/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,15 @@ type Instance struct {
StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown)
HasSnapshot bool // Derived from filesystem check
BootMarkersHydrated bool // True when missing boot markers were hydrated from logs in this read

// ForkCount is the number of live forks created against this instance
// with ShareMemory=true. Derived from the templates registry when the
// instance is a fan-out parent; zero otherwise.
ForkCount int
// MemLocked is true iff ForkCount > 0. While true, start/restore/delete
// of this instance fails with ErrInvalidState because the snapshot
// mem-file is being served to live forks.
MemLocked bool
}

// GetHypervisorType returns the hypervisor type as a string.
Expand Down Expand Up @@ -264,6 +273,16 @@ type ForkInstanceRequest struct {
// template's mem-file instead of being copied per-fork, so many forks
// fan out from the same warm guest memory.
TemplateID string

// ShareMemory opts the fork into mem-file sharing with the source
// instance: instead of copying the snapshot mem-file, the fork's
// hypervisor reads pages from the source's mem-file (via uffd or
// hardlink, depending on hypervisor). Requires the source to be in
// Standby. The first such fork against a source auto-promotes that
// source so subsequent ShareMemory forks reuse the same registry entry;
// while any are alive, the source is mem-locked (start/restore/delete
// return ErrInvalidState).
ShareMemory bool
}

// SnapshotKind determines how snapshot data is captured and restored.
Expand Down
Loading
Loading