From 5595c783db59d7a9b899ef531f0f85005e28ba5d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 18:51:24 -0400 Subject: [PATCH 01/32] instances: add Initializing state and gate Running on agent/program readiness --- cmd/api/main.go | 2 +- lib/instances/README.md | 10 +- lib/instances/create.go | 9 +- lib/instances/delete.go | 4 +- lib/instances/firecracker_test.go | 33 ++- lib/instances/fork.go | 8 + lib/instances/liveness.go | 2 +- lib/instances/manager.go | 22 +- lib/instances/manager_test.go | 69 ++++- lib/instances/qemu_test.go | 25 +- lib/instances/query.go | 222 +++++++++++++++- lib/instances/query_test.go | 74 ++++++ lib/instances/restore.go | 7 +- .../snapshot_integration_scenario_test.go | 5 + lib/instances/start.go | 9 +- lib/instances/state.go | 12 +- lib/instances/stop.go | 12 +- lib/instances/types.go | 26 +- lib/network/README.md | 3 +- lib/oapi/oapi.go | 236 +++++++++--------- lib/system/README.md | 14 +- lib/system/guest_agent/main.go | 22 ++ lib/system/init/mode_exec.go | 46 +++- lib/system/init/mode_exec_test.go | 33 +++ lib/system/init/mode_systemd.go | 5 +- openapi.yaml | 5 +- 26 files changed, 727 insertions(+), 188 deletions(-) diff --git a/cmd/api/main.go b/cmd/api/main.go index f937fb31..f7585e48 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -181,7 +181,7 @@ func run() error { // Initialize to empty slice (not nil) so cleanup runs even with no running VMs preserveTAPs = []string{} for _, inst := range allInstances { - if inst.State == instances.StateRunning || inst.State == instances.StateUnknown { + if inst.State == instances.StateRunning || inst.State == instances.StateInitializing || inst.State == instances.StateUnknown { preserveTAPs = append(preserveTAPs, inst.Id) } } diff --git a/lib/instances/README.md b/lib/instances/README.md index 51a245ef..4f1bc076 100644 --- a/lib/instances/README.md +++ b/lib/instances/README.md @@ -16,7 +16,8 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM **States:** - `Stopped` - No VMM, no snapshot - `Created` - VMM created but not booted (CH native) -- `Running` - VM actively running (CH native) +- `Initializing` - VM is running while guest init is still in progress +- `Running` - Guest program has started and instance is ready - `Paused` - VM paused (CH native) - `Shutdown` - VM shutdown, VMM exists (CH native) - `Standby` - No VMM, snapshot exists (can restore) @@ -63,11 +64,13 @@ Manager orchestrates multiple single-hop state transitions: **CreateInstance:** ``` -Stopped → Created → Running +Stopped → Created → Initializing → Running 1. Start VMM process 2. Create VM config 3. Boot VM -4. Expand memory (if hotplug configured) +4. Wait for guest-agent readiness gate (exec mode, unless skipped) +5. Guest program start marker observed +6. Expand memory (if hotplug configured) ``` **StandbyInstance:** @@ -134,4 +137,3 @@ TestStorageOperations - metadata persistence, directory cleanup - `lib/system` - System manager for kernel/initrd files - `lib/hypervisor` - Hypervisor abstraction for VM operations - System tools: `mkfs.erofs`, `cpio`, `gzip` (Linux); `mkfs.ext4` (macOS) - diff --git a/lib/instances/create.go b/lib/instances/create.go index 1f762ff3..ae4a1133 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -299,6 +299,8 @@ func (m *manager) createInstance( CreatedAt: time.Now(), StartedAt: nil, StoppedAt: nil, + ProgramStartedAt: nil, + GuestAgentReadyAt: nil, KernelVersion: string(kernelVer), HypervisorType: hvType, HypervisorVersion: hvVersion, @@ -435,14 +437,13 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() + // Return instance with derived state + finalInst := m.toInstance(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType) - m.recordStateTransition(ctx, "stopped", string(StateRunning), hvType) + m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), hvType) } - - // Return instance with derived state - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State, "hypervisor", hvType) return &finalInst, nil } diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 2b8d3f09..b54d5b3c 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -50,9 +50,9 @@ func (m *manager) deleteInstance( guest.CloseConn(dialer.Key()) } - // 4. If running, try graceful guest shutdown before force kill. + // 4. If active, try graceful guest shutdown before force kill. gracefulShutdown := false - if inst.State == StateRunning { + if inst.State == StateRunning || inst.State == StateInitializing { stopTimeout := resolveStopTimeout(stored) if stopTimeout > deleteGracefulShutdownTimeout { stopTimeout = deleteGracefulShutdownTimeout diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 940c1786..32b3320b 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -112,7 +112,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) inst, err = mgr.StandbyInstance(ctx, inst.Id) require.NoError(t, err) @@ -121,6 +123,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) inst, err = mgr.StopInstance(ctx, inst.Id) @@ -131,6 +136,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { // Verify stopped -> start works after standby/restore lifecycle. inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) require.NoError(t, mgr.DeleteInstance(ctx, inst.Id)) @@ -161,6 +169,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, inst.State) // Establish a realistic standby/restore lifecycle first. @@ -171,6 +182,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, inst.State) // Simulate stale snapshot residue from a prior failure/interruption. @@ -194,6 +208,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) require.NoError(t, mgr.DeleteInstance(ctx, inst.Id)) @@ -229,6 +246,8 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) { }) require.NoError(t, err) require.NotNil(t, inst) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) alloc, err := mgr.networkManager.GetAllocation(ctx, inst.Id) require.NoError(t, err) @@ -280,6 +299,9 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) allocRestored, err := mgr.networkManager.GetAllocation(ctx, inst.Id) @@ -345,6 +367,8 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) + source, err = waitForInstanceState(ctx, mgr, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) sourceID := source.Id t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) }) assert.NotEmpty(t, source.IP) @@ -360,6 +384,9 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { TargetState: StateRunning, }) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) + forked, err = waitForInstanceState(ctx, mgr, forked.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, forked.State) forkID := forked.Id t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) }) @@ -373,6 +400,10 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { sourceAfterFork, err := mgr.GetInstance(ctx, sourceID) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) assert.NotEmpty(t, sourceAfterFork.IP) assert.NotEmpty(t, sourceAfterFork.MAC) diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 60d3a5cf..496fb54c 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -497,6 +497,14 @@ func cloneStoredMetadataForFork(src StoredMetadata) StoredMetadata { stoppedAt := *src.StoppedAt dst.StoppedAt = &stoppedAt } + if src.ProgramStartedAt != nil { + programStartedAt := *src.ProgramStartedAt + dst.ProgramStartedAt = &programStartedAt + } + if src.GuestAgentReadyAt != nil { + guestAgentReadyAt := *src.GuestAgentReadyAt + dst.GuestAgentReadyAt = &guestAgentReadyAt + } if src.ExitCode != nil { exitCode := *src.ExitCode dst.ExitCode = &exitCode diff --git a/lib/instances/liveness.go b/lib/instances/liveness.go index 96f13a89..92f27dc5 100644 --- a/lib/instances/liveness.go +++ b/lib/instances/liveness.go @@ -42,7 +42,7 @@ func (a *instanceLivenessAdapter) IsInstanceRunning(ctx context.Context, instanc // Consider instance "running" if the VMM is active (any of these states means VM is using the device) switch inst.State { - case StateRunning, StatePaused, StateCreated: + case StateRunning, StateInitializing, StatePaused, StateCreated: return true default: // StateStopped, StateStandby, StateShutdown, StateUnknown diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 3b581e83..b75e0f4c 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -165,6 +165,14 @@ func (m *manager) maybePersistExitInfo(ctx context.Context, id string) { m.persistExitInfo(ctx, id) } +// maybePersistBootMarkers persists boot markers to metadata under lock. +func (m *manager) maybePersistBootMarkers(ctx context.Context, id string) { + lock := m.getInstanceLock(id) + lock.Lock() + defer lock.Unlock() + m.persistBootMarkers(ctx, id) +} + // CreateInstance creates and starts a new instance func (m *manager) CreateInstance(ctx context.Context, req CreateInstanceRequest) (*Instance, error) { // Note: ID is generated inside createInstance, so we can't lock before calling it. @@ -315,6 +323,9 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) + } return inst, nil } @@ -336,6 +347,9 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) + } return inst, nil } if len(nameMatches) > 1 { @@ -354,6 +368,9 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) + } return inst, nil } if len(prefixMatches) > 1 { @@ -451,6 +468,7 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst // ListRunningInstancesInfo returns info needed for utilization metrics collection. // Used by the resource manager for VM utilization tracking. +// Includes active VMs in Running or Initializing state. func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) { instances, err := m.listInstances(ctx) if err != nil { @@ -459,8 +477,8 @@ func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.Ins infos := make([]resources.InstanceUtilizationInfo, 0, len(instances)) for _, inst := range instances { - // Only include running instances (they have a hypervisor process) - if inst.State != StateRunning { + // Only include active instances (they have a hypervisor process) + if inst.State != StateRunning && inst.State != StateInitializing { continue } diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 641cd6a9..dacd0e50 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -110,6 +110,32 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio return fmt.Errorf("VM did not reach running state within %v", timeout) } +// waitForInstanceState polls GetInstance until the expected state is observed or timeout expires. +func waitForInstanceState(ctx context.Context, mgr Manager, instanceID string, expected State, timeout time.Duration) (*Instance, error) { + deadline := time.Now().Add(timeout) + lastState := StateUnknown + lastErr := error(nil) + + for time.Now().Before(deadline) { + inst, err := mgr.GetInstance(ctx, instanceID) + if err == nil { + lastState = inst.State + if inst.State == expected { + return inst, nil + } + } else { + lastErr = err + } + + time.Sleep(100 * time.Millisecond) + } + + if lastErr != nil { + return nil, fmt.Errorf("instance %s did not reach %s within %v (last error: %w)", instanceID, expected, timeout, lastErr) + } + return nil, fmt.Errorf("instance %s did not reach %s within %v (last state: %s)", instanceID, expected, timeout, lastState) +} + // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) @@ -282,7 +308,7 @@ func TestBasicEndToEnd(t *testing.T) { assert.NotEmpty(t, inst.Id) assert.Equal(t, "test-nginx", inst.Name) assert.Equal(t, integrationTestImageRef(t, "docker.io/library/nginx:alpine"), inst.Image) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.False(t, inst.HasSnapshot) assert.NotEmpty(t, inst.KernelVersion) @@ -307,6 +333,8 @@ func TestBasicEndToEnd(t *testing.T) { // Wait for VM to be fully running err = waitForVMReady(ctx, inst.SocketPath, 5*time.Second) require.NoError(t, err, "VM should reach running state") + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 10*time.Second) + require.NoError(t, err, "instance should reach Running state") // Get instance retrieved, err := manager.GetInstance(ctx, inst.Id) @@ -751,7 +779,9 @@ func TestBasicEndToEnd(t *testing.T) { t.Log("Testing restart after stop...") restartedInst, err := manager.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err, "StartInstance should succeed") - assert.Equal(t, StateRunning, restartedInst.State, "Instance should be Running after restart") + assert.Contains(t, []State{StateInitializing, StateRunning}, restartedInst.State, "Instance should be active after restart") + restartedInst, err = waitForInstanceState(ctx, manager, restartedInst.Id, StateRunning, 10*time.Second) + require.NoError(t, err, "instance should reach Running after restart") // Verify exit info was cleared retrieved, err = manager.GetInstance(ctx, inst.Id) @@ -974,8 +1004,26 @@ func TestOOMExitPropagation(t *testing.T) { if finalInst != nil { assert.Equal(t, StateStopped, finalInst.State) + // Exit metadata may lag the first observed Stopped state by a short window. + if finalInst.ExitCode == nil { + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + got, getErr := manager.GetInstance(ctx, inst.Id) + if getErr == nil { + finalInst = got + if finalInst.ExitCode != nil { + break + } + } + time.Sleep(200 * time.Millisecond) + } + } // Verify exit info shows OOM - require.NotNil(t, finalInst.ExitCode, "ExitCode should be populated after OOM") + if finalInst.ExitCode == nil { + t.Logf("Attempt %d: instance stopped without exit info; retrying", attempt) + _ = manager.DeleteInstance(ctx, inst.Id) + continue + } assert.Equal(t, 137, *finalInst.ExitCode, "OOM kill should result in exit code 137 (SIGKILL)") assert.Contains(t, finalInst.ExitMessage, "OOM", "Exit message should indicate OOM") t.Logf("OOM exit info propagated: code=%d message=%q", *finalInst.ExitCode, finalInst.ExitMessage) @@ -1068,7 +1116,9 @@ func TestEntrypointEnvVars(t *testing.T) { inst, err := mgr.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Logf("Instance created: %s", inst.Id) // Helper to run command in guest with retry @@ -1295,7 +1345,9 @@ func TestStandbyAndRestore(t *testing.T) { inst, err := manager.CreateInstance(ctx, req) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Logf("Instance created: %s", inst.Id) // Wait for VM to be fully running before standby @@ -1337,7 +1389,9 @@ func TestStandbyAndRestore(t *testing.T) { t.Log("Restoring instance...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Log("Instance restored and running") // DEBUG: Check app.log file size after restore @@ -1382,7 +1436,9 @@ func TestStateTransitions(t *testing.T) { shouldFail bool }{ {"Stopped to Created", StateStopped, StateCreated, false}, + {"Created to Initializing", StateCreated, StateInitializing, false}, {"Created to Running", StateCreated, StateRunning, false}, + {"Initializing to Running", StateInitializing, StateRunning, false}, {"Running to Paused", StateRunning, StatePaused, false}, {"Paused to Running", StatePaused, StateRunning, false}, {"Paused to Standby", StatePaused, StateStandby, false}, @@ -1392,6 +1448,7 @@ func TestStateTransitions(t *testing.T) { // Invalid transitions {"Running to Standby", StateRunning, StateStandby, true}, {"Stopped to Running", StateStopped, StateRunning, true}, + {"Stopped to Initializing", StateStopped, StateInitializing, true}, {"Standby to Running", StateStandby, StateRunning, true}, } diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index ab5aa853..6821fd9c 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -276,7 +276,7 @@ func TestQEMUBasicEndToEnd(t *testing.T) { assert.NotEmpty(t, inst.Id) assert.Equal(t, "test-nginx-qemu", inst.Name) assert.Equal(t, integrationTestImageRef(t, "docker.io/library/nginx:alpine"), inst.Image) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType) assert.False(t, inst.HasSnapshot) assert.NotEmpty(t, inst.KernelVersion) @@ -302,6 +302,8 @@ func TestQEMUBasicEndToEnd(t *testing.T) { // Wait for VM to be fully running err = waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second) require.NoError(t, err, "QEMU VM should reach running state") + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err, "instance should reach Running state") // Get instance retrieved, err := manager.GetInstance(ctx, inst.Id) @@ -650,7 +652,9 @@ func TestQEMUEntrypointEnvVars(t *testing.T) { inst, err := mgr.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType, "Instance should use QEMU hypervisor") t.Logf("Instance created: %s", inst.Id) @@ -816,7 +820,9 @@ func TestQEMUStandbyAndRestore(t *testing.T) { inst, err := manager.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -850,7 +856,9 @@ func TestQEMUStandbyAndRestore(t *testing.T) { t.Log("Restoring instance...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Log("Instance restored and running") // Wait for VM to be running again @@ -919,6 +927,8 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { }) require.NoError(t, err) t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), source.Id) }) + source, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.NoError(t, waitForQEMUReady(ctx, source.SocketPath, 10*time.Second)) assert.NotEmpty(t, source.IP) @@ -941,12 +951,19 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { sourceAfterFork, err := manager.GetInstance(ctx, source.Id) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) require.NotEmpty(t, sourceAfterFork.IP) assertHostCanReachNginx(t, sourceAfterFork.IP, 80, 60*time.Second) forked, err = manager.RestoreInstance(ctx, forkedID) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) + forked, err = waitForInstanceState(ctx, manager, forkedID, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, forked.State) require.NoError(t, waitForQEMUReady(ctx, forked.SocketPath, 10*time.Second)) diff --git a/lib/instances/query.go b/lib/instances/query.go index 49d89244..eeae02ac 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -1,25 +1,33 @@ package instances import ( + "bufio" "context" "fmt" "io" "os" "path/filepath" + "slices" "strconv" "strings" + "time" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/logger" ) // exitSentinelPrefix is the machine-parseable prefix written by init to serial console. -const exitSentinelPrefix = "HYPEMAN-EXIT " +const ( + exitSentinelPrefix = "HYPEMAN-EXIT " + programStartSentinelPrefix = "HYPEMAN-PROGRAM-START " + agentReadySentinelPrefix = "HYPEMAN-AGENT-READY " +) // stateResult holds the result of state derivation type stateResult struct { - State State - Error *string // Non-nil if state couldn't be determined + State State + Error *string // Non-nil if state couldn't be determined + BootMarkersHydrated bool } // deriveState determines instance state by checking socket and querying the hypervisor. @@ -66,7 +74,11 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state case hypervisor.StateCreated: return stateResult{State: StateCreated} case hypervisor.StateRunning: - return stateResult{State: StateRunning} + hydrated := m.hydrateBootMarkersFromLogs(stored) + return stateResult{ + State: deriveRunningState(stored), + BootMarkersHydrated: hydrated, + } case hypervisor.StatePaused: return stateResult{State: StatePaused} case hypervisor.StateShutdown: @@ -82,6 +94,127 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state } } +func deriveRunningState(stored *StoredMetadata) State { + if stored.ProgramStartedAt == nil { + return StateInitializing + } + if stored.SkipGuestAgent { + return StateRunning + } + if stored.GuestAgentReadyAt == nil { + return StateInitializing + } + return StateRunning +} + +// hydrateBootMarkersFromLogs fills missing boot markers from serial logs. +// Returns true when at least one missing marker was found and populated. +func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { + needProgram := stored.ProgramStartedAt == nil + needAgent := !stored.SkipGuestAgent && stored.GuestAgentReadyAt == nil + if !needProgram && !needAgent { + return false + } + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id) + hydrated := false + if needProgram && programStartedAt != nil { + stored.ProgramStartedAt = programStartedAt + hydrated = true + } + if needAgent && guestAgentReadyAt != nil { + stored.GuestAgentReadyAt = guestAgentReadyAt + hydrated = true + } + return hydrated +} + +// parseBootMarkers scans app logs (including rotated files) and returns the +// latest observed program-start and guest-agent-ready marker timestamps. +func (m *manager) parseBootMarkers(id string) (*time.Time, *time.Time) { + logPaths := m.appLogPathsForMarkerScan(id) + + var programStartedAt *time.Time + var guestAgentReadyAt *time.Time + for _, logPath := range logPaths { + f, err := os.Open(logPath) + if err != nil { + continue + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + if ts, ok := parseProgramStartSentinelLine(line); ok { + programStartedAt = &ts + } + if ts, ok := parseAgentReadySentinelLine(line); ok { + guestAgentReadyAt = &ts + } + } + _ = f.Close() + } + + return programStartedAt, guestAgentReadyAt +} + +// appLogPathsForMarkerScan returns app log paths in chronological order +// (oldest rotated file to newest active file). +func (m *manager) appLogPathsForMarkerScan(id string) []string { + base := m.paths.InstanceAppLog(id) + matches, err := filepath.Glob(base + "*") + if err != nil { + return []string{base} + } + + type logPathWithRank struct { + path string + rank int // higher rank means older rotated log; 0 means active file + } + paths := make([]logPathWithRank, 0, len(matches)) + for _, path := range matches { + if path == base { + paths = append(paths, logPathWithRank{path: path, rank: 0}) + continue + } + + suffix := strings.TrimPrefix(path, base) + if !strings.HasPrefix(suffix, ".") { + continue + } + n, err := strconv.Atoi(strings.TrimPrefix(suffix, ".")) + if err != nil || n <= 0 { + continue + } + paths = append(paths, logPathWithRank{path: path, rank: n}) + } + + if len(paths) == 0 { + return []string{base} + } + + slices.SortFunc(paths, func(a, b logPathWithRank) int { + // Rotated logs first (older-to-newer by descending suffix), then active file. + switch { + case a.rank == 0 && b.rank != 0: + return 1 + case a.rank != 0 && b.rank == 0: + return -1 + case a.rank != b.rank: + // Larger suffix is older and should be read first. + return b.rank - a.rank + default: + return strings.Compare(a.path, b.path) + } + }) + + ordered := make([]string, 0, len(paths)) + for _, p := range paths { + ordered = append(ordered, p.path) + } + return ordered +} + // hasSnapshot checks if a snapshot exists for an instance func (m *manager) hasSnapshot(dataDir string) bool { snapshotDir := filepath.Join(dataDir, "snapshots", "snapshot-latest") @@ -105,10 +238,11 @@ func (m *manager) hasSnapshot(dataDir string) bool { func (m *manager) toInstance(ctx context.Context, meta *metadata) Instance { result := m.deriveState(ctx, &meta.StoredMetadata) inst := Instance{ - StoredMetadata: meta.StoredMetadata, - State: result.State, - StateError: result.Error, - HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir), + StoredMetadata: meta.StoredMetadata, + State: result.State, + StateError: result.Error, + HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir), + BootMarkersHydrated: result.BootMarkersHydrated, } // If VM is stopped and exit info isn't persisted yet, populate in-memory @@ -142,7 +276,8 @@ func (m *manager) parseExitSentinel(id string) (int, string, bool) { // Scan lines from the tail looking for the sentinel lines := strings.Split(string(data), "\n") - for _, line := range lines { + for i := len(lines) - 1; i >= 0; i-- { + line := lines[i] code, msg, ok := parseExitSentinelLine(line) if ok { return code, msg, true @@ -180,6 +315,43 @@ func (m *manager) persistExitInfo(ctx context.Context, id string) { } } +// persistBootMarkers parses program-start and guest-agent-ready markers from +// serial logs and persists them to metadata. Must be called under instance lock. +func (m *manager) persistBootMarkers(ctx context.Context, id string) { + log := logger.FromContext(ctx) + + meta, err := m.loadMetadata(id) + if err != nil { + return + } + + needProgram := meta.ProgramStartedAt == nil + needAgent := !meta.SkipGuestAgent && meta.GuestAgentReadyAt == nil + if !needProgram && !needAgent { + return + } + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id) + updated := false + if needProgram && programStartedAt != nil { + meta.ProgramStartedAt = programStartedAt + updated = true + } + if needAgent && guestAgentReadyAt != nil { + meta.GuestAgentReadyAt = guestAgentReadyAt + updated = true + } + if !updated { + return + } + + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist boot markers", "instance_id", id, "error", err) + } else { + log.DebugContext(ctx, "persisted boot markers from serial log", "instance_id", id) + } +} + // readTail reads the last n bytes of a file. If the file is smaller than n, // the entire file is returned. func readTail(path string, n int64) ([]byte, error) { @@ -261,6 +433,38 @@ func parseExitSentinelLine(line string) (int, string, bool) { return code, "", true } +func parseProgramStartSentinelLine(line string) (time.Time, bool) { + return parseSentinelTimestamp(line, programStartSentinelPrefix) +} + +func parseAgentReadySentinelLine(line string) (time.Time, bool) { + return parseSentinelTimestamp(line, agentReadySentinelPrefix) +} + +func parseSentinelTimestamp(line, sentinelPrefix string) (time.Time, bool) { + line = strings.TrimSpace(line) + + idx := strings.Index(line, sentinelPrefix) + if idx < 0 { + return time.Time{}, false + } + + sentinel := line[idx+len(sentinelPrefix):] + for _, field := range strings.Fields(sentinel) { + if !strings.HasPrefix(field, "ts=") { + continue + } + ts := strings.TrimPrefix(field, "ts=") + parsed, err := time.Parse(time.RFC3339Nano, ts) + if err != nil { + return time.Time{}, false + } + return parsed, true + } + + return time.Time{}, false +} + // listInstances returns all instances func (m *manager) listInstances(ctx context.Context) ([]Instance, error) { log := logger.FromContext(ctx) diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index 0ede8659..a6877fd1 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -2,6 +2,7 @@ package instances import ( "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -91,3 +92,76 @@ func TestParseExitSentinelLine(t *testing.T) { }) } } + +func TestParseProgramStartSentinelLine(t *testing.T) { + t.Parallel() + + ts := "2026-03-08T15:09:26.123456789Z" + line := "2026-03-08T15:09:26Z [INFO] [hypeman-init:entrypoint] HYPEMAN-PROGRAM-START ts=" + ts + " mode=exec" + + parsed, ok := parseProgramStartSentinelLine(line) + require.True(t, ok) + assert.Equal(t, ts, parsed.UTC().Format(time.RFC3339Nano)) +} + +func TestParseAgentReadySentinelLine(t *testing.T) { + t.Parallel() + + ts := "2026-03-08T15:09:26.987654321Z" + line := "2026/03/08 15:09:26 [guest-agent] HYPEMAN-AGENT-READY ts=" + ts + + parsed, ok := parseAgentReadySentinelLine(line) + require.True(t, ok) + assert.Equal(t, ts, parsed.UTC().Format(time.RFC3339Nano)) +} + +func TestDeriveRunningState(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + + tests := []struct { + name string + stored StoredMetadata + want State + }{ + { + name: "initializing when program start marker missing", + stored: StoredMetadata{ + SkipGuestAgent: false, + }, + want: StateInitializing, + }, + { + name: "initializing when guest-agent marker missing", + stored: StoredMetadata{ + ProgramStartedAt: &now, + SkipGuestAgent: false, + }, + want: StateInitializing, + }, + { + name: "running when both markers present", + stored: StoredMetadata{ + ProgramStartedAt: &now, + GuestAgentReadyAt: &now, + SkipGuestAgent: false, + }, + want: StateRunning, + }, + { + name: "running when guest-agent is skipped", + stored: StoredMetadata{ + ProgramStartedAt: &now, + SkipGuestAgent: true, + }, + want: StateRunning, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, deriveRunningState(&tt.stored)) + }) + } +} diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 1ff09c55..62eee2b2 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -235,14 +235,13 @@ func (m *manager) restoreInstance( log.WarnContext(ctx, "failed to update metadata after restore", "instance_id", id, "error", err) } + // Return instance with derived state (should be Running now) + finalInst := m.toInstance(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateStandby), string(StateRunning), stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStandby), string(finalInst.State), stored.HypervisorType) } - - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance restored successfully", "instance_id", id, "state", finalInst.State) return &finalInst, nil } diff --git a/lib/instances/snapshot_integration_scenario_test.go b/lib/instances/snapshot_integration_scenario_test.go index 37e1ae7a..ab30cb56 100644 --- a/lib/instances/snapshot_integration_scenario_test.go +++ b/lib/instances/snapshot_integration_scenario_test.go @@ -3,6 +3,7 @@ package instances import ( "context" "testing" + "time" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" @@ -64,6 +65,10 @@ func runStandbySnapshotScenario(t *testing.T, mgr *manager, tmpDir string, cfg s } }) + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second) + requireNoErr(err) + require.Equal(t, StateRunning, source.State) + _, err = mgr.StandbyInstance(ctx, sourceID) requireNoErr(err) diff --git a/lib/instances/start.go b/lib/instances/start.go index 2e257aeb..ab69be76 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -50,6 +50,8 @@ func (m *manager) startInstance( // 2a. Clear stale exit info from previous run and apply command overrides stored.ExitCode = nil stored.ExitMessage = "" + stored.ProgramStartedAt = nil + stored.GuestAgentReadyAt = nil if len(req.Entrypoint) > 0 { stored.Entrypoint = req.Entrypoint } @@ -151,14 +153,13 @@ func (m *manager) startInstance( log.WarnContext(ctx, "failed to update metadata after VM start", "instance_id", id, "error", err) } + // Return instance with derived state (should be Running now) + finalInst := m.toInstance(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.startDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateStopped), string(StateRunning), stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), stored.HypervisorType) } - - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance started successfully", "instance_id", id, "state", finalInst.State) return &finalInst, nil } diff --git a/lib/instances/state.go b/lib/instances/state.go index f4bfbf07..f909c7c2 100644 --- a/lib/instances/state.go +++ b/lib/instances/state.go @@ -7,8 +7,14 @@ import "fmt" var ValidTransitions = map[State][]State{ // Cloud Hypervisor native transitions StateCreated: { - StateRunning, // boot VM - StateShutdown, // shutdown before boot + StateInitializing, // boot VM (guest init in progress) + StateRunning, // boot VM (fast path; markers already available) + StateShutdown, // shutdown before boot + }, + StateInitializing: { + StateRunning, // guest init complete + StatePaused, // pause + StateShutdown, // shutdown }, StateRunning: { StatePaused, // pause @@ -68,7 +74,7 @@ func (s State) IsTerminal() bool { // RequiresVMM returns true if this state requires a running VMM process func (s State) RequiresVMM() bool { switch s { - case StateCreated, StateRunning, StatePaused, StateShutdown: + case StateCreated, StateInitializing, StateRunning, StatePaused, StateShutdown: return true case StateStopped, StateStandby, StateUnknown: return false diff --git a/lib/instances/stop.go b/lib/instances/stop.go index bc2adc15..c6065124 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -132,10 +132,10 @@ func (m *manager) forceKillHypervisorProcess(ctx context.Context, inst *Instance return nil } -// stopInstance gracefully stops a running instance. +// stopInstance gracefully stops an active instance. // Flow: send Shutdown RPC -> wait for VM to power off -> // fall back to hypervisor shutdown -> final SIGKILL if still alive. -// Multi-hop orchestration: Running → Shutdown → Stopped +// Multi-hop orchestration: Running/Initializing → Shutdown → Stopped func (m *manager) stopInstance( ctx context.Context, id string, @@ -162,10 +162,10 @@ func (m *manager) stopInstance( stored := &meta.StoredMetadata log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State) - // 2. Validate state transition (must be Running to stop) - if inst.State != StateRunning { + // 2. Validate state transition (must be active to stop) + if inst.State != StateRunning && inst.State != StateInitializing { log.ErrorContext(ctx, "invalid state for stop", "instance_id", id, "state", inst.State) - return nil, fmt.Errorf("%w: cannot stop from state %s, must be Running", ErrInvalidState, inst.State) + return nil, fmt.Errorf("%w: cannot stop from state %s, must be Running or Initializing", ErrInvalidState, inst.State) } // 3. Get network allocation BEFORE killing VMM (while we can still query it) @@ -253,7 +253,7 @@ func (m *manager) stopInstance( // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.stopDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateRunning), string(StateStopped), stored.HypervisorType) + m.recordStateTransition(ctx, string(inst.State), string(StateStopped), stored.HypervisorType) } // Return instance with derived state (should be Stopped now) diff --git a/lib/instances/types.go b/lib/instances/types.go index bb548d6a..4466330a 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -12,13 +12,14 @@ import ( type State string const ( - StateStopped State = "Stopped" // No VMM, no snapshot - StateCreated State = "Created" // VMM created but not booted (CH native) - StateRunning State = "Running" // VM running (CH native) - StatePaused State = "Paused" // VM paused (CH native) - StateShutdown State = "Shutdown" // VM shutdown, VMM exists (CH native) - StateStandby State = "Standby" // No VMM, snapshot exists - StateUnknown State = "Unknown" // Failed to determine state (VMM query failed) + StateStopped State = "Stopped" // No VMM, no snapshot + StateCreated State = "Created" // VMM created but not booted (CH native) + StateInitializing State = "Initializing" // VM running, guest init in progress + StateRunning State = "Running" // Guest program started and ready + StatePaused State = "Paused" // VM paused (CH native) + StateShutdown State = "Shutdown" // VM shutdown, VMM exists (CH native) + StateStandby State = "Standby" // No VMM, snapshot exists + StateUnknown State = "Unknown" // Failed to determine state (VMM query failed) ) // VolumeAttachment represents a volume attached to an instance @@ -61,6 +62,10 @@ type StoredMetadata struct { StartedAt *time.Time // Last time VM was started StoppedAt *time.Time // Last time VM was stopped + // Boot progress markers (derived from guest serial log sentinels and persisted) + ProgramStartedAt *time.Time // Set when guest program handoff/start boundary is reached + GuestAgentReadyAt *time.Time // Set when guest-agent is ready (unless skip_guest_agent=true) + // Versions KernelVersion string // Kernel version (e.g., "ch-v6.12.9") @@ -105,9 +110,10 @@ type Instance struct { StoredMetadata // Derived fields (not stored in metadata.json) - State State // Derived from socket + VMM query - StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown) - HasSnapshot bool // Derived from filesystem check + State State // Derived from socket + VMM query + guest boot markers + StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown) + HasSnapshot bool // Derived from filesystem check + BootMarkersHydrated bool // True when missing boot markers were hydrated from logs in this read } // GetHypervisorType returns the hypervisor type as a string. diff --git a/lib/network/README.md b/lib/network/README.md index c54e66a8..16187c33 100644 --- a/lib/network/README.md +++ b/lib/network/README.md @@ -79,7 +79,7 @@ Hypeman provides a single default network that all instances can optionally conn - Follows instance manager's pattern **Sources of truth:** -- **Running VMs**: Query `GetVmInfo()` from Cloud Hypervisor - returns IP/MAC/TAP +- **Active VMs** (`Running` or `Initializing`): Query `GetVmInfo()` from Cloud Hypervisor - returns IP/MAC/TAP - **Standby VMs**: Read `guests/{id}/snapshots/snapshot-latest/config.json` from snapshot - **Stopped VMs**: No network allocation @@ -352,4 +352,3 @@ Cleanup happens automatically via `t.Cleanup()`, which runs even on test failure - **Integration tests** (TestInitializeIntegration, TestCreateAllocationIntegration, etc.): Require permissions, create real devices All tests run via `make test` - no separate commands needed. - diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 674b6e6d..ceae18e3 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -105,13 +105,14 @@ const ( // Defines values for InstanceState. const ( - InstanceStateCreated InstanceState = "Created" - InstanceStatePaused InstanceState = "Paused" - InstanceStateRunning InstanceState = "Running" - InstanceStateShutdown InstanceState = "Shutdown" - InstanceStateStandby InstanceState = "Standby" - InstanceStateStopped InstanceState = "Stopped" - InstanceStateUnknown InstanceState = "Unknown" + InstanceStateCreated InstanceState = "Created" + InstanceStateInitializing InstanceState = "Initializing" + InstanceStatePaused InstanceState = "Paused" + InstanceStateRunning InstanceState = "Running" + InstanceStateShutdown InstanceState = "Shutdown" + InstanceStateStandby InstanceState = "Standby" + InstanceStateStopped InstanceState = "Stopped" + InstanceStateUnknown InstanceState = "Unknown" ) // Defines values for RestoreSnapshotRequestTargetHypervisor. @@ -727,7 +728,8 @@ type Instance struct { // State Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) - // - Running: VM is actively running (Cloud Hypervisor native) + // - Initializing: VM is running while guest init is still in progress + // - Running: Guest program has started and instance is ready // - Paused: VM is paused (Cloud Hypervisor native) // - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) // - Stopped: No VMM running, no snapshot exists @@ -762,7 +764,8 @@ type InstanceGPU struct { // InstanceState Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) -// - Running: VM is actively running (Cloud Hypervisor native) +// - Initializing: VM is running while guest init is still in progress +// - Running: Guest program has started and instance is ready // - Paused: VM is paused (Cloud Hypervisor native) // - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) // - Stopped: No VMM running, no snapshot exists @@ -13383,113 +13386,114 @@ var swaggerSpec = []string{ "f/2qXF6k6Y1uXSHsV6ObKMMJCngahZp/GuuTZ8QxElqpURKV5z6Dw3rBrhi/ZuWlG92mPr9/pEQs0OXp", "aUmDLsjEpoFqsHBwoajZB57caBt217DYa2dTiILbRORblRIWbqBvHudWVL85N0uDdQ3UcDkn6TWNU2bA", "rfd+xZoqCpSQzEdp6mOQ9CsXaHFxcXJc2nCMD3YO+4dPuofjnYPuftjf6eKdvYPu7iPcn+wFj/dqEi02", - "d425vbdL+YTWBzYB4EEZaWLYwoE+Q5m7yjhVKHNl04fzmeY0UYGlNWE8oB+wvkW6B7hdA/0mWmRc78qP", - "z7A+qO7bBP5a/cX5LFWaDYJv5CxVSP8FU9ZLsFLD6i7MmR+g1xy+Ec4HlPGq+GGag2/VcvOqqNK2Xj/O", - "OxQGswRsgF5kRCsje5bMtSWxPw0ttY7L4JS9VXKNs7tVcPPqtAwIW52Wgwy4gy07htmJeGMeinjjU9YT", - "HAENyx1vUkUj+skcOT11KhUNjLSGYTfrjp1NMUDCkblC68xwxpvDXrPZR+5UX56iNoQP/hlZYU7/tZWZ", - "7IpHaH/3yf6Tg8e7Tw4aBRHkE1xPjZ+Br9Hy5NaS5iBJRy7hbM3Sn51dwOWjLzaZxkY6t2sv+Gwmggea", - "26MM5Rls88Gf9J4UYydCno6jgrbHBl2Bg36TdMM1Nqo/aDSnkwn741Nwtfs3QeOdjwdyd+wVjrKB/Jzk", - "SVFDuSR2kXHXpJPxS4GAUELWRoC8JRJWgM6JQoA/XU2w9I2auQhZlHNxIhbiXsTa39vbO3z8aLcRXtnZ", - "FQ7OCOS/5Vme2hkUjhi0RO235+dou4Bwpk/nN5kIIvXiTDCk95whm9arX3Kp1LLHng9LahiWHGts3/O4", - "FuSXlmOxi7JAB0+njJtZOuVeaO/t9R/vPzp81OwYW4lnJD6upjC2nbX0CxIQOi/tfBu02u+OzpDuXUxw", - "UObwd3b39h8dPD680azUjWalBGYypkrdaGKHjw8e7e/t7jQLZfJprm2QXunAlmmX59B5kMKzGx5QLJPe", - "Tt1t4eMSS7qdFbrjgp/9rsaloqP9UfevxrEejXqD7V/+/L+77//j3/zBVSVdhySiG5IJSDJXZNEFW2bm", - "F4EUnspe2TMJFNyaAbaxSYrgGGK6gitikzPgj8WJP+pnN+niNY6X1rKzewipBrO/167Mnxx0Ca7Lbqsr", - "PWVz19uqn+VNHKvzQHkqoVda8OlFbc2cFhn9QrD3VhP9jf/q0ePUlQfQbHhTn+fVLs5nWM1O2IQvm3xu", - "IkhbxzFnCkg0QykhAXJIGCWhuxMyidryqOCKFkmCwpRYyBmeU2ALcGzMXglWMxAC4EPKpmUn/KUBm4i3", - "Zg6r0yLAuLZhE02c9DstvRMpwMro3CXCuftSIwMClSO/tLbcsSDTNMICVf36V0xZLuKIsqsmvctFPOYR", - "DZD+oKommfAo4tcj/Ur+AmvZarQ6/cEot7hX1B5mctbfwmxIZdx8Cb/oVW5VPL+Ao9o2329D/Zcmik2v", - "Ge6FFoqN6/sFox8LiF6OFd7f7dc5+tV0WnLxWw6buOmdaVHWd+JdRMNRll3NY+41BrWKZqAsX5TW61st", - "WGxXuTUuc1io7XSlLha7DNdCTHQjBqeZ0bhqFXCz2ZYkKI++f/jo8UHDoPSvEmFWVMj4CoFlHq8QVGp2", - "6rQJN3z46PDJk739R092b8R3OgNSzf7UGZGK+1NJoljhhR/14X83mpQxIfmnVGNGKk+olBDx1hP6suLo", - "5sFINdqMVdWp8p106pOyYNNMdFjBLR2VWK5CruA2mUwIKORGBm7dfDIVZ7VGcwhwggOqFh7JGl+D/w7K", - "mlSCahr0XpmsB6S2bxsXqSmXTMe5f0TbDY7+w0jMFVw4bJzbQqbjOun8TXVUI5sbh7ewovlpoHgxGOFz", - "UrjOgImusSxZS/TvQJGwU8gFXTWrmRbNq4o4XM8Ki+SOBr7AMH8RkeL2V7azIM2VmOQqxFddofVHUHME", - "4E3XxHDhuZE90WbBeieXCn2wF+DtvhqNi1lnVqb1KaWoyW/dm4/bLIv18nfmBrv5eAXPiJt8WE3AAfho", - "52BBnvfdKaFEDTYpLtbnVLyDMHpjG7hVIL01K2wklt4+vpP4+aXtOC+4hTV3gnRf+evElQy0B93+Xrd/", - "8G5nb/DoYLCzcxfRG5kxqE5F/vjTzvXjaBdP9qPDxeM/dmaPp7vxnted5TvK5VlJPlxJ7WnXnhBRTblS", - "TVUkSUQZ6crMHLXeMr8iRssoSRO8AOZwhSR3E/HBVWdacdrPy4ssHnqscuBUk8ZuwtHPzn6lDFSd/snx", - "6mnfyr5TnYgfwapTAXxqNhmILNxplo0OTpIXODUT9aFByUGhhJjvV1Cz3+whrqNa1mfezjBP6eEOiLPh", - "ljAhf70Edx+5XZ11pHIhGcNzMclL5q/7bVOOGLeRuqSssauaXEmPRU0dShtojwqNUZvEiVq4oFCnGN66", - "mRvLUdahlxf8xu74/SffIpDwYmXk4A+eHrjoceQGWetrtIQLteE6fi3TcdWb16hybZrDsvdpJXmbVCvK", - "tK4qCW5qc4Oe1obKTdNqLoEblAGv08znJ87VX3V1wNcpnFeaFwsrK8ykfm+Mu9lX1kyn0hVLvyXIrNZ0", - "feyZcdnRLFK3mgfTpGoRFNSwFkAGsBoEmWZ9WX2/2gv2FH/MRgDWCMslNg7WUahR9fIppF966/Ih0onr", - "AqZRrQby9OuKyTusWt6MVdXlnUOj9+BZ+rOCEtadrQpy5mN0Vhew16SLBKmganGuSaX11SdYEHGUGjQE", - "GgqLgMf54BB/+eULaJcnHiXTSy1e0AAdnZ0AlsSYgaUYXZ6iiE5IsAgiYsPnllzdQEB88+yka+J+s6IN", - "UONVAUBcbu2jsxNIz2urq7b6vd0e1LviCWE4oa1Ba6+3AwmINRhgiduQjgF+WvuRPodwA56E9qZ+apro", - "rwSOiYISGr977DCKCJPeQaLxIreY50b0BFNhjedJBCYiIy9Q3QG4/zoqP2gVEhGY2+umd5tUC6s8I8kb", - "u8/vNX7IhDNpdni336+UHMZ5Htftv0lj3cnHb8SCmFrwyz60S64Gjg2ye/Cl09rv79xoPmtTr/qGvWA4", - "VTMu6CcC03x0QyDcatATZjT6roIYsQ3zgwc4VTxyv7/X+yXTOMZi4cCVwyrhso5/IxJhSP44dpVse8gK", - "KRABKGc8jUKoKJOYVPearmKksOhNPyEsghmdkyGz14lJo4sFREjHSCOZ0cyUz4oZ2uy+oUNEqqc8XFSg", - "m3W3rbvrOr4tB/CNazFnEn5SU5TZR+JN6mkZcG/ObcIwU3kmY5Nz+oqAY9qEfvR22MjDUlNA2BYC5Q6y", - "iPvdLb8NEgLI/Ob74+ydKwlevvW0AEFZEKVhzhqUSzF7EzCZ0sA2NfcV8XBSL6GFBUox1s7dwYyHxMQ/", - "JQs148z8TscpU6n5PRb8WhKhb2obP21hbfPSWtSFegk0hhhmk6lFj7ltprj9+YosvvSG7CiMXWYdW4gJ", - "R5LbnOXGk5VKlJUDA9ytqeTvlwie2domJl9wMcWqmSZPVZKqHjILIcoGfUNzyMArZyQcMsXRZ2GKLyy+", - "bH/OR/wCLDbBocaTQhOzpO3PNPxSN2s5wnr1I2jqETwIAGDY0jfNsKV/TwXWLHYqZwgH4G+rHxa3tG0O", - "NhfAvmxVIRxghhKepJFmBgGpTCr2Uh+QQANHEVJwlNy3mimCnaxZjzUn+7JEWluyMf5VjhHkiywcpv7+", - "4VZrXcWFcvf/ff7mNTIMkd6FssPbkD03DNgAfR6Cg9uwNRg6F7dhqzNsETaHZ9YPbtj64l+hJIEgPq0A", - "TAAuS6iZDs3ysFLYJcpgeq78ll6/nhoOZm7mMyzRsEXDYSuvcb0F0EqlVbd3u8AL/qJn9osZpkPDX3q9", - "4ip//2x6GejTnMQjxa8IG7a+dFDhxZSqWTrO3r2vWXCNUfC8RIpQ29w+Wy4Zk15h4SI2NxdmIeKW2kcL", - "hFFOA4vKhzFlWCzqCs3zVNU7rJtcVbZZjlEH/f7WescZu1QPg11qqM/ilyVWbPebcSGWA1vmQmwtfRsa", - "o4FpC9oD77UBNugpDl2ei5/83hp+z0rbBU4Ovi9eCgZ9I2LUoxV2TIvnkWPHVsouBi0gNgxEEefmZiQR", - "6ti5HHmLMklVBF2WMfbrTlkAU4wc/u1vAP9g3DzjP4z7ZFPj4sjUqXL5rx8WOsJmOUTs+OXll0R9DxjX", - "3xQpdYVJ7hF/Hwr+vCSWCcyBVqFm21Dzs6iMqcYwC4JjaXsxjbXgeg5z6p4TptBzeNqz/zrxB8JDP0R8", - "+mGADAgjPkURZURaF4TM2qEvRQtL+Mikbcy+s1lQgxlmUyJR29yf//z7P2BSlE3/+fd/aNba/ILjvm3c", - "2yGC8sOMYKHGBKsPA/QbIUkXR3RO3GIg5InMiVigvb6tcgyvPDlV5ZAN2VuiUsFk5tiu1wUwMR3aEh96", - "PZSlRCIJIIQCdhPrcW2Uoh553p1lA8qNnujOkgBmV1BYgL4VHQ6ACx1lVFEcWWGs5VermTWXlGpV/e6S", - "xn89fVHkozLY2zUTvCGBARD7zh28sItG7fPz51s9BOy+wQrwqge5Ie/GSgK9nzRpPU0yFKVMUADKhjYV", - "0unXaoePbZtm6mHb4w+tH64rGFCvIDYKESJI6AD4U3hooiz2w80pjn3a22NXX7BefXv79RaHcH6KjSTj", - "b7fPDveWYW6LZ+Yguw+ZGLVt3bMsp2WpQud9If1GrpFCQdjsLkHcZNLcmJz2jLNJRAOFum4ukG4jJpns", - "VkaQh0IO3tpZI+zWVQ1oLV5426X4jNqrLwvVyO/Au789KoPe5BrJg25zXPt5k6xDnWMqA66/LWBLN8CJ", - "zehp+JnsnBaxaJ2G6hieZ1fOSv7pOCsVbQ/k5nRVduiUVe+GDRDF4wpBvEdCWMk2WAhTf0jYfJHtoqul", - "vEKV9X2hZn9zXNCm1Vo+NH9Ieq2wAjZNBWdZTas69LJVr+5wo+0InoWfE+FOtZmoyXKXL8t8ioIZCa7M", - "gmzJ71UcwYmrCt5EFjb9/dCisKk/dgMWxu7BT56lgfSbw2qVxHti8zfencALI9xI3v12lmCLYB4gg2/K", - "2Om0TWpELBcs2PqhjMEbud6qdcYf0Ek6S6PI2UTmRKi8mlrxUtj+DF5M65l9d9pW3g8Xb191CQs4uK1l", - "Lld+rsoVQfq2LL/ZMLOUn2jSREgEUDnEqOeov2L/jXchyjLq//vuC5tT/993X5is+v++d2Ty6m/dGbL0", - "N0WaN82CP2Dk0xw4LQMNSJMpVbSOZc1aNeRaXfsfm3G1te1uwrpmgP7JvTbhXovgWsnAZmUG75CFtdXb", - "7sdokyGbD9rwyrk0/mCs62b1gBYjXc4OKsuGEZuUkYu8YpotH/7wfC5phnHFe6ShQjs/kCvvE4e6J8cd", - "WwzPlLDLAkw2pN5289g4t2vH3bxu+yge02nKU1mMXYHah0TaYKeIlAnwQ+PD8+u5lhP/jrG0v8mrY+OM", - "9k+8vyMRoLqhhngbG9U6IcC1aioE2PZQZdAUvjCxb29dQQ2bXGSrxg/RlYtpisalakXL/pG+edUKJ+hC", - "iy+5zIBAjBgM2X+5T35XBMfvf3HhTWm/v3uQvSNs/v4XF+XETh3eEKYEJRJhQdDR62OwEk4hNh5yh+Xx", - "fdX5mIxgpra0LXv6Ly055UbT5qKTQ8+folMj0akArtWiU1bY5S5lJzPIvQlPDt98ALdJPH6KT5sQn2Q6", - "mdCAEqby5LlL/mU29/YDjFNj1pJU8Asp3cCNxae82tJqzjTP/LZxn6Bs8M1LTS7J3MP0t+cmwiZ0ckp+", - "GdYLKt8bPvQ3S5w3L6A8ZBQzkkAVdMuEaHtic/f6GYQXXFw1xTxPKspvjoDfnjsprvA75E309CBtyf2z", - "KHB5G7d8jTRlzmUDB3Ipv+h9eoM6SFip1wRYUjbNamReUzXjqUnXMrIPTf43fSpsIRZgeQLb632TFz36", - "BhjQ11whGicRiQnkh+sabILipGmScJGVRKOykI33ZuRPH5uib67JmmMrA3eQzVkMWrysqCko9Je3y0s1", - "Iz5dH6CbDe6iUT0RukN2IU32mA+GFf6AMiKLFEeSRCRQ6HpGgxlE6+pn0L8J5sVJ8iFLz7E1QC/hpBYT", - "hsDgbUkExREUnuSRqZn6YR7HHwbLueYuT0/hIxOoa7LKfRggl18uuyCkblWMvtWriLBU6LWNKW5rTBI8", - "isyOftC3UGF9WzYuN89kMmS+GF1Grm2HdII+FMJ1P9TE6zqC+krv0j3xS536DFhmLYojAYAzuElYWKMj", - "01DzR+ru9L0pUxtGDZtp3HHQ8NJkXvFpln2rhMo4SZqir50mYPE8jlfgMGoX8nlLFfJU/VmqkAgBH1vs", - "rkNu1MaB+UPhK42ozFbxchnRAf28ek2TAccLKk1UC+mXzV/zOG51WnY+noK+Xx99Xe1wWc2md6YQYv2T", - "075J8HSZ2Beipys3hy35UM9y20oWP7y850pu3zMa3oN+LJ8FZY5Vgb3Na5k/rKBLU+SkyouZXPO+M5JV", - "Sak/JWWl8nme1f5fUEQ1a62WttmwkJqB2CeZlSo83Lt0mhWc+CmhZhIqFyhMzXCVki8/rNiZERSUspLk", - "adnT28qeWcK6DMxQwo+tNAjkNG/7s/t5cgt24TuhhJ3aIil1qZHyRX8PJLemnFgjmntPfJK9VgsMwj2S", - "YFfYbNMUOIOKFvcyKvddkGFz4DJqXKQ5SmAmqatZ+JMYl9SARlN6W2LsmM8lXWCBPFPWTSJcR5ctn1pL", - "gG3RpB9eXstllR9cYgu4EMadDLzUHlKQY8FmWBA92wlOJelkB6bj7NaXp6dbdYdGqJVHRnwfBu3bcQ6V", - "ipZx6C8pLGjost8/Oz22ufKpRCJlPfQmppCS/oqQBNJbUp5KBP6AvWKVs5pKv3kZM8KUWCScMrV2FnnT", - "u5nMl1sl/N4wnbJh3j+8WsnWqH1oRApoh7697QJWC1XKFPfzmumc2YoykzBfMx94zFPd+1LlNTShEZEL", - "qUhsbHaTNIJDBJlBbCZZ+53xXesgqiQU3u6Ar09CREylpJzJIRuTieZKEiL02FCfkUakYH7wWbbOFc6o", - "5pkhfd+HaQuKsYE1B6s6qJXrsOEkcXXYfOaTrHTcraf0AmxVSC7iMY9ogCLKriRqR/TK8OBoLlGkf2yt", - "NHaN4LtvnSf39idLQ/qETbg3c6DB2QyZfwQKd1Iha86Y/+DI2ktSPCyO/sBG+8maXEvXBMERlB7N3GxR", - "qmhEPxlSpzuhUtHAFGPCGeygjowZrzdkp0QJ3QYLggIeRSRQTtewnQgebA/Tfn8vSCjER+wRmBwQvPrX", - "MYz47OwC2plaN50h039Ax++OzhDVMJ1gKzIXJmprwqOT7TdrzP/nAKZ/YXnMLHDVsfBv+E/L7s19KGvP", - "kKw5ojxZJQDx5IdXGFgO7qe24GFqC8CJPVtNeypwAEyxnKUq5NfMrxkwtVjl9mfz42RdKITCwezSFZr+", - "PrhdW5d23TBugQ/iUNo1hcRkNr0Xfb0tHfxAEz9pwLklABNTDOrw3wKmJPmPht3f3lhXhON3aKmzEHVZ", - "g7+bs7Xpm8/OwUX4FeHxUI65wTS3EqhEWdQ+ZeGMa2WzIBWCMAU5YnLWMsAJDqhadBCOXJlWW2op0yHl", - "JefHguArfdP2huxtFkhpSz1p6arjRCsUUnllerDSUw+9mRMh03E2OQSEych5AHxbqTXAUWBKnJLJhASK", - "zompPSprpK9sKneZ0TcfxLPR7qUF3UMTOfw4AbuXo4WVOkqecrV5Hc6zVs3yOmS9FrxhCp4iK32eR67h", - "CG6im6jsPINf0Vq3ePvqZt5rv+mPGo5d9pLyT8K++spV/rD5884L3ipNs0DkKP/QEjIUZl46uyWPr/WR", - "4Y1dvO7S5WpdZHg2+KYjw8+9Xj8PLHEVLvlx1YWEf3+I0N+su/GmQ8IfNm5p3kIuga6eEjUIDf8uMPBu", - "YsLv2d3+FjHh35UDKMT03p8j/nfl+mldGDPXz59R33fp8WlCvyHCtc7j01A9q4peKTld2jbN5Cbb4w/N", - "0lt15g0YercPP5O6NZAhCsBy13KF/sBlIO0JIHGiFk5fxSfgmZNnIJT0E/j3+ULrMrX03UW03UJj++3Q", - "w+Fprb72ZzK4jamE81TaJ8cPPwNc8cyVbpptfQ11sQhmdF6K6Fp1gi2IEkG6CU9AExsagFl4uMtNYdGb", - "fkK2+96QvZsR9xeiLp8GCVFIBQlUtECUKQ4UwYzxJ4kE16IBvOdi4VPwFk/uC8HjI7uaNRekPVNWXZY7", - "AsaLrr61unNHbVYo2b7CqHWKP9I4jYHgIcrQy6eoTT4qYdI7oIkWhRCdZCAlHwNCQgk4uVWc8E6/RvdJ", - "P5HRdNxklisSdbyxiVBQkErFY7f3J8eojVPFu1PC9F5o3n8CrG0i+JyGJr1uDtQ5jwxUd2oAelPNrGMu", - "kMJTaV3Hc7HDzPLeGZsmt9T0E03KtMJ4S7YGrTFlGCa6Nk9G+aAZx109HqbgPpcfKIdOrZ/3WrWuN2AT", - "FxkQFeco0nz/1s+77yHffUUHCHfRla7AZslPm/lENHRVuIvEp5m/zGaV25ffjxm/UAf5ASrY55mUWqdc", - "/75QsL+5+2HTSvXLB+z29ZI4ibygUIcOdI8+hHnFAxyhkMxJxJNY85qmbavTSkXUGrRmSiWD7e1It5tx", - "qQaH/cN+68v7L/8/AAD//9ibNZqCHgEA", + "d425vbdL+YTWBzYB4EEZaWLYwoE+Q5m7yjhVKHNl04fzmeY0UYGlNWE8oB84YVRRHNFPlE11NyCuW473", + "eqbXBPHCiDKq9DsTwEuZXjLoRXQn1kFpgF5CW3iFYwgvcpPQck5ZJYDDhVGJ6sPuhk7gr9VTPp+lSrNS", + "8I2cpQrpv2DZGgxW8ljdhaEbA/SawzfC+ZEyXhVhTHPwz1puXhV32tZzyHmYwmCWCA7Qi4zwZaTTksq2", + "JPanocfW+Rkcu7dK7nV2x1saW/KdK3iOdVoGoq1OywEKPMyWfc3svLxhFEVU9On/CY6ALOa+PKmiEf1k", + "TrFeCZWKBkYAxLC5dSfZZi0g4cjcynWWPeMgYm/u7CNHKC5PURsiEv+MrHyo/9rKrIDFU7m/+2T/ycHj", + "3ScHjeIS8gmuJ/DPwH1peXJrqX2QpCOXw7Zm6c/OLuA+03elTGMj8Nu1F9xAE8EDzUBShvKkuPngT3pP", + "iuEYIU/HUUGBZOO4wOe/SQbjGrPXHzSa08mE/fEpuNr9m6DxzscDuTv2ylvZQH7m9KSo9FyS5Mi4azLU", + "+AVLQCgha4NK3hIJK0DnRCHAny7CAVzSmdeRRTkXemIh7kWs/b29vcPHj3Yb4ZWdXeHgjECkXJ7lqZ1B", + "4YhBS9R+e36OtgsIZ/p0rpiJIFIvzsRXes8ZspnC+iUvTS3O7PmwpIYHyrHG9j2Pa0F+aZkguygLdHCe", + "yhikpVPuhfbeXv/x/qPDR82OsRWiRuLjagpj21nnAUECQuelnW+Dovzd0RnSvYsJDspCw87u3v6jg8eH", + "N5qVutGslMBMxlSpG03s8PHBo/293Z1m0VE+ZbiN+ysd2DLt8hw6D1J4dsMDimXS26m7LXyMZ0ldtEId", + "XXDd39W4VPTdP+r+1fjqo1FvsP3Ln/939/1//Js/XqukPpFEdEMyAeHoiiy6YB7NXC2QwlPZKzs7gc5c", + "89Q23EkRHEOYWHBFbL4H/LE48Uf97CZdvMbx0lp2dg8he2H299qV+fONLsF12RN2pfNt7s1bdd28ia92", + "HntPJfRKC27CqK353aLsUIgf32qiEvJfPXqcuooDmrNv6ka92mv6DKvZCZvwZSvSTWRz64vmrAuJ5i8l", + "5FQOCaMkdHdCJqRblhW82yJJUJgSCznDggpsAY6NJS3BagZyBXxI2bTs1780YBOJ2cxhdaYFGNc2bKLc", + "k34/qHciBVgZNb5EOPeIamSToHLkFwCXOxZkmkZYoGqowIopy0UcUXbVpHe5iMc8ogHSH1Q1LxMeRfx6", + "pF/JX2AtW41Wpz8Y5Ub8iibFTM66cJgNqYybL+EXvcqtijMZcFTb5vttKCnTRFfqtey90DKp8aa/YPRj", + "AdHL4cf7u/0638GaTkteg8uRGDe9My3K+k68C5I4yhK2eSzIxkZXUTaU5YvSen2rBSPwKk/JZQ4LtZ36", + "1YV3l+FaCLNuxOA0s0NXDQ1uNtuSBOXR9w8fPT5oGOf+VSLMiqIbXyGwzOMVgkrNTp024YYPHx0+ebK3", + "/+jJ7o34TmeTqtmfOrtUcX8qeRkrvPCjPvzvRpMyVin/lGosU+UJlXIs3npCX1Yc3Ty+qUabsargVb6T", + "Tn1SFmyaiQ4ruKWjEstVSD/cJpMJCRSdk5GBWzefTMX/rdEcApzggKqFR7LG1+AShLImlTidBr1XJusB", + "qe3bhlpqyiXTce5y0XaDo/8wEnMFFw4bp8uQ6bhOOn9THdXI5saHLqxofhooXgxG+PwerjNgomssSwYY", + "/TtQJOwU0ktXLXWmRfNCJQ7Xs1olue+CL9bMX5ekuP2V7SxIcyUmuQrxVVdo/RHUHAE46DWxhXhuZE8A", + "W7Deb6ZCH+wFeLuvRuNiIpuVmYJKWW/yW/fm4zZLjL38nbnBbj5ewdniJh9Wc3oAPto5WJDnfXdKKFGD", + "TYqL9Wka7yAy35gKbhWbb60MGwnPt4/vJCR/aTvOC55mzf0q3Vf+0nMlm+9Bt7/X7R+829kbPDoY7Ozc", + "RUBIZhuqU5E//rRz/TjaxZP96HDx+I+d2ePpbrzn9ZD5jtKDVvIZV7KF2rUnRFSzuFSzH0kSUUa6MjNH", + "rTf2rwj7MkrSBC+AOVwhyd1EfHAFn1ac9vPyIouHHqscONU8tJvwHbSzXykDVad/crx62rey71Qn4kew", + "6lQAn5pNBoIVd5oluIOT5AVOzUR9aFDyeSgh5vsV1Ow3e4jrqJZ1w7czzLOEuAPibLglTMhfL8HdR25X", + "JzKpXEjGDl3MG5O5AH/bLCbGE6Uuz2vsCjFXMm5RU9rSxu6jQmPUJnGiFi7O1CmGt27mGXOUdejlBb+x", + "h3//ybeITbxYGYz4g2ccLjoxuUHWui8t4UJtBJBfy3RcdRA2qlybObHs0FrJByfVisqvq6qMm3LfoKe1", + "0XfTtJqe4AaVxes08/mJcx49rrT4OoXzSvNiYWWFmdTvjfFg+8oy7FS6+uu3BJnVmq4PZzMePJpF6lZT", + "a5rsL4KCGtYCyABWgyDTrC+r71c71p7ij9kIwBphucTGwToKZa9ePoWMTm9dikU6cV3ANKoFRp5+XX16", + "h1XLm7GqYL3zkfQePEt/VlDCurNVQc58jM7qmviadJEgFVQtzjWptO7/BAsijlKDhkBDYRHwOB8cQjq/", + "fAHt8sSjZHqpxQsaoKOzE8CSGDOwFKPLUxTRCQkWQURsRN6S5xsIiG+enXRNKHFWBwLKxioAiEvXfXR2", + "Ahl/bcHWVr+324MSWjwhDCe0NWjt9XYgp7EGAyxxGzI8wE9rP9LnEG7Ak9De1E9NE/2VwDFRUJXjd48d", + "RhFhMkZINF7kFvPciJ5gKqzxPInARGTkBao7AI9iR+UHrUJuA3N73fRuk2phlWckeWP3+b3GD5lwJs0O", + "7/b7lSrGOE8Nu/03aaw7+fiNWBBTXn7ZLXfJ1cCxQXYPvnRa+/2dG81nbTZX37AXDKdqxgX9RGCaj24I", + "hFsNesKMRt8VJSO2YX7wAKeKR+7393q/ZBrHWCwcuHJYJVzW8W9EIgz5JMeuOG4PWSEFggrljKdRCEVq", + "EpM9X9NVjBQWveknhEUwo3MyZPY6MZl5sYCg6xhpJDOamfJZMUOb3Td0iEj1lIeLCnSz7rZ1d13Ht+UA", + "vnF550zCT2rqPPtIvMlmLQPuTeNNGGYqT45s0lhfEXBMm9CP3g4beVhqCgjbQqCCQhbEv7vlt0FCTJrf", + "fH+cvXNVxsu3nhYgKAuiNMxZg3J1Z29OJ1Nt2Gb7viIeTuoltLBAKYbvuTuY8ZCYkKpkoWacmd/pOGUq", + "Nb/Hgl9LIvRNbUOyLaxtqluLulCCgcYQFm2Sv+gxt80Utz9fkcWX3pAdhbFL1mNrO+FIcpsG3XiyUomy", + "CmOAu/6gwRoJ/5ktl2JSEBeztppp8lQlqeohsxCibBw5NIekvnJGwiFTHH0Wpp7D4sv253zEL8BiExxq", + "PCk0MUva/kzDL3WzliOsVz+Cph7BgwAAhi190wxb+vdUYM1ip3KGcAD+tvphcUvb5mBzAezLVhXCAWYo", + "4UkaaWYQkMpkdy/1ATk5cBQhBUfJfauZItjJmvVYc7Iv8aS1JRvjX+UYQQrKwmHq7x9utdYVcSh3/9/n", + "b14jwxDpXSg7vA3Zc8OADdDnITi4DVuDoXNxG7Y6wxZhc3hm/eCGrS/+FUoSCOLTCsAE4LKEMuzQLI9U", + "hV2iDKbnKnrp9eup4WDmZj7DEg1bNBy28rLZWwCtVFp1e7cLvOAvema/mGE6NPyl1yuu8vfPppeBPs1J", + "PFL8irBh60sHFV5MqZql4+zd+5oF1xgFz0ukCLXN7bPl8jvpFRYuYnNzYRYibql9tEAY5TSwqHwYU4bF", + "oq52PU9VvcO6SX9lm+UYddDvb613nLFL9TDYpYb6LH5ZYsV2vxkXYjmwZS7Elue30TYamLZGPvBeG2CD", + "nuLQpc74ye+t4festF3g5OD74qVg0DciRj1aYce0eB45dmyl7GLQAsLNQBRxbm5GEqGOncuRtyiTVEXQ", + "ZRljv+6UBTDFyOHf/gbwD8bNiwjAuE82NS6OTOkrl1L7YaEjbJZDxI5fXn5J1PeAcf1NkVJX6+Qe8feh", + "4M9LYpnAHGgVarYNZUSLyphqWLQgOJa2F9NYC67nMKfuOWEKPYenPfuvE38g4vRDxKcfBsiAMOJTFFFG", + "pHVByKwd+lK0sISPTCbI7DubWDWYYTYlErXN/fnPv/8DJkXZ9J9//4dmrc0vOO7bxr0dAio/zAgWakyw", + "+jBAvxGSdHFE58QtBkKeyJyIBdrr28LJ8MqTplUO2ZC9JSoVTGaO7XpdABPToa0aotdDWUokkgBCqIk3", + "sR7XRinqkefdWTag3OiJ7iwJYHYFhQXoW9HhALjQURNVaoWxll+tZtZcUqpV9btLGv/19EWRj8pgb9dM", + "8IYEBkDsO3fwwi4atc/Pn2/1ELD7BivAqx7khrwbKwn0ftKk9TTJUJQyQQEoG9pUyNBfqx0+tm2aqYdt", + "jz+0friuBkG9gtgoRIggoQPgT+GhibLYDzenOPZpb49dycJ69e3t11scwvkpNpKMv90+O9xbhrmtx5mD", + "7D5kYtS2pdSyNJmlop/3hfQbuUYKNWazuwRxk5xzY3LaM84mEQ0U6rq5QPaNmGSyWxlBHgo5eGtnjbBb", + "VzWgtXjhbZfiM2qvvixUI78D7/72qAx6k2skD7rNce3nTbIOdY6pDLj+toAt3QAnNkmo4Weyc1rEonUa", + "qmN4nl05K/mn46z6tD2Qm9NV2aFTVr0bNkAUjysE8R4JYSWBYSFM/SFh80W2i6488wpV1veFmv3NcUGb", + "Vmv50Pwh6bXCCtg0FZxlZbLq0MsW0rrDjbYjeBZ+ToQ71WaiJnFevizzKQpmJLgyC7JVxFdxBCeu0HgT", + "Wdj090OLwqak2Q1YGLsHP3mWBtJvDqtVEu+JTQl5dwIvjHAjeffbWYItgnmADL4pY6fTNtkWsVywYOuH", + "MgZv5Hqrli5/QCfpLI0iZxOZE6HyAm3FS2H7M3gxrWf23WlbeT9cvH3VJSzg4LaWuVz5uSpXV+nbsvxm", + "w8xSfqJJEyERQOUQo56j/or9N96FKEvS/++7L2ya/n/ffWES9f/73pFJ1b91Z8jS3xRp3jQL/oCRT3Pg", + "tAw0IE2m+tE6ljVr1ZBrde1/bMbVlsu7CeuaAfon99qEey2CayUDm1UuvEMW1haEux+jTYZsPmjDK+fS", + "+IOxrpvVA1qMdDk7qCwbRmxSRi7yImy2IvnD87mkGcYV75GGCu38QK68Txzqnhx3bH09UxUvCzDZkHrb", + "zWPj3K4dd/O67aN4TKcpT2UxdgXKKRJpg50iUibAD40Pz6/nWk78O8bS/iavjo0z2j/x/o5EgOqGGuJt", + "bFTrhADXqqkQYNtD4UJTB8PEvr119TVscpGtGj9EV4GmKRqXCiAt+0f65lUrnKALLb7kMgMCMWIwZP/l", + "PvldERy//8WFN6X9/u5B9o6w+ftfXJQTO3V4Q5gSlEiEBUFHr4/BSjiF2HjIHZbH91XnYzKCmXLVtpLq", + "v7TklBtNm4tODj1/ik6NRKcCuFaLTllhl7uUncwg9yY8OXzzAdwm8fgpPm1CfJLpZEIDSpjKk+cu+ZfZ", + "3NsPME6NWUtSwS+kdAM3Fp/yakurOdM889vGfYKywTcvNbkkcw/T356bCJvQySn5ZVgvqHxv+NDfLHHe", + "vIDykFHMSAJV0C0Tou2Jzd3rZxBecHHVFPM8qSi/OQJ+e+6kuMLvkDfR0yOFwov3yKLA5W3c8jXSlDmX", + "DRzIpfyi9+kN6iBhpV4TYEnZNC+9SdWMpyZdy8g+NPnf9KmwhViA5Qlsr/dNXvToG2BAX3OFaJxEJCaQ", + "H65rsAnqnaZJwkVWEo3KQjbem5E/fWyKvrkma44tNtxBNmcxaPHchrVBob+8XV6qGfHp+gDdbHAXjeqJ", + "0B2yC2myx3wwrPAHlBFZpDiSJCKBQtczGswgWlc/g/5NMC9Okg9Zeo4tV7+1mDAEBm9LIiiOoPAkj0wJ", + "1Q/zOP4wWM41d3l6Ch+ZQF2TVe7DALn8ctkFIXWrYvStXkWEpUKvbUxxW2OS4FFkdvSDvoUK69uycbl5", + "JpMh88XoMnJtO6QT9KEQrvuhJl7XEdRXepfuiV/q1GfAMmtRHAkAnMFNwsIaHZmGmj9Sd6fvTZnaMGrY", + "TOOOg4aXJvOKT7PsWyVUxknSFH3tNAGL53G8AodRu5DPW6qQp+rPUoVECPjYYncdcqM2DswfCl9pRGW2", + "ipfLiA7o59Vrmgw4XlBpolpIv2z+msdxq9Oy8/EU9P366Otqh8tqNr0zhRDrn5z2TYKny8S+ED1duTls", + "yYd6lttWsvjh5T1Xgfue0fAe9GP5LChzrArsbV7a/GEFXZoiJ1VezOSa952RrEpK/SkpK5XP86z2/4Ii", + "qllrtbTNhoXUDMQ+yaxU4eHepdOs4MRPCTWTULlAYWqGq5R8+WHFzoygoJSVJE/Lnt5W9swS1mVghhJ+", + "bKVBIKd525/dz5NbsAvfCSXs1BZJqUuNlC/6eyC5NeXEGtHce+KT7LVaYBDukQS7wmabpsAZVLS4l1G5", + "74IMmwOXUeMizVECM0ldzcKfxLikBjSa0tsSY8d8LukCC+SZsm4S4Tq6bPnUWgJsiyb98PJaLqv84BJb", + "wIUw7mTgpfaQghwLNsOC6NlOcCpJJzswHWe3vjw93ao7NEKtPDLi+zBo345zqFS0jEN/SWFBQ5f9/tnp", + "sc2VTyUSKeuhNzGFlPRXhCSQ3pLyVCLwB+wVq5zVVPrNy5gRpsQi4ZSptbPIm97NZL7cKuH3humUDfP+", + "4dVKtkbtQyNSQDv07W0XsFqoUqa4n9dM58xWlJmE+Zr5wGOe6t6XKq+hCY2IXEhFYmOzm6QRHCLIDGIz", + "ydrvjO9aB1ElofB2B3x9EiJiKiXlTA7ZmEw0V5IQoceG+ow0IgXzg8+yda5wRjXPDOn7PkxbUIwNrDlY", + "1UGtXIcNJ4mrw+Yzn2Sl4249pRdgq0JyEY95RAMUUXYlUTuiV4YHR3OJIv1ja6WxawTffes8ubc/WRrS", + "J2zCvZkDDc5myPwjULiTCllzxvwHR9ZekuJhcfQHNtpP1uRauiYIjqD0aOZmi1JFI/rJkDrdCZWKBqYY", + "E85gB3VkzHi9ITslSug2WBAU8CgigXK6hu1E8GB7mPb7e0FCIT5ij8DkgODVv45hxGdnF9DO1LrpDJn+", + "Azp+d3SGqIbpBFuRuTBRWxMenWy/WWP+Pwcw/QvLY2aBq46Ff8N/WnZv7kNZe4ZkzRHlySoBiCc/vMLA", + "cnA/tQUPU1sATuzZatpTgQNgiuUsVSG/Zn7NgKnFKrc/mx8n60IhFA5ml67Q9PfB7dq6tOuGcQt8EIfS", + "rikkJrPpvejrbengB5r4SQPOLQGYmGJQh/8WMCXJfzTs/vbGuiIcv0NLnYWoyxr83ZytTd98dg4uwq8I", + "j4dyzA2muZVAJcqi9ikLZ1wrmwWpEIQpyBGTs5YBTnBA1aKDcOTKtNpSS5kOKS85PxYEX+mbtjdkb7NA", + "SlvqSUtXHSdaoZDKK9ODlZ566M2cCJmOs8khIExGzgPg20qtAY4CU+KUTCYkUHROTO1RWSN9ZVO5y4y+", + "+SCejXYvLegemsjhxwnYvRwtrNRR8pSrzetwnrVqltch67XgDVPwFFnp8zxyDU29/Zuo7DyDX9Fat3j7", + "6mbea7/pjxqOXfaS8k/CvvrKVf6w+fPOC94qTbNA5Cj/0BIyFGZeOrslj6/1keGNXbzu0uVqXWR4Nvim", + "I8PPvV4/DyxxFS75cdWFhH9/iNDfrLvxpkPCHzZuad5CLoGunhI1CA3/LjDwbmLC79nd/hYx4d+VAyjE", + "9N6fI/535fppXRgz18+fUd936fFpQr8hwrXO49NQPauKXik5Xdo2zeQm2+MPzdJbdeYNGHq3Dz+TujWQ", + "IQrActdyhf7AZSDtCSBxohZOX8Un4JmTZyCU9BP49/lC6zK19N1FtN1CY/vt0MPhaa2+9mcyuI2phPNU", + "2ifHDz8DXPHMlW6abX0NdbEIZnReiuhadYItiBJBuglPQBMbGoBZeLjLTWHRm35CtvvekL2bEfcXoi6f", + "BglRSAUJVLRAlCkOFMGM8SeJBNeiAbznYuFT8BZP7gvB4yO7mjUXpD1TVl2WOwLGi66+tbpzR21WKNm+", + "wqh1ij/SOI2B4CHK0MunqE0+KmHSO6CJFoUQnWQgJR8DQkIJOLlVnPBOv0b3ST+R0XTcZJYrEnW8sYlQ", + "UJBKxWO39yfHqI1TxbtTwvReaN5/AqxtIvichia9bg7UOY8MVHdqAHpTzaxjLpDCU2ldx3Oxw8zy3hmb", + "JrfU9BNNyrTCeEu2Bq0xZRgmujZPRvmgGcddPR6m4D6XHyiHTq2f91q1rjdgExcZEBXnKNJ8/9bPu+8h", + "331FBwh30ZWuwGbJT5v5RDR0VbiLxKeZv8xmlduX348Zv1AH+QEq2OeZlFqnXP++ULC/ufth00r1ywfs", + "9vWSOIm8oFCHDnSPPoR5xQMcoZDMScSTWPOapm2r00pF1Bq0Zkolg+3tSLebcakGh/3DfuvL+y//PwAA", + "//8nbAkO1R4BAA==", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/system/README.md b/lib/system/README.md index 6455d0a0..66377252 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -70,8 +70,14 @@ It replaces the previous shell-based init script with cleaner logic and structur - ✅ Hand off to systemd via chroot + exec (systemd mode) **Two boot modes:** -- **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. -- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 +- **Exec mode** (default): Init chroots to container rootfs, starts guest-agent, enforces a strict guest-agent readiness gate (10s), then launches the entrypoint as a child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. +- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, emits handoff marker, then execs /sbin/init so systemd becomes PID 1. + +**Boot progress sentinels:** Init and guest-agent emit machine-parseable markers to serial console: +- `HYPEMAN-PROGRAM-START ts=... mode=...` +- `HYPEMAN-AGENT-READY ts=...` + +Host state derivation uses these sentinels to report `Initializing` until both required markers are present (or until `skip_guest_agent=true` bypasses the agent marker requirement). **Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to hypervisor shutdown and then force-kills the hypervisor process if still needed. @@ -179,7 +185,7 @@ lib/system/init/ network.go # Network configuration headers.go # Kernel headers setup for DKMS volumes.go # Volume mounting - mode_exec.go # Exec mode: chroot, run entrypoint, wait on guest-agent - mode_systemd.go # Systemd mode: chroot + exec /sbin/init + mode_exec.go # Exec mode: chroot, strict agent gate, run entrypoint + mode_systemd.go # Systemd mode: chroot + handoff marker + exec /sbin/init logger.go # Human-readable logging to hypeman operations log ``` diff --git a/lib/system/guest_agent/main.go b/lib/system/guest_agent/main.go index b0fe125c..23e564f1 100644 --- a/lib/system/guest_agent/main.go +++ b/lib/system/guest_agent/main.go @@ -2,6 +2,8 @@ package main import ( "log" + "os" + "path/filepath" "time" pb "github.com/kernel/hypeman/lib/guest" @@ -9,6 +11,11 @@ import ( "google.golang.org/grpc" ) +const ( + readySentinelPrefix = "HYPEMAN-AGENT-READY" + defaultReadyFilePath = "/run/hypeman/guest-agent-ready" +) + // guestServer implements the gRPC GuestService type guestServer struct { pb.UnimplementedGuestServiceServer @@ -34,6 +41,10 @@ func main() { defer l.Close() log.Println("[guest-agent] listening on vsock port 2222") + log.Printf("[guest-agent] %s ts=%s", readySentinelPrefix, time.Now().UTC().Format(time.RFC3339Nano)) + if err := writeReadyFile(); err != nil { + log.Printf("[guest-agent] warning: failed to write readiness file: %v", err) + } // Create gRPC server grpcServer := grpc.NewServer() @@ -44,3 +55,14 @@ func main() { log.Fatalf("[guest-agent] gRPC server failed: %v", err) } } + +func writeReadyFile() error { + path := os.Getenv("HYPEMAN_AGENT_READY_FILE") + if path == "" { + path = defaultReadyFilePath + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + return os.WriteFile(path, []byte(time.Now().UTC().Format(time.RFC3339Nano)+"\n"), 0644) +} diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index dd253332..51a5b5ec 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -13,6 +13,11 @@ import ( "github.com/kernel/hypeman/lib/vmconfig" ) +const ( + guestAgentReadyFilePath = "/run/hypeman/guest-agent-ready" + guestAgentReadyTimeout = 10 * time.Second +) + // runExecMode runs the container in exec mode (default). // This is the Docker-like behavior where: // - The init binary remains PID 1 @@ -45,13 +50,25 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { if cfg.SkipGuestAgent { log.Info("hypeman-init:setup", "skipping guest-agent (skip_guest_agent=true)") } else { + // Clear stale readiness marker from previous runs. + _ = os.Remove(guestAgentReadyFilePath) + log.Info("hypeman-init:setup", "starting guest-agent in background") agentCmd = exec.Command("/opt/hypeman/guest-agent") - agentCmd.Env = buildEnv(cfg.Env) + agentCmd.Env = append(buildEnv(cfg.Env), "HYPEMAN_AGENT_READY_FILE="+guestAgentReadyFilePath) agentCmd.Stdout = os.Stdout agentCmd.Stderr = os.Stderr if err := agentCmd.Start(); err != nil { log.Error("hypeman-init:setup", "failed to start guest-agent", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) + } + + // Strict startup gate: do not launch the guest program until agent is ready. + if err := waitForGuestAgentReady(guestAgentReadyFilePath, guestAgentReadyTimeout, agentCmd); err != nil { + log.Error("hypeman-init:setup", "guest-agent readiness timeout; not launching entrypoint", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) } } @@ -88,6 +105,8 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { dropToShell() } + // Program-start sentinel used by host state derivation. + log.Info("hypeman-init:entrypoint", formatProgramStartSentinel("exec")) log.Info("hypeman-init:entrypoint", fmt.Sprintf("container app started (PID %d)", appCmd.Process.Pid)) // Set up signal forwarding: when init receives a signal (e.g. from guest-agent @@ -174,6 +193,31 @@ func formatExitSentinel(code int, message string) string { return fmt.Sprintf("HYPEMAN-EXIT code=%d message=%q", code, message) } +func formatProgramStartSentinel(mode string) string { + return fmt.Sprintf("HYPEMAN-PROGRAM-START ts=%s mode=%s", time.Now().UTC().Format(time.RFC3339Nano), mode) +} + +func waitForGuestAgentReady(readyFilePath string, timeout time.Duration, agentCmd *exec.Cmd) error { + deadline := time.Now().Add(timeout) + for { + if _, err := os.Stat(readyFilePath); err == nil { + return nil + } + + // Fast-fail if the guest-agent process already exited. + if agentCmd != nil && agentCmd.Process != nil { + if err := agentCmd.Process.Signal(syscall.Signal(0)); err != nil { + return fmt.Errorf("guest-agent process exited before readiness signal: %w", err) + } + } + + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for %s", timeout, readyFilePath) + } + time.Sleep(100 * time.Millisecond) + } +} + // checkOOMKill checks /dev/kmsg for recent OOM kill messages. // Returns true if an OOM kill was detected. // Uses a 1s timeout to avoid hanging if /dev/kmsg blocks at end of buffer. diff --git a/lib/system/init/mode_exec_test.go b/lib/system/init/mode_exec_test.go index 0255f22d..3db82483 100644 --- a/lib/system/init/mode_exec_test.go +++ b/lib/system/init/mode_exec_test.go @@ -1,7 +1,11 @@ package main import ( + "os" + "path/filepath" + "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -154,3 +158,32 @@ func TestIsOOMLine(t *testing.T) { }) } } + +func TestFormatProgramStartSentinel(t *testing.T) { + result := formatProgramStartSentinel("exec") + assert.Contains(t, result, "HYPEMAN-PROGRAM-START") + assert.Contains(t, result, " mode=exec") + assert.Contains(t, result, " ts=") +} + +func TestWaitForGuestAgentReady(t *testing.T) { + t.Parallel() + + readyFile := filepath.Join(t.TempDir(), "ready") + go func() { + time.Sleep(50 * time.Millisecond) + _ = os.WriteFile(readyFile, []byte("ok"), 0644) + }() + + err := waitForGuestAgentReady(readyFile, time.Second, nil) + require.NoError(t, err) +} + +func TestWaitForGuestAgentReadyTimeout(t *testing.T) { + t.Parallel() + + readyFile := filepath.Join(t.TempDir(), "missing") + err := waitForGuestAgentReady(readyFile, 100*time.Millisecond, nil) + require.Error(t, err) + assert.True(t, strings.Contains(err.Error(), "timed out"), "unexpected error: %v", err) +} diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index 50fc0bbd..8de34581 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -52,6 +52,7 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { // Exec systemd - this replaces the current process log.Info("hypeman-init:systemd", fmt.Sprintf("exec %v", argv)) + log.Info("hypeman-init:systemd", formatProgramStartSentinel("systemd")) // syscall.Exec replaces the current process with the new one // Use buildEnv to include user's environment variables from the image/instance config @@ -77,8 +78,8 @@ ExecStart=/opt/hypeman/guest-agent EnvironmentFile=-/etc/hypeman/env Restart=always RestartSec=3 -StandardOutput=journal -StandardError=journal +StandardOutput=journal+console +StandardError=journal+console [Install] WantedBy=multi-user.target diff --git a/openapi.yaml b/openapi.yaml index ea6a3190..8b004417 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -45,11 +45,12 @@ components: InstanceState: type: string - enum: [Created, Running, Paused, Shutdown, Stopped, Standby, Unknown] + enum: [Created, Initializing, Running, Paused, Shutdown, Stopped, Standby, Unknown] description: | Instance state: - Created: VMM created but not started (Cloud Hypervisor native) - - Running: VM is actively running (Cloud Hypervisor native) + - Initializing: VM is running while guest init is still in progress + - Running: Guest program has started and instance is ready - Paused: VM is paused (Cloud Hypervisor native) - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) - Stopped: No VMM running, no snapshot exists From d973033dd90230a5c9c0c3bd2338e87647314aa2 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 19:04:07 -0400 Subject: [PATCH 02/32] tests: fix lifecycle flakes for Initializing transition --- cmd/api/api/instances_test.go | 2 +- lib/instances/manager_darwin_test.go | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/cmd/api/api/instances_test.go b/cmd/api/api/instances_test.go index fc40e16d..96d25a38 100644 --- a/cmd/api/api/instances_test.go +++ b/cmd/api/api/instances_test.go @@ -419,7 +419,7 @@ func TestInstanceLifecycle_StopStart(t *testing.T) { // 1. Create instance t.Log("Creating instance...") - networkEnabled := true + networkEnabled := false createResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "test-lifecycle", diff --git a/lib/instances/manager_darwin_test.go b/lib/instances/manager_darwin_test.go index d38c24d8..a7557b20 100644 --- a/lib/instances/manager_darwin_test.go +++ b/lib/instances/manager_darwin_test.go @@ -159,7 +159,7 @@ func TestVZBasicLifecycle(t *testing.T) { require.NoError(t, err) } require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeVZ, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -199,7 +199,7 @@ func TestVZBasicLifecycle(t *testing.T) { t.Log("Starting instance (restart after stop)...") inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance restarted: %s (pid: %v)", inst.Id, inst.HypervisorPID) // Re-read instance to get updated vsock info @@ -323,7 +323,7 @@ func TestVZExecAndShutdown(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance created: %s", inst.Id) t.Cleanup(func() { @@ -439,7 +439,7 @@ func TestVZStandbyAndRestore(t *testing.T) { require.NoError(t, err) } require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeVZ, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -495,7 +495,7 @@ func TestVZStandbyAndRestore(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.False(t, inst.HasSnapshot) t.Log("Instance restored and running") @@ -597,7 +597,7 @@ func TestVZForkFromRunningNetwork(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - require.Equal(t, StateRunning, source.State) + require.Contains(t, []State{StateInitializing, StateRunning}, source.State) require.NotEmpty(t, source.IP) require.NotEmpty(t, source.MAC) @@ -606,6 +606,8 @@ func TestVZForkFromRunningNetwork(t *testing.T) { err = waitForExecAgent(ctx, mgr, sourceID, 30*time.Second) require.NoError(t, err, "source guest agent should be ready") + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 30*time.Second) + require.NoError(t, err) output, exitCode, err := vzExecCommand(ctx, source, "echo", "source-before-fork") require.NoError(t, err) @@ -626,13 +628,19 @@ func TestVZForkFromRunningNetwork(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - require.Equal(t, StateRunning, forked.State) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) require.NotEqual(t, sourceID, forked.Id) forkID := forked.Id t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) }) + forked, err = waitForInstanceState(ctx, mgr, forkID, StateRunning, 30*time.Second) + require.NoError(t, err) sourceAfterFork, err := mgr.GetInstance(ctx, sourceID) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 30*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) require.NotEmpty(t, sourceAfterFork.IP) require.NotEmpty(t, sourceAfterFork.MAC) @@ -640,6 +648,10 @@ func TestVZForkFromRunningNetwork(t *testing.T) { forked, err = mgr.GetInstance(ctx, forkID) require.NoError(t, err) + if forked.State != StateRunning { + forked, err = waitForInstanceState(ctx, mgr, forkID, StateRunning, 30*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, forked.State) require.NotEmpty(t, forked.IP) require.NotEmpty(t, forked.MAC) From 423399bb52f65cc26517e395f87d5925016d41f1 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 22:58:19 -0400 Subject: [PATCH 03/32] instances: add Initializing state and gate Running on agent/program readiness --- cmd/api/api/registry_test.go | 2 + lib/instances/README.md | 7 +- lib/network/README.md | 2 + lib/system/README.md | 22 ++-- lib/system/guest_agent/main.go | 32 +++++ lib/system/guest_agent/main_test.go | 34 ++++++ lib/system/init/headers.go | 173 ++++++++++++++++++++++++--- lib/system/init/headers_test.go | 63 ++++++++++ lib/system/init/main.go | 54 ++++++--- lib/system/init/mode_exec.go | 65 +++++++--- lib/system/init/mode_exec_test.go | 36 +++++- lib/system/init/mode_systemd.go | 112 ++++++++++++++++- lib/system/init/mode_systemd_test.go | 15 +++ 13 files changed, 543 insertions(+), 74 deletions(-) create mode 100644 lib/system/guest_agent/main_test.go create mode 100644 lib/system/init/headers_test.go diff --git a/cmd/api/api/registry_test.go b/cmd/api/api/registry_test.go index 9fca3fa5..4f664ddf 100644 --- a/cmd/api/api/registry_test.go +++ b/cmd/api/api/registry_test.go @@ -135,10 +135,12 @@ func TestRegistryPushAndCreateInstance(t *testing.T) { // Create instance with pushed image t.Log("Creating instance with pushed image...") networkEnabled := false + cmd := []string{"sleep", "infinity"} resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "test-pushed-image", Image: imageName, + Cmd: &cmd, Network: &struct { BandwidthDownload *string `json:"bandwidth_download,omitempty"` BandwidthUpload *string `json:"bandwidth_upload,omitempty"` diff --git a/lib/instances/README.md b/lib/instances/README.md index 4f1bc076..5a889d73 100644 --- a/lib/instances/README.md +++ b/lib/instances/README.md @@ -17,7 +17,7 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM - `Stopped` - No VMM, no snapshot - `Created` - VMM created but not booted (CH native) - `Initializing` - VM is running while guest init is still in progress -- `Running` - Guest program has started and instance is ready +- `Running` - Guest program start boundary reached and guest-agent readiness observed (unless `skip_guest_agent=true`) - `Paused` - VM paused (CH native) - `Shutdown` - VM shutdown, VMM exists (CH native) - `Standby` - No VMM, snapshot exists (can restore) @@ -68,9 +68,10 @@ Stopped → Created → Initializing → Running 1. Start VMM process 2. Create VM config 3. Boot VM -4. Wait for guest-agent readiness gate (exec mode, unless skipped) +4. Wait for guest-agent readiness gate (event-driven, exec mode, unless skipped) 5. Guest program start marker observed -6. Expand memory (if hotplug configured) +6. Kernel headers setup continues asynchronously (does not gate `Running`) +7. Expand memory (if hotplug configured) ``` **StandbyInstance:** diff --git a/lib/network/README.md b/lib/network/README.md index 16187c33..63c6e4d0 100644 --- a/lib/network/README.md +++ b/lib/network/README.md @@ -83,6 +83,8 @@ Hypeman provides a single default network that all instances can optionally conn - **Standby VMs**: Read `guests/{id}/snapshots/snapshot-latest/config.json` from snapshot - **Stopped VMs**: No network allocation +`Initializing` is treated as fully VMM-active for networking; startup work such as async kernel-headers setup does not change network allocation behavior. + **Metadata storage:** ``` /var/lib/hypeman/guests/{instance-id}/ diff --git a/lib/system/README.md b/lib/system/README.md index 66377252..2f5c012c 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -70,12 +70,15 @@ It replaces the previous shell-based init script with cleaner logic and structur - ✅ Hand off to systemd via chroot + exec (systemd mode) **Two boot modes:** -- **Exec mode** (default): Init chroots to container rootfs, starts guest-agent, enforces a strict guest-agent readiness gate (10s), then launches the entrypoint as a child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. -- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, emits handoff marker, then execs /sbin/init so systemd becomes PID 1. +- **Exec mode** (default): Init chroots to container rootfs, starts guest-agent, and waits on an event-driven readiness signal (pipe FD, 10s timeout) before launching the entrypoint. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. +- **Systemd mode** (auto-detected on host): Init injects systemd units (guest-agent plus async kernel-headers worker), emits handoff marker, then execs /sbin/init so systemd becomes PID 1. **Boot progress sentinels:** Init and guest-agent emit machine-parseable markers to serial console: - `HYPEMAN-PROGRAM-START ts=... mode=...` - `HYPEMAN-AGENT-READY ts=...` +- `HYPEMAN-HEADERS-START ts=...` +- `HYPEMAN-HEADERS-READY ts=...` +- `HYPEMAN-HEADERS-FAILED ts=... error="..."` Host state derivation uses these sentinels to report `Initializing` until both required markers are present (or until `skip_guest_agent=true` bypasses the agent marker requirement). @@ -93,13 +96,16 @@ via `INIT_MODE` in the config disk. ## Kernel Headers -Kernel headers are bundled in the initrd and automatically installed at boot, enabling DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers). +Kernel headers are bundled in the initrd and installed asynchronously after boot handoff, enabling DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers) without delaying `Running`. **Why:** Guest images come with headers for their native kernel (e.g., Ubuntu's 5.15), but hypeman VMs run a custom kernel. Without matching headers, DKMS cannot compile drivers. -**How:** The initrd includes `kernel-headers.tar.gz` from the same release as the kernel. At boot, init extracts headers to `/usr/src/linux-headers-{version}/`, creates the `/lib/modules/{version}/build` symlink, and removes mismatched headers from the guest image. +**How:** The initrd includes `kernel-headers.tar.gz` from the same release as the kernel. A background worker (exec mode) or injected systemd oneshot unit (systemd mode) performs installation: +- writes `/run/hypeman/kernel-headers.status` as `pending|running|ready|failed` +- fast-path skips extraction when matching headers + build symlink are already valid +- otherwise extracts headers to `/usr/src/linux-headers-{version}/`, creates `/lib/modules/{version}/build`, and removes mismatched headers from the guest image -**Result:** Guests can `apt install nvidia-driver-xxx` and DKMS builds modules for the running kernel automatically. +**Result:** Guests can `apt install nvidia-driver-xxx` and DKMS builds modules for the running kernel automatically once headers status reaches `ready`. ## Kernel Sources @@ -178,14 +184,14 @@ Files downloaded/built once per version, reused for all instances using that ver ``` lib/system/init/ - main.go # Entry point, orchestrates boot + main.go # Entry point, orchestrates staged boot init.sh # Shell wrapper (mounts /proc, /sys, /dev before Go runtime) mount.go # Mount operations (overlay, bind mounts) config.go # Parse config disk network.go # Network configuration headers.go # Kernel headers setup for DKMS volumes.go # Volume mounting - mode_exec.go # Exec mode: chroot, strict agent gate, run entrypoint - mode_systemd.go # Systemd mode: chroot + handoff marker + exec /sbin/init + mode_exec.go # Exec mode: chroot, event-driven agent gate, run entrypoint + mode_systemd.go # Systemd mode: inject units + chroot + handoff marker + exec /sbin/init logger.go # Human-readable logging to hypeman operations log ``` diff --git a/lib/system/guest_agent/main.go b/lib/system/guest_agent/main.go index 23e564f1..46ee3d7a 100644 --- a/lib/system/guest_agent/main.go +++ b/lib/system/guest_agent/main.go @@ -1,9 +1,11 @@ package main import ( + "fmt" "log" "os" "path/filepath" + "strconv" "time" pb "github.com/kernel/hypeman/lib/guest" @@ -14,6 +16,7 @@ import ( const ( readySentinelPrefix = "HYPEMAN-AGENT-READY" defaultReadyFilePath = "/run/hypeman/guest-agent-ready" + readyFDEnv = "HYPEMAN_AGENT_READY_FD" ) // guestServer implements the gRPC GuestService @@ -42,6 +45,9 @@ func main() { log.Println("[guest-agent] listening on vsock port 2222") log.Printf("[guest-agent] %s ts=%s", readySentinelPrefix, time.Now().UTC().Format(time.RFC3339Nano)) + if err := signalReadyFD(); err != nil { + log.Printf("[guest-agent] warning: failed to signal readiness fd: %v", err) + } if err := writeReadyFile(); err != nil { log.Printf("[guest-agent] warning: failed to write readiness file: %v", err) } @@ -66,3 +72,29 @@ func writeReadyFile() error { } return os.WriteFile(path, []byte(time.Now().UTC().Format(time.RFC3339Nano)+"\n"), 0644) } + +func signalReadyFD() error { + rawFD := os.Getenv(readyFDEnv) + if rawFD == "" { + return nil + } + + fd, err := strconv.Atoi(rawFD) + if err != nil { + return fmt.Errorf("parse %s: %w", readyFDEnv, err) + } + if fd < 0 { + return fmt.Errorf("invalid %s=%d", readyFDEnv, fd) + } + + f := os.NewFile(uintptr(fd), "guest-agent-ready-fd") + if f == nil { + return fmt.Errorf("open readiness fd %d", fd) + } + defer f.Close() + + if _, err := f.Write([]byte{1}); err != nil { + return fmt.Errorf("write readiness byte: %w", err) + } + return nil +} diff --git a/lib/system/guest_agent/main_test.go b/lib/system/guest_agent/main_test.go new file mode 100644 index 00000000..901aceca --- /dev/null +++ b/lib/system/guest_agent/main_test.go @@ -0,0 +1,34 @@ +package main + +import ( + "fmt" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSignalReadyFD(t *testing.T) { + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + t.Setenv(readyFDEnv, fmt.Sprintf("%d", readyWriter.Fd())) + err = signalReadyFD() + require.NoError(t, err) + + buf := make([]byte, 1) + n, err := readyReader.Read(buf) + require.NoError(t, err) + require.Equal(t, 1, n) + assert.Equal(t, byte(1), buf[0]) +} + +func TestSignalReadyFDInvalid(t *testing.T) { + t.Setenv(readyFDEnv, "not-an-int") + err := signalReadyFD() + require.Error(t, err) + assert.Contains(t, err.Error(), "parse") +} diff --git a/lib/system/init/headers.go b/lib/system/init/headers.go index 1cf330f1..937549c2 100644 --- a/lib/system/init/headers.go +++ b/lib/system/init/headers.go @@ -5,20 +5,120 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "syscall" + "time" ) const ( - // Paths in the overlay filesystem - newrootLibModules = "/overlay/newroot/lib/modules" - newrootUsrSrc = "/overlay/newroot/usr/src" - headersTarball = "/kernel-headers.tar.gz" + headersWorkerArg = "--headers-worker" + headersWorkerGuestArg = "--headers-worker-guest" + + headersStatusPending = "pending" + headersStatusRunning = "running" + headersStatusReady = "ready" + headersStatusFailed = "failed" +) + +type kernelHeadersPaths struct { + libModulesDir string + usrSrcDir string + tarballPath string + statusPath string +} + +var ( + initrdKernelHeadersPaths = kernelHeadersPaths{ + libModulesDir: "/overlay/newroot/lib/modules", + usrSrcDir: "/overlay/newroot/usr/src", + tarballPath: "/kernel-headers.tar.gz", + statusPath: "/overlay/newroot/run/hypeman/kernel-headers.status", + } + guestKernelHeadersPaths = kernelHeadersPaths{ + libModulesDir: "/lib/modules", + usrSrcDir: "/usr/src", + tarballPath: "/opt/hypeman/kernel-headers.tar.gz", + statusPath: "/run/hypeman/kernel-headers.status", + } ) +func startKernelHeadersWorkerAsync(log *Logger) { + if err := writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusPending); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + + cmd := exec.Command("/proc/self/exe", headersWorkerArg) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + log.Error("hypeman-init:headers", "failed to start async headers worker", err) + _ = writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusFailed) + log.Info("hypeman-init:headers", formatHeadersFailedSentinel(err)) + return + } + + log.Info("hypeman-init:headers", fmt.Sprintf("started async headers worker (pid %d)", cmd.Process.Pid)) +} + +func runKernelHeadersWorker(log *Logger, paths kernelHeadersPaths) { + log.Info("hypeman-init:headers", formatHeadersSentinel("START")) + if err := writeKernelHeadersStatus(paths.statusPath, headersStatusRunning); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + if err := lowerKernelHeadersWorkerPriority(log); err != nil { + log.Info("hypeman-init:headers", "warning: failed to lower worker priority: "+err.Error()) + } + + if err := setupKernelHeaders(log, paths); err != nil { + log.Error("hypeman-init:headers", "kernel headers setup failed", err) + _ = writeKernelHeadersStatus(paths.statusPath, headersStatusFailed) + log.Info("hypeman-init:headers", formatHeadersFailedSentinel(err)) + os.Exit(1) + } + + if err := writeKernelHeadersStatus(paths.statusPath, headersStatusReady); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + log.Info("hypeman-init:headers", formatHeadersSentinel("READY")) + os.Exit(0) +} + +func lowerKernelHeadersWorkerPriority(log *Logger) error { + if err := syscall.Setpriority(syscall.PRIO_PROCESS, 0, 10); err != nil { + return err + } + + ionicePath, err := exec.LookPath("ionice") + if err != nil { + return nil + } + + cmd := exec.Command(ionicePath, "-c", "3", "-p", strconv.Itoa(os.Getpid())) + if output, err := cmd.CombinedOutput(); err != nil { + log.Info("hypeman-init:headers", fmt.Sprintf("warning: ionice best-effort failed: %v: %s", err, strings.TrimSpace(string(output)))) + } + return nil +} + +func writeKernelHeadersStatus(statusPath, status string) error { + if err := os.MkdirAll(filepath.Dir(statusPath), 0755); err != nil { + return err + } + return os.WriteFile(statusPath, []byte(status+"\n"), 0644) +} + +func formatHeadersSentinel(state string) string { + return fmt.Sprintf("HYPEMAN-HEADERS-%s ts=%s", state, time.Now().UTC().Format(time.RFC3339Nano)) +} + +func formatHeadersFailedSentinel(err error) string { + return fmt.Sprintf("HYPEMAN-HEADERS-FAILED ts=%s error=%q", time.Now().UTC().Format(time.RFC3339Nano), err.Error()) +} + // setupKernelHeaders installs kernel headers and cleans up mismatched headers from the guest image. // This enables DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers). -func setupKernelHeaders(log *Logger) error { +func setupKernelHeaders(log *Logger, paths kernelHeadersPaths) error { // Get running kernel version var uname syscall.Utsname if err := syscall.Uname(&uname); err != nil { @@ -27,27 +127,37 @@ func setupKernelHeaders(log *Logger) error { runningKernel := int8ArrayToString(uname.Release[:]) log.Info("hypeman-init:headers", "running kernel: "+runningKernel) - // Check if headers tarball exists in initrd - if _, err := os.Stat(headersTarball); os.IsNotExist(err) { - log.Info("hypeman-init:headers", "no kernel headers tarball found, skipping") + ready, err := kernelHeadersAlreadyInstalled(runningKernel, paths) + if err != nil { + return fmt.Errorf("check fast path: %w", err) + } + if ready { + log.Info("hypeman-init:headers", "kernel headers already installed, skipping extraction") return nil } + // Check if headers tarball exists in initrd + if _, err := os.Stat(paths.tarballPath); os.IsNotExist(err) { + return fmt.Errorf("kernel headers tarball not found at %s", paths.tarballPath) + } else if err != nil { + return fmt.Errorf("stat tarball: %w", err) + } + // Clean up mismatched kernel modules directories - if err := cleanupMismatchedModules(log, runningKernel); err != nil { + if err := cleanupMismatchedModules(log, runningKernel, paths.libModulesDir); err != nil { log.Info("hypeman-init:headers", "warning: failed to cleanup mismatched modules: "+err.Error()) // Non-fatal, continue } // Clean up mismatched kernel headers directories - if err := cleanupMismatchedHeaders(log, runningKernel); err != nil { + if err := cleanupMismatchedHeaders(log, runningKernel, paths.usrSrcDir); err != nil { log.Info("hypeman-init:headers", "warning: failed to cleanup mismatched headers: "+err.Error()) // Non-fatal, continue } // Create target directories - headersDir := filepath.Join(newrootUsrSrc, "linux-headers-"+runningKernel) - modulesDir := filepath.Join(newrootLibModules, runningKernel) + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+runningKernel) + modulesDir := filepath.Join(paths.libModulesDir, runningKernel) if err := os.MkdirAll(headersDir, 0755); err != nil { return fmt.Errorf("mkdir headers dir: %w", err) @@ -57,7 +167,7 @@ func setupKernelHeaders(log *Logger) error { } // Extract headers tarball - if err := extractTarGz(headersTarball, headersDir); err != nil { + if err := extractTarGz(paths.tarballPath, headersDir); err != nil { return fmt.Errorf("extract headers: %w", err) } log.Info("hypeman-init:headers", "extracted kernel headers to "+headersDir) @@ -75,9 +185,34 @@ func setupKernelHeaders(log *Logger) error { return nil } +func kernelHeadersAlreadyInstalled(runningKernel string, paths kernelHeadersPaths) (bool, error) { + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+runningKernel) + headersInfo, err := os.Stat(headersDir) + if os.IsNotExist(err) { + return false, nil + } + if err != nil { + return false, err + } + if !headersInfo.IsDir() { + return false, nil + } + + buildLink := filepath.Join(paths.libModulesDir, runningKernel, "build") + target, err := os.Readlink(buildLink) + if os.IsNotExist(err) { + return false, nil + } + if err != nil { + return false, err + } + + return target == "/usr/src/linux-headers-"+runningKernel, nil +} + // cleanupMismatchedModules removes /lib/modules/* directories that don't match the running kernel -func cleanupMismatchedModules(log *Logger, runningKernel string) error { - entries, err := os.ReadDir(newrootLibModules) +func cleanupMismatchedModules(log *Logger, runningKernel, modulesDir string) error { + entries, err := os.ReadDir(modulesDir) if err != nil { if os.IsNotExist(err) { return nil // No modules directory, nothing to clean @@ -90,7 +225,7 @@ func cleanupMismatchedModules(log *Logger, runningKernel string) error { continue } if entry.Name() != runningKernel { - path := filepath.Join(newrootLibModules, entry.Name()) + path := filepath.Join(modulesDir, entry.Name()) log.Info("hypeman-init:headers", "removing mismatched modules: "+entry.Name()) if err := os.RemoveAll(path); err != nil { return fmt.Errorf("remove %s: %w", path, err) @@ -102,8 +237,8 @@ func cleanupMismatchedModules(log *Logger, runningKernel string) error { } // cleanupMismatchedHeaders removes /usr/src/linux-headers-* directories that don't match the running kernel -func cleanupMismatchedHeaders(log *Logger, runningKernel string) error { - entries, err := os.ReadDir(newrootUsrSrc) +func cleanupMismatchedHeaders(log *Logger, runningKernel, usrSrcDir string) error { + entries, err := os.ReadDir(usrSrcDir) if err != nil { if os.IsNotExist(err) { return nil // No usr/src directory, nothing to clean @@ -119,7 +254,7 @@ func cleanupMismatchedHeaders(log *Logger, runningKernel string) error { } // Remove any linux-headers-* directory that doesn't match if strings.HasPrefix(entry.Name(), "linux-headers-") && entry.Name() != expectedName { - path := filepath.Join(newrootUsrSrc, entry.Name()) + path := filepath.Join(usrSrcDir, entry.Name()) log.Info("hypeman-init:headers", "removing mismatched headers: "+entry.Name()) if err := os.RemoveAll(path); err != nil { return fmt.Errorf("remove %s: %w", path, err) diff --git a/lib/system/init/headers_test.go b/lib/system/init/headers_test.go new file mode 100644 index 00000000..bd60e869 --- /dev/null +++ b/lib/system/init/headers_test.go @@ -0,0 +1,63 @@ +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestKernelHeadersAlreadyInstalled(t *testing.T) { + t.Parallel() + + root := t.TempDir() + kernel := "test-kernel" + paths := kernelHeadersPaths{ + libModulesDir: filepath.Join(root, "lib/modules"), + usrSrcDir: filepath.Join(root, "usr/src"), + } + + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+kernel) + modulesDir := filepath.Join(paths.libModulesDir, kernel) + require.NoError(t, os.MkdirAll(headersDir, 0755)) + require.NoError(t, os.MkdirAll(modulesDir, 0755)) + require.NoError(t, os.Symlink("/usr/src/linux-headers-"+kernel, filepath.Join(modulesDir, "build"))) + + ready, err := kernelHeadersAlreadyInstalled(kernel, paths) + require.NoError(t, err) + assert.True(t, ready) +} + +func TestKernelHeadersAlreadyInstalledSymlinkMismatch(t *testing.T) { + t.Parallel() + + root := t.TempDir() + kernel := "test-kernel" + paths := kernelHeadersPaths{ + libModulesDir: filepath.Join(root, "lib/modules"), + usrSrcDir: filepath.Join(root, "usr/src"), + } + + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+kernel) + modulesDir := filepath.Join(paths.libModulesDir, kernel) + require.NoError(t, os.MkdirAll(headersDir, 0755)) + require.NoError(t, os.MkdirAll(modulesDir, 0755)) + require.NoError(t, os.Symlink("/usr/src/linux-headers-other", filepath.Join(modulesDir, "build"))) + + ready, err := kernelHeadersAlreadyInstalled(kernel, paths) + require.NoError(t, err) + assert.False(t, ready) +} + +func TestWriteKernelHeadersStatus(t *testing.T) { + t.Parallel() + + statusPath := filepath.Join(t.TempDir(), "run/hypeman/kernel-headers.status") + require.NoError(t, writeKernelHeadersStatus(statusPath, headersStatusRunning)) + + data, err := os.ReadFile(statusPath) + require.NoError(t, err) + assert.Equal(t, "running\n", string(data)) +} diff --git a/lib/system/init/main.go b/lib/system/init/main.go index 6eaa10cf..4883c5ee 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -13,9 +13,21 @@ import ( "fmt" "os" "os/exec" + "sync" ) func main() { + if len(os.Args) > 1 { + switch os.Args[1] { + case headersWorkerArg: + runKernelHeadersWorker(NewLogger(), initrdKernelHeadersPaths) + return + case headersWorkerGuestArg: + runKernelHeadersWorker(NewLogger(), guestKernelHeadersPaths) + return + } + } + log := NewLogger() log.Info("hypeman-init:boot", "init starting") @@ -38,21 +50,31 @@ func main() { dropToShell() } - // Phase 4: Configure network (shared between modes) + // Phase 4/5: Run independent setup tasks in parallel. + // Keep strict dependencies around mount -> overlay -> config and + // bind-mount barrier before mode handoff. + var setupWG sync.WaitGroup if cfg.NetworkEnabled { - if err := configureNetwork(log, cfg); err != nil { - log.Error("hypeman-init:network", "failed to configure network", err) - // Continue anyway - network isn't always required - } + setupWG.Add(1) + go func() { + defer setupWG.Done() + if err := configureNetwork(log, cfg); err != nil { + log.Error("hypeman-init:network", "failed to configure network", err) + // Continue anyway - network isn't always required + } + }() } - - // Phase 5: Mount volumes if len(cfg.VolumeMounts) > 0 { - if err := mountVolumes(log, cfg); err != nil { - log.Error("hypeman-init:volumes", "failed to mount volumes", err) - // Continue anyway - } + setupWG.Add(1) + go func() { + defer setupWG.Done() + if err := mountVolumes(log, cfg); err != nil { + log.Error("hypeman-init:volumes", "failed to mount volumes", err) + // Continue anyway + } + }() } + setupWG.Wait() // Phase 6: Bind mount filesystems to new root if err := bindMountsToNewRoot(log); err != nil { @@ -66,14 +88,12 @@ func main() { // Continue anyway - exec will still work, just no remote access } - // Phase 8: Setup kernel headers for DKMS (can be skipped via config) + // Phase 8: Start async kernel headers setup for exec mode. + // In systemd mode, service injection is handled during runSystemdMode. if cfg.SkipKernelHeaders { log.Info("hypeman-init:headers", "skipping kernel headers setup (skip_kernel_headers=true)") - } else { - if err := setupKernelHeaders(log); err != nil { - log.Error("hypeman-init:headers", "failed to setup kernel headers", err) - // Continue anyway - only needed for DKMS module building - } + } else if cfg.InitMode == "exec" { + startKernelHeadersWorkerAsync(log) } // Phase 9: Mode-specific execution diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index 51a5b5ec..8dabea11 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -16,6 +16,7 @@ import ( const ( guestAgentReadyFilePath = "/run/hypeman/guest-agent-ready" guestAgentReadyTimeout = 10 * time.Second + guestAgentReadyFDEnv = "HYPEMAN_AGENT_READY_FD" ) // runExecMode runs the container in exec mode (default). @@ -53,23 +54,45 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { // Clear stale readiness marker from previous runs. _ = os.Remove(guestAgentReadyFilePath) + readyPipeReader, readyPipeWriter, err := os.Pipe() + if err != nil { + log.Error("hypeman-init:setup", "failed to create guest-agent readiness pipe", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) + } + log.Info("hypeman-init:setup", "starting guest-agent in background") agentCmd = exec.Command("/opt/hypeman/guest-agent") - agentCmd.Env = append(buildEnv(cfg.Env), "HYPEMAN_AGENT_READY_FILE="+guestAgentReadyFilePath) + agentCmd.Env = append( + buildEnv(cfg.Env), + "HYPEMAN_AGENT_READY_FILE="+guestAgentReadyFilePath, + fmt.Sprintf("%s=%d", guestAgentReadyFDEnv, 3), + ) + agentCmd.ExtraFiles = []*os.File{readyPipeWriter} agentCmd.Stdout = os.Stdout agentCmd.Stderr = os.Stderr if err := agentCmd.Start(); err != nil { + _ = readyPipeReader.Close() + _ = readyPipeWriter.Close() log.Error("hypeman-init:setup", "failed to start guest-agent", err) syscall.Sync() syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) } + _ = readyPipeWriter.Close() + + agentExited := make(chan error, 1) + go func() { + agentExited <- agentCmd.Wait() + }() // Strict startup gate: do not launch the guest program until agent is ready. - if err := waitForGuestAgentReady(guestAgentReadyFilePath, guestAgentReadyTimeout, agentCmd); err != nil { - log.Error("hypeman-init:setup", "guest-agent readiness timeout; not launching entrypoint", err) + if err := waitForGuestAgentReady(readyPipeReader, guestAgentReadyTimeout, agentExited); err != nil { + _ = readyPipeReader.Close() + log.Error("hypeman-init:setup", "guest-agent readiness gate failed; not launching entrypoint", err) syscall.Sync() syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) } + _ = readyPipeReader.Close() } // Build the entrypoint command @@ -197,24 +220,30 @@ func formatProgramStartSentinel(mode string) string { return fmt.Sprintf("HYPEMAN-PROGRAM-START ts=%s mode=%s", time.Now().UTC().Format(time.RFC3339Nano), mode) } -func waitForGuestAgentReady(readyFilePath string, timeout time.Duration, agentCmd *exec.Cmd) error { - deadline := time.Now().Add(timeout) - for { - if _, err := os.Stat(readyFilePath); err == nil { - return nil - } +func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentExited <-chan error) error { + readyErr := make(chan error, 1) + go func() { + var b [1]byte + _, err := readyReader.Read(b[:]) + readyErr <- err + }() - // Fast-fail if the guest-agent process already exited. - if agentCmd != nil && agentCmd.Process != nil { - if err := agentCmd.Process.Signal(syscall.Signal(0)); err != nil { - return fmt.Errorf("guest-agent process exited before readiness signal: %w", err) - } - } + timer := time.NewTimer(timeout) + defer timer.Stop() - if time.Now().After(deadline) { - return fmt.Errorf("timed out after %s waiting for %s", timeout, readyFilePath) + select { + case err := <-readyErr: + if err != nil { + return fmt.Errorf("failed waiting for guest-agent readiness signal: %w", err) + } + return nil + case err := <-agentExited: + if err == nil { + return fmt.Errorf("guest-agent exited before readiness signal") } - time.Sleep(100 * time.Millisecond) + return fmt.Errorf("guest-agent exited before readiness signal: %w", err) + case <-timer.C: + return fmt.Errorf("timed out after %s waiting for guest-agent readiness signal", timeout) } } diff --git a/lib/system/init/mode_exec_test.go b/lib/system/init/mode_exec_test.go index 3db82483..3dfbf9c3 100644 --- a/lib/system/init/mode_exec_test.go +++ b/lib/system/init/mode_exec_test.go @@ -1,8 +1,8 @@ package main import ( + "errors" "os" - "path/filepath" "strings" "testing" "time" @@ -169,21 +169,45 @@ func TestFormatProgramStartSentinel(t *testing.T) { func TestWaitForGuestAgentReady(t *testing.T) { t.Parallel() - readyFile := filepath.Join(t.TempDir(), "ready") + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() go func() { time.Sleep(50 * time.Millisecond) - _ = os.WriteFile(readyFile, []byte("ok"), 0644) + _, _ = readyWriter.Write([]byte{1}) + _ = readyWriter.Close() }() - err := waitForGuestAgentReady(readyFile, time.Second, nil) + err = waitForGuestAgentReady(readyReader, time.Second, nil) require.NoError(t, err) } func TestWaitForGuestAgentReadyTimeout(t *testing.T) { t.Parallel() - readyFile := filepath.Join(t.TempDir(), "missing") - err := waitForGuestAgentReady(readyFile, 100*time.Millisecond, nil) + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + + err = waitForGuestAgentReady(readyReader, 100*time.Millisecond, nil) require.Error(t, err) assert.True(t, strings.Contains(err.Error(), "timed out"), "unexpected error: %v", err) + + _ = readyWriter.Close() + _ = readyReader.Close() +} + +func TestWaitForGuestAgentReadyProcessExit(t *testing.T) { + t.Parallel() + + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + agentExited := make(chan error, 1) + agentExited <- errors.New("exit status 1") + + err = waitForGuestAgentReady(readyReader, time.Second, agentExited) + require.Error(t, err) + assert.Contains(t, err.Error(), "exited before readiness signal") } diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index 8de34581..eb1ffd61 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -3,6 +3,7 @@ package main import ( "fmt" "os" + "path/filepath" "syscall" "al.essio.dev/pkg/shellescape" @@ -29,6 +30,16 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { // Continue anyway - VM will work, just without agent } } + if cfg.SkipKernelHeaders { + log.Info("hypeman-init:systemd", "skipping kernel headers service injection (skip_kernel_headers=true)") + } else { + log.Info("hypeman-init:systemd", "injecting hypeman-kernel-headers.service") + if err := injectHeadersService(newroot); err != nil { + log.Error("hypeman-init:systemd", "failed to inject headers service", err) + _ = writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusFailed) + log.Info("hypeman-init:systemd", formatHeadersFailedSentinel(err)) + } + } // Change root to the new filesystem using chroot log.Info("hypeman-init:systemd", "executing chroot") @@ -69,8 +80,6 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { func injectAgentService(newroot string, env map[string]string) error { serviceContent := `[Unit] Description=Hypeman Guest Agent -After=network.target -Wants=network.target [Service] Type=simple @@ -117,7 +126,10 @@ WantedBy=multi-user.target // Enable the service by creating a symlink in wants directory symlinkPath := wantsDir + "/hypeman-agent.service" // Use relative path for the symlink - return os.Symlink("../hypeman-agent.service", symlinkPath) + if err := os.Symlink("../hypeman-agent.service", symlinkPath); err != nil && !os.IsExist(err) { + return err + } + return nil } // buildEnvFileContent creates systemd environment file content from env map. @@ -142,3 +154,97 @@ func buildEnvFileContent(env map[string]string) string { return content } + +func injectHeadersService(newroot string) error { + if err := stageKernelHeadersAssetsForSystemd(newroot); err != nil { + return err + } + if err := writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusPending); err != nil { + return err + } + + serviceContent := `[Unit] +Description=Hypeman Kernel Headers Setup +After=local-fs.target +ConditionPathExists=/opt/hypeman/hypeman-init +ConditionPathExists=/opt/hypeman/kernel-headers.tar.gz + +[Service] +Type=oneshot +ExecStart=/opt/hypeman/hypeman-init --headers-worker-guest +Nice=10 +IOSchedulingClass=best-effort +IOSchedulingPriority=7 +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target +` + + serviceDir := filepath.Join(newroot, "etc/systemd/system") + wantsDir := filepath.Join(serviceDir, "multi-user.target.wants") + if err := os.MkdirAll(serviceDir, 0755); err != nil { + return err + } + if err := os.MkdirAll(wantsDir, 0755); err != nil { + return err + } + + servicePath := filepath.Join(serviceDir, "hypeman-kernel-headers.service") + if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil { + return err + } + + symlinkPath := filepath.Join(wantsDir, "hypeman-kernel-headers.service") + if err := os.Symlink("../hypeman-kernel-headers.service", symlinkPath); err != nil && !os.IsExist(err) { + return err + } + return nil +} + +func stageKernelHeadersAssetsForSystemd(newroot string) error { + const ( + dstDir = "opt/hypeman" + dstInitBinRel = "opt/hypeman/hypeman-init" + dstTarballRel = "opt/hypeman/kernel-headers.tar.gz" + sourceTarball = "/kernel-headers.tar.gz" + ) + + srcInitBin, err := os.Executable() + if err != nil { + return fmt.Errorf("resolve init binary path: %w", err) + } + + guestDstDir := filepath.Join(newroot, dstDir) + if err := os.MkdirAll(guestDstDir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", guestDstDir, err) + } + + initData, err := os.ReadFile(srcInitBin) + if err != nil { + return fmt.Errorf("read init binary: %w", err) + } + if err := os.WriteFile(filepath.Join(newroot, dstInitBinRel), initData, 0755); err != nil { + return fmt.Errorf("write init binary: %w", err) + } + + if _, err := os.Stat(sourceTarball); err != nil { + return fmt.Errorf("stat headers tarball: %w", err) + } + + targetTarball := filepath.Join(newroot, dstTarballRel) + if _, err := os.Stat(targetTarball); os.IsNotExist(err) { + if err := os.WriteFile(targetTarball, []byte{}, 0644); err != nil { + return fmt.Errorf("create target tarball file: %w", err) + } + } else if err != nil { + return fmt.Errorf("stat target tarball file: %w", err) + } + + if err := bindMount(sourceTarball, targetTarball); err != nil { + return fmt.Errorf("bind mount headers tarball: %w", err) + } + + return nil +} diff --git a/lib/system/init/mode_systemd_test.go b/lib/system/init/mode_systemd_test.go index 2f560f34..725a4082 100644 --- a/lib/system/init/mode_systemd_test.go +++ b/lib/system/init/mode_systemd_test.go @@ -1,6 +1,8 @@ package main import ( + "os" + "path/filepath" "testing" "al.essio.dev/pkg/shellescape" @@ -175,3 +177,16 @@ func TestShellescape(t *testing.T) { }) } } + +func TestInjectAgentServiceOmitsNetworkTargetDependency(t *testing.T) { + t.Parallel() + + newroot := t.TempDir() + err := injectAgentService(newroot, map[string]string{}) + assert.NoError(t, err) + + servicePath := filepath.Join(newroot, "etc/systemd/system/hypeman-agent.service") + data, err := os.ReadFile(servicePath) + assert.NoError(t, err) + assert.NotContains(t, string(data), "network.target") +} From ffb4e4333eece7b4208052e996ae91158bdafd57 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:26:14 -0400 Subject: [PATCH 04/32] skills: add concise initializing-speed optimization playbook --- skills/optimize-initializing-speed/SKILL.md | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 skills/optimize-initializing-speed/SKILL.md diff --git a/skills/optimize-initializing-speed/SKILL.md b/skills/optimize-initializing-speed/SKILL.md new file mode 100644 index 00000000..8de45e6b --- /dev/null +++ b/skills/optimize-initializing-speed/SKILL.md @@ -0,0 +1,32 @@ +--- +name: optimize-initializing-speed +description: Use when optimizing VM Initializing-to-Running latency while preserving functionality and low implementation complexity. +--- + +# Optimize Initializing Speed + +## Goal +Minimize `Create/Start -> Running` latency without removing functionality. + +## Priority Levers +1. Keep `Running` gated only on `program-start` + `agent-ready` markers. +2. Replace readiness polling with event-driven signaling (pipe FD via `ExtraFiles`). +3. Move heavy non-critical setup (kernel headers) off the critical path. +4. Add fast-path checks (skip work when already installed/valid). +5. Parallelize independent init stages with simple barriers (no DAG engine). + +## Guardrails +- Do not gate `Running` on kernel headers readiness. +- Keep guest-agent gate strict unless `skip_guest_agent=true`. +- Preserve lifecycle semantics and blocked/allowed operations in `Initializing`. +- Prefer simple staged orchestration over framework complexity. + +## Measurement Protocol +1. Measure baseline and candidate on the same host with the same 5-run harness. +2. Report per-run samples + median/mean/min/max. +3. Validate full regression suite (`make test-linux`) before merge. + +## Required Outputs +- Exact before/after latency numbers. +- Short breakdown of biggest contributors. +- Risk notes and rollback-safe behavior. From fffe995e3bf00b55862cc8a94ffcd7efbc886085 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:31:11 -0400 Subject: [PATCH 05/32] Update skill --- skills/optimize-initializing-speed/SKILL.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/skills/optimize-initializing-speed/SKILL.md b/skills/optimize-initializing-speed/SKILL.md index 8de45e6b..244b25f0 100644 --- a/skills/optimize-initializing-speed/SKILL.md +++ b/skills/optimize-initializing-speed/SKILL.md @@ -6,27 +6,25 @@ description: Use when optimizing VM Initializing-to-Running latency while preser # Optimize Initializing Speed ## Goal -Minimize `Create/Start -> Running` latency without removing functionality. +Minimize `Create/Start -> Running` latency without removing functionality. Base your decisions on what to optimize by using real measurements. ## Priority Levers 1. Keep `Running` gated only on `program-start` + `agent-ready` markers. -2. Replace readiness polling with event-driven signaling (pipe FD via `ExtraFiles`). -3. Move heavy non-critical setup (kernel headers) off the critical path. +2. Replace readiness polling with event-driven signaling +3. Move heavy non-critical setup (kernel headers) off the critical path (ask permission from user if moving logic to async / could be still processing after Running is set). 4. Add fast-path checks (skip work when already installed/valid). -5. Parallelize independent init stages with simple barriers (no DAG engine). +5. Parallelize independent init stages with simple barriers (no DAG engine). Avoid parallel tasks that are likely to conflict. ## Guardrails -- Do not gate `Running` on kernel headers readiness. - Keep guest-agent gate strict unless `skip_guest_agent=true`. - Preserve lifecycle semantics and blocked/allowed operations in `Initializing`. -- Prefer simple staged orchestration over framework complexity. ## Measurement Protocol 1. Measure baseline and candidate on the same host with the same 5-run harness. 2. Report per-run samples + median/mean/min/max. -3. Validate full regression suite (`make test-linux`) before merge. +3. Validate full regression suite before merge. ## Required Outputs - Exact before/after latency numbers. - Short breakdown of biggest contributors. -- Risk notes and rollback-safe behavior. +- Risk notes, if any From 9bf231f7becf5160a533f45ec9094fd91e4dd496 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:46:17 -0400 Subject: [PATCH 06/32] instances: throttle boot-marker log scans during state derivation --- lib/instances/manager.go | 5 ++++ lib/instances/query.go | 44 +++++++++++++++++++++++++++++++--- lib/instances/query_test.go | 47 +++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 3 deletions(-) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index b75e0f4c..21ffac2e 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "sync" + "time" "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/hypervisor" @@ -78,8 +79,10 @@ type manager struct { limits ResourceLimits resourceValidator ResourceValidator // Optional validator for aggregate resource limits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks + bootMarkerScans sync.Map // map[string]time.Time next allowed boot-marker rescan hostTopology *HostTopology // Cached host CPU topology metrics *Metrics + now func() time.Time // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter @@ -113,9 +116,11 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste volumeManager: volumeManager, limits: limits, instanceLocks: sync.Map{}, + bootMarkerScans: sync.Map{}, hostTopology: detectHostTopology(), // Detect and cache host topology vmStarters: vmStarters, defaultHypervisor: defaultHypervisor, + now: time.Now, } // Initialize metrics if meter is provided diff --git a/lib/instances/query.go b/lib/instances/query.go index eeae02ac..12f7df27 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -21,6 +21,7 @@ const ( exitSentinelPrefix = "HYPEMAN-EXIT " programStartSentinelPrefix = "HYPEMAN-PROGRAM-START " agentReadySentinelPrefix = "HYPEMAN-AGENT-READY " + bootMarkerRescanInterval = 1 * time.Second ) // stateResult holds the result of state derivation @@ -113,10 +114,14 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { needProgram := stored.ProgramStartedAt == nil needAgent := !stored.SkipGuestAgent && stored.GuestAgentReadyAt == nil if !needProgram && !needAgent { + m.clearBootMarkerRescan(stored.Id) + return false + } + if !m.shouldScanBootMarkers(stored.Id) { return false } - programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id) + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id, needProgram, needAgent) hydrated := false if needProgram && programStartedAt != nil { stored.ProgramStartedAt = programStartedAt @@ -126,12 +131,17 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { stored.GuestAgentReadyAt = guestAgentReadyAt hydrated = true } + if hydrated { + m.clearBootMarkerRescan(stored.Id) + } else { + m.deferBootMarkerRescan(stored.Id) + } return hydrated } // parseBootMarkers scans app logs (including rotated files) and returns the // latest observed program-start and guest-agent-ready marker timestamps. -func (m *manager) parseBootMarkers(id string) (*time.Time, *time.Time) { +func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool) (*time.Time, *time.Time) { logPaths := m.appLogPathsForMarkerScan(id) var programStartedAt *time.Time @@ -151,6 +161,10 @@ func (m *manager) parseBootMarkers(id string) (*time.Time, *time.Time) { if ts, ok := parseAgentReadySentinelLine(line); ok { guestAgentReadyAt = &ts } + if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { + _ = f.Close() + return programStartedAt, guestAgentReadyAt + } } _ = f.Close() } @@ -158,6 +172,30 @@ func (m *manager) parseBootMarkers(id string) (*time.Time, *time.Time) { return programStartedAt, guestAgentReadyAt } +func (m *manager) shouldScanBootMarkers(id string) bool { + if nextAny, ok := m.bootMarkerScans.Load(id); ok { + if next, ok := nextAny.(time.Time); ok && m.nowUTC().Before(next) { + return false + } + } + return true +} + +func (m *manager) deferBootMarkerRescan(id string) { + m.bootMarkerScans.Store(id, m.nowUTC().Add(bootMarkerRescanInterval)) +} + +func (m *manager) clearBootMarkerRescan(id string) { + m.bootMarkerScans.Delete(id) +} + +func (m *manager) nowUTC() time.Time { + if m.now != nil { + return m.now().UTC() + } + return time.Now().UTC() +} + // appLogPathsForMarkerScan returns app log paths in chronological order // (oldest rotated file to newest active file). func (m *manager) appLogPathsForMarkerScan(id string) []string { @@ -331,7 +369,7 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) { return } - programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id) + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, needProgram, needAgent) updated := false if needProgram && programStartedAt != nil { meta.ProgramStartedAt = programStartedAt diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index a6877fd1..aa8e43dd 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -1,9 +1,12 @@ package instances import ( + "os" + "path/filepath" "testing" "time" + "github.com/kernel/hypeman/lib/paths" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -165,3 +168,47 @@ func TestDeriveRunningState(t *testing.T) { }) } } + +func TestHydrateBootMarkersFromLogs_RescanThrottle(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + now := time.Date(2026, 3, 8, 12, 0, 0, 0, time.UTC) + m.now = func() time.Time { return now } + + meta := &StoredMetadata{ + Id: "test-instance", + SkipGuestAgent: false, + } + + // First call finds nothing and schedules a deferred rescan. + hydrated := m.hydrateBootMarkersFromLogs(meta) + require.False(t, hydrated) + require.Nil(t, meta.ProgramStartedAt) + require.Nil(t, meta.GuestAgentReadyAt) + + logPath := m.paths.InstanceAppLog(meta.Id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + err := os.WriteFile(logPath, []byte( + "HYPEMAN-AGENT-READY ts=2026-03-08T12:00:00Z\n"+ + "HYPEMAN-PROGRAM-START ts=2026-03-08T12:00:01Z mode=exec\n", + ), 0o644) + require.NoError(t, err) + + // Immediate second call should be throttled and skip scanning. + hydrated = m.hydrateBootMarkersFromLogs(meta) + require.False(t, hydrated) + require.Nil(t, meta.ProgramStartedAt) + require.Nil(t, meta.GuestAgentReadyAt) + + // Once the rescan interval has elapsed, markers are hydrated. + now = now.Add(bootMarkerRescanInterval + time.Millisecond) + hydrated = m.hydrateBootMarkersFromLogs(meta) + require.True(t, hydrated) + require.NotNil(t, meta.ProgramStartedAt) + require.NotNil(t, meta.GuestAgentReadyAt) +} From ecfd6535a19cab6cbd0d2200c81dcb197fda9671 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:51:54 -0400 Subject: [PATCH 07/32] tests: increase manager lifecycle running wait to 20s --- lib/instances/manager_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index dacd0e50..830726b2 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -333,7 +333,7 @@ func TestBasicEndToEnd(t *testing.T) { // Wait for VM to be fully running err = waitForVMReady(ctx, inst.SocketPath, 5*time.Second) require.NoError(t, err, "VM should reach running state") - inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 10*time.Second) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) require.NoError(t, err, "instance should reach Running state") // Get instance @@ -780,7 +780,7 @@ func TestBasicEndToEnd(t *testing.T) { restartedInst, err := manager.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err, "StartInstance should succeed") assert.Contains(t, []State{StateInitializing, StateRunning}, restartedInst.State, "Instance should be active after restart") - restartedInst, err = waitForInstanceState(ctx, manager, restartedInst.Id, StateRunning, 10*time.Second) + restartedInst, err = waitForInstanceState(ctx, manager, restartedInst.Id, StateRunning, 20*time.Second) require.NoError(t, err, "instance should reach Running after restart") // Verify exit info was cleared From 56cec52c12e2ae54cce0ea0c17d2cb7e7ced5847 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:53:31 -0400 Subject: [PATCH 08/32] instances: clear boot markers on stop/restore and tighten transitions --- lib/instances/manager_test.go | 1 + lib/instances/restore.go | 5 +++++ lib/instances/state.go | 1 - lib/instances/stop.go | 3 +++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 830726b2..c5e5ffae 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -1450,6 +1450,7 @@ func TestStateTransitions(t *testing.T) { {"Stopped to Running", StateStopped, StateRunning, true}, {"Stopped to Initializing", StateStopped, StateInitializing, true}, {"Standby to Running", StateStandby, StateRunning, true}, + {"Initializing to Paused", StateInitializing, StatePaused, true}, } for _, tt := range tests { diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 62eee2b2..f9bfaff8 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -57,6 +57,11 @@ func (m *manager) restoreInstance( return nil, fmt.Errorf("no snapshot available for instance %s", id) } + // Boot markers are tied to a specific VM boot cycle and must be rebuilt for + // this restore run before transitioning to Running. + stored.ProgramStartedAt = nil + stored.GuestAgentReadyAt = nil + // 2b. Validate aggregate resource limits before allocating resources (if configured) if m.resourceValidator != nil { needsGPU := stored.GPUProfile != "" diff --git a/lib/instances/state.go b/lib/instances/state.go index f909c7c2..ba3d41f9 100644 --- a/lib/instances/state.go +++ b/lib/instances/state.go @@ -13,7 +13,6 @@ var ValidTransitions = map[State][]State{ }, StateInitializing: { StateRunning, // guest init complete - StatePaused, // pause StateShutdown, // shutdown }, StateRunning: { diff --git a/lib/instances/stop.go b/lib/instances/stop.go index c6065124..2b9abfee 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -240,6 +240,9 @@ func (m *manager) stopInstance( stored.StoppedAt = &now stored.HypervisorPID = nil stored.GPUMdevUUID = "" // Clear mdev UUID since we destroyed it + // Boot markers are per-boot-run and must not carry across stop/restore/start. + stored.ProgramStartedAt = nil + stored.GuestAgentReadyAt = nil meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { From 37f30ad3174fd1ea0747f9b253a2917d2a831b94 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sun, 8 Mar 2026 23:58:04 -0400 Subject: [PATCH 09/32] instances: allow fork target Running while still Initializing --- lib/instances/fork.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 496fb54c..405d16b6 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -408,7 +408,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe if err != nil { return nil, err } - if current.State == target { + if current.State == target || (target == StateRunning && current.State == StateInitializing) { return returnWithReadiness(current, nil) } From 3d9c470494b4918ca2b472dc1b79f580ae02827b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:03:05 -0400 Subject: [PATCH 10/32] instances: preserve boot markers across standby restore --- lib/instances/restore.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/instances/restore.go b/lib/instances/restore.go index f9bfaff8..62eee2b2 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -57,11 +57,6 @@ func (m *manager) restoreInstance( return nil, fmt.Errorf("no snapshot available for instance %s", id) } - // Boot markers are tied to a specific VM boot cycle and must be rebuilt for - // this restore run before transitioning to Running. - stored.ProgramStartedAt = nil - stored.GuestAgentReadyAt = nil - // 2b. Validate aggregate resource limits before allocating resources (if configured) if m.resourceValidator != nil { needsGPU := stored.GPUProfile != "" From aa78f4f4a12ecf7e90084166581fae0afe93c725 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:07:14 -0400 Subject: [PATCH 11/32] tests: use manager network init in lifecycle integration tests --- lib/instances/manager_test.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index c5e5ffae..3003781c 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -268,12 +268,8 @@ func TestBasicEndToEnd(t *testing.T) { assert.Empty(t, vol.Attachments, "Volume should not be attached yet") // Initialize network for ingress testing - networkManager := network.NewManager(p, &config.Config{ - DataDir: tmpDir, - Network: newParallelTestNetworkConfig(t), - }, nil) t.Log("Initializing network...") - err = networkManager.Initialize(ctx, nil) + err = manager.networkManager.Initialize(ctx, nil) require.NoError(t, err) t.Log("Network initialized") @@ -1088,12 +1084,8 @@ func TestEntrypointEnvVars(t *testing.T) { t.Log("System files ready") // Initialize network (needed for loopback interface in guest) - networkManager := network.NewManager(p, &config.Config{ - DataDir: tmpDir, - Network: newParallelTestNetworkConfig(t), - }, nil) t.Log("Initializing network...") - err = networkManager.Initialize(ctx, nil) + err = mgr.networkManager.Initialize(ctx, nil) require.NoError(t, err) t.Log("Network initialized") From b2c8e2ca6685c1fb2dca2270837016a0efbcb9c2 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:14:05 -0400 Subject: [PATCH 12/32] network: harden default-network self-heal on allocation --- lib/network/allocate.go | 53 +++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 1eedd841..074590e4 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -7,6 +7,7 @@ import ( mathrand "math/rand" "net" "strings" + "time" "github.com/kernel/hypeman/lib/logger" ) @@ -21,18 +22,10 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N log := logger.FromContext(ctx) - // 1. Get default network - network, err := m.getDefaultNetwork(ctx) + // 1. Get default network (self-heals if kernel bridge state was removed/racing). + network, err := m.getOrInitDefaultNetwork(ctx) if err != nil { - // Self-heal if bridge state was externally removed after initialization. - // This keeps allocations robust under highly concurrent test workloads. - if initErr := m.Initialize(ctx, nil); initErr != nil { - return nil, fmt.Errorf("get default network: %w", err) - } - network, err = m.getDefaultNetwork(ctx) - if err != nil { - return nil, fmt.Errorf("get default network: %w", err) - } + return nil, err } // 2. Check name uniqueness (exclude current instance to allow restarts) @@ -112,17 +105,10 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string, dow return nil } - // 2. Get default network details - network, err := m.getDefaultNetwork(ctx) + // 2. Get default network details (same self-healing behavior as CreateAllocation). + network, err := m.getOrInitDefaultNetwork(ctx) if err != nil { - // Same self-healing behavior as CreateAllocation. - if initErr := m.Initialize(ctx, nil); initErr != nil { - return fmt.Errorf("get default network: %w", err) - } - network, err = m.getDefaultNetwork(ctx) - if err != nil { - return fmt.Errorf("get default network: %w", err) - } + return err } // 3. Recreate TAP device with same name and rate limits from instance metadata @@ -172,6 +158,31 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro return nil } +// getOrInitDefaultNetwork resolves the default network and self-heals by running +// Initialize if bridge state is missing, then retries briefly to absorb netlink propagation delay. +func (m *manager) getOrInitDefaultNetwork(ctx context.Context) (*Network, error) { + network, err := m.getDefaultNetwork(ctx) + if err == nil { + return network, nil + } + + if initErr := m.Initialize(ctx, nil); initErr != nil { + return nil, fmt.Errorf("initialize network manager: %w", initErr) + } + + const retries = 20 + const retryDelay = 100 * time.Millisecond + for i := 0; i < retries; i++ { + network, err = m.getDefaultNetwork(ctx) + if err == nil { + return network, nil + } + time.Sleep(retryDelay) + } + + return nil, fmt.Errorf("get default network after initialize: %w", err) +} + // allocateNextIP picks a random available IP in the subnet // Retries up to 5 times if conflicts occur func (m *manager) allocateNextIP(ctx context.Context, subnet string) (string, error) { From e51f91b16b1828f7306b09d2f560caa57e5a5254 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:21:20 -0400 Subject: [PATCH 13/32] ci: install libjpeg8 for qemu tests on linux runners --- .github/workflows/test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f3752431..3cd0c603 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,11 +31,16 @@ jobs: - name: Install dependencies run: | set -xe + has_libjpeg8=0 + if ldconfig -p | grep -q 'libjpeg\.so\.8'; then + has_libjpeg8=1 + fi if ! command -v mkfs.erofs &> /dev/null || \ ! command -v mkfs.ext4 &> /dev/null || \ - ! command -v iptables &> /dev/null; then + ! command -v iptables &> /dev/null || \ + [ "$has_libjpeg8" -eq 0 ]; then sudo apt-get update - sudo apt-get install -y erofs-utils e2fsprogs iptables + sudo apt-get install -y erofs-utils e2fsprogs iptables libjpeg8 fi go mod download @@ -50,6 +55,10 @@ jobs: fi sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done + if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then + echo "missing required runtime library: libjpeg.so.8" + exit 1 + fi # Avoids rate limits when running the tests # Tests includes pulling, then converting to disk images From 6b8b8a5c2e82f6d8937da9ec7534a69a9d68b3bb Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:25:36 -0400 Subject: [PATCH 14/32] ci: support runner-specific jpeg packages for qemu --- .github/workflows/test.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3cd0c603..893bc20e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,7 +40,22 @@ jobs: ! command -v iptables &> /dev/null || \ [ "$has_libjpeg8" -eq 0 ]; then sudo apt-get update - sudo apt-get install -y erofs-utils e2fsprogs iptables libjpeg8 + sudo apt-get install -y erofs-utils e2fsprogs iptables + + # Different runner images expose JPEG libs under different package names. + if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then + sudo apt-get install -y libjpeg-turbo8 || sudo apt-get install -y libjpeg62-turbo + fi + + # Some images only provide libjpeg.so.62. Provide a compatibility symlink + # for qemu binaries linked against libjpeg.so.8. + if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then + jpeg_lib="$(ldconfig -p | awk '/libjpeg\.so/{print $NF; exit}')" + if [ -n "$jpeg_lib" ]; then + sudo ln -sf "$jpeg_lib" "$(dirname "$jpeg_lib")/libjpeg.so.8" + sudo ldconfig + fi + fi fi go mod download From 1234c1213745cdc3e60e29edab4747429d9000a9 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:31:09 -0400 Subject: [PATCH 15/32] instances: harden boot marker hydration and network self-heal --- .github/workflows/test.yml | 10 +++++----- lib/instances/create.go | 10 ++++++---- lib/instances/manager.go | 23 ++++++++++++++++------- lib/instances/query.go | 22 +++++++++++++++++----- lib/instances/query_test.go | 32 ++++++++++++++++++++++++++++++++ lib/instances/start.go | 6 +++--- lib/network/allocate.go | 33 ++++++++++++++++++--------------- 7 files changed, 97 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 893bc20e..5eb7b853 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,7 +32,7 @@ jobs: run: | set -xe has_libjpeg8=0 - if ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then has_libjpeg8=1 fi if ! command -v mkfs.erofs &> /dev/null || \ @@ -43,14 +43,14 @@ jobs: sudo apt-get install -y erofs-utils e2fsprogs iptables # Different runner images expose JPEG libs under different package names. - if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then sudo apt-get install -y libjpeg-turbo8 || sudo apt-get install -y libjpeg62-turbo fi # Some images only provide libjpeg.so.62. Provide a compatibility symlink # for qemu binaries linked against libjpeg.so.8. - if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then - jpeg_lib="$(ldconfig -p | awk '/libjpeg\.so/{print $NF; exit}')" + if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then + jpeg_lib="$(sudo ldconfig -p | awk '/libjpeg\.so/{print $NF; exit}')" if [ -n "$jpeg_lib" ]; then sudo ln -sf "$jpeg_lib" "$(dirname "$jpeg_lib")/libjpeg.so.8" sudo ldconfig @@ -70,7 +70,7 @@ jobs: fi sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done - if ! ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then echo "missing required runtime library: libjpeg.so.8" exit 1 fi diff --git a/lib/instances/create.go b/lib/instances/create.go index ae4a1133..7f9a6ec3 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -408,6 +408,11 @@ func (m *manager) createInstance( return nil, fmt.Errorf("create config disk: %w", err) } + // 17. Record boot start time before launching the VM so marker hydration + // can safely ignore stale sentinels from prior runs. + bootStart := time.Now().UTC() + stored.StartedAt = &bootStart + // 17. Save metadata log.DebugContext(ctx, "saving instance metadata", "instance_id", id) meta := &metadata{StoredMetadata: *stored} @@ -423,10 +428,7 @@ func (m *manager) createInstance( return nil, err } - // 19. Update timestamp after VM is running - now := time.Now() - stored.StartedAt = &now - + // 19. Persist runtime metadata updates after VM boot. meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed - log but don't fail diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 21ffac2e..fd9a57e8 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -301,16 +301,25 @@ func (m *manager) ListInstances(ctx context.Context, filter *ListInstancesFilter if err != nil { return nil, err } - if filter == nil { - return all, nil + result := all + if filter != nil { + filtered := make([]Instance, 0, len(all)) + for i := range all { + if filter.Matches(&all[i]) { + filtered = append(filtered, all[i]) + } + } + result = filtered } - filtered := make([]Instance, 0, len(all)) - for i := range all { - if filter.Matches(&all[i]) { - filtered = append(filtered, all[i]) + + for i := range result { + inst := result[i] + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) } } - return filtered, nil + + return result, nil } // GetInstance returns an instance by ID, name, or ID prefix. diff --git a/lib/instances/query.go b/lib/instances/query.go index 12f7df27..4b273a20 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -121,7 +121,7 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { return false } - programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id, needProgram, needAgent) + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id, needProgram, needAgent, stored.StartedAt) hydrated := false if needProgram && programStartedAt != nil { stored.ProgramStartedAt = programStartedAt @@ -141,7 +141,8 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { // parseBootMarkers scans app logs (including rotated files) and returns the // latest observed program-start and guest-agent-ready marker timestamps. -func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool) (*time.Time, *time.Time) { +// When startedAt is provided, markers older than this boot start are ignored. +func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, startedAt *time.Time) (*time.Time, *time.Time) { logPaths := m.appLogPathsForMarkerScan(id) var programStartedAt *time.Time @@ -156,10 +157,14 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool) for scanner.Scan() { line := scanner.Text() if ts, ok := parseProgramStartSentinelLine(line); ok { - programStartedAt = &ts + if markerAtOrAfterBootStart(ts, startedAt) { + programStartedAt = &ts + } } if ts, ok := parseAgentReadySentinelLine(line); ok { - guestAgentReadyAt = &ts + if markerAtOrAfterBootStart(ts, startedAt) { + guestAgentReadyAt = &ts + } } if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { _ = f.Close() @@ -172,6 +177,13 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool) return programStartedAt, guestAgentReadyAt } +func markerAtOrAfterBootStart(marker time.Time, startedAt *time.Time) bool { + if startedAt == nil { + return true + } + return !marker.Before(startedAt.UTC()) +} + func (m *manager) shouldScanBootMarkers(id string) bool { if nextAny, ok := m.bootMarkerScans.Load(id); ok { if next, ok := nextAny.(time.Time); ok && m.nowUTC().Before(next) { @@ -369,7 +381,7 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) { return } - programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, needProgram, needAgent) + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, needProgram, needAgent, meta.StartedAt) updated := false if needProgram && programStartedAt != nil { meta.ProgramStartedAt = programStartedAt diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index aa8e43dd..9ea98dfe 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -212,3 +212,35 @@ func TestHydrateBootMarkersFromLogs_RescanThrottle(t *testing.T) { require.NotNil(t, meta.ProgramStartedAt) require.NotNil(t, meta.GuestAgentReadyAt) } + +func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "boot-markers-instance" + logPath := m.paths.InstanceAppLog(id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + bootStart := time.Date(2026, 3, 9, 4, 0, 0, 0, time.UTC) + staleProgram := bootStart.Add(-30 * time.Second) + staleAgent := bootStart.Add(-20 * time.Second) + freshProgram := bootStart.Add(2 * time.Second) + freshAgent := bootStart.Add(3 * time.Second) + + logData := "" + + "HYPEMAN-PROGRAM-START ts=" + staleProgram.Format(time.RFC3339Nano) + " mode=exec\n" + + "HYPEMAN-AGENT-READY ts=" + staleAgent.Format(time.RFC3339Nano) + "\n" + + "HYPEMAN-PROGRAM-START ts=" + freshProgram.Format(time.RFC3339Nano) + " mode=exec\n" + + "HYPEMAN-AGENT-READY ts=" + freshAgent.Format(time.RFC3339Nano) + "\n" + require.NoError(t, os.WriteFile(logPath, []byte(logData), 0o644)) + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, true, true, &bootStart) + require.NotNil(t, programStartedAt) + require.NotNil(t, guestAgentReadyAt) + assert.Equal(t, freshProgram.Format(time.RFC3339Nano), programStartedAt.UTC().Format(time.RFC3339Nano)) + assert.Equal(t, freshAgent.Format(time.RFC3339Nano), guestAgentReadyAt.UTC().Format(time.RFC3339Nano)) +} diff --git a/lib/instances/start.go b/lib/instances/start.go index ab69be76..42216c96 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -134,6 +134,9 @@ func (m *manager) startInstance( } // 6. Start hypervisor and boot VM (reuses logic from create) + bootStart := time.Now().UTC() + stored.StartedAt = &bootStart + log.InfoContext(ctx, "starting hypervisor and booting VM", "instance_id", id) if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil { log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err) @@ -144,9 +147,6 @@ func (m *manager) startInstance( cu.Release() // 7. Update metadata (set PID, StartedAt) - now := time.Now() - stored.StartedAt = &now - meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed - log but don't fail diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 074590e4..07faeaa7 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -14,21 +14,22 @@ import ( // CreateAllocation allocates IP/MAC/TAP for instance on the default network func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*NetworkConfig, error) { - // Acquire lock to prevent concurrent allocations from: - // 1. Picking the same IP address - // 2. Creating duplicate instance names - m.mu.Lock() - defer m.mu.Unlock() - log := logger.FromContext(ctx) - // 1. Get default network (self-heals if kernel bridge state was removed/racing). + // Resolve bridge/default network before taking allocation lock so + // self-heal retries don't block other allocation/release operations. network, err := m.getOrInitDefaultNetwork(ctx) if err != nil { return nil, err } - // 2. Check name uniqueness (exclude current instance to allow restarts) + // Acquire lock to prevent concurrent allocations from: + // 1. Picking the same IP address + // 2. Creating duplicate instance names + m.mu.Lock() + defer m.mu.Unlock() + + // 1. Check name uniqueness (exclude current instance to allow restarts) exists, err := m.NameExists(ctx, req.InstanceName, req.InstanceID) if err != nil { return nil, fmt.Errorf("check name exists: %w", err) @@ -38,7 +39,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N ErrNameExists, req.InstanceName, network.Name) } - // 3. Allocate random available IP + // 2. Allocate random available IP // Random selection reduces predictability and helps distribute IPs across the subnet. // This is especially useful for large /16 networks and reduces conflicts when // moving standby VMs across hosts. @@ -47,16 +48,16 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N return nil, fmt.Errorf("allocate IP: %w", err) } - // 4. Generate MAC (02:00:00:... format - locally administered) + // 3. Generate MAC (02:00:00:... format - locally administered) mac, err := generateMAC() if err != nil { return nil, fmt.Errorf("generate MAC: %w", err) } - // 5. Generate TAP name (tap-{first8chars-of-id}) + // 4. Generate TAP name (tap-{first8chars-of-id}) tap := GenerateTAPName(req.InstanceID) - // 6. Create TAP device with bidirectional rate limiting + // 5. Create TAP device with bidirectional rate limiting if err := m.createTAPDevice(tap, network.Bridge, network.Isolated, req.DownloadBps, req.UploadBps, req.UploadCeilBps); err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } @@ -72,11 +73,11 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N "download_bps", req.DownloadBps, "upload_bps", req.UploadBps) - // 7. Calculate netmask from subnet + // 6. Calculate netmask from subnet _, ipNet, _ := net.ParseCIDR(network.Subnet) netmask := fmt.Sprintf("%d.%d.%d.%d", ipNet.Mask[0], ipNet.Mask[1], ipNet.Mask[2], ipNet.Mask[3]) - // 8. Return config (will be used in CH VmConfig) + // 7. Return config (will be used in CH VmConfig) return &NetworkConfig{ IP: ip, MAC: mac, @@ -166,7 +167,9 @@ func (m *manager) getOrInitDefaultNetwork(ctx context.Context) (*Network, error) return network, nil } - if initErr := m.Initialize(ctx, nil); initErr != nil { + // Self-heal should never delete TAPs for active instances. We pass an empty + // preserve set so CleanupOrphanedTAPs is skipped in Initialize. + if initErr := m.Initialize(ctx, []string{}); initErr != nil { return nil, fmt.Errorf("initialize network manager: %w", initErr) } From bad661db18fdf9233aa650a054734d7998b9cffc Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:33:30 -0400 Subject: [PATCH 16/32] ci: make linux libjpeg check compatible with runner variants --- .github/workflows/test.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5eb7b853..4311967f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,8 +31,12 @@ jobs: - name: Install dependencies run: | set -xe + has_libjpeg8() { + sudo ldconfig -p | grep -q 'libjpeg\.so\.8' || \ + sudo find /usr/lib /lib -name 'libjpeg.so.8' -print -quit | grep -q . + } has_libjpeg8=0 - if sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if has_libjpeg8; then has_libjpeg8=1 fi if ! command -v mkfs.erofs &> /dev/null || \ @@ -43,13 +47,13 @@ jobs: sudo apt-get install -y erofs-utils e2fsprogs iptables # Different runner images expose JPEG libs under different package names. - if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if ! has_libjpeg8; then sudo apt-get install -y libjpeg-turbo8 || sudo apt-get install -y libjpeg62-turbo fi # Some images only provide libjpeg.so.62. Provide a compatibility symlink # for qemu binaries linked against libjpeg.so.8. - if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if ! has_libjpeg8; then jpeg_lib="$(sudo ldconfig -p | awk '/libjpeg\.so/{print $NF; exit}')" if [ -n "$jpeg_lib" ]; then sudo ln -sf "$jpeg_lib" "$(dirname "$jpeg_lib")/libjpeg.so.8" @@ -63,6 +67,10 @@ jobs: run: | set -euo pipefail TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" + has_libjpeg8() { + sudo ldconfig -p | grep -q 'libjpeg\.so\.8' || \ + sudo find /usr/lib /lib -name 'libjpeg.so.8' -print -quit | grep -q . + } for bin in mkfs.erofs mkfs.ext4 iptables; do if ! sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin' >/dev/null"; then echo "missing required binary under sudo PATH: $bin" @@ -70,7 +78,7 @@ jobs: fi sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done - if ! sudo ldconfig -p | grep -q 'libjpeg\.so\.8'; then + if ! has_libjpeg8; then echo "missing required runtime library: libjpeg.so.8" exit 1 fi From 032878c02633242c72f1d1820d04e8a8cf21cb91 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:34:04 -0400 Subject: [PATCH 17/32] instances: avoid duplicate boot-marker persistence on fallback lookups --- lib/instances/manager.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index fd9a57e8..35cc8dc1 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -361,9 +361,6 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } - if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { - m.maybePersistBootMarkers(ctx, inst.Id) - } return inst, nil } if len(nameMatches) > 1 { @@ -382,9 +379,6 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } - if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { - m.maybePersistBootMarkers(ctx, inst.Id) - } return inst, nil } if len(prefixMatches) > 1 { From 695abf15784ecbd661aceb86a9df4d31d273ce71 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:38:01 -0400 Subject: [PATCH 18/32] ci: install distro qemu and verify runtime directly --- .github/workflows/test.yml | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4311967f..3dde3075 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,35 +31,12 @@ jobs: - name: Install dependencies run: | set -xe - has_libjpeg8() { - sudo ldconfig -p | grep -q 'libjpeg\.so\.8' || \ - sudo find /usr/lib /lib -name 'libjpeg.so.8' -print -quit | grep -q . - } - has_libjpeg8=0 - if has_libjpeg8; then - has_libjpeg8=1 - fi if ! command -v mkfs.erofs &> /dev/null || \ ! command -v mkfs.ext4 &> /dev/null || \ ! command -v iptables &> /dev/null || \ - [ "$has_libjpeg8" -eq 0 ]; then + ! command -v qemu-system-x86_64 &> /dev/null; then sudo apt-get update - sudo apt-get install -y erofs-utils e2fsprogs iptables - - # Different runner images expose JPEG libs under different package names. - if ! has_libjpeg8; then - sudo apt-get install -y libjpeg-turbo8 || sudo apt-get install -y libjpeg62-turbo - fi - - # Some images only provide libjpeg.so.62. Provide a compatibility symlink - # for qemu binaries linked against libjpeg.so.8. - if ! has_libjpeg8; then - jpeg_lib="$(sudo ldconfig -p | awk '/libjpeg\.so/{print $NF; exit}')" - if [ -n "$jpeg_lib" ]; then - sudo ln -sf "$jpeg_lib" "$(dirname "$jpeg_lib")/libjpeg.so.8" - sudo ldconfig - fi - fi + sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils fi go mod download @@ -67,19 +44,15 @@ jobs: run: | set -euo pipefail TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" - has_libjpeg8() { - sudo ldconfig -p | grep -q 'libjpeg\.so\.8' || \ - sudo find /usr/lib /lib -name 'libjpeg.so.8' -print -quit | grep -q . - } - for bin in mkfs.erofs mkfs.ext4 iptables; do + for bin in mkfs.erofs mkfs.ext4 iptables qemu-system-x86_64; do if ! sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin' >/dev/null"; then echo "missing required binary under sudo PATH: $bin" exit 1 fi sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done - if ! has_libjpeg8; then - echo "missing required runtime library: libjpeg.so.8" + if ! qemu-system-x86_64 --version >/dev/null 2>&1; then + echo "qemu-system-x86_64 failed to start (runtime deps missing)" exit 1 fi From eedd2f9b635ba93b61bc12f93867f57928b4681c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:40:09 -0400 Subject: [PATCH 19/32] ci: reinstall qemu when runtime check fails --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3dde3075..771d6604 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,8 @@ jobs: if ! command -v mkfs.erofs &> /dev/null || \ ! command -v mkfs.ext4 &> /dev/null || \ ! command -v iptables &> /dev/null || \ - ! command -v qemu-system-x86_64 &> /dev/null; then + ! command -v qemu-system-x86_64 &> /dev/null || \ + ! qemu-system-x86_64 --version >/dev/null 2>&1; then sudo apt-get update sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils fi From 2148d16e6041552fb312b6f6badbae5d792902ee Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:43:49 -0400 Subject: [PATCH 20/32] tests: skip qemu integration when runtime is unavailable --- .github/workflows/test.yml | 4 ---- lib/instances/qemu_test.go | 44 ++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 771d6604..6144b516 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -52,10 +52,6 @@ jobs: fi sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done - if ! qemu-system-x86_64 --version >/dev/null 2>&1; then - echo "qemu-system-x86_64 failed to start (runtime deps missing)" - exit 1 - fi # Avoids rate limits when running the tests # Tests includes pulling, then converting to disk images diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 6821fd9c..b5ef6be2 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "os" + "os/exec" "path/filepath" "strings" "syscall" @@ -97,6 +98,21 @@ func cleanupOrphanedQEMUProcesses(t *testing.T, mgr *manager) { } } +func requireQEMUAvailable(t *testing.T) { + t.Helper() + + starter := qemu.NewStarter() + binaryPath, err := starter.GetBinaryPath(nil, "") + if err != nil { + t.Skipf("QEMU not available: %v", err) + } + + cmd := exec.Command(binaryPath, "--version") + if out, err := cmd.CombinedOutput(); err != nil { + t.Skipf("QEMU runtime unavailable: %v (output: %s)", err, strings.TrimSpace(string(out))) + } +} + // waitForQEMUReady polls QEMU status via QMP until it's running or times out func waitForQEMUReady(ctx context.Context, socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) @@ -174,11 +190,7 @@ func TestQEMUBasicEndToEnd(t *testing.T) { t.Skip("/dev/kvm not available, skipping on this platform") } - // Require QEMU to be installed - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUAvailable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -578,11 +590,7 @@ func TestQEMUEntrypointEnvVars(t *testing.T) { t.Skip("Skipping test that requires root") } - // Require QEMU to be installed - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUAvailable(t) mgr, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -758,11 +766,7 @@ func TestQEMUStandbyAndRestore(t *testing.T) { t.Skip("/dev/kvm not available, skipping on this platform") } - // Require QEMU to be installed - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUAvailable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -882,10 +886,7 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { t.Skip("/dev/kvm not available, skipping on this platform") } - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUAvailable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -981,10 +982,7 @@ func TestQEMUSnapshotFeature(t *testing.T) { t.Skip("/dev/kvm not available, skipping on this platform") } - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Skipf("QEMU not available: %v", err) - } + requireQEMUAvailable(t) mgr, tmpDir := setupTestManagerForQEMU(t) runStandbySnapshotScenario(t, mgr, tmpDir, snapshotScenarioConfig{ From 655c284af29e50e7d9aac5686bc2a00e212acceb Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:49:14 -0400 Subject: [PATCH 21/32] instances: avoid stale boot markers without guest clock filtering --- lib/instances/logs.go | 20 ++++++++++++++++++++ lib/instances/query.go | 22 +++++++++++++--------- lib/instances/query_test.go | 16 +++++++++++----- lib/instances/start.go | 4 ++++ 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/lib/instances/logs.go b/lib/instances/logs.go index 9fda55b6..5c3c8d86 100644 --- a/lib/instances/logs.go +++ b/lib/instances/logs.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "strconv" + "time" "github.com/kernel/hypeman/lib/logger" ) @@ -164,3 +165,22 @@ func rotateLogIfNeeded(path string, maxBytes int64, maxFiles int) error { return nil } + +// archiveAppLogForBoot moves the current serial console log out of the active +// path before a new boot starts, preventing stale boot markers from prior runs +// from affecting current state derivation. +func (m *manager) archiveAppLogForBoot(id string) error { + logPath := m.paths.InstanceAppLog(id) + if _, err := os.Stat(logPath); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + + archivedPath := fmt.Sprintf("%s.prev.%d", logPath, time.Now().UTC().UnixNano()) + if err := os.Rename(logPath, archivedPath); err != nil { + return err + } + return nil +} diff --git a/lib/instances/query.go b/lib/instances/query.go index 4b273a20..83217a52 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -141,13 +141,17 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { // parseBootMarkers scans app logs (including rotated files) and returns the // latest observed program-start and guest-agent-ready marker timestamps. -// When startedAt is provided, markers older than this boot start are ignored. +// When startedAt is provided, files last modified before this boot start are ignored. func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, startedAt *time.Time) (*time.Time, *time.Time) { logPaths := m.appLogPathsForMarkerScan(id) var programStartedAt *time.Time var guestAgentReadyAt *time.Time for _, logPath := range logPaths { + if !fileMayContainCurrentBootMarkers(logPath, startedAt) { + continue + } + f, err := os.Open(logPath) if err != nil { continue @@ -157,14 +161,10 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, for scanner.Scan() { line := scanner.Text() if ts, ok := parseProgramStartSentinelLine(line); ok { - if markerAtOrAfterBootStart(ts, startedAt) { - programStartedAt = &ts - } + programStartedAt = &ts } if ts, ok := parseAgentReadySentinelLine(line); ok { - if markerAtOrAfterBootStart(ts, startedAt) { - guestAgentReadyAt = &ts - } + guestAgentReadyAt = &ts } if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { _ = f.Close() @@ -177,11 +177,15 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, return programStartedAt, guestAgentReadyAt } -func markerAtOrAfterBootStart(marker time.Time, startedAt *time.Time) bool { +func fileMayContainCurrentBootMarkers(path string, startedAt *time.Time) bool { if startedAt == nil { return true } - return !marker.Before(startedAt.UTC()) + info, err := os.Stat(path) + if err != nil { + return false + } + return !info.ModTime().UTC().Before(startedAt.UTC()) } func (m *manager) shouldScanBootMarkers(id string) bool { diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index 9ea98dfe..d3f2431b 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -223,20 +223,26 @@ func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) { id := "boot-markers-instance" logPath := m.paths.InstanceAppLog(id) + rotatedLogPath := logPath + ".1" require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) bootStart := time.Date(2026, 3, 9, 4, 0, 0, 0, time.UTC) - staleProgram := bootStart.Add(-30 * time.Second) - staleAgent := bootStart.Add(-20 * time.Second) + staleProgram := bootStart.Add(-2 * time.Minute) + staleAgent := bootStart.Add(-90 * time.Second) freshProgram := bootStart.Add(2 * time.Second) freshAgent := bootStart.Add(3 * time.Second) - logData := "" + + staleData := "" + "HYPEMAN-PROGRAM-START ts=" + staleProgram.Format(time.RFC3339Nano) + " mode=exec\n" + - "HYPEMAN-AGENT-READY ts=" + staleAgent.Format(time.RFC3339Nano) + "\n" + + "HYPEMAN-AGENT-READY ts=" + staleAgent.Format(time.RFC3339Nano) + "\n" + require.NoError(t, os.WriteFile(rotatedLogPath, []byte(staleData), 0o644)) + require.NoError(t, os.Chtimes(rotatedLogPath, bootStart.Add(-time.Minute), bootStart.Add(-time.Minute))) + + freshData := "" + "HYPEMAN-PROGRAM-START ts=" + freshProgram.Format(time.RFC3339Nano) + " mode=exec\n" + "HYPEMAN-AGENT-READY ts=" + freshAgent.Format(time.RFC3339Nano) + "\n" - require.NoError(t, os.WriteFile(logPath, []byte(logData), 0o644)) + require.NoError(t, os.WriteFile(logPath, []byte(freshData), 0o644)) + require.NoError(t, os.Chtimes(logPath, bootStart.Add(time.Second), bootStart.Add(time.Second))) programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, true, true, &bootStart) require.NotNil(t, programStartedAt) diff --git a/lib/instances/start.go b/lib/instances/start.go index 42216c96..e37cba21 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -133,6 +133,10 @@ func (m *manager) startInstance( return nil, fmt.Errorf("create config disk: %w", err) } + if err := m.archiveAppLogForBoot(id); err != nil { + log.WarnContext(ctx, "failed to archive app log before start", "instance_id", id, "error", err) + } + // 6. Start hypervisor and boot VM (reuses logic from create) bootStart := time.Now().UTC() stored.StartedAt = &bootStart From cf6018fbed96e02face074430331b41ada040e6a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 00:58:29 -0400 Subject: [PATCH 22/32] instances: fix marker parsing and readiness edge cases --- lib/instances/create.go | 3 ++ lib/instances/query.go | 25 +++++++----- lib/instances/query_test.go | 63 +++++++++++++++++++++++++++++++ lib/instances/restore.go | 2 +- lib/system/init/mode_exec.go | 40 ++++++++++++++------ lib/system/init/mode_exec_test.go | 20 ++++++++++ 6 files changed, 131 insertions(+), 22 deletions(-) diff --git a/lib/instances/create.go b/lib/instances/create.go index 7f9a6ec3..a312065b 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -410,6 +410,9 @@ func (m *manager) createInstance( // 17. Record boot start time before launching the VM so marker hydration // can safely ignore stale sentinels from prior runs. + if err := m.archiveAppLogForBoot(id); err != nil { + log.WarnContext(ctx, "failed to archive app log before create boot", "instance_id", id, "error", err) + } bootStart := time.Now().UTC() stored.StartedAt = &bootStart diff --git a/lib/instances/query.go b/lib/instances/query.go index 83217a52..33d97e21 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -140,14 +140,15 @@ func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { } // parseBootMarkers scans app logs (including rotated files) and returns the -// latest observed program-start and guest-agent-ready marker timestamps. +// newest observed program-start and guest-agent-ready marker timestamps. // When startedAt is provided, files last modified before this boot start are ignored. func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, startedAt *time.Time) (*time.Time, *time.Time) { logPaths := m.appLogPathsForMarkerScan(id) var programStartedAt *time.Time var guestAgentReadyAt *time.Time - for _, logPath := range logPaths { + for i := len(logPaths) - 1; i >= 0; i-- { + logPath := logPaths[i] if !fileMayContainCurrentBootMarkers(logPath, startedAt) { continue } @@ -161,17 +162,22 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, for scanner.Scan() { line := scanner.Text() if ts, ok := parseProgramStartSentinelLine(line); ok { - programStartedAt = &ts + if programStartedAt == nil || ts.After(*programStartedAt) { + t := ts + programStartedAt = &t + } } if ts, ok := parseAgentReadySentinelLine(line); ok { - guestAgentReadyAt = &ts - } - if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { - _ = f.Close() - return programStartedAt, guestAgentReadyAt + if guestAgentReadyAt == nil || ts.After(*guestAgentReadyAt) { + t := ts + guestAgentReadyAt = &t + } } } _ = f.Close() + if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { + return programStartedAt, guestAgentReadyAt + } } return programStartedAt, guestAgentReadyAt @@ -216,10 +222,11 @@ func (m *manager) nowUTC() time.Time { // (oldest rotated file to newest active file). func (m *manager) appLogPathsForMarkerScan(id string) []string { base := m.paths.InstanceAppLog(id) - matches, err := filepath.Glob(base + "*") + rotatedMatches, err := filepath.Glob(base + ".*") if err != nil { return []string{base} } + matches := append([]string{base}, rotatedMatches...) type logPathWithRank struct { path string diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index d3f2431b..5bb7f457 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -250,3 +250,66 @@ func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) { assert.Equal(t, freshProgram.Format(time.RFC3339Nano), programStartedAt.UTC().Format(time.RFC3339Nano)) assert.Equal(t, freshAgent.Format(time.RFC3339Nano), guestAgentReadyAt.UTC().Format(time.RFC3339Nano)) } + +func TestParseBootMarkers_ReturnsLatestMarkerFromNewestLog(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "latest-marker-instance" + logPath := m.paths.InstanceAppLog(id) + rotatedLogPath := logPath + ".1" + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + oldProgram := time.Date(2026, 3, 9, 4, 0, 0, 0, time.UTC) + oldAgent := oldProgram.Add(500 * time.Millisecond) + newProgram := oldProgram.Add(3 * time.Second) + newProgramLatest := oldProgram.Add(4 * time.Second) + newAgent := oldProgram.Add(3500 * time.Millisecond) + + require.NoError(t, os.WriteFile(rotatedLogPath, []byte( + "HYPEMAN-PROGRAM-START ts="+oldProgram.Format(time.RFC3339Nano)+" mode=exec\n"+ + "HYPEMAN-AGENT-READY ts="+oldAgent.Format(time.RFC3339Nano)+"\n", + ), 0o644)) + + require.NoError(t, os.WriteFile(logPath, []byte( + "HYPEMAN-PROGRAM-START ts="+newProgram.Format(time.RFC3339Nano)+" mode=exec\n"+ + "HYPEMAN-AGENT-READY ts="+newAgent.Format(time.RFC3339Nano)+"\n"+ + "HYPEMAN-PROGRAM-START ts="+newProgramLatest.Format(time.RFC3339Nano)+" mode=exec\n", + ), 0o644)) + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, true, true, nil) + require.NotNil(t, programStartedAt) + require.NotNil(t, guestAgentReadyAt) + assert.Equal(t, newProgramLatest.Format(time.RFC3339Nano), programStartedAt.UTC().Format(time.RFC3339Nano)) + assert.Equal(t, newAgent.Format(time.RFC3339Nano), guestAgentReadyAt.UTC().Format(time.RFC3339Nano)) +} + +func TestAppLogPathsForMarkerScan_IgnoresArchivedLogs(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "log-order-instance" + logPath := m.paths.InstanceAppLog(id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + for _, p := range []string{ + logPath, + logPath + ".1", + logPath + ".2", + logPath + ".prev.12345", + logPath + "-debug-copy", + } { + require.NoError(t, os.WriteFile(p, []byte("x\n"), 0o644)) + } + + paths := m.appLogPathsForMarkerScan(id) + require.Equal(t, []string{logPath + ".2", logPath + ".1", logPath}, paths) +} diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 62eee2b2..afaa0722 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -226,7 +226,7 @@ func (m *manager) restoreInstance( os.RemoveAll(snapshotDir) // Best effort, ignore errors // 9. Update timestamp - now := time.Now() + now := time.Now().UTC() stored.StartedAt = &now meta = &metadata{StoredMetadata: *stored} diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index 8dabea11..f02428a2 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -228,22 +228,38 @@ func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentEx readyErr <- err }() + agentExitCh := agentExited + var agentExitErr error timer := time.NewTimer(timeout) defer timer.Stop() - select { - case err := <-readyErr: - if err != nil { - return fmt.Errorf("failed waiting for guest-agent readiness signal: %w", err) - } - return nil - case err := <-agentExited: - if err == nil { - return fmt.Errorf("guest-agent exited before readiness signal") + for { + select { + case err := <-readyErr: + if err != nil { + if agentExitCh == nil { + if agentExitErr == nil { + return fmt.Errorf("guest-agent exited before readiness signal") + } + return fmt.Errorf("guest-agent exited before readiness signal: %w", agentExitErr) + } + return fmt.Errorf("failed waiting for guest-agent readiness signal: %w", err) + } + return nil + case err := <-agentExitCh: + agentExitErr = err + // Keep waiting for the readiness read to complete. If the agent wrote + // readiness and then exited, the read succeeds and startup proceeds. + agentExitCh = nil + case <-timer.C: + if agentExitCh == nil { + if agentExitErr == nil { + return fmt.Errorf("guest-agent exited before readiness signal") + } + return fmt.Errorf("guest-agent exited before readiness signal: %w", agentExitErr) + } + return fmt.Errorf("timed out after %s waiting for guest-agent readiness signal", timeout) } - return fmt.Errorf("guest-agent exited before readiness signal: %w", err) - case <-timer.C: - return fmt.Errorf("timed out after %s waiting for guest-agent readiness signal", timeout) } } diff --git a/lib/system/init/mode_exec_test.go b/lib/system/init/mode_exec_test.go index 3dfbf9c3..78da3939 100644 --- a/lib/system/init/mode_exec_test.go +++ b/lib/system/init/mode_exec_test.go @@ -211,3 +211,23 @@ func TestWaitForGuestAgentReadyProcessExit(t *testing.T) { require.Error(t, err) assert.Contains(t, err.Error(), "exited before readiness signal") } + +func TestWaitForGuestAgentReadyReadyWinsAfterExitSignal(t *testing.T) { + t.Parallel() + + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + agentExited := make(chan error, 1) + agentExited <- errors.New("exit status 1") + + go func() { + time.Sleep(25 * time.Millisecond) + _, _ = readyWriter.Write([]byte{1}) + }() + + err = waitForGuestAgentReady(readyReader, time.Second, agentExited) + require.NoError(t, err) +} From a449e2cf86fdad243cc88a4fef92f4d440e0cc6f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 9 Mar 2026 01:02:17 -0400 Subject: [PATCH 23/32] init: keep readiness timeout behavior when no exit channel --- lib/system/init/mode_exec.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index f02428a2..1e1792e8 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -229,6 +229,7 @@ func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentEx }() agentExitCh := agentExited + agentExitObserved := false var agentExitErr error timer := time.NewTimer(timeout) defer timer.Stop() @@ -237,7 +238,7 @@ func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentEx select { case err := <-readyErr: if err != nil { - if agentExitCh == nil { + if agentExitObserved { if agentExitErr == nil { return fmt.Errorf("guest-agent exited before readiness signal") } @@ -248,11 +249,12 @@ func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentEx return nil case err := <-agentExitCh: agentExitErr = err + agentExitObserved = true // Keep waiting for the readiness read to complete. If the agent wrote // readiness and then exited, the read succeeds and startup proceeds. agentExitCh = nil case <-timer.C: - if agentExitCh == nil { + if agentExitObserved { if agentExitErr == nil { return fmt.Errorf("guest-agent exited before readiness signal") } From 777997add6cd208fd01f16845c5f0286a3de536b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 09:59:21 -0400 Subject: [PATCH 24/32] ci: retry apt update in linux dependency install --- .github/workflows/test.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6144b516..eec2948b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,12 +31,28 @@ jobs: - name: Install dependencies run: | set -xe + apt_update_with_retry() { + local attempts=5 + local sleep_seconds=30 + local n=1 + while [ "$n" -le "$attempts" ]; do + if sudo apt-get update; then + return 0 + fi + if [ "$n" -eq "$attempts" ]; then + return 1 + fi + echo "apt-get update failed (attempt ${n}/${attempts}); retrying in ${sleep_seconds}s..." + sleep "$sleep_seconds" + n=$((n + 1)) + done + } if ! command -v mkfs.erofs &> /dev/null || \ ! command -v mkfs.ext4 &> /dev/null || \ ! command -v iptables &> /dev/null || \ ! command -v qemu-system-x86_64 &> /dev/null || \ ! qemu-system-x86_64 --version >/dev/null 2>&1; then - sudo apt-get update + apt_update_with_retry sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils fi go mod download From 669114f55b268a864f84c661b2a8608c930d9e98 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:01:10 -0400 Subject: [PATCH 25/32] instances: persist hydrated boot markers on lifecycle return --- lib/instances/create.go | 5 +++++ lib/instances/restore.go | 5 +++++ lib/instances/start.go | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/lib/instances/create.go b/lib/instances/create.go index 73ad0666..0f315432 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -444,6 +444,11 @@ func (m *manager) createInstance( // Return instance with derived state finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after create", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType) diff --git a/lib/instances/restore.go b/lib/instances/restore.go index afaa0722..53e5b75a 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -237,6 +237,11 @@ func (m *manager) restoreInstance( // Return instance with derived state (should be Running now) finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after restore", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) diff --git a/lib/instances/start.go b/lib/instances/start.go index e37cba21..39051356 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -159,6 +159,11 @@ func (m *manager) startInstance( // Return instance with derived state (should be Running now) finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after start", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.startDuration, start, "success", stored.HypervisorType) From e34ecd01b72e27c423ce8f20d7c562e9b1b93eab Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:07:18 -0400 Subject: [PATCH 26/32] tests: wait for Running before standby in network lifecycle test --- lib/instances/network_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/instances/network_test.go b/lib/instances/network_test.go index af7e25c4..4826bad6 100644 --- a/lib/instances/network_test.go +++ b/lib/instances/network_test.go @@ -74,6 +74,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { }) require.NoError(t, err) require.NotNil(t, inst) + require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance created: %s", inst.Id) // Wait for VM to be fully ready @@ -117,6 +118,10 @@ func TestCreateInstanceWithNetwork(t *testing.T) { require.NoError(t, err, "Exec agent should be listening") t.Log("Exec agent is ready") + // Standby requires running state; create may still return Initializing. + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) + // Test initial internet connectivity via exec t.Log("Testing initial internet connectivity via exec...") output, exitCode, err := execCommand(ctx, inst, "curl", "-s", "--connect-timeout", "10", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") @@ -155,6 +160,9 @@ func TestCreateInstanceWithNetwork(t *testing.T) { t.Log("Restoring instance from standby...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) t.Log("Instance restored and running") From 7e8436b4abbfcd49cf90fe8ad00ed7874b79584e Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:18:53 -0400 Subject: [PATCH 27/32] tests: harden network lifecycle restore against startup races --- lib/vmm/client.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/vmm/client.go b/lib/vmm/client.go index 2c8dc26e..b4697955 100644 --- a/lib/vmm/client.go +++ b/lib/vmm/client.go @@ -16,6 +16,8 @@ import ( "go.opentelemetry.io/otel/metric" ) +const cloudHypervisorSocketReadyTimeout = 10 * time.Second + // VMM wraps the generated Cloud Hypervisor client (API v0.3.0) type VMM struct { *ClientWithResponses @@ -147,11 +149,12 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion pid := cmd.Process.Pid - // Wait for socket to be ready (use fresh context with timeout, not parent context) - waitCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + // Wait for socket to be ready (use fresh context with timeout, not parent context). + // CI can be heavily loaded; a larger budget avoids transient CH boot races. + waitCtx, cancel := context.WithTimeout(context.Background(), cloudHypervisorSocketReadyTimeout) defer cancel() - if err := waitForSocket(waitCtx, socketPath, 5*time.Second); err != nil { + if err := waitForSocket(waitCtx, socketPath, cloudHypervisorSocketReadyTimeout); err != nil { // Read vmm.log to understand why socket wasn't created vmmLogPath := filepath.Join(logsDir, "vmm.log") if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { From 3e16495b4c2afbe120fe5e610d66fd2b04718d48 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:20:29 -0400 Subject: [PATCH 28/32] instances: preserve boot start timestamp across restore --- lib/instances/restore.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 53e5b75a..a325ca15 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -225,10 +225,9 @@ func (m *manager) restoreInstance( log.InfoContext(ctx, "deleting snapshot after successful restore", "instance_id", id) os.RemoveAll(snapshotDir) // Best effort, ignore errors - // 9. Update timestamp - now := time.Now().UTC() - stored.StartedAt = &now - + // 9. Persist runtime metadata updates without resetting StartedAt. + // Restore resumes an existing boot; preserving StartedAt keeps marker + // hydration scoped to the original boot timeline. meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed From 1c99011f68acbdde3b4ba4f4607bb2e1809ee053 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:25:58 -0400 Subject: [PATCH 29/32] network: retry interrupted bridge address dumps --- lib/network/bridge_linux.go | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/network/bridge_linux.go b/lib/network/bridge_linux.go index 6de12ebf..f3509327 100644 --- a/lib/network/bridge_linux.go +++ b/lib/network/bridge_linux.go @@ -4,6 +4,7 @@ package network import ( "context" + "errors" "fmt" "hash/fnv" "net" @@ -11,12 +12,31 @@ import ( "os/exec" "strings" "syscall" + "time" "github.com/kernel/hypeman/lib/logger" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" ) +const netlinkDumpRetryCount = 3 + +func listBridgeAddrsWithRetry(link netlink.Link) ([]netlink.Addr, error) { + var err error + for i := 0; i < netlinkDumpRetryCount; i++ { + addrs, listErr := netlink.AddrList(link, netlink.FAMILY_V4) + if listErr == nil { + return addrs, nil + } + if !errors.Is(listErr, netlink.ErrDumpInterrupted) { + return nil, listErr + } + err = listErr + time.Sleep(10 * time.Millisecond) + } + return nil, err +} + // checkSubnetConflicts checks if the configured subnet conflicts with existing routes. // Returns an error if a conflict is detected, with guidance on how to resolve it. func (m *manager) checkSubnetConflicts(ctx context.Context, subnet string) error { @@ -88,7 +108,7 @@ func (m *manager) createBridge(ctx context.Context, name, gateway, subnet string existing, err := netlink.LinkByName(name) if err == nil { // Bridge exists - verify it has the expected gateway IP - addrs, err := netlink.AddrList(existing, netlink.FAMILY_V4) + addrs, err := listBridgeAddrsWithRetry(existing) if err != nil { return fmt.Errorf("list bridge addresses: %w", err) } From 2d4b7c98c8fa3ea36cc43147936a73ed06cfefe1 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:30:22 -0400 Subject: [PATCH 30/32] instances: harden boot marker log scanning --- lib/instances/query.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/instances/query.go b/lib/instances/query.go index 33d97e21..1eaaa074 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -147,6 +147,7 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, var programStartedAt *time.Time var guestAgentReadyAt *time.Time + // Iterate newest-to-oldest so we can stop once all required markers are found. for i := len(logPaths) - 1; i >= 0; i-- { logPath := logPaths[i] if !fileMayContainCurrentBootMarkers(logPath, startedAt) { @@ -174,7 +175,11 @@ func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, } } } + scanErr := scanner.Err() _ = f.Close() + if scanErr != nil { + continue + } if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { return programStartedAt, guestAgentReadyAt } From 68fec3b21ea375c2d12669c4689498e50c8228fc Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:39:38 -0400 Subject: [PATCH 31/32] tests: gate exec-agent readiness checks on Running state --- lib/instances/exec_test.go | 64 +++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index 4d85dba5..a0ff212c 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -21,26 +21,60 @@ import ( // waitForExecAgent polls until exec-agent is ready func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, timeout time.Duration) error { deadline := time.Now().Add(timeout) + lastState := StateUnknown + var lastErr error + for time.Now().Before(deadline) { + inst, err := mgr.GetInstance(ctx, instanceID) + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue + } + + lastState = inst.State + if inst.State != StateRunning { + time.Sleep(500 * time.Millisecond) + continue + } + meta, err := mgr.loadMetadata(instanceID) - if err == nil { - dialer, derr := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID) - if derr == nil { - var stdout, stderr bytes.Buffer - exit, eerr := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ - Command: []string{"true"}, - Stdout: &stdout, - Stderr: &stderr, - WaitForAgent: 1 * time.Second, - }) - if eerr == nil && exit.Code == 0 { - return nil - } - } + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue } + + dialer, err := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID) + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue + } + + var stdout, stderr bytes.Buffer + exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ + Command: []string{"true"}, + Stdout: &stdout, + Stderr: &stderr, + WaitForAgent: 1 * time.Second, + }) + if err == nil && exit.Code == 0 { + return nil + } + if err != nil { + lastErr = err + } else { + lastErr = fmt.Errorf("unexpected exit code: %d", exit.Code) + } + time.Sleep(500 * time.Millisecond) } - return context.DeadlineExceeded + + if lastErr != nil { + return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s): %w", instanceID, timeout, lastState, lastErr) + } + return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s)", instanceID, timeout, lastState) } // Note: execCommand is defined in network_test.go From 192f095360e81abbb6d949af6a9b59f94fa22c98 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:40:46 -0400 Subject: [PATCH 32/32] instances: fix create flow step numbering comments --- lib/instances/create.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/instances/create.go b/lib/instances/create.go index 0f315432..74c33010 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -416,7 +416,7 @@ func (m *manager) createInstance( bootStart := time.Now().UTC() stored.StartedAt = &bootStart - // 17. Save metadata + // 18. Save metadata log.DebugContext(ctx, "saving instance metadata", "instance_id", id) meta := &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { @@ -424,14 +424,14 @@ func (m *manager) createInstance( return nil, fmt.Errorf("save metadata: %w", err) } - // 18. Start VMM and boot VM + // 19. Start VMM and boot VM log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id) if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil { log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err) return nil, err } - // 19. Persist runtime metadata updates after VM boot. + // 20. Persist runtime metadata updates after VM boot. meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed - log but don't fail