From abfb22220d7e4913d4c2f5b037f987e24d0c44f6 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:49:42 -0400 Subject: [PATCH 1/5] tests: wait for Running in cloud-hypervisor running-fork path --- lib/instances/fork_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/instances/fork_test.go b/lib/instances/fork_test.go index 7767d8b6..52f8e28e 100644 --- a/lib/instances/fork_test.go +++ b/lib/instances/fork_test.go @@ -381,6 +381,8 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) { }) require.NoError(t, err) t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), source.Id) }) + source, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.NoError(t, waitForVMReady(ctx, source.SocketPath, 5*time.Second)) assert.NotEmpty(t, source.IP) @@ -399,6 +401,9 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) { TargetState: StateRunning, }) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) + forked, err = waitForInstanceState(ctx, manager, forked.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, forked.State) forkedID := forked.Id t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), forkedID) }) @@ -406,6 +411,10 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) { // Source should be restored and still reachable by its private IP. sourceAfterFork, err := manager.GetInstance(ctx, source.Id) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) require.NotEmpty(t, sourceAfterFork.IP) assertHostCanReachNginx(t, sourceAfterFork.IP, 80, 60*time.Second) From bc126f65828859581e4643b85d04098f6fa6d04a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 10:55:34 -0400 Subject: [PATCH 2/5] tests: avoid nil cleanup capture in running-fork test --- lib/instances/fork_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/instances/fork_test.go b/lib/instances/fork_test.go index 52f8e28e..88e48537 100644 --- a/lib/instances/fork_test.go +++ b/lib/instances/fork_test.go @@ -380,7 +380,8 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) { NetworkEnabled: true, }) require.NoError(t, err) - t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), source.Id) }) + sourceID := source.Id + t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), sourceID) }) source, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) require.NoError(t, err) require.NoError(t, waitForVMReady(ctx, source.SocketPath, 5*time.Second)) From 2664097069c799ed804ab2ffdfd8518fad2b5197 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 11:06:24 -0400 Subject: [PATCH 3/5] instances/init: follow up merged PR review comments --- lib/instances/create.go | 9 +--- lib/instances/logs.go | 24 +++++++++- lib/instances/logs_test.go | 46 +++++++++++++++++++ lib/instances/query.go | 31 ++++++++++++- lib/instances/restore.go | 9 +--- lib/instances/start.go | 9 +--- lib/instances/types.go | 2 +- lib/network/allocate.go | 6 +-- lib/system/init/main.go | 69 +++++++++++++++++++++-------- lib/system/init/main_test.go | 85 ++++++++++++++++++++++++++++++++++++ 10 files changed, 244 insertions(+), 46 deletions(-) create mode 100644 lib/instances/logs_test.go create mode 100644 lib/system/init/main_test.go diff --git a/lib/instances/create.go b/lib/instances/create.go index 74c33010..531d02a0 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -442,13 +442,8 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() - // Return instance with derived state - finalInst := m.toInstance(ctx, meta) - if finalInst.BootMarkersHydrated { - if err := m.saveMetadata(meta); err != nil { - log.WarnContext(ctx, "failed to persist hydrated boot markers after create", "instance_id", id, "error", err) - } - } + // Return instance state from current metadata without forcing a log scan. + finalInst := m.toInstanceWithoutHydration(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType) diff --git a/lib/instances/logs.go b/lib/instances/logs.go index 5c3c8d86..f10e8537 100644 --- a/lib/instances/logs.go +++ b/lib/instances/logs.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "strconv" + "strings" "time" "github.com/kernel/hypeman/lib/logger" @@ -31,6 +32,23 @@ var ErrTailNotFound = fmt.Errorf("tail command not found: required for log strea // ErrLogNotFound is returned when the requested log file doesn't exist var ErrLogNotFound = fmt.Errorf("log file not found") +var appLogNoiseMarkers = []string{ + "HYPEMAN-PROGRAM-START", + "HYPEMAN-AGENT-READY", + "HYPEMAN-HEADERS-START", + "HYPEMAN-HEADERS-READY", + "HYPEMAN-HEADERS-FAILED", +} + +func shouldSkipAppLogLine(line string) bool { + for _, marker := range appLogNoiseMarkers { + if strings.Contains(line, marker) { + return true + } + } + return false +} + // streamInstanceLogs streams instance logs from the specified source // Returns last N lines, then continues following if follow=true func (m *manager) streamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source LogSource) (<-chan string, error) { @@ -91,11 +109,15 @@ func (m *manager) streamInstanceLogs(ctx context.Context, id string, tail int, f scanner := bufio.NewScanner(stdout) for scanner.Scan() { + line := scanner.Text() + if source == LogSourceApp && shouldSkipAppLogLine(line) { + continue + } select { case <-ctx.Done(): log.DebugContext(ctx, "log stream cancelled", "instance_id", id) return - case out <- scanner.Text(): + case out <- line: } } diff --git a/lib/instances/logs_test.go b/lib/instances/logs_test.go new file mode 100644 index 00000000..3ce9ac80 --- /dev/null +++ b/lib/instances/logs_test.go @@ -0,0 +1,46 @@ +package instances + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestShouldSkipAppLogLine(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + line string + want bool + }{ + { + name: "program start marker", + line: "2026-03-10T10:00:00Z [INFO] HYPEMAN-PROGRAM-START ts=2026-03-10T10:00:00Z mode=exec", + want: true, + }, + { + name: "agent ready marker", + line: "2026-03-10T10:00:01Z [INFO] HYPEMAN-AGENT-READY ts=2026-03-10T10:00:01Z", + want: true, + }, + { + name: "headers marker", + line: "2026-03-10T10:00:02Z [INFO] HYPEMAN-HEADERS-READY", + want: true, + }, + { + name: "normal app log line", + line: "2026-03-10T10:00:03Z [INFO] build completed successfully", + want: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, shouldSkipAppLogLine(tt.line)) + }) + } +} diff --git a/lib/instances/query.go b/lib/instances/query.go index 1eaaa074..ed59f5e1 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -34,6 +34,16 @@ type stateResult struct { // deriveState determines instance state by checking socket and querying the hypervisor. // Returns StateUnknown with an error message if the socket exists but hypervisor is unreachable. func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) stateResult { + return m.deriveStateWithOptions(ctx, stored, true) +} + +// deriveStateWithoutHydration determines instance state without scanning serial logs +// to hydrate missing boot markers. +func (m *manager) deriveStateWithoutHydration(ctx context.Context, stored *StoredMetadata) stateResult { + return m.deriveStateWithOptions(ctx, stored, false) +} + +func (m *manager) deriveStateWithOptions(ctx context.Context, stored *StoredMetadata, hydrateBootMarkers bool) stateResult { log := logger.FromContext(ctx) // 1. Check if socket exists @@ -75,7 +85,10 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state case hypervisor.StateCreated: return stateResult{State: StateCreated} case hypervisor.StateRunning: - hydrated := m.hydrateBootMarkersFromLogs(stored) + hydrated := false + if hydrateBootMarkers { + hydrated = m.hydrateBootMarkersFromLogs(stored) + } return stateResult{ State: deriveRunningState(stored), BootMarkersHydrated: hydrated, @@ -302,7 +315,21 @@ func (m *manager) hasSnapshot(dataDir string) bool { // toInstance converts stored metadata to Instance with derived fields func (m *manager) toInstance(ctx context.Context, meta *metadata) Instance { - result := m.deriveState(ctx, &meta.StoredMetadata) + return m.toInstanceWithStateDerivation(ctx, meta, true) +} + +func (m *manager) toInstanceWithoutHydration(ctx context.Context, meta *metadata) Instance { + return m.toInstanceWithStateDerivation(ctx, meta, false) +} + +func (m *manager) toInstanceWithStateDerivation(ctx context.Context, meta *metadata, hydrateBootMarkers bool) Instance { + var result stateResult + if hydrateBootMarkers { + result = m.deriveState(ctx, &meta.StoredMetadata) + } else { + result = m.deriveStateWithoutHydration(ctx, &meta.StoredMetadata) + } + inst := Instance{ StoredMetadata: meta.StoredMetadata, State: result.State, diff --git a/lib/instances/restore.go b/lib/instances/restore.go index a325ca15..4b1d4395 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -234,13 +234,8 @@ func (m *manager) restoreInstance( log.WarnContext(ctx, "failed to update metadata after restore", "instance_id", id, "error", err) } - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) - if finalInst.BootMarkersHydrated { - if err := m.saveMetadata(meta); err != nil { - log.WarnContext(ctx, "failed to persist hydrated boot markers after restore", "instance_id", id, "error", err) - } - } + // Return instance state from current metadata without forcing a log scan. + finalInst := m.toInstanceWithoutHydration(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) diff --git a/lib/instances/start.go b/lib/instances/start.go index 39051356..7d5ab265 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -157,13 +157,8 @@ func (m *manager) startInstance( log.WarnContext(ctx, "failed to update metadata after VM start", "instance_id", id, "error", err) } - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) - if finalInst.BootMarkersHydrated { - if err := m.saveMetadata(meta); err != nil { - log.WarnContext(ctx, "failed to persist hydrated boot markers after start", "instance_id", id, "error", err) - } - } + // Return instance state from current metadata without forcing a log scan. + finalInst := m.toInstanceWithoutHydration(ctx, meta) // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.startDuration, start, "success", stored.HypervisorType) diff --git a/lib/instances/types.go b/lib/instances/types.go index 23fc1bcc..8bf2043f 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -59,7 +59,7 @@ type StoredMetadata struct { // Timestamps (stored for historical tracking) CreatedAt time.Time - StartedAt *time.Time // Last time VM was started + StartedAt *time.Time // Boot epoch start time (set on create/start; preserved across standby restore) StoppedAt *time.Time // Last time VM was stopped // Boot progress markers (derived from guest serial log sentinels and persisted) diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 07faeaa7..37afd107 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -167,9 +167,9 @@ func (m *manager) getOrInitDefaultNetwork(ctx context.Context) (*Network, error) return network, nil } - // Self-heal should never delete TAPs for active instances. We pass an empty - // preserve set so CleanupOrphanedTAPs is skipped in Initialize. - if initErr := m.Initialize(ctx, []string{}); initErr != nil { + // Self-heal should never delete TAPs for active instances. We pass nil so + // CleanupOrphanedTAPs is skipped in Initialize. + if initErr := m.Initialize(ctx, nil); initErr != nil { return nil, fmt.Errorf("initialize network manager: %w", initErr) } diff --git a/lib/system/init/main.go b/lib/system/init/main.go index 4883c5ee..71f2b4c0 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -13,7 +13,11 @@ import ( "fmt" "os" "os/exec" + "path/filepath" + "strings" "sync" + + "github.com/kernel/hypeman/lib/vmconfig" ) func main() { @@ -50,31 +54,44 @@ func main() { dropToShell() } - // Phase 4/5: Run independent setup tasks in parallel. - // Keep strict dependencies around mount -> overlay -> config and - // bind-mount barrier before mode handoff. - var setupWG sync.WaitGroup - if cfg.NetworkEnabled { - setupWG.Add(1) + runNetworkSetup := func() { + if !cfg.NetworkEnabled { + return + } + if err := configureNetwork(log, cfg); err != nil { + log.Error("hypeman-init:network", "failed to configure network", err) + // Continue anyway - network isn't always required + } + } + runVolumesSetup := func() { + if len(cfg.VolumeMounts) == 0 { + return + } + if err := mountVolumes(log, cfg); err != nil { + log.Error("hypeman-init:volumes", "failed to mount volumes", err) + // Continue anyway + } + } + + // Phase 4/5: Run setup tasks. + // Network + volume setup are parallelized only when mounted paths are disjoint + // from /etc, because network setup writes /overlay/newroot/etc/resolv.conf. + if shouldRunNetworkAndVolumesInParallel(cfg) { + var setupWG sync.WaitGroup + setupWG.Add(2) go func() { defer setupWG.Done() - if err := configureNetwork(log, cfg); err != nil { - log.Error("hypeman-init:network", "failed to configure network", err) - // Continue anyway - network isn't always required - } + runNetworkSetup() }() - } - if len(cfg.VolumeMounts) > 0 { - setupWG.Add(1) go func() { defer setupWG.Done() - if err := mountVolumes(log, cfg); err != nil { - log.Error("hypeman-init:volumes", "failed to mount volumes", err) - // Continue anyway - } + runVolumesSetup() }() + setupWG.Wait() + } else { + runNetworkSetup() + runVolumesSetup() } - setupWG.Wait() // Phase 6: Bind mount filesystems to new root if err := bindMountsToNewRoot(log); err != nil { @@ -106,6 +123,22 @@ func main() { } } +func shouldRunNetworkAndVolumesInParallel(cfg *vmconfig.Config) bool { + if !cfg.NetworkEnabled || len(cfg.VolumeMounts) == 0 { + return false + } + + for _, vol := range cfg.VolumeMounts { + // Normalize to an absolute path inside the guest. + mountPath := filepath.Clean("/" + strings.TrimPrefix(vol.Path, "/")) + if mountPath == "/" || mountPath == "/etc" || strings.HasPrefix(mountPath, "/etc/") { + return false + } + } + + return true +} + // dropToShell drops to an interactive shell for debugging when boot fails func dropToShell() { fmt.Fprintln(os.Stderr, "FATAL: dropping to shell for debugging") diff --git a/lib/system/init/main_test.go b/lib/system/init/main_test.go new file mode 100644 index 00000000..6f39988b --- /dev/null +++ b/lib/system/init/main_test.go @@ -0,0 +1,85 @@ +package main + +import ( + "testing" + + "github.com/kernel/hypeman/lib/vmconfig" + "github.com/stretchr/testify/assert" +) + +func TestShouldRunNetworkAndVolumesInParallel(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + cfg *vmconfig.Config + want bool + }{ + { + name: "false when network disabled", + cfg: &vmconfig.Config{ + NetworkEnabled: false, + VolumeMounts: []vmconfig.VolumeMount{ + {Path: "/mnt/data"}, + }, + }, + want: false, + }, + { + name: "false when no volumes", + cfg: &vmconfig.Config{ + NetworkEnabled: true, + }, + want: false, + }, + { + name: "true for disjoint mount paths", + cfg: &vmconfig.Config{ + NetworkEnabled: true, + VolumeMounts: []vmconfig.VolumeMount{ + {Path: "/mnt/data"}, + {Path: "var/lib/app-cache"}, + }, + }, + want: true, + }, + { + name: "false for root mount", + cfg: &vmconfig.Config{ + NetworkEnabled: true, + VolumeMounts: []vmconfig.VolumeMount{ + {Path: "/"}, + }, + }, + want: false, + }, + { + name: "false for etc mount", + cfg: &vmconfig.Config{ + NetworkEnabled: true, + VolumeMounts: []vmconfig.VolumeMount{ + {Path: "/etc"}, + }, + }, + want: false, + }, + { + name: "false for etc subtree mount", + cfg: &vmconfig.Config{ + NetworkEnabled: true, + VolumeMounts: []vmconfig.VolumeMount{ + {Path: "/etc/resolv.conf"}, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, shouldRunNetworkAndVolumesInParallel(tt.cfg)) + }) + } +} From 735631380e57c9578fe1d1d8501cc19b3e3bdc44 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 11:16:03 -0400 Subject: [PATCH 4/5] instances/fork: wait for guest agent when state is initializing --- lib/instances/fork.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 0e1fd13c..108381f8 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -396,7 +396,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe if err != nil { return nil, err } - if inst != nil && inst.State == StateRunning { + if inst != nil && (inst.State == StateRunning || inst.State == StateInitializing) { if err := ensureGuestAgentReadyForForkPhase(ctx, &inst.StoredMetadata, "before returning running fork instance"); err != nil { return nil, fmt.Errorf("wait for forked guest agent readiness: %w", err) } From 16469731449d4067a503bcadbf5a6a59fddb458f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 10 Mar 2026 11:20:21 -0400 Subject: [PATCH 5/5] init: run volume mounts before network in /etc fallback --- lib/system/init/main.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/system/init/main.go b/lib/system/init/main.go index 71f2b4c0..1d10428c 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -89,8 +89,10 @@ func main() { }() setupWG.Wait() } else { - runNetworkSetup() + // When /etc (or /etc/*) is volume-mounted, configure network after volumes + // so resolv.conf is written into the mounted path instead of being hidden. runVolumesSetup() + runNetworkSetup() } // Phase 6: Bind mount filesystems to new root