diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f3752431..eec2948b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,11 +31,29 @@ jobs: - name: Install dependencies run: | set -xe + apt_update_with_retry() { + local attempts=5 + local sleep_seconds=30 + local n=1 + while [ "$n" -le "$attempts" ]; do + if sudo apt-get update; then + return 0 + fi + if [ "$n" -eq "$attempts" ]; then + return 1 + fi + echo "apt-get update failed (attempt ${n}/${attempts}); retrying in ${sleep_seconds}s..." + sleep "$sleep_seconds" + n=$((n + 1)) + done + } if ! command -v mkfs.erofs &> /dev/null || \ ! command -v mkfs.ext4 &> /dev/null || \ - ! command -v iptables &> /dev/null; then - sudo apt-get update - sudo apt-get install -y erofs-utils e2fsprogs iptables + ! command -v iptables &> /dev/null || \ + ! command -v qemu-system-x86_64 &> /dev/null || \ + ! qemu-system-x86_64 --version >/dev/null 2>&1; then + apt_update_with_retry + sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils fi go mod download @@ -43,7 +61,7 @@ jobs: run: | set -euo pipefail TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" - for bin in mkfs.erofs mkfs.ext4 iptables; do + for bin in mkfs.erofs mkfs.ext4 iptables qemu-system-x86_64; do if ! sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin' >/dev/null"; then echo "missing required binary under sudo PATH: $bin" exit 1 diff --git a/cmd/api/api/instances_test.go b/cmd/api/api/instances_test.go index fc40e16d..96d25a38 100644 --- a/cmd/api/api/instances_test.go +++ b/cmd/api/api/instances_test.go @@ -419,7 +419,7 @@ func TestInstanceLifecycle_StopStart(t *testing.T) { // 1. 
Create instance t.Log("Creating instance...") - networkEnabled := true + networkEnabled := false createResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "test-lifecycle", diff --git a/cmd/api/api/registry_test.go b/cmd/api/api/registry_test.go index 9fca3fa5..4f664ddf 100644 --- a/cmd/api/api/registry_test.go +++ b/cmd/api/api/registry_test.go @@ -135,10 +135,12 @@ func TestRegistryPushAndCreateInstance(t *testing.T) { // Create instance with pushed image t.Log("Creating instance with pushed image...") networkEnabled := false + cmd := []string{"sleep", "infinity"} resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "test-pushed-image", Image: imageName, + Cmd: &cmd, Network: &struct { BandwidthDownload *string `json:"bandwidth_download,omitempty"` BandwidthUpload *string `json:"bandwidth_upload,omitempty"` diff --git a/cmd/api/main.go b/cmd/api/main.go index 0f0f4e56..5786c50a 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -196,7 +196,7 @@ func run() error { // Initialize to empty slice (not nil) so cleanup runs even with no running VMs preserveTAPs = []string{} for _, inst := range allInstances { - if inst.State == instances.StateRunning || inst.State == instances.StateUnknown { + if inst.State == instances.StateRunning || inst.State == instances.StateInitializing || inst.State == instances.StateUnknown { preserveTAPs = append(preserveTAPs, inst.Id) } } diff --git a/lib/instances/README.md b/lib/instances/README.md index 51a245ef..5a889d73 100644 --- a/lib/instances/README.md +++ b/lib/instances/README.md @@ -16,7 +16,8 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM **States:** - `Stopped` - No VMM, no snapshot - `Created` - VMM created but not booted (CH native) -- `Running` - VM actively running (CH native) +- `Initializing` - VM is running while guest init is still in progress +- `Running` - 
Guest program start boundary reached and guest-agent readiness observed (unless `skip_guest_agent=true`) - `Paused` - VM paused (CH native) - `Shutdown` - VM shutdown, VMM exists (CH native) - `Standby` - No VMM, snapshot exists (can restore) @@ -63,11 +64,14 @@ Manager orchestrates multiple single-hop state transitions: **CreateInstance:** ``` -Stopped → Created → Running +Stopped → Created → Initializing → Running 1. Start VMM process 2. Create VM config 3. Boot VM -4. Expand memory (if hotplug configured) +4. Wait for guest-agent readiness gate (event-driven, exec mode, unless skipped) +5. Guest program start marker observed +6. Kernel headers setup continues asynchronously (does not gate `Running`) +7. Expand memory (if hotplug configured) ``` **StandbyInstance:** @@ -134,4 +138,3 @@ TestStorageOperations - metadata persistence, directory cleanup - `lib/system` - System manager for kernel/initrd files - `lib/hypervisor` - Hypervisor abstraction for VM operations - System tools: `mkfs.erofs`, `cpio`, `gzip` (Linux); `mkfs.ext4` (macOS) - diff --git a/lib/instances/create.go b/lib/instances/create.go index c9dfe244..74c33010 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -299,6 +299,8 @@ func (m *manager) createInstance( CreatedAt: time.Now(), StartedAt: nil, StoppedAt: nil, + ProgramStartedAt: nil, + GuestAgentReadyAt: nil, KernelVersion: string(kernelVer), HypervisorType: hvType, HypervisorVersion: hvVersion, @@ -406,7 +408,15 @@ func (m *manager) createInstance( return nil, fmt.Errorf("create config disk: %w", err) } - // 17. Save metadata + // 17. Record boot start time before launching the VM so marker hydration + // can safely ignore stale sentinels from prior runs. + if err := m.archiveAppLogForBoot(id); err != nil { + log.WarnContext(ctx, "failed to archive app log before create boot", "instance_id", id, "error", err) + } + bootStart := time.Now().UTC() + stored.StartedAt = &bootStart + + // 18. 
Save metadata log.DebugContext(ctx, "saving instance metadata", "instance_id", id) meta := &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { @@ -414,17 +424,14 @@ func (m *manager) createInstance( return nil, fmt.Errorf("save metadata: %w", err) } - // 18. Start VMM and boot VM + // 19. Start VMM and boot VM log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id) if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil { log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err) return nil, err } - // 19. Update timestamp after VM is running - now := time.Now() - stored.StartedAt = &now - + // 20. Persist runtime metadata updates after VM boot. meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed - log but don't fail @@ -435,14 +442,18 @@ func (m *manager) createInstance( // Success - release cleanup stack (prevent cleanup) cu.Release() + // Return instance with derived state + finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after create", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType) - m.recordStateTransition(ctx, "stopped", string(StateRunning), hvType) + m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), hvType) } - - // Return instance with derived state - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State, "hypervisor", hvType) return &finalInst, nil } diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 2b8d3f09..b54d5b3c 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -50,9 +50,9 @@ func (m 
*manager) deleteInstance( guest.CloseConn(dialer.Key()) } - // 4. If running, try graceful guest shutdown before force kill. + // 4. If active, try graceful guest shutdown before force kill. gracefulShutdown := false - if inst.State == StateRunning { + if inst.State == StateRunning || inst.State == StateInitializing { stopTimeout := resolveStopTimeout(stored) if stopTimeout > deleteGracefulShutdownTimeout { stopTimeout = deleteGracefulShutdownTimeout diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index 4d85dba5..a0ff212c 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -21,26 +21,60 @@ import ( // waitForExecAgent polls until exec-agent is ready func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, timeout time.Duration) error { deadline := time.Now().Add(timeout) + lastState := StateUnknown + var lastErr error + for time.Now().Before(deadline) { + inst, err := mgr.GetInstance(ctx, instanceID) + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue + } + + lastState = inst.State + if inst.State != StateRunning { + time.Sleep(500 * time.Millisecond) + continue + } + meta, err := mgr.loadMetadata(instanceID) - if err == nil { - dialer, derr := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID) - if derr == nil { - var stdout, stderr bytes.Buffer - exit, eerr := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ - Command: []string{"true"}, - Stdout: &stdout, - Stderr: &stderr, - WaitForAgent: 1 * time.Second, - }) - if eerr == nil && exit.Code == 0 { - return nil - } - } + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue } + + dialer, err := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID) + if err != nil { + lastErr = err + time.Sleep(500 * time.Millisecond) + continue + } + + var stdout, stderr bytes.Buffer + exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ 
+ Command: []string{"true"}, + Stdout: &stdout, + Stderr: &stderr, + WaitForAgent: 1 * time.Second, + }) + if err == nil && exit.Code == 0 { + return nil + } + if err != nil { + lastErr = err + } else { + lastErr = fmt.Errorf("unexpected exit code: %d", exit.Code) + } + time.Sleep(500 * time.Millisecond) } - return context.DeadlineExceeded + + if lastErr != nil { + return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s): %w", instanceID, timeout, lastState, lastErr) + } + return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s)", instanceID, timeout, lastState) } // Note: execCommand is defined in network_test.go diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 413dc752..2dd28f95 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -140,7 +140,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) inst, err = mgr.StandbyInstance(ctx, inst.Id) require.NoError(t, err) @@ -149,6 +151,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) inst, err = mgr.StopInstance(ctx, inst.Id) @@ -159,6 +164,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) { // Verify stopped -> start works after standby/restore lifecycle. 
inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) require.NoError(t, mgr.DeleteInstance(ctx, inst.Id)) @@ -189,6 +197,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, inst.State) // Establish a realistic standby/restore lifecycle first. @@ -199,6 +210,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, inst.State) // Simulate stale snapshot residue from a prior failure/interruption. 
@@ -222,6 +236,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) { inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) require.NoError(t, mgr.DeleteInstance(ctx, inst.Id)) @@ -257,6 +274,8 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) { }) require.NoError(t, err) require.NotNil(t, inst) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) alloc, err := mgr.networkManager.GetAllocation(ctx, inst.Id) require.NoError(t, err) @@ -311,6 +330,9 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) { inst, err = mgr.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) allocRestored, err := mgr.networkManager.GetAllocation(ctx, inst.Id) @@ -376,6 +398,8 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { Hypervisor: hypervisor.TypeFirecracker, }) require.NoError(t, err) + source, err = waitForInstanceState(ctx, mgr, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) sourceID := source.Id t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) }) assert.NotEmpty(t, source.IP) @@ -391,6 +415,9 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { TargetState: StateRunning, }) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) + forked, err = waitForInstanceState(ctx, mgr, forked.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, forked.State) forkID := forked.Id 
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) }) @@ -404,6 +431,10 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) { sourceAfterFork, err := mgr.GetInstance(ctx, sourceID) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) assert.NotEmpty(t, sourceAfterFork.IP) assert.NotEmpty(t, sourceAfterFork.MAC) diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 5ce36778..0e1fd13c 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -408,7 +408,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe if err != nil { return nil, err } - if current.State == target { + if current.State == target || (target == StateRunning && current.State == StateInitializing) { return returnWithReadiness(current, nil) } @@ -497,6 +497,14 @@ func cloneStoredMetadataForFork(src StoredMetadata) StoredMetadata { stoppedAt := *src.StoppedAt dst.StoppedAt = &stoppedAt } + if src.ProgramStartedAt != nil { + programStartedAt := *src.ProgramStartedAt + dst.ProgramStartedAt = &programStartedAt + } + if src.GuestAgentReadyAt != nil { + guestAgentReadyAt := *src.GuestAgentReadyAt + dst.GuestAgentReadyAt = &guestAgentReadyAt + } if src.ExitCode != nil { exitCode := *src.ExitCode dst.ExitCode = &exitCode diff --git a/lib/instances/liveness.go b/lib/instances/liveness.go index 96f13a89..92f27dc5 100644 --- a/lib/instances/liveness.go +++ b/lib/instances/liveness.go @@ -42,7 +42,7 @@ func (a *instanceLivenessAdapter) IsInstanceRunning(ctx context.Context, instanc // Consider instance "running" if the VMM is active (any of these states means VM is using the device) switch inst.State { - case StateRunning, StatePaused, StateCreated: + case StateRunning, StateInitializing, StatePaused, StateCreated: return true default: // 
StateStopped, StateStandby, StateShutdown, StateUnknown diff --git a/lib/instances/logs.go b/lib/instances/logs.go index 9fda55b6..5c3c8d86 100644 --- a/lib/instances/logs.go +++ b/lib/instances/logs.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "strconv" + "time" "github.com/kernel/hypeman/lib/logger" ) @@ -164,3 +165,22 @@ func rotateLogIfNeeded(path string, maxBytes int64, maxFiles int) error { return nil } + +// archiveAppLogForBoot moves the current serial console log out of the active +// path before a new boot starts, preventing stale boot markers from prior runs +// from affecting current state derivation. +func (m *manager) archiveAppLogForBoot(id string) error { + logPath := m.paths.InstanceAppLog(id) + if _, err := os.Stat(logPath); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + + archivedPath := fmt.Sprintf("%s.prev.%d", logPath, time.Now().UTC().UnixNano()) + if err := os.Rename(logPath, archivedPath); err != nil { + return err + } + return nil +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 3b581e83..35cc8dc1 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "sync" + "time" "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/hypervisor" @@ -78,8 +79,10 @@ type manager struct { limits ResourceLimits resourceValidator ResourceValidator // Optional validator for aggregate resource limits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks + bootMarkerScans sync.Map // map[string]time.Time next allowed boot-marker rescan hostTopology *HostTopology // Cached host CPU topology metrics *Metrics + now func() time.Time // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter @@ -113,9 +116,11 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste volumeManager: volumeManager, limits: limits, instanceLocks: sync.Map{}, + bootMarkerScans: sync.Map{}, hostTopology: 
detectHostTopology(), // Detect and cache host topology vmStarters: vmStarters, defaultHypervisor: defaultHypervisor, + now: time.Now, } // Initialize metrics if meter is provided @@ -165,6 +170,14 @@ func (m *manager) maybePersistExitInfo(ctx context.Context, id string) { m.persistExitInfo(ctx, id) } +// maybePersistBootMarkers persists boot markers to metadata under lock. +func (m *manager) maybePersistBootMarkers(ctx context.Context, id string) { + lock := m.getInstanceLock(id) + lock.Lock() + defer lock.Unlock() + m.persistBootMarkers(ctx, id) +} + // CreateInstance creates and starts a new instance func (m *manager) CreateInstance(ctx context.Context, req CreateInstanceRequest) (*Instance, error) { // Note: ID is generated inside createInstance, so we can't lock before calling it. @@ -288,16 +301,25 @@ func (m *manager) ListInstances(ctx context.Context, filter *ListInstancesFilter if err != nil { return nil, err } - if filter == nil { - return all, nil + result := all + if filter != nil { + filtered := make([]Instance, 0, len(all)) + for i := range all { + if filter.Matches(&all[i]) { + filtered = append(filtered, all[i]) + } + } + result = filtered } - filtered := make([]Instance, 0, len(all)) - for i := range all { - if filter.Matches(&all[i]) { - filtered = append(filtered, all[i]) + + for i := range result { + inst := result[i] + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) } } - return filtered, nil + + return result, nil } // GetInstance returns an instance by ID, name, or ID prefix. 
@@ -315,6 +337,9 @@ func (m *manager) GetInstance(ctx context.Context, idOrName string) (*Instance, if inst.State == StateStopped && inst.ExitCode != nil { m.maybePersistExitInfo(ctx, inst.Id) } + if (inst.State == StateRunning || inst.State == StateInitializing) && inst.BootMarkersHydrated { + m.maybePersistBootMarkers(ctx, inst.Id) + } return inst, nil } @@ -451,6 +476,7 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst // ListRunningInstancesInfo returns info needed for utilization metrics collection. // Used by the resource manager for VM utilization tracking. +// Includes active VMs in Running or Initializing state. func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) { instances, err := m.listInstances(ctx) if err != nil { @@ -459,8 +485,8 @@ func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.Ins infos := make([]resources.InstanceUtilizationInfo, 0, len(instances)) for _, inst := range instances { - // Only include running instances (they have a hypervisor process) - if inst.State != StateRunning { + // Only include active instances (they have a hypervisor process) + if inst.State != StateRunning && inst.State != StateInitializing { continue } diff --git a/lib/instances/manager_darwin_test.go b/lib/instances/manager_darwin_test.go index d38c24d8..a7557b20 100644 --- a/lib/instances/manager_darwin_test.go +++ b/lib/instances/manager_darwin_test.go @@ -159,7 +159,7 @@ func TestVZBasicLifecycle(t *testing.T) { require.NoError(t, err) } require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeVZ, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -199,7 +199,7 @@ func TestVZBasicLifecycle(t *testing.T) { t.Log("Starting instance (restart after stop)...") inst, err = 
mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance restarted: %s (pid: %v)", inst.Id, inst.HypervisorPID) // Re-read instance to get updated vsock info @@ -323,7 +323,7 @@ func TestVZExecAndShutdown(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance created: %s", inst.Id) t.Cleanup(func() { @@ -439,7 +439,7 @@ func TestVZStandbyAndRestore(t *testing.T) { require.NoError(t, err) } require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeVZ, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -495,7 +495,7 @@ func TestVZStandbyAndRestore(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.False(t, inst.HasSnapshot) t.Log("Instance restored and running") @@ -597,7 +597,7 @@ func TestVZForkFromRunningNetwork(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - require.Equal(t, StateRunning, source.State) + require.Contains(t, []State{StateInitializing, StateRunning}, source.State) require.NotEmpty(t, source.IP) require.NotEmpty(t, source.MAC) @@ -606,6 +606,8 @@ func TestVZForkFromRunningNetwork(t *testing.T) { err = waitForExecAgent(ctx, mgr, sourceID, 30*time.Second) require.NoError(t, err, "source guest agent should be ready") + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 30*time.Second) + require.NoError(t, err) output, exitCode, err := vzExecCommand(ctx, source, "echo", "source-before-fork") 
require.NoError(t, err) @@ -626,13 +628,19 @@ func TestVZForkFromRunningNetwork(t *testing.T) { dumpVZShimLogs(t, tmpDir) require.NoError(t, err) } - require.Equal(t, StateRunning, forked.State) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) require.NotEqual(t, sourceID, forked.Id) forkID := forked.Id t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) }) + forked, err = waitForInstanceState(ctx, mgr, forkID, StateRunning, 30*time.Second) + require.NoError(t, err) sourceAfterFork, err := mgr.GetInstance(ctx, sourceID) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 30*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) require.NotEmpty(t, sourceAfterFork.IP) require.NotEmpty(t, sourceAfterFork.MAC) @@ -640,6 +648,10 @@ func TestVZForkFromRunningNetwork(t *testing.T) { forked, err = mgr.GetInstance(ctx, forkID) require.NoError(t, err) + if forked.State != StateRunning { + forked, err = waitForInstanceState(ctx, mgr, forkID, StateRunning, 30*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, forked.State) require.NotEmpty(t, forked.IP) require.NotEmpty(t, forked.MAC) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 641cd6a9..3003781c 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -110,6 +110,32 @@ func waitForVMReady(ctx context.Context, socketPath string, timeout time.Duratio return fmt.Errorf("VM did not reach running state within %v", timeout) } +// waitForInstanceState polls GetInstance until the expected state is observed or timeout expires. 
+func waitForInstanceState(ctx context.Context, mgr Manager, instanceID string, expected State, timeout time.Duration) (*Instance, error) { + deadline := time.Now().Add(timeout) + lastState := StateUnknown + lastErr := error(nil) + + for time.Now().Before(deadline) { + inst, err := mgr.GetInstance(ctx, instanceID) + if err == nil { + lastState = inst.State + if inst.State == expected { + return inst, nil + } + } else { + lastErr = err + } + + time.Sleep(100 * time.Millisecond) + } + + if lastErr != nil { + return nil, fmt.Errorf("instance %s did not reach %s within %v (last error: %w)", instanceID, expected, timeout, lastErr) + } + return nil, fmt.Errorf("instance %s did not reach %s within %v (last state: %s)", instanceID, expected, timeout, lastState) +} + // waitForLogMessage polls instance logs until the message appears or times out func waitForLogMessage(ctx context.Context, mgr *manager, instanceID, message string, timeout time.Duration) error { deadline := time.Now().Add(timeout) @@ -242,12 +268,8 @@ func TestBasicEndToEnd(t *testing.T) { assert.Empty(t, vol.Attachments, "Volume should not be attached yet") // Initialize network for ingress testing - networkManager := network.NewManager(p, &config.Config{ - DataDir: tmpDir, - Network: newParallelTestNetworkConfig(t), - }, nil) t.Log("Initializing network...") - err = networkManager.Initialize(ctx, nil) + err = manager.networkManager.Initialize(ctx, nil) require.NoError(t, err) t.Log("Network initialized") @@ -282,7 +304,7 @@ func TestBasicEndToEnd(t *testing.T) { assert.NotEmpty(t, inst.Id) assert.Equal(t, "test-nginx", inst.Name) assert.Equal(t, integrationTestImageRef(t, "docker.io/library/nginx:alpine"), inst.Image) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.False(t, inst.HasSnapshot) assert.NotEmpty(t, inst.KernelVersion) @@ -307,6 +329,8 @@ func TestBasicEndToEnd(t *testing.T) { // Wait for VM to be fully running err = 
waitForVMReady(ctx, inst.SocketPath, 5*time.Second) require.NoError(t, err, "VM should reach running state") + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err, "instance should reach Running state") // Get instance retrieved, err := manager.GetInstance(ctx, inst.Id) @@ -751,7 +775,9 @@ func TestBasicEndToEnd(t *testing.T) { t.Log("Testing restart after stop...") restartedInst, err := manager.StartInstance(ctx, inst.Id, StartInstanceRequest{}) require.NoError(t, err, "StartInstance should succeed") - assert.Equal(t, StateRunning, restartedInst.State, "Instance should be Running after restart") + assert.Contains(t, []State{StateInitializing, StateRunning}, restartedInst.State, "Instance should be active after restart") + restartedInst, err = waitForInstanceState(ctx, manager, restartedInst.Id, StateRunning, 20*time.Second) + require.NoError(t, err, "instance should reach Running after restart") // Verify exit info was cleared retrieved, err = manager.GetInstance(ctx, inst.Id) @@ -974,8 +1000,26 @@ func TestOOMExitPropagation(t *testing.T) { if finalInst != nil { assert.Equal(t, StateStopped, finalInst.State) + // Exit metadata may lag the first observed Stopped state by a short window. 
+ if finalInst.ExitCode == nil { + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + got, getErr := manager.GetInstance(ctx, inst.Id) + if getErr == nil { + finalInst = got + if finalInst.ExitCode != nil { + break + } + } + time.Sleep(200 * time.Millisecond) + } + } // Verify exit info shows OOM - require.NotNil(t, finalInst.ExitCode, "ExitCode should be populated after OOM") + if finalInst.ExitCode == nil { + t.Logf("Attempt %d: instance stopped without exit info; retrying", attempt) + _ = manager.DeleteInstance(ctx, inst.Id) + continue + } assert.Equal(t, 137, *finalInst.ExitCode, "OOM kill should result in exit code 137 (SIGKILL)") assert.Contains(t, finalInst.ExitMessage, "OOM", "Exit message should indicate OOM") t.Logf("OOM exit info propagated: code=%d message=%q", *finalInst.ExitCode, finalInst.ExitMessage) @@ -1040,12 +1084,8 @@ func TestEntrypointEnvVars(t *testing.T) { t.Log("System files ready") // Initialize network (needed for loopback interface in guest) - networkManager := network.NewManager(p, &config.Config{ - DataDir: tmpDir, - Network: newParallelTestNetworkConfig(t), - }, nil) t.Log("Initializing network...") - err = networkManager.Initialize(ctx, nil) + err = mgr.networkManager.Initialize(ctx, nil) require.NoError(t, err) t.Log("Network initialized") @@ -1068,7 +1108,9 @@ func TestEntrypointEnvVars(t *testing.T) { inst, err := mgr.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Logf("Instance created: %s", inst.Id) // Helper to run command in guest with retry @@ -1295,7 +1337,9 @@ func TestStandbyAndRestore(t *testing.T) { inst, err := manager.CreateInstance(ctx, req) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, 
[]State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Logf("Instance created: %s", inst.Id) // Wait for VM to be fully running before standby @@ -1337,7 +1381,9 @@ func TestStandbyAndRestore(t *testing.T) { t.Log("Restoring instance...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Log("Instance restored and running") // DEBUG: Check app.log file size after restore @@ -1382,7 +1428,9 @@ func TestStateTransitions(t *testing.T) { shouldFail bool }{ {"Stopped to Created", StateStopped, StateCreated, false}, + {"Created to Initializing", StateCreated, StateInitializing, false}, {"Created to Running", StateCreated, StateRunning, false}, + {"Initializing to Running", StateInitializing, StateRunning, false}, {"Running to Paused", StateRunning, StatePaused, false}, {"Paused to Running", StatePaused, StateRunning, false}, {"Paused to Standby", StatePaused, StateStandby, false}, @@ -1392,7 +1440,9 @@ func TestStateTransitions(t *testing.T) { // Invalid transitions {"Running to Standby", StateRunning, StateStandby, true}, {"Stopped to Running", StateStopped, StateRunning, true}, + {"Stopped to Initializing", StateStopped, StateInitializing, true}, {"Standby to Running", StateStandby, StateRunning, true}, + {"Initializing to Paused", StateInitializing, StatePaused, true}, } for _, tt := range tests { diff --git a/lib/instances/network_test.go b/lib/instances/network_test.go index af7e25c4..4826bad6 100644 --- a/lib/instances/network_test.go +++ b/lib/instances/network_test.go @@ -74,6 +74,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { }) require.NoError(t, err) require.NotNil(t, inst) + 
require.Contains(t, []State{StateInitializing, StateRunning}, inst.State) t.Logf("Instance created: %s", inst.Id) // Wait for VM to be fully ready @@ -117,6 +118,10 @@ func TestCreateInstanceWithNetwork(t *testing.T) { require.NoError(t, err, "Exec agent should be listening") t.Log("Exec agent is ready") + // Standby requires running state; create may still return Initializing. + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) + // Test initial internet connectivity via exec t.Log("Testing initial internet connectivity via exec...") output, exitCode, err := execCommand(ctx, inst, "curl", "-s", "--connect-timeout", "10", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") @@ -155,6 +160,9 @@ func TestCreateInstanceWithNetwork(t *testing.T) { t.Log("Restoring instance from standby...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, StateRunning, inst.State) t.Log("Instance restored and running") diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index d7a03993..6a58abee 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "os" + "os/exec" "path/filepath" "strings" "syscall" @@ -97,6 +98,21 @@ func cleanupOrphanedQEMUProcesses(t *testing.T, mgr *manager) { } } +func requireQEMUAvailable(t *testing.T) { + t.Helper() + + starter := qemu.NewStarter() + binaryPath, err := starter.GetBinaryPath(nil, "") + if err != nil { + t.Skipf("QEMU not available: %v", err) + } + + cmd := exec.Command(binaryPath, "--version") + if out, err := cmd.CombinedOutput(); err != nil { + t.Skipf("QEMU runtime unavailable: %v (output: %s)", err, strings.TrimSpace(string(out))) + } +} + // 
waitForQEMUReady polls QEMU status via QMP until it's running or times out func waitForQEMUReady(ctx context.Context, socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) @@ -283,7 +299,7 @@ func TestQEMUBasicEndToEnd(t *testing.T) { assert.NotEmpty(t, inst.Id) assert.Equal(t, "test-nginx-qemu", inst.Name) assert.Equal(t, integrationTestImageRef(t, "docker.io/library/nginx:alpine"), inst.Image) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType) assert.False(t, inst.HasSnapshot) assert.NotEmpty(t, inst.KernelVersion) @@ -309,6 +325,8 @@ func TestQEMUBasicEndToEnd(t *testing.T) { // Wait for VM to be fully running err = waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second) require.NoError(t, err, "QEMU VM should reach running state") + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err, "instance should reach Running state") // Get instance retrieved, err := manager.GetInstance(ctx, inst.Id) @@ -652,7 +670,9 @@ func TestQEMUEntrypointEnvVars(t *testing.T) { inst, err := mgr.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType, "Instance should use QEMU hypervisor") t.Logf("Instance created: %s", inst.Id) @@ -809,7 +829,9 @@ func TestQEMUStandbyAndRestore(t *testing.T) { inst, err := manager.CreateInstance(ctx, req) require.NoError(t, err) require.NotNil(t, inst) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 
20*time.Second) + require.NoError(t, err) assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType) t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) @@ -843,7 +865,9 @@ func TestQEMUStandbyAndRestore(t *testing.T) { t.Log("Restoring instance...") inst, err = manager.RestoreInstance(ctx, inst.Id) require.NoError(t, err) - assert.Equal(t, StateRunning, inst.State) + assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State) + inst, err = waitForInstanceState(ctx, manager, inst.Id, StateRunning, 20*time.Second) + require.NoError(t, err) t.Log("Instance restored and running") // Wait for VM to be running again @@ -905,6 +929,8 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { }) require.NoError(t, err) t.Cleanup(func() { _ = manager.DeleteInstance(context.Background(), source.Id) }) + source, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) require.NoError(t, waitForQEMUReady(ctx, source.SocketPath, 10*time.Second)) assert.NotEmpty(t, source.IP) @@ -927,12 +953,19 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { sourceAfterFork, err := manager.GetInstance(ctx, source.Id) require.NoError(t, err) + if sourceAfterFork.State != StateRunning { + sourceAfterFork, err = waitForInstanceState(ctx, manager, source.Id, StateRunning, 20*time.Second) + require.NoError(t, err) + } require.Equal(t, StateRunning, sourceAfterFork.State) require.NotEmpty(t, sourceAfterFork.IP) assertHostCanReachNginx(t, sourceAfterFork.IP, 80, 60*time.Second) forked, err = manager.RestoreInstance(ctx, forkedID) require.NoError(t, err) + require.Contains(t, []State{StateInitializing, StateRunning}, forked.State) + forked, err = waitForInstanceState(ctx, manager, forkedID, StateRunning, 20*time.Second) + require.NoError(t, err) require.Equal(t, StateRunning, forked.State) require.NoError(t, waitForQEMUReady(ctx, forked.SocketPath, 10*time.Second)) diff --git a/lib/instances/query.go 
b/lib/instances/query.go index 49d89244..1eaaa074 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -1,25 +1,34 @@ package instances import ( + "bufio" "context" "fmt" "io" "os" "path/filepath" + "slices" "strconv" "strings" + "time" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/logger" ) // exitSentinelPrefix is the machine-parseable prefix written by init to serial console. -const exitSentinelPrefix = "HYPEMAN-EXIT " +const ( + exitSentinelPrefix = "HYPEMAN-EXIT " + programStartSentinelPrefix = "HYPEMAN-PROGRAM-START " + agentReadySentinelPrefix = "HYPEMAN-AGENT-READY " + bootMarkerRescanInterval = 1 * time.Second +) // stateResult holds the result of state derivation type stateResult struct { - State State - Error *string // Non-nil if state couldn't be determined + State State + Error *string // Non-nil if state couldn't be determined + BootMarkersHydrated bool } // deriveState determines instance state by checking socket and querying the hypervisor. @@ -66,7 +75,11 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state case hypervisor.StateCreated: return stateResult{State: StateCreated} case hypervisor.StateRunning: - return stateResult{State: StateRunning} + hydrated := m.hydrateBootMarkersFromLogs(stored) + return stateResult{ + State: deriveRunningState(stored), + BootMarkersHydrated: hydrated, + } case hypervisor.StatePaused: return stateResult{State: StatePaused} case hypervisor.StateShutdown: @@ -82,6 +95,192 @@ func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) state } } +func deriveRunningState(stored *StoredMetadata) State { + if stored.ProgramStartedAt == nil { + return StateInitializing + } + if stored.SkipGuestAgent { + return StateRunning + } + if stored.GuestAgentReadyAt == nil { + return StateInitializing + } + return StateRunning +} + +// hydrateBootMarkersFromLogs fills missing boot markers from serial logs. 
+// Returns true when at least one missing marker was found and populated. +func (m *manager) hydrateBootMarkersFromLogs(stored *StoredMetadata) bool { + needProgram := stored.ProgramStartedAt == nil + needAgent := !stored.SkipGuestAgent && stored.GuestAgentReadyAt == nil + if !needProgram && !needAgent { + m.clearBootMarkerRescan(stored.Id) + return false + } + if !m.shouldScanBootMarkers(stored.Id) { + return false + } + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(stored.Id, needProgram, needAgent, stored.StartedAt) + hydrated := false + if needProgram && programStartedAt != nil { + stored.ProgramStartedAt = programStartedAt + hydrated = true + } + if needAgent && guestAgentReadyAt != nil { + stored.GuestAgentReadyAt = guestAgentReadyAt + hydrated = true + } + if hydrated { + m.clearBootMarkerRescan(stored.Id) + } else { + m.deferBootMarkerRescan(stored.Id) + } + return hydrated +} + +// parseBootMarkers scans app logs (including rotated files) and returns the +// newest observed program-start and guest-agent-ready marker timestamps. +// When startedAt is provided, files last modified before this boot start are ignored. +func (m *manager) parseBootMarkers(id string, needProgram bool, needAgent bool, startedAt *time.Time) (*time.Time, *time.Time) { + logPaths := m.appLogPathsForMarkerScan(id) + + var programStartedAt *time.Time + var guestAgentReadyAt *time.Time + // Iterate newest-to-oldest so we can stop once all required markers are found. 
+ for i := len(logPaths) - 1; i >= 0; i-- { + logPath := logPaths[i] + if !fileMayContainCurrentBootMarkers(logPath, startedAt) { + continue + } + + f, err := os.Open(logPath) + if err != nil { + continue + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + if ts, ok := parseProgramStartSentinelLine(line); ok { + if programStartedAt == nil || ts.After(*programStartedAt) { + t := ts + programStartedAt = &t + } + } + if ts, ok := parseAgentReadySentinelLine(line); ok { + if guestAgentReadyAt == nil || ts.After(*guestAgentReadyAt) { + t := ts + guestAgentReadyAt = &t + } + } + } + scanErr := scanner.Err() + _ = f.Close() + if scanErr != nil { + continue + } + if (!needProgram || programStartedAt != nil) && (!needAgent || guestAgentReadyAt != nil) { + return programStartedAt, guestAgentReadyAt + } + } + + return programStartedAt, guestAgentReadyAt +} + +func fileMayContainCurrentBootMarkers(path string, startedAt *time.Time) bool { + if startedAt == nil { + return true + } + info, err := os.Stat(path) + if err != nil { + return false + } + return !info.ModTime().UTC().Before(startedAt.UTC()) +} + +func (m *manager) shouldScanBootMarkers(id string) bool { + if nextAny, ok := m.bootMarkerScans.Load(id); ok { + if next, ok := nextAny.(time.Time); ok && m.nowUTC().Before(next) { + return false + } + } + return true +} + +func (m *manager) deferBootMarkerRescan(id string) { + m.bootMarkerScans.Store(id, m.nowUTC().Add(bootMarkerRescanInterval)) +} + +func (m *manager) clearBootMarkerRescan(id string) { + m.bootMarkerScans.Delete(id) +} + +func (m *manager) nowUTC() time.Time { + if m.now != nil { + return m.now().UTC() + } + return time.Now().UTC() +} + +// appLogPathsForMarkerScan returns app log paths in chronological order +// (oldest rotated file to newest active file). 
+func (m *manager) appLogPathsForMarkerScan(id string) []string { + base := m.paths.InstanceAppLog(id) + rotatedMatches, err := filepath.Glob(base + ".*") + if err != nil { + return []string{base} + } + matches := append([]string{base}, rotatedMatches...) + + type logPathWithRank struct { + path string + rank int // higher rank means older rotated log; 0 means active file + } + paths := make([]logPathWithRank, 0, len(matches)) + for _, path := range matches { + if path == base { + paths = append(paths, logPathWithRank{path: path, rank: 0}) + continue + } + + suffix := strings.TrimPrefix(path, base) + if !strings.HasPrefix(suffix, ".") { + continue + } + n, err := strconv.Atoi(strings.TrimPrefix(suffix, ".")) + if err != nil || n <= 0 { + continue + } + paths = append(paths, logPathWithRank{path: path, rank: n}) + } + + if len(paths) == 0 { + return []string{base} + } + + slices.SortFunc(paths, func(a, b logPathWithRank) int { + // Rotated logs first (older-to-newer by descending suffix), then active file. + switch { + case a.rank == 0 && b.rank != 0: + return 1 + case a.rank != 0 && b.rank == 0: + return -1 + case a.rank != b.rank: + // Larger suffix is older and should be read first. 
+ return b.rank - a.rank + default: + return strings.Compare(a.path, b.path) + } + }) + + ordered := make([]string, 0, len(paths)) + for _, p := range paths { + ordered = append(ordered, p.path) + } + return ordered +} + // hasSnapshot checks if a snapshot exists for an instance func (m *manager) hasSnapshot(dataDir string) bool { snapshotDir := filepath.Join(dataDir, "snapshots", "snapshot-latest") @@ -105,10 +304,11 @@ func (m *manager) hasSnapshot(dataDir string) bool { func (m *manager) toInstance(ctx context.Context, meta *metadata) Instance { result := m.deriveState(ctx, &meta.StoredMetadata) inst := Instance{ - StoredMetadata: meta.StoredMetadata, - State: result.State, - StateError: result.Error, - HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir), + StoredMetadata: meta.StoredMetadata, + State: result.State, + StateError: result.Error, + HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir), + BootMarkersHydrated: result.BootMarkersHydrated, } // If VM is stopped and exit info isn't persisted yet, populate in-memory @@ -142,7 +342,8 @@ func (m *manager) parseExitSentinel(id string) (int, string, bool) { // Scan lines from the tail looking for the sentinel lines := strings.Split(string(data), "\n") - for _, line := range lines { + for i := len(lines) - 1; i >= 0; i-- { + line := lines[i] code, msg, ok := parseExitSentinelLine(line) if ok { return code, msg, true @@ -180,6 +381,43 @@ func (m *manager) persistExitInfo(ctx context.Context, id string) { } } +// persistBootMarkers parses program-start and guest-agent-ready markers from +// serial logs and persists them to metadata. Must be called under instance lock. 
+func (m *manager) persistBootMarkers(ctx context.Context, id string) { + log := logger.FromContext(ctx) + + meta, err := m.loadMetadata(id) + if err != nil { + return + } + + needProgram := meta.ProgramStartedAt == nil + needAgent := !meta.SkipGuestAgent && meta.GuestAgentReadyAt == nil + if !needProgram && !needAgent { + return + } + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, needProgram, needAgent, meta.StartedAt) + updated := false + if needProgram && programStartedAt != nil { + meta.ProgramStartedAt = programStartedAt + updated = true + } + if needAgent && guestAgentReadyAt != nil { + meta.GuestAgentReadyAt = guestAgentReadyAt + updated = true + } + if !updated { + return + } + + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist boot markers", "instance_id", id, "error", err) + } else { + log.DebugContext(ctx, "persisted boot markers from serial log", "instance_id", id) + } +} + // readTail reads the last n bytes of a file. If the file is smaller than n, // the entire file is returned. 
func readTail(path string, n int64) ([]byte, error) { @@ -261,6 +499,38 @@ func parseExitSentinelLine(line string) (int, string, bool) { return code, "", true } +func parseProgramStartSentinelLine(line string) (time.Time, bool) { + return parseSentinelTimestamp(line, programStartSentinelPrefix) +} + +func parseAgentReadySentinelLine(line string) (time.Time, bool) { + return parseSentinelTimestamp(line, agentReadySentinelPrefix) +} + +func parseSentinelTimestamp(line, sentinelPrefix string) (time.Time, bool) { + line = strings.TrimSpace(line) + + idx := strings.Index(line, sentinelPrefix) + if idx < 0 { + return time.Time{}, false + } + + sentinel := line[idx+len(sentinelPrefix):] + for _, field := range strings.Fields(sentinel) { + if !strings.HasPrefix(field, "ts=") { + continue + } + ts := strings.TrimPrefix(field, "ts=") + parsed, err := time.Parse(time.RFC3339Nano, ts) + if err != nil { + return time.Time{}, false + } + return parsed, true + } + + return time.Time{}, false +} + // listInstances returns all instances func (m *manager) listInstances(ctx context.Context) ([]Instance, error) { log := logger.FromContext(ctx) diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index 0ede8659..5bb7f457 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -1,8 +1,12 @@ package instances import ( + "os" + "path/filepath" "testing" + "time" + "github.com/kernel/hypeman/lib/paths" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -91,3 +95,221 @@ func TestParseExitSentinelLine(t *testing.T) { }) } } + +func TestParseProgramStartSentinelLine(t *testing.T) { + t.Parallel() + + ts := "2026-03-08T15:09:26.123456789Z" + line := "2026-03-08T15:09:26Z [INFO] [hypeman-init:entrypoint] HYPEMAN-PROGRAM-START ts=" + ts + " mode=exec" + + parsed, ok := parseProgramStartSentinelLine(line) + require.True(t, ok) + assert.Equal(t, ts, parsed.UTC().Format(time.RFC3339Nano)) +} + +func 
TestParseAgentReadySentinelLine(t *testing.T) { + t.Parallel() + + ts := "2026-03-08T15:09:26.987654321Z" + line := "2026/03/08 15:09:26 [guest-agent] HYPEMAN-AGENT-READY ts=" + ts + + parsed, ok := parseAgentReadySentinelLine(line) + require.True(t, ok) + assert.Equal(t, ts, parsed.UTC().Format(time.RFC3339Nano)) +} + +func TestDeriveRunningState(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + + tests := []struct { + name string + stored StoredMetadata + want State + }{ + { + name: "initializing when program start marker missing", + stored: StoredMetadata{ + SkipGuestAgent: false, + }, + want: StateInitializing, + }, + { + name: "initializing when guest-agent marker missing", + stored: StoredMetadata{ + ProgramStartedAt: &now, + SkipGuestAgent: false, + }, + want: StateInitializing, + }, + { + name: "running when both markers present", + stored: StoredMetadata{ + ProgramStartedAt: &now, + GuestAgentReadyAt: &now, + SkipGuestAgent: false, + }, + want: StateRunning, + }, + { + name: "running when guest-agent is skipped", + stored: StoredMetadata{ + ProgramStartedAt: &now, + SkipGuestAgent: true, + }, + want: StateRunning, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, deriveRunningState(&tt.stored)) + }) + } +} + +func TestHydrateBootMarkersFromLogs_RescanThrottle(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + now := time.Date(2026, 3, 8, 12, 0, 0, 0, time.UTC) + m.now = func() time.Time { return now } + + meta := &StoredMetadata{ + Id: "test-instance", + SkipGuestAgent: false, + } + + // First call finds nothing and schedules a deferred rescan. 
+ hydrated := m.hydrateBootMarkersFromLogs(meta) + require.False(t, hydrated) + require.Nil(t, meta.ProgramStartedAt) + require.Nil(t, meta.GuestAgentReadyAt) + + logPath := m.paths.InstanceAppLog(meta.Id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + err := os.WriteFile(logPath, []byte( + "HYPEMAN-AGENT-READY ts=2026-03-08T12:00:00Z\n"+ + "HYPEMAN-PROGRAM-START ts=2026-03-08T12:00:01Z mode=exec\n", + ), 0o644) + require.NoError(t, err) + + // Immediate second call should be throttled and skip scanning. + hydrated = m.hydrateBootMarkersFromLogs(meta) + require.False(t, hydrated) + require.Nil(t, meta.ProgramStartedAt) + require.Nil(t, meta.GuestAgentReadyAt) + + // Once the rescan interval has elapsed, markers are hydrated. + now = now.Add(bootMarkerRescanInterval + time.Millisecond) + hydrated = m.hydrateBootMarkersFromLogs(meta) + require.True(t, hydrated) + require.NotNil(t, meta.ProgramStartedAt) + require.NotNil(t, meta.GuestAgentReadyAt) +} + +func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "boot-markers-instance" + logPath := m.paths.InstanceAppLog(id) + rotatedLogPath := logPath + ".1" + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + bootStart := time.Date(2026, 3, 9, 4, 0, 0, 0, time.UTC) + staleProgram := bootStart.Add(-2 * time.Minute) + staleAgent := bootStart.Add(-90 * time.Second) + freshProgram := bootStart.Add(2 * time.Second) + freshAgent := bootStart.Add(3 * time.Second) + + staleData := "" + + "HYPEMAN-PROGRAM-START ts=" + staleProgram.Format(time.RFC3339Nano) + " mode=exec\n" + + "HYPEMAN-AGENT-READY ts=" + staleAgent.Format(time.RFC3339Nano) + "\n" + require.NoError(t, os.WriteFile(rotatedLogPath, []byte(staleData), 0o644)) + require.NoError(t, os.Chtimes(rotatedLogPath, bootStart.Add(-time.Minute), bootStart.Add(-time.Minute))) + + freshData := "" + + "HYPEMAN-PROGRAM-START ts=" 
+ freshProgram.Format(time.RFC3339Nano) + " mode=exec\n" + + "HYPEMAN-AGENT-READY ts=" + freshAgent.Format(time.RFC3339Nano) + "\n" + require.NoError(t, os.WriteFile(logPath, []byte(freshData), 0o644)) + require.NoError(t, os.Chtimes(logPath, bootStart.Add(time.Second), bootStart.Add(time.Second))) + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, true, true, &bootStart) + require.NotNil(t, programStartedAt) + require.NotNil(t, guestAgentReadyAt) + assert.Equal(t, freshProgram.Format(time.RFC3339Nano), programStartedAt.UTC().Format(time.RFC3339Nano)) + assert.Equal(t, freshAgent.Format(time.RFC3339Nano), guestAgentReadyAt.UTC().Format(time.RFC3339Nano)) +} + +func TestParseBootMarkers_ReturnsLatestMarkerFromNewestLog(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "latest-marker-instance" + logPath := m.paths.InstanceAppLog(id) + rotatedLogPath := logPath + ".1" + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + oldProgram := time.Date(2026, 3, 9, 4, 0, 0, 0, time.UTC) + oldAgent := oldProgram.Add(500 * time.Millisecond) + newProgram := oldProgram.Add(3 * time.Second) + newProgramLatest := oldProgram.Add(4 * time.Second) + newAgent := oldProgram.Add(3500 * time.Millisecond) + + require.NoError(t, os.WriteFile(rotatedLogPath, []byte( + "HYPEMAN-PROGRAM-START ts="+oldProgram.Format(time.RFC3339Nano)+" mode=exec\n"+ + "HYPEMAN-AGENT-READY ts="+oldAgent.Format(time.RFC3339Nano)+"\n", + ), 0o644)) + + require.NoError(t, os.WriteFile(logPath, []byte( + "HYPEMAN-PROGRAM-START ts="+newProgram.Format(time.RFC3339Nano)+" mode=exec\n"+ + "HYPEMAN-AGENT-READY ts="+newAgent.Format(time.RFC3339Nano)+"\n"+ + "HYPEMAN-PROGRAM-START ts="+newProgramLatest.Format(time.RFC3339Nano)+" mode=exec\n", + ), 0o644)) + + programStartedAt, guestAgentReadyAt := m.parseBootMarkers(id, true, true, nil) + require.NotNil(t, programStartedAt) + require.NotNil(t, guestAgentReadyAt) + assert.Equal(t, 
newProgramLatest.Format(time.RFC3339Nano), programStartedAt.UTC().Format(time.RFC3339Nano)) + assert.Equal(t, newAgent.Format(time.RFC3339Nano), guestAgentReadyAt.UTC().Format(time.RFC3339Nano)) +} + +func TestAppLogPathsForMarkerScan_IgnoresArchivedLogs(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + id := "log-order-instance" + logPath := m.paths.InstanceAppLog(id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + + for _, p := range []string{ + logPath, + logPath + ".1", + logPath + ".2", + logPath + ".prev.12345", + logPath + "-debug-copy", + } { + require.NoError(t, os.WriteFile(p, []byte("x\n"), 0o644)) + } + + paths := m.appLogPathsForMarkerScan(id) + require.Equal(t, []string{logPath + ".2", logPath + ".1", logPath}, paths) +} diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 1ff09c55..a325ca15 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -225,24 +225,27 @@ func (m *manager) restoreInstance( log.InfoContext(ctx, "deleting snapshot after successful restore", "instance_id", id) os.RemoveAll(snapshotDir) // Best effort, ignore errors - // 9. Update timestamp - now := time.Now() - stored.StartedAt = &now - + // 9. Persist runtime metadata updates without resetting StartedAt. + // Restore resumes an existing boot; preserving StartedAt keeps marker + // hydration scoped to the original boot timeline. 
meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed log.WarnContext(ctx, "failed to update metadata after restore", "instance_id", id, "error", err) } + // Return instance with derived state (should be Running now) + finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after restore", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateStandby), string(StateRunning), stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStandby), string(finalInst.State), stored.HypervisorType) } - - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance restored successfully", "instance_id", id, "state", finalInst.State) return &finalInst, nil } diff --git a/lib/instances/snapshot_integration_scenario_test.go b/lib/instances/snapshot_integration_scenario_test.go index 37e1ae7a..ab30cb56 100644 --- a/lib/instances/snapshot_integration_scenario_test.go +++ b/lib/instances/snapshot_integration_scenario_test.go @@ -3,6 +3,7 @@ package instances import ( "context" "testing" + "time" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" @@ -64,6 +65,10 @@ func runStandbySnapshotScenario(t *testing.T, mgr *manager, tmpDir string, cfg s } }) + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second) + requireNoErr(err) + require.Equal(t, StateRunning, source.State) + _, err = mgr.StandbyInstance(ctx, sourceID) requireNoErr(err) diff --git a/lib/instances/start.go b/lib/instances/start.go index 2e257aeb..39051356 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -50,6 
+50,8 @@ func (m *manager) startInstance( // 2a. Clear stale exit info from previous run and apply command overrides stored.ExitCode = nil stored.ExitMessage = "" + stored.ProgramStartedAt = nil + stored.GuestAgentReadyAt = nil if len(req.Entrypoint) > 0 { stored.Entrypoint = req.Entrypoint } @@ -131,7 +133,14 @@ func (m *manager) startInstance( return nil, fmt.Errorf("create config disk: %w", err) } + if err := m.archiveAppLogForBoot(id); err != nil { + log.WarnContext(ctx, "failed to archive app log before start", "instance_id", id, "error", err) + } + // 6. Start hypervisor and boot VM (reuses logic from create) + bootStart := time.Now().UTC() + stored.StartedAt = &bootStart + log.InfoContext(ctx, "starting hypervisor and booting VM", "instance_id", id) if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil { log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err) @@ -142,23 +151,24 @@ func (m *manager) startInstance( cu.Release() // 7. 
Update metadata (set PID, StartedAt) - now := time.Now() - stored.StartedAt = &now - meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { // VM is running but metadata failed - log but don't fail log.WarnContext(ctx, "failed to update metadata after VM start", "instance_id", id, "error", err) } + // Return instance with derived state (should be Running now) + finalInst := m.toInstance(ctx, meta) + if finalInst.BootMarkersHydrated { + if err := m.saveMetadata(meta); err != nil { + log.WarnContext(ctx, "failed to persist hydrated boot markers after start", "instance_id", id, "error", err) + } + } // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.startDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateStopped), string(StateRunning), stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), stored.HypervisorType) } - - // Return instance with derived state (should be Running now) - finalInst := m.toInstance(ctx, meta) log.InfoContext(ctx, "instance started successfully", "instance_id", id, "state", finalInst.State) return &finalInst, nil } diff --git a/lib/instances/state.go b/lib/instances/state.go index f4bfbf07..ba3d41f9 100644 --- a/lib/instances/state.go +++ b/lib/instances/state.go @@ -7,8 +7,13 @@ import "fmt" var ValidTransitions = map[State][]State{ // Cloud Hypervisor native transitions StateCreated: { - StateRunning, // boot VM - StateShutdown, // shutdown before boot + StateInitializing, // boot VM (guest init in progress) + StateRunning, // boot VM (fast path; markers already available) + StateShutdown, // shutdown before boot + }, + StateInitializing: { + StateRunning, // guest init complete + StateShutdown, // shutdown }, StateRunning: { StatePaused, // pause @@ -68,7 +73,7 @@ func (s State) IsTerminal() bool { // RequiresVMM returns true if this state requires a running VMM process func (s State) RequiresVMM() 
bool { switch s { - case StateCreated, StateRunning, StatePaused, StateShutdown: + case StateCreated, StateInitializing, StateRunning, StatePaused, StateShutdown: return true case StateStopped, StateStandby, StateUnknown: return false diff --git a/lib/instances/stop.go b/lib/instances/stop.go index bc2adc15..2b9abfee 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -132,10 +132,10 @@ func (m *manager) forceKillHypervisorProcess(ctx context.Context, inst *Instance return nil } -// stopInstance gracefully stops a running instance. +// stopInstance gracefully stops an active instance. // Flow: send Shutdown RPC -> wait for VM to power off -> // fall back to hypervisor shutdown -> final SIGKILL if still alive. -// Multi-hop orchestration: Running → Shutdown → Stopped +// Multi-hop orchestration: Running/Initializing → Shutdown → Stopped func (m *manager) stopInstance( ctx context.Context, id string, @@ -162,10 +162,10 @@ func (m *manager) stopInstance( stored := &meta.StoredMetadata log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State) - // 2. Validate state transition (must be Running to stop) - if inst.State != StateRunning { + // 2. Validate state transition (must be active to stop) + if inst.State != StateRunning && inst.State != StateInitializing { log.ErrorContext(ctx, "invalid state for stop", "instance_id", id, "state", inst.State) - return nil, fmt.Errorf("%w: cannot stop from state %s, must be Running", ErrInvalidState, inst.State) + return nil, fmt.Errorf("%w: cannot stop from state %s, must be Running or Initializing", ErrInvalidState, inst.State) } // 3. Get network allocation BEFORE killing VMM (while we can still query it) @@ -240,6 +240,9 @@ func (m *manager) stopInstance( stored.StoppedAt = &now stored.HypervisorPID = nil stored.GPUMdevUUID = "" // Clear mdev UUID since we destroyed it + // Boot markers are per-boot-run and must not carry across stop/restore/start. 
+ stored.ProgramStartedAt = nil + stored.GuestAgentReadyAt = nil meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { @@ -253,7 +256,7 @@ func (m *manager) stopInstance( // Record metrics if m.metrics != nil { m.recordDuration(ctx, m.metrics.stopDuration, start, "success", stored.HypervisorType) - m.recordStateTransition(ctx, string(StateRunning), string(StateStopped), stored.HypervisorType) + m.recordStateTransition(ctx, string(inst.State), string(StateStopped), stored.HypervisorType) } // Return instance with derived state (should be Stopped now) diff --git a/lib/instances/types.go b/lib/instances/types.go index edd85b07..23fc1bcc 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -12,13 +12,14 @@ import ( type State string const ( - StateStopped State = "Stopped" // No VMM, no snapshot - StateCreated State = "Created" // VMM created but not booted (CH native) - StateRunning State = "Running" // VM running (CH native) - StatePaused State = "Paused" // VM paused (CH native) - StateShutdown State = "Shutdown" // VM shutdown, VMM exists (CH native) - StateStandby State = "Standby" // No VMM, snapshot exists - StateUnknown State = "Unknown" // Failed to determine state (VMM query failed) + StateStopped State = "Stopped" // No VMM, no snapshot + StateCreated State = "Created" // VMM created but not booted (CH native) + StateInitializing State = "Initializing" // VM running, guest init in progress + StateRunning State = "Running" // Guest program started and ready + StatePaused State = "Paused" // VM paused (CH native) + StateShutdown State = "Shutdown" // VM shutdown, VMM exists (CH native) + StateStandby State = "Standby" // No VMM, snapshot exists + StateUnknown State = "Unknown" // Failed to determine state (VMM query failed) ) // VolumeAttachment represents a volume attached to an instance @@ -61,6 +62,10 @@ type StoredMetadata struct { StartedAt *time.Time // Last time VM was started StoppedAt *time.Time // Last 
time VM was stopped + // Boot progress markers (derived from guest serial log sentinels and persisted) + ProgramStartedAt *time.Time // Set when guest program handoff/start boundary is reached + GuestAgentReadyAt *time.Time // Set when guest-agent is ready (unless skip_guest_agent=true) + // Versions KernelVersion string // Kernel version (e.g., "ch-v6.12.9") @@ -105,9 +110,10 @@ type Instance struct { StoredMetadata // Derived fields (not stored in metadata.json) - State State // Derived from socket + VMM query - StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown) - HasSnapshot bool // Derived from filesystem check + State State // Derived from socket + VMM query + guest boot markers + StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown) + HasSnapshot bool // Derived from filesystem check + BootMarkersHydrated bool // True when missing boot markers were hydrated from logs in this read } // GetHypervisorType returns the hypervisor type as a string. diff --git a/lib/network/README.md b/lib/network/README.md index c54e66a8..63c6e4d0 100644 --- a/lib/network/README.md +++ b/lib/network/README.md @@ -79,10 +79,12 @@ Hypeman provides a single default network that all instances can optionally conn - Follows instance manager's pattern **Sources of truth:** -- **Running VMs**: Query `GetVmInfo()` from Cloud Hypervisor - returns IP/MAC/TAP +- **Active VMs** (`Running` or `Initializing`): Query `GetVmInfo()` from Cloud Hypervisor - returns IP/MAC/TAP - **Standby VMs**: Read `guests/{id}/snapshots/snapshot-latest/config.json` from snapshot - **Stopped VMs**: No network allocation +`Initializing` is treated as fully VMM-active for networking; startup work such as async kernel-headers setup does not change network allocation behavior. 
+ **Metadata storage:** ``` /var/lib/hypeman/guests/{instance-id}/ @@ -352,4 +354,3 @@ Cleanup happens automatically via `t.Cleanup()`, which runs even on test failure - **Integration tests** (TestInitializeIntegration, TestCreateAllocationIntegration, etc.): Require permissions, create real devices All tests run via `make test` - no separate commands needed. - diff --git a/lib/network/allocate.go b/lib/network/allocate.go index e4934716..07faeaa7 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -14,21 +14,22 @@ import ( // CreateAllocation allocates IP/MAC/TAP for instance on the default network func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*NetworkConfig, error) { + log := logger.FromContext(ctx) + + // Resolve bridge/default network before taking allocation lock so + // self-heal retries don't block other allocation/release operations. + network, err := m.getOrInitDefaultNetwork(ctx) + if err != nil { + return nil, err + } + // Acquire lock to prevent concurrent allocations from: // 1. Picking the same IP address // 2. Creating duplicate instance names m.mu.Lock() defer m.mu.Unlock() - log := logger.FromContext(ctx) - - // 1. Get default network - network, err := m.getDefaultNetworkWithSelfHeal(ctx) - if err != nil { - return nil, fmt.Errorf("get default network: %w", err) - } - - // 2. Check name uniqueness (exclude current instance to allow restarts) + // 1. Check name uniqueness (exclude current instance to allow restarts) exists, err := m.NameExists(ctx, req.InstanceName, req.InstanceID) if err != nil { return nil, fmt.Errorf("check name exists: %w", err) @@ -38,7 +39,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N ErrNameExists, req.InstanceName, network.Name) } - // 3. Allocate random available IP + // 2. Allocate random available IP // Random selection reduces predictability and helps distribute IPs across the subnet. 
// This is especially useful for large /16 networks and reduces conflicts when // moving standby VMs across hosts. @@ -47,16 +48,16 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N return nil, fmt.Errorf("allocate IP: %w", err) } - // 4. Generate MAC (02:00:00:... format - locally administered) + // 3. Generate MAC (02:00:00:... format - locally administered) mac, err := generateMAC() if err != nil { return nil, fmt.Errorf("generate MAC: %w", err) } - // 5. Generate TAP name (tap-{first8chars-of-id}) + // 4. Generate TAP name (tap-{first8chars-of-id}) tap := GenerateTAPName(req.InstanceID) - // 6. Create TAP device with bidirectional rate limiting + // 5. Create TAP device with bidirectional rate limiting if err := m.createTAPDevice(tap, network.Bridge, network.Isolated, req.DownloadBps, req.UploadBps, req.UploadCeilBps); err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } @@ -72,11 +73,11 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N "download_bps", req.DownloadBps, "upload_bps", req.UploadBps) - // 7. Calculate netmask from subnet + // 6. Calculate netmask from subnet _, ipNet, _ := net.ParseCIDR(network.Subnet) netmask := fmt.Sprintf("%d.%d.%d.%d", ipNet.Mask[0], ipNet.Mask[1], ipNet.Mask[2], ipNet.Mask[3]) - // 8. Return config (will be used in CH VmConfig) + // 7. Return config (will be used in CH VmConfig) return &NetworkConfig{ IP: ip, MAC: mac, @@ -105,10 +106,10 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string, dow return nil } - // 2. Get default network details - network, err := m.getDefaultNetworkWithSelfHeal(ctx) + // 2. Get default network details (same self-healing behavior as CreateAllocation). + network, err := m.getOrInitDefaultNetwork(ctx) if err != nil { - return fmt.Errorf("get default network: %w", err) + return err } // 3. 
Recreate TAP device with same name and rate limits from instance metadata @@ -158,6 +159,33 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro return nil } +// getOrInitDefaultNetwork resolves the default network and self-heals by running +// Initialize if bridge state is missing, then retries briefly to absorb netlink propagation delay. +func (m *manager) getOrInitDefaultNetwork(ctx context.Context) (*Network, error) { + network, err := m.getDefaultNetwork(ctx) + if err == nil { + return network, nil + } + + // Self-heal should never delete TAPs for active instances. We pass an empty + // preserve set so CleanupOrphanedTAPs is skipped in Initialize. + if initErr := m.Initialize(ctx, []string{}); initErr != nil { + return nil, fmt.Errorf("initialize network manager: %w", initErr) + } + + const retries = 20 + const retryDelay = 100 * time.Millisecond + for i := 0; i < retries; i++ { + network, err = m.getDefaultNetwork(ctx) + if err == nil { + return network, nil + } + time.Sleep(retryDelay) + } + + return nil, fmt.Errorf("get default network after initialize: %w", err) +} + // allocateNextIP picks a random available IP in the subnet // Retries up to 5 times if conflicts occur func (m *manager) allocateNextIP(ctx context.Context, subnet string) (string, error) { @@ -225,31 +253,6 @@ func (m *manager) allocateNextIP(ctx context.Context, subnet string) (string, er return "", fmt.Errorf("no available IPs in subnet %s after %d random attempts and full scan", subnet, maxRetries) } -func (m *manager) getDefaultNetworkWithSelfHeal(ctx context.Context) (*Network, error) { - network, err := m.getDefaultNetwork(ctx) - if err == nil { - return network, nil - } - - // Self-heal if bridge state was externally removed after initialization. - // After re-initialization, kernel bridge/IP state may take a brief moment to become visible. 
- if initErr := m.Initialize(ctx, nil); initErr != nil { - return nil, err - } - - deadline := time.Now().Add(2 * time.Second) - for { - network, err = m.getDefaultNetwork(ctx) - if err == nil { - return network, nil - } - if time.Now().After(deadline) { - return nil, err - } - time.Sleep(100 * time.Millisecond) - } -} - // incrementIP increments IP address by n func incrementIP(ip net.IP, n int) net.IP { // Ensure we're working with IPv4 (4 bytes) diff --git a/lib/network/bridge_linux.go b/lib/network/bridge_linux.go index 6de12ebf..f3509327 100644 --- a/lib/network/bridge_linux.go +++ b/lib/network/bridge_linux.go @@ -4,6 +4,7 @@ package network import ( "context" + "errors" "fmt" "hash/fnv" "net" @@ -11,12 +12,31 @@ import ( "os/exec" "strings" "syscall" + "time" "github.com/kernel/hypeman/lib/logger" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" ) +const netlinkDumpRetryCount = 3 + +func listBridgeAddrsWithRetry(link netlink.Link) ([]netlink.Addr, error) { + var err error + for i := 0; i < netlinkDumpRetryCount; i++ { + addrs, listErr := netlink.AddrList(link, netlink.FAMILY_V4) + if listErr == nil { + return addrs, nil + } + if !errors.Is(listErr, netlink.ErrDumpInterrupted) { + return nil, listErr + } + err = listErr + time.Sleep(10 * time.Millisecond) + } + return nil, err +} + // checkSubnetConflicts checks if the configured subnet conflicts with existing routes. // Returns an error if a conflict is detected, with guidance on how to resolve it. 
func (m *manager) checkSubnetConflicts(ctx context.Context, subnet string) error { @@ -88,7 +108,7 @@ func (m *manager) createBridge(ctx context.Context, name, gateway, subnet string existing, err := netlink.LinkByName(name) if err == nil { // Bridge exists - verify it has the expected gateway IP - addrs, err := netlink.AddrList(existing, netlink.FAMILY_V4) + addrs, err := listBridgeAddrsWithRetry(existing) if err != nil { return fmt.Errorf("list bridge addresses: %w", err) } diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 34c0ac65..e4290fb0 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -105,13 +105,14 @@ const ( // Defines values for InstanceState. const ( - InstanceStateCreated InstanceState = "Created" - InstanceStatePaused InstanceState = "Paused" - InstanceStateRunning InstanceState = "Running" - InstanceStateShutdown InstanceState = "Shutdown" - InstanceStateStandby InstanceState = "Standby" - InstanceStateStopped InstanceState = "Stopped" - InstanceStateUnknown InstanceState = "Unknown" + InstanceStateCreated InstanceState = "Created" + InstanceStateInitializing InstanceState = "Initializing" + InstanceStatePaused InstanceState = "Paused" + InstanceStateRunning InstanceState = "Running" + InstanceStateShutdown InstanceState = "Shutdown" + InstanceStateStandby InstanceState = "Standby" + InstanceStateStopped InstanceState = "Stopped" + InstanceStateUnknown InstanceState = "Unknown" ) // Defines values for RestoreSnapshotRequestTargetHypervisor. 
@@ -724,7 +725,8 @@ type Instance struct { // State Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) - // - Running: VM is actively running (Cloud Hypervisor native) + // - Initializing: VM is running while guest init is still in progress + // - Running: Guest program has started and instance is ready // - Paused: VM is paused (Cloud Hypervisor native) // - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) // - Stopped: No VMM running, no snapshot exists @@ -762,7 +764,8 @@ type InstanceGPU struct { // InstanceState Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) -// - Running: VM is actively running (Cloud Hypervisor native) +// - Initializing: VM is running while guest init is still in progress +// - Running: Guest program has started and instance is ready // - Paused: VM is paused (Cloud Hypervisor native) // - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) // - Stopped: No VMM running, no snapshot exists @@ -13295,201 +13298,201 @@ func (sh *strictHandler) GetVolume(w http.ResponseWriter, r *http.Request, id st // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+x97XLbOJboq6B0d2vkHUmWP+I43ura69iJ29tx4hvH3rvTylUgEpLQJgE2AMpRUvk7", - "DzCPOE9yCwcAvwRKlGM78SZTUx2ZBPFxcHBwvs/nVsDjhDPClGwdfG7JYEpiDD8PlcLB9IpHaUzekj9T", - "IpV+nAieEKEogUYxT5kaJlhN9V8hkYGgiaKctQ5a51hN0c2UCIJm0AuSU55GIRoRBN+RsNVpkY84TiLS", - "OmhtxkxthljhVqel5ol+JJWgbNL60mkJgkPOorkZZozTSLUOxjiSpFMZ9kx3jbBE+pMufJP1N+I8Ipi1", - "vkCPf6ZUkLB18HtxGe+zxnz0BwmUHvxwhmmERxE5JjMakEUwBKkQhKlhKOiMiEVQHJn30RyNeMpCZNqh", - "NkujCNExYpyRjRIw2IyGVENCN9FDtw6USIkHMiHMaUhDzw4cnSLzGp0eo/aUfCwPsv10tN+q75LhmCx2", - "+msaY9bVwNXTcv1D22Lfr3Z9PVMex+lwIniaLPZ8+ubs7BLBS8TSeEREscf97aw/yhSZEKE7TAI6xGEo", - "iJT+9buXxbn1+/3+Ad4+6Pd7fd8sZ4SFXNSC1Lz2g3SrH5IlXTYCqe1/AaSvr06PTw/RERcJFxi+XRip", - "gthF8BTXVUSb8q748P95SqNwEetH+jERQ8qkwqwGB0/tSw0uPkZqSpD9Dl2dofaYCxSSUTqZUDbZaILv", - 
"mmBFRJFwiNXicDBVZNtQzpCiMZEKx0mr0xpzEeuPWiFWpKvfNBpQELxiON2i0WCLRy01OzmMZV3vrgmi", - "DMU0iqgkAWehLI5BmdrbrV9M4cAQIbiHQr3Qj1FMpMQTgtqabGrazZBUWKUSUYnGmEYkbLRHPkQwi/mD", - "jxANCVN0TMvn26BTF4+Cre0dL+2I8YQMQzqxN1G5+2N4rlFM96MQtPYvRB+0ebN1wJCCjBfHewmkGwYR", - "ZEwE0Tj+lcMlgs8I06dFj/cvMG7rf23mV/SmvZ83AZjnefMvndafKUnJMOGSmhkuUC77RqMRgBrBF/45", - "w6tle13AKKmwWH4+oMUdnEQzv0awuTBNv3RaCk9WfvJOt6nSTiCNdsgSFaglkS9mhHmYpIAzZV+UofOK", - "T1BEGUG2hd0LTRP1AL9EHEjiHcEhA//i4dfzvgXxMg9qetPvOi3C0lgDM+KTIjSnBAs1IiVg1lxhtqN8", - "drXgPy8dn8pdhSUZLqcg55QxEiLd0h5s0xKlEjjVheXDKbqmajgjQnrPHEzrN6qQbVHbVcSD6zGNyHCK", - "5dTMGIchnFccnZdW4uHWSuwvTjQRdB0CFyGR4uji18PtJ3vIDuCBoeSpCMwMFldS+Fp3b9oihcUIR5EX", - "N+rRbf07ehFD/BhwkR2Mursnw0CHmIbStexu6u47rSSVU/MLaLeeFdx9mgxo9Ir07/eeRR8BkTBSQq3M", - "5OcB3yRms9Ek4hqmc5Qy+mdaYrB76FTLCgrpi4KGJOwgDC80ycap4t0JYURoOoXGgsfAbRWYYNQmvUmv", - "gwaaL+xqLriLt7v9frc/aJXZ2Gi3O0lSDQqsFBF6gv/vd9z9dNj9W7/77H3+c9jrvv/rv/gQoCln7rhC", - "u862O/sd5CZbZNerE13Fyt+a+hen76M4ZqtPNZ1Yd6ePThcZB7PWkAfXRPQo34zoSGAx32QTyj4eRFgR", - "qcorX972TmEB61gCBDbRYFoTDBWhB9C4HfEbIgJNgSOiEU92NBGmSnYQ1nIzEC+kb8l/RwFm+iwY5oIL", - "RFiIbqiaIgztytCK512c0C41U211WjH++IqwiZq2DvZ2FvBcI3nb/ui+/zf3aOM/vKgu0oh4kPwtTxVl", - "EwSvza0+pRLlc6CKxCt3xEE3jYDNiyk7NZ9tZTPBQuD51++wW8iynTbCXO1WB7GH838zI0LQ0N2qR2fH", - "qB3Ra2LRHYmUoUHa7+8E0AB+Evsk4HGMWWiebfTQm5gqfZul+SVttEG94nb/3iLBlAOfEUVcLygDdQ0T", - "k8PQ0CHPdh47TYpEVjqHexWDngy29+T8clNTtgRLqaaCp5NpeVaWrK43Hyqvh5QPR4lvTlReo9PNN0gT", - "fRRRDZ2MyG/1+2fPN+Wgpf944v7Y6KFjAzKYvt4/LuzdI6dYEOCAQsQZOjq/RDiKeGDlz7FmVMd0kgoS", - "9ipqD+jddzgIU2KecOpjgCuYkTddRJBuN3+7Bh5sjijblHobusF6cCds9hVs2As2o4KzWLPCMyyopnEl", - "JdTn1us3xy+GL15ftQ70IQrTwGp0zt+8fdc6aO30+/2Wj9PRGLTijJ+cXx7BTun2U66SKJ0MJf3kIcOH", - "2fpQTGIujPhhv0HtaZlKG+4MweYMWjsnzw1ybZ0AXrlNCamE1q4X03EZY7ZPnvuwZTpPiJhR6dNR/Jq9", - "cztfoKmGMJVxWxIxIyJDWsDiXoH3CyKeht3CkJ3WmAoSCKzRrtVp/UlizQTNPmnUyefu+c6vOmh0+a+4", - "1XGUUEaWXOvfyfV6w8V1xHHY3brj25URpfteXOJr86K8vxYnSIYSrc6CKMjCGxqq6TDkN0xP2UNX7RuU", - 
"Nc6I60e9Ehz98+//uDrLedStk1FiKe3W9pOvpLQV2qq79sqf2ULSxL+My8S/iKuzf/79H24l33YRhGn8", - "DEt2HaP+KS/lv6ZETYko3Lhug/UjI0DA58jhS2H4kj6paARaIK58RkSE5wViaefU2uoDxarMSlAF58t+", - "p0nfNdIfryCdujd3MZ9UhZrtvp84eiblmdNzfb4tLW8yk2wiW9tn9uf24pRqZnRNk+FE84JDPMl0XMvM", - "cxfXNEHwRRe+MNsYRebwhqnuGY04V70B+68pYQj2DjaYfCQB0CktxKPD81OJbmgUgUQMhGDxOhiwdwVS", - "YJpLpf8rUtZBo1QhQWKuCLKMJgySwlyg8YiglGFn/+sNWBEqdoFVvLJguSaCkWg4JTgkQjaEjPkI2Y9q", - "gQNLHWOpiDAUOk3K8Dr+7ewCtY/nDMc0QL+ZXs94mEYEXaSJPsMbZeh1BiwRZEYYyCyaqaB2XD5GPFVd", - "Pu4qQYibYgydZToFa5yanZxfWvOm3OgN2FuiAUtYSEKYs7slJFJTrFDI2V/0iSVhudvi+BWg+8/yOrJP", - "pzULkrS8I9vV3XgNBki99hkVKsWRJm8lDs5rjzSWbg+nbgzpRYnBkq0MObEqG5KaCoimZzB7L/KxfjnP", - "MCf1ct4Fw4mcclUr511TFq6al+vkN922lk/J9F7SNr9vViURpJsmE4HBUHuXjMqtpW+AZv1urPDB8Bnb", - "MqgGqVQ8LpjcULuiKKRllWIZWDMedUOsMDB1DTlPM91F83U8N12ZI1J3vw0nI4/2WV9jlKEJneDRXJUl", - "qa2+7yB+rSrEzcW3LXVuIOZgk3Co+HJDOB0j17aJ3QucRoaKD2dj6uk5Y41yLSqVKKj4nFhyo7voJgG1", - "RLqDbqZUM1MSOSAAnb46K2oxegPWhYvlAB1nA2TdZl3qgwkac+iizUVhEhSMH2g030AYXZ310Ltstn+R", - "iGFFZ8T5xUyxRCNCGEqBCSchjA+XZnECqdQ3FVXVz+2NZFxoNkBZw+27HtJCZIzt7a6PQowVDUDhPqKV", - "9YBR1GyUHkmTblbkLRrxAsvcB96SCZVKVJwHUPvty6OdnZ1nVa5w+0m3v9XdevJuq3/Q1///W3M/g7v3", - "EvL1dVimLdaEUaQ+R5enx9uWBS2Poz7t4mf7Hz9i9WyP3shnn+KRmPyxgx/Ej8hPyo5z2wtqp5KIriOT", - "Gqt8FpeCYaPGonJrQ8k92T1yM+6ytgYS73TL+3CQ8pnereF3fRemKsFcabwvLG5hPfqp5gLzU1JQIFkb", - "WUC91sBjKq+fC4KvQ37DPPe2ZsLk0Nxnfs1uqiXr0RyRj5phJyESnKuxNBqkMjO6tft0d39nb3e/3/f4", - "BS0iPA/oMNA3UKMJvDk6RRGeE4HgG9QG0T9Eo4iPyoj+ZGdv/2n/2dZ203kYwbkZHDJe2X2F2hYif3U+", - "pu5NaVLb20/3dnZ2+nt727uNZmXZ+EaTcix/iSV5uvN0d2t/e7cRFHyKiBfOT6vqSxL6lLhJElGjdunK", - "hAR0TAMEnl5If4DaMVxhJNMBlM/kCIdDYdlL792hMI3kUt2xGcy2NG59cRopmkTEvIMNaSTPwMqPoSef", - "Xp4yRsQwc2Nboyfr3bZSV+rWkjVBJS/FEujOqAQuJGeeKInCA3NCV9I52M18Yu/r8MCuoSE2vNKiUzci", - "MxIVkcBcXXqyMRcEZXhiNq20KspmOKLhkLIk9aJELShfpgJ4UdMpwiOeKqO8gQ0rDgK2c5A9xppcN3Pz", - "eMnF9UorpL6JhyJlTHezUu9yGEX8Rm/xtYYN3OIY2a+do0uB6cuULEYVZd9L9NZ8YVRV+eMkVYgyxbV0", - 
"ysLRvAMjkRDaMSSIVBwoKQ6uNYdpu2nKXfr5lteaYXGKcDNeTjsfyArQHRsl7N1K2GJC1FAqrFZyLBpT", - "3kH7C2je2KlBf7hSAdIA7ozcPATQweujq9G2KxlO7gfiy8xyma4hbwS3sKAh6SE4XWAfcF6mlZN2oXiS", - "kDDT//QG7MIcleyRRHEqQed5beCgpoQKxAWd0PLA9tg8gH1vHVR02HRrdCx+uMihwktQitcfejxWRBgI", - "Ogf6ohec3YRWp2Vh3+q0LCUqg8Y99EAkNzovTPHk/HJdK10i+JhGnuWChtm+tZKZs1+92u1fdLf+j7FF", - "a3wDFo0yo5WOeUh6lRgVaN/s5jk5vzyvm1MWIISKs1tYU2ZH8FCOTN3sIGI15AFmaESQlWAc+uuLJRsk", - "572f+XjZscAxGaXjMRHD2KNce6nfI9PAGIwoQ2fPy/ys5pubSs3npc0BsXmMAxvf0Qz6HoVcZRmdAjTf", - "+7frLTHXcJ1XqN4qYdtYx9Aeep2FZKGT80uJctuPR1NX3t5az6Hz6VzSAEemR+PkTVlRwQbI2ZhDPs8/", - "tKpID58ce3lDdxBQezZJUjiGF2+7p2+uNuOQzDqlOYG9Zsojoue9UaAWM+cbmrs5lYjErE7TYRBDNj1A", - "BVhlJ7gxkArn1QMdxRWOhjLiyjObd/olgpeoffXS+O7pGXRQUtpK/bwAhRJ+73lPjKZIdcNewIBVlWnp", - "gHtlx3Iko1GvFJZXGtR3VH4lODIBnGV8zsMM3Mbz6/JG8+uVp9d24hv31LnINPAhPDo7NgxDwJnClBGB", - "YqKwDRctOHwBO9TqtLr6jgoxicGAOf735c5fNSr4DF2WKXGPFqK/7kWBWxO1oIlcNCMhijGjYyKVjVoo", - "jSynePvJ3oGJrQrJePfJXq/XW9db70XuntdoKzaNM1PBca8np1+3D/fglNdkLZ9b54fvfm0dtDZTKTYj", - "HuBoU44oOyj8nf2Zv4Af5s8RZV5nvkbheHS8EIZXNmnqO8s8P9ArYSTIEJKDAL/SxFQjz2jUjOgnEiKv", - "17rCEy2fGIz7Wvf0Wwew5VHUqhC4VrTTNwhio5+Wa0IdYwRt7JgpUzTK4/sWdaC3itCUS4NYFgJYEsKy", - "sJUoMr8Czmb6VPhiWEoE3L37KvvBjRHohiH1YPJ/WWkv1IKXAv/S1eettYmTZDXa+hnFjP41jd2zHvae", - "m+ibU/3b2NjKo7+Z/Oef/1eeP/1j689XV1f/PTv5z+PX9L+vovM3X+VLujy44ptGSNxZUAQYlkqREU1R", - "6QyrwMNQTblUNRC2b5DiKNYf99ARCH4HA9ZFr6giAkcHaNDCCe1ZwPcCHg9aqE0+4kCZrxBnSHdl/cc2", - "9MfnRv2jP/7sZMsv1T5C6ygm7IZkPp0yHYU8xpRtDNiA2b6QW4gEm77+FaIAJyoVRO+e5mGjORoJHOQO", - "YvngHfQZJ8mXjQEDCZd8VEKvIMFCZdFgbgRACjsr4zNgm5MQzXCUEmkl5AHL7iUQ+XUnRkfTy5QgoJuv", - "aFZrgOIVX7goOzju9zuefUS6nd7IiEpFGMq0HVQCoqO281Td75dIxX5/v7+Swc9waAn6wUlYzNXikLLB", - "WTIIDEMbwj2cKpU00KVr2mTOCPr13btzDQb97wVyHeWwyLbYCHk4SSJKpNERqgh4HescvNHyO/zp3W24", - "IKMkg8+iBr6YL2Bg9O7VBVJExJQZWt8ONDjHNNDrA/M/lTLVqEgxOjw6e7HRa5BsBmCbzX/JPr7LVlix", - "HDulWZ0uMMN4Dd8OOj3uaDbNntCcgQO3mpdcoMgQmPxcH6BLScruibBVxqpvdjKa55o3cwMMWhuux6RK", - 
"KQ7Q24xvxNlUsijVHBlcl/m5hG6t4cX4/Cz03inPFbyZrFxkSRt4+GCFrJ0Tru16UrD8+HsgDmees6pO", - "c72zXVSG6sH8qJHv/b1zKzvryqjrhqqVPdMLkQhZtFrzMLP7CNdalNc+UjWsNcIj/dqa3J1UcnWGpliy", - "vyh4WZFNtnaeNkraokdtar4uGq752EwpO1XOzT0zuxqH/2saRcabQdIJwxF6htoXpye/nb56tYG66M2b", - "s+pWLPvCtz8NotYcap+cX0IoGJZDZwGqd3rEueMw+UilkotRAI0Mqcuj5H4tRbJ5wyo27jC8zVmfF5bx", - "EIFr39Kt7/sLmlsa5va1sWqW2b2nULVa4uoL8yrTWfP4boPO7mU6pfAxH30o8gTO5/rWEWOdFvX4mx5K", - "TQJJiE7P80wbuVLKdV9Z07Pt3tbefm+r3+9t9Zuo6GIcLBn77PCo+eD9baOIOMCjgyA8IOOvUBFaxDbM", - "G45u8FyigWOvBy3DzxcY+cKxtSx4I/PrYmDe7eLwqgzFqki7dSLrmoXMLUmXdVFOlNWYR3vyt6/KqUWa", - "3szWdcF+NVxHeU1QwNMo1HzQSJ88I1aR0Ep/kqg8Bxkc1kt2zfgNKy/d6DD1+f0zJWKOrs7OShpvQcY2", - "xVKDhYPLQ80+8GStbdhewSqvnM0to9ceImKtSjULt9Wdx6cVVW7OhdJgaAPVW849es3elJmt0XiyZE0V", - "pUlIZsM09TFF+pULnLi8PD0uIQfGe1v7/f1n3f3R1l53N+xvdfHWzl53+wnuj3eCpzs1SQ6bu73c3pOl", - "fJrrA5UA8KCANHFo4YE+b5kryihVKHNT0wf5SHOXqMDGmrAc0AlYvyHdA9zEgX4TzTNOd+nH51gfavdt", - "An8t/+JimirNMsE3cpoqpP+CKeslWElheReGPhyg1xy+Ec6/k/GqyGGag9/UYvOqeNK2Hj3O8xMGs8Tu", - "AL3MCFxGIi1JbEtifxq6a52SweF6o+T2Zner4MLVaRkQtjotBxlw9Vp0+rIT8cYzFPHGp8wnOAJ6lzvV", - "pIpG9JM5cnrqVCoaGAkNw27WHTubRoCEQ3Pd1pnmjKeGvZKzj9ypvjpDbQgd/CuyApz+ayMz4xWP0O72", - "s91ne0+3n+01ChDIJ7iaGh+BH9Hi5FaS5iBJhy7Za83Sj84v4aLSl6BMYyOR27UX/DETwQPNGVKG8uyx", - "+eDPes+KcREhT0dRQcNjg6jA+b5Jqt8aW9SfNJrR8Zj9+Sm43v5D0Hjr457cHnkFqWwgP9d5WtRKLoho", - "ZNQ1qV/8ruuAUELWRne8JRJWgC6IQoA/XU2w9I2auf9YlHMxIBbiXsTa3dnZ2X/6ZLsRXtnZFQ7OEGTF", - "xVme2RkUjhi0RO23Fxdos4Bwpk/nE5kIIvXiTHCj95whm4KrX3KX1HLKjg9LapibHGts37O4FuRXlmOx", - "i7JABy+mjJtZOOVeaO/s9J/uPtl/0uwYW+loKD4upzC2nbX+CxIQOivtfBs02e8Oz5HuXYxxUJYGtrZ3", - "dp/sPd1fa1ZqrVkpgZmMqVJrTWz/6d6T3Z3trWZhSj5ttQ3AKx3YMu3yHDoPUnh2wwOKRdLbqbstfFzi", - "ouvkUm/N3P2z6uu3jnNvHqBNJfRKC36lqK2ZqCJDWggy3miik/CTSD1OXQp5zS429btd7mZ7jtX0lI35", - "ojliHeHQOi85NXWiGR8JCXNDwigJHe3KpETLS4E7VCQJClNiIWd4I4EtwLExySRYTYFZhQ8pm5QdwRcG", - "bCKymTksD8eHcW3DJtol6XeieSdSgJXRB0uEc3eaRsptKod+qWKxY0EmaYQFqvqWL5mynMcRZddNepfz", - 
"eMQjGiD9QVX0H/Mo4jdD/Ur+AmvZaLQ6/cEwtwZXRHkzOesLYDakMm6+hF/0Kjcqnkhw82+a7zehRkgT", - "ZZ3XRPRSC2/G/fqS0Y8FRC/Hq+5u9+uc1Go6LbmnLbrur0vbLcr6Trzzqj/MMn15TJHG2FORYMt8cGm9", - "vtWCNXGZS94iJ4DaTv/n4oHLcC3E5Ta6iJsZNKuabjebTUmC8ui7+0+e7jUMjP4qVntJFYWvYKxn8RKG", - "umanzppwbftP9p8929l98mx7Lf7IGUVq9qfOMFLcn0pCvwrP9qQP/1trUsYs4p9SjWmkPKFScr5bT+jL", - "kqObB8TUSN3LKhjlO+nE/DID3ozFXcItHZZYrkL+2TYZjwkojoYGbt18MhVHqkZzCHCCA6rmHgkQ34Bv", - "CcqaVAI7GvRemawHpLZvG5unKZdMR7ntvu0GR/9mJLsKLuw3zq8g01GdFPmmOqqRIY0zVljRUDRQEBiM", - "8BnQbzJgohssSxYA/TtQJOwU8gtXTUWmRfMqFA7Xs0IUuRHcF5zkLzpR3P7KdhakjhKTXIX4siu0/ghq", - "jgA8vZoo2D03sscNNljtgFGhD/YCvN1Xw1Ex88nS1DKlNCn5rbv+uM0yIy9+Z26w9ccrWPvX+bCaBALw", - "0c7Bgjzvu1NCiRpsUlysztl3D6HcRod9q2Buq/5+kHhu+/heYrgXtuOi4LLU3EHPfeWvJVYyOu51+zvd", - "/t67rZ2DJ3sHW1v3EU2QGS3qVLlPP23dPI228Xg32p8//XNr+nSyHe94XTTuIVdkJaFtJXWkXUNCRDV9", - "RzXtjSQRZaQrM/PHaqvxkjgho5RL8ByYvCUS2TpigKvKs+TUXpQXWTy8WOXAqSYXfQhnMjv7pbJMdfqn", - "x8unfSt7QnUifgSrTgXwqdlkILpt607TgoJeFU6PF5A1i/KhTMl4XkLi90so2G/24NZRKuvDbWeYp5Jw", - "h8nZF0tYk79eAJSPxC7PdlG5hIxRtJhcJPMfvdtUF+/sltZ5GxcysGw/2SunYDns/s2kXEHD3sHmL3/9", - "3933//Yv/rRbJdFREtENyRg45msy70L0C9LI1SuHqoIndEsqbJNVKYJjoHbBNTHUNcYfi/N90s902fPX", - "OF5YAogaMWXZ3ysX5K8ItYBoxjOkLo9q7IoSV7JbUVPm0cbJo0Jj1CZxouYuFtTp1DfW81Q5zDr0stF3", - "7GXff3YXMYGXS4MAf8AsvkVHIjehlS5EC/tfG3njV8odVx16jebbZiYsO6BW8q1JtaTy6bIq26bcNai1", - "bdTbJK2G/69RWbvOkJGfMlfS1JXWXqWfX2o1LKysMJP6vTFeZF9ZhpxKV3/8liCzSubVYWTGE0dzot1q", - "6kqTXUVQ0FpbABnAahBkhohFa8dyR9gz/DEbAThQLBe4ZVhHoUzUyXPImPTWpTCkY9cFTKNayOP519Vn", - "d1i1uBnLCrY7P0XvwbO0agn1qztbFeTMx+gsrwmvyRwJUkHV/EKTIeuuT7Ag4jA1aAj0CRYBj/PBIZTy", - "yxdQxo89OrkTLY3RAB2enwKWxJjBXY6uzlBExySYBxGxkXALHmwgT785Ou2aEN6s3gKUUFUAEJcC+/D8", - "FDLq2uKlrX5vuwclp3hCGE5o66C109uC/MIaDLDETci8AD+tuU2fQ7j1TkN7Oz83TfRXAsdEQfWL3z1m", - "K0WEyeQgwWEBTwqMTYKpsJxNEoExzUhkVH8Lzr+OwB+YW6JjAI6betFKNbeqRZK8sdv6XqODTDiTZkO3", - "+/1KAV+cZ1rd/EMa21c+biMuw1RTX/SEXWD5HKdjQf6l09rtb601n5XJUX3DXjKcqikX9BOBaT5ZEwi3", - 
"GvSUGXuHq9lFbMP8nAEKFU/Y7+/1fsk0jrGYO3DlsEq4rGPRiEQY0jOOXF3YHrKiH8TuySlPoxBqvyQm", - "Ab0moxgpLHqTTwiLYEpnZMDs7WES3WIBsc0x0reG0VuVj4YZ2uy+ITtEquc8nFegm3W3qbsDbqsM4LUr", - "G2d6k6SmxLGPopvk0DLg3qzYhGGm8lzDJiv0NQH3sjH96O2wkZ+kJniwLQSKEGSx8tsbfgsthH75nRuO", - "s3euwHb5ktMyAmVBlIY5J1AubOxNkWQK9Nrk2dfEwzidQAsLlGKUnLtyGQ+JiXhK5mrKmfmdjlKmUvN7", - "JPiNJEJfzDby2cLaZo61qAtVDGgM0ccmr4oec9NMcfPzNZl/6Q3YYRi7nDm2ZBKOJLdZxY0/KpUoK9wF", - "uFtTC98vLBzZKiMmo28xCaqZJk9VkqoeMgshyoZrQ3PIkSunJBwwxdFnYUoizL9sfs5H/AIcNcGhxpNC", - "E7Okzc80/FI3aznEevVDaOqRSQgAYNDSt8ugpX9PBNYcdSqnCAfgNasfFre0bQ42F8CtbFQhHGCGEp6k", - "keb9AKlMsvRSH5D6AkcRUnCU3LeaB4KdrFmPNbb78jhaS7sxjVaOEWR0LBym/u6+/zxJEgjiE7v/8+LN", - "awRXFdT/hmZ5OCbAiDJ9i2ZlqvTovQF7gYMpMnwTFPYYtGg4aOV1njdgrqm0poBuFxivX6AAvhmmQ8Nf", - "ej3dleHpDtDvn00vB/osJfFQ8WvCBq0vHVR4MaFqmo6yd+/9AK0zWF6UCAFqG9q/4RIX6RUWrkFzb2AW", - "Im5pbTRHGOUUqCjdjyjDYmnWJQ/oLQS1gIknsgiMzwNQ8QxaBwOn5Bm0OoMWYTN4ZjVBg9YXPwRslrB6", - "T3OTeMo2y5For9/fWO1JZOHrYaFLDfXx+7LAfW3fGeNhma5FxsMWo7cxLXoHbUV4YLcegPN5jkOXlOIn", - "i7eCxbPydIF5g++L94BB34gY3XGFA4Pi/Y4DWyqdGLSAoC6QOJzfnxE4qOPgcuQtih9VIXNRrNitO2UB", - "TDFy+Lf7APgH4+Zp+GHcZw81Lo5MwSiXlPpxoSNslkPEjl8iPiHqe8C4/kORUlct5Bvi72PBnxNi+b4c", - "aBVqtgkFOYvqlmqgsiA4lrYX01jLqhcwp+4FYQq9gKc9+6+TeCCu80PEJx8OkAFhxCcoooxI65OR2TD0", - "pWhhCR+ZvIrZdzY1aTDFbEIkapv7859//wdMirLJP//+D81Nm19w3DeNvz+EPn6YEizUiGD14QD9RkjS", - "xRGdEbcYiFUiMyLmaKdvSxDDK0+iUzlgA/aWqFQwmXn663UBTEyHtu6GXg9lKZFIAgihktzYuqAbtadH", - "hHdn2YDyQU90Z0HmsisoLEDfig4HwKeQMqoojqz81fJrz8yaS/qzqgZ3Qae/mr4o8lEZ7O2aCa5JYADE", - "vnMHL+yiUfvi4sVGD4GMYbACwgyAY867scxz7ydNWk2TDEUpExSAsqFNhRz3tfrfY9ummQLY9vgjaYDr", - "kvbXq4CNyoMIEjp4/ZQVmqiD/XBzqmGffvbY1firV9Defr3FIZyfZiNB+O722eHeIsxtscscZN9CBEZt", - "W3ssyzdZqqj5rZD+QW6NQiHW7OpA3GS5fDCx7IizcUQDhbpuLpAWIyaZqFZGkMdCDt7aWSPs1lUN6C3e", - "b5ul+JTamy4LVcmvvPu/PSqDrnON5EHHOa79vElWoc4xlQHX3xawpRvgxGbbNOxLdk6LWLRKIXUMz7Mr", - "Zym7dJyVaLYH8uFUU3bolFXvhgcgiscVgvgNCWElg2AhTP8xYfNltouunvESzdX3hZr9h+OCHlqL5UPz", - 
"x6TGCitg01RwmtWVqkMvW3nqHjfajuBZ+AUR7lSbiZpsdPmyzKcomJLg2izIlt1exhGcusrcTURf09+P", - "JPmakl9rcCwW5D9ZlAbCbg6rZQLuqU2reH/yLYywlnh7d3Zei2AeIIOzychprE3GQiznLNj4oUy9D3Kb", - "VUt7P6KTdJ5GkbN4zIhQeeGz4h2w+Rncklbz9u60Lb0OLt++6hIWcPBDy3yo/EyUq0d0txy+2TCzlJ9o", - "0kQmBFA5xKhnoL9i/427IMqS2//r9kub3v5ft1+aBPf/unNoUtxv3Buy9B+KND80x/2IkU8z3LQMNCBN", - "pmrQKg41a9WQSXXtfyg+1VagW4dTzeD6k1ltwqwWwbWUX82KAd4jx2rrpn0bk0yGbD5owyvnn/iDcaoP", - "q+WzGFkosV8ye9iUk1zktcpsge7H50BJM4wrXhsN1dX5gVx6fTjUPT3u2DJ0pnhcFiDyQMprN48HZ27t", - "uA+vuT6MR3SS8lQWY0+g6iCRNlgpImUC/NjY7vx6rmW8v2Ms7T/k1fHgfPVPvL8njr+6oYZ4GwvUKp7f", - "tWrK89v2UN/PlJ8wsWtvXVkLm0Zlo8ap0BVtaYrGpfpCi86Ovnn5ZBF0qQWVXFxAIEEcDNh/aPnjd0Vw", - "/P4XFyST9vvbe/CcsNn7X1ycDDtzqEKYEpRIhAVBh6+Pwew3geh1SIaWh+RV52FSnJlCzrbG6P84ASm3", - "fDaXkBwW/pSQGklIBXAtl5CyKir3KSKZQb6ZjOTwzQdwm1rjp5T0EFKSTMdjGlDCVJ4BeMFJzCYQf4Sx", - "ZczahwrOHaWLtrGUlJc2Ws6A5mnvHtyxJxv84YUjl2HvcfrIcxMVEzpxJL8M6+WR7w0f+g9LnB9eDnnM", - "KHZSrOLv5/hNgNjYJiD2MwgvubhuinmePJx3joB3z50UV/gd8iZ6epBd5NuzKHB5G996jTRlzuUBDuRC", - "ctVv6dLpIGGFWxMUSdkkK0h5Q9WUpyarytA+NFnZ9Kmw1WSA5Qlsr9+avOjRH4ABfc0VonESkZhA1rau", - "wSaoBJomCRdZ/TEqC6mI1yN/+tgUHWxNchtbsreDbMJmUNZlFURBb7+4XV6qGfHJ6qDabHAXQeqJqh2w", - "S2mSvHwwrPAHlBFZpDiSJCKBQjdTGkwhwlY/g/5NAC5Okg9ZSo2NA3QCJ7WYWQQGb0siKI6gyiOPTIHS", - "D7M4/nCwmAHu6uwMPjLBtSbX24cD5LK+ZReE1K2KEbN6FRGWCr22ccBtjUmCR5HZ0Q/6Fiqsb8PG0uYp", - "TwbMF1fLyI3tkI7Rh0KI7YeaGFtHUF/xifxW/FKnPlGVWYviSADgDG4SFrbqFDs08kfXbvX7vvwpDSN9", - "zTTuOdB3YTKv+CRLklVCZZwkTdHXThOweBbHS3AYtQvJzKUKear+KlVIhICPLXbXITdq48D8ofC1RlRm", - "S5G5dPCAfl71pcla4wWVJqqFfNLmr1kctzotOx9P9dyvj5iudrioZtM7UwiL/slprxPwXCb2hYjnys1h", - "61bUs9y2HMcPL++5+tbfGA2/gX4snwVljlWBvc0Lhz+uyElTqaXKi5nk+b4zkpV6qT8lZaXyRZ6m/3+g", - "iGrWWq3P88BCagZin2RWKm/xzaXTrNrGTwk1k1C5QGFqhqvUu/lhxc6MoKCUlSRPy57eVvbMksxlYIY6", - "hGypQSCneZuf3c/TW7AL3wkl7NRWfalLZ5Qv+nsguTU10RrR3G/EJ9lrtcAgfEMS7KqzPTQFzqCixb2M", - "yn0XZNgcuIwaF2kOVN6nrvDiT2JcUgMaTeltibFjPhd0gQXyTFk3iXAdXbZ8ai0BtlWgfnh5LZdVfnCJ", - 
"LeBCGNcxcEZ7TKGLBZthQfRsJziVpJMdmI6zW1+dnW3UHRqhlh4Z8X0YtG/HOVTKcsahvy6yoKFLUn90", - "dmxT2lOJRMp66E1MIXP8NSEJpKSkPJUIfAB7xXpjdVXQsoJihCkxTzhlauUs8qb3M5kvt0rS/cB0ygZv", - "//BqJVto97ERKaAd+va2C1guVClTZs9rpnNmK8pMZn3NfOART3XvC/XQ0JhGRM6lIrGx2Y3TCA4RpPew", - "2V/td8Z3rYOoklA9vAO+PgkRMZWSciYHbETGmitJiNBjQ8FJGpGC+cFn2bpQOKOa54b0fR+mLSiRBtYc", - "rOqgVq6OhpPEVUfzmU+ygm63ntJLsFUhOY9HPKIBiii7lqgd0WvDg6OZRJH+sbHU2DWE7+46t+3tT5aG", - "9Ckbc2/6P4OzGTL/CBTutELWnDH/0ZG1E1I8LI7+wEb7yZpcSdcEwREUAc3cbFGqaEQ/GVKnO6FS0cDU", - "TMIZ7KDcixmvN2BnRAndBguCAh5FJFBO17CZCB5sDtJ+fydIKMRD7BCYHBC8+tcxjHh0fgntTEmazoDp", - "P6Djd4fniGqYjrEVmQsTtYXt0enmmxXm/wsA0/9gecwscNmx8G/4T8vu+j6UtWdI1hxRniwTgHjywysM", - "LAf3U1vwOLUF4MSeraY9ETgAplhOUxXyG+bXDJgKqXLzs/lxuioUQuFgeuVKRX8f3K6tFrtqGLfAR3Eo", - "7ZpCYtKTfhN9vS3o+0jTOWnAuSUAE1MM6vDfAqZQ+I+G3XdvrCvC8Tu01FmIutS/383Zeuibz87BRfgV", - "4fFYjrnBNLcSKFlZ1D5l4YwrZbMgFYIwBalgctYywAkOqJp3EI5cNVVbHinTIeWF4EeC4Gt90/YG7G0W", - "SGnLM2npquNEKxRSeW16sNJTD72ZESHTUTY5BITJyHkAfFtQNcBRYCqRkvGYBIrOiCkRKmukr2wq95mW", - "Nx/Es9HupQXdYxM5/DgBu5ejhZU6Sp5ytekbLrJWzdI3ZL0WvGEKniJLfZ6HrqGpgr+Oys4z+DWtdYu3", - "r9bzXvtNf9Rw7LKXlH8S9tVXrvJHyYp3UXBOaZr0Icfwx5Z/oTDz0lEtOXitDgRv7NF1nx5WqwLBs8Ef", - "OhD8wuvk88jSUeGS21ZdBPj3hwj9h/UufugI8MeNW5qVkAugq6dEDSLBvwsMvJ8Q8G/sXX+LEPDvyt8T", - "Qni/nd/9d+XpaT0WM0/Pn0He9+ngaSK9IaC1zsHTUD2reV4qKF3ZNs3EJNvjj8TBW2XlGvy7A/vPlG0N", - "RIYCsNwtXCE3QPulRXgSJ2rutFF8DH43eU5BST+B954vcC5TOt9fvNot9LF3hx4OT2u1sT9TvT2YwjfP", - "h316/PjzuxXPXOli2dS3TheLYEpnpXitZSfYgigRpJvwBPSsoQGYhYe7yxQWvcknZLvvDdi7KXF/Ieqy", - "ZZAQhVSQQEVzRJniQBHMGH+RSHAtCcB7LuY+9W3x5L4UPD60q1lxH9ozZZVhuZtfPO+GWOHuzFGbJSq0", - "rzBZneGPNE5jIHiIMnTyHLXJRyVM8gY01pIPouMMpORjQEgoASc3ihPe6tdoNuknMpyMmsxySRqONzbN", - "CQpSqXjs9v70GLVxqnh3QpjeC83qj4GTTQSf0dDkyM2BOuORgepWDUDX1btqpsL6g+fChZncN+FhmlxI", - "k080KZMF4/bYOmiNKMMwuZUJL8pnynjg6vEwBT+4/Ow4zGn9vMKqVbY1JmohxwFRcY4izdFv/LzmHvM1", - "V/RkcHda6bZrlsW0mXNDQ5+D+8hgmjm+PKza+ur7sccXqhI/QtX5LBNI69Tm3xcK9h/ufnhodfnVI/bf", - 
"OiFO+C6oyqED3aMPYV7xAEcoJDMS8STWbKVp2+q0UhG1DlpTpZKDzc1It5tyqQ72+/v91pf3X/5/AAAA", - "//9dM5gHNB0BAA==", + "H4sIAAAAAAAC/+x97XLbOJboq6B0d2vkHUmWP+I42ura68RJ2ttx4hvH3rvTylUgEpLQJgE2AMpRUvk7", + "DzCPOE9yCwcAvwRKlGM78SZTUx2ZBPFxcHBwvs/nVsDjhDPClGwNPrdkMCMxhp9HSuFgdsmjNCZvyZ8p", + "kUo/TgRPiFCUQKOYp0yNEqxm+q+QyEDQRFHOWoPWGVYzdD0jgqA59ILkjKdRiMYEwXckbHVa5COOk4i0", + "Bq3tmKntECvc6rTUItGPpBKUTVtfOi1BcMhZtDDDTHAaqdZggiNJOpVhT3XXCEukP+nCN1l/Y84jglnr", + "C/T4Z0oFCVuD34vLeJ815uM/SKD04EdzTCM8jsgxmdOALIMhSIUgTI1CQedELIPimXkfLdCYpyxEph1q", + "szSKEJ0gxhnZKgGDzWlINSR0Ez10a6BESjyQCWFOIxp6duDZCTKv0ckxas/Ix/Igu4/Hh636LhmOyXKn", + "v6YxZl0NXD0t1z+0Lfb9at/XM+VxnI6mgqfJcs8nb05PLxC8RCyNx0QUezzczfqjTJEpEbrDJKAjHIaC", + "SOlfv3tZnFu/3+8P8O6g3+/1fbOcExZyUQtS89oP0p1+SFZ02Qiktv8lkL6+PDk+OULPuEi4wPDt0kgV", + "xC6Cp7iuItqUd8WH/09TGoXLWD/Wj4kYUSYVZjU4eGJfanDxCVIzgux36PIUtSdcoJCM0+mUsulWE3zX", + "BCsiioQjrJaHg6ki24ZyhhSNiVQ4Tlqd1oSLWH/UCrEiXf2m0YCC4DXD6RaNBls+aqnZyVEs63p3TRBl", + "KKZRRCUJOAtlcQzK1MF+/WIKB4YIwT0U6rl+jGIiJZ4S1NZkU9NuhqTCKpWISjTBNCJhoz3yIYJZzB98", + "jGhImKITWj7fBp26eBzs7O55aUeMp2QU0qm9icrdH8NzjWK6H4WgtX8h+qAtmq0DhhRksjzeCyDdMIgg", + "EyKIxvGvHC4RfE6YPi16vH+BcVv/azu/orft/bwNwDzLm3/ptP5MSUpGCZfUzHCJctk3Go0A1Ai+8M8Z", + "Xq3a6wJGSYXF6vMBLW7hJJr5NYLNuWn6pdNSeLr2k3e6TZV2Amm0Q5aoQC2JfD4nzMMkBZwp+6IMnVd8", + "iiLKCLIt7F5omqgH+CXiQBJvCQ4Z+JcPv573DYiXeVDTm37XaRGWxhqYEZ8WoTkjWKgxKQGz5gqzHeWz", + "qwX/Wen4VO4qLMloNQU5o4yREOmW9mCbliiVwKkuLR9O0RVVozkR0nvmYFq/UYVsi9quIh5cTWhERjMs", + "Z2bGOAzhvOLorLQSD7dWYn9xoomg6xC4CIkUR+e/Hu0+OkB2AA8MJU9FYGawvJLC17p70xYpLMY4iry4", + "UY9um9/Ryxjix4Dz7GDU3T0ZBjrENJSuZXdTd99pJamcmV9Au/Ws4O7TZECjV6R/v/cs+hkQCSMl1MpM", + "fh7wTWI2G00jrmG6QCmjf6YlBruHTrSsoJC+KGhIwg7C8EKTbJwq3p0SRoSmU2gieAzcVoEJRm3Sm/Y6", + "aKj5wq7mgrt4t9vvd/vDVpmNjfa70yTVoMBKEaEn+P9+x91PR92/9btP3uc/R73u+7/+iw8BmnLmjiu0", + "62y7s99BbrJFdr060XWs/I2pf3H6PopjtvpE04lNd/rZyTLjYNYa8uCKiB7l2xEdCywW22xK2cdBhBWR", + "qrzy1W1vFRawjhVAYFMNpg3BUBF6AI3bEb8mItAUOCIa8WRHE2GqZAdhLTcD8UL6lvx3FGCmz4JhLrhA", + 
"hIXomqoZwtCuDK140cUJ7VIz1VanFeOPrwibqllrcLC3hOcaydv2R/f9v7lHW//hRXWRRsSD5G95qiib", + "InhtbvUZlSifA1UkXrsjDrppBGxeTNmJ+WwnmwkWAi++fofdQlbttBHmarc6iD2c/5s5EYKG7lZ9dnqM", + "2hG9IhbdkUgZGqb9/l4ADeAnsU8CHseYhebZVg+9ianSt1maX9JGG9QrbvfvLRLMOPAZUcT1gjJQ1zAx", + "OQwNHfJs57HTpEhkpXO4VzHoyWB7X55dbGvKlmAp1UzwdDorz8qS1c3mQ+XViPLROPHNicordLL9Bmmi", + "jyKqoZMR+Z1+//Tpthy29B+P3B9bPXRsQAbT1/vHhb175AwLAhxQiDhDz84uEI4iHlj5c6IZ1QmdpoKE", + "vYraA3r3HQ7ClFgknPoY4Apm5E2XEaTbzd9ugAfbY8q2pd6GbrAZ3AmbfwUb9pzNqeAs1qzwHAuqaVxJ", + "CfW59frN8fPR89eXrYE+RGEaWI3O2Zu371qD1l6/32/5OB2NQWvO+Muzi2ewU7r9jKskSqcjST95yPBR", + "tj4Uk5gLI37Yb1B7VqbShjtDsDnD1t7Lpwa5dl4CXrlNCamE1q4X03EZY3ZfPvVhy2yREDGn0qej+DV7", + "53a+QFMNYSrjtiRiTkSGtIDFvQLvF0Q8DbuFITutCRUkEFijXavT+pPEmgmaf9Kok8/d851fddDo8l9z", + "q+MooYysuNa/k+v1mouriOOwu3PLtysjSve9vMTX5kV5fy1OkAwlWp0lUZCF1zRUs1HIr5mesoeu2jco", + "a5wR1496JTj659//cXma86g7L8eJpbQ7u4++ktJWaKvu2it/ZgtJE/8yLhL/Ii5P//n3f7iVfNtFEKbx", + "MyzZdYz6p7yU/5oRNSOicOO6DdaPjAABnyOHL4XhS/qkohFoibjyORERXhSIpZ1Ta6cPFKsyK0EVnC/7", + "nSZ9V0h/vIZ06t7cxfyyKtTs9v3E0TMpz5ye6vNtaXmTmWQT2dk9tT93l6dUM6Mrmoymmhcc4Wmm41pl", + "nju/ogmCL7rwhdnGKDKHN0x1z2jMueoN2X/NCEOwd7DB5CMJgE5pIR4dnZ1IdE2jCCRiIATL18GQvSuQ", + "AtNcKv1fkbIOGqcKCRJzRZBlNGGQFOYCjccEpQw7+19vyIpQsQus4pUFyxURjESjGcEhEbIhZMxHyH5U", + "CxxY6gRLRYSh0GlShtfxb6fnqH28YDimAfrN9HrKwzQi6DxN9BneKkOvM2SJIHPCQGbRTAW14/IJ4qnq", + "8klXCULcFGPoLNMpWOPU/OXZhTVvyq3ekL0lGrCEhSSEObtbQiI1wwqFnP1Fn1gSlrstjl8Buv8sbyL7", + "dFrzIEnLO7Jb3Y3XYIDUa59ToVIcafJW4uC89khj6fZw6saQXpQYLNnKkBOrsiGpqYBoegaz9zIf65fz", + "DHNSL+edM5zIGVe1ct4VZeG6eblOftNta/mUTO8lbfO7ZlUSQbppMhUYDLW3yajcWPoGaNbvxhofDJ+x", + "LYNqkErF44LJDbUrikJaVimWgTXnUTfECgNT15DzNNNdNl/HC9OVOSJ199toOvZon/U1Rhma0ikeL1RZ", + "ktrp+w7i16pC3Fx821LnBmIONglHiq82hNMJcm2b2L3AaWSk+Gg+oZ6eM9Yo16JSiYKKz4klN7qLbhJQ", + "S6Q76HpGNTMlkQMC0OnL06IWozdkXbhYBug4GyDrNutSH0zQmEMXbS4Kk6Bg/EDjxRbC6PK0h95ls/2L", + "RAwrOifOL2aGJRoTwlAKTDgJYXy4NIsTSKW+qaiqfm5vJONCswXKGm7f9ZAWImNsb3d9FGKsaAAK9zGt", + 
"rAeMomaj9EiadLMib9GIF1jlPvCWTKlUouI8gNpvXzzb29t7UuUKdx91+zvdnUfvdvqDvv7/35r7Gdy+", + "l5Cvr6MybbEmjCL1eXZxcrxrWdDyOOrTPn5y+PEjVk8O6LV88ikei+kfe/he/Ij8pOw4t72gdiqJ6Doy", + "qbHKZ3EpGDZqLCo3NpTckd0jN+Ouamsg8U63vAsHKZ/p3Rp+N3dhqhLMtcb7wuKW1qOfai4wPyUFBZK1", + "kQXUaw08pvLqqSD4KuTXzHNvayZMjsx95tfsplqyHi8Q+agZdhIiwbmaSKNBKjOjO/uP9w/3DvYP+32P", + "X9AywvOAjgJ9AzWawJtnJyjCCyIQfIPaIPqHaBzxcRnRH+0dHD7uP9nZbToPIzg3g0PGK7uvUNtC5K/O", + "x9S9KU1qd/fxwd7eXv/gYHe/0awsG99oUo7lL7Ekj/ce7+8c7u43goJPEfHc+WlVfUlCnxI3SSJq1C5d", + "mZCATmiAwNML6Q9QO4YrjGQ6gPKZHONwJCx76b07FKaRXKk7NoPZlsatL04jRZOImHewIY3kGVj5MfTk", + "08tTxogYZW5sG/RkvdvW6krdWrImqOSlWALdKZXAheTMEyVRODAndC2dg93MJ/a+Dg/sGhpiwystOnUj", + "MidREQnM1aUnG3NBUIYnZtNKq6JsjiMajihLUi9K1ILyRSqAFzWdIjzmqTLKG9iw4iBgOwfZY6LJdTM3", + "jxdcXK21QuqbeCRSxnQ3a/UuR1HEr/UWX2nYwC2Okf3aOboUmL5MyWJUUfa9RG/NF0ZVlT9OUoUoU1xL", + "pywcLzowEgmhHUOCSMWBkuLgSnOYtpum3KWfb3mtGRanCDfj5bTznqwA3YlRwt6uhC2mRI2kwmotx6Ix", + "5R20P4fmjZ0a9IdrFSAN4M7I9X0AHbw+uhptu5Lh5G4gvsosl+ka8kZwCwsakh6C0wX2AedlWjlp54on", + "CQkz/U9vyM7NUckeSRSnEnSeVwYOakaoQFzQKS0PbI/NPdj3NkFFh003Rsfih8scKrwEpXj9occTRYSB", + "oHOgL3rB2U1odVoW9q1Oy1KiMmjcQw9EcqPz0hRfnl1saqVLBJ/QyLNc0DDbt1Yyc/arV/v98+7O/zG2", + "aI1vwKJRZrTSMQ9JrxKjAu2b3Twvzy7O6uaUBQih4uyW1pTZETyUI1M3O4hYDXmAGRoTZCUYh/76YskG", + "yXnvJz5ediJwTMbpZELEKPYo117o98g0MAYjytDp0zI/q/nmplLzWWlzQGye4MDGdzSDvkchV1lGpwDN", + "9/7tekvMNVznFaq3Stg21jG0h15nIVno5dmFRLntx6OpK29vrefQ2WwhaYAj06Nx8qasqGAD5GzMIZ/l", + "H1pVpIdPjr28oTsIqD2fJikcw/O33ZM3l9txSOad0pzAXjPjEdHz3ipQi7nzDc3dnEpEYl6n6TCIIZse", + "oAKsshPcGEiF8+qBjuIKRyMZceWZzTv9EsFL1L58YXz39Aw6KCltpX5egEIJvw+8J0ZTpLphz2HAqsq0", + "dMC9smM5ktGoVwrLKw3qOyq/EhyZAM4yPudhBm7j+VV5o/nV2tNrO/GNe+JcZBr4ED47PTYMQ8CZwpQR", + "gWKisA0XLTh8ATvU6rS6+o4KMYnBgDn599XOXzUq+AxdVilxny1Ff92JArcmakETuWhOQhRjRidEKhu1", + "UBpZzvDuo4OBia0KyWT/0UGv19vUW+957p7XaCu2jTNTwXGvJ2dftw934JTXZC2fW2dH735tDVrbqRTb", + "EQ9wtC3HlA0Kf2d/5i/gh/lzTJnXma9ROB6dLIXhlU2a+s4yzwd6JYwEGUJyEODXmphq5BmNmhH9RELk", + 
"9VpXeKrlE4NxX+uefuMAtjyKWhUC14p2+gZBbPTTak2oY4ygjR0zZYpGeXzfsg70RhGacmUQy1IAS0JY", + "FrYSReZXwNlcnwpfDEuJgLt3X2U/uDYC3SikHkz+LyvthVrwUuBfuv68tbZxkqxHWz+jmNG/prF71sPe", + "cxN9c6p/ExtbefQ30//88//Ks8d/7Pz56vLyv+cv//P4Nf3vy+jszVf5kq4OrvimERK3FhQBhqVSZERT", + "VDrFKvAwVDMuVQ2E7RukOIr1xz30DAS/wZB10SuqiMDRAA1bOKE9C/hewONhC7XJRxwo8xXiDOmurP/Y", + "lv74zKh/9MefnWz5pdpHaB3FhN2QzKdTpuOQx5iyrSEbMtsXcguRYNPXv0IU4ESlgujd0zxstEBjgYPc", + "QSwfvIM+4yT5sjVkIOGSj0roFSRYqCwazI0ASGFnZXwGbHMSojmOUiKthDxk2b0EIr/uxOhoepkSBHTz", + "Fc1qDVC84gsXZQfHw37Hs49It9MbGVGpCEOZtoNKQHTUdp6qh/0SqTjsH/bXMvgZDq1APzgJy7laHFI2", + "OEsGgWFoQ7hHM6WSBrp0TZvMGUG/vnt3psGg/z1HrqMcFtkWGyEPJ0lEiTQ6QhUBr2Odg7dafoc/vbsN", + "F2SUZPBZ1MAX8zkMjN69OkeKiJgyQ+vbgQbnhAZ6fWD+p1KmGhUpRkfPTp9v9RokmwHYZvNfsY/vshVW", + "LMdOaVanC8wwXsO3g06OO5pNsyc0Z+DAreYFFygyBCY/1wN0IUnZPRG2ylj1zU5Gi1zzZm6AYWvL9ZhU", + "KcUAvc34RpxNJYtSzZHBdZmfS+jWGl6Mz89S753yXMGbycpFlrSBhw9WyNo54dquJwWrj78H4nDmOavq", + "NDc720VlqB7Mjxr53t85t7K3qYy6aaha2TO9EImQRas1DzO7i3CtZXntI1WjWiM80q+tyd1JJZenaIYl", + "+4uClxXZZGfvcaOkLXrUpubrouGaT8yUslPl3Nwzs6tx+L+iUWS8GSSdMhyhJ6h9fvLyt5NXr7ZQF715", + "c1rdilVf+PanQdSaQ+2XZxcQCoblyFmA6p0ece44TD5SqeRyFEAjQ+rqKLlfS5Fs3rCKrVsMb3PW56Vl", + "3Efg2rd06/v+guZWhrl9bayaZXbvKFStlrj6wrzKdNY8vt2gszuZTil8zEcfijyB87m+ccRYp0U9/qZH", + "UpNAEqKTszzTRq6Uct1X1vRkt7dzcNjb6fd7O/0mKroYByvGPj161nzw/q5RRAzweBCEAzL5ChWhRWzD", + "vOHoGi8kGjr2etgy/HyBkS8cW8uCNzK/Lgfm3SwOr8pQrIu02ySyrlnI3Ip0WeflRFmNebRHf/uqnFqk", + "6c1sXRfsV6NNlNcEBTyNQs0HjfXJM2IVCa30J4nKc5DBYb1gV4xfs/LSjQ5Tn98/UyIW6PL0tKTxFmRi", + "Uyw1WDi4PNTsA0822obdNazy2tncMHrtPiLWqlSzcFvdenxaUeXmXCgNhjZQveXco9fsTZnZGo0nK9ZU", + "UZqEZD5KUx9TpF+5wImLi5PjEnJgfLBz2D980j0c7xx098P+Thfv7B10dx/h/mQveLxXk+SwudvLzT1Z", + "yqe5PlAJAA8KSBOHFg70ectcUcapQpmbmj7IzzR3iQpsrAnLAZ3ACaOK4oh+omyquwER3XK51zO9JogJ", + "RpRRpd+ZIF3K9JJBF6I7sc5HA/QS2sIrHEO4kJuElm3KagAcLowaVBMGN3QCf62e8vksVZrtgm/kLFVI", + "/wXL1mCw0sbqLgyNGaDXHL4RzkeU8arYYpqD79Vy86qI07ZeQc57FAazBHOAXmREMiOzlqy2JbE/De22", + 
"js3gtL1Vcp2zO97S2JLvXMErrNMyEG11Wg5Q4D227Edm5+UNkSiios8+QHAEJDT300kVjegnc4r1SqhU", + "NDBCH4bNrTvJNjMBCUfmBq+z9hnnD3vLZx85QnF5itoQjfhXZGVC/ddWZhksnsr93Sf7Tw4e7z45aBRz", + "kE9wPYF/Bq5Jy5NbS+2DJB25/LE1S392dgF3n75XZRobId+uveDimQgeaGaTMpQnpM0Hf9J7Ugy1CHk6", + "jgpKIxuXBf78TbIH15i3/qTRnE4m7M9PwdXuH4LGOx8P5O7YK5tlA/kZ2ZOionNJ6iPjrskm4/eGB4QS", + "sjZg5C2RsAJ0ThQC/OkiHMAlnXkUWZRzYSUW4l7E2t/b2zt8/Gi3EV7Z2RUOzgjEz+VZntoZFI4YtETt", + "t+fnaLuAcKZP52aZCCL14ky8pPecIZvVq1/ywNSiz54PS2r4pRxrbN/zuBbkl5YJsouyQAfHqIxBWjrl", + "Xmjv7fUf7z86fNTsGFuBayQ+rqYwtp11KBAkIHRe2vk2KMffHZ0h3buY4KAsYOzs7u0/Onh8uNGs1Eaz", + "UgIzGVOlNprY4eODR/t7uzvNIp98CnAb01c6sGXa5Tl0HqTw7IYHFMukt1N3W/gYz2VvzJUOoLlHadV9", + "cBN/4Tzmm0rolRZcVVFb82VFHrcQt7zVRM3hJ5F6nLqs9JoDberKu9pz9wyr2Qmb8GULxybypvWHcprv", + "RPNBEnLwhoRREjralQmelrUCD6tIEhSmxELOsEoCW4BjY+VJsJoB/wsfUjYt+5YvDdhECjRzWB3hD+Pa", + "hk0UVtLvl/NOpAAro2KWCOceOo305VSO/ILKcseCTNMIC1R1V18xZbmII8qumvQuF/GYRzRA+oOqNmHC", + "o4hfj/Qr+QusZavR6vQHo9zAXNEOmMlZ9wKzIZVx8yX8ole5VXFugpt/23y/DWVHmuj/vFanF1p2Mh7d", + "F4x+LCB6OQR2f7df5/dW02nJ4205GmBT2m5R1nfinaP+UZY8zGPdNPajilBc5oNL6/WtFgyUq7z8ljkB", + "1HYqRRdiXIZrIdS30UXczEZaVZ672WxLEpRH3z989PigYaz1V7HaKwozfAVjPY9XMNQ1O3XahGs7fHT4", + "5Mne/qMnuxvxR87OUrM/dbaW4v5UcgRWeLZHffjfRpMylhb/lGqsLeUJlfL93XhCX1Yc3TzGpkbqXlUU", + "Kd9JJ+aXGfBmLO4KbumoxHIVUtq2yWRCAkXnZGTg1s0nU/HNajSHACc4oGrhkQDxNbiroKxJJVakQe+V", + "yXpAavu24X6acsl0nLsDtN3g6N+MZFfBhcPGKRtkOq6TIt9URzUypPHvCisaigYKAoMRPpv8dQZMdI1l", + "yaigfweKhJ1CyuKq9cm0aF7YwuF6Vtsit6v74p38dSyK21/ZzoLUUWKSqxBfdYXWH0HNEYDzWBOdvedG", + "9njWBut9Oir0wV6AN/tqNC4mU1mZraaUeSW/dTcft1my5eXvzA22+XgFB4JNPqzmlQB8tHOwIM/77pRQ", + "ogabFBfr0wDeQXS4UWnfKD7casPvJUTcPr6TsPCl7TgveEE19/lzX/nLk5XsmAfd/l63f/BuZ2/w6GCw", + "s3MXAQqZDaNOlfv4087142gXT/ajw8XjP3dmj6e78Z7X6+MO0k9WcuRWslHaNSREVDOCVDPpSBJRRroy", + "M3+sN0SvCD0ySrkEL4DJWyGRbSIGuEI/K07teXmRxcOLVQ6car7S+/BPs7NfKctUp39yvHraN7InVCfi", + "R7DqVACfmk0GAuZ2bjXTKOhV4fR4AVmzKB/KlOzxJSR+v4KC/WYPbh2lsm7hdoZ5dgp3mJx9sYQ1+esl", + 
"QPlI7OoEGpVLyNhIi/lKMpfU282e8c5uaZ0DcyGpy+6jg3JWl6Pu30wWFzTqDbZ/+ev/7r7/t3/xZ/Iq", + "iY6SiG5IJsAxX5FFFwJqkEauXjn6FZyrW1Jhm/9KERwDtQuuiKGuMf5YnO+jfqbLXrzG8dISQNSIKcv+", + "Xrsgf5GpJUQzziZ1qVljV+e4kjCLmsqRNvQeFRqjNokTtXDhpU6nvrWZ88tR1qGXjb5lx/3+k9sIM7xY", + "GVf4AyYGLvomuQmt9Upa2v/aYB6/Uu646iNsNN822WHZp7WSwk2qFcVUVxXuNhW0Qa1tA+mmaTWjwAbF", + "uusMGfkpc446rlr3Ov38SqthYWWFmdTvjXFM+8rK5lS6kuY3BJlVMq+PTDOOOZoT7VazYZqELYKC1toC", + "yABWgyAzRCxbO1b71p7ij9kIwIFiucQtwzoKladePoUkTG9dVkQ6cV3ANKq1QZ5+Xcl3h1XLm7GqBrxz", + "ffQePEurVlC/urNVQc58jM7qMvOazJEgFVQtzjUZshEABAsijlKDhkCfYBHwOB8cojO/fAFl/MSjk3up", + "pTEaoKOzE8CSGDO4y9HlKYrohASLICI2uG7JoQ3k6TfPTromKjgr4QBVWRUAxGXVPjo7gSS9th5qq9/b", + "7UEVK54QhhPaGrT2ejuQsliDAZa4Dckc4Kc1t+lzCLfeSWhv56emif5K4JgoKKjxu8dspYgwySEkOCzg", + "aYGxSTAVlrNJIjCmGYmM6m/Bn9gR+IG5JToG4LipY65UC6taJMkbu63vNTrIhDNpNnS336/UBMZ58tbt", + "P6SxfeXjNuIyTIH2ZefaJZbPcToW5F86rf3+zkbzWZtv1TfsBcOpmnFBPxGY5qMNgXCjQU+YsXe4MmDE", + "NszPGaBQ8YT9/l7vl0zjGIuFA1cOq4TLOhaNSIQh4+PYlZrtISv6QTignPE0CqGcTGJy2msyipHCojf9", + "hLAIZnROhszeHiZ3LhYQLh0jfWsYvVX5aJihze4bskOkesrDRQW6WXfbujvgtsoA3rhYcqY3SWqqJvso", + "usk3LQPuTbRNGGYqT19sEk1fEXAvm9CP3g4b+UlqggfbQqCuQRZ+v7vlt9BCNJnfueE4e+dqdpcvOS0j", + "UBZEaZhzAuVayd6sS6bmr83HfUU8jNNLaGGBUgy8c1cu4yExQVTJQs04M7/TccpUan6PBb+WROiL2QZT", + "W1jbZLQWdaEwAo0hoNmkatFjbpspbn++IosvvSE7CmOXhsdWYcKR5DZRufFHpRJltcAAd2vK6/uFhWe2", + "cIlJElzMq2qmyVOVpKqHzEKIshHg0BzS7soZCYdMcfRZmCoLiy/bn/MRvwBHTXCo8aTQxCxp+zMNv9TN", + "Wo6wXv0ImnpkEgIAGLb07TJs6d9TgTVHncoZwgF4zeqHxS1tm4PNBXArW1UIB5ihhCdppHk/QCqTf73U", + "B2TTwFGEFBwl963mgWAna9Zjje2+1JDW0m5Mo5VjBEkiC4epv3/oP0+SBIL4xO7/PH/zGsFVBSXFoVke", + "4QkwokzfolnlKz16b8ie42CGDN8Ezv/DFg2Hrbx09BbMNZXWFNDtAuP1C9TUN8N0aPhLr6e7MjzdAP3+", + "2fQy0GcpiUeKXxE2bH3poMKLKVWzdJy9e+8HaJ3B8rxECFDb0P4tlwtJr7BwDZp7A7MQcUtrowXCKKdA", + "Rel+TBkWKxM5eUBvIagFTDyVRWB8HoKKZ9gaDJ2SZ9jqDFuEzeGZ1QQNW1/8ELCJx+o9zU0uK9ssR6KD", + "fn9rvSeRha+HhS411MfvyxL3tXtrjIdlupYZD1vf3obJ6B20ReaB3boHzucpDl2ei58s3hoWz8rTBeYN", + 
"vi/eAwZ9I2J0xxUOTAvgkePAVkonBi0gTgwkDuf3ZwQO6ji4HHmL4kdVyFwWK/brTlkAU4wc/u3fA/7B", + "uHlmfxj3yX2NiyNTg8rluX5Y6Aib5RCx45eIXxL1PWBc/75IqStA8g3x96Hgz0ti+b4caBVqtg01Povq", + "lmrssyA4lrYX01jLqucwp+45YQo9h6c9+6+TeCBU9EPEpx8GyIAw4lMUUUak9cnIbBj6UrSwhI9Mqsbs", + "O5vtNJhhNiUStc39+c+//wMmRdn0n3//h+amzS847tvG3x8iIT/MCBZqTLD6MEC/EZJ0cUTnxC0GYpXI", + "nIgF2uvbqsbwypM7VQ7ZkL0lKhVMZp7+el0AE9OhLeWh10NZSiSSAEIoTjexLuhG7ekR4d1ZNqC81xPd", + "WZK57AoKC9C3osMB8CmkJhzUyl8tv/bMrLmkP6tqcJd0+uvpiyIflcHerpnghgQGQOw7d/DCLhq1z8+f", + "b/UQyBgGKyDMADjmvBvLPPd+0qT1NMlQlDJBASgb2lRIm1+r/z22bZopgG2PP5IGuK4OQL0K2Kg8iCCh", + "g9dPWaGJOtgPN6ca9ulnj13ZwHoF7c3XWxzC+Wk2EoRvb58d7i3D3NbPzEH2LURg1LblzLIUlqUind8K", + "6e/l1ijUds2uDsRN4sx7E8uecTaJaKBQ180FsmTEJBPVygjyUMjBWztrhN26qgG9xfttuxSfUnvTZaEq", + "+ZV397dHZdBNrpE86DjHtZ83yTrUOaYy4PrbArZ0A5zYBJ6GfcnOaRGL1imkjuF5duWsZJeOs6rP9kDe", + "n2rKDp2y6t1wD0TxuEIQvyEhrCQlLITpPyRsvsh20ZVIXqG5+r5Qs39/XNB9a7F8aP6Q1FhhBWyaCs6y", + "UlV16GWLWd3hRtsRPAs/J8KdajNRk+AuX5b5FAUzElyZBdlK3qs4ghNX7LuJ6Gv6+5EkX1NFbAOOxYL8", + "J4vSQNjNYbVKwD2xmRrvTr6FETYSb2/PzmsRzANkcDYZO421SYKI5YIFWz+UqfdebrNqtfAHdJLO0ihy", + "Fo85ESqvpVa8A7Y/g1vSet7enbaV18HF21ddwgIOfmiZD5WfiXIljm6XwzcbZpbyE02ayIQAKocY9Qz0", + "V+y/cRdEWb78f919YTPm/+vuC5Mz/1/3jkzW/K07Q5b+fZHm++a4HzDyaYabloEGpMkUIlrHoWatGjKp", + "rv0PxafaonabcKoZXH8yq02Y1SK4VvKrWX3BO+RYbSm2b2OSyZDNB2145fwTfzBO9X61fBYjC1X7S2YP", + "m3KSi7z8ma35/fAcKGmGccVro6G6Oj+QK68Ph7onxx1b2c7Uo8sCRO5Jee3mce/MrR33/jXXR/GYTlOe", + "ymLsCRQyJNIGK0WkTIAfGtudX8+1jPd3jKX9+7w67p2v/on3d8TxVzfUEG9jgVrH87tWTXl+2x5KBppq", + "FCZ27a2rcmHTqGzVOBW6OjBN0bhUsmjZ2dE3L58sgi60oJKLCwgkiMGQ/YeWP35XBMfvf3FBMmm/v3sA", + "zwmbv//FxcmwU4cqhClBiURYEHT0+hjMflOIXodkaHlIXnUeJsWZqQ1ty5b+jxOQcstncwnJYeFPCamR", + "hFQA12oJKauicpcikhnkm8lIDt98ALepNX5KSfchJcl0MqEBJUzlGYCXnMRsAvEHGFvGrH2o4NxRumgb", + "S0l5aaPVDGie9u7eHXuywe9fOHIZ9h6mjzw3UTGhE0fyy7BeHvne8KF/v8T5/uWQh4xihuGvgm6ZEG1P", + "bAJiP4PwgourppjnycN56wh4+9xJcYXfIW+ip0cKVQ6/IYsCl7fxrddIU+Zc7uFALiVX/ZYunQ4SVrg1", + 
"QZGUTfM6l1TNeGqyqozsQ5OVTZ8KW00GWJ7A9vqtyYse/R4Y0NdcIRonEYkJZG3rGmyC4qJpknCR1R+j", + "spCKeDPyp49N0cHWJLexVYA7yCZsBmWd27A26O2Xt8tLNSM+XR9Umw3uIkg9UbVDdiFNkpcPhhX+gDIi", + "ixRHkkQkUOh6RoMZRNjqZ9C/CcDFSfIhS6mx5YqlFjOLwOBtSQTFEVR55JGpV/phHscfBssZ4C5PT+Ej", + "E1xrcr19GCCX9S27IKRuVYyY1auIsFTotY0DbmtMEjyKzI5+0LdQYX1bNpY2T3kyZL64WkaubYd0gj4U", + "Qmw/1MTYOoL6ik/lt+KXOvWJqsxaFEcCAGdwk7CwVafYoZE/unan3/flT2kY6WumcceBvkuTecWnWZKs", + "EirjJGmKvnaagMXzOF6Bw6hdSGYuVchT9VepQiIEfGyxuw65URsH5g+FrzSiMluKzKWDB/Tzqi9N1hov", + "qDRRLeSTNn/N47jVadn5eKrnfn3EdLXDZTWb3plCWPRPTnuTgOcysS9EPFduDlu3op7ltuU4fnh5z5W7", + "/sZo+A30Y/ksKHOsCuxtXkf8YUVOmkotVV7MJM/3nZGs1Ev9KSkrlc/zNP3/A0VUs9ZqfZ57FlIzEPsk", + "s1J5i28unWbVNn5KqJmEygUKUzNcpd7NDyt2ZgQFpawkeVr29KayZ5ZkLgMz1CFkKw0COc3b/ux+ntyA", + "XfhOKGGntupLXTqjfNHfA8mtqYnWiOZ+Iz7JXqsFBuEbkmBXne2+KXAGFS3uZVTuuyDD5sBl1LhIc6Dy", + "PnWFF38S45Ia0GhKb0qMHfO5pAsskGfKukmE6+iy5VNrCbCtAvXDy2u5rPKDS2wBF8K4joEz2kMKXSzY", + "DAuiZzvBqSSd7MB0nN368vR0q+7QCLXyyIjvw6B9M86hUpYzDv11kQUNXZL6Z6fHNqU9lUikrIfexBQy", + "x18RkkBKSspTicAHsFesN1ZXBS0rKEaYEouEU6bWziJvejeT+XKjJN33TKds8PYPr1ayhXYfGpEC2qFv", + "b7uA1UKVMmX2vGY6Z7aizGTW18wHHvNU975UDw1NaETkQioSG5vdJI3gEEF6D5v91X5nfNc6iCoJ1cM7", + "4OuTEBFTKSlncsjGZKK5koQIPTYUnKQRKZgffJatc4UzqnlmSN/3YdqCEmlgzcGqDmrl6mg4SVx1NJ/5", + "JCvoduMpvQBbFZKLeMwjGqCIsiuJ2hG9Mjw4mksU6R9bK41dI/jutnPb3vxkaUifsAn3pv8zOJsh849A", + "4U4qZM0Z8x8cWXtJiofF0R/YaD9Zk2vpmiA4giKgmZstShWN6CdD6nQnVCoamJpJOIMdlHsx4/WG7JQo", + "odtgQVDAo4gEyukathPBg+1h2u/vBQmFeIg9ApMDglf/OoYRn51dQDtTkqYzZPoP6Pjd0RmiGqYTbEXm", + "wkRtYXt0sv1mjfn/HMD0P1geMwtcdSz8G/7Tsru5D2XtGZI1R5QnqwQgnvzwCgPLwf3UFjxMbQE4sWer", + "aU8FDoAplrNUhfya+TUDpkKq3P5sfpysC4VQOJhdulLR3we3a6vFrhvGLfBBHEq7ppCY9KTfRF9vC/o+", + "0HROGnBuCcDEFIM6/LeAKRT+o2H37RvrinD8Di11FqIu9e93c7bu++azc3ARfkV4PJRjbjDNrQRKVha1", + "T1k441rZLEiFIExBKpictQxwggOqFh2EI1dN1ZZHynRIeSH4sSD4St+0vSF7mwVS2vJMWrrqONEKhVRe", + "mR6s9NRDb+ZEyHScTQ4BYTJyHgDfFlQNcBSYSqRkMiGBonNiSoTKGukrm8pdpuXNB/FstHtpQffQRA4/", + 
"TsDu5WhhpY6Sp1xt+obzrFWz9A1ZrwVvmIKnyEqf55FraKrgb6Ky8wx+RWvd4u2rzbzXftMfNRy77CXl", + "n4R99ZWr/FGy4p0XnFOaJn3IMfyh5V8ozLx0VEsOXusDwRt7dN2lh9W6QPBs8PsOBD/3Ovk8sHRUuOS2", + "VRcB/v0hQv9+vYvvOwL8YeOWZiXkEujqKVGDSPDvAgPvJgT8G3vX3yAE/Lvy94QQ3m/nd/9deXpaj8XM", + "0/NnkPddOniaSG8IaK1z8DRUz2qeVwpKl7ZNMzHJ9vgjcfBWWbkB/+7A/jNlWwORoQAsdwtXyA3QfmkR", + "nsSJWjhtFJ+A302eU1DST+C95wucy5TOdxevdgN97O2hh8PTWm3sz1Rv96bwzfNhnxw//PxuxTNXuli2", + "9a3TxSKY0XkpXmvVCbYgSgTpJjwBPWtoAGbh4e4yhUVv+gnZ7ntD9m5G3F+IumwZJEQhFSRQ0QJRpjhQ", + "BDPGXyQSXEsC8J6LhU99Wzy5LwSPj+xq1tyH9kxZZVju5hcvuiFWuDt31GaFCu0rTFan+CON0xgIHqIM", + "vXyK2uSjEiZ5A5poyQfRSQZS8jEgJJSAk1vFCe/0azSb9BMZTcdNZrkiDccbm+YEBalUPHZ7f3KM2jhV", + "vDslTO+FZvUnwMkmgs9paHLk5kCd88hAdacGoJvqXTVTYf3Bc+HCTO6b8DBNLqTpJ5qUyYJxe2wNWmPK", + "MExubcKL8pkyHrh6PEzBDy4/Ow5zWj+vsGqVbY2JWshxQFSco0hz9Fs/r7mHfM0VPRncnVa67ZplMW3m", + "3NDQ5+AuMphmji/3q7a+/H7s8YWqxA9QdT7PBNI6tfn3hYL9+7sf7ltdfvmA/bdeEid8F1Tl0IHu0Ycw", + "r3iAIxSSOYl4Emu20rRtdVqpiFqD1kypZLC9Hel2My7V4LB/2G99ef/l/wcAAP//du3TWocdAQA=", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/system/README.md b/lib/system/README.md index 6455d0a0..2f5c012c 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -70,8 +70,17 @@ It replaces the previous shell-based init script with cleaner logic and structur - ✅ Hand off to systemd via chroot + exec (systemd mode) **Two boot modes:** -- **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. -- **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 +- **Exec mode** (default): Init chroots to container rootfs, starts guest-agent, and waits on an event-driven readiness signal (pipe FD, 10s timeout) before launching the entrypoint. 
When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. +- **Systemd mode** (auto-detected on host): Init injects systemd units (guest-agent plus async kernel-headers worker), emits handoff marker, then execs /sbin/init so systemd becomes PID 1. + +**Boot progress sentinels:** Init and guest-agent emit machine-parseable markers to serial console: +- `HYPEMAN-PROGRAM-START ts=... mode=...` +- `HYPEMAN-AGENT-READY ts=...` +- `HYPEMAN-HEADERS-START ts=...` +- `HYPEMAN-HEADERS-READY ts=...` +- `HYPEMAN-HEADERS-FAILED ts=... error="..."` + +Host state derivation uses these sentinels to report `Initializing` until both required markers are present (or until `skip_guest_agent=true` bypasses the agent marker requirement). **Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to hypervisor shutdown and then force-kills the hypervisor process if still needed. @@ -87,13 +96,16 @@ via `INIT_MODE` in the config disk. ## Kernel Headers -Kernel headers are bundled in the initrd and automatically installed at boot, enabling DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers). +Kernel headers are bundled in the initrd and installed asynchronously after boot handoff, enabling DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers) without delaying `Running`. **Why:** Guest images come with headers for their native kernel (e.g., Ubuntu's 5.15), but hypeman VMs run a custom kernel. Without matching headers, DKMS cannot compile drivers. -**How:** The initrd includes `kernel-headers.tar.gz` from the same release as the kernel. At boot, init extracts headers to `/usr/src/linux-headers-{version}/`, creates the `/lib/modules/{version}/build` symlink, and removes mismatched headers from the guest image. 
+**How:** The initrd includes `kernel-headers.tar.gz` from the same release as the kernel. A background worker (exec mode) or injected systemd oneshot unit (systemd mode) performs installation: +- writes `/run/hypeman/kernel-headers.status` as `pending|running|ready|failed` +- fast-path skips extraction when matching headers + build symlink are already valid +- otherwise extracts headers to `/usr/src/linux-headers-{version}/`, creates `/lib/modules/{version}/build`, and removes mismatched headers from the guest image -**Result:** Guests can `apt install nvidia-driver-xxx` and DKMS builds modules for the running kernel automatically. +**Result:** Guests can `apt install nvidia-driver-xxx` and DKMS builds modules for the running kernel automatically once headers status reaches `ready`. ## Kernel Sources @@ -172,14 +184,14 @@ Files downloaded/built once per version, reused for all instances using that ver ``` lib/system/init/ - main.go # Entry point, orchestrates boot + main.go # Entry point, orchestrates staged boot init.sh # Shell wrapper (mounts /proc, /sys, /dev before Go runtime) mount.go # Mount operations (overlay, bind mounts) config.go # Parse config disk network.go # Network configuration headers.go # Kernel headers setup for DKMS volumes.go # Volume mounting - mode_exec.go # Exec mode: chroot, run entrypoint, wait on guest-agent - mode_systemd.go # Systemd mode: chroot + exec /sbin/init + mode_exec.go # Exec mode: chroot, event-driven agent gate, run entrypoint + mode_systemd.go # Systemd mode: inject units + chroot + handoff marker + exec /sbin/init logger.go # Human-readable logging to hypeman operations log ``` diff --git a/lib/system/guest_agent/main.go b/lib/system/guest_agent/main.go index b0fe125c..46ee3d7a 100644 --- a/lib/system/guest_agent/main.go +++ b/lib/system/guest_agent/main.go @@ -1,7 +1,11 @@ package main import ( + "fmt" "log" + "os" + "path/filepath" + "strconv" "time" pb "github.com/kernel/hypeman/lib/guest" @@ -9,6 +13,12 @@ import ( 
"google.golang.org/grpc" ) +const ( + readySentinelPrefix = "HYPEMAN-AGENT-READY" + defaultReadyFilePath = "/run/hypeman/guest-agent-ready" + readyFDEnv = "HYPEMAN_AGENT_READY_FD" +) + // guestServer implements the gRPC GuestService type guestServer struct { pb.UnimplementedGuestServiceServer @@ -34,6 +44,13 @@ func main() { defer l.Close() log.Println("[guest-agent] listening on vsock port 2222") + log.Printf("[guest-agent] %s ts=%s", readySentinelPrefix, time.Now().UTC().Format(time.RFC3339Nano)) + if err := signalReadyFD(); err != nil { + log.Printf("[guest-agent] warning: failed to signal readiness fd: %v", err) + } + if err := writeReadyFile(); err != nil { + log.Printf("[guest-agent] warning: failed to write readiness file: %v", err) + } // Create gRPC server grpcServer := grpc.NewServer() @@ -44,3 +61,40 @@ func main() { log.Fatalf("[guest-agent] gRPC server failed: %v", err) } } + +func writeReadyFile() error { + path := os.Getenv("HYPEMAN_AGENT_READY_FILE") + if path == "" { + path = defaultReadyFilePath + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + return os.WriteFile(path, []byte(time.Now().UTC().Format(time.RFC3339Nano)+"\n"), 0644) +} + +func signalReadyFD() error { + rawFD := os.Getenv(readyFDEnv) + if rawFD == "" { + return nil + } + + fd, err := strconv.Atoi(rawFD) + if err != nil { + return fmt.Errorf("parse %s: %w", readyFDEnv, err) + } + if fd < 0 { + return fmt.Errorf("invalid %s=%d", readyFDEnv, fd) + } + + f := os.NewFile(uintptr(fd), "guest-agent-ready-fd") + if f == nil { + return fmt.Errorf("open readiness fd %d", fd) + } + defer f.Close() + + if _, err := f.Write([]byte{1}); err != nil { + return fmt.Errorf("write readiness byte: %w", err) + } + return nil +} diff --git a/lib/system/guest_agent/main_test.go b/lib/system/guest_agent/main_test.go new file mode 100644 index 00000000..901aceca --- /dev/null +++ b/lib/system/guest_agent/main_test.go @@ -0,0 +1,34 @@ +package main + +import ( + "fmt" + 
"os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSignalReadyFD(t *testing.T) { + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + t.Setenv(readyFDEnv, fmt.Sprintf("%d", readyWriter.Fd())) + err = signalReadyFD() + require.NoError(t, err) + + buf := make([]byte, 1) + n, err := readyReader.Read(buf) + require.NoError(t, err) + require.Equal(t, 1, n) + assert.Equal(t, byte(1), buf[0]) +} + +func TestSignalReadyFDInvalid(t *testing.T) { + t.Setenv(readyFDEnv, "not-an-int") + err := signalReadyFD() + require.Error(t, err) + assert.Contains(t, err.Error(), "parse") +} diff --git a/lib/system/init/headers.go b/lib/system/init/headers.go index 1cf330f1..937549c2 100644 --- a/lib/system/init/headers.go +++ b/lib/system/init/headers.go @@ -5,20 +5,120 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "syscall" + "time" ) const ( - // Paths in the overlay filesystem - newrootLibModules = "/overlay/newroot/lib/modules" - newrootUsrSrc = "/overlay/newroot/usr/src" - headersTarball = "/kernel-headers.tar.gz" + headersWorkerArg = "--headers-worker" + headersWorkerGuestArg = "--headers-worker-guest" + + headersStatusPending = "pending" + headersStatusRunning = "running" + headersStatusReady = "ready" + headersStatusFailed = "failed" +) + +type kernelHeadersPaths struct { + libModulesDir string + usrSrcDir string + tarballPath string + statusPath string +} + +var ( + initrdKernelHeadersPaths = kernelHeadersPaths{ + libModulesDir: "/overlay/newroot/lib/modules", + usrSrcDir: "/overlay/newroot/usr/src", + tarballPath: "/kernel-headers.tar.gz", + statusPath: "/overlay/newroot/run/hypeman/kernel-headers.status", + } + guestKernelHeadersPaths = kernelHeadersPaths{ + libModulesDir: "/lib/modules", + usrSrcDir: "/usr/src", + tarballPath: "/opt/hypeman/kernel-headers.tar.gz", + statusPath: "/run/hypeman/kernel-headers.status", + } ) 
+func startKernelHeadersWorkerAsync(log *Logger) { + if err := writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusPending); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + + cmd := exec.Command("/proc/self/exe", headersWorkerArg) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + log.Error("hypeman-init:headers", "failed to start async headers worker", err) + _ = writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusFailed) + log.Info("hypeman-init:headers", formatHeadersFailedSentinel(err)) + return + } + + log.Info("hypeman-init:headers", fmt.Sprintf("started async headers worker (pid %d)", cmd.Process.Pid)) +} + +func runKernelHeadersWorker(log *Logger, paths kernelHeadersPaths) { + log.Info("hypeman-init:headers", formatHeadersSentinel("START")) + if err := writeKernelHeadersStatus(paths.statusPath, headersStatusRunning); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + if err := lowerKernelHeadersWorkerPriority(log); err != nil { + log.Info("hypeman-init:headers", "warning: failed to lower worker priority: "+err.Error()) + } + + if err := setupKernelHeaders(log, paths); err != nil { + log.Error("hypeman-init:headers", "kernel headers setup failed", err) + _ = writeKernelHeadersStatus(paths.statusPath, headersStatusFailed) + log.Info("hypeman-init:headers", formatHeadersFailedSentinel(err)) + os.Exit(1) + } + + if err := writeKernelHeadersStatus(paths.statusPath, headersStatusReady); err != nil { + log.Info("hypeman-init:headers", "warning: failed to write status file: "+err.Error()) + } + log.Info("hypeman-init:headers", formatHeadersSentinel("READY")) + os.Exit(0) +} + +func lowerKernelHeadersWorkerPriority(log *Logger) error { + if err := syscall.Setpriority(syscall.PRIO_PROCESS, 0, 10); err != nil { + return err + } + + ionicePath, err := 
exec.LookPath("ionice") + if err != nil { + return nil + } + + cmd := exec.Command(ionicePath, "-c", "3", "-p", strconv.Itoa(os.Getpid())) + if output, err := cmd.CombinedOutput(); err != nil { + log.Info("hypeman-init:headers", fmt.Sprintf("warning: ionice best-effort failed: %v: %s", err, strings.TrimSpace(string(output)))) + } + return nil +} + +func writeKernelHeadersStatus(statusPath, status string) error { + if err := os.MkdirAll(filepath.Dir(statusPath), 0755); err != nil { + return err + } + return os.WriteFile(statusPath, []byte(status+"\n"), 0644) +} + +func formatHeadersSentinel(state string) string { + return fmt.Sprintf("HYPEMAN-HEADERS-%s ts=%s", state, time.Now().UTC().Format(time.RFC3339Nano)) +} + +func formatHeadersFailedSentinel(err error) string { + return fmt.Sprintf("HYPEMAN-HEADERS-FAILED ts=%s error=%q", time.Now().UTC().Format(time.RFC3339Nano), err.Error()) +} + // setupKernelHeaders installs kernel headers and cleans up mismatched headers from the guest image. // This enables DKMS to build out-of-tree kernel modules (e.g., NVIDIA vGPU drivers). 
-func setupKernelHeaders(log *Logger) error { +func setupKernelHeaders(log *Logger, paths kernelHeadersPaths) error { // Get running kernel version var uname syscall.Utsname if err := syscall.Uname(&uname); err != nil { @@ -27,27 +127,37 @@ func setupKernelHeaders(log *Logger) error { runningKernel := int8ArrayToString(uname.Release[:]) log.Info("hypeman-init:headers", "running kernel: "+runningKernel) - // Check if headers tarball exists in initrd - if _, err := os.Stat(headersTarball); os.IsNotExist(err) { - log.Info("hypeman-init:headers", "no kernel headers tarball found, skipping") + ready, err := kernelHeadersAlreadyInstalled(runningKernel, paths) + if err != nil { + return fmt.Errorf("check fast path: %w", err) + } + if ready { + log.Info("hypeman-init:headers", "kernel headers already installed, skipping extraction") return nil } + // Check if headers tarball exists in initrd + if _, err := os.Stat(paths.tarballPath); os.IsNotExist(err) { + return fmt.Errorf("kernel headers tarball not found at %s", paths.tarballPath) + } else if err != nil { + return fmt.Errorf("stat tarball: %w", err) + } + // Clean up mismatched kernel modules directories - if err := cleanupMismatchedModules(log, runningKernel); err != nil { + if err := cleanupMismatchedModules(log, runningKernel, paths.libModulesDir); err != nil { log.Info("hypeman-init:headers", "warning: failed to cleanup mismatched modules: "+err.Error()) // Non-fatal, continue } // Clean up mismatched kernel headers directories - if err := cleanupMismatchedHeaders(log, runningKernel); err != nil { + if err := cleanupMismatchedHeaders(log, runningKernel, paths.usrSrcDir); err != nil { log.Info("hypeman-init:headers", "warning: failed to cleanup mismatched headers: "+err.Error()) // Non-fatal, continue } // Create target directories - headersDir := filepath.Join(newrootUsrSrc, "linux-headers-"+runningKernel) - modulesDir := filepath.Join(newrootLibModules, runningKernel) + headersDir := filepath.Join(paths.usrSrcDir, 
"linux-headers-"+runningKernel) + modulesDir := filepath.Join(paths.libModulesDir, runningKernel) if err := os.MkdirAll(headersDir, 0755); err != nil { return fmt.Errorf("mkdir headers dir: %w", err) @@ -57,7 +167,7 @@ func setupKernelHeaders(log *Logger) error { } // Extract headers tarball - if err := extractTarGz(headersTarball, headersDir); err != nil { + if err := extractTarGz(paths.tarballPath, headersDir); err != nil { return fmt.Errorf("extract headers: %w", err) } log.Info("hypeman-init:headers", "extracted kernel headers to "+headersDir) @@ -75,9 +185,34 @@ func setupKernelHeaders(log *Logger) error { return nil } +func kernelHeadersAlreadyInstalled(runningKernel string, paths kernelHeadersPaths) (bool, error) { + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+runningKernel) + headersInfo, err := os.Stat(headersDir) + if os.IsNotExist(err) { + return false, nil + } + if err != nil { + return false, err + } + if !headersInfo.IsDir() { + return false, nil + } + + buildLink := filepath.Join(paths.libModulesDir, runningKernel, "build") + target, err := os.Readlink(buildLink) + if os.IsNotExist(err) { + return false, nil + } + if err != nil { + return false, err + } + + return target == "/usr/src/linux-headers-"+runningKernel, nil +} + // cleanupMismatchedModules removes /lib/modules/* directories that don't match the running kernel -func cleanupMismatchedModules(log *Logger, runningKernel string) error { - entries, err := os.ReadDir(newrootLibModules) +func cleanupMismatchedModules(log *Logger, runningKernel, modulesDir string) error { + entries, err := os.ReadDir(modulesDir) if err != nil { if os.IsNotExist(err) { return nil // No modules directory, nothing to clean @@ -90,7 +225,7 @@ func cleanupMismatchedModules(log *Logger, runningKernel string) error { continue } if entry.Name() != runningKernel { - path := filepath.Join(newrootLibModules, entry.Name()) + path := filepath.Join(modulesDir, entry.Name()) log.Info("hypeman-init:headers", 
"removing mismatched modules: "+entry.Name()) if err := os.RemoveAll(path); err != nil { return fmt.Errorf("remove %s: %w", path, err) @@ -102,8 +237,8 @@ func cleanupMismatchedModules(log *Logger, runningKernel string) error { } // cleanupMismatchedHeaders removes /usr/src/linux-headers-* directories that don't match the running kernel -func cleanupMismatchedHeaders(log *Logger, runningKernel string) error { - entries, err := os.ReadDir(newrootUsrSrc) +func cleanupMismatchedHeaders(log *Logger, runningKernel, usrSrcDir string) error { + entries, err := os.ReadDir(usrSrcDir) if err != nil { if os.IsNotExist(err) { return nil // No usr/src directory, nothing to clean @@ -119,7 +254,7 @@ func cleanupMismatchedHeaders(log *Logger, runningKernel string) error { } // Remove any linux-headers-* directory that doesn't match if strings.HasPrefix(entry.Name(), "linux-headers-") && entry.Name() != expectedName { - path := filepath.Join(newrootUsrSrc, entry.Name()) + path := filepath.Join(usrSrcDir, entry.Name()) log.Info("hypeman-init:headers", "removing mismatched headers: "+entry.Name()) if err := os.RemoveAll(path); err != nil { return fmt.Errorf("remove %s: %w", path, err) diff --git a/lib/system/init/headers_test.go b/lib/system/init/headers_test.go new file mode 100644 index 00000000..bd60e869 --- /dev/null +++ b/lib/system/init/headers_test.go @@ -0,0 +1,63 @@ +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestKernelHeadersAlreadyInstalled(t *testing.T) { + t.Parallel() + + root := t.TempDir() + kernel := "test-kernel" + paths := kernelHeadersPaths{ + libModulesDir: filepath.Join(root, "lib/modules"), + usrSrcDir: filepath.Join(root, "usr/src"), + } + + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+kernel) + modulesDir := filepath.Join(paths.libModulesDir, kernel) + require.NoError(t, os.MkdirAll(headersDir, 0755)) + require.NoError(t, 
os.MkdirAll(modulesDir, 0755)) + require.NoError(t, os.Symlink("/usr/src/linux-headers-"+kernel, filepath.Join(modulesDir, "build"))) + + ready, err := kernelHeadersAlreadyInstalled(kernel, paths) + require.NoError(t, err) + assert.True(t, ready) +} + +func TestKernelHeadersAlreadyInstalledSymlinkMismatch(t *testing.T) { + t.Parallel() + + root := t.TempDir() + kernel := "test-kernel" + paths := kernelHeadersPaths{ + libModulesDir: filepath.Join(root, "lib/modules"), + usrSrcDir: filepath.Join(root, "usr/src"), + } + + headersDir := filepath.Join(paths.usrSrcDir, "linux-headers-"+kernel) + modulesDir := filepath.Join(paths.libModulesDir, kernel) + require.NoError(t, os.MkdirAll(headersDir, 0755)) + require.NoError(t, os.MkdirAll(modulesDir, 0755)) + require.NoError(t, os.Symlink("/usr/src/linux-headers-other", filepath.Join(modulesDir, "build"))) + + ready, err := kernelHeadersAlreadyInstalled(kernel, paths) + require.NoError(t, err) + assert.False(t, ready) +} + +func TestWriteKernelHeadersStatus(t *testing.T) { + t.Parallel() + + statusPath := filepath.Join(t.TempDir(), "run/hypeman/kernel-headers.status") + require.NoError(t, writeKernelHeadersStatus(statusPath, headersStatusRunning)) + + data, err := os.ReadFile(statusPath) + require.NoError(t, err) + assert.Equal(t, "running\n", string(data)) +} diff --git a/lib/system/init/main.go b/lib/system/init/main.go index 6eaa10cf..4883c5ee 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -13,9 +13,21 @@ import ( "fmt" "os" "os/exec" + "sync" ) func main() { + if len(os.Args) > 1 { + switch os.Args[1] { + case headersWorkerArg: + runKernelHeadersWorker(NewLogger(), initrdKernelHeadersPaths) + return + case headersWorkerGuestArg: + runKernelHeadersWorker(NewLogger(), guestKernelHeadersPaths) + return + } + } + log := NewLogger() log.Info("hypeman-init:boot", "init starting") @@ -38,21 +50,31 @@ func main() { dropToShell() } - // Phase 4: Configure network (shared between modes) + // Phase 4/5: Run 
independent setup tasks in parallel. + // Keep strict dependencies around mount -> overlay -> config and + // bind-mount barrier before mode handoff. + var setupWG sync.WaitGroup if cfg.NetworkEnabled { - if err := configureNetwork(log, cfg); err != nil { - log.Error("hypeman-init:network", "failed to configure network", err) - // Continue anyway - network isn't always required - } + setupWG.Add(1) + go func() { + defer setupWG.Done() + if err := configureNetwork(log, cfg); err != nil { + log.Error("hypeman-init:network", "failed to configure network", err) + // Continue anyway - network isn't always required + } + }() } - - // Phase 5: Mount volumes if len(cfg.VolumeMounts) > 0 { - if err := mountVolumes(log, cfg); err != nil { - log.Error("hypeman-init:volumes", "failed to mount volumes", err) - // Continue anyway - } + setupWG.Add(1) + go func() { + defer setupWG.Done() + if err := mountVolumes(log, cfg); err != nil { + log.Error("hypeman-init:volumes", "failed to mount volumes", err) + // Continue anyway + } + }() } + setupWG.Wait() // Phase 6: Bind mount filesystems to new root if err := bindMountsToNewRoot(log); err != nil { @@ -66,14 +88,12 @@ func main() { // Continue anyway - exec will still work, just no remote access } - // Phase 8: Setup kernel headers for DKMS (can be skipped via config) + // Phase 8: Start async kernel headers setup for exec mode. + // In systemd mode, service injection is handled during runSystemdMode. 
if cfg.SkipKernelHeaders { log.Info("hypeman-init:headers", "skipping kernel headers setup (skip_kernel_headers=true)") - } else { - if err := setupKernelHeaders(log); err != nil { - log.Error("hypeman-init:headers", "failed to setup kernel headers", err) - // Continue anyway - only needed for DKMS module building - } + } else if cfg.InitMode == "exec" { + startKernelHeadersWorkerAsync(log) } // Phase 9: Mode-specific execution diff --git a/lib/system/init/mode_exec.go b/lib/system/init/mode_exec.go index dd253332..1e1792e8 100644 --- a/lib/system/init/mode_exec.go +++ b/lib/system/init/mode_exec.go @@ -13,6 +13,12 @@ import ( "github.com/kernel/hypeman/lib/vmconfig" ) +const ( + guestAgentReadyFilePath = "/run/hypeman/guest-agent-ready" + guestAgentReadyTimeout = 10 * time.Second + guestAgentReadyFDEnv = "HYPEMAN_AGENT_READY_FD" +) + // runExecMode runs the container in exec mode (default). // This is the Docker-like behavior where: // - The init binary remains PID 1 @@ -45,14 +51,48 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { if cfg.SkipGuestAgent { log.Info("hypeman-init:setup", "skipping guest-agent (skip_guest_agent=true)") } else { + // Clear stale readiness marker from previous runs. 
+ _ = os.Remove(guestAgentReadyFilePath) + + readyPipeReader, readyPipeWriter, err := os.Pipe() + if err != nil { + log.Error("hypeman-init:setup", "failed to create guest-agent readiness pipe", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) + } + log.Info("hypeman-init:setup", "starting guest-agent in background") agentCmd = exec.Command("/opt/hypeman/guest-agent") - agentCmd.Env = buildEnv(cfg.Env) + agentCmd.Env = append( + buildEnv(cfg.Env), + "HYPEMAN_AGENT_READY_FILE="+guestAgentReadyFilePath, + fmt.Sprintf("%s=%d", guestAgentReadyFDEnv, 3), + ) + agentCmd.ExtraFiles = []*os.File{readyPipeWriter} agentCmd.Stdout = os.Stdout agentCmd.Stderr = os.Stderr if err := agentCmd.Start(); err != nil { + _ = readyPipeReader.Close() + _ = readyPipeWriter.Close() log.Error("hypeman-init:setup", "failed to start guest-agent", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) } + _ = readyPipeWriter.Close() + + agentExited := make(chan error, 1) + go func() { + agentExited <- agentCmd.Wait() + }() + + // Strict startup gate: do not launch the guest program until agent is ready. + if err := waitForGuestAgentReady(readyPipeReader, guestAgentReadyTimeout, agentExited); err != nil { + _ = readyPipeReader.Close() + log.Error("hypeman-init:setup", "guest-agent readiness gate failed; not launching entrypoint", err) + syscall.Sync() + syscall.Reboot(syscall.LINUX_REBOOT_CMD_POWER_OFF) + } + _ = readyPipeReader.Close() } // Build the entrypoint command @@ -88,6 +128,8 @@ func runExecMode(log *Logger, cfg *vmconfig.Config) { dropToShell() } + // Program-start sentinel used by host state derivation. + log.Info("hypeman-init:entrypoint", formatProgramStartSentinel("exec")) log.Info("hypeman-init:entrypoint", fmt.Sprintf("container app started (PID %d)", appCmd.Process.Pid)) // Set up signal forwarding: when init receives a signal (e.g. 
from guest-agent @@ -174,6 +216,55 @@ func formatExitSentinel(code int, message string) string { return fmt.Sprintf("HYPEMAN-EXIT code=%d message=%q", code, message) } +func formatProgramStartSentinel(mode string) string { + return fmt.Sprintf("HYPEMAN-PROGRAM-START ts=%s mode=%s", time.Now().UTC().Format(time.RFC3339Nano), mode) +} + +func waitForGuestAgentReady(readyReader *os.File, timeout time.Duration, agentExited <-chan error) error { + readyErr := make(chan error, 1) + go func() { + var b [1]byte + _, err := readyReader.Read(b[:]) + readyErr <- err + }() + + agentExitCh := agentExited + agentExitObserved := false + var agentExitErr error + timer := time.NewTimer(timeout) + defer timer.Stop() + + for { + select { + case err := <-readyErr: + if err != nil { + if agentExitObserved { + if agentExitErr == nil { + return fmt.Errorf("guest-agent exited before readiness signal") + } + return fmt.Errorf("guest-agent exited before readiness signal: %w", agentExitErr) + } + return fmt.Errorf("failed waiting for guest-agent readiness signal: %w", err) + } + return nil + case err := <-agentExitCh: + agentExitErr = err + agentExitObserved = true + // Keep waiting for the readiness read to complete. If the agent wrote + // readiness and then exited, the read succeeds and startup proceeds. + agentExitCh = nil + case <-timer.C: + if agentExitObserved { + if agentExitErr == nil { + return fmt.Errorf("guest-agent exited before readiness signal") + } + return fmt.Errorf("guest-agent exited before readiness signal: %w", agentExitErr) + } + return fmt.Errorf("timed out after %s waiting for guest-agent readiness signal", timeout) + } + } +} + // checkOOMKill checks /dev/kmsg for recent OOM kill messages. // Returns true if an OOM kill was detected. // Uses a 1s timeout to avoid hanging if /dev/kmsg blocks at end of buffer. 
diff --git a/lib/system/init/mode_exec_test.go b/lib/system/init/mode_exec_test.go index 0255f22d..78da3939 100644 --- a/lib/system/init/mode_exec_test.go +++ b/lib/system/init/mode_exec_test.go @@ -1,7 +1,11 @@ package main import ( + "errors" + "os" + "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -154,3 +158,76 @@ func TestIsOOMLine(t *testing.T) { }) } } + +func TestFormatProgramStartSentinel(t *testing.T) { + result := formatProgramStartSentinel("exec") + assert.Contains(t, result, "HYPEMAN-PROGRAM-START") + assert.Contains(t, result, " mode=exec") + assert.Contains(t, result, " ts=") +} + +func TestWaitForGuestAgentReady(t *testing.T) { + t.Parallel() + + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + go func() { + time.Sleep(50 * time.Millisecond) + _, _ = readyWriter.Write([]byte{1}) + _ = readyWriter.Close() + }() + + err = waitForGuestAgentReady(readyReader, time.Second, nil) + require.NoError(t, err) +} + +func TestWaitForGuestAgentReadyTimeout(t *testing.T) { + t.Parallel() + + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + + err = waitForGuestAgentReady(readyReader, 100*time.Millisecond, nil) + require.Error(t, err) + assert.True(t, strings.Contains(err.Error(), "timed out"), "unexpected error: %v", err) + + _ = readyWriter.Close() + _ = readyReader.Close() +} + +func TestWaitForGuestAgentReadyProcessExit(t *testing.T) { + t.Parallel() + + readyReader, readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + agentExited := make(chan error, 1) + agentExited <- errors.New("exit status 1") + + err = waitForGuestAgentReady(readyReader, time.Second, agentExited) + require.Error(t, err) + assert.Contains(t, err.Error(), "exited before readiness signal") +} + +func TestWaitForGuestAgentReadyReadyWinsAfterExitSignal(t *testing.T) { + t.Parallel() + + readyReader, 
readyWriter, err := os.Pipe() + require.NoError(t, err) + defer readyReader.Close() + defer readyWriter.Close() + + agentExited := make(chan error, 1) + agentExited <- errors.New("exit status 1") + + go func() { + time.Sleep(25 * time.Millisecond) + _, _ = readyWriter.Write([]byte{1}) + }() + + err = waitForGuestAgentReady(readyReader, time.Second, agentExited) + require.NoError(t, err) +} diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index 50fc0bbd..eb1ffd61 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -3,6 +3,7 @@ package main import ( "fmt" "os" + "path/filepath" "syscall" "al.essio.dev/pkg/shellescape" @@ -29,6 +30,16 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { // Continue anyway - VM will work, just without agent } } + if cfg.SkipKernelHeaders { + log.Info("hypeman-init:systemd", "skipping kernel headers service injection (skip_kernel_headers=true)") + } else { + log.Info("hypeman-init:systemd", "injecting hypeman-kernel-headers.service") + if err := injectHeadersService(newroot); err != nil { + log.Error("hypeman-init:systemd", "failed to inject headers service", err) + _ = writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusFailed) + log.Info("hypeman-init:systemd", formatHeadersFailedSentinel(err)) + } + } // Change root to the new filesystem using chroot log.Info("hypeman-init:systemd", "executing chroot") @@ -52,6 +63,7 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { // Exec systemd - this replaces the current process log.Info("hypeman-init:systemd", fmt.Sprintf("exec %v", argv)) + log.Info("hypeman-init:systemd", formatProgramStartSentinel("systemd")) // syscall.Exec replaces the current process with the new one // Use buildEnv to include user's environment variables from the image/instance config @@ -68,8 +80,6 @@ func runSystemdMode(log *Logger, cfg *vmconfig.Config) { func injectAgentService(newroot string, env 
map[string]string) error { serviceContent := `[Unit] Description=Hypeman Guest Agent -After=network.target -Wants=network.target [Service] Type=simple @@ -77,8 +87,8 @@ ExecStart=/opt/hypeman/guest-agent EnvironmentFile=-/etc/hypeman/env Restart=always RestartSec=3 -StandardOutput=journal -StandardError=journal +StandardOutput=journal+console +StandardError=journal+console [Install] WantedBy=multi-user.target @@ -116,7 +126,10 @@ WantedBy=multi-user.target // Enable the service by creating a symlink in wants directory symlinkPath := wantsDir + "/hypeman-agent.service" // Use relative path for the symlink - return os.Symlink("../hypeman-agent.service", symlinkPath) + if err := os.Symlink("../hypeman-agent.service", symlinkPath); err != nil && !os.IsExist(err) { + return err + } + return nil } // buildEnvFileContent creates systemd environment file content from env map. @@ -141,3 +154,97 @@ func buildEnvFileContent(env map[string]string) string { return content } + +func injectHeadersService(newroot string) error { + if err := stageKernelHeadersAssetsForSystemd(newroot); err != nil { + return err + } + if err := writeKernelHeadersStatus(initrdKernelHeadersPaths.statusPath, headersStatusPending); err != nil { + return err + } + + serviceContent := `[Unit] +Description=Hypeman Kernel Headers Setup +After=local-fs.target +ConditionPathExists=/opt/hypeman/hypeman-init +ConditionPathExists=/opt/hypeman/kernel-headers.tar.gz + +[Service] +Type=oneshot +ExecStart=/opt/hypeman/hypeman-init --headers-worker-guest +Nice=10 +IOSchedulingClass=best-effort +IOSchedulingPriority=7 +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target +` + + serviceDir := filepath.Join(newroot, "etc/systemd/system") + wantsDir := filepath.Join(serviceDir, "multi-user.target.wants") + if err := os.MkdirAll(serviceDir, 0755); err != nil { + return err + } + if err := os.MkdirAll(wantsDir, 0755); err != nil { + return err + } + + servicePath := 
filepath.Join(serviceDir, "hypeman-kernel-headers.service") + if err := os.WriteFile(servicePath, []byte(serviceContent), 0644); err != nil { + return err + } + + symlinkPath := filepath.Join(wantsDir, "hypeman-kernel-headers.service") + if err := os.Symlink("../hypeman-kernel-headers.service", symlinkPath); err != nil && !os.IsExist(err) { + return err + } + return nil +} + +func stageKernelHeadersAssetsForSystemd(newroot string) error { + const ( + dstDir = "opt/hypeman" + dstInitBinRel = "opt/hypeman/hypeman-init" + dstTarballRel = "opt/hypeman/kernel-headers.tar.gz" + sourceTarball = "/kernel-headers.tar.gz" + ) + + srcInitBin, err := os.Executable() + if err != nil { + return fmt.Errorf("resolve init binary path: %w", err) + } + + guestDstDir := filepath.Join(newroot, dstDir) + if err := os.MkdirAll(guestDstDir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", guestDstDir, err) + } + + initData, err := os.ReadFile(srcInitBin) + if err != nil { + return fmt.Errorf("read init binary: %w", err) + } + if err := os.WriteFile(filepath.Join(newroot, dstInitBinRel), initData, 0755); err != nil { + return fmt.Errorf("write init binary: %w", err) + } + + if _, err := os.Stat(sourceTarball); err != nil { + return fmt.Errorf("stat headers tarball: %w", err) + } + + targetTarball := filepath.Join(newroot, dstTarballRel) + if _, err := os.Stat(targetTarball); os.IsNotExist(err) { + if err := os.WriteFile(targetTarball, []byte{}, 0644); err != nil { + return fmt.Errorf("create target tarball file: %w", err) + } + } else if err != nil { + return fmt.Errorf("stat target tarball file: %w", err) + } + + if err := bindMount(sourceTarball, targetTarball); err != nil { + return fmt.Errorf("bind mount headers tarball: %w", err) + } + + return nil +} diff --git a/lib/system/init/mode_systemd_test.go b/lib/system/init/mode_systemd_test.go index 2f560f34..725a4082 100644 --- a/lib/system/init/mode_systemd_test.go +++ b/lib/system/init/mode_systemd_test.go @@ -1,6 +1,8 @@ package 
main import ( + "os" + "path/filepath" "testing" "al.essio.dev/pkg/shellescape" @@ -175,3 +177,16 @@ func TestShellescape(t *testing.T) { }) } } + +func TestInjectAgentServiceOmitsNetworkTargetDependency(t *testing.T) { + t.Parallel() + + newroot := t.TempDir() + err := injectAgentService(newroot, map[string]string{}) + assert.NoError(t, err) + + servicePath := filepath.Join(newroot, "etc/systemd/system/hypeman-agent.service") + data, err := os.ReadFile(servicePath) + assert.NoError(t, err) + assert.NotContains(t, string(data), "network.target") +} diff --git a/lib/vmm/client.go b/lib/vmm/client.go index 2c8dc26e..b4697955 100644 --- a/lib/vmm/client.go +++ b/lib/vmm/client.go @@ -16,6 +16,8 @@ import ( "go.opentelemetry.io/otel/metric" ) +const cloudHypervisorSocketReadyTimeout = 10 * time.Second + // VMM wraps the generated Cloud Hypervisor client (API v0.3.0) type VMM struct { *ClientWithResponses @@ -147,11 +149,12 @@ func StartProcessWithArgs(ctx context.Context, p *paths.Paths, version CHVersion pid := cmd.Process.Pid - // Wait for socket to be ready (use fresh context with timeout, not parent context) - waitCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + // Wait for socket to be ready (use fresh context with timeout, not parent context). + // CI can be heavily loaded; a larger budget avoids transient CH boot races. 
+ waitCtx, cancel := context.WithTimeout(context.Background(), cloudHypervisorSocketReadyTimeout) defer cancel() - if err := waitForSocket(waitCtx, socketPath, 5*time.Second); err != nil { + if err := waitForSocket(waitCtx, socketPath, cloudHypervisorSocketReadyTimeout); err != nil { // Read vmm.log to understand why socket wasn't created vmmLogPath := filepath.Join(logsDir, "vmm.log") if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { diff --git a/openapi.yaml b/openapi.yaml index 4fceba4a..cc61fc8f 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -45,11 +45,12 @@ components: InstanceState: type: string - enum: [Created, Running, Paused, Shutdown, Stopped, Standby, Unknown] + enum: [Created, Initializing, Running, Paused, Shutdown, Stopped, Standby, Unknown] description: | Instance state: - Created: VMM created but not started (Cloud Hypervisor native) - - Running: VM is actively running (Cloud Hypervisor native) + - Initializing: VM is running while guest init is still in progress + - Running: Guest program has started and instance is ready - Paused: VM is paused (Cloud Hypervisor native) - Shutdown: VM shut down but VMM exists (Cloud Hypervisor native) - Stopped: No VMM running, no snapshot exists diff --git a/skills/optimize-initializing-speed/SKILL.md b/skills/optimize-initializing-speed/SKILL.md new file mode 100644 index 00000000..244b25f0 --- /dev/null +++ b/skills/optimize-initializing-speed/SKILL.md @@ -0,0 +1,30 @@ +--- +name: optimize-initializing-speed +description: Use when optimizing VM Initializing-to-Running latency while preserving functionality and low implementation complexity. +--- + +# Optimize Initializing Speed + +## Goal +Minimize `Create/Start -> Running` latency without removing functionality. Base your decisions on what to optimize by using real measurements. + +## Priority Levers +1. Keep `Running` gated only on `program-start` + `agent-ready` markers. +2. 
Replace readiness polling with event-driven signaling. +3. Move heavy non-critical setup (kernel headers) off the critical path (ask permission from user if moving logic to async / could be still processing after Running is set). +4. Add fast-path checks (skip work when already installed/valid). +5. Parallelize independent init stages with simple barriers (no DAG engine). Avoid parallel tasks that are likely to conflict. + +## Guardrails +- Keep guest-agent gate strict unless `skip_guest_agent=true`. +- Preserve lifecycle semantics and blocked/allowed operations in `Initializing`. + +## Measurement Protocol +1. Measure baseline and candidate on the same host with the same 5-run harness. +2. Report per-run samples + median/mean/min/max. +3. Validate full regression suite before merge. + +## Required Outputs +- Exact before/after latency numbers. +- Short breakdown of biggest contributors. +- Risk notes, if any.