Skip to content

Commit 3ee0c14

Browse files
authored
Add Initializing state and optimize speed to init (#136)
* instances: add Initializing state and gate Running on agent/program readiness
* tests: fix lifecycle flakes for Initializing transition
* instances: add Initializing state and gate Running on agent/program readiness
* skills: add concise initializing-speed optimization playbook
* Update skill
* instances: throttle boot-marker log scans during state derivation
* tests: increase manager lifecycle running wait to 20s
* instances: clear boot markers on stop/restore and tighten transitions
* instances: allow fork target Running while still Initializing
* instances: preserve boot markers across standby restore
* tests: use manager network init in lifecycle integration tests
* network: harden default-network self-heal on allocation
* ci: install libjpeg8 for qemu tests on linux runners
* ci: support runner-specific jpeg packages for qemu
* instances: harden boot marker hydration and network self-heal
* ci: make linux libjpeg check compatible with runner variants
* instances: avoid duplicate boot-marker persistence on fallback lookups
* ci: install distro qemu and verify runtime directly
* ci: reinstall qemu when runtime check fails
* tests: skip qemu integration when runtime is unavailable
* instances: avoid stale boot markers without guest clock filtering
* instances: fix marker parsing and readiness edge cases
* init: keep readiness timeout behavior when no exit channel
1 parent 2e16427 commit 3ee0c14

42 files changed

Lines changed: 1873 additions & 424 deletions

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/test.yml

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,37 @@ jobs:
3131
- name: Install dependencies
3232
run: |
3333
set -xe
34+
apt_update_with_retry() {
35+
local attempts=5
36+
local sleep_seconds=30
37+
local n=1
38+
while [ "$n" -le "$attempts" ]; do
39+
if sudo apt-get update; then
40+
return 0
41+
fi
42+
if [ "$n" -eq "$attempts" ]; then
43+
return 1
44+
fi
45+
echo "apt-get update failed (attempt ${n}/${attempts}); retrying in ${sleep_seconds}s..."
46+
sleep "$sleep_seconds"
47+
n=$((n + 1))
48+
done
49+
}
3450
if ! command -v mkfs.erofs &> /dev/null || \
3551
! command -v mkfs.ext4 &> /dev/null || \
36-
! command -v iptables &> /dev/null; then
37-
sudo apt-get update
38-
sudo apt-get install -y erofs-utils e2fsprogs iptables
52+
! command -v iptables &> /dev/null || \
53+
! command -v qemu-system-x86_64 &> /dev/null || \
54+
! qemu-system-x86_64 --version >/dev/null 2>&1; then
55+
apt_update_with_retry
56+
sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils
3957
fi
4058
go mod download
4159
4260
- name: Verify Linux test toolchain
4361
run: |
4462
set -euo pipefail
4563
TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
46-
for bin in mkfs.erofs mkfs.ext4 iptables; do
64+
for bin in mkfs.erofs mkfs.ext4 iptables qemu-system-x86_64; do
4765
if ! sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin' >/dev/null"; then
4866
echo "missing required binary under sudo PATH: $bin"
4967
exit 1

cmd/api/api/instances_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ func TestInstanceLifecycle_StopStart(t *testing.T) {
419419

420420
// 1. Create instance
421421
t.Log("Creating instance...")
422-
networkEnabled := true
422+
networkEnabled := false
423423
createResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
424424
Body: &oapi.CreateInstanceRequest{
425425
Name: "test-lifecycle",

cmd/api/api/registry_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,12 @@ func TestRegistryPushAndCreateInstance(t *testing.T) {
135135
// Create instance with pushed image
136136
t.Log("Creating instance with pushed image...")
137137
networkEnabled := false
138+
cmd := []string{"sleep", "infinity"}
138139
resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
139140
Body: &oapi.CreateInstanceRequest{
140141
Name: "test-pushed-image",
141142
Image: imageName,
143+
Cmd: &cmd,
142144
Network: &struct {
143145
BandwidthDownload *string `json:"bandwidth_download,omitempty"`
144146
BandwidthUpload *string `json:"bandwidth_upload,omitempty"`

cmd/api/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ func run() error {
196196
// Initialize to empty slice (not nil) so cleanup runs even with no running VMs
197197
preserveTAPs = []string{}
198198
for _, inst := range allInstances {
199-
if inst.State == instances.StateRunning || inst.State == instances.StateUnknown {
199+
if inst.State == instances.StateRunning || inst.State == instances.StateInitializing || inst.State == instances.StateUnknown {
200200
preserveTAPs = append(preserveTAPs, inst.Id)
201201
}
202202
}

lib/instances/README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM
1616
**States:**
1717
- `Stopped` - No VMM, no snapshot
1818
- `Created` - VMM created but not booted (CH native)
19-
- `Running` - VM actively running (CH native)
19+
- `Initializing` - VM is running while guest init is still in progress
20+
- `Running` - Guest program start boundary reached and guest-agent readiness observed (unless `skip_guest_agent=true`)
2021
- `Paused` - VM paused (CH native)
2122
- `Shutdown` - VM shutdown, VMM exists (CH native)
2223
- `Standby` - No VMM, snapshot exists (can restore)
@@ -63,11 +64,14 @@ Manager orchestrates multiple single-hop state transitions:
6364

6465
**CreateInstance:**
6566
```
66-
Stopped → Created → Running
67+
Stopped → Created → Initializing → Running
6768
1. Start VMM process
6869
2. Create VM config
6970
3. Boot VM
70-
4. Expand memory (if hotplug configured)
71+
4. Wait for guest-agent readiness gate (event-driven, exec mode, unless skipped)
72+
5. Guest program start marker observed
73+
6. Kernel headers setup continues asynchronously (does not gate `Running`)
74+
7. Expand memory (if hotplug configured)
7175
```
7276

7377
**StandbyInstance:**
@@ -134,4 +138,3 @@ TestStorageOperations - metadata persistence, directory cleanup
134138
- `lib/system` - System manager for kernel/initrd files
135139
- `lib/hypervisor` - Hypervisor abstraction for VM operations
136140
- System tools: `mkfs.erofs`, `cpio`, `gzip` (Linux); `mkfs.ext4` (macOS)
137-

lib/instances/create.go

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ func (m *manager) createInstance(
299299
CreatedAt: time.Now(),
300300
StartedAt: nil,
301301
StoppedAt: nil,
302+
ProgramStartedAt: nil,
303+
GuestAgentReadyAt: nil,
302304
KernelVersion: string(kernelVer),
303305
HypervisorType: hvType,
304306
HypervisorVersion: hvVersion,
@@ -406,25 +408,30 @@ func (m *manager) createInstance(
406408
return nil, fmt.Errorf("create config disk: %w", err)
407409
}
408410

409-
// 17. Save metadata
411+
// 17. Record boot start time before launching the VM so marker hydration
412+
// can safely ignore stale sentinels from prior runs.
413+
if err := m.archiveAppLogForBoot(id); err != nil {
414+
log.WarnContext(ctx, "failed to archive app log before create boot", "instance_id", id, "error", err)
415+
}
416+
bootStart := time.Now().UTC()
417+
stored.StartedAt = &bootStart
418+
419+
// 18. Save metadata
410420
log.DebugContext(ctx, "saving instance metadata", "instance_id", id)
411421
meta := &metadata{StoredMetadata: *stored}
412422
if err := m.saveMetadata(meta); err != nil {
413423
log.ErrorContext(ctx, "failed to save metadata", "instance_id", id, "error", err)
414424
return nil, fmt.Errorf("save metadata: %w", err)
415425
}
416426

417-
// 18. Start VMM and boot VM
427+
// 19. Start VMM and boot VM
418428
log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id)
419429
if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil {
420430
log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err)
421431
return nil, err
422432
}
423433

424-
// 19. Update timestamp after VM is running
425-
now := time.Now()
426-
stored.StartedAt = &now
427-
434+
// 20. Persist runtime metadata updates after VM boot.
428435
meta = &metadata{StoredMetadata: *stored}
429436
if err := m.saveMetadata(meta); err != nil {
430437
// VM is running but metadata failed - log but don't fail
@@ -435,14 +442,18 @@ func (m *manager) createInstance(
435442
// Success - release cleanup stack (prevent cleanup)
436443
cu.Release()
437444

445+
// Return instance with derived state
446+
finalInst := m.toInstance(ctx, meta)
447+
if finalInst.BootMarkersHydrated {
448+
if err := m.saveMetadata(meta); err != nil {
449+
log.WarnContext(ctx, "failed to persist hydrated boot markers after create", "instance_id", id, "error", err)
450+
}
451+
}
438452
// Record metrics
439453
if m.metrics != nil {
440454
m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType)
441-
m.recordStateTransition(ctx, "stopped", string(StateRunning), hvType)
455+
m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), hvType)
442456
}
443-
444-
// Return instance with derived state
445-
finalInst := m.toInstance(ctx, meta)
446457
log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State, "hypervisor", hvType)
447458
return &finalInst, nil
448459
}

lib/instances/delete.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ func (m *manager) deleteInstance(
5050
guest.CloseConn(dialer.Key())
5151
}
5252

53-
// 4. If running, try graceful guest shutdown before force kill.
53+
// 4. If active, try graceful guest shutdown before force kill.
5454
gracefulShutdown := false
55-
if inst.State == StateRunning {
55+
if inst.State == StateRunning || inst.State == StateInitializing {
5656
stopTimeout := resolveStopTimeout(stored)
5757
if stopTimeout > deleteGracefulShutdownTimeout {
5858
stopTimeout = deleteGracefulShutdownTimeout

lib/instances/exec_test.go

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,60 @@ import (
2121
// waitForExecAgent polls until exec-agent is ready
2222
func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, timeout time.Duration) error {
2323
deadline := time.Now().Add(timeout)
24+
lastState := StateUnknown
25+
var lastErr error
26+
2427
for time.Now().Before(deadline) {
28+
inst, err := mgr.GetInstance(ctx, instanceID)
29+
if err != nil {
30+
lastErr = err
31+
time.Sleep(500 * time.Millisecond)
32+
continue
33+
}
34+
35+
lastState = inst.State
36+
if inst.State != StateRunning {
37+
time.Sleep(500 * time.Millisecond)
38+
continue
39+
}
40+
2541
meta, err := mgr.loadMetadata(instanceID)
26-
if err == nil {
27-
dialer, derr := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID)
28-
if derr == nil {
29-
var stdout, stderr bytes.Buffer
30-
exit, eerr := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
31-
Command: []string{"true"},
32-
Stdout: &stdout,
33-
Stderr: &stderr,
34-
WaitForAgent: 1 * time.Second,
35-
})
36-
if eerr == nil && exit.Code == 0 {
37-
return nil
38-
}
39-
}
42+
if err != nil {
43+
lastErr = err
44+
time.Sleep(500 * time.Millisecond)
45+
continue
4046
}
47+
48+
dialer, err := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID)
49+
if err != nil {
50+
lastErr = err
51+
time.Sleep(500 * time.Millisecond)
52+
continue
53+
}
54+
55+
var stdout, stderr bytes.Buffer
56+
exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
57+
Command: []string{"true"},
58+
Stdout: &stdout,
59+
Stderr: &stderr,
60+
WaitForAgent: 1 * time.Second,
61+
})
62+
if err == nil && exit.Code == 0 {
63+
return nil
64+
}
65+
if err != nil {
66+
lastErr = err
67+
} else {
68+
lastErr = fmt.Errorf("unexpected exit code: %d", exit.Code)
69+
}
70+
4171
time.Sleep(500 * time.Millisecond)
4272
}
43-
return context.DeadlineExceeded
73+
74+
if lastErr != nil {
75+
return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s): %w", instanceID, timeout, lastState, lastErr)
76+
}
77+
return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s)", instanceID, timeout, lastState)
4478
}
4579

4680
// Note: execCommand is defined in network_test.go

lib/instances/firecracker_test.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
140140
Hypervisor: hypervisor.TypeFirecracker,
141141
})
142142
require.NoError(t, err)
143-
assert.Equal(t, StateRunning, inst.State)
143+
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
144+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
145+
require.NoError(t, err)
144146

145147
inst, err = mgr.StandbyInstance(ctx, inst.Id)
146148
require.NoError(t, err)
@@ -149,6 +151,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
149151

150152
inst, err = mgr.RestoreInstance(ctx, inst.Id)
151153
require.NoError(t, err)
154+
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
155+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
156+
require.NoError(t, err)
152157
assert.Equal(t, StateRunning, inst.State)
153158

154159
inst, err = mgr.StopInstance(ctx, inst.Id)
@@ -159,6 +164,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
159164
// Verify stopped -> start works after standby/restore lifecycle.
160165
inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{})
161166
require.NoError(t, err)
167+
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
168+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
169+
require.NoError(t, err)
162170
assert.Equal(t, StateRunning, inst.State)
163171

164172
require.NoError(t, mgr.DeleteInstance(ctx, inst.Id))
@@ -189,6 +197,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {
189197
Hypervisor: hypervisor.TypeFirecracker,
190198
})
191199
require.NoError(t, err)
200+
require.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
201+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
202+
require.NoError(t, err)
192203
require.Equal(t, StateRunning, inst.State)
193204

194205
// Establish a realistic standby/restore lifecycle first.
@@ -199,6 +210,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {
199210

200211
inst, err = mgr.RestoreInstance(ctx, inst.Id)
201212
require.NoError(t, err)
213+
require.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
214+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
215+
require.NoError(t, err)
202216
require.Equal(t, StateRunning, inst.State)
203217

204218
// Simulate stale snapshot residue from a prior failure/interruption.
@@ -222,6 +236,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {
222236

223237
inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{})
224238
require.NoError(t, err)
239+
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
240+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
241+
require.NoError(t, err)
225242
assert.Equal(t, StateRunning, inst.State)
226243

227244
require.NoError(t, mgr.DeleteInstance(ctx, inst.Id))
@@ -257,6 +274,8 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) {
257274
})
258275
require.NoError(t, err)
259276
require.NotNil(t, inst)
277+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
278+
require.NoError(t, err)
260279

261280
alloc, err := mgr.networkManager.GetAllocation(ctx, inst.Id)
262281
require.NoError(t, err)
@@ -311,6 +330,9 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) {
311330

312331
inst, err = mgr.RestoreInstance(ctx, inst.Id)
313332
require.NoError(t, err)
333+
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
334+
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
335+
require.NoError(t, err)
314336
assert.Equal(t, StateRunning, inst.State)
315337

316338
allocRestored, err := mgr.networkManager.GetAllocation(ctx, inst.Id)
@@ -376,6 +398,8 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {
376398
Hypervisor: hypervisor.TypeFirecracker,
377399
})
378400
require.NoError(t, err)
401+
source, err = waitForInstanceState(ctx, mgr, source.Id, StateRunning, 20*time.Second)
402+
require.NoError(t, err)
379403
sourceID := source.Id
380404
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) })
381405
assert.NotEmpty(t, source.IP)
@@ -391,6 +415,9 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {
391415
TargetState: StateRunning,
392416
})
393417
require.NoError(t, err)
418+
require.Contains(t, []State{StateInitializing, StateRunning}, forked.State)
419+
forked, err = waitForInstanceState(ctx, mgr, forked.Id, StateRunning, 20*time.Second)
420+
require.NoError(t, err)
394421
require.Equal(t, StateRunning, forked.State)
395422
forkID := forked.Id
396423
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) })
@@ -404,6 +431,10 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {
404431

405432
sourceAfterFork, err := mgr.GetInstance(ctx, sourceID)
406433
require.NoError(t, err)
434+
if sourceAfterFork.State != StateRunning {
435+
sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second)
436+
require.NoError(t, err)
437+
}
407438
require.Equal(t, StateRunning, sourceAfterFork.State)
408439
assert.NotEmpty(t, sourceAfterFork.IP)
409440
assert.NotEmpty(t, sourceAfterFork.MAC)

lib/instances/fork.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe
408408
if err != nil {
409409
return nil, err
410410
}
411-
if current.State == target {
411+
if current.State == target || (target == StateRunning && current.State == StateInitializing) {
412412
return returnWithReadiness(current, nil)
413413
}
414414

@@ -497,6 +497,14 @@ func cloneStoredMetadataForFork(src StoredMetadata) StoredMetadata {
497497
stoppedAt := *src.StoppedAt
498498
dst.StoppedAt = &stoppedAt
499499
}
500+
if src.ProgramStartedAt != nil {
501+
programStartedAt := *src.ProgramStartedAt
502+
dst.ProgramStartedAt = &programStartedAt
503+
}
504+
if src.GuestAgentReadyAt != nil {
505+
guestAgentReadyAt := *src.GuestAgentReadyAt
506+
dst.GuestAgentReadyAt = &guestAgentReadyAt
507+
}
500508
if src.ExitCode != nil {
501509
exitCode := *src.ExitCode
502510
dst.ExitCode = &exitCode

0 commit comments

Comments
 (0)