Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5595c78
instances: add Initializing state and gate Running on agent/program r…
sjmiller609 Mar 8, 2026
d973033
tests: fix lifecycle flakes for Initializing transition
sjmiller609 Mar 8, 2026
423399b
instances: add Initializing state and gate Running on agent/program r…
sjmiller609 Mar 9, 2026
ffb4e43
skills: add concise initializing-speed optimization playbook
sjmiller609 Mar 9, 2026
fffe995
Update skill
sjmiller609 Mar 9, 2026
9bf231f
instances: throttle boot-marker log scans during state derivation
sjmiller609 Mar 9, 2026
ecfd653
tests: increase manager lifecycle running wait to 20s
sjmiller609 Mar 9, 2026
56cec52
instances: clear boot markers on stop/restore and tighten transitions
sjmiller609 Mar 9, 2026
37f30ad
instances: allow fork target Running while still Initializing
sjmiller609 Mar 9, 2026
3d9c470
instances: preserve boot markers across standby restore
sjmiller609 Mar 9, 2026
aa78f4f
tests: use manager network init in lifecycle integration tests
sjmiller609 Mar 9, 2026
b2c8e2c
network: harden default-network self-heal on allocation
sjmiller609 Mar 9, 2026
e51f91b
ci: install libjpeg8 for qemu tests on linux runners
sjmiller609 Mar 9, 2026
6b8b8a5
ci: support runner-specific jpeg packages for qemu
sjmiller609 Mar 9, 2026
1234c12
instances: harden boot marker hydration and network self-heal
sjmiller609 Mar 9, 2026
bad661d
ci: make linux libjpeg check compatible with runner variants
sjmiller609 Mar 9, 2026
032878c
instances: avoid duplicate boot-marker persistence on fallback lookups
sjmiller609 Mar 9, 2026
695abf1
ci: install distro qemu and verify runtime directly
sjmiller609 Mar 9, 2026
eedd2f9
ci: reinstall qemu when runtime check fails
sjmiller609 Mar 9, 2026
2148d16
tests: skip qemu integration when runtime is unavailable
sjmiller609 Mar 9, 2026
655c284
instances: avoid stale boot markers without guest clock filtering
sjmiller609 Mar 9, 2026
cf6018f
instances: fix marker parsing and readiness edge cases
sjmiller609 Mar 9, 2026
a449e2c
init: keep readiness timeout behavior when no exit channel
sjmiller609 Mar 9, 2026
9462250
Merge remote-tracking branch 'origin/main' into codex/max-speed-initi…
sjmiller609 Mar 10, 2026
777997a
ci: retry apt update in linux dependency install
sjmiller609 Mar 10, 2026
669114f
instances: persist hydrated boot markers on lifecycle return
sjmiller609 Mar 10, 2026
e34ecd0
tests: wait for Running before standby in network lifecycle test
sjmiller609 Mar 10, 2026
7e8436b
tests: harden network lifecycle restore against startup races
sjmiller609 Mar 10, 2026
3e16495
instances: preserve boot start timestamp across restore
sjmiller609 Mar 10, 2026
1c99011
network: retry interrupted bridge address dumps
sjmiller609 Mar 10, 2026
2d4b7c9
instances: harden boot marker log scanning
sjmiller609 Mar 10, 2026
68fec3b
tests: gate exec-agent readiness checks on Running state
sjmiller609 Mar 10, 2026
192f095
instances: fix create flow step numbering comments
sjmiller609 Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,37 @@ jobs:
- name: Install dependencies
run: |
set -xe
# Retry `apt-get update` to ride out transient mirror/network failures on CI
# runners. Returns 0 on the first success; returns 1 only after the final
# attempt fails. Written with plain `if` blocks (not `&&` lists) so the
# surrounding `set -e` cannot abort the script mid-retry.
apt_update_with_retry() {
  local attempts=5
  local sleep_seconds=30
  local n
  for n in $(seq 1 "$attempts"); do
    if sudo apt-get update; then
      return 0
    fi
    if [ "$n" -eq "$attempts" ]; then
      # Exhausted every attempt; let the caller decide how to fail.
      return 1
    fi
    echo "apt-get update failed (attempt ${n}/${attempts}); retrying in ${sleep_seconds}s..."
    sleep "$sleep_seconds"
  done
}
if ! command -v mkfs.erofs &> /dev/null || \
! command -v mkfs.ext4 &> /dev/null || \
! command -v iptables &> /dev/null; then
sudo apt-get update
sudo apt-get install -y erofs-utils e2fsprogs iptables
! command -v iptables &> /dev/null || \
! command -v qemu-system-x86_64 &> /dev/null || \
! qemu-system-x86_64 --version >/dev/null 2>&1; then
apt_update_with_retry
sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils
fi
go mod download

- name: Verify Linux test toolchain
run: |
set -euo pipefail
TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
for bin in mkfs.erofs mkfs.ext4 iptables; do
for bin in mkfs.erofs mkfs.ext4 iptables qemu-system-x86_64; do
if ! sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin' >/dev/null"; then
echo "missing required binary under sudo PATH: $bin"
exit 1
Expand Down
2 changes: 1 addition & 1 deletion cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ func TestInstanceLifecycle_StopStart(t *testing.T) {

// 1. Create instance
t.Log("Creating instance...")
networkEnabled := true
networkEnabled := false
createResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
Body: &oapi.CreateInstanceRequest{
Name: "test-lifecycle",
Expand Down
2 changes: 2 additions & 0 deletions cmd/api/api/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,12 @@ func TestRegistryPushAndCreateInstance(t *testing.T) {
// Create instance with pushed image
t.Log("Creating instance with pushed image...")
networkEnabled := false
cmd := []string{"sleep", "infinity"}
resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
Body: &oapi.CreateInstanceRequest{
Name: "test-pushed-image",
Image: imageName,
Cmd: &cmd,
Network: &struct {
BandwidthDownload *string `json:"bandwidth_download,omitempty"`
BandwidthUpload *string `json:"bandwidth_upload,omitempty"`
Expand Down
2 changes: 1 addition & 1 deletion cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ func run() error {
// Initialize to empty slice (not nil) so cleanup runs even with no running VMs
preserveTAPs = []string{}
for _, inst := range allInstances {
if inst.State == instances.StateRunning || inst.State == instances.StateUnknown {
if inst.State == instances.StateRunning || inst.State == instances.StateInitializing || inst.State == instances.StateUnknown {
preserveTAPs = append(preserveTAPs, inst.Id)
}
}
Expand Down
11 changes: 7 additions & 4 deletions lib/instances/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM
**States:**
- `Stopped` - No VMM, no snapshot
- `Created` - VMM created but not booted (CH native)
- `Running` - VM actively running (CH native)
- `Initializing` - VM has booted and is running, but guest initialization has not yet completed
- `Running` - Guest program start boundary reached and guest-agent readiness observed (unless `skip_guest_agent=true`)
- `Paused` - VM paused (CH native)
- `Shutdown` - VM shutdown, VMM exists (CH native)
- `Standby` - No VMM, snapshot exists (can restore)
Expand Down Expand Up @@ -63,11 +64,14 @@ Manager orchestrates multiple single-hop state transitions:

**CreateInstance:**
```
Stopped → Created → Running
Stopped → Created → Initializing → Running
1. Start VMM process
2. Create VM config
3. Boot VM
4. Expand memory (if hotplug configured)
4. Wait for guest-agent readiness gate (event-driven, exec mode, unless skipped)
5. Guest program start marker observed
6. Kernel headers setup continues asynchronously (does not gate `Running`)
7. Expand memory (if hotplug configured)
```

**StandbyInstance:**
Expand Down Expand Up @@ -134,4 +138,3 @@ TestStorageOperations - metadata persistence, directory cleanup
- `lib/system` - System manager for kernel/initrd files
- `lib/hypervisor` - Hypervisor abstraction for VM operations
- System tools: `mkfs.erofs`, `cpio`, `gzip` (Linux); `mkfs.ext4` (macOS)

31 changes: 21 additions & 10 deletions lib/instances/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,8 @@ func (m *manager) createInstance(
CreatedAt: time.Now(),
StartedAt: nil,
StoppedAt: nil,
ProgramStartedAt: nil,
GuestAgentReadyAt: nil,
KernelVersion: string(kernelVer),
HypervisorType: hvType,
HypervisorVersion: hvVersion,
Expand Down Expand Up @@ -406,25 +408,30 @@ func (m *manager) createInstance(
return nil, fmt.Errorf("create config disk: %w", err)
}

// 17. Save metadata
// 17. Record boot start time before launching the VM so marker hydration
// can safely ignore stale sentinels from prior runs.
if err := m.archiveAppLogForBoot(id); err != nil {
log.WarnContext(ctx, "failed to archive app log before create boot", "instance_id", id, "error", err)
}
bootStart := time.Now().UTC()
stored.StartedAt = &bootStart

// 18. Save metadata
log.DebugContext(ctx, "saving instance metadata", "instance_id", id)
meta := &metadata{StoredMetadata: *stored}
if err := m.saveMetadata(meta); err != nil {
log.ErrorContext(ctx, "failed to save metadata", "instance_id", id, "error", err)
return nil, fmt.Errorf("save metadata: %w", err)
}

// 18. Start VMM and boot VM
// 19. Start VMM and boot VM
log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id)
if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil {
log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err)
return nil, err
}

// 19. Update timestamp after VM is running
now := time.Now()
stored.StartedAt = &now

// 20. Persist runtime metadata updates after VM boot.
meta = &metadata{StoredMetadata: *stored}
if err := m.saveMetadata(meta); err != nil {
// VM is running but metadata failed - log but don't fail
Expand All @@ -435,14 +442,18 @@ func (m *manager) createInstance(
// Success - release cleanup stack (prevent cleanup)
cu.Release()

// Return instance with derived state
finalInst := m.toInstance(ctx, meta)
if finalInst.BootMarkersHydrated {
if err := m.saveMetadata(meta); err != nil {
log.WarnContext(ctx, "failed to persist hydrated boot markers after create", "instance_id", id, "error", err)
}
}
// Record metrics
if m.metrics != nil {
m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType)
m.recordStateTransition(ctx, "stopped", string(StateRunning), hvType)
m.recordStateTransition(ctx, string(StateStopped), string(finalInst.State), hvType)
}

// Return instance with derived state
finalInst := m.toInstance(ctx, meta)
log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State, "hypervisor", hvType)
return &finalInst, nil
}
Expand Down
4 changes: 2 additions & 2 deletions lib/instances/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ func (m *manager) deleteInstance(
guest.CloseConn(dialer.Key())
}

// 4. If running, try graceful guest shutdown before force kill.
// 4. If active, try graceful guest shutdown before force kill.
gracefulShutdown := false
if inst.State == StateRunning {
if inst.State == StateRunning || inst.State == StateInitializing {
stopTimeout := resolveStopTimeout(stored)
if stopTimeout > deleteGracefulShutdownTimeout {
stopTimeout = deleteGracefulShutdownTimeout
Expand Down
64 changes: 49 additions & 15 deletions lib/instances/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,60 @@ import (
// waitForExecAgent polls until exec-agent is ready
func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, timeout time.Duration) error {
deadline := time.Now().Add(timeout)
lastState := StateUnknown
var lastErr error

for time.Now().Before(deadline) {
inst, err := mgr.GetInstance(ctx, instanceID)
if err != nil {
lastErr = err
time.Sleep(500 * time.Millisecond)
continue
}

lastState = inst.State
if inst.State != StateRunning {
time.Sleep(500 * time.Millisecond)
continue
}

meta, err := mgr.loadMetadata(instanceID)
if err == nil {
dialer, derr := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID)
if derr == nil {
var stdout, stderr bytes.Buffer
exit, eerr := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
Command: []string{"true"},
Stdout: &stdout,
Stderr: &stderr,
WaitForAgent: 1 * time.Second,
})
if eerr == nil && exit.Code == 0 {
return nil
}
}
if err != nil {
lastErr = err
time.Sleep(500 * time.Millisecond)
continue
}

dialer, err := hypervisor.NewVsockDialer(meta.HypervisorType, meta.VsockSocket, meta.VsockCID)
if err != nil {
lastErr = err
time.Sleep(500 * time.Millisecond)
continue
}

var stdout, stderr bytes.Buffer
exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
Command: []string{"true"},
Stdout: &stdout,
Stderr: &stderr,
WaitForAgent: 1 * time.Second,
})
if err == nil && exit.Code == 0 {
return nil
}
if err != nil {
lastErr = err
} else {
lastErr = fmt.Errorf("unexpected exit code: %d", exit.Code)
}

time.Sleep(500 * time.Millisecond)
}
return context.DeadlineExceeded

if lastErr != nil {
return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s): %w", instanceID, timeout, lastState, lastErr)
}
return fmt.Errorf("exec-agent not ready for instance %s within %v (last state: %s)", instanceID, timeout, lastState)
}

// Note: execCommand is defined in network_test.go
Expand Down
33 changes: 32 additions & 1 deletion lib/instances/firecracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
Hypervisor: hypervisor.TypeFirecracker,
})
require.NoError(t, err)
assert.Equal(t, StateRunning, inst.State)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)

inst, err = mgr.StandbyInstance(ctx, inst.Id)
require.NoError(t, err)
Expand All @@ -149,6 +151,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {

inst, err = mgr.RestoreInstance(ctx, inst.Id)
require.NoError(t, err)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
assert.Equal(t, StateRunning, inst.State)

inst, err = mgr.StopInstance(ctx, inst.Id)
Expand All @@ -159,6 +164,9 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
// Verify stopped -> start works after standby/restore lifecycle.
inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{})
require.NoError(t, err)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
assert.Equal(t, StateRunning, inst.State)

require.NoError(t, mgr.DeleteInstance(ctx, inst.Id))
Expand Down Expand Up @@ -189,6 +197,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {
Hypervisor: hypervisor.TypeFirecracker,
})
require.NoError(t, err)
require.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
require.Equal(t, StateRunning, inst.State)

// Establish a realistic standby/restore lifecycle first.
Expand All @@ -199,6 +210,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {

inst, err = mgr.RestoreInstance(ctx, inst.Id)
require.NoError(t, err)
require.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
require.Equal(t, StateRunning, inst.State)

// Simulate stale snapshot residue from a prior failure/interruption.
Expand All @@ -222,6 +236,9 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {

inst, err = mgr.StartInstance(ctx, inst.Id, StartInstanceRequest{})
require.NoError(t, err)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
assert.Equal(t, StateRunning, inst.State)

require.NoError(t, mgr.DeleteInstance(ctx, inst.Id))
Expand Down Expand Up @@ -257,6 +274,8 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) {
})
require.NoError(t, err)
require.NotNil(t, inst)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)

alloc, err := mgr.networkManager.GetAllocation(ctx, inst.Id)
require.NoError(t, err)
Expand Down Expand Up @@ -311,6 +330,9 @@ func TestFirecrackerNetworkLifecycle(t *testing.T) {

inst, err = mgr.RestoreInstance(ctx, inst.Id)
require.NoError(t, err)
assert.Contains(t, []State{StateInitializing, StateRunning}, inst.State)
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
assert.Equal(t, StateRunning, inst.State)

allocRestored, err := mgr.networkManager.GetAllocation(ctx, inst.Id)
Expand Down Expand Up @@ -376,6 +398,8 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {
Hypervisor: hypervisor.TypeFirecracker,
})
require.NoError(t, err)
source, err = waitForInstanceState(ctx, mgr, source.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
sourceID := source.Id
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) })
assert.NotEmpty(t, source.IP)
Expand All @@ -391,6 +415,9 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {
TargetState: StateRunning,
})
require.NoError(t, err)
require.Contains(t, []State{StateInitializing, StateRunning}, forked.State)
forked, err = waitForInstanceState(ctx, mgr, forked.Id, StateRunning, 20*time.Second)
require.NoError(t, err)
require.Equal(t, StateRunning, forked.State)
forkID := forked.Id
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) })
Expand All @@ -404,6 +431,10 @@ func TestFirecrackerForkFromRunningNetwork(t *testing.T) {

sourceAfterFork, err := mgr.GetInstance(ctx, sourceID)
require.NoError(t, err)
if sourceAfterFork.State != StateRunning {
sourceAfterFork, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, 20*time.Second)
require.NoError(t, err)
}
require.Equal(t, StateRunning, sourceAfterFork.State)
assert.NotEmpty(t, sourceAfterFork.IP)
assert.NotEmpty(t, sourceAfterFork.MAC)
Expand Down
10 changes: 9 additions & 1 deletion lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe
if err != nil {
return nil, err
}
if current.State == target {
if current.State == target || (target == StateRunning && current.State == StateInitializing) {
return returnWithReadiness(current, nil)
}

Expand Down Expand Up @@ -497,6 +497,14 @@ func cloneStoredMetadataForFork(src StoredMetadata) StoredMetadata {
stoppedAt := *src.StoppedAt
dst.StoppedAt = &stoppedAt
}
if src.ProgramStartedAt != nil {
programStartedAt := *src.ProgramStartedAt
dst.ProgramStartedAt = &programStartedAt
}
if src.GuestAgentReadyAt != nil {
guestAgentReadyAt := *src.GuestAgentReadyAt
dst.GuestAgentReadyAt = &guestAgentReadyAt
}
if src.ExitCode != nil {
exitCode := *src.ExitCode
dst.ExitCode = &exitCode
Expand Down
Loading
Loading