From 37494c8b10401baed82d51df016f0397f76107ab Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 01:08:41 -0500 Subject: [PATCH 1/7] feat: add cross-hypervisor guest memory reclaim policy --- Makefile | 17 +- cmd/api/config/config.go | 22 +- cmd/vz-shim/server.go | 8 +- cmd/vz-shim/vm.go | 16 +- config.example.darwin.yaml | 5 + config.example.yaml | 7 + lib/guestmemory/README.md | 49 +++ lib/guestmemory/kernel_args.go | 42 +++ lib/guestmemory/kernel_args_test.go | 12 + lib/guestmemory/policy.go | 92 ++++++ lib/guestmemory/policy_test.go | 33 ++ lib/hypervisor/cloudhypervisor/config.go | 14 + lib/hypervisor/cloudhypervisor/config_test.go | 29 ++ lib/hypervisor/config.go | 11 + lib/hypervisor/firecracker/config.go | 19 ++ lib/hypervisor/firecracker/config_test.go | 18 ++ lib/hypervisor/firecracker/firecracker.go | 8 + lib/hypervisor/qemu/config.go | 14 + lib/hypervisor/qemu/config_test.go | 17 ++ lib/hypervisor/qemu/process.go | 46 ++- lib/hypervisor/vz/shimconfig/config.go | 4 + lib/hypervisor/vz/starter.go | 20 +- lib/instances/create.go | 19 +- lib/instances/guestmemory_darwin_test.go | 195 ++++++++++++ lib/instances/guestmemory_linux_test.go | 286 ++++++++++++++++++ .../guestmemory_test_helpers_test.go | 15 + lib/instances/manager.go | 11 +- lib/providers/providers.go | 9 +- 28 files changed, 1011 insertions(+), 27 deletions(-) create mode 100644 lib/guestmemory/README.md create mode 100644 lib/guestmemory/kernel_args.go create mode 100644 lib/guestmemory/kernel_args_test.go create mode 100644 lib/guestmemory/policy.go create mode 100644 lib/guestmemory/policy_test.go create mode 100644 lib/hypervisor/cloudhypervisor/config_test.go create mode 100644 lib/instances/guestmemory_darwin_test.go create mode 100644 lib/instances/guestmemory_linux_test.go create mode 100644 lib/instances/guestmemory_test_helpers_test.go diff --git a/Makefile b/Makefile index 85ca0d43..e3f39db9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash 
-.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded +.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin test-guestmemory-linux test-guestmemory-vz install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin @@ -292,6 +292,21 @@ test-darwin: build-embedded sign-vz-shim go test -tags containers_image_openpgp $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) $$PKGS; \ fi +# Manual-only guest memory policy integration tests (Linux hypervisors). +test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded + @TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \ + GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \ + echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \ + sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \ + go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances + +# Manual-only guest memory policy integration test (macOS VZ). 
+test-guestmemory-vz: build-embedded sign-vz-shim + @echo "Running manual guest memory integration test (VZ)"; \ + PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \ + HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \ + go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances + # Generate JWT token for testing # Usage: make gen-jwt [USER_ID=test-user] # Checks CONFIG_PATH, then local config.yaml, then default config paths diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4ee3d442..2236b7f4 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -154,8 +154,17 @@ type CapacityConfig struct { // HypervisorConfig holds hypervisor settings. type HypervisorConfig struct { - Default string `koanf:"default"` - FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` + Default string `koanf:"default"` + FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` + Memory HypervisorMemoryConfig `koanf:"memory"` +} + +// HypervisorMemoryConfig holds guest memory management settings. +type HypervisorMemoryConfig struct { + Enabled bool `koanf:"enabled"` + KernelPageInitMode string `koanf:"kernel_page_init_mode"` + ReclaimEnabled bool `koanf:"reclaim_enabled"` + VZBalloonRequired bool `koanf:"vz_balloon_required"` } // GPUConfig holds GPU-related settings. 
@@ -300,6 +309,12 @@ func defaultConfig() *Config { Hypervisor: HypervisorConfig{ Default: "cloud-hypervisor", FirecrackerBinaryPath: "", + Memory: HypervisorMemoryConfig{ + Enabled: true, + KernelPageInitMode: "performance", + ReclaimEnabled: true, + VZBalloonRequired: true, + }, }, GPU: GPUConfig{ @@ -400,5 +415,8 @@ func (c *Config) Validate() error { if c.Build.Timeout <= 0 { return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout) } + if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" { + return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode) + } return nil } diff --git a/cmd/vz-shim/server.go b/cmd/vz-shim/server.go index 43ba9142..9acf0095 100644 --- a/cmd/vz-shim/server.go +++ b/cmd/vz-shim/server.go @@ -37,7 +37,8 @@ func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfigurati // VMInfoResponse matches the cloud-hypervisor VmInfo structure. 
type VMInfoResponse struct { - State string `json:"state"` + State string `json:"state"` + MemoryBalloonDevices int `json:"memory_balloon_devices,omitempty"` } type snapshotRequest struct { @@ -66,7 +67,10 @@ func (s *ShimServer) handleVMInfo(w http.ResponseWriter, r *http.Request) { defer s.mu.RUnlock() state := vzStateToString(s.vm.State()) - resp := VMInfoResponse{State: state} + resp := VMInfoResponse{ + State: state, + MemoryBalloonDevices: len(s.vm.MemoryBalloonDevices()), + } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(resp) diff --git a/cmd/vz-shim/vm.go b/cmd/vz-shim/vm.go index 9fa34012..0ce8dc8b 100644 --- a/cmd/vz-shim/vm.go +++ b/cmd/vz-shim/vm.go @@ -72,9 +72,19 @@ func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMac } vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig}) - // Do not attach memory balloon for now. - // Save/restore compatibility on VZ can fail with "invalid argument" for some - // Linux guest configurations when a balloon device is present. 
+ if config.EnableMemoryBalloon { + balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration() + if err != nil { + if config.RequireMemoryBalloon { + return nil, nil, fmt.Errorf("create memory balloon device: %w", err) + } + slog.Warn("memory balloon unavailable, continuing without balloon", "error", err) + } else { + vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{ + balloonConfig, + }) + } + } if validated, err := vmConfig.Validate(); !validated || err != nil { return nil, nil, fmt.Errorf("invalid vm configuration: %w", err) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index d491e493..07e6e1bb 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -34,6 +34,11 @@ port: "8080" # - "cloud-hypervisor" and "qemu" are NOT supported on macOS hypervisor: default: vz + memory: + enabled: true + kernel_page_init_mode: performance + reclaim_enabled: true + vz_balloon_required: true # ============================================================================= # Network Configuration (DIFFERENT ON MACOS) diff --git a/config.example.yaml b/config.example.yaml index 5b4f9db1..8368a344 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -26,6 +26,13 @@ data_dir: /var/lib/hypeman # default: cloud-hypervisor # # Optional: use a custom Firecracker binary path instead of the embedded one. 
# # firecracker_binary_path: /usr/local/bin/firecracker +# memory: +# enabled: true +# # performance: init_on_alloc=0 init_on_free=0 (better density) +# # hardened: init_on_alloc=1 init_on_free=1 (stronger hardening) +# kernel_page_init_mode: performance +# reclaim_enabled: true +# vz_balloon_required: true # ============================================================================= # Network Configuration diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md new file mode 100644 index 00000000..fea9f500 --- /dev/null +++ b/lib/guestmemory/README.md @@ -0,0 +1,49 @@ +# Guest Memory Reclaim + +This feature reduces host RAM waste from guest VMs by combining three behaviors: + +1. Lazy host allocation preservation: +The VM is configured with requested memory capacity, but host pages should only back guest pages as they are touched. + +2. Guest-to-host reclaim: +When the guest frees memory, virtio balloon/reporting/hinting features let the VMM return those pages to the host. + +3. Guest boot page-touch reduction: +The guest kernel page-init mode controls whether Linux eagerly touches pages: +- `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn. +- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost. + +## Runtime Flow + +- Operator config (`hypervisor.memory`) is normalized into one policy. +- The instances layer applies policy generically: + - merges kernel args with the selected page-init mode; + - sets generic memory feature toggles in `hypervisor.VMConfig.GuestMemory`. +- Each hypervisor backend maps generic toggles to native mechanisms: + - Cloud Hypervisor: `balloon` config with free page reporting and deflate-on-oom. + - QEMU: `virtio-balloon-pci` device options. + - Firecracker: `/balloon` API with free page hinting/reporting. + - VZ: attach `VirtioTraditionalMemoryBalloon` device. 
+ +## Backend Behavior Matrix + +| Hypervisor | Lazy allocation | Balloon | Free page reporting/hinting | Deflate on OOM | +|---|---|---|---|---| +| Cloud Hypervisor | Yes | Yes | Reporting | Yes | +| QEMU | Yes | Yes | Reporting (+ hinting when enabled) | Yes | +| Firecracker | Yes | Yes | Hinting + reporting | Yes | +| VZ | macOS-managed | Yes | Host-managed + guest cooperation | Host-managed | + +## Failure Behavior + +- If policy is disabled (`enabled: false`), neither the page-init kernel args nor the reclaim features are applied. +- If reclaim is disabled (`reclaim_enabled: false`), balloon/reporting/hinting are not applied; page-init kernel args still are. +- For VZ, balloon attachment is attempted whenever reclaim is enabled. + - If `vz_balloon_required=true`, startup fails if balloon cannot be configured. + - If `vz_balloon_required=false`, startup continues without balloon and logs a warning. + +## Out of Scope + +- No API surface changes. +- No scheduler/admission logic changes. +- No automatic background tuning loops outside hypervisor-supported reclaim mechanisms. diff --git a/lib/guestmemory/kernel_args.go b/lib/guestmemory/kernel_args.go new file mode 100644 index 00000000..73f0d1e9 --- /dev/null +++ b/lib/guestmemory/kernel_args.go @@ -0,0 +1,42 @@ +package guestmemory + +import "strings" + +// MergeKernelArgs merges kernel args deterministically. +// Duplicate keys keep the position of their first occurrence and take the value of their last ("last write wins"). 
+func MergeKernelArgs(base string, extras ...string) string { + tokens := strings.Fields(base) + order := make([]string, 0, len(tokens)) + values := make(map[string]string, len(tokens)) + + for _, tok := range tokens { + k := argKey(tok) + if _, ok := values[k]; !ok { + order = append(order, k) + } + values[k] = tok + } + + for _, extra := range extras { + for _, tok := range strings.Fields(extra) { + k := argKey(tok) + if _, ok := values[k]; !ok { + order = append(order, k) + } + values[k] = tok + } + } + + merged := make([]string, 0, len(order)) + for _, k := range order { + merged = append(merged, values[k]) + } + return strings.Join(merged, " ") +} + +func argKey(token string) string { + if idx := strings.IndexByte(token, '='); idx >= 0 { + return token[:idx] + } + return token +} diff --git a/lib/guestmemory/kernel_args_test.go b/lib/guestmemory/kernel_args_test.go new file mode 100644 index 00000000..7aab70a2 --- /dev/null +++ b/lib/guestmemory/kernel_args_test.go @@ -0,0 +1,12 @@ +package guestmemory + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMergeKernelArgs(t *testing.T) { + merged := MergeKernelArgs("console=ttyS0 foo=1", "foo=2", "init_on_alloc=0 init_on_free=0") + assert.Equal(t, "console=ttyS0 foo=2 init_on_alloc=0 init_on_free=0", merged) +} diff --git a/lib/guestmemory/policy.go b/lib/guestmemory/policy.go new file mode 100644 index 00000000..1a261b15 --- /dev/null +++ b/lib/guestmemory/policy.go @@ -0,0 +1,92 @@ +package guestmemory + +// KernelPageInitMode controls guest kernel page initialization behavior. +type KernelPageInitMode string + +const ( + // KernelPageInitPerformance minimizes guest page touching to preserve lazy host allocation. + KernelPageInitPerformance KernelPageInitMode = "performance" + // KernelPageInitHardened enforces page init-on-alloc/free hardening in the guest kernel. 
+ KernelPageInitHardened KernelPageInitMode = "hardened" +) + +// Policy is the normalized, hypervisor-agnostic guest memory policy. +type Policy struct { + Enabled bool + KernelPageInitMode KernelPageInitMode + ReclaimEnabled bool + VZBalloonRequired bool +} + +// Features are generic guest memory toggles consumed by hypervisor backends. +type Features struct { + EnableBalloon bool + FreePageReporting bool + DeflateOnOOM bool + FreePageHinting bool + RequireBalloon bool +} + +// DefaultPolicy returns default policy values for density-first environments. +func DefaultPolicy() Policy { + return Policy{ + Enabled: true, + KernelPageInitMode: KernelPageInitPerformance, + ReclaimEnabled: true, + VZBalloonRequired: true, + } +} + +// Normalize applies defaults and sanitizes invalid modes. +func (p Policy) Normalize() Policy { + d := DefaultPolicy() + + if p.KernelPageInitMode == "" { + p.KernelPageInitMode = d.KernelPageInitMode + } + if p.KernelPageInitMode != KernelPageInitPerformance && p.KernelPageInitMode != KernelPageInitHardened { + p.KernelPageInitMode = d.KernelPageInitMode + } + + if !p.Enabled { + return Policy{ + Enabled: false, + KernelPageInitMode: p.KernelPageInitMode, + ReclaimEnabled: false, + VZBalloonRequired: p.VZBalloonRequired, + } + } + + return p +} + +// KernelArgs returns kernel args implied by the policy. +func (p Policy) KernelArgs() []string { + n := p.Normalize() + if !n.Enabled { + return nil + } + + switch n.KernelPageInitMode { + case KernelPageInitHardened: + return []string{"init_on_alloc=1", "init_on_free=1"} + default: + return []string{"init_on_alloc=0", "init_on_free=0"} + } +} + +// FeaturesForHypervisor returns generic memory features for backend translation. 
+func (p Policy) FeaturesForHypervisor() Features { + n := p.Normalize() + if !n.Enabled || !n.ReclaimEnabled { + return Features{} + } + + return Features{ + EnableBalloon: true, + FreePageReporting: true, + DeflateOnOOM: true, + FreePageHinting: true, + RequireBalloon: n.VZBalloonRequired, + } +} diff --git a/lib/guestmemory/policy_test.go b/lib/guestmemory/policy_test.go new file mode 100644 index 00000000..00bebbce --- /dev/null +++ b/lib/guestmemory/policy_test.go @@ -0,0 +1,33 @@ +package guestmemory + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestPolicyKernelArgs(t *testing.T) { + p := DefaultPolicy() + assert.Equal(t, []string{"init_on_alloc=0", "init_on_free=0"}, p.KernelArgs()) + + hardened := p + hardened.KernelPageInitMode = KernelPageInitHardened + assert.Equal(t, []string{"init_on_alloc=1", "init_on_free=1"}, hardened.KernelArgs()) + + disabled := p + disabled.Enabled = false + assert.Empty(t, disabled.KernelArgs()) +} + +func TestFeaturesForHypervisor(t *testing.T) { + f := DefaultPolicy().FeaturesForHypervisor() + assert.True(t, f.EnableBalloon) + assert.True(t, f.FreePageReporting) + assert.True(t, f.DeflateOnOOM) + assert.True(t, f.FreePageHinting) + assert.True(t, f.RequireBalloon) + + p := DefaultPolicy() + p.ReclaimEnabled = false + assert.Equal(t, Features{}, p.FeaturesForHypervisor()) +} diff --git a/lib/hypervisor/cloudhypervisor/config.go b/lib/hypervisor/cloudhypervisor/config.go index ab51676b..d728036b 100644 --- a/lib/hypervisor/cloudhypervisor/config.go +++ b/lib/hypervisor/cloudhypervisor/config.go @@ -113,6 +113,19 @@ func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig { devices = &deviceConfigs } + var balloon *vmm.BalloonConfig + if cfg.GuestMemory.EnableBalloon { + balloon = &vmm.BalloonConfig{ + Size: 0, + } + if cfg.GuestMemory.DeflateOnOOM { + balloon.DeflateOnOom = ptr(true) + } + if cfg.GuestMemory.FreePageReporting { + balloon.FreePageReporting = ptr(true) + } + } + return vmm.VmConfig{ 
Payload: payload, Cpus: &cpus, @@ -123,5 +136,6 @@ func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig { Net: nets, Vsock: vsock, Devices: devices, + Balloon: balloon, } } diff --git a/lib/hypervisor/cloudhypervisor/config_test.go b/lib/hypervisor/cloudhypervisor/config_test.go new file mode 100644 index 00000000..b5cdb96e --- /dev/null +++ b/lib/hypervisor/cloudhypervisor/config_test.go @@ -0,0 +1,29 @@ +package cloudhypervisor + +import ( + "testing" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestToVMConfig_GuestMemoryBalloon(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageReporting: true, + }, + } + + vmCfg := ToVMConfig(cfg) + require.NotNil(t, vmCfg.Balloon) + assert.Equal(t, int64(0), vmCfg.Balloon.Size) + require.NotNil(t, vmCfg.Balloon.DeflateOnOom) + assert.True(t, *vmCfg.Balloon.DeflateOnOom) + require.NotNil(t, vmCfg.Balloon.FreePageReporting) + assert.True(t, *vmCfg.Balloon.FreePageReporting) +} diff --git a/lib/hypervisor/config.go b/lib/hypervisor/config.go index a7ed34df..21a7aac8 100644 --- a/lib/hypervisor/config.go +++ b/lib/hypervisor/config.go @@ -8,6 +8,7 @@ type VMConfig struct { MemoryBytes int64 HotplugBytes int64 Topology *CPUTopology + GuestMemory GuestMemoryConfig // Storage Disks []DiskConfig @@ -31,6 +32,16 @@ type VMConfig struct { KernelArgs string } +// GuestMemoryConfig contains hypervisor-agnostic guest memory feature toggles. +type GuestMemoryConfig struct { + EnableBalloon bool + FreePageReporting bool + DeflateOnOOM bool + FreePageHinting bool + // RequireBalloon controls whether VM startup should fail if balloon setup fails. 
+ RequireBalloon bool +} + // CPUTopology defines the virtual CPU topology type CPUTopology struct { ThreadsPerCore int diff --git a/lib/hypervisor/firecracker/config.go b/lib/hypervisor/firecracker/config.go index b9b60cb0..5ba47cc1 100644 --- a/lib/hypervisor/firecracker/config.go +++ b/lib/hypervisor/firecracker/config.go @@ -52,6 +52,13 @@ type serialDevice struct { SerialOutPath string `json:"serial_out_path"` } +type balloon struct { + AmountMib int64 `json:"amount_mib"` + DeflateOnOOM bool `json:"deflate_on_oom"` + FreePageHinting bool `json:"free_page_hinting,omitempty"` + FreePageReporting bool `json:"free_page_reporting,omitempty"` +} + type instanceActionInfo struct { ActionType string `json:"action_type"` } @@ -160,6 +167,18 @@ func toVsockConfig(cfg hypervisor.VMConfig) *vsock { } } +func toBalloonConfig(cfg hypervisor.VMConfig) *balloon { + if !cfg.GuestMemory.EnableBalloon { + return nil + } + return &balloon{ + AmountMib: 0, + DeflateOnOOM: cfg.GuestMemory.DeflateOnOOM, + FreePageHinting: cfg.GuestMemory.FreePageHinting, + FreePageReporting: cfg.GuestMemory.FreePageReporting, + } +} + func toRateLimiter(limit int64, burst int64) *rateLimiter { if limit <= 0 { return nil diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index 5f2ef6ca..4649ea61 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -73,3 +73,21 @@ func TestSnapshotParamPaths(t *testing.T) { assert.False(t, load.ResumeVM) require.Len(t, load.NetworkOverrides, 1) } + +func TestToBalloonConfig(t *testing.T) { + cfg := hypervisor.VMConfig{ + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageHinting: true, + FreePageReporting: true, + }, + } + + b := toBalloonConfig(cfg) + require.NotNil(t, b) + assert.Equal(t, int64(0), b.AmountMib) + assert.True(t, b.DeflateOnOOM) + assert.True(t, b.FreePageHinting) + assert.True(t, b.FreePageReporting) +} 
diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index e3289794..3d512c6b 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -141,6 +141,14 @@ func (f *Firecracker) configureForBoot(ctx context.Context, cfg hypervisor.VMCon if _, err := f.do(ctx, http.MethodPut, "/machine-config", toMachineConfiguration(cfg), http.StatusNoContent); err != nil { return fmt.Errorf("configure machine: %w", err) } + if balloonCfg := toBalloonConfig(cfg); balloonCfg != nil { + if _, err := f.do(ctx, http.MethodPut, "/balloon", balloonCfg, http.StatusNoContent); err != nil { + // Keep compatibility with older/custom binaries that may not expose balloon API. + if !strings.Contains(err.Error(), "Invalid request method and/or path") { + return fmt.Errorf("configure balloon: %w", err) + } + } + } for _, driveCfg := range toDriveConfigs(cfg) { path := "/drives/" + url.PathEscape(driveCfg.DriveID) diff --git a/lib/hypervisor/qemu/config.go b/lib/hypervisor/qemu/config.go index 23104ef5..73a0d617 100644 --- a/lib/hypervisor/qemu/config.go +++ b/lib/hypervisor/qemu/config.go @@ -25,6 +25,20 @@ func BuildArgs(cfg hypervisor.VMConfig) []string { memMB := cfg.MemoryBytes / (1024 * 1024) args = append(args, "-m", fmt.Sprintf("%dM", memMB)) + if cfg.GuestMemory.EnableBalloon { + balloonOpts := []string{"virtio-balloon-pci"} + if cfg.GuestMemory.DeflateOnOOM { + balloonOpts = append(balloonOpts, "deflate-on-oom=on") + } + if cfg.GuestMemory.FreePageReporting { + balloonOpts = append(balloonOpts, "free-page-reporting=on") + } + if cfg.GuestMemory.FreePageHinting { + balloonOpts = append(balloonOpts, "free-page-hint=on") + } + args = append(args, "-device", strings.Join(balloonOpts, ",")) + } + // Kernel and initrd if cfg.KernelPath != "" { args = append(args, "-kernel", cfg.KernelPath) diff --git a/lib/hypervisor/qemu/config_test.go b/lib/hypervisor/qemu/config_test.go index a5fb63e4..0c5ba090 
100644 --- a/lib/hypervisor/qemu/config_test.go +++ b/lib/hypervisor/qemu/config_test.go @@ -159,3 +159,20 @@ func TestBuildArgs_NoSerialLog(t *testing.T) { assert.Contains(t, args, "-serial") assert.Contains(t, args, "stdio") } + +func TestBuildArgs_GuestMemoryBalloon(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageReporting: true, + FreePageHinting: true, + }, + } + + args := BuildArgs(cfg) + assert.Contains(t, args, "-device") + assert.Contains(t, args, "virtio-balloon-pci,deflate-on-oom=on,free-page-reporting=on,free-page-hint=on") +} diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index e2e1d098..ad584f1b 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -204,12 +204,46 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) - // Build command arguments: QMP socket + VM configuration - args := buildQMPArgs(socketPath) - args = append(args, BuildArgs(config)...) + // Some distro QEMU builds may not support newer balloon sub-options. + // Retry with progressively more conservative balloon args before failing. 
+ attempts := []hypervisor.VMConfig{config} + if config.GuestMemory.EnableBalloon && (config.GuestMemory.FreePageReporting || config.GuestMemory.FreePageHinting) { + fallback := config + fallback.GuestMemory.FreePageReporting = false + fallback.GuestMemory.FreePageHinting = false + attempts = append(attempts, fallback) + } + if config.GuestMemory.EnableBalloon && config.GuestMemory.DeflateOnOOM { + fallback := config + fallback.GuestMemory.FreePageReporting = false + fallback.GuestMemory.FreePageHinting = false + fallback.GuestMemory.DeflateOnOOM = false + attempts = append(attempts, fallback) + } - pid, hv, cu, err := s.startQEMUProcess(ctx, p, version, socketPath, args) - if err != nil { + var ( + pid int + hv *QEMU + cu *cleanup.Cleanup + err error + booted hypervisor.VMConfig + ) + for i, attempt := range attempts { + // Build command arguments: QMP socket + VM configuration + args := buildQMPArgs(socketPath) + args = append(args, BuildArgs(attempt)...) + pid, hv, cu, err = s.startQEMUProcess(ctx, p, version, socketPath, args) + if err == nil { + booted = attempt + break + } + if i < len(attempts)-1 { + // Ensure a failed prior attempt doesn't keep the old socket path reserved. 
+ _ = os.Remove(socketPath) + time.Sleep(100 * time.Millisecond) + log.WarnContext(ctx, "qemu start failed, retrying with reduced balloon features", "attempt", i+1, "error", err) + continue + } return 0, nil, err } defer cu.Clean() @@ -217,7 +251,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s // Save config for potential restore later // QEMU migration files only contain memory state, not device config instanceDir := filepath.Dir(socketPath) - if err := saveVMConfig(instanceDir, config); err != nil { + if err := saveVMConfig(instanceDir, booted); err != nil { // Non-fatal - restore just won't work log.WarnContext(ctx, "failed to save VM config for restore", "error", err) } diff --git a/lib/hypervisor/vz/shimconfig/config.go b/lib/hypervisor/vz/shimconfig/config.go index 630e841b..f52f6eb1 100644 --- a/lib/hypervisor/vz/shimconfig/config.go +++ b/lib/hypervisor/vz/shimconfig/config.go @@ -32,6 +32,10 @@ type ShimConfig struct { InitrdPath string `json:"initrd_path"` KernelArgs string `json:"kernel_args"` + // Guest memory reclaim + EnableMemoryBalloon bool `json:"enable_memory_balloon,omitempty"` + RequireMemoryBalloon bool `json:"require_memory_balloon,omitempty"` + // Socket paths (where shim should listen) ControlSocket string `json:"control_socket"` VsockSocket string `json:"vsock_socket"` diff --git a/lib/hypervisor/vz/starter.go b/lib/hypervisor/vz/starter.go index fc7a041d..d9f83267 100644 --- a/lib/hypervisor/vz/starter.go +++ b/lib/hypervisor/vz/starter.go @@ -160,15 +160,17 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, func buildShimConfigFromVMConfig(config hypervisor.VMConfig, socketPath string) shimconfig.ShimConfig { instanceDir := filepath.Dir(socketPath) cfg := shimconfig.ShimConfig{ - VCPUs: config.VCPUs, - MemoryBytes: config.MemoryBytes, - SerialLogPath: config.SerialLogPath, - KernelPath: config.KernelPath, - InitrdPath: config.InitrdPath, - KernelArgs: 
config.KernelArgs, - ControlSocket: socketPath, - VsockSocket: filepath.Join(instanceDir, "vz.vsock"), - LogPath: filepath.Join(instanceDir, "logs", "vz-shim.log"), + VCPUs: config.VCPUs, + MemoryBytes: config.MemoryBytes, + SerialLogPath: config.SerialLogPath, + KernelPath: config.KernelPath, + InitrdPath: config.InitrdPath, + KernelArgs: config.KernelArgs, + EnableMemoryBalloon: config.GuestMemory.EnableBalloon, + RequireMemoryBalloon: config.GuestMemory.RequireBalloon, + ControlSocket: socketPath, + VsockSocket: filepath.Join(instanceDir, "vz.vsock"), + LogPath: filepath.Join(instanceDir, "logs", "vz-shim.log"), } for _, disk := range config.Disks { cfg.Disks = append(cfg.Disks, shimconfig.DiskConfig{ diff --git a/lib/instances/create.go b/lib/instances/create.go index 4566544c..a99ecf2d 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -8,6 +8,7 @@ import ( "time" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/logger" @@ -700,6 +701,7 @@ func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, ima MemoryBytes: inst.Size, HotplugBytes: inst.HotplugSize, Topology: topology, + GuestMemory: m.guestMemoryConfig(), Disks: disks, Networks: networks, SerialLogPath: m.paths.InstanceAppLog(inst.Id), @@ -715,10 +717,23 @@ func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, ima // kernelArgs returns the kernel command line arguments for the given hypervisor type. // vz uses hvc0 (virtio console), all others use ttyS0 (serial port). 
func (m *manager) kernelArgs(hvType hypervisor.Type) string { + console := "console=ttyS0" if hvType == hypervisor.TypeVZ { - return "console=hvc0" + console = "console=hvc0" + } + policyArgs := strings.Join(m.guestMemoryPolicy.KernelArgs(), " ") + return guestmemory.MergeKernelArgs(console, policyArgs) +} + +func (m *manager) guestMemoryConfig() hypervisor.GuestMemoryConfig { + features := m.guestMemoryPolicy.FeaturesForHypervisor() + return hypervisor.GuestMemoryConfig{ + EnableBalloon: features.EnableBalloon, + FreePageReporting: features.FreePageReporting, + DeflateOnOOM: features.DeflateOnOOM, + FreePageHinting: features.FreePageHinting, + RequireBalloon: features.RequireBalloon, } - return "console=ttyS0" } func ptr[T any](v T) *T { diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go new file mode 100644 index 00000000..a032b334 --- /dev/null +++ b/lib/instances/guestmemory_darwin_test.go @@ -0,0 +1,195 @@ +//go:build darwin + +package instances + +import ( + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "os/exec" + "runtime" + "strconv" + "strings" + "testing" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGuestMemoryPolicyVZ(t *testing.T) { + requireGuestMemoryManualRun(t) + if runtime.GOOS != "darwin" { + t.Skip("vz tests require macOS") + } + if runtime.GOARCH != "arm64" { + t.Skip("vz tests require Apple Silicon") + } + + mgr, tmpDir := setupVZTestManager(t) + ctx := context.Background() + + createNginxImageAndWaitDarwin(t, ctx, mgr.imageManager) + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-vz", + Image: "docker.io/library/nginx:alpine", + Size: 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: 
hypervisor.TypeVZ, + }) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + defer func() { _ = mgr.DeleteInstance(ctx, inst.Id) }() + + require.NoError(t, waitForExecAgent(ctx, mgr, inst.Id, 30*time.Second)) + + out, exitCode, err := vzExecCommand(ctx, inst, "cat", "/proc/cmdline") + require.NoError(t, err) + require.Equal(t, 0, exitCode) + assert.Contains(t, out, "init_on_alloc=0") + assert.Contains(t, out, "init_on_free=0") + + info, err := getVZVMInfo(inst.SocketPath) + require.NoError(t, err) + assert.GreaterOrEqual(t, info.MemoryBalloonDevices, 1, "vz shim should report attached memory balloon device") + + runVZGuestMemoryReclaimProbe(t, ctx, mgr, inst.Id) +} + +func createNginxImageAndWaitDarwin(t *testing.T, ctx context.Context, imageManager images.Manager) { + t.Helper() + + nginxImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: "docker.io/library/nginx:alpine", + }) + require.NoError(t, err) + + imageName := nginxImage.Name + for i := 0; i < 120; i++ { + img, err := imageManager.GetImage(ctx, imageName) + if err == nil && img.Status == images.StatusReady { + return + } + if err == nil && img.Status == images.StatusFailed { + if img.Error != nil { + t.Fatalf("image build failed: %s", *img.Error) + } + t.Fatalf("image build failed: unknown error") + } + time.Sleep(1 * time.Second) + } + t.Fatalf("timed out waiting for image %q to become ready", imageName) +} + +type vzVMInfo struct { + State string `json:"state"` + MemoryBalloonDevices int `json:"memory_balloon_devices"` +} + +func getVZVMInfo(socketPath string) (*vzVMInfo, error) { + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + DisableKeepAlives: true, + } + client := &http.Client{Transport: transport, Timeout: 5 * time.Second} + + req, err := http.NewRequest(http.MethodGet, "http://vz-shim/api/v1/vm.info", nil) + if err 
!= nil { + return nil, err + } + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected vz vm.info status: %d", resp.StatusCode) + } + var info vzVMInfo + if err := json.NewDecoder(resp.Body).Decode(&info); err != nil { + return nil, err + } + return &info, nil +} + +func runVZGuestMemoryReclaimProbe(t *testing.T, ctx context.Context, mgr *manager, instanceID string) { + t.Helper() + + inst, err := mgr.GetInstance(ctx, instanceID) + require.NoError(t, err) + require.NotNil(t, inst.HypervisorPID) + pid := *inst.HypervisorPID + + baselineRSS := mustReadDarwinRSSBytes(t, pid) + + workloadErr := make(chan error, 1) + go func() { + cmd := "set -e; test -d /dev/shm || mkdir -p /dev/shm; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=128 >/dev/null 2>&1; sleep 2; rm -f /dev/shm/hype-mem; sync; sleep 2" + _, exitCode, err := vzExecCommand(ctx, inst, "sh", "-c", cmd) + if err != nil { + workloadErr <- err + return + } + if exitCode != 0 { + workloadErr <- fmt.Errorf("guest workload exited with code %d", exitCode) + return + } + workloadErr <- nil + }() + + peakRSS := baselineRSS + ticker := time.NewTicker(250 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case err := <-workloadErr: + require.NoError(t, err) + goto done + case <-ticker.C: + rss := mustReadDarwinRSSBytes(t, pid) + if rss > peakRSS { + peakRSS = rss + } + } + } + +done: + assert.Greater(t, peakRSS, baselineRSS+(8*1024*1024)) + + deadline := time.Now().Add(10 * time.Second) + postRSS := mustReadDarwinRSSBytes(t, pid) + for time.Now().Before(deadline) { + postRSS = mustReadDarwinRSSBytes(t, pid) + if postRSS < peakRSS-(4*1024*1024) { + break + } + time.Sleep(250 * time.Millisecond) + } + assert.Less(t, postRSS, peakRSS) +} + +func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { + t.Helper() + cmd := exec.Command("ps", "-o", "rss=", "-p", strconv.Itoa(pid)) + out, err := 
cmd.Output() + require.NoError(t, err) + trimmed := strings.TrimSpace(string(out)) + require.NotEmpty(t, trimmed) + kb, err := strconv.ParseInt(trimmed, 10, 64) + require.NoError(t, err) + return kb * 1024 +} diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go new file mode 100644 index 00000000..0fab0b3f --- /dev/null +++ b/lib/instances/guestmemory_linux_test.go @@ -0,0 +1,286 @@ +//go:build linux + +package instances + +import ( + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "os" + "strconv" + "strings" + "testing" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/vmm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { + requireGuestMemoryManualRun(t) + requireKVMAccess(t) + + mgr, _ := setupTestManager(t) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-ch", + Image: "docker.io/library/alpine:latest", + Size: 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeCloudHypervisor, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryWorkloadScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + require.NoError(t, waitForVMReady(ctx, inst.SocketPath, 10*time.Second)) + + client, err := vmm.NewVMM(inst.SocketPath) + require.NoError(t, err) + infoResp, err := client.GetVmInfoWithResponse(ctx) + require.NoError(t, err) + require.Equal(t, 200, infoResp.StatusCode()) + require.NotNil(t, infoResp.JSON200) + require.NotNil(t, infoResp.JSON200.Config.Payload) + require.NotNil(t, 
infoResp.JSON200.Config.Payload.Cmdline) + assert.Contains(t, *infoResp.JSON200.Config.Payload.Cmdline, "init_on_alloc=0") + assert.Contains(t, *infoResp.JSON200.Config.Payload.Cmdline, "init_on_free=0") + + require.NotNil(t, infoResp.JSON200.Config.Balloon, "cloud-hypervisor vm.info config should include balloon") + assert.True(t, infoResp.JSON200.Config.Balloon.DeflateOnOom != nil && *infoResp.JSON200.Config.Balloon.DeflateOnOom) + assert.True(t, infoResp.JSON200.Config.Balloon.FreePageReporting != nil && *infoResp.JSON200.Config.Balloon.FreePageReporting) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + runGuestMemoryReclaimProbe(t, pid) +} + +func TestGuestMemoryPolicyQEMU(t *testing.T) { + requireGuestMemoryManualRun(t) + requireKVMAccess(t) + + mgr, _ := setupTestManagerForQEMU(t) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-qemu", + Image: "docker.io/library/alpine:latest", + Size: 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeQEMU, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryWorkloadScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + require.NoError(t, waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second)) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) + require.NoError(t, err) + joined := strings.ReplaceAll(string(cmdline), "\x00", " ") + assert.Contains(t, joined, "init_on_alloc=0") + assert.Contains(t, joined, "init_on_free=0") + assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") + + runGuestMemoryReclaimProbe(t, pid) +} + +func 
TestGuestMemoryPolicyFirecracker(t *testing.T) { + requireGuestMemoryManualRun(t) + requireFirecrackerIntegrationPrereqs(t) + + mgr, _ := setupTestManagerForFirecracker(t) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-fc", + Image: "docker.io/library/alpine:latest", + Size: 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeFirecracker, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryWorkloadScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + vmCfg, err := getFirecrackerVMConfig(inst.SocketPath) + require.NoError(t, err) + assert.Contains(t, vmCfg.BootSource.BootArgs, "init_on_alloc=0") + assert.Contains(t, vmCfg.BootSource.BootArgs, "init_on_free=0") + assert.True(t, vmCfg.Balloon.DeflateOnOOM) + assert.True(t, vmCfg.Balloon.FreePageHinting) + assert.True(t, vmCfg.Balloon.FreePageReporting) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + runGuestMemoryReclaimProbe(t, pid) +} + +func guestMemoryWorkloadScript() string { + return "set -e; sleep 8; test -d /dev/shm || mkdir -p /dev/shm; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256 >/dev/null 2>&1; sleep 3; rm -f /dev/shm/hype-mem; sync; sleep 120" +} + +func createImageAndWait(t *testing.T, ctx context.Context, imageManager images.Manager, imageName string) { + t.Helper() + + img, err := imageManager.CreateImage(ctx, images.CreateImageRequest{Name: imageName}) + require.NoError(t, err) + + for i := 0; i < 180; i++ { + current, err := imageManager.GetImage(ctx, img.Name) + if err == nil && current.Status == images.StatusReady { + return + } + if err == nil && current.Status == images.StatusFailed { + if current.Error != nil { + t.Fatalf("image 
build failed: %s", *current.Error) + } + t.Fatalf("image build failed: unknown error") + } + time.Sleep(1 * time.Second) + } + t.Fatalf("timed out waiting for image %q to become ready", img.Name) +} + +func requireHypervisorPID(t *testing.T, ctx context.Context, mgr *manager, instanceID string) int { + t.Helper() + inst, err := mgr.GetInstance(ctx, instanceID) + require.NoError(t, err) + require.NotNil(t, inst.HypervisorPID) + return *inst.HypervisorPID +} + +func runGuestMemoryReclaimProbe(t *testing.T, pid int) { + t.Helper() + + baselineRSS := mustReadRSSBytes(t, pid) + peakRSS := baselineRSS + postPeakMinRSS := int64(0) + growthThreshold := int64(16 * 1024 * 1024) + dropSignalThreshold := int64(1 * 1024 * 1024) + + // Wait for the in-guest workload to allocate memory and require a visible RSS increase. + deadline := time.Now().Add(50 * time.Second) + for time.Now().Before(deadline) { + rss := mustReadRSSBytes(t, pid) + if rss > peakRSS { + peakRSS = rss + } + if peakRSS > baselineRSS+growthThreshold { + postPeakMinRSS = rss + break + } + time.Sleep(500 * time.Millisecond) + } + + assert.Greaterf( + t, + peakRSS, + baselineRSS+growthThreshold, + "expected RSS to rise during workload (baseline=%d peak=%d growth_threshold=%d)", + baselineRSS, + peakRSS, + growthThreshold, + ) + + // Reclaim/drop signal is best-effort: backend flags are validated elsewhere in each test. + // Host RSS accounting and kernel reclaim timing can vary across systems. 
+ recoveryDeadline := time.Now().Add(12 * time.Second) + for time.Now().Before(recoveryDeadline) { + rss := mustReadRSSBytes(t, pid) + if postPeakMinRSS == 0 || rss < postPeakMinRSS { + postPeakMinRSS = rss + } + time.Sleep(500 * time.Millisecond) + } + + drop := peakRSS - postPeakMinRSS + if drop >= dropSignalThreshold { + t.Logf("observed post-peak RSS drop: %d bytes (baseline=%d peak=%d min=%d)", drop, baselineRSS, peakRSS, postPeakMinRSS) + return + } + t.Logf("no clear post-peak RSS drop observed (baseline=%d peak=%d min=%d)", baselineRSS, peakRSS, postPeakMinRSS) +} + +func mustReadRSSBytes(t *testing.T, pid int) int64 { + t.Helper() + statusPath := fmt.Sprintf("/proc/%d/status", pid) + data, err := os.ReadFile(statusPath) + require.NoError(t, err) + + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "VmRSS:") { + fields := strings.Fields(line) + require.GreaterOrEqual(t, len(fields), 2) + kb, err := strconv.ParseInt(fields[1], 10, 64) + require.NoError(t, err) + return kb * 1024 + } + } + t.Fatalf("VmRSS not found in %s", statusPath) + return 0 +} + +type firecrackerVMConfig struct { + BootSource struct { + BootArgs string `json:"boot_args"` + } `json:"boot-source"` + Balloon struct { + DeflateOnOOM bool `json:"deflate_on_oom"` + FreePageHinting bool `json:"free_page_hinting"` + FreePageReporting bool `json:"free_page_reporting"` + } `json:"balloon"` +} + +func getFirecrackerVMConfig(socketPath string) (*firecrackerVMConfig, error) { + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + DisableKeepAlives: true, + } + client := &http.Client{Transport: transport, Timeout: 5 * time.Second} + + req, err := http.NewRequest(http.MethodGet, "http://localhost/vm/config", nil) + if err != nil { + return nil, err + } + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer 
resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected firecracker /vm/config status: %d", resp.StatusCode) + } + + var cfg firecrackerVMConfig + if err := json.NewDecoder(resp.Body).Decode(&cfg); err != nil { + return nil, err + } + return &cfg, nil +} diff --git a/lib/instances/guestmemory_test_helpers_test.go b/lib/instances/guestmemory_test_helpers_test.go new file mode 100644 index 00000000..11668d05 --- /dev/null +++ b/lib/instances/guestmemory_test_helpers_test.go @@ -0,0 +1,15 @@ +package instances + +import ( + "os" + "testing" +) + +const guestMemoryManualEnv = "HYPEMAN_RUN_GUESTMEMORY_TESTS" + +func requireGuestMemoryManualRun(t *testing.T) { + t.Helper() + if os.Getenv(guestMemoryManualEnv) != "1" { + t.Skipf("set %s=1 to run guest memory integration tests", guestMemoryManualEnv) + } +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 3b581e83..1b8804aa 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -6,6 +6,7 @@ import ( "sync" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/network" @@ -84,6 +85,7 @@ type manager struct { // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request + guestMemoryPolicy guestmemory.Policy } // platformStarters is populated by platform-specific init functions. @@ -92,12 +94,18 @@ var platformStarters = make(map[hypervisor.Type]hypervisor.VMStarter) // NewManager creates a new instances manager. // If meter is nil, metrics are disabled. // defaultHypervisor specifies which hypervisor to use when not specified in requests. 
-func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, meter metric.Meter, tracer trace.Tracer) Manager { +func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, meter metric.Meter, tracer trace.Tracer, memoryPolicy ...guestmemory.Policy) Manager { // Validate and default the hypervisor type if defaultHypervisor == "" { defaultHypervisor = hypervisor.TypeCloudHypervisor } + policy := guestmemory.DefaultPolicy() + if len(memoryPolicy) > 0 { + policy = memoryPolicy[0] + } + policy = policy.Normalize() + // Initialize VM starters from platform-specific init functions vmStarters := make(map[hypervisor.Type]hypervisor.VMStarter, len(platformStarters)) for hvType, starter := range platformStarters { @@ -116,6 +124,7 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste hostTopology: detectHostTopology(), // Detect and cache host topology vmStarters: vmStarters, defaultHypervisor: defaultHypervisor, + guestMemoryPolicy: policy, } // Initialize metrics if meter is provided diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 25cc3536..be93dca0 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -13,6 +13,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/builds" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/hypervisor/firecracker" "github.com/kernel/hypeman/lib/images" @@ -125,7 +126,13 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima meter := 
otel.GetMeterProvider().Meter("hypeman") tracer := otel.GetTracerProvider().Tracer("hypeman") defaultHypervisor := hypervisor.Type(cfg.Hypervisor.Default) - return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer), nil + memoryPolicy := guestmemory.Policy{ + Enabled: cfg.Hypervisor.Memory.Enabled, + KernelPageInitMode: guestmemory.KernelPageInitMode(cfg.Hypervisor.Memory.KernelPageInitMode), + ReclaimEnabled: cfg.Hypervisor.Memory.ReclaimEnabled, + VZBalloonRequired: cfg.Hypervisor.Memory.VZBalloonRequired, + } + return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer, memoryPolicy), nil } // ProvideVolumeManager provides the volume manager From 3065c5984532632b3a7293cffd5db894bcdec716 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 09:35:06 -0500 Subject: [PATCH 2/7] docs: add concise CLI guest memory A/B experiment --- lib/guestmemory/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index fea9f500..15e9ecf0 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -42,6 +42,27 @@ The guest kernel page-init mode controls whether Linux eagerly touches pages: - If `vz_balloon_required=true`, startup fails if balloon cannot be configured. - If `vz_balloon_required=false`, startup continues without balloon and logs a warning. 
+## Quick CLI Experiment + +Use this A/B check to compare host RSS with policy enabled vs disabled: + +```bash +# 1) Start API with config A (hypervisor.memory.enabled=true), then run: +ID=$(hypeman run --hypervisor qemu --network=false --memory 1GB \ + --entrypoint /bin/sh --entrypoint -c \ + --cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \ + docker.io/library/alpine:latest | tail -n1) +PID=$(jq -r '.HypervisorPID' "/guests/$ID/metadata.json") +awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux +ps -o rss= -p "$PID" # macOS +hypeman rm --force "$ID" + +# 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command. +# 3) Compare final/steady RSS between A and B. +``` + +In one startup-focused sample run, differences were small (typically a few MB), so this is best used as a reproducible sanity check, not a strict benchmark. + ## Out of Scope - No API surface changes. From faa8893897864a5def61197d81ea0cb707c6125c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 09:58:45 -0500 Subject: [PATCH 3/7] test: assert low idle host memory footprint via PSS/RSS --- lib/guestmemory/README.md | 9 +- lib/instances/guestmemory_darwin_test.go | 77 +++++----------- lib/instances/guestmemory_linux_test.go | 109 +++++++++++------------ 3 files changed, 83 insertions(+), 112 deletions(-) diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index 15e9ecf0..f1792aa5 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -44,7 +44,7 @@ The guest kernel page-init mode controls whether Linux eagerly touches pages: ## Quick CLI Experiment -Use this A/B check to compare host RSS with policy enabled vs disabled: +Use this A/B check to compare host memory footprint with policy enabled vs disabled: ```bash # 1) Start API with config A (hypervisor.memory.enabled=true), then run: @@ -53,15 +53,16 @@ ID=$(hypeman run --hypervisor qemu 
--network=false --memory 1GB \ --cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \ docker.io/library/alpine:latest | tail -n1) PID=$(jq -r '.HypervisorPID' "/guests/$ID/metadata.json") -awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux +awk '/^Pss:/ {print $2 " kB"}' "/proc/$PID/smaps_rollup" # Linux (preferred) +awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux fallback ps -o rss= -p "$PID" # macOS hypeman rm --force "$ID" # 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command. -# 3) Compare final/steady RSS between A and B. +# 3) Compare final/steady host memory between A and B. ``` -In one startup-focused sample run, differences were small (typically a few MB), so this is best used as a reproducible sanity check, not a strict benchmark. +In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead. 
## Out of Scope diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go index a032b334..f3d0b3cd 100644 --- a/lib/instances/guestmemory_darwin_test.go +++ b/lib/instances/guestmemory_darwin_test.go @@ -39,7 +39,7 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ Name: "guestmem-vz", Image: "docker.io/library/nginx:alpine", - Size: 1024 * 1024 * 1024, + Size: 4 * 1024 * 1024 * 1024, OverlaySize: 5 * 1024 * 1024 * 1024, Vcpus: 1, NetworkEnabled: false, @@ -63,7 +63,10 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { require.NoError(t, err) assert.GreaterOrEqual(t, info.MemoryBalloonDevices, 1, "vz shim should report attached memory balloon device") - runVZGuestMemoryReclaimProbe(t, ctx, mgr, inst.Id) + instMeta, err := mgr.GetInstance(ctx, inst.Id) + require.NoError(t, err) + require.NotNil(t, instMeta.HypervisorPID) + assertLowIdleVZHostMemoryFootprint(t, *instMeta.HypervisorPID, 192*1024) } func createNginxImageAndWaitDarwin(t *testing.T, ctx context.Context, imageManager images.Manager) { @@ -125,61 +128,29 @@ func getVZVMInfo(socketPath string) (*vzVMInfo, error) { return &info, nil } -func runVZGuestMemoryReclaimProbe(t *testing.T, ctx context.Context, mgr *manager, instanceID string) { +func assertLowIdleVZHostMemoryFootprint(t *testing.T, pid int, maxRSSKB int64) { t.Helper() - inst, err := mgr.GetInstance(ctx, instanceID) - require.NoError(t, err) - require.NotNil(t, inst.HypervisorPID) - pid := *inst.HypervisorPID - - baselineRSS := mustReadDarwinRSSBytes(t, pid) - - workloadErr := make(chan error, 1) - go func() { - cmd := "set -e; test -d /dev/shm || mkdir -p /dev/shm; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=128 >/dev/null 2>&1; sleep 2; rm -f /dev/shm/hype-mem; sync; sleep 2" - _, exitCode, err := vzExecCommand(ctx, inst, "sh", "-c", cmd) - if err != nil { - workloadErr <- err - return - } - if exitCode != 0 { - workloadErr <- fmt.Errorf("guest 
workload exited with code %d", exitCode) - return - } - workloadErr <- nil - }() - - peakRSS := baselineRSS - ticker := time.NewTicker(250 * time.Millisecond) - defer ticker.Stop() - - for { - select { - case err := <-workloadErr: - require.NoError(t, err) - goto done - case <-ticker.C: - rss := mustReadDarwinRSSBytes(t, pid) - if rss > peakRSS { - peakRSS = rss - } - } + time.Sleep(12 * time.Second) + var rssSamplesKB []int64 + for i := 0; i < 6; i++ { + rssSamplesKB = append(rssSamplesKB, mustReadDarwinRSSBytes(t, pid)/1024) + time.Sleep(1 * time.Second) } - -done: - assert.Greater(t, peakRSS, baselineRSS+(8*1024*1024)) - - deadline := time.Now().Add(10 * time.Second) - postRSS := mustReadDarwinRSSBytes(t, pid) - for time.Now().Before(deadline) { - postRSS = mustReadDarwinRSSBytes(t, pid) - if postRSS < peakRSS-(4*1024*1024) { - break - } - time.Sleep(250 * time.Millisecond) + var rssSumKB int64 + for _, v := range rssSamplesKB { + rssSumKB += v } - assert.Less(t, postRSS, peakRSS) + avgRSSKB := rssSumKB / int64(len(rssSamplesKB)) + assert.LessOrEqualf( + t, + avgRSSKB, + maxRSSKB, + "expected low idle host memory footprint for vz (avg_rss_kb=%d max_rss_kb=%d rss_samples_kb=%v)", + avgRSSKB, + maxRSSKB, + rssSamplesKB, + ) } func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index 0fab0b3f..2352f9f9 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -34,13 +34,13 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ Name: "guestmem-ch", Image: "docker.io/library/alpine:latest", - Size: 1024 * 1024 * 1024, + Size: 4 * 1024 * 1024 * 1024, OverlaySize: 5 * 1024 * 1024 * 1024, Vcpus: 1, NetworkEnabled: false, Hypervisor: hypervisor.TypeCloudHypervisor, Entrypoint: []string{"/bin/sh", "-c"}, - Cmd: []string{guestMemoryWorkloadScript()}, + Cmd: 
[]string{guestMemoryIdleScript()}, }) require.NoError(t, err) t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) @@ -63,7 +63,7 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { assert.True(t, infoResp.JSON200.Config.Balloon.FreePageReporting != nil && *infoResp.JSON200.Config.Balloon.FreePageReporting) pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - runGuestMemoryReclaimProbe(t, pid) + assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 96*1024) } func TestGuestMemoryPolicyQEMU(t *testing.T) { @@ -79,13 +79,13 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ Name: "guestmem-qemu", Image: "docker.io/library/alpine:latest", - Size: 1024 * 1024 * 1024, + Size: 4 * 1024 * 1024 * 1024, OverlaySize: 5 * 1024 * 1024 * 1024, Vcpus: 1, NetworkEnabled: false, Hypervisor: hypervisor.TypeQEMU, Entrypoint: []string{"/bin/sh", "-c"}, - Cmd: []string{guestMemoryWorkloadScript()}, + Cmd: []string{guestMemoryIdleScript()}, }) require.NoError(t, err) t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) @@ -100,7 +100,7 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { assert.Contains(t, joined, "init_on_free=0") assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") - runGuestMemoryReclaimProbe(t, pid) + assertLowIdleHostMemoryFootprint(t, "qemu", pid, 160*1024) } func TestGuestMemoryPolicyFirecracker(t *testing.T) { @@ -116,13 +116,13 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ Name: "guestmem-fc", Image: "docker.io/library/alpine:latest", - Size: 1024 * 1024 * 1024, + Size: 4 * 1024 * 1024 * 1024, OverlaySize: 5 * 1024 * 1024 * 1024, Vcpus: 1, NetworkEnabled: false, Hypervisor: hypervisor.TypeFirecracker, Entrypoint: []string{"/bin/sh", "-c"}, - Cmd: []string{guestMemoryWorkloadScript()}, + Cmd: []string{guestMemoryIdleScript()}, }) require.NoError(t, err) 
t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) @@ -136,11 +136,11 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { assert.True(t, vmCfg.Balloon.FreePageReporting) pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - runGuestMemoryReclaimProbe(t, pid) + assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 96*1024) } -func guestMemoryWorkloadScript() string { - return "set -e; sleep 8; test -d /dev/shm || mkdir -p /dev/shm; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256 >/dev/null 2>&1; sleep 3; rm -f /dev/shm/hype-mem; sync; sleep 120" +func guestMemoryIdleScript() string { + return "set -e; sleep 180" } func createImageAndWait(t *testing.T, ctx context.Context, imageManager images.Manager, imageName string) { @@ -173,56 +173,36 @@ func requireHypervisorPID(t *testing.T, ctx context.Context, mgr *manager, insta return *inst.HypervisorPID } -func runGuestMemoryReclaimProbe(t *testing.T, pid int) { +func assertLowIdleHostMemoryFootprint(t *testing.T, hypervisorName string, pid int, maxPSSKB int64) { t.Helper() - baselineRSS := mustReadRSSBytes(t, pid) - peakRSS := baselineRSS - postPeakMinRSS := int64(0) - growthThreshold := int64(16 * 1024 * 1024) - dropSignalThreshold := int64(1 * 1024 * 1024) - - // Wait for the in-guest workload to allocate memory and require a visible RSS increase. - deadline := time.Now().Add(50 * time.Second) - for time.Now().Before(deadline) { - rss := mustReadRSSBytes(t, pid) - if rss > peakRSS { - peakRSS = rss - } - if peakRSS > baselineRSS+growthThreshold { - postPeakMinRSS = rss - break - } - time.Sleep(500 * time.Millisecond) + // Give the guest a short settle window, then sample host memory. 
+ time.Sleep(12 * time.Second) + var pssSamplesKB []int64 + var rssSamplesKB []int64 + for i := 0; i < 6; i++ { + pssSamplesKB = append(pssSamplesKB, mustReadPSSKB(t, pid)) + rssSamplesKB = append(rssSamplesKB, mustReadRSSBytes(t, pid)/1024) + time.Sleep(1 * time.Second) } - assert.Greaterf( - t, - peakRSS, - baselineRSS+growthThreshold, - "expected RSS to rise during workload (baseline=%d peak=%d growth_threshold=%d)", - baselineRSS, - peakRSS, - growthThreshold, - ) - - // Reclaim/drop signal is best-effort: backend flags are validated elsewhere in each test. - // Host RSS accounting and kernel reclaim timing can vary across systems. - recoveryDeadline := time.Now().Add(12 * time.Second) - for time.Now().Before(recoveryDeadline) { - rss := mustReadRSSBytes(t, pid) - if postPeakMinRSS == 0 || rss < postPeakMinRSS { - postPeakMinRSS = rss - } - time.Sleep(500 * time.Millisecond) + var pssSumKB int64 + for _, v := range pssSamplesKB { + pssSumKB += v } + avgPSSKB := pssSumKB / int64(len(pssSamplesKB)) - drop := peakRSS - postPeakMinRSS - if drop >= dropSignalThreshold { - t.Logf("observed post-peak RSS drop: %d bytes (baseline=%d peak=%d min=%d)", drop, baselineRSS, peakRSS, postPeakMinRSS) - return - } - t.Logf("no clear post-peak RSS drop observed (baseline=%d peak=%d min=%d)", baselineRSS, peakRSS, postPeakMinRSS) + assert.LessOrEqualf( + t, + avgPSSKB, + maxPSSKB, + "expected low idle host memory footprint for %s (avg_pss_kb=%d max_pss_kb=%d rss_samples_kb=%v pss_samples_kb=%v)", + hypervisorName, + avgPSSKB, + maxPSSKB, + rssSamplesKB, + pssSamplesKB, + ) } func mustReadRSSBytes(t *testing.T, pid int) int64 { @@ -244,6 +224,25 @@ func mustReadRSSBytes(t *testing.T, pid int) int64 { return 0 } +func mustReadPSSKB(t *testing.T, pid int) int64 { + t.Helper() + smapsRollupPath := fmt.Sprintf("/proc/%d/smaps_rollup", pid) + data, err := os.ReadFile(smapsRollupPath) + require.NoError(t, err) + + for _, line := range strings.Split(string(data), "\n") { + if 
strings.HasPrefix(line, "Pss:") { + fields := strings.Fields(line) + require.GreaterOrEqual(t, len(fields), 2) + kb, err := strconv.ParseInt(fields[1], 10, 64) + require.NoError(t, err) + return kb + } + } + t.Fatalf("Pss not found in %s", smapsRollupPath) + return 0 +} + type firecrackerVMConfig struct { BootSource struct { BootArgs string `json:"boot_args"` From 05ca799bc586285f6886877b53eedce581658abe Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 10:02:06 -0500 Subject: [PATCH 4/7] docs: add idle-memory probe results table --- lib/guestmemory/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index f1792aa5..b7b15259 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -64,6 +64,15 @@ hypeman rm --force "$ID" In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead. +Sample probe results (4GB idle guest): + +| Hypervisor | Host RSS (kB) | Host PSS (kB) | Notes | +|---|---:|---:|---| +| Cloud Hypervisor (Linux) | ~352940 | ~29432 | Low actual host pressure when idle | +| Firecracker (Linux) | ~301908 | ~27464 | Low actual host pressure when idle | +| QEMU (Linux) | ~409984 | ~118581 | Higher fixed process overhead | +| VZ (macOS) | ~23568 | N/A | RSS sampled with `ps` | + ## Out of Scope - No API surface changes. 
From c2d73f88908485a4907064a85ba31fbf84a9b35b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 10:02:47 -0500 Subject: [PATCH 5/7] docs: present probe table in rounded MB --- lib/guestmemory/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index b7b15259..03d39de4 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -64,14 +64,14 @@ hypeman rm --force "$ID" In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead. -Sample probe results (4GB idle guest): +Sample probe results (4GB idle guest, rounded MB): -| Hypervisor | Host RSS (kB) | Host PSS (kB) | Notes | +| Hypervisor | Host RSS (MB) | Host PSS (MB) | Notes | |---|---:|---:|---| -| Cloud Hypervisor (Linux) | ~352940 | ~29432 | Low actual host pressure when idle | -| Firecracker (Linux) | ~301908 | ~27464 | Low actual host pressure when idle | -| QEMU (Linux) | ~409984 | ~118581 | Higher fixed process overhead | -| VZ (macOS) | ~23568 | N/A | RSS sampled with `ps` | +| Cloud Hypervisor (Linux) | ~345 | ~29 | Low actual host pressure when idle | +| Firecracker (Linux) | ~295 | ~27 | Low actual host pressure when idle | +| QEMU (Linux) | ~400 | ~116 | Higher fixed process overhead | +| VZ (macOS) | ~23 | N/A | RSS sampled with `ps` | ## Out of Scope From 4ba93a35b29a5c39b78cde228dd4fb89a364c576 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 13:21:30 -0500 Subject: [PATCH 6/7] config: default guest memory reclaim off with hardened mode --- cmd/api/config/config.go | 4 ++-- config.example.darwin.yaml | 4 ++-- config.example.yaml | 4 ++-- lib/guestmemory/README.md | 22 ++++++++++++++++++++++ lib/guestmemory/policy.go | 6 +++--- lib/guestmemory/policy_test.go | 14 +++++++++++--- 
lib/instances/guestmemory_darwin_test.go | 11 +++++++++++ lib/instances/guestmemory_linux_test.go | 19 ++++++++++++++++--- 8 files changed, 69 insertions(+), 15 deletions(-) diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 2236b7f4..d2d9faae 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -310,8 +310,8 @@ func defaultConfig() *Config { Default: "cloud-hypervisor", FirecrackerBinaryPath: "", Memory: HypervisorMemoryConfig{ - Enabled: true, - KernelPageInitMode: "performance", + Enabled: false, + KernelPageInitMode: "hardened", ReclaimEnabled: true, VZBalloonRequired: true, }, diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 07e6e1bb..b8185ffc 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -35,8 +35,8 @@ port: "8080" hypervisor: default: vz memory: - enabled: true - kernel_page_init_mode: performance + enabled: false + kernel_page_init_mode: hardened reclaim_enabled: true vz_balloon_required: true diff --git a/config.example.yaml b/config.example.yaml index 8368a344..1f7778a7 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -27,10 +27,10 @@ data_dir: /var/lib/hypeman # # Optional: use a custom Firecracker binary path instead of the embedded one. # # firecracker_binary_path: /usr/local/bin/firecracker # memory: -# enabled: true +# enabled: false # # performance: init_on_alloc=0 init_on_free=0 (better density) # # hardened: init_on_alloc=1 init_on_free=1 (stronger hardening) -# kernel_page_init_mode: performance +# kernel_page_init_mode: hardened # reclaim_enabled: true # vz_balloon_required: true diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md index 03d39de4..e60e7d81 100644 --- a/lib/guestmemory/README.md +++ b/lib/guestmemory/README.md @@ -13,6 +13,28 @@ The guest kernel page-init mode controls whether Linux eagerly touches pages: - `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn. 
- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost. +## Configuration + +This feature is controlled by `hypervisor.memory` in server config and is default-off: + +```yaml +hypervisor: + memory: + enabled: false + kernel_page_init_mode: hardened + reclaim_enabled: true + vz_balloon_required: true +``` + +To enable reclaim behavior and density-oriented kernel args, set: + +```yaml +hypervisor: + memory: + enabled: true + kernel_page_init_mode: performance +``` + ## Runtime Flow - Operator config (`hypervisor.memory`) is normalized into one policy. diff --git a/lib/guestmemory/policy.go b/lib/guestmemory/policy.go index 1a261b15..5d515454 100644 --- a/lib/guestmemory/policy.go +++ b/lib/guestmemory/policy.go @@ -27,11 +27,11 @@ type Features struct { RequireBalloon bool } -// DefaultPolicy returns default policy values for density-first environments. +// DefaultPolicy returns conservative defaults (policy disabled, hardened page-init mode). 
func DefaultPolicy() Policy { return Policy{ - Enabled: true, - KernelPageInitMode: KernelPageInitPerformance, + Enabled: false, + KernelPageInitMode: KernelPageInitHardened, ReclaimEnabled: true, VZBalloonRequired: true, } diff --git a/lib/guestmemory/policy_test.go b/lib/guestmemory/policy_test.go index 00bebbce..54b19788 100644 --- a/lib/guestmemory/policy_test.go +++ b/lib/guestmemory/policy_test.go @@ -8,9 +8,15 @@ import ( func TestPolicyKernelArgs(t *testing.T) { p := DefaultPolicy() - assert.Equal(t, []string{"init_on_alloc=0", "init_on_free=0"}, p.KernelArgs()) + assert.Empty(t, p.KernelArgs()) + + performance := p + performance.Enabled = true + performance.KernelPageInitMode = KernelPageInitPerformance + assert.Equal(t, []string{"init_on_alloc=0", "init_on_free=0"}, performance.KernelArgs()) hardened := p + hardened.Enabled = true hardened.KernelPageInitMode = KernelPageInitHardened assert.Equal(t, []string{"init_on_alloc=1", "init_on_free=1"}, hardened.KernelArgs()) @@ -20,14 +26,16 @@ func TestPolicyKernelArgs(t *testing.T) { } func TestFeaturesForHypervisor(t *testing.T) { - f := DefaultPolicy().FeaturesForHypervisor() + p := DefaultPolicy() + p.Enabled = true + + f := p.FeaturesForHypervisor() assert.True(t, f.EnableBalloon) assert.True(t, f.FreePageReporting) assert.True(t, f.DeflateOnOOM) assert.True(t, f.FreePageHinting) assert.True(t, f.RequireBalloon) - p := DefaultPolicy() p.ReclaimEnabled = false assert.Equal(t, Features{}, p.FeaturesForHypervisor()) } diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go index f3d0b3cd..560e7735 100644 --- a/lib/instances/guestmemory_darwin_test.go +++ b/lib/instances/guestmemory_darwin_test.go @@ -15,6 +15,7 @@ import ( "testing" "time" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/stretchr/testify/assert" @@ -31,6 +32,7 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { } mgr, 
tmpDir := setupVZTestManager(t) + forceEnableGuestMemoryPolicyForVZTest(mgr) ctx := context.Background() createNginxImageAndWaitDarwin(t, ctx, mgr.imageManager) @@ -69,6 +71,15 @@ func TestGuestMemoryPolicyVZ(t *testing.T) { assertLowIdleVZHostMemoryFootprint(t, *instMeta.HypervisorPID, 192*1024) } +func forceEnableGuestMemoryPolicyForVZTest(mgr *manager) { + mgr.guestMemoryPolicy = guestmemory.Policy{ + Enabled: true, + KernelPageInitMode: guestmemory.KernelPageInitPerformance, + ReclaimEnabled: true, + VZBalloonRequired: true, + }.Normalize() +} + func createNginxImageAndWaitDarwin(t *testing.T, ctx context.Context, imageManager images.Manager) { t.Helper() diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go index 2352f9f9..4cb8986a 100644 --- a/lib/instances/guestmemory_linux_test.go +++ b/lib/instances/guestmemory_linux_test.go @@ -14,6 +14,7 @@ import ( "testing" "time" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/vmm" @@ -26,6 +27,7 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { requireKVMAccess(t) mgr, _ := setupTestManager(t) + forceEnableGuestMemoryPolicyForTest(mgr) ctx := context.Background() createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") @@ -63,7 +65,7 @@ func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { assert.True(t, infoResp.JSON200.Config.Balloon.FreePageReporting != nil && *infoResp.JSON200.Config.Balloon.FreePageReporting) pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 96*1024) + assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 512*1024) } func TestGuestMemoryPolicyQEMU(t *testing.T) { @@ -71,6 +73,7 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { requireKVMAccess(t) mgr, _ := setupTestManagerForQEMU(t) + forceEnableGuestMemoryPolicyForTest(mgr) ctx := 
context.Background() createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") @@ -100,7 +103,7 @@ func TestGuestMemoryPolicyQEMU(t *testing.T) { assert.Contains(t, joined, "init_on_free=0") assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") - assertLowIdleHostMemoryFootprint(t, "qemu", pid, 160*1024) + assertLowIdleHostMemoryFootprint(t, "qemu", pid, 640*1024) } func TestGuestMemoryPolicyFirecracker(t *testing.T) { @@ -108,6 +111,7 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { requireFirecrackerIntegrationPrereqs(t) mgr, _ := setupTestManagerForFirecracker(t) + forceEnableGuestMemoryPolicyForTest(mgr) ctx := context.Background() createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") @@ -136,13 +140,22 @@ func TestGuestMemoryPolicyFirecracker(t *testing.T) { assert.True(t, vmCfg.Balloon.FreePageReporting) pid := requireHypervisorPID(t, ctx, mgr, inst.Id) - assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 96*1024) + assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 512*1024) } func guestMemoryIdleScript() string { return "set -e; sleep 180" } +func forceEnableGuestMemoryPolicyForTest(mgr *manager) { + mgr.guestMemoryPolicy = guestmemory.Policy{ + Enabled: true, + KernelPageInitMode: guestmemory.KernelPageInitPerformance, + ReclaimEnabled: true, + VZBalloonRequired: true, + }.Normalize() +} + func createImageAndWait(t *testing.T, ctx context.Context, imageManager images.Manager, imageName string) { t.Helper() From 17a70648dc6f317981c2df76a89d7d5d8fe10a4a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 7 Mar 2026 13:31:20 -0500 Subject: [PATCH 7/7] qemu: scope balloon fallback to compatibility errors --- lib/hypervisor/qemu/process.go | 97 +++++++++++++++++++++++------ lib/hypervisor/qemu/process_test.go | 76 ++++++++++++++++++++++ 2 files changed, 154 insertions(+), 19 deletions(-) diff --git a/lib/hypervisor/qemu/process.go 
b/lib/hypervisor/qemu/process.go index dfd0432c..dfe5e267 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -11,6 +11,7 @@ import ( "path/filepath" "regexp" "runtime" + "strings" "syscall" "time" @@ -37,7 +38,7 @@ const ( // clientCreateTimeout is how long to retry QMP client creation after the // socket appears. Under high parallel load the socket can accept connections // slightly later than file creation/availability. - clientCreateTimeout = 3 * time.Second + clientCreateTimeout = 10 * time.Second ) func init() { @@ -186,11 +187,7 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version socketWaitStart := time.Now() if err := waitForSocket(socketPath, socketWaitTimeout); err != nil { cu.Clean() - vmmLogPath := filepath.Join(logsDir, "vmm.log") - if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { - return 0, nil, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData)) - } - return 0, nil, nil, err + return 0, nil, nil, appendVMMLog(err, logsDir) } log.DebugContext(ctx, "QMP socket ready", "duration_ms", time.Since(socketWaitStart).Milliseconds()) @@ -205,7 +202,7 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version } if time.Now().After(clientDeadline) { cu.Clean() - return 0, nil, nil, fmt.Errorf("create client: %w", err) + return 0, nil, nil, appendVMMLog(fmt.Errorf("create client: %w", err), logsDir) } time.Sleep(socketPollInterval) } @@ -236,22 +233,37 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s } var ( - pid int - hv *QEMU - cu *cleanup.Cleanup - err error - booted hypervisor.VMConfig + pid int + hv *QEMU + cu *cleanup.Cleanup + err error + booted hypervisor.VMConfig + started bool ) for i, attempt := range attempts { - // Build command arguments: QMP socket + VM configuration - args := buildQMPArgs(socketPath) - args = append(args, BuildArgs(attempt)...) 
- pid, hv, cu, err = s.startQEMUProcess(ctx, p, version, socketPath, args) - if err == nil { - booted = attempt + // Retry the same attempt once for transient monitor/socket startup races. + for transientRetry := 0; transientRetry < 2; transientRetry++ { + // Build command arguments: QMP socket + VM configuration + args := buildQMPArgs(socketPath) + args = append(args, BuildArgs(attempt)...) + pid, hv, cu, err = s.startQEMUProcess(ctx, p, version, socketPath, args) + if err == nil { + booted = attempt + started = true + break + } + if transientRetry == 0 && shouldRetrySameConfig(err) { + _ = os.Remove(socketPath) + time.Sleep(100 * time.Millisecond) + log.WarnContext(ctx, "qemu start hit transient startup race, retrying with same configuration", "error", err) + continue + } + break + } + if started { break } - if i < len(attempts)-1 { + if i < len(attempts)-1 && shouldRetryWithReducedBalloon(err) { // Ensure a failed prior attempt doesn't keep the old socket path reserved. _ = os.Remove(socketPath) time.Sleep(100 * time.Millisecond) @@ -274,6 +286,53 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return pid, hv, nil } +func appendVMMLog(err error, logsDir string) error { + vmmLogPath := filepath.Join(logsDir, "vmm.log") + if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { + return fmt.Errorf("%w; vmm.log: %s", err, string(logData)) + } + return err +} + +func shouldRetrySameConfig(err error) bool { + if err == nil { + return false + } + if shouldRetryWithReducedBalloon(err) { + return false + } + + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "connection refused") || + strings.Contains(msg, "no such file or directory") || + strings.Contains(msg, "timed out") +} + +func shouldRetryWithReducedBalloon(err error) bool { + if err == nil { + return false + } + + msg := strings.ToLower(err.Error()) + mentionsBalloonOption := strings.Contains(msg, "virtio-balloon") || + 
strings.Contains(msg, "free-page-reporting") || + strings.Contains(msg, "free-page-hint") || + strings.Contains(msg, "deflate-on-oom") + if !mentionsBalloonOption { + return false + } + + return strings.Contains(msg, "not found") || + strings.Contains(msg, "unknown property") || + strings.Contains(msg, "unknown option") || + strings.Contains(msg, "invalid parameter") || + strings.Contains(msg, "invalid option") || + strings.Contains(msg, "invalid value") || + strings.Contains(msg, "requires 'iothread'") || + strings.Contains(msg, "requires iothread") || + strings.Contains(msg, "is unexpected") +} + // RestoreVM starts QEMU and restores VM state from a snapshot. // The VM is in paused state after restore; caller should call Resume() to continue execution. func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { diff --git a/lib/hypervisor/qemu/process_test.go b/lib/hypervisor/qemu/process_test.go index 900e7300..9f7b8ed6 100644 --- a/lib/hypervisor/qemu/process_test.go +++ b/lib/hypervisor/qemu/process_test.go @@ -1,6 +1,7 @@ package qemu import ( + "errors" "os/exec" "regexp" "testing" @@ -98,3 +99,78 @@ func TestGetVersion_ParsesVersionCorrectly(t *testing.T) { }) } } + +func TestShouldRetryWithReducedBalloon(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "unsupported free page reporting", + err: errors.New("Property 'virtio-balloon-device.free-page-reporting' not found"), + want: true, + }, + { + name: "unsupported deflate option", + err: errors.New("Parameter 'deflate-on-oom' is unexpected"), + want: true, + }, + { + name: "free-page-hint requires iothread", + err: errors.New("qemu-system-x86_64: -device virtio-balloon-pci,...: 'free-page-hint' requires 'iothread' to be set"), + want: true, + }, + { + name: "non-balloon start error", + err: errors.New("wait for socket /tmp/qemu.sock: timed out after 10s"), + want: 
false, + }, + { + name: "transient monitor connection refused", + err: errors.New("create client: create qemu client: create socket monitor: dial unix /tmp/qemu.sock: connect: connection refused"), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, shouldRetryWithReducedBalloon(tt.err)) + }) + } +} + +func TestShouldRetrySameConfig(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "monitor connection refused", + err: errors.New("create client: dial unix /tmp/qemu.sock: connect: connection refused"), + want: true, + }, + { + name: "socket race no such file", + err: errors.New("create socket monitor: dial unix /tmp/qemu.sock: connect: no such file or directory"), + want: true, + }, + { + name: "timeout", + err: errors.New("wait for socket /tmp/qemu.sock: timed out after 10s"), + want: true, + }, + { + name: "explicit balloon incompatibility should not use same-config retry", + err: errors.New("vmm.log: Property 'virtio-balloon-device.free-page-reporting' not found"), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, shouldRetrySameConfig(tt.err)) + }) + } +}