diff --git a/Makefile b/Makefile index 6188f5a9..7264de7f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded +.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin test-guestmemory-linux test-guestmemory-vz install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin @@ -300,6 +300,21 @@ test-darwin: build-embedded sign-vz-shim go test -tags containers_image_openpgp $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) $$PKGS; \ fi +# Manual-only guest memory policy integration tests (Linux hypervisors). +test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded + @TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \ + GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \ + echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \ + sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \ + go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances + +# Manual-only guest memory policy integration test (macOS VZ). +test-guestmemory-vz: build-embedded sign-vz-shim + @echo "Running manual guest memory integration test (VZ)"; \ + PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \ + HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \ + go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances + # Generate JWT token for testing # Usage: make gen-jwt [USER_ID=test-user] # Checks CONFIG_PATH, then local config.yaml, then default config paths diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4ee3d442..d2d9faae 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -154,8 +154,17 @@ type CapacityConfig struct { // HypervisorConfig holds hypervisor settings. type HypervisorConfig struct { - Default string `koanf:"default"` - FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` + Default string `koanf:"default"` + FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` + Memory HypervisorMemoryConfig `koanf:"memory"` +} + +// HypervisorMemoryConfig holds guest memory management settings. +type HypervisorMemoryConfig struct { + Enabled bool `koanf:"enabled"` + KernelPageInitMode string `koanf:"kernel_page_init_mode"` + ReclaimEnabled bool `koanf:"reclaim_enabled"` + VZBalloonRequired bool `koanf:"vz_balloon_required"` } // GPUConfig holds GPU-related settings. @@ -300,6 +309,12 @@ func defaultConfig() *Config { Hypervisor: HypervisorConfig{ Default: "cloud-hypervisor", FirecrackerBinaryPath: "", + Memory: HypervisorMemoryConfig{ + Enabled: false, + KernelPageInitMode: "hardened", + ReclaimEnabled: true, + VZBalloonRequired: true, + }, }, GPU: GPUConfig{ @@ -400,5 +415,8 @@ func (c *Config) Validate() error { if c.Build.Timeout <= 0 { return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout) } + if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" { + return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode) + } return nil } diff --git a/cmd/vz-shim/server.go b/cmd/vz-shim/server.go index 43ba9142..9acf0095 100644 --- a/cmd/vz-shim/server.go +++ b/cmd/vz-shim/server.go @@ -37,7 +37,8 @@ func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfigurati // VMInfoResponse matches the cloud-hypervisor VmInfo structure. type VMInfoResponse struct { - State string `json:"state"` + State string `json:"state"` + MemoryBalloonDevices int `json:"memory_balloon_devices,omitempty"` } type snapshotRequest struct { @@ -66,7 +67,10 @@ func (s *ShimServer) handleVMInfo(w http.ResponseWriter, r *http.Request) { defer s.mu.RUnlock() state := vzStateToString(s.vm.State()) - resp := VMInfoResponse{State: state} + resp := VMInfoResponse{ + State: state, + MemoryBalloonDevices: len(s.vm.MemoryBalloonDevices()), + } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(resp) diff --git a/cmd/vz-shim/vm.go b/cmd/vz-shim/vm.go index 9fa34012..0ce8dc8b 100644 --- a/cmd/vz-shim/vm.go +++ b/cmd/vz-shim/vm.go @@ -72,9 +72,19 @@ func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMac } vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig}) - // Do not attach memory balloon for now. - // Save/restore compatibility on VZ can fail with "invalid argument" for some - // Linux guest configurations when a balloon device is present. + if config.EnableMemoryBalloon { + balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration() + if err != nil { + if config.RequireMemoryBalloon { + return nil, nil, fmt.Errorf("create memory balloon device: %w", err) + } + slog.Warn("memory balloon unavailable, continuing without balloon", "error", err) + } else { + vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{ + balloonConfig, + }) + } + } if validated, err := vmConfig.Validate(); !validated || err != nil { return nil, nil, fmt.Errorf("invalid vm configuration: %w", err) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index d491e493..b8185ffc 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -34,6 +34,11 @@ port: "8080" # - "cloud-hypervisor" and "qemu" are NOT supported on macOS hypervisor: default: vz + memory: + enabled: false + kernel_page_init_mode: hardened + reclaim_enabled: true + vz_balloon_required: true # ============================================================================= # Network Configuration (DIFFERENT ON MACOS) diff --git a/config.example.yaml b/config.example.yaml index 5b4f9db1..1f7778a7 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -26,6 +26,13 @@ data_dir: /var/lib/hypeman # default: cloud-hypervisor # # Optional: use a custom Firecracker binary path instead of the embedded one. # # firecracker_binary_path: /usr/local/bin/firecracker +# memory: +# enabled: false +# # performance: init_on_alloc=0 init_on_free=0 (better density) +# # hardened: init_on_alloc=1 init_on_free=1 (stronger hardening) +# kernel_page_init_mode: hardened +# reclaim_enabled: true +# vz_balloon_required: true # ============================================================================= # Network Configuration diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md new file mode 100644 index 00000000..e60e7d81 --- /dev/null +++ b/lib/guestmemory/README.md @@ -0,0 +1,102 @@ +# Guest Memory Reclaim + +This feature reduces host RAM waste from guest VMs by combining three behaviors: + +1. Lazy host allocation preservation: +The VM is configured with requested memory capacity, but host pages should only back guest pages as they are touched. + +2. Guest-to-host reclaim: +When the guest frees memory, virtio balloon/reporting/hinting features let the VMM return those pages to the host. + +3. Guest boot page-touch reduction: +The guest kernel page-init mode controls whether Linux eagerly touches pages: +- `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn. +- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost. + +## Configuration + +This feature is controlled by `hypervisor.memory` in server config and is default-off: + +```yaml +hypervisor: + memory: + enabled: false + kernel_page_init_mode: hardened + reclaim_enabled: true + vz_balloon_required: true +``` + +To enable reclaim behavior and density-oriented kernel args, set: + +```yaml +hypervisor: + memory: + enabled: true + kernel_page_init_mode: performance +``` + +## Runtime Flow + +- Operator config (`hypervisor.memory`) is normalized into one policy. +- The instances layer applies policy generically: + - merges kernel args with the selected page-init mode; + - sets generic memory feature toggles in `hypervisor.VMConfig.GuestMemory`. +- Each hypervisor backend maps generic toggles to native mechanisms: + - Cloud Hypervisor: `balloon` config with free page reporting and deflate-on-oom. + - QEMU: `virtio-balloon-pci` device options. + - Firecracker: `/balloon` API with free page hinting/reporting. + - VZ: attach `VirtioTraditionalMemoryBalloon` device. + +## Backend Behavior Matrix + +| Hypervisor | Lazy allocation | Balloon | Free page reporting/hinting | Deflate on OOM | +|---|---|---|---|---| +| Cloud Hypervisor | Yes | Yes | Reporting | Yes | +| QEMU | Yes | Yes | Reporting (+ hinting when enabled) | Yes | +| Firecracker | Yes | Yes | Hinting + reporting | Yes | +| VZ | macOS-managed | Yes | Host-managed + guest cooperation | Host-managed | + +## Failure Behavior + +- If policy is disabled, memory features are not applied. +- If reclaim is disabled, balloon/reporting/hinting are not applied. +- For VZ, balloon attachment is attempted when enabled. + - If `vz_balloon_required=true`, startup fails if balloon cannot be configured. + - If `vz_balloon_required=false`, startup continues without balloon and logs a warning. + +## Quick CLI Experiment + +Use this A/B check to compare host memory footprint with policy enabled vs disabled: + +```bash +# 1) Start API with config A (hypervisor.memory.enabled=true), then run: +ID=$(hypeman run --hypervisor qemu --network=false --memory 1GB \ + --entrypoint /bin/sh --entrypoint -c \ + --cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \ + docker.io/library/alpine:latest | tail -n1) +PID=$(jq -r '.HypervisorPID' "/guests/$ID/metadata.json") +awk '/^Pss:/ {print $2 " kB"}' "/proc/$PID/smaps_rollup" # Linux (preferred) +awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux fallback +ps -o rss= -p "$PID" # macOS +hypeman rm --force "$ID" + +# 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command. +# 3) Compare final/steady host memory between A and B. +``` + +In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead. + +Sample probe results (4GB idle guest, rounded MB): + +| Hypervisor | Host RSS (MB) | Host PSS (MB) | Notes | +|---|---:|---:|---| +| Cloud Hypervisor (Linux) | ~345 | ~29 | Low actual host pressure when idle | +| Firecracker (Linux) | ~295 | ~27 | Low actual host pressure when idle | +| QEMU (Linux) | ~400 | ~116 | Higher fixed process overhead | +| VZ (macOS) | ~23 | N/A | RSS sampled with `ps` | + +## Out of Scope + +- No API surface changes. +- No scheduler/admission logic changes. +- No automatic background tuning loops outside hypervisor-supported reclaim mechanisms. diff --git a/lib/guestmemory/kernel_args.go b/lib/guestmemory/kernel_args.go new file mode 100644 index 00000000..73f0d1e9 --- /dev/null +++ b/lib/guestmemory/kernel_args.go @@ -0,0 +1,42 @@ +package guestmemory + +import "strings" + +// MergeKernelArgs merges kernel args deterministically. +// Duplicate keys are de-duplicated with "last write wins" semantics. +func MergeKernelArgs(base string, extras ...string) string { + tokens := strings.Fields(base) + order := make([]string, 0, len(tokens)) + values := make(map[string]string, len(tokens)) + + for _, tok := range tokens { + k := argKey(tok) + if _, ok := values[k]; !ok { + order = append(order, k) + } + values[k] = tok + } + + for _, extra := range extras { + for _, tok := range strings.Fields(extra) { + k := argKey(tok) + if _, ok := values[k]; !ok { + order = append(order, k) + } + values[k] = tok + } + } + + merged := make([]string, 0, len(order)) + for _, k := range order { + merged = append(merged, values[k]) + } + return strings.Join(merged, " ") +} + +func argKey(token string) string { + if idx := strings.IndexByte(token, '='); idx >= 0 { + return token[:idx] + } + return token +} diff --git a/lib/guestmemory/kernel_args_test.go b/lib/guestmemory/kernel_args_test.go new file mode 100644 index 00000000..7aab70a2 --- /dev/null +++ b/lib/guestmemory/kernel_args_test.go @@ -0,0 +1,12 @@ +package guestmemory + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMergeKernelArgs(t *testing.T) { + merged := MergeKernelArgs("console=ttyS0 foo=1", "foo=2", "init_on_alloc=0 init_on_free=0") + assert.Equal(t, "console=ttyS0 foo=2 init_on_alloc=0 init_on_free=0", merged) +} diff --git a/lib/guestmemory/policy.go b/lib/guestmemory/policy.go new file mode 100644 index 00000000..5d515454 --- /dev/null +++ b/lib/guestmemory/policy.go @@ -0,0 +1,92 @@ +package guestmemory + +// KernelPageInitMode controls guest kernel page initialization behavior. +type KernelPageInitMode string + +const ( + // KernelPageInitPerformance minimizes guest page touching to preserve lazy host allocation. + KernelPageInitPerformance KernelPageInitMode = "performance" + // KernelPageInitHardened enforces page init-on-alloc/free hardening in the guest kernel. + KernelPageInitHardened KernelPageInitMode = "hardened" +) + +// Policy is the normalized, hypervisor-agnostic guest memory policy. +type Policy struct { + Enabled bool + KernelPageInitMode KernelPageInitMode + ReclaimEnabled bool + VZBalloonRequired bool +} + +// Features are generic guest memory toggles consumed by hypervisor backends. +type Features struct { + EnableBalloon bool + FreePageReporting bool + DeflateOnOOM bool + FreePageHinting bool + RequireBalloon bool +} + +// DefaultPolicy returns conservative defaults (disabled reclaim, hardened page-init mode). +func DefaultPolicy() Policy { + return Policy{ + Enabled: false, + KernelPageInitMode: KernelPageInitHardened, + ReclaimEnabled: true, + VZBalloonRequired: true, + } +} + +// Normalize applies defaults and sanitizes invalid modes. +func (p Policy) Normalize() Policy { + d := DefaultPolicy() + + if p.KernelPageInitMode == "" { + p.KernelPageInitMode = d.KernelPageInitMode + } + if p.KernelPageInitMode != KernelPageInitPerformance && p.KernelPageInitMode != KernelPageInitHardened { + p.KernelPageInitMode = d.KernelPageInitMode + } + + if !p.Enabled { + return Policy{ + Enabled: false, + KernelPageInitMode: p.KernelPageInitMode, + ReclaimEnabled: false, + VZBalloonRequired: p.VZBalloonRequired, + } + } + + return p +} + +// KernelArgs returns kernel args implied by the policy. +func (p Policy) KernelArgs() []string { + n := p.Normalize() + if !n.Enabled { + return nil + } + + switch n.KernelPageInitMode { + case KernelPageInitHardened: + return []string{"init_on_alloc=1", "init_on_free=1"} + default: + return []string{"init_on_alloc=0", "init_on_free=0"} + } +} + +// FeaturesForHypervisor returns generic memory features for backend translation. +func (p Policy) FeaturesForHypervisor() Features { + n := p.Normalize() + if !n.Enabled || !n.ReclaimEnabled { + return Features{} + } + + return Features{ + EnableBalloon: true, + FreePageReporting: true, + DeflateOnOOM: true, + FreePageHinting: true, + RequireBalloon: n.VZBalloonRequired, + } +} diff --git a/lib/guestmemory/policy_test.go b/lib/guestmemory/policy_test.go new file mode 100644 index 00000000..54b19788 --- /dev/null +++ b/lib/guestmemory/policy_test.go @@ -0,0 +1,41 @@ +package guestmemory + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestPolicyKernelArgs(t *testing.T) { + p := DefaultPolicy() + assert.Empty(t, p.KernelArgs()) + + performance := p + performance.Enabled = true + performance.KernelPageInitMode = KernelPageInitPerformance + assert.Equal(t, []string{"init_on_alloc=0", "init_on_free=0"}, performance.KernelArgs()) + + hardened := p + hardened.Enabled = true + hardened.KernelPageInitMode = KernelPageInitHardened + assert.Equal(t, []string{"init_on_alloc=1", "init_on_free=1"}, hardened.KernelArgs()) + + disabled := p + disabled.Enabled = false + assert.Empty(t, disabled.KernelArgs()) +} + +func TestFeaturesForHypervisor(t *testing.T) { + p := DefaultPolicy() + p.Enabled = true + + f := p.FeaturesForHypervisor() + assert.True(t, f.EnableBalloon) + assert.True(t, f.FreePageReporting) + assert.True(t, f.DeflateOnOOM) + assert.True(t, f.FreePageHinting) + assert.True(t, f.RequireBalloon) + + p.ReclaimEnabled = false + assert.Equal(t, Features{}, p.FeaturesForHypervisor()) +} diff --git a/lib/hypervisor/cloudhypervisor/config.go b/lib/hypervisor/cloudhypervisor/config.go index ab51676b..d728036b 100644 --- a/lib/hypervisor/cloudhypervisor/config.go +++ b/lib/hypervisor/cloudhypervisor/config.go @@ -113,6 +113,19 @@ func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig { devices = &deviceConfigs } + var balloon *vmm.BalloonConfig + if cfg.GuestMemory.EnableBalloon { + balloon = &vmm.BalloonConfig{ + Size: 0, + } + if cfg.GuestMemory.DeflateOnOOM { + balloon.DeflateOnOom = ptr(true) + } + if cfg.GuestMemory.FreePageReporting { + balloon.FreePageReporting = ptr(true) + } + } + return vmm.VmConfig{ Payload: payload, Cpus: &cpus, @@ -123,5 +136,6 @@ func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig { Net: nets, Vsock: vsock, Devices: devices, + Balloon: balloon, } } diff --git a/lib/hypervisor/cloudhypervisor/config_test.go b/lib/hypervisor/cloudhypervisor/config_test.go new file mode 100644 index 00000000..b5cdb96e --- /dev/null +++ b/lib/hypervisor/cloudhypervisor/config_test.go @@ -0,0 +1,29 @@ +package cloudhypervisor + +import ( + "testing" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestToVMConfig_GuestMemoryBalloon(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageReporting: true, + }, + } + + vmCfg := ToVMConfig(cfg) + require.NotNil(t, vmCfg.Balloon) + assert.Equal(t, int64(0), vmCfg.Balloon.Size) + require.NotNil(t, vmCfg.Balloon.DeflateOnOom) + assert.True(t, *vmCfg.Balloon.DeflateOnOom) + require.NotNil(t, vmCfg.Balloon.FreePageReporting) + assert.True(t, *vmCfg.Balloon.FreePageReporting) +} diff --git a/lib/hypervisor/config.go b/lib/hypervisor/config.go index a7ed34df..21a7aac8 100644 --- a/lib/hypervisor/config.go +++ b/lib/hypervisor/config.go @@ -8,6 +8,7 @@ type VMConfig struct { MemoryBytes int64 HotplugBytes int64 Topology *CPUTopology + GuestMemory GuestMemoryConfig // Storage Disks []DiskConfig @@ -31,6 +32,16 @@ type VMConfig struct { KernelArgs string } +// GuestMemoryConfig contains hypervisor-agnostic guest memory feature toggles. +type GuestMemoryConfig struct { + EnableBalloon bool + FreePageReporting bool + DeflateOnOOM bool + FreePageHinting bool + // RequireBalloon controls whether VM startup should fail if balloon setup fails. + RequireBalloon bool +} + // CPUTopology defines the virtual CPU topology type CPUTopology struct { ThreadsPerCore int diff --git a/lib/hypervisor/firecracker/config.go b/lib/hypervisor/firecracker/config.go index b9b60cb0..5ba47cc1 100644 --- a/lib/hypervisor/firecracker/config.go +++ b/lib/hypervisor/firecracker/config.go @@ -52,6 +52,13 @@ type serialDevice struct { SerialOutPath string `json:"serial_out_path"` } +type balloon struct { + AmountMib int64 `json:"amount_mib"` + DeflateOnOOM bool `json:"deflate_on_oom"` + FreePageHinting bool `json:"free_page_hinting,omitempty"` + FreePageReporting bool `json:"free_page_reporting,omitempty"` +} + type instanceActionInfo struct { ActionType string `json:"action_type"` } @@ -160,6 +167,18 @@ func toVsockConfig(cfg hypervisor.VMConfig) *vsock { } } +func toBalloonConfig(cfg hypervisor.VMConfig) *balloon { + if !cfg.GuestMemory.EnableBalloon { + return nil + } + return &balloon{ + AmountMib: 0, + DeflateOnOOM: cfg.GuestMemory.DeflateOnOOM, + FreePageHinting: cfg.GuestMemory.FreePageHinting, + FreePageReporting: cfg.GuestMemory.FreePageReporting, + } +} + func toRateLimiter(limit int64, burst int64) *rateLimiter { if limit <= 0 { return nil diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index 5f2ef6ca..4649ea61 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -73,3 +73,21 @@ func TestSnapshotParamPaths(t *testing.T) { assert.False(t, load.ResumeVM) require.Len(t, load.NetworkOverrides, 1) } + +func TestToBalloonConfig(t *testing.T) { + cfg := hypervisor.VMConfig{ + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageHinting: true, + FreePageReporting: true, + }, + } + + b := toBalloonConfig(cfg) + require.NotNil(t, b) + assert.Equal(t, int64(0), b.AmountMib) + assert.True(t, b.DeflateOnOOM) + assert.True(t, b.FreePageHinting) + assert.True(t, b.FreePageReporting) +} diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index 6f92b24d..64f399f3 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -141,6 +141,14 @@ func (f *Firecracker) configureForBoot(ctx context.Context, cfg hypervisor.VMCon if _, err := f.do(ctx, http.MethodPut, "/machine-config", toMachineConfiguration(cfg), http.StatusNoContent); err != nil { return fmt.Errorf("configure machine: %w", err) } + if balloonCfg := toBalloonConfig(cfg); balloonCfg != nil { + if _, err := f.do(ctx, http.MethodPut, "/balloon", balloonCfg, http.StatusNoContent); err != nil { + // Keep compatibility with older/custom binaries that may not expose balloon API. + if !strings.Contains(err.Error(), "Invalid request method and/or path") { + return fmt.Errorf("configure balloon: %w", err) + } + } + } for _, driveCfg := range toDriveConfigs(cfg) { path := "/drives/" + url.PathEscape(driveCfg.DriveID) diff --git a/lib/hypervisor/qemu/config.go b/lib/hypervisor/qemu/config.go index 659e5c6a..4b395c40 100644 --- a/lib/hypervisor/qemu/config.go +++ b/lib/hypervisor/qemu/config.go @@ -25,6 +25,20 @@ func BuildArgs(cfg hypervisor.VMConfig) []string { memMB := cfg.MemoryBytes / (1024 * 1024) args = append(args, "-m", fmt.Sprintf("%dM", memMB)) + if cfg.GuestMemory.EnableBalloon { + balloonOpts := []string{"virtio-balloon-pci"} + if cfg.GuestMemory.DeflateOnOOM { + balloonOpts = append(balloonOpts, "deflate-on-oom=on") + } + if cfg.GuestMemory.FreePageReporting { + balloonOpts = append(balloonOpts, "free-page-reporting=on") + } + if cfg.GuestMemory.FreePageHinting { + balloonOpts = append(balloonOpts, "free-page-hint=on") + } + args = append(args, "-device", strings.Join(balloonOpts, ",")) + } + // Kernel and initrd if cfg.KernelPath != "" { args = append(args, "-kernel", cfg.KernelPath) diff --git a/lib/hypervisor/qemu/config_test.go b/lib/hypervisor/qemu/config_test.go index 159e7774..1cb64c93 100644 --- a/lib/hypervisor/qemu/config_test.go +++ b/lib/hypervisor/qemu/config_test.go @@ -159,3 +159,20 @@ func TestBuildArgs_NoSerialLog(t *testing.T) { assert.Contains(t, args, "-serial") assert.Contains(t, args, "stdio") } + +func TestBuildArgs_GuestMemoryBalloon(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + GuestMemory: hypervisor.GuestMemoryConfig{ + EnableBalloon: true, + DeflateOnOOM: true, + FreePageReporting: true, + FreePageHinting: true, + }, + } + + args := BuildArgs(cfg) + assert.Contains(t, args, "-device") + assert.Contains(t, args, "virtio-balloon-pci,deflate-on-oom=on,free-page-reporting=on,free-page-hint=on") +} diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index 8914010e..dfe5e267 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -11,6 +11,7 @@ import ( "path/filepath" "regexp" "runtime" + "strings" "syscall" "time" @@ -37,7 +38,7 @@ const ( // clientCreateTimeout is how long to retry QMP client creation after the // socket appears. Under high parallel load the socket can accept connections // slightly later than file creation/availability. - clientCreateTimeout = 3 * time.Second + clientCreateTimeout = 10 * time.Second ) func init() { @@ -186,11 +187,7 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version socketWaitStart := time.Now() if err := waitForSocket(socketPath, socketWaitTimeout); err != nil { cu.Clean() - vmmLogPath := filepath.Join(logsDir, "vmm.log") - if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { - return 0, nil, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData)) - } - return 0, nil, nil, err + return 0, nil, nil, appendVMMLog(err, logsDir) } log.DebugContext(ctx, "QMP socket ready", "duration_ms", time.Since(socketWaitStart).Milliseconds()) @@ -205,7 +202,7 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version } if time.Now().After(clientDeadline) { cu.Clean() - return 0, nil, nil, fmt.Errorf("create client: %w", err) + return 0, nil, nil, appendVMMLog(fmt.Errorf("create client: %w", err), logsDir) } time.Sleep(socketPollInterval) } @@ -218,12 +215,61 @@ func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) - // Build command arguments: QMP socket + VM configuration - args := buildQMPArgs(socketPath) - args = append(args, BuildArgs(config)...) + // Some distro QEMU builds may not support newer balloon sub-options. + // Retry with progressively more conservative balloon args before failing. + attempts := []hypervisor.VMConfig{config} + if config.GuestMemory.EnableBalloon && (config.GuestMemory.FreePageReporting || config.GuestMemory.FreePageHinting) { + fallback := config + fallback.GuestMemory.FreePageReporting = false + fallback.GuestMemory.FreePageHinting = false + attempts = append(attempts, fallback) + } + if config.GuestMemory.EnableBalloon && config.GuestMemory.DeflateOnOOM { + fallback := config + fallback.GuestMemory.FreePageReporting = false + fallback.GuestMemory.FreePageHinting = false + fallback.GuestMemory.DeflateOnOOM = false + attempts = append(attempts, fallback) + } - pid, hv, cu, err := s.startQEMUProcess(ctx, p, version, socketPath, args) - if err != nil { + var ( + pid int + hv *QEMU + cu *cleanup.Cleanup + err error + booted hypervisor.VMConfig + started bool + ) + for i, attempt := range attempts { + // Retry the same attempt once for transient monitor/socket startup races. + for transientRetry := 0; transientRetry < 2; transientRetry++ { + // Build command arguments: QMP socket + VM configuration + args := buildQMPArgs(socketPath) + args = append(args, BuildArgs(attempt)...) + pid, hv, cu, err = s.startQEMUProcess(ctx, p, version, socketPath, args) + if err == nil { + booted = attempt + started = true + break + } + if transientRetry == 0 && shouldRetrySameConfig(err) { + _ = os.Remove(socketPath) + time.Sleep(100 * time.Millisecond) + log.WarnContext(ctx, "qemu start hit transient startup race, retrying with same configuration", "error", err) + continue + } + break + } + if started { + break + } + if i < len(attempts)-1 && shouldRetryWithReducedBalloon(err) { + // Ensure a failed prior attempt doesn't keep the old socket path reserved. + _ = os.Remove(socketPath) + time.Sleep(100 * time.Millisecond) + log.WarnContext(ctx, "qemu start failed, retrying with reduced balloon features", "attempt", i+1, "error", err) + continue + } return 0, nil, err } defer cu.Clean() @@ -231,7 +277,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s // Save config for potential restore later // QEMU migration files only contain memory state, not device config instanceDir := filepath.Dir(socketPath) - if err := saveVMConfig(instanceDir, config); err != nil { + if err := saveVMConfig(instanceDir, booted); err != nil { // Non-fatal - restore just won't work log.WarnContext(ctx, "failed to save VM config for restore", "error", err) } @@ -240,6 +286,53 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return pid, hv, nil } +func appendVMMLog(err error, logsDir string) error { + vmmLogPath := filepath.Join(logsDir, "vmm.log") + if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { + return fmt.Errorf("%w; vmm.log: %s", err, string(logData)) + } + return err +} + +func shouldRetrySameConfig(err error) bool { + if err == nil { + return false + } + if shouldRetryWithReducedBalloon(err) { + return false + } + + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "connection refused") || + strings.Contains(msg, "no such file or directory") || + strings.Contains(msg, "timed out") +} + +func shouldRetryWithReducedBalloon(err error) bool { + if err == nil { + return false + } + + msg := strings.ToLower(err.Error()) + mentionsBalloonOption := strings.Contains(msg, "virtio-balloon") || + strings.Contains(msg, "free-page-reporting") || + strings.Contains(msg, "free-page-hint") || + strings.Contains(msg, "deflate-on-oom") + if !mentionsBalloonOption { + return false + } + + return strings.Contains(msg, "not found") || + strings.Contains(msg, "unknown property") || + strings.Contains(msg, "unknown option") || + strings.Contains(msg, "invalid parameter") || + strings.Contains(msg, "invalid option") || + strings.Contains(msg, "invalid value") || + strings.Contains(msg, "requires 'iothread'") || + strings.Contains(msg, "requires iothread") || + strings.Contains(msg, "is unexpected") +} + // RestoreVM starts QEMU and restores VM state from a snapshot. // The VM is in paused state after restore; caller should call Resume() to continue execution. func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { diff --git a/lib/hypervisor/qemu/process_test.go b/lib/hypervisor/qemu/process_test.go index 900e7300..9f7b8ed6 100644 --- a/lib/hypervisor/qemu/process_test.go +++ b/lib/hypervisor/qemu/process_test.go @@ -1,6 +1,7 @@ package qemu import ( + "errors" "os/exec" "regexp" "testing" @@ -98,3 +99,78 @@ func TestGetVersion_ParsesVersionCorrectly(t *testing.T) { }) } } + +func TestShouldRetryWithReducedBalloon(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "unsupported free page reporting", + err: errors.New("Property 'virtio-balloon-device.free-page-reporting' not found"), + want: true, + }, + { + name: "unsupported deflate option", + err: errors.New("Parameter 'deflate-on-oom' is unexpected"), + want: true, + }, + { + name: "free-page-hint requires iothread", + err: errors.New("qemu-system-x86_64: -device virtio-balloon-pci,...: 'free-page-hint' requires 'iothread' to be set"), + want: true, + }, + { + name: "non-balloon start error", + err: errors.New("wait for socket /tmp/qemu.sock: timed out after 10s"), + want: false, + }, + { + name: "transient monitor connection refused", + err: errors.New("create client: create qemu client: create socket monitor: dial unix /tmp/qemu.sock: connect: connection refused"), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, shouldRetryWithReducedBalloon(tt.err)) + }) + } +} + +func TestShouldRetrySameConfig(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + { + name: "monitor connection refused", + err: errors.New("create client: dial unix /tmp/qemu.sock: connect: connection refused"), + want: true, + }, + { + name: "socket race no such file", + err: errors.New("create socket monitor: dial unix /tmp/qemu.sock: connect: no such file or directory"), + want: true, + }, + { + name: "timeout", + err: errors.New("wait for socket /tmp/qemu.sock: timed out after 10s"), + want: true, + }, + { + name: "explicit balloon incompatibility should not use same-config retry", + err: errors.New("vmm.log: Property 'virtio-balloon-device.free-page-reporting' not found"), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, shouldRetrySameConfig(tt.err)) + }) + } +} diff --git a/lib/hypervisor/vz/shimconfig/config.go b/lib/hypervisor/vz/shimconfig/config.go index 630e841b..f52f6eb1 100644 --- a/lib/hypervisor/vz/shimconfig/config.go +++ b/lib/hypervisor/vz/shimconfig/config.go @@ -32,6 +32,10 @@ type ShimConfig struct { InitrdPath string `json:"initrd_path"` KernelArgs string `json:"kernel_args"` + // Guest memory reclaim + EnableMemoryBalloon bool `json:"enable_memory_balloon,omitempty"` + RequireMemoryBalloon bool `json:"require_memory_balloon,omitempty"` + // Socket paths (where shim should listen) ControlSocket string `json:"control_socket"` VsockSocket string `json:"vsock_socket"` diff --git a/lib/hypervisor/vz/starter.go b/lib/hypervisor/vz/starter.go index fc7a041d..d9f83267 100644 --- a/lib/hypervisor/vz/starter.go +++ b/lib/hypervisor/vz/starter.go @@ -160,15 +160,17 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, func buildShimConfigFromVMConfig(config hypervisor.VMConfig, socketPath string) shimconfig.ShimConfig { instanceDir := filepath.Dir(socketPath) cfg := shimconfig.ShimConfig{ - VCPUs: config.VCPUs, - MemoryBytes: config.MemoryBytes, - SerialLogPath: config.SerialLogPath, - KernelPath: config.KernelPath, - InitrdPath: config.InitrdPath, - KernelArgs: config.KernelArgs, - ControlSocket: socketPath, - VsockSocket: filepath.Join(instanceDir, "vz.vsock"), - LogPath: filepath.Join(instanceDir, "logs", "vz-shim.log"), + VCPUs: config.VCPUs, + MemoryBytes: config.MemoryBytes, + SerialLogPath: config.SerialLogPath, + KernelPath: config.KernelPath, + InitrdPath: config.InitrdPath, + KernelArgs: config.KernelArgs, + EnableMemoryBalloon: config.GuestMemory.EnableBalloon, + RequireMemoryBalloon: config.GuestMemory.RequireBalloon, + ControlSocket: socketPath, + VsockSocket: filepath.Join(instanceDir, "vz.vsock"), + LogPath: filepath.Join(instanceDir, "logs", "vz-shim.log"), } for _, disk := range config.Disks { cfg.Disks = append(cfg.Disks, shimconfig.DiskConfig{ diff --git a/lib/instances/create.go b/lib/instances/create.go index 1f762ff3..76453f2c 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -8,6 +8,7 @@ import ( "time" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/logger" @@ -704,6 +705,7 @@ func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, ima MemoryBytes: inst.Size, HotplugBytes: inst.HotplugSize, Topology: topology, + GuestMemory: m.guestMemoryConfig(), Disks: disks, Networks: networks, SerialLogPath: m.paths.InstanceAppLog(inst.Id), @@ -719,10 +721,23 @@ func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, ima // kernelArgs returns the kernel command line arguments for the given hypervisor type. // vz uses hvc0 (virtio console), all others use ttyS0 (serial port). func (m *manager) kernelArgs(hvType hypervisor.Type) string { + console := "console=ttyS0" if hvType == hypervisor.TypeVZ { - return "console=hvc0" + console = "console=hvc0" + } + policyArgs := strings.Join(m.guestMemoryPolicy.KernelArgs(), " ") + return guestmemory.MergeKernelArgs(console, policyArgs) +} + +func (m *manager) guestMemoryConfig() hypervisor.GuestMemoryConfig { + features := m.guestMemoryPolicy.FeaturesForHypervisor() + return hypervisor.GuestMemoryConfig{ + EnableBalloon: features.EnableBalloon, + FreePageReporting: features.FreePageReporting, + DeflateOnOOM: features.DeflateOnOOM, + FreePageHinting: features.FreePageHinting, + RequireBalloon: features.RequireBalloon, } - return "console=ttyS0" } func ptr[T any](v T) *T { diff --git a/lib/instances/guestmemory_darwin_test.go b/lib/instances/guestmemory_darwin_test.go new file mode 100644 index 00000000..560e7735 --- /dev/null +++ b/lib/instances/guestmemory_darwin_test.go @@ -0,0 +1,177 @@ +//go:build darwin + +package instances + +import ( + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "os/exec" + "runtime" + "strconv" + "strings" + "testing" + "time" + + "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGuestMemoryPolicyVZ(t *testing.T) { + requireGuestMemoryManualRun(t) + if runtime.GOOS != "darwin" { + t.Skip("vz tests require macOS") + } + if runtime.GOARCH != "arm64" { + t.Skip("vz tests require Apple Silicon") + } + + mgr, tmpDir := setupVZTestManager(t) + forceEnableGuestMemoryPolicyForVZTest(mgr) + ctx := context.Background() + + createNginxImageAndWaitDarwin(t, ctx, mgr.imageManager) + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-vz", + Image: "docker.io/library/nginx:alpine", + Size: 4 * 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeVZ, + }) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + defer func() { _ = mgr.DeleteInstance(ctx, inst.Id) }() + + require.NoError(t, waitForExecAgent(ctx, mgr, inst.Id, 30*time.Second)) + + out, exitCode, err := vzExecCommand(ctx, inst, "cat", "/proc/cmdline") + require.NoError(t, err) + require.Equal(t, 0, exitCode) + assert.Contains(t, out, "init_on_alloc=0") + assert.Contains(t, out, "init_on_free=0") + + info, err := getVZVMInfo(inst.SocketPath) + require.NoError(t, err) + assert.GreaterOrEqual(t, info.MemoryBalloonDevices, 1, "vz shim should report attached memory balloon device") + + instMeta, err := mgr.GetInstance(ctx, inst.Id) + require.NoError(t, err) + require.NotNil(t, instMeta.HypervisorPID) + assertLowIdleVZHostMemoryFootprint(t, *instMeta.HypervisorPID, 192*1024) +} + +func forceEnableGuestMemoryPolicyForVZTest(mgr *manager) { + mgr.guestMemoryPolicy = guestmemory.Policy{ + Enabled: true, + KernelPageInitMode: guestmemory.KernelPageInitPerformance, + ReclaimEnabled: true, + VZBalloonRequired: true, + }.Normalize() +} + +func createNginxImageAndWaitDarwin(t *testing.T, ctx context.Context, imageManager images.Manager) { + t.Helper() + + nginxImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: "docker.io/library/nginx:alpine", + }) + require.NoError(t, err) + + imageName := nginxImage.Name + for i := 0; i < 120; i++ { + img, err := imageManager.GetImage(ctx, imageName) + if err == nil && img.Status == images.StatusReady { + return + } + if err == nil && img.Status == images.StatusFailed { + if img.Error != nil { + t.Fatalf("image build failed: %s", *img.Error) + } + t.Fatalf("image build failed: unknown error") + } + time.Sleep(1 * time.Second) + } + t.Fatalf("timed out waiting for image %q to become ready", imageName) +} + +type vzVMInfo struct { + State string `json:"state"` + MemoryBalloonDevices int `json:"memory_balloon_devices"` +} + +func getVZVMInfo(socketPath string) (*vzVMInfo, error) { + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + DisableKeepAlives: true, + } + client := &http.Client{Transport: transport, Timeout: 5 * time.Second} + + req, err := http.NewRequest(http.MethodGet, "http://vz-shim/api/v1/vm.info", nil) + if err != nil { + return nil, err + } + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected vz vm.info status: %d", resp.StatusCode) + } + var info vzVMInfo + if err := json.NewDecoder(resp.Body).Decode(&info); err != nil { + return nil, err + } + return &info, nil +} + +func assertLowIdleVZHostMemoryFootprint(t *testing.T, pid int, maxRSSKB int64) { + t.Helper() + + time.Sleep(12 * time.Second) + var rssSamplesKB []int64 + for i := 0; i < 6; i++ { + rssSamplesKB = append(rssSamplesKB, mustReadDarwinRSSBytes(t, pid)/1024) + time.Sleep(1 * time.Second) + } + var rssSumKB int64 + for _, v := range rssSamplesKB { + rssSumKB += v + } + avgRSSKB := rssSumKB / int64(len(rssSamplesKB)) + assert.LessOrEqualf( + t, + avgRSSKB, + maxRSSKB, + "expected low idle host memory footprint for vz (avg_rss_kb=%d max_rss_kb=%d rss_samples_kb=%v)", + avgRSSKB, + maxRSSKB, + rssSamplesKB, + ) +} + +func mustReadDarwinRSSBytes(t *testing.T, pid int) int64 { + t.Helper() + cmd := exec.Command("ps", "-o", "rss=", "-p", strconv.Itoa(pid)) + out, err := cmd.Output() + require.NoError(t, err) + trimmed := strings.TrimSpace(string(out)) + require.NotEmpty(t, trimmed) + kb, err := strconv.ParseInt(trimmed, 10, 64) + require.NoError(t, err) + return kb * 1024 +} diff --git a/lib/instances/guestmemory_linux_test.go b/lib/instances/guestmemory_linux_test.go new file mode 100644 index 00000000..4cb8986a --- /dev/null +++ b/lib/instances/guestmemory_linux_test.go @@ -0,0 +1,298 @@ +//go:build linux + +package instances + +import ( + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "os" + "strconv" + "strings" + "testing" + "time" + + "github.com/kernel/hypeman/lib/guestmemory" + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/vmm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGuestMemoryPolicyCloudHypervisor(t *testing.T) { + requireGuestMemoryManualRun(t) + requireKVMAccess(t) + + mgr, _ := setupTestManager(t) + forceEnableGuestMemoryPolicyForTest(mgr) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-ch", + Image: "docker.io/library/alpine:latest", + Size: 4 * 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeCloudHypervisor, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryIdleScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + require.NoError(t, waitForVMReady(ctx, inst.SocketPath, 10*time.Second)) + + client, err := vmm.NewVMM(inst.SocketPath) + require.NoError(t, err) + infoResp, err := client.GetVmInfoWithResponse(ctx) + require.NoError(t, err) + require.Equal(t, 200, infoResp.StatusCode()) + require.NotNil(t, infoResp.JSON200) + require.NotNil(t, infoResp.JSON200.Config.Payload) + require.NotNil(t, infoResp.JSON200.Config.Payload.Cmdline) + assert.Contains(t, *infoResp.JSON200.Config.Payload.Cmdline, "init_on_alloc=0") + assert.Contains(t, *infoResp.JSON200.Config.Payload.Cmdline, "init_on_free=0") + + require.NotNil(t, infoResp.JSON200.Config.Balloon, "cloud-hypervisor vm.info config should include balloon") + assert.True(t, infoResp.JSON200.Config.Balloon.DeflateOnOom != nil && *infoResp.JSON200.Config.Balloon.DeflateOnOom) + assert.True(t, infoResp.JSON200.Config.Balloon.FreePageReporting != nil && *infoResp.JSON200.Config.Balloon.FreePageReporting) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + assertLowIdleHostMemoryFootprint(t, "cloud-hypervisor", pid, 512*1024) +} + +func TestGuestMemoryPolicyQEMU(t *testing.T) { + requireGuestMemoryManualRun(t) + requireKVMAccess(t) + + mgr, _ := setupTestManagerForQEMU(t) + forceEnableGuestMemoryPolicyForTest(mgr) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-qemu", + Image: "docker.io/library/alpine:latest", + Size: 4 * 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeQEMU, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryIdleScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + require.NoError(t, waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second)) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) + require.NoError(t, err) + joined := strings.ReplaceAll(string(cmdline), "\x00", " ") + assert.Contains(t, joined, "init_on_alloc=0") + assert.Contains(t, joined, "init_on_free=0") + assert.Contains(t, joined, "virtio-balloon-pci", "qemu cmdline should include virtio balloon device") + + assertLowIdleHostMemoryFootprint(t, "qemu", pid, 640*1024) +} + +func TestGuestMemoryPolicyFirecracker(t *testing.T) { + requireGuestMemoryManualRun(t) + requireFirecrackerIntegrationPrereqs(t) + + mgr, _ := setupTestManagerForFirecracker(t) + forceEnableGuestMemoryPolicyForTest(mgr) + ctx := context.Background() + + createImageAndWait(t, ctx, mgr.imageManager, "docker.io/library/alpine:latest") + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "guestmem-fc", + Image: "docker.io/library/alpine:latest", + Size: 4 * 1024 * 1024 * 1024, + OverlaySize: 5 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeFirecracker, + Entrypoint: []string{"/bin/sh", "-c"}, + Cmd: []string{guestMemoryIdleScript()}, + }) + require.NoError(t, err) + t.Cleanup(func() { _ = mgr.DeleteInstance(ctx, inst.Id) }) + + vmCfg, err := getFirecrackerVMConfig(inst.SocketPath) + require.NoError(t, err) + assert.Contains(t, vmCfg.BootSource.BootArgs, "init_on_alloc=0") + assert.Contains(t, vmCfg.BootSource.BootArgs, "init_on_free=0") + assert.True(t, vmCfg.Balloon.DeflateOnOOM) + assert.True(t, vmCfg.Balloon.FreePageHinting) + assert.True(t, vmCfg.Balloon.FreePageReporting) + + pid := requireHypervisorPID(t, ctx, mgr, inst.Id) + assertLowIdleHostMemoryFootprint(t, "firecracker", pid, 512*1024) +} + +func guestMemoryIdleScript() string { + return "set -e; sleep 180" +} + +func forceEnableGuestMemoryPolicyForTest(mgr *manager) { + mgr.guestMemoryPolicy = guestmemory.Policy{ + Enabled: true, + KernelPageInitMode: guestmemory.KernelPageInitPerformance, + ReclaimEnabled: true, + VZBalloonRequired: true, + }.Normalize() +} + +func createImageAndWait(t *testing.T, ctx context.Context, imageManager images.Manager, imageName string) { + t.Helper() + + img, err := imageManager.CreateImage(ctx, images.CreateImageRequest{Name: imageName}) + require.NoError(t, err) + + for i := 0; i < 180; i++ { + current, err := imageManager.GetImage(ctx, img.Name) + if err == nil && current.Status == images.StatusReady { + return + } + if err == nil && current.Status == images.StatusFailed { + if current.Error != nil { + t.Fatalf("image build failed: %s", *current.Error) + } + t.Fatalf("image build failed: unknown error") + } + time.Sleep(1 * time.Second) + } + t.Fatalf("timed out waiting for image %q to become ready", img.Name) +} + +func requireHypervisorPID(t *testing.T, ctx context.Context, mgr *manager, instanceID string) int { + t.Helper() + inst, err := mgr.GetInstance(ctx, instanceID) + require.NoError(t, err) + require.NotNil(t, inst.HypervisorPID) + return *inst.HypervisorPID +} + +func assertLowIdleHostMemoryFootprint(t *testing.T, hypervisorName string, pid int, maxPSSKB int64) { + t.Helper() + + // Give the guest a short settle window, then sample host memory. + time.Sleep(12 * time.Second) + var pssSamplesKB []int64 + var rssSamplesKB []int64 + for i := 0; i < 6; i++ { + pssSamplesKB = append(pssSamplesKB, mustReadPSSKB(t, pid)) + rssSamplesKB = append(rssSamplesKB, mustReadRSSBytes(t, pid)/1024) + time.Sleep(1 * time.Second) + } + + var pssSumKB int64 + for _, v := range pssSamplesKB { + pssSumKB += v + } + avgPSSKB := pssSumKB / int64(len(pssSamplesKB)) + + assert.LessOrEqualf( + t, + avgPSSKB, + maxPSSKB, + "expected low idle host memory footprint for %s (avg_pss_kb=%d max_pss_kb=%d rss_samples_kb=%v pss_samples_kb=%v)", + hypervisorName, + avgPSSKB, + maxPSSKB, + rssSamplesKB, + pssSamplesKB, + ) +} + +func mustReadRSSBytes(t *testing.T, pid int) int64 { + t.Helper() + statusPath := fmt.Sprintf("/proc/%d/status", pid) + data, err := os.ReadFile(statusPath) + require.NoError(t, err) + + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "VmRSS:") { + fields := strings.Fields(line) + require.GreaterOrEqual(t, len(fields), 2) + kb, err := strconv.ParseInt(fields[1], 10, 64) + require.NoError(t, err) + return kb * 1024 + } + } + t.Fatalf("VmRSS not found in %s", statusPath) + return 0 +} + +func mustReadPSSKB(t *testing.T, pid int) int64 { + t.Helper() + smapsRollupPath := fmt.Sprintf("/proc/%d/smaps_rollup", pid) + data, err := os.ReadFile(smapsRollupPath) + require.NoError(t, err) + + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "Pss:") { + fields := strings.Fields(line) + require.GreaterOrEqual(t, len(fields), 2) + kb, err := strconv.ParseInt(fields[1], 10, 64) + require.NoError(t, err) + return kb + } + } + t.Fatalf("Pss not found in %s", smapsRollupPath) + return 0 +} + +type firecrackerVMConfig struct { + BootSource struct { + BootArgs string `json:"boot_args"` + } `json:"boot-source"` + Balloon struct { + DeflateOnOOM bool `json:"deflate_on_oom"` + FreePageHinting bool `json:"free_page_hinting"` + FreePageReporting bool `json:"free_page_reporting"` + } `json:"balloon"` +} + +func getFirecrackerVMConfig(socketPath string) (*firecrackerVMConfig, error) { + transport := &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + DisableKeepAlives: true, + } + client := &http.Client{Transport: transport, Timeout: 5 * time.Second} + + req, err := http.NewRequest(http.MethodGet, "http://localhost/vm/config", nil) + if err != nil { + return nil, err + } + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected firecracker /vm/config status: %d", resp.StatusCode) + } + + var cfg firecrackerVMConfig + if err := json.NewDecoder(resp.Body).Decode(&cfg); err != nil { + return nil, err + } + return &cfg, nil +} diff --git a/lib/instances/guestmemory_test_helpers_test.go b/lib/instances/guestmemory_test_helpers_test.go new file mode 100644 index 00000000..11668d05 --- /dev/null +++ b/lib/instances/guestmemory_test_helpers_test.go @@ -0,0 +1,15 @@ +package instances + +import ( + "os" + "testing" +) + +const guestMemoryManualEnv = "HYPEMAN_RUN_GUESTMEMORY_TESTS" + +func requireGuestMemoryManualRun(t *testing.T) { + t.Helper() + if os.Getenv(guestMemoryManualEnv) != "1" { + t.Skipf("set %s=1 to run guest memory integration tests", guestMemoryManualEnv) + } +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 3b581e83..1b8804aa 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -6,6 +6,7 @@ import ( "sync" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/network" @@ -84,6 +85,7 @@ type manager struct { // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request + guestMemoryPolicy guestmemory.Policy } // platformStarters is populated by platform-specific init functions. @@ -92,12 +94,18 @@ var platformStarters = make(map[hypervisor.Type]hypervisor.VMStarter) // NewManager creates a new instances manager. // If meter is nil, metrics are disabled. // defaultHypervisor specifies which hypervisor to use when not specified in requests. -func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, meter metric.Meter, tracer trace.Tracer) Manager { +func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, meter metric.Meter, tracer trace.Tracer, memoryPolicy ...guestmemory.Policy) Manager { // Validate and default the hypervisor type if defaultHypervisor == "" { defaultHypervisor = hypervisor.TypeCloudHypervisor } + policy := guestmemory.DefaultPolicy() + if len(memoryPolicy) > 0 { + policy = memoryPolicy[0] + } + policy = policy.Normalize() + // Initialize VM starters from platform-specific init functions vmStarters := make(map[hypervisor.Type]hypervisor.VMStarter, len(platformStarters)) for hvType, starter := range platformStarters { @@ -116,6 +124,7 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste hostTopology: detectHostTopology(), // Detect and cache host topology vmStarters: vmStarters, defaultHypervisor: defaultHypervisor, + guestMemoryPolicy: policy, } // Initialize metrics if meter is provided diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 25cc3536..be93dca0 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -13,6 +13,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/builds" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/hypervisor/firecracker" "github.com/kernel/hypeman/lib/images" @@ -125,7 +126,13 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima meter := otel.GetMeterProvider().Meter("hypeman") tracer := otel.GetTracerProvider().Tracer("hypeman") defaultHypervisor := hypervisor.Type(cfg.Hypervisor.Default) - return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer), nil + memoryPolicy := guestmemory.Policy{ + Enabled: cfg.Hypervisor.Memory.Enabled, + KernelPageInitMode: guestmemory.KernelPageInitMode(cfg.Hypervisor.Memory.KernelPageInitMode), + ReclaimEnabled: cfg.Hypervisor.Memory.ReclaimEnabled, + VZBalloonRequired: cfg.Hypervisor.Memory.VZBalloonRequired, + } + return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer, memoryPolicy), nil } // ProvideVolumeManager provides the volume manager