kernel · sjmiller609 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded
+.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin test-guestmemory-linux test-guestmemory-vz install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded
 
 # Directory where local binaries will be installed
 BIN_DIR ?= $(CURDIR)/bin
@@ -300,6 +300,21 @@ test-darwin: build-embedded sign-vz-shim
 		go test -tags containers_image_openpgp $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) $$PKGS; \
 	fi
 
+# Manual-only guest memory policy integration tests (Linux hypervisors).
+test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded
+	@TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \
+	GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \
+	echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \
+	sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
+		go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances
+
+# Manual-only guest memory policy integration test (macOS VZ).
+test-guestmemory-vz: build-embedded sign-vz-shim
+	@echo "Running manual guest memory integration test (VZ)"; \
+	PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \
+	HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \
+	go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances
+
 # Generate JWT token for testing
 # Usage: make gen-jwt [USER_ID=test-user]
 # Checks CONFIG_PATH, then local config.yaml, then default config paths

diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go
@@ -154,8 +154,17 @@ type CapacityConfig struct {
 
 // HypervisorConfig holds hypervisor settings.
 type HypervisorConfig struct {
-	Default               string `koanf:"default"`
-	FirecrackerBinaryPath string `koanf:"firecracker_binary_path"`
+	Default               string                 `koanf:"default"`
+	FirecrackerBinaryPath string                 `koanf:"firecracker_binary_path"`
+	Memory                HypervisorMemoryConfig `koanf:"memory"`
+}
+
+// HypervisorMemoryConfig holds guest memory management settings.
+type HypervisorMemoryConfig struct {
+	Enabled            bool   `koanf:"enabled"`
+	KernelPageInitMode string `koanf:"kernel_page_init_mode"`
+	ReclaimEnabled     bool   `koanf:"reclaim_enabled"`
+	VZBalloonRequired  bool   `koanf:"vz_balloon_required"`
 }
 
 // GPUConfig holds GPU-related settings.
@@ -300,6 +309,12 @@ func defaultConfig() *Config {
 		Hypervisor: HypervisorConfig{
 			Default:               "cloud-hypervisor",
 			FirecrackerBinaryPath: "",
+			Memory: HypervisorMemoryConfig{
+				Enabled:            false,
+				KernelPageInitMode: "hardened",
+				ReclaimEnabled:     true,
+				VZBalloonRequired:  true,
+			},
 		},
 
 		GPU: GPUConfig{
@@ -400,5 +415,8 @@ func (c *Config) Validate() error {
 	if c.Build.Timeout <= 0 {
 		return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout)
 	}
+	if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" {
+		return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode)
+	}
 	return nil
 }
diff --git a/cmd/vz-shim/server.go b/cmd/vz-shim/server.go
@@ -37,7 +37,8 @@ func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfigurati
 
 // VMInfoResponse matches the cloud-hypervisor VmInfo structure.
 type VMInfoResponse struct {
-	State string `json:"state"`
+	State                string `json:"state"`
+	MemoryBalloonDevices int    `json:"memory_balloon_devices,omitempty"`
 }
 
 type snapshotRequest struct {
@@ -66,7 +67,10 @@ func (s *ShimServer) handleVMInfo(w http.ResponseWriter, r *http.Request) {
 	defer s.mu.RUnlock()
 
 	state := vzStateToString(s.vm.State())
-	resp := VMInfoResponse{State: state}
+	resp := VMInfoResponse{
+		State:                state,
+		MemoryBalloonDevices: len(s.vm.MemoryBalloonDevices()),
+	}
 
 	w.Header().Set("Content-Type", "application/json")
 	json.NewEncoder(w).Encode(resp)

diff --git a/cmd/vz-shim/vm.go b/cmd/vz-shim/vm.go
@@ -72,9 +72,19 @@ func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMac
 	}
 	vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig})
 
-	// Do not attach memory balloon for now.
-	// Save/restore compatibility on VZ can fail with "invalid argument" for some
-	// Linux guest configurations when a balloon device is present.
+	if config.EnableMemoryBalloon {
+		balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration()
+		if err != nil {
+			if config.RequireMemoryBalloon {
+				return nil, nil, fmt.Errorf("create memory balloon device: %w", err)
+			}
+			slog.Warn("memory balloon unavailable, continuing without balloon", "error", err)
+		} else {
+			vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{
+				balloonConfig,
+			})
+		}
+	}
 
 	if validated, err := vmConfig.Validate(); !validated || err != nil {
 		return nil, nil, fmt.Errorf("invalid vm configuration: %w", err)

diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml
@@ -34,6 +34,11 @@ port: "8080"
 # - "cloud-hypervisor" and "qemu" are NOT supported on macOS
 hypervisor:
   default: vz
+  memory:
+    enabled: false
+    kernel_page_init_mode: hardened
+    reclaim_enabled: true
+    vz_balloon_required: true
 
 # =============================================================================
 # Network Configuration (DIFFERENT ON MACOS)

diff --git a/config.example.yaml b/config.example.yaml
@@ -26,6 +26,13 @@ data_dir: /var/lib/hypeman
 #   default: cloud-hypervisor
 #   # Optional: use a custom Firecracker binary path instead of the embedded one.
 #   # firecracker_binary_path: /usr/local/bin/firecracker
+#   memory:
+#     enabled: false
+#     # performance: init_on_alloc=0 init_on_free=0 (better density)
+#     # hardened: init_on_alloc=1 init_on_free=1 (stronger hardening)
+#     kernel_page_init_mode: hardened
+#     reclaim_enabled: true
+#     vz_balloon_required: true
 
 # =============================================================================
 # Network Configuration

diff --git a/lib/guestmemory/README.md b/lib/guestmemory/README.md
@@ -0,0 +1,102 @@
+# Guest Memory Reclaim
+
+This feature reduces host RAM waste from guest VMs by combining three behaviors:
+
+1. Lazy host allocation preservation:
+The VM is configured with requested memory capacity, but host pages should only back guest pages as they are touched.
+
+2. Guest-to-host reclaim:
+When the guest frees memory, virtio balloon/reporting/hinting features let the VMM return those pages to the host.
+
+3. Guest boot page-touch reduction:
+The guest kernel page-init mode controls whether Linux eagerly touches pages:
+- `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn.
+- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost.
+
+## Configuration
+
+This feature is controlled by `hypervisor.memory` in server config and is default-off:
+
+```yaml
+hypervisor:
+  memory:
+    enabled: false
+    kernel_page_init_mode: hardened
+    reclaim_enabled: true
+    vz_balloon_required: true
+```
+
+To enable reclaim behavior and density-oriented kernel args, set:
+
+```yaml
+hypervisor:
+  memory:
+    enabled: true
+    kernel_page_init_mode: performance
+```
+
+## Runtime Flow
+
+- Operator config (`hypervisor.memory`) is normalized into one policy.
+- The instances layer applies policy generically:
+  - merges kernel args with the selected page-init mode;
+  - sets generic memory feature toggles in `hypervisor.VMConfig.GuestMemory`.
+- Each hypervisor backend maps generic toggles to native mechanisms:
+  - Cloud Hypervisor: `balloon` config with free page reporting and deflate-on-oom.
+  - QEMU: `virtio-balloon-pci` device options.
+  - Firecracker: `/balloon` API with free page hinting/reporting.
+  - VZ: attach `VirtioTraditionalMemoryBalloon` device.
+
+## Backend Behavior Matrix
+
+| Hypervisor | Lazy allocation | Balloon | Free page reporting/hinting | Deflate on OOM |
+|---|---|---|---|---|
+| Cloud Hypervisor | Yes | Yes | Reporting | Yes |
+| QEMU | Yes | Yes | Reporting (+ hinting when enabled) | Yes |
+| Firecracker | Yes | Yes | Hinting + reporting | Yes |
+| VZ | macOS-managed | Yes | Host-managed + guest cooperation | Host-managed |
+
+## Failure Behavior
+
+- If policy is disabled, memory features are not applied.
+- If reclaim is disabled, balloon/reporting/hinting are not applied.
+- For VZ, balloon attachment is attempted when enabled.
+  - If `vz_balloon_required=true`, startup fails if balloon cannot be configured.
+  - If `vz_balloon_required=false`, startup continues without balloon and logs a warning.
+
+## Quick CLI Experiment
+
+Use this A/B check to compare host memory footprint with policy enabled vs disabled:
+
+```bash
+# 1) Start API with config A (hypervisor.memory.enabled=true), then run:
+ID=$(hypeman run --hypervisor qemu --network=false --memory 1GB \
+  --entrypoint /bin/sh --entrypoint -c \
+  --cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \
+  docker.io/library/alpine:latest | tail -n1)
+PID=$(jq -r '.HypervisorPID' "<data_dir>/guests/$ID/metadata.json")
+awk '/^Pss:/ {print $2 " kB"}' "/proc/$PID/smaps_rollup" # Linux (preferred)
+awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status"      # Linux fallback
+ps -o rss= -p "$PID"                                  # macOS
+hypeman rm --force "$ID"
+
+# 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command.
+# 3) Compare final/steady host memory between A and B.
+```
+
+In one startup-focused sample run, absolute host footprint stayed far below guest memory size (for example, ~4GB guest with low host PSS on Cloud Hypervisor/Firecracker), while QEMU showed a larger fixed process overhead.
+
+Sample probe results (4GB idle guest, rounded MB):
+
+| Hypervisor | Host RSS (MB) | Host PSS (MB) | Notes |
+|---|---:|---:|---|
+| Cloud Hypervisor (Linux) | ~345 | ~29 | Low actual host pressure when idle |
+| Firecracker (Linux) | ~295 | ~27 | Low actual host pressure when idle |
+| QEMU (Linux) | ~400 | ~116 | Higher fixed process overhead |
+| VZ (macOS) | ~23 | N/A | RSS sampled with `ps` |
+
+## Out of Scope
+
+- No API surface changes.
+- No scheduler/admission logic changes.
+- No automatic background tuning loops outside hypervisor-supported reclaim mechanisms.
diff --git a/lib/guestmemory/kernel_args.go b/lib/guestmemory/kernel_args.go
@@ -0,0 +1,42 @@
+package guestmemory
+
+import "strings"
+
+// MergeKernelArgs merges kernel args deterministically.
+// Duplicate keys are de-duplicated with "last write wins" semantics.
+func MergeKernelArgs(base string, extras ...string) string {
+	tokens := strings.Fields(base)
+	order := make([]string, 0, len(tokens))
+	values := make(map[string]string, len(tokens))
+
+	for _, tok := range tokens {
+		k := argKey(tok)
+		if _, ok := values[k]; !ok {
+			order = append(order, k)
+		}
+		values[k] = tok
+	}
+
+	for _, extra := range extras {
+		for _, tok := range strings.Fields(extra) {
+			k := argKey(tok)
+			if _, ok := values[k]; !ok {
+				order = append(order, k)
+			}
+			values[k] = tok
+		}
+	}
+
+	merged := make([]string, 0, len(order))
+	for _, k := range order {
+		merged = append(merged, values[k])
+	}
+	return strings.Join(merged, " ")
+}
+
+func argKey(token string) string {
+	if idx := strings.IndexByte(token, '='); idx >= 0 {
+		return token[:idx]
+	}
+	return token
+}
diff --git a/lib/guestmemory/kernel_args_test.go b/lib/guestmemory/kernel_args_test.go
@@ -0,0 +1,12 @@
+package guestmemory
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMergeKernelArgs(t *testing.T) {
+	merged := MergeKernelArgs("console=ttyS0 foo=1", "foo=2", "init_on_alloc=0 init_on_free=0")
+	assert.Equal(t, "console=ttyS0 foo=2 init_on_alloc=0 init_on_free=0", merged)
+}
diff --git a/lib/guestmemory/policy.go b/lib/guestmemory/policy.go
@@ -0,0 +1,92 @@
+package guestmemory
+
+// KernelPageInitMode controls guest kernel page initialization behavior.
+type KernelPageInitMode string
+
+const (
+	// KernelPageInitPerformance minimizes guest page touching to preserve lazy host allocation.
+	KernelPageInitPerformance KernelPageInitMode = "performance"
+	// KernelPageInitHardened enforces page init-on-alloc/free hardening in the guest kernel.
+	KernelPageInitHardened KernelPageInitMode = "hardened"
+)
+
+// Policy is the normalized, hypervisor-agnostic guest memory policy.
+type Policy struct {
+	Enabled            bool
+	KernelPageInitMode KernelPageInitMode
+	ReclaimEnabled     bool
+	VZBalloonRequired  bool
+}
+
+// Features are generic guest memory toggles consumed by hypervisor backends.
+type Features struct {
+	EnableBalloon     bool
+	FreePageReporting bool
+	DeflateOnOOM      bool
+	FreePageHinting   bool
+	RequireBalloon    bool
+}
+
+// DefaultPolicy returns conservative defaults (disabled reclaim, hardened page-init mode).
+func DefaultPolicy() Policy {
+	return Policy{
+		Enabled:            false,
+		KernelPageInitMode: KernelPageInitHardened,
+		ReclaimEnabled:     true,
+		VZBalloonRequired:  true,
+	}
+}
+
+// Normalize applies defaults and sanitizes invalid modes.
+func (p Policy) Normalize() Policy {
+	d := DefaultPolicy()
+
+	if p.KernelPageInitMode == "" {
+		p.KernelPageInitMode = d.KernelPageInitMode
+	}
+	if p.KernelPageInitMode != KernelPageInitPerformance && p.KernelPageInitMode != KernelPageInitHardened {
+		p.KernelPageInitMode = d.KernelPageInitMode
+	}
+
+	if !p.Enabled {
+		return Policy{
+			Enabled:            false,
+			KernelPageInitMode: p.KernelPageInitMode,
+			ReclaimEnabled:     false,
+			VZBalloonRequired:  p.VZBalloonRequired,
+		}
+	}
+
+	return p
+}
+
+// KernelArgs returns kernel args implied by the policy.
+func (p Policy) KernelArgs() []string {
+	n := p.Normalize()
+	if !n.Enabled {
+		return nil
+	}
+
+	switch n.KernelPageInitMode {
+	case KernelPageInitHardened:
+		return []string{"init_on_alloc=1", "init_on_free=1"}
+	default:
+		return []string{"init_on_alloc=0", "init_on_free=0"}
+	}
+}
+
+// FeaturesForHypervisor returns generic memory features for backend translation.
+func (p Policy) FeaturesForHypervisor() Features {
+	n := p.Normalize()
+	if !n.Enabled || !n.ReclaimEnabled {
+		return Features{}
+	}
+
+	return Features{
+		EnableBalloon:     true,
+		FreePageReporting: true,
+		DeflateOnOOM:      true,
+		FreePageHinting:   true,
+		RequireBalloon:    n.VZBalloonRequired,
+	}
+}