Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SHELL := /bin/bash
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build build-linux test test-linux test-darwin test-guestmemory-linux test-guestmemory-vz install-tools gen-jwt download-ch-binaries download-firecracker-binaries download-ch-spec ensure-ch-binaries ensure-firecracker-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded

# Directory where local binaries will be installed
BIN_DIR ?= $(CURDIR)/bin
Expand Down Expand Up @@ -300,6 +300,21 @@ test-darwin: build-embedded sign-vz-shim
go test -tags containers_image_openpgp $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) $$PKGS; \
fi

# Manual-only guest memory policy integration tests (Linux hypervisors).
# Runs only the TestGuestMemoryPolicy{CloudHypervisor,QEMU,Firecracker} tests in
# ./lib/instances, under sudo, with a fixed system PATH prepended to the caller's PATH.
# HYPEMAN_RUN_GUESTMEMORY_TESTS=1 is passed to the test process (presumably the
# opt-in gate the tests check - confirm in lib/instances).
# Override the go test timeout with GUESTMEMORY_TEST_TIMEOUT (default: 15m).
test-guestmemory-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded
@TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \
GUESTMEM_TIMEOUT="$${GUESTMEMORY_TEST_TIMEOUT:-15m}"; \
echo "Running manual guest memory integration tests (CloudHypervisor, QEMU, Firecracker)"; \
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "HYPEMAN_RUN_GUESTMEMORY_TESTS=1" \
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicy(CloudHypervisor|QEMU|Firecracker)$$' -timeout="$$GUESTMEM_TIMEOUT" ./lib/instances

# Manual-only guest memory policy integration test (macOS VZ).
# Runs only TestGuestMemoryPolicyVZ in ./lib/instances with the Homebrew
# e2fsprogs sbin directory prepended to PATH and HYPEMAN_RUN_GUESTMEMORY_TESTS=1 set.
# NOTE: unlike test-guestmemory-linux, the timeout comes from $(TEST_TIMEOUT);
# GUESTMEMORY_TEST_TIMEOUT is not consulted here.
test-guestmemory-vz: build-embedded sign-vz-shim
@echo "Running manual guest memory integration test (VZ)"; \
PATH="/opt/homebrew/opt/e2fsprogs/sbin:$(PATH)" \
HYPEMAN_RUN_GUESTMEMORY_TESTS=1 \
go test -tags containers_image_openpgp -run='^TestGuestMemoryPolicyVZ$$' -timeout=$(TEST_TIMEOUT) ./lib/instances

# Generate JWT token for testing
# Usage: make gen-jwt [USER_ID=test-user]
# Checks CONFIG_PATH, then local config.yaml, then default config paths
Expand Down
22 changes: 20 additions & 2 deletions cmd/api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,17 @@ type CapacityConfig struct {

// HypervisorConfig holds hypervisor settings.
type HypervisorConfig struct {
Default string `koanf:"default"`
FirecrackerBinaryPath string `koanf:"firecracker_binary_path"`
Default string `koanf:"default"`
FirecrackerBinaryPath string `koanf:"firecracker_binary_path"`
Memory HypervisorMemoryConfig `koanf:"memory"`
}

// HypervisorMemoryConfig holds guest memory management settings, i.e. the
// `hypervisor.memory` section of the server config.
type HypervisorMemoryConfig struct {
// Enabled switches the guest memory policy on. defaultConfig leaves it false.
Enabled bool `koanf:"enabled"`
// KernelPageInitMode selects the guest kernel page-init behavior. Validate
// accepts only "performance" or "hardened"; defaultConfig uses "hardened".
KernelPageInitMode string `koanf:"kernel_page_init_mode"`
// ReclaimEnabled toggles the reclaim side of the policy (balloon/reporting/
// hinting, per lib/guestmemory/README.md). defaultConfig sets it to true.
ReclaimEnabled bool `koanf:"reclaim_enabled"`
// VZBalloonRequired controls VZ behavior when a balloon cannot be configured:
// per lib/guestmemory/README.md, true fails startup and false continues with
// a warning. defaultConfig sets it to true.
VZBalloonRequired bool `koanf:"vz_balloon_required"`
}

// GPUConfig holds GPU-related settings.
Expand Down Expand Up @@ -300,6 +309,12 @@ func defaultConfig() *Config {
Hypervisor: HypervisorConfig{
Default: "cloud-hypervisor",
FirecrackerBinaryPath: "",
Memory: HypervisorMemoryConfig{
Enabled: false,
KernelPageInitMode: "hardened",
ReclaimEnabled: true,
VZBalloonRequired: true,
},
},

GPU: GPUConfig{
Expand Down Expand Up @@ -400,5 +415,8 @@ func (c *Config) Validate() error {
if c.Build.Timeout <= 0 {
return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout)
}
if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" {
return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode)
}
return nil
}
8 changes: 6 additions & 2 deletions cmd/vz-shim/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfigurati

// VMInfoResponse matches the cloud-hypervisor VmInfo structure.
type VMInfoResponse struct {
State string `json:"state"`
State string `json:"state"`
MemoryBalloonDevices int `json:"memory_balloon_devices,omitempty"`
}

type snapshotRequest struct {
Expand Down Expand Up @@ -66,7 +67,10 @@ func (s *ShimServer) handleVMInfo(w http.ResponseWriter, r *http.Request) {
defer s.mu.RUnlock()

state := vzStateToString(s.vm.State())
resp := VMInfoResponse{State: state}
resp := VMInfoResponse{
State: state,
MemoryBalloonDevices: len(s.vm.MemoryBalloonDevices()),
}

w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resp)
Expand Down
16 changes: 13 additions & 3 deletions cmd/vz-shim/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,19 @@ func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMac
}
vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig})

// Do not attach memory balloon for now.
// Save/restore compatibility on VZ can fail with "invalid argument" for some
// Linux guest configurations when a balloon device is present.
if config.EnableMemoryBalloon {
balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration()
if err != nil {
if config.RequireMemoryBalloon {
return nil, nil, fmt.Errorf("create memory balloon device: %w", err)
}
slog.Warn("memory balloon unavailable, continuing without balloon", "error", err)
} else {
vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{
balloonConfig,
})
}
}

if validated, err := vmConfig.Validate(); !validated || err != nil {
return nil, nil, fmt.Errorf("invalid vm configuration: %w", err)
Expand Down
5 changes: 5 additions & 0 deletions config.example.darwin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ port: "8080"
# - "cloud-hypervisor" and "qemu" are NOT supported on macOS
hypervisor:
default: vz
memory:
enabled: false
kernel_page_init_mode: hardened
reclaim_enabled: true
vz_balloon_required: true

# =============================================================================
# Network Configuration (DIFFERENT ON MACOS)
Expand Down
7 changes: 7 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ data_dir: /var/lib/hypeman
# default: cloud-hypervisor
# # Optional: use a custom Firecracker binary path instead of the embedded one.
# # firecracker_binary_path: /usr/local/bin/firecracker
# memory:
# enabled: false
# # performance: init_on_alloc=0 init_on_free=0 (better density)
# # hardened: init_on_alloc=1 init_on_free=1 (stronger hardening)
# kernel_page_init_mode: hardened
# reclaim_enabled: true
# vz_balloon_required: true

# =============================================================================
# Network Configuration
Expand Down
102 changes: 102 additions & 0 deletions lib/guestmemory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Guest Memory Reclaim

This feature reduces host RAM waste from guest VMs by combining three behaviors:

1. Lazy host allocation preservation:
The VM is configured with requested memory capacity, but host pages should only back guest pages as they are touched.

2. Guest-to-host reclaim:
When the guest frees memory, virtio balloon/reporting/hinting features let the VMM return those pages to the host.

3. Guest boot page-touch reduction:
The guest kernel page-init mode controls whether Linux eagerly touches pages:
- `performance` mode sets `init_on_alloc=0 init_on_free=0` for better density and lower memory churn.
- `hardened` mode sets `init_on_alloc=1 init_on_free=1` for stronger memory hygiene at some density/perf cost.

## Configuration

This feature is controlled by `hypervisor.memory` in the server config and is disabled by default:

```yaml
hypervisor:
memory:
enabled: false
kernel_page_init_mode: hardened
reclaim_enabled: true
vz_balloon_required: true
```

To enable reclaim behavior and density-oriented kernel args, set:

```yaml
hypervisor:
memory:
enabled: true
kernel_page_init_mode: performance
```

## Runtime Flow

- Operator config (`hypervisor.memory`) is normalized into one policy.
- The instances layer applies policy generically:
- merges kernel args with the selected page-init mode;
- sets generic memory feature toggles in `hypervisor.VMConfig.GuestMemory`.
- Each hypervisor backend maps generic toggles to native mechanisms:
- Cloud Hypervisor: `balloon` config with free page reporting and deflate-on-oom.
- QEMU: `virtio-balloon-pci` device options.
- Firecracker: `/balloon` API with free page hinting/reporting.
- VZ: attach `VirtioTraditionalMemoryBalloon` device.

## Backend Behavior Matrix

| Hypervisor | Lazy allocation | Balloon | Free page reporting/hinting | Deflate on OOM |
|---|---|---|---|---|
| Cloud Hypervisor | Yes | Yes | Reporting | Yes |
| QEMU | Yes | Yes | Reporting (+ hinting when enabled) | Yes |
| Firecracker | Yes | Yes | Hinting + reporting | Yes |
| VZ | macOS-managed | Yes | Host-managed + guest cooperation | Host-managed |

## Failure Behavior

- If policy is disabled (`enabled: false`), neither memory features nor page-init kernel args are applied, regardless of the other settings.
- If reclaim is disabled, balloon/reporting/hinting are not applied.
- For VZ, balloon attachment is attempted when enabled.
- If `vz_balloon_required=true`, startup fails if balloon cannot be configured.
- If `vz_balloon_required=false`, startup continues without balloon and logs a warning.

## Quick CLI Experiment

Use this A/B check to compare host memory footprint with policy enabled vs disabled:

```bash
# 1) Start API with config A (hypervisor.memory.enabled=true), then run:
ID=$(hypeman run --hypervisor qemu --network=false --memory 1GB \
--entrypoint /bin/sh --entrypoint -c \
--cmd 'sleep 5; dd if=/dev/zero of=/dev/shm/hype-mem bs=1M count=256; sleep 5; rm -f /dev/shm/hype-mem; sleep 90' \
docker.io/library/alpine:latest | tail -n1)
PID=$(jq -r '.HypervisorPID' "<data_dir>/guests/$ID/metadata.json")
awk '/^Pss:/ {print $2 " kB"}' "/proc/$PID/smaps_rollup" # Linux (preferred)
awk '/^VmRSS:/ {print $2 " kB"}' "/proc/$PID/status" # Linux fallback
ps -o rss= -p "$PID" # macOS
hypeman rm --force "$ID"

# 2) Restart API with config B (hypervisor.memory.enabled=false) and run the same command.
# 3) Compare final/steady host memory between A and B.
```

In one startup-focused sample run, the absolute host footprint stayed far below the configured guest memory size: a ~4GB guest showed low host PSS on Cloud Hypervisor and Firecracker, while QEMU showed a larger fixed process overhead.

Sample probe results (4GB idle guest, rounded MB):

| Hypervisor | Host RSS (MB) | Host PSS (MB) | Notes |
|---|---:|---:|---|
| Cloud Hypervisor (Linux) | ~345 | ~29 | Low actual host pressure when idle |
| Firecracker (Linux) | ~295 | ~27 | Low actual host pressure when idle |
| QEMU (Linux) | ~400 | ~116 | Higher fixed process overhead |
| VZ (macOS) | ~23 | N/A | RSS sampled with `ps` |

## Out of Scope

- No API surface changes.
- No scheduler/admission logic changes.
- No automatic background tuning loops outside hypervisor-supported reclaim mechanisms.
42 changes: 42 additions & 0 deletions lib/guestmemory/kernel_args.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package guestmemory

import "strings"

// MergeKernelArgs combines the whitespace-separated kernel arguments in base
// with those in each of extras, in order, and returns them joined by single
// spaces.
//
// Arguments are keyed by the text before their first '=' (or the whole token
// when there is none). When a key occurs more than once, the output keeps a
// single entry at the position where the key first appeared, carrying the
// token that appeared last ("last write wins"). Note that this also collapses
// keys that were repeated on purpose (for example two console= entries).
func MergeKernelArgs(base string, extras ...string) string {
	baseTokens := strings.Fields(base)

	var (
		firstSeen = make([]string, 0, len(baseTokens))       // keys in first-appearance order
		latest    = make(map[string]string, len(baseTokens)) // key -> most recent token
	)

	// record notes a token, remembering the key's position only the first time.
	record := func(token string) {
		key := argKey(token)
		if _, known := latest[key]; !known {
			firstSeen = append(firstSeen, key)
		}
		latest[key] = token
	}

	for _, token := range baseTokens {
		record(token)
	}
	for _, extra := range extras {
		for _, token := range strings.Fields(extra) {
			record(token)
		}
	}

	out := make([]string, len(firstSeen))
	for i, key := range firstSeen {
		out[i] = latest[key]
	}
	return strings.Join(out, " ")
}

// argKey returns the part of token before its first '=', or the whole token
// when it contains no '='.
func argKey(token string) string {
	key, _, _ := strings.Cut(token, "=")
	return key
}
12 changes: 12 additions & 0 deletions lib/guestmemory/kernel_args_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package guestmemory

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestMergeKernelArgs checks that a repeated key keeps its original position
// but takes the last value, and that new keys are appended in order.
func TestMergeKernelArgs(t *testing.T) {
	const want = "console=ttyS0 foo=2 init_on_alloc=0 init_on_free=0"

	got := MergeKernelArgs(
		"console=ttyS0 foo=1",
		"foo=2",
		"init_on_alloc=0 init_on_free=0",
	)

	assert.Equal(t, want, got)
}
92 changes: 92 additions & 0 deletions lib/guestmemory/policy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package guestmemory

// KernelPageInitMode controls guest kernel page initialization behavior.
// Policy.KernelArgs translates the mode into init_on_alloc/init_on_free
// kernel arguments.
type KernelPageInitMode string

const (
// KernelPageInitPerformance minimizes guest page touching to preserve lazy host allocation.
// Policy.KernelArgs maps it to init_on_alloc=0 init_on_free=0.
KernelPageInitPerformance KernelPageInitMode = "performance"
// KernelPageInitHardened enforces page init-on-alloc/free hardening in the guest kernel.
// Policy.KernelArgs maps it to init_on_alloc=1 init_on_free=1.
KernelPageInitHardened KernelPageInitMode = "hardened"
)

// Policy is the normalized, hypervisor-agnostic guest memory policy.
type Policy struct {
// Enabled is the master switch. When false, Normalize forces ReclaimEnabled
// to false, KernelArgs returns nil and FeaturesForHypervisor returns the
// zero Features.
Enabled bool
// KernelPageInitMode selects the page-init kernel arguments. Normalize
// replaces an empty or unrecognized value with the DefaultPolicy mode.
KernelPageInitMode KernelPageInitMode
// ReclaimEnabled gates the features returned by FeaturesForHypervisor.
ReclaimEnabled bool
// VZBalloonRequired is copied into Features.RequireBalloon by
// FeaturesForHypervisor.
VZBalloonRequired bool
}

// Features are generic guest memory toggles consumed by hypervisor backends.
// Policy.FeaturesForHypervisor returns either the zero value (nothing enabled)
// or a value with the first four toggles all set.
type Features struct {
// EnableBalloon requests a memory balloon device.
EnableBalloon bool
// FreePageReporting requests free page reporting.
FreePageReporting bool
// DeflateOnOOM requests deflate-on-OOM behavior for the balloon.
DeflateOnOOM bool
// FreePageHinting requests free page hinting.
FreePageHinting bool
// RequireBalloon mirrors Policy.VZBalloonRequired; presumably backends treat
// a balloon setup failure as fatal when it is set - confirm per backend.
RequireBalloon bool
}

// DefaultPolicy returns the conservative baseline policy: the policy as a
// whole is disabled, while hardened page-init mode, reclaim and the VZ balloon
// requirement are pre-selected for when it is switched on.
func DefaultPolicy() Policy {
	var p Policy // Enabled stays at its zero value: false.
	p.KernelPageInitMode = KernelPageInitHardened
	p.ReclaimEnabled = true
	p.VZBalloonRequired = true
	return p
}

// Normalize returns a sanitized copy of the policy.
//
// An empty or unrecognized KernelPageInitMode is replaced by the mode from
// DefaultPolicy. A disabled policy additionally has ReclaimEnabled forced to
// false; its page-init mode and VZBalloonRequired are carried over. An enabled
// policy is otherwise returned unchanged.
func (p Policy) Normalize() Policy {
	mode := p.KernelPageInitMode
	switch mode {
	case KernelPageInitPerformance, KernelPageInitHardened:
		// Recognized mode: keep it as supplied.
	default:
		// Covers both the empty string and any unknown value.
		mode = DefaultPolicy().KernelPageInitMode
	}

	if p.Enabled {
		p.KernelPageInitMode = mode
		return p
	}

	// Disabled: reclaim can never be active, whatever the caller asked for.
	return Policy{
		Enabled:            false,
		KernelPageInitMode: mode,
		ReclaimEnabled:     false,
		VZBalloonRequired:  p.VZBalloonRequired,
	}
}

// KernelArgs returns the guest kernel arguments implied by the policy after
// normalization: nil when the policy is disabled, the init_on_alloc=1 /
// init_on_free=1 pair in hardened mode, and the =0 pair otherwise.
func (p Policy) KernelArgs() []string {
	effective := p.Normalize()
	if !effective.Enabled {
		return nil
	}

	if effective.KernelPageInitMode == KernelPageInitHardened {
		return []string{"init_on_alloc=1", "init_on_free=1"}
	}
	return []string{"init_on_alloc=0", "init_on_free=0"}
}

// FeaturesForHypervisor returns the generic memory features that backends
// translate into their native mechanisms. The zero Features is returned unless
// the normalized policy is both enabled and has reclaim enabled; otherwise
// balloon, free page reporting, deflate-on-OOM and free page hinting are all
// switched on, and RequireBalloon follows VZBalloonRequired.
func (p Policy) FeaturesForHypervisor() Features {
	var feats Features

	if effective := p.Normalize(); effective.Enabled && effective.ReclaimEnabled {
		feats = Features{
			EnableBalloon:     true,
			FreePageReporting: true,
			DeflateOnOOM:      true,
			FreePageHinting:   true,
			RequireBalloon:    effective.VZBalloonRequired,
		}
	}

	return feats
}
Loading