diff --git a/.github/workflows/dep_build_test.yml b/.github/workflows/dep_build_test.yml index 0c72baa66..e4d64fe75 100644 --- a/.github/workflows/dep_build_test.yml +++ b/.github/workflows/dep_build_test.yml @@ -105,6 +105,11 @@ jobs: # with only one driver enabled (kvm/mshv3 features are unix-only, no-op on Windows) just test ${{ inputs.config }} ${{ inputs.hypervisor == 'mshv3' && 'mshv3' || 'kvm' }} + - name: Run Rust tests with hw-interrupts + run: | + # with hw-interrupts feature enabled (+ explicit driver on Linux) + just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts' || 'kvm,hw-interrupts') || 'hw-interrupts' }} + - name: Run Rust Gdb tests env: RUST_LOG: debug diff --git a/Justfile b/Justfile index dda6897bb..fa3bf617d 100644 --- a/Justfile +++ b/Justfile @@ -87,6 +87,9 @@ test-like-ci config=default-target hypervisor="kvm": @# with only one driver enabled + build-metadata + init-paging just test {{config}} build-metadata,init-paging,{{ if hypervisor == "mshv3" {"mshv3"} else {"kvm"} }} + @# with hw-interrupts enabled (+ explicit driver on Linux) + {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts" } else { "just test " + config + " kvm,hw-interrupts" } } else { "just test " + config + " hw-interrupts" } }} + @# make sure certain cargo features compile just check @@ -151,6 +154,9 @@ build-test-like-ci config=default-target hypervisor="kvm": @# Run Rust tests with single driver {{ if os() == "linux" { "just test " + config+ " " + if hypervisor == "mshv3" { "mshv3" } else { "kvm" } } else { "" } }} + @# Run Rust tests with hw-interrupts + {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts" } else { "just test " + config + " kvm,hw-interrupts" } } else { "just test " + config + " hw-interrupts" } }} + @# Run Rust Gdb tests just test-rust-gdb-debugging {{config}} @@ -283,6 +289,7 @@ check: {{ cargo-cmd }} check -p hyperlight-host --features print_debug {{ target-triple-flag }} {{ cargo-cmd }} check -p hyperlight-host --features gdb {{ target-triple-flag }} {{ cargo-cmd }} check -p hyperlight-host --features trace_guest,mem_profile {{ target-triple-flag }} + {{ cargo-cmd }} check -p hyperlight-host --features hw-interrupts {{ target-triple-flag }} fmt-check: rustup +nightly component list | grep -q "rustfmt.*installed" || rustup component add rustfmt --toolchain nightly diff --git a/src/hyperlight_common/src/arch/i686/layout.rs b/src/hyperlight_common/src/arch/i686/layout.rs index 68cc2cf1b..f3601c643 100644 --- a/src/hyperlight_common/src/arch/i686/layout.rs +++ b/src/hyperlight_common/src/arch/i686/layout.rs @@ -17,11 +17,9 @@ limitations under the License. // This file is just dummy definitions at the moment, in order to // allow compiling the guest for real mode boot scenarios. -pub const MAX_GVA: usize = 0xffff_efff; -pub const SNAPSHOT_PT_GVA_MIN: usize = 0xef00_0000; -pub const SNAPSHOT_PT_GVA_MAX: usize = 0xefff_efff; +pub const MAX_GVA: usize = 0xffff_ffff; pub const MAX_GPA: usize = 0xffff_ffff; -pub fn min_scratch_size() -> usize { - 1 * crate::vmem::PAGE_SIZE +pub fn min_scratch_size(_input_data_size: usize, _output_data_size: usize) -> usize { + crate::vmem::PAGE_SIZE } diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 215a80d87..aa62bea32 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -14,11 +14,20 @@ See the License for the specific language governing permissions and limitations under the License. */ -#[cfg_attr(target_arch = "x86_64", path = "arch/amd64/layout.rs")] #[cfg_attr(target_arch = "x86", path = "arch/i686/layout.rs")] +#[cfg_attr( + all(target_arch = "x86_64", feature = "init-paging"), + path = "arch/amd64/layout.rs" +)] +#[cfg_attr( + all(target_arch = "x86_64", not(feature = "init-paging")), + path = "arch/i686/layout.rs" +)] mod arch; -pub use arch::{MAX_GPA, MAX_GVA, SNAPSHOT_PT_GVA_MAX, SNAPSHOT_PT_GVA_MIN}; +pub use arch::{MAX_GPA, MAX_GVA}; +#[cfg(feature = "init-paging")] +pub use arch::{SNAPSHOT_PT_GVA_MAX, SNAPSHOT_PT_GVA_MIN}; // offsets down from the top of scratch memory for various things pub const SCRATCH_TOP_SIZE_OFFSET: u64 = 0x08; diff --git a/src/hyperlight_common/src/outb.rs b/src/hyperlight_common/src/outb.rs index b060ae5cf..9cba7f4a7 100644 --- a/src/hyperlight_common/src/outb.rs +++ b/src/hyperlight_common/src/outb.rs @@ -104,6 +104,15 @@ pub enum OutBAction { TraceMemoryAlloc = 105, #[cfg(feature = "mem_profile")] TraceMemoryFree = 106, + /// IO port for PV timer configuration. The guest writes a 32-bit + /// LE value representing the desired timer period in microseconds. + /// A value of 0 disables the timer. + PvTimerConfig = 107, + /// IO port the guest writes to signal "I'm done" to the host. + /// This replaces the `hlt` instruction for halt signaling so that + /// KVM's in-kernel LAPIC (which absorbs HLT exits) does not interfere + /// with hyperlight's halt-based guest-host protocol. + Halt = 108, } impl TryFrom for OutBAction { @@ -120,6 +129,8 @@ impl TryFrom for OutBAction { 105 => Ok(OutBAction::TraceMemoryAlloc), #[cfg(feature = "mem_profile")] 106 => Ok(OutBAction::TraceMemoryFree), + 107 => Ok(OutBAction::PvTimerConfig), + 108 => Ok(OutBAction::Halt), _ => Err(anyhow::anyhow!("Invalid OutBAction value: {}", val)), } } diff --git a/src/hyperlight_guest/src/exit.rs b/src/hyperlight_guest/src/exit.rs index e8d58e24e..1926340a4 100644 --- a/src/hyperlight_guest/src/exit.rs +++ b/src/hyperlight_guest/src/exit.rs @@ -18,6 +18,34 @@ use core::arch::asm; use core::ffi::{CStr, c_char}; use hyperlight_common::outb::OutBAction; +use tracing::instrument; + +/// Signal successful completion of the guest's work and return control +/// to the host. This replaces the previous `hlt`-based exit: under the +/// `hw-interrupts` feature, `hlt` becomes a wait-for-interrupt (the +/// in-kernel IRQ chip wakes the vCPU), so we use an explicit IO-port +/// write (port 108) to trigger a VM exit that the host treats as a +/// clean shutdown. +/// +/// This function never returns — the host does not re-enter the guest +/// after seeing the Halt port. +#[inline(never)] +#[instrument(skip_all, level = "Trace")] +pub fn halt() { + #[cfg(all(feature = "trace_guest", target_arch = "x86_64"))] + { + // Send data before halting + // If there is no data, this doesn't do anything + // The reason we do this here instead of in `hlt` asm function + // is to avoid allocating before halting, which leaks memory + // because the guest is not expected to resume execution after halting. + hyperlight_guest_tracing::flush(); + } + + unsafe { + out32(OutBAction::Halt as u16, 0); + } +} /// Exits the VM with an Abort OUT action and code 0. #[unsafe(no_mangle)] diff --git a/src/hyperlight_guest_bin/src/arch/amd64/dispatch.rs b/src/hyperlight_guest_bin/src/arch/amd64/dispatch.rs index bb80b30ed..a39de9020 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/dispatch.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/dispatch.rs @@ -68,6 +68,7 @@ core::arch::global_asm!(" mov cr4, rdi flush_done: call {internal_dispatch_function}\n - hlt\n + mov dx, 108\n + out dx, al\n .cfi_endproc ", internal_dispatch_function = sym crate::guest_function::call::internal_dispatch_function); diff --git a/src/hyperlight_guest_bin/src/arch/amd64/init.rs b/src/hyperlight_guest_bin/src/arch/amd64/init.rs index 09f3c4775..bfa458059 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/init.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/init.rs @@ -173,6 +173,7 @@ core::arch::global_asm!(" mov rsp, r8\n xor ebp, ebp\n call {generic_init}\n - hlt\n + mov dx, 108\n + out dx, al\n .cfi_endproc\n ", generic_init = sym crate::generic_init); diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index 84a112a05..24a4d238d 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -40,7 +40,7 @@ tracing = { version = "0.1.44", features = ["log"] } tracing-log = "0.2.0" tracing-core = "0.1.36" tracing-opentelemetry = { version = "0.32.1", optional = true } -hyperlight-common = { workspace = true, default-features = true, features = [ "std", "init-paging" ] } +hyperlight-common = { workspace = true, default-features = true, features = [ "std" ] } hyperlight-guest-tracing = { workspace = true, default-features = true, optional = true } vmm-sys-util = "0.15.0" crossbeam-channel = "0.5.15" @@ -133,11 +133,12 @@ trace_guest = ["dep:opentelemetry", "dep:tracing-opentelemetry", "dep:hyperlight mem_profile = [ "trace_guest", "dep:framehop", "dep:fallible-iterator", "hyperlight-common/mem_profile" ] kvm = ["dep:kvm-bindings", "dep:kvm-ioctls"] mshv3 = ["dep:mshv-bindings", "dep:mshv-ioctls"] +hw-interrupts = [] # This enables easy debug in the guest gdb = ["dep:gdbstub", "dep:gdbstub_arch"] fuzzing = ["hyperlight-common/fuzzing"] build-metadata = ["dep:built"] -init-paging = [] +init-paging = ["hyperlight-common/init-paging"] [[bench]] name = "benchmarks" diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs index d3a210926..05628de7e 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs @@ -2272,6 +2272,7 @@ mod tests { // Tests // ========================================================================== + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_simple() { // push rax; hlt - aligns stack to 16 bytes @@ -2417,6 +2418,7 @@ mod tests { use super::*; + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_regs() { let mut a = CodeAssembler::new(64).unwrap(); @@ -2476,6 +2478,7 @@ mod tests { assert_regs_reset(hyperlight_vm.vm.as_ref()); } + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_fpu() { #[cfg(kvm)] @@ -2607,6 +2610,7 @@ mod tests { } } + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_debug_regs() { let mut a = CodeAssembler::new(64).unwrap(); @@ -2649,6 +2653,7 @@ mod tests { assert_debug_regs_reset(hyperlight_vm.vm.as_ref()); } + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_sregs() { // Build code that modifies special registers and halts @@ -2702,6 +2707,7 @@ mod tests { /// Verifies guest-visible FPU state (via FXSAVE) is properly reset. /// Unlike tests using hypervisor API, this runs actual guest code with FXSAVE. + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn reset_vcpu_fpu_guest_visible_state() { let mut ctx = hyperlight_vm_with_mem_mgr_fxsave(); diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index a582ddb09..600bcd939 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -469,6 +469,7 @@ pub(crate) mod tests { use crate::sandbox::{SandboxConfiguration, UninitializedSandbox}; use crate::{Result, is_hypervisor_present, new_error}; + #[cfg_attr(feature = "hw-interrupts", ignore)] #[test] fn test_initialise() -> Result<()> { if !is_hypervisor_present() { diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/hw_interrupts.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/hw_interrupts.rs new file mode 100644 index 000000000..e20718fce --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/hw_interrupts.rs @@ -0,0 +1,253 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Shared helpers for the `hw-interrupts` feature, used by MSHV and WHP +//! backends. KVM uses an in-kernel IRQ chip so it handles most of +//! this transparently. + +/// Hardcoded timer interrupt vector. The guest remaps IRQ0 to +/// vector 0x20 via PIC ICW2, so we use that same vector directly +/// — no PIC state machine needed. +pub(crate) const TIMER_VECTOR: u32 = 0x20; + +/// Handle an IO IN request for hardware-interrupt related ports. +/// Returns `Some(value)` if the port was handled, `None` if the +/// port should be passed through to the guest handler. +/// +/// No PIC state machine — PIC data ports return 0xFF (all masked), +/// PIC command ports return 0 (no pending IRQ), PIT returns 0. +pub(crate) fn handle_io_in(port: u16) -> Option { + match port { + // PIC master/slave data ports — return "all masked" + 0x21 | 0xA1 => Some(0xFF), + // PIC master/slave command ports — return 0 (ISR/IRR read) + 0x20 | 0xA0 => Some(0), + // PIT data port read — return 0 + 0x40 => Some(0), + _ => None, + } +} + +/// Handle IO OUT requests for common legacy hardware ports +/// (PIC, PIT, PC speaker, diagnostic port). +/// +/// Returns `true` if the port was fully handled, `false` otherwise. +/// When the guest sends a non-specific EOI on the master PIC command +/// port (0x20, OCW2 byte with bits 7:5 = 001) and a timer is active, +/// `eoi_callback` is invoked to bridge the PIC EOI to the LAPIC. +pub(crate) fn handle_common_io_out( + port: u16, + data: &[u8], + timer_active: bool, + eoi_callback: impl FnOnce(), +) -> bool { + // PIC ports (0x20, 0x21, 0xA0, 0xA1): accept as no-ops. + // We only need LAPIC EOI bridging when the guest sends a + // non-specific EOI on the master PIC command port. + if port == 0x20 || port == 0x21 || port == 0xA0 || port == 0xA1 { + if port == 0x20 && !data.is_empty() && (data[0] & 0xE0) == 0x20 && timer_active { + eoi_callback(); + } + return true; + } + // PIT ports + if port == 0x43 || port == 0x40 { + return true; + } + // PC speaker + if port == 0x61 { + return true; + } + // Diagnostic port + if port == 0x80 { + return true; + } + false +} + +// --------------------------------------------------------------------------- +// LAPIC register helpers +// +// LAPIC register offsets from Intel SDM Vol. 3A, Table 11-1. +// These operate on a raw byte slice representing the LAPIC register page. +// --------------------------------------------------------------------------- + +/// Write a u32 to a LAPIC register page at the given APIC offset. +pub(crate) fn write_lapic_u32(state: &mut [u8], offset: usize, val: u32) { + state[offset..offset + 4].copy_from_slice(&val.to_le_bytes()); +} + +/// Read a u32 from a LAPIC register page at the given APIC offset. +pub(crate) fn read_lapic_u32(state: &[u8], offset: usize) -> u32 { + u32::from_le_bytes([ + state[offset], + state[offset + 1], + state[offset + 2], + state[offset + 3], + ]) +} + +/// Initialize LAPIC registers in a raw register page to sensible +/// defaults for timer interrupt delivery. +/// +/// Register values from Intel SDM Vol. 3A, Table 11-1: +/// - SVR (0xF0): bit 8 = enable APIC, bits 0-7 = spurious vector 0xFF +/// - TPR (0x80): 0 = accept all interrupt priorities +/// - DFR (0xE0): 0xFFFF_FFFF = flat model +/// - LDR (0xD0): logical APIC ID for flat model +/// - LINT0 (0x350): masked — not wired to PIC +/// - LINT1 (0x360): NMI delivery, not masked +/// - LVT Timer (0x320): masked — host timer thread injects instead +/// - LVT Error (0x370): masked +pub(crate) fn init_lapic_registers(state: &mut [u8]) { + write_lapic_u32(state, 0xF0, 0x1FF); + write_lapic_u32(state, 0x80, 0); + write_lapic_u32(state, 0xE0, 0xFFFF_FFFF); + write_lapic_u32(state, 0xD0, 1 << 24); + write_lapic_u32(state, 0x350, 0x0001_0000); + write_lapic_u32(state, 0x360, 0x400); + write_lapic_u32(state, 0x320, 0x0001_0000); + write_lapic_u32(state, 0x370, 0x0001_0000); +} + +/// Perform LAPIC EOI: clear the highest-priority in-service bit. +/// +/// The ISR is at LAPIC offset 0x100, organized as 8 × 32-bit words +/// (one per 16 bytes). Scans from highest priority (ISR\[7\]) to +/// lowest (ISR\[0\]). +pub(crate) fn lapic_eoi(state: &mut [u8]) { + for i in (0u32..8).rev() { + let offset = 0x100 + (i as usize) * 0x10; + let isr_val = read_lapic_u32(state, offset); + if isr_val != 0 { + let bit = 31 - isr_val.leading_zeros(); + write_lapic_u32(state, offset, isr_val & !(1u32 << bit)); + break; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn write_read_lapic_roundtrip() { + let mut state = vec![0u8; 1024]; + write_lapic_u32(&mut state, 0xF0, 0xDEAD_BEEF); + assert_eq!(read_lapic_u32(&state, 0xF0), 0xDEAD_BEEF); + } + + #[test] + fn write_read_lapic_multiple_offsets() { + let mut state = vec![0u8; 1024]; + write_lapic_u32(&mut state, 0x80, 0x1234_5678); + write_lapic_u32(&mut state, 0xF0, 0xABCD_EF01); + write_lapic_u32(&mut state, 0xE0, 0xFFFF_FFFF); + assert_eq!(read_lapic_u32(&state, 0x80), 0x1234_5678); + assert_eq!(read_lapic_u32(&state, 0xF0), 0xABCD_EF01); + assert_eq!(read_lapic_u32(&state, 0xE0), 0xFFFF_FFFF); + } + + #[test] + fn write_read_lapic_zero() { + let mut state = vec![0xFFu8; 1024]; + write_lapic_u32(&mut state, 0x80, 0); + assert_eq!(read_lapic_u32(&state, 0x80), 0); + } + + #[test] + fn write_does_not_clobber_neighbors() { + let mut state = vec![0u8; 1024]; + write_lapic_u32(&mut state, 0x80, 0xAAAA_BBBB); + assert_eq!(state[0x7F], 0); + assert_eq!(state[0x84], 0); + } + + #[test] + fn lapic_eoi_clears_highest_isr_bit() { + let mut state = vec![0u8; 1024]; + // Set bit 5 in ISR[0] (offset 0x100) + write_lapic_u32(&mut state, 0x100, 1 << 5); + lapic_eoi(&mut state); + assert_eq!(read_lapic_u32(&state, 0x100), 0); + } + + #[test] + fn lapic_eoi_clears_only_highest() { + let mut state = vec![0u8; 1024]; + // Set bits in ISR[0] and ISR[1] + write_lapic_u32(&mut state, 0x100, 0b11); // bits 0 and 1 + write_lapic_u32(&mut state, 0x110, 1 << 2); // bit 2 in ISR[1] + lapic_eoi(&mut state); + // ISR[1] should be cleared (higher priority), ISR[0] untouched + assert_eq!(read_lapic_u32(&state, 0x110), 0); + assert_eq!(read_lapic_u32(&state, 0x100), 0b11); + } + + #[test] + fn init_lapic_registers_sets_svr() { + let mut state = vec![0u8; 1024]; + init_lapic_registers(&mut state); + let svr = read_lapic_u32(&state, 0xF0); + assert_ne!(svr & 0x100, 0, "APIC enable bit should be set"); + assert_eq!(svr & 0xFF, 0xFF, "spurious vector should be 0xFF"); + } + + #[test] + fn handle_io_in_pic_ports() { + assert_eq!(handle_io_in(0x21), Some(0xFF)); + assert_eq!(handle_io_in(0xA1), Some(0xFF)); + assert_eq!(handle_io_in(0x20), Some(0)); + assert_eq!(handle_io_in(0xA0), Some(0)); + assert_eq!(handle_io_in(0x40), Some(0)); + assert_eq!(handle_io_in(0x42), None); + } + + #[test] + fn handle_common_io_out_pic_eoi() { + let mut eoi_called = false; + // Non-specific EOI (0x20) on master PIC command port + let handled = handle_common_io_out(0x20, &[0x20], true, || eoi_called = true); + assert!(handled); + assert!(eoi_called); + } + + #[test] + fn handle_common_io_out_no_eoi_when_timer_inactive() { + let mut eoi_called = false; + let handled = handle_common_io_out(0x20, &[0x20], false, || eoi_called = true); + assert!(handled); + assert!(!eoi_called); + } + + #[test] + fn handle_common_io_out_pit_ports() { + assert!(handle_common_io_out(0x40, &[], false, || {})); + assert!(handle_common_io_out(0x43, &[], false, || {})); + } + + #[test] + fn handle_common_io_out_speaker_and_diag() { + assert!(handle_common_io_out(0x61, &[], false, || {})); + assert!(handle_common_io_out(0x80, &[], false, || {})); + } + + #[test] + fn handle_common_io_out_unknown_port() { + assert!(!handle_common_io_out(0x100, &[], false, || {})); + } +} diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm.rs index a0382ab6c..be64581b2 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm.rs @@ -14,8 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ +#[cfg(feature = "hw-interrupts")] +use std::sync::Arc; use std::sync::LazyLock; +#[cfg(feature = "hw-interrupts")] +use std::sync::atomic::{AtomicBool, Ordering}; +use hyperlight_common::outb::OutBAction; #[cfg(gdb)] use kvm_bindings::kvm_guest_debug; use kvm_bindings::{ @@ -26,6 +31,8 @@ use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd}; use tracing::{Span, instrument}; #[cfg(feature = "trace_guest")] use tracing_opentelemetry::OpenTelemetrySpanExt; +#[cfg(feature = "hw-interrupts")] +use vmm_sys_util::eventfd::EventFd; #[cfg(gdb)] use crate::hypervisor::gdb::{DebugError, DebuggableVm}; @@ -33,6 +40,8 @@ use crate::hypervisor::regs::{ CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT, }; +#[cfg(feature = "hw-interrupts")] +use crate::hypervisor::virtual_machine::HypervisorError; #[cfg(all(test, feature = "init-paging"))] use crate::hypervisor::virtual_machine::XSAVE_BUFFER_SIZE; use crate::hypervisor::virtual_machine::{ @@ -89,6 +98,17 @@ pub(crate) struct KvmVm { vm_fd: VmFd, vcpu_fd: VcpuFd, + /// EventFd registered via irqfd for GSI 0 (IRQ0). A timer thread + /// writes to this to inject periodic timer interrupts. + #[cfg(feature = "hw-interrupts")] + timer_irq_eventfd: EventFd, + /// Signals the timer thread to stop. + #[cfg(feature = "hw-interrupts")] + timer_stop: Arc, + /// Handle to the background timer thread (if started). + #[cfg(feature = "hw-interrupts")] + timer_thread: Option>, + // KVM, as opposed to mshv/whp, has no get_guest_debug() ioctl, so we must track the state ourselves #[cfg(gdb)] debug_regs: kvm_guest_debug, @@ -106,6 +126,39 @@ impl KvmVm { let vm_fd = hv .create_vm_with_type(0) .map_err(|e| CreateVmError::CreateVmFd(e.into()))?; + + // When hw-interrupts is enabled, create the in-kernel IRQ chip + // (PIC + IOAPIC + LAPIC) before creating the vCPU so the + // per-vCPU LAPIC is initialised in virtual-wire mode (LINT0 = ExtINT). + // The guest programs the PIC remap via standard IO port writes, + // which the in-kernel PIC handles transparently. + // + // Instead of creating an in-kernel PIT (create_pit2), we use a + // host-side timer thread + irqfd to inject IRQ0 at the rate + // requested by the guest via OutBAction::PvTimerConfig (port 107). + // This eliminates the in-kernel PIT device. Guest PIT port writes + // (0x40, 0x43) become no-ops handled in the run loop. + #[cfg(feature = "hw-interrupts")] + let timer_irq_eventfd = { + vm_fd + .create_irq_chip() + .map_err(|e| CreateVmError::InitializeVm(e.into()))?; + + // Create an EventFd and register it via irqfd for GSI 0 (IRQ0). + // When the timer thread writes to this EventFd, the in-kernel + // PIC will assert IRQ0, which is delivered as the vector the + // guest configured during PIC remap (typically vector 0x20). + let eventfd = EventFd::new(0).map_err(|e| { + CreateVmError::InitializeVm( + kvm_ioctls::Error::new(e.raw_os_error().unwrap_or(libc::EIO)).into(), + ) + })?; + vm_fd + .register_irqfd(&eventfd, 0) + .map_err(|e| CreateVmError::InitializeVm(e.into()))?; + eventfd + }; + let vcpu_fd = vm_fd .create_vcpu(0) .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?; @@ -132,6 +185,12 @@ impl KvmVm { Ok(Self { vm_fd, vcpu_fd, + #[cfg(feature = "hw-interrupts")] + timer_irq_eventfd, + #[cfg(feature = "hw-interrupts")] + timer_stop: Arc::new(AtomicBool::new(false)), + #[cfg(feature = "hw-interrupts")] + timer_thread: None, #[cfg(gdb)] debug_regs: kvm_guest_debug::default(), }) @@ -171,8 +230,91 @@ impl VirtualMachine for KvmVm { // it sets the guest span, no other traces or spans must be setup in between these calls. #[cfg(feature = "trace_guest")] tc.setup_guest_trace(Span::current().context()); + + // When hw-interrupts is enabled, the in-kernel PIC + LAPIC deliver + // interrupts triggered by the host-side timer thread via irqfd. + // There is no in-kernel PIT; guest PIT port writes are no-ops. + // The guest signals "I'm done" by writing to OutBAction::Halt + // (an IO port exit) instead of using HLT, because the in-kernel + // LAPIC absorbs HLT (never returns VcpuExit::Hlt to userspace). + #[cfg(feature = "hw-interrupts")] + loop { + match self.vcpu_fd.run() { + Ok(VcpuExit::Hlt) => { + // The in-kernel LAPIC normally handles HLT internally. + // If we somehow get here, just re-enter the guest. + continue; + } + Ok(VcpuExit::IoOut(port, data)) => { + if port == OutBAction::Halt as u16 { + // Stop the timer thread before returning. + self.timer_stop.store(true, Ordering::Relaxed); + if let Some(h) = self.timer_thread.take() { + let _ = h.join(); + } + return Ok(VmExit::Halt()); + } + if port == OutBAction::PvTimerConfig as u16 { + // The guest is configuring the timer period. + // Extract the period in microseconds (LE u32). + if let Ok(bytes) = data[..4].try_into() { + let period_us = u32::from_le_bytes(bytes) as u64; + if period_us > 0 && self.timer_thread.is_none() { + // Reset the stop flag — a previous halt (e.g. the + // init halt during evolve()) may have set it. + self.timer_stop.store(false, Ordering::Relaxed); + let eventfd = self.timer_irq_eventfd.try_clone().map_err(|e| { + RunVcpuError::Unknown(HypervisorError::KvmError(e.into())) + })?; + let stop = self.timer_stop.clone(); + let period = std::time::Duration::from_micros(period_us); + self.timer_thread = Some(std::thread::spawn(move || { + while !stop.load(Ordering::Relaxed) { + std::thread::sleep(period); + if stop.load(Ordering::Relaxed) { + break; + } + let _ = eventfd.write(1); + } + })); + } + } + continue; + } + // PIT ports (0x40-0x43): no in-kernel PIT, so these + // exit to userspace. Silently ignore them. + if (0x40..=0x43).contains(&port) { + continue; + } + return Ok(VmExit::IoOut(port, data.to_vec())); + } + Ok(VcpuExit::MmioRead(addr, _)) => return Ok(VmExit::MmioRead(addr)), + Ok(VcpuExit::MmioWrite(addr, _)) => return Ok(VmExit::MmioWrite(addr)), + #[cfg(gdb)] + Ok(VcpuExit::Debug(debug_exit)) => { + return Ok(VmExit::Debug { + dr6: debug_exit.dr6, + exception: debug_exit.exception, + }); + } + Err(e) => match e.errno() { + libc::EINTR => return Ok(VmExit::Cancelled()), + libc::EAGAIN => continue, + _ => return Err(RunVcpuError::Unknown(e.into())), + }, + Ok(other) => { + return Ok(VmExit::Unknown(format!( + "Unknown KVM VCPU exit: {:?}", + other + ))); + } + } + } + + #[cfg(not(feature = "hw-interrupts"))] match self.vcpu_fd.run() { Ok(VcpuExit::Hlt) => Ok(VmExit::Halt()), + Ok(VcpuExit::IoOut(port, _)) if port == OutBAction::Halt as u16 => Ok(VmExit::Halt()), Ok(VcpuExit::IoOut(port, data)) => Ok(VmExit::IoOut(port, data.to_vec())), Ok(VcpuExit::MmioRead(addr, _)) => Ok(VmExit::MmioRead(addr)), Ok(VcpuExit::MmioWrite(addr, _)) => Ok(VmExit::MmioWrite(addr)), @@ -428,3 +570,29 @@ impl DebuggableVm for KvmVm { Ok(()) } } + +#[cfg(feature = "hw-interrupts")] +impl Drop for KvmVm { + fn drop(&mut self) { + self.timer_stop.store(true, Ordering::Relaxed); + if let Some(h) = self.timer_thread.take() { + let _ = h.join(); + } + } +} + +#[cfg(test)] +#[cfg(feature = "hw-interrupts")] +mod hw_interrupt_tests { + use super::*; + + #[test] + fn halt_port_is_not_standard_device() { + // OutBAction::Halt port must not overlap in-kernel PIC/PIT/speaker ports + const HALT: u16 = OutBAction::Halt as u16; + const _: () = assert!(HALT != 0x20 && HALT != 0x21); + const _: () = assert!(HALT != 0xA0 && HALT != 0xA1); + const _: () = assert!(HALT != 0x40 && HALT != 0x41 && HALT != 0x42 && HALT != 0x43); + const _: () = assert!(HALT != 0x61); + } +} diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index 82e05c104..469f84d9d 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -38,6 +38,10 @@ pub(crate) mod mshv; #[cfg(target_os = "windows")] pub(crate) mod whp; +/// Shared helpers for hardware interrupt support (MSHV and WHP) +#[cfg(feature = "hw-interrupts")] +pub(crate) mod hw_interrupts; + static AVAILABLE_HYPERVISOR: OnceLock> = OnceLock::new(); /// Returns which type of hypervisor is available, if any @@ -136,10 +140,10 @@ pub(crate) enum VmExit { Unknown(String), /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN #[cfg_attr( - target_os = "windows", + any(target_os = "windows", feature = "hw-interrupts"), expect( dead_code, - reason = "Retry() is never constructed on Windows, but it is still matched on (which dead_code lint ignores)" + reason = "Retry() is never constructed on Windows or with hw-interrupts (EAGAIN causes continue instead)" ) )] Retry(), diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv.rs index 705764dd6..0c698d94b 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv.rs @@ -16,8 +16,15 @@ limitations under the License. #[cfg(gdb)] use std::fmt::Debug; +#[cfg(feature = "hw-interrupts")] +use std::sync::Arc; use std::sync::LazyLock; +#[cfg(feature = "hw-interrupts")] +use std::sync::atomic::{AtomicBool, Ordering}; +use hyperlight_common::outb::OutBAction; +#[cfg(feature = "hw-interrupts")] +use mshv_bindings::LapicState; #[cfg(gdb)] use mshv_bindings::{DebugRegisters, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT}; use mshv_bindings::{ @@ -26,8 +33,15 @@ use mshv_bindings::{ hv_message_type_HVMSG_X64_HALT, hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT, hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES, hv_partition_synthetic_processor_features, hv_register_assoc, - hv_register_name_HV_X64_REGISTER_RIP, hv_register_value, mshv_user_mem_region, + hv_register_name_HV_X64_REGISTER_RIP, hv_register_value, mshv_create_partition_v2, + mshv_user_mem_region, }; +#[cfg(feature = "hw-interrupts")] +use mshv_bindings::{ + hv_interrupt_type_HV_X64_INTERRUPT_TYPE_FIXED, hv_register_name_HV_X64_REGISTER_RAX, +}; +#[cfg(feature = "hw-interrupts")] +use mshv_ioctls::InterruptRequest; use mshv_ioctls::{Mshv, VcpuFd, VmFd}; use tracing::{Span, instrument}; #[cfg(feature = "trace_guest")] @@ -65,35 +79,69 @@ pub(crate) fn is_hypervisor_present() -> bool { /// A MSHV implementation of a single-vcpu VM #[derive(Debug)] pub(crate) struct MshvVm { + /// VmFd wrapped in Arc so the timer thread can call + /// `request_virtual_interrupt` from a background thread. + #[cfg(feature = "hw-interrupts")] + vm_fd: Arc, + #[cfg(not(feature = "hw-interrupts"))] vm_fd: VmFd, vcpu_fd: VcpuFd, + /// Signals the timer thread to stop. + #[cfg(feature = "hw-interrupts")] + timer_stop: Arc, + /// Handle to the background timer thread (if started). + #[cfg(feature = "hw-interrupts")] + timer_thread: Option>, } static MSHV: LazyLock> = LazyLock::new(|| Mshv::new().map_err(|e| CreateVmError::HypervisorNotAvailable(e.into()))); +/// Cast MSHV `LapicState.regs` (`[c_char; 1024]`) to a `&[u8]` slice +/// for use with the shared LAPIC helpers. +#[cfg(feature = "hw-interrupts")] +fn lapic_regs_as_u8(regs: &[::std::os::raw::c_char; 1024]) -> &[u8] { + // Safety: c_char (i8) and u8 have the same size and alignment; + // LAPIC register values are treated as raw bytes. + unsafe { &*(regs as *const [::std::os::raw::c_char; 1024] as *const [u8; 1024]) } +} + +/// Cast MSHV `LapicState.regs` (`[c_char; 1024]`) to a `&mut [u8]` slice +/// for use with the shared LAPIC helpers. +#[cfg(feature = "hw-interrupts")] +fn lapic_regs_as_u8_mut(regs: &mut [::std::os::raw::c_char; 1024]) -> &mut [u8] { + // Safety: same as above. + unsafe { &mut *(regs as *mut [::std::os::raw::c_char; 1024] as *mut [u8; 1024]) } +} + impl MshvVm { /// Create a new instance of a MshvVm #[instrument(skip_all, parent = Span::current(), level = "Trace")] pub(crate) fn new() -> std::result::Result { let mshv = MSHV.as_ref().map_err(|e| e.clone())?; - let pr = Default::default(); - // It's important to avoid create_vm() and explicitly use - // create_vm_with_args() with an empty arguments structure - // here, because otherwise the partition is set up with a SynIC. + #[allow(unused_mut)] + let mut pr: mshv_create_partition_v2 = Default::default(); + // Enable LAPIC for hw-interrupts — required for interrupt delivery + // via request_virtual_interrupt. MSHV_PT_BIT_LAPIC = bit 0. + #[cfg(feature = "hw-interrupts")] + { + pr.pt_flags = 1u64; // LAPIC + } let vm_fd = mshv .create_vm_with_args(&pr) .map_err(|e| CreateVmError::CreateVmFd(e.into()))?; let vcpu_fd = { let features: hv_partition_synthetic_processor_features = Default::default(); + vm_fd .set_partition_property( hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES, unsafe { features.as_uint64[0] }, ) .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?; + vm_fd .initialize() .map_err(|e| CreateVmError::InitializeVm(e.into()))?; @@ -103,7 +151,35 @@ impl MshvVm { .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))? }; - Ok(Self { vm_fd, vcpu_fd }) + // Initialize the virtual LAPIC when hw-interrupts is enabled. + // LAPIC defaults to disabled (SVR bit 8 = 0), which means no APIC + // interrupts can be delivered (request_virtual_interrupt would fail). + #[cfg(feature = "hw-interrupts")] + { + use super::hw_interrupts::init_lapic_registers; + + let mut lapic: LapicState = vcpu_fd + .get_lapic() + .map_err(|e| CreateVmError::InitializeVm(e.into()))?; + + init_lapic_registers(lapic_regs_as_u8_mut(&mut lapic.regs)); + + vcpu_fd + .set_lapic(&lapic) + .map_err(|e| CreateVmError::InitializeVm(e.into()))?; + } + + Ok(Self { + #[cfg(feature = "hw-interrupts")] + vm_fd: Arc::new(vm_fd), + #[cfg(not(feature = "hw-interrupts"))] + vm_fd, + vcpu_fd, + #[cfg(feature = "hw-interrupts")] + timer_stop: Arc::new(AtomicBool::new(false)), + #[cfg(feature = "hw-interrupts")] + timer_thread: None, + }) } } @@ -128,6 +204,7 @@ impl VirtualMachine for MshvVm { .map_err(|e| UnmapMemoryError::Hypervisor(e.into())) } + #[cfg_attr(not(feature = "hw-interrupts"), allow(clippy::never_loop))] fn run_vcpu( &mut self, #[cfg(feature = "trace_guest")] tc: &mut SandboxTraceContext, @@ -144,82 +221,158 @@ impl VirtualMachine for MshvVm { // it sets the guest span, no other traces or spans must be setup in between these calls. #[cfg(feature = "trace_guest")] tc.setup_guest_trace(Span::current().context()); - let exit_reason = self.vcpu_fd.run(); - - let result = match exit_reason { - Ok(m) => match m.header.message_type { - HALT_MESSAGE => VmExit::Halt(), - IO_PORT_INTERCEPT_MESSAGE => { - let io_message = m - .to_ioport_info() - .map_err(|_| RunVcpuError::DecodeIOMessage(m.header.message_type))?; - let port_number = io_message.port_number; - let rip = io_message.header.rip; - let rax = io_message.rax; - let instruction_length = io_message.header.instruction_length() as u64; - - // mshv, unlike kvm, does not automatically increment RIP - self.vcpu_fd - .set_reg(&[hv_register_assoc { - name: hv_register_name_HV_X64_REGISTER_RIP, - value: hv_register_value { - reg64: rip + instruction_length, - }, - ..Default::default() - }]) - .map_err(|e| RunVcpuError::IncrementRip(e.into()))?; - VmExit::IoOut(port_number, rax.to_le_bytes().to_vec()) - } - UNMAPPED_GPA_MESSAGE => { - let mimo_message = m - .to_memory_info() - .map_err(|_| RunVcpuError::DecodeIOMessage(m.header.message_type))?; - let addr = mimo_message.guest_physical_address; - match MemoryRegionFlags::try_from(mimo_message) - .map_err(|_| RunVcpuError::ParseGpaAccessInfo)? - { - MemoryRegionFlags::READ => VmExit::MmioRead(addr), - MemoryRegionFlags::WRITE => VmExit::MmioWrite(addr), - _ => VmExit::Unknown("Unknown MMIO access".to_string()), + + loop { + let exit_reason = self.vcpu_fd.run(); + + match exit_reason { + Ok(m) => { + let msg_type = m.header.message_type; + match msg_type { + HALT_MESSAGE => { + // With timer thread active, re-enter the guest. + // The hypervisor will deliver pending timer + // interrupts on the next run(), waking the + // vCPU from HLT. + #[cfg(feature = "hw-interrupts")] + if self.timer_thread.is_some() { + continue; + } + return Ok(VmExit::Halt()); + } + IO_PORT_INTERCEPT_MESSAGE => { + let io_message = m + .to_ioport_info() + .map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?; + let port_number = io_message.port_number; + let rip = io_message.header.rip; + let rax = io_message.rax; + let instruction_length = io_message.header.instruction_length() as u64; + let is_write = io_message.header.intercept_access_type != 0; + + // mshv, unlike kvm, does not automatically increment RIP + self.vcpu_fd + .set_reg(&[hv_register_assoc { + name: hv_register_name_HV_X64_REGISTER_RIP, + value: hv_register_value { + reg64: rip + instruction_length, + }, + ..Default::default() + }]) + .map_err(|e| RunVcpuError::IncrementRip(e.into()))?; + + // OutBAction::Halt always means "I'm done", regardless + // of whether a timer is active. + if is_write && port_number == OutBAction::Halt as u16 { + // Stop the timer thread before returning. + #[cfg(feature = "hw-interrupts")] + { + self.timer_stop.store(true, Ordering::Relaxed); + if let Some(h) = self.timer_thread.take() { + let _ = h.join(); + } + } + return Ok(VmExit::Halt()); + } + + #[cfg(feature = "hw-interrupts")] + { + if is_write { + let data = rax.to_le_bytes(); + if self.handle_hw_io_out(port_number, &data) { + continue; + } + } else if let Some(val) = + super::hw_interrupts::handle_io_in(port_number) + { + self.vcpu_fd + .set_reg(&[hv_register_assoc { + name: hv_register_name_HV_X64_REGISTER_RAX, + value: hv_register_value { reg64: val }, + ..Default::default() + }]) + .map_err(|e| RunVcpuError::Unknown(e.into()))?; + continue; + } + } + + // Suppress unused variable warning when hw-interrupts is disabled + let _ = is_write; + + return Ok(VmExit::IoOut(port_number, rax.to_le_bytes().to_vec())); + } + UNMAPPED_GPA_MESSAGE => { + let mimo_message = m + .to_memory_info() + .map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?; + let addr = mimo_message.guest_physical_address; + return match MemoryRegionFlags::try_from(mimo_message) + .map_err(|_| RunVcpuError::ParseGpaAccessInfo)? + { + MemoryRegionFlags::READ => Ok(VmExit::MmioRead(addr)), + MemoryRegionFlags::WRITE => Ok(VmExit::MmioWrite(addr)), + _ => Ok(VmExit::Unknown("Unknown MMIO access".to_string())), + }; + } + INVALID_GPA_ACCESS_MESSAGE => { + let mimo_message = m + .to_memory_info() + .map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?; + let gpa = mimo_message.guest_physical_address; + let access_info = MemoryRegionFlags::try_from(mimo_message) + .map_err(|_| RunVcpuError::ParseGpaAccessInfo)?; + return match access_info { + MemoryRegionFlags::READ => Ok(VmExit::MmioRead(gpa)), + MemoryRegionFlags::WRITE => Ok(VmExit::MmioWrite(gpa)), + _ => Ok(VmExit::Unknown("Unknown MMIO access".to_string())), + }; + } + #[cfg(gdb)] + EXCEPTION_INTERCEPT => { + let ex_info = m + .to_exception_info() + .map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?; + let DebugRegisters { dr6, .. } = self + .vcpu_fd + .get_debug_regs() + .map_err(|e| RunVcpuError::GetDr6(e.into()))?; + return Ok(VmExit::Debug { + dr6, + exception: ex_info.exception_vector as u32, + }); + } + other => { + return Ok(VmExit::Unknown(format!( + "Unknown MSHV VCPU exit: {:?}", + other + ))); + } } } - INVALID_GPA_ACCESS_MESSAGE => { - let mimo_message = m - .to_memory_info() - .map_err(|_| RunVcpuError::DecodeIOMessage(m.header.message_type))?; - let gpa = mimo_message.guest_physical_address; - let access_info = MemoryRegionFlags::try_from(mimo_message) - .map_err(|_| RunVcpuError::ParseGpaAccessInfo)?; - match access_info { - MemoryRegionFlags::READ => VmExit::MmioRead(gpa), - MemoryRegionFlags::WRITE => VmExit::MmioWrite(gpa), - _ => VmExit::Unknown("Unknown MMIO access".to_string()), + Err(e) => match e.errno() { + libc::EINTR => { + // When the timer thread is active, EINTR may be + // a spurious signal. Continue the run loop to + // let the hypervisor deliver any pending timer + // interrupt. + #[cfg(feature = "hw-interrupts")] + if self.timer_thread.is_some() { + continue; + } + return Ok(VmExit::Cancelled()); } - } - #[cfg(gdb)] - EXCEPTION_INTERCEPT => { - let ex_info = m - .to_exception_info() - .map_err(|_| RunVcpuError::DecodeIOMessage(m.header.message_type))?; - let DebugRegisters { dr6, .. } = self - .vcpu_fd - .get_debug_regs() - .map_err(|e| RunVcpuError::GetDr6(e.into()))?; - VmExit::Debug { - dr6, - exception: ex_info.exception_vector as u32, + libc::EAGAIN => { + #[cfg(not(feature = "hw-interrupts"))] + { + return Ok(VmExit::Retry()); + } + #[cfg(feature = "hw-interrupts")] + continue; } - } - other => VmExit::Unknown(format!("Unknown MSHV VCPU exit: {:?}", other)), - }, - Err(e) => match e.errno() { - // InterruptHandle::kill() sends a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR - libc::EINTR => VmExit::Cancelled(), - libc::EAGAIN => VmExit::Retry(), - _ => Err(RunVcpuError::Unknown(e.into()))?, - }, - }; - Ok(result) + _ => return Err(RunVcpuError::Unknown(e.into())), + }, + } + } } fn regs(&self) -> std::result::Result { @@ -463,3 +616,135 @@ impl DebuggableVm for MshvVm { } } } + +#[cfg(feature = "hw-interrupts")] +impl MshvVm { + /// Standard x86 APIC base MSR value: base address 0xFEE00000 + + /// BSP flag (bit 8) + global enable (bit 11). + const APIC_BASE_DEFAULT: u64 = 0xFEE00900; + + /// Perform LAPIC EOI: clear the highest-priority in-service bit. + /// Called when the guest sends PIC EOI, since the timer thread + /// delivers interrupts through the LAPIC and the guest only + /// acknowledges via PIC. + fn do_lapic_eoi(&self) { + if let Ok(mut lapic) = self.vcpu_fd.get_lapic() { + super::hw_interrupts::lapic_eoi(lapic_regs_as_u8_mut(&mut lapic.regs)); + let _ = self.vcpu_fd.set_lapic(&lapic); + } + } + + fn handle_hw_io_out(&mut self, port: u16, data: &[u8]) -> bool { + if port == OutBAction::PvTimerConfig as u16 { + if data.len() >= 4 { + let period_us = u32::from_le_bytes([data[0], data[1], data[2], data[3]]); + if period_us > 0 && self.timer_thread.is_none() { + // Reset the stop flag — a previous halt (e.g. the + // init halt during evolve()) may have set it. + self.timer_stop.store(false, Ordering::Relaxed); + // Re-enable LAPIC if the guest disabled it (via WRMSR + // to MSR 0x1B clearing bit 11). Some guests clear + // the global APIC enable when no I/O APIC is detected. + // + // The hypervisor may return 0 for APIC_BASE when the + // APIC is globally disabled, so we always restore the + // standard value (0xFEE00900). + use mshv_bindings::hv_register_name_HV_X64_REGISTER_APIC_BASE; + let mut apic_base_reg = [hv_register_assoc { + name: hv_register_name_HV_X64_REGISTER_APIC_BASE, + value: hv_register_value { reg64: 0 }, + ..Default::default() + }]; + if self.vcpu_fd.get_reg(&mut apic_base_reg).is_ok() { + let cur = unsafe { apic_base_reg[0].value.reg64 }; + if cur & (1 << 11) == 0 { + let _ = self.vcpu_fd.set_reg(&[hv_register_assoc { + name: hv_register_name_HV_X64_REGISTER_APIC_BASE, + value: hv_register_value { + reg64: Self::APIC_BASE_DEFAULT, + }, + ..Default::default() + }]); + } + } + // Re-initialize LAPIC SVR (may have been zeroed when + // guest disabled the APIC globally) + if let Ok(mut lapic) = self.vcpu_fd.get_lapic() { + let regs = lapic_regs_as_u8(&lapic.regs); + let svr = super::hw_interrupts::read_lapic_u32(regs, 0xF0); + if svr & 0x100 == 0 { + let regs_mut = lapic_regs_as_u8_mut(&mut lapic.regs); + super::hw_interrupts::write_lapic_u32(regs_mut, 0xF0, 0x1FF); + super::hw_interrupts::write_lapic_u32(regs_mut, 0x80, 0); // TPR + let _ = self.vcpu_fd.set_lapic(&lapic); + } + } + + let vm_fd = self.vm_fd.clone(); + let vector = super::hw_interrupts::TIMER_VECTOR; + let stop = self.timer_stop.clone(); + let period = std::time::Duration::from_micros(period_us as u64); + self.timer_thread = Some(std::thread::spawn(move || { + while !stop.load(Ordering::Relaxed) { + std::thread::sleep(period); + if stop.load(Ordering::Relaxed) { + break; + } + let _ = vm_fd.request_virtual_interrupt(&InterruptRequest { + interrupt_type: hv_interrupt_type_HV_X64_INTERRUPT_TYPE_FIXED, + apic_id: 0, + vector, + level_triggered: false, + logical_destination_mode: false, + long_mode: false, + }); + } + })); + } + } + return true; + } + let timer_active = self.timer_thread.is_some(); + super::hw_interrupts::handle_common_io_out(port, data, timer_active, || self.do_lapic_eoi()) + } +} + +#[cfg(feature = "hw-interrupts")] +impl Drop for MshvVm { + fn drop(&mut self) { + self.timer_stop.store(true, Ordering::Relaxed); + if let Some(h) = self.timer_thread.take() { + let _ = h.join(); + } + } +} + +#[cfg(test)] +#[cfg(feature = "hw-interrupts")] +mod hw_interrupt_tests { + use super::*; + + #[test] + fn lapic_regs_conversion_roundtrip() { + let mut regs = [0i8; 1024]; + let bytes = lapic_regs_as_u8_mut(&mut regs); + super::super::hw_interrupts::write_lapic_u32(bytes, 0xF0, 0xDEAD_BEEF); + let bytes = lapic_regs_as_u8(®s); + assert_eq!( + super::super::hw_interrupts::read_lapic_u32(bytes, 0xF0), + 0xDEAD_BEEF + ); + } + + #[test] + fn apic_base_default_value() { + let base = MshvVm::APIC_BASE_DEFAULT; + assert_ne!(base & (1 << 8), 0, "BSP flag should be set"); + assert_ne!(base & (1 << 11), 0, "global enable should be set"); + assert_eq!( + base & 0xFFFFF000, + 0xFEE00000, + "base address should be 0xFEE00000" + ); + } +} diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index 07f4b5a97..66c0a0a0b 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -15,7 +15,12 @@ limitations under the License. */ use std::os::raw::c_void; +#[cfg(feature = "hw-interrupts")] +use std::sync::Arc; +#[cfg(feature = "hw-interrupts")] +use std::sync::atomic::{AtomicBool, Ordering}; +use hyperlight_common::outb::OutBAction; #[cfg(feature = "trace_guest")] use tracing::Span; #[cfg(feature = "trace_guest")] @@ -71,6 +76,12 @@ pub(crate) struct WhpVm { partition: WHV_PARTITION_HANDLE, // Surrogate process for memory mapping surrogate_process: SurrogateProcess, + /// Signal to stop the software timer thread. + #[cfg(feature = "hw-interrupts")] + timer_stop: Option>, + /// Handle for the software timer thread. + #[cfg(feature = "hw-interrupts")] + timer_thread: Option>, } // Safety: `WhpVm` is !Send because it holds `SurrogateProcess` which contains a raw pointer @@ -83,33 +94,135 @@ unsafe impl Send for WhpVm {} impl WhpVm { pub(crate) fn new() -> Result { const NUM_CPU: u32 = 1; + let partition = unsafe { - let partition = - WHvCreatePartition().map_err(|e| CreateVmError::CreateVmFd(e.into()))?; - WHvSetPartitionProperty( - partition, - WHvPartitionPropertyCodeProcessorCount, - &NUM_CPU as *const _ as *const _, - std::mem::size_of_val(&NUM_CPU) as _, - ) - .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?; - WHvSetupPartition(partition).map_err(|e| CreateVmError::InitializeVm(e.into()))?; - WHvCreateVirtualProcessor(partition, 0, 0) - .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?; - partition - }; + #[cfg(feature = "hw-interrupts")] + { + // Check whether the WHP host supports LAPIC emulation. + // Bit 1 of WHV_CAPABILITY_FEATURES = LocalApicEmulation. + // LAPIC emulation is required for timer interrupt delivery. + const LAPIC_EMULATION_BIT: u64 = 1 << 1; + + let mut capability: WHV_CAPABILITY = Default::default(); + let has_lapic = WHvGetCapability( + WHvCapabilityCodeFeatures, + &mut capability as *mut _ as *mut c_void, + std::mem::size_of::() as u32, + None, + ) + .is_ok() + && (capability.Features.AsUINT64 & LAPIC_EMULATION_BIT != 0); + + if !has_lapic { + return Err(CreateVmError::InitializeVm( + windows_result::Error::new( + HRESULT::from_win32(0x32), // ERROR_NOT_SUPPORTED + "WHP LocalApicEmulation capability is required for hw-interrupts", + ) + .into(), + )); + } + + let p = WHvCreatePartition().map_err(|e| CreateVmError::CreateVmFd(e.into()))?; + WHvSetPartitionProperty( + p, + WHvPartitionPropertyCodeProcessorCount, + &NUM_CPU as *const _ as *const _, + std::mem::size_of_val(&NUM_CPU) as _, + ) + .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?; + + let apic_mode: u32 = 1; // WHvX64LocalApicEmulationModeXApic + WHvSetPartitionProperty( + p, + WHvPartitionPropertyCodeLocalApicEmulationMode, + &apic_mode as *const _ as *const _, + std::mem::size_of_val(&apic_mode) as _, + ) + .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?; + + WHvSetupPartition(p).map_err(|e| CreateVmError::InitializeVm(e.into()))?; + WHvCreateVirtualProcessor(p, 0, 0) + .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?; + + // Initialize the LAPIC via the bulk interrupt-controller + // state API (individual APIC register writes via + // WHvSetVirtualProcessorRegisters fail with ACCESS_DENIED). + Self::init_lapic_bulk(p).map_err(|e| CreateVmError::InitializeVm(e.into()))?; - // Create the surrogate process with the total memory size - let mgr = get_surrogate_process_manager() - .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; - let surrogate_process = mgr - .get_surrogate_process() - .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; + p + } + + #[cfg(not(feature = "hw-interrupts"))] + { + let p = WHvCreatePartition().map_err(|e| CreateVmError::CreateVmFd(e.into()))?; + WHvSetPartitionProperty( + p, + WHvPartitionPropertyCodeProcessorCount, + &NUM_CPU as *const _ as *const _, + std::mem::size_of_val(&NUM_CPU) as _, + ) + .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?; + WHvSetupPartition(p).map_err(|e| CreateVmError::InitializeVm(e.into()))?; + WHvCreateVirtualProcessor(p, 0, 0) + .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?; + p + } + }; - Ok(WhpVm { + let vm = WhpVm { partition, - surrogate_process, - }) + surrogate_process: { + let mgr = get_surrogate_process_manager() + .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; + mgr.get_surrogate_process() + .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))? + }, + #[cfg(feature = "hw-interrupts")] + timer_stop: None, + #[cfg(feature = "hw-interrupts")] + timer_thread: None, + }; + + Ok(vm) + } + + /// Maximum size for the interrupt controller state blob. + const LAPIC_STATE_MAX_SIZE: u32 = 4096; + + /// Initialize the LAPIC via the bulk interrupt-controller state API. + /// Individual APIC register writes via `WHvSetVirtualProcessorRegisters` + /// fail with ACCESS_DENIED on WHP when LAPIC emulation is enabled, + /// so we use `WHvGet/SetVirtualProcessorInterruptControllerState2` + /// to read-modify-write the entire LAPIC register page. + #[cfg(feature = "hw-interrupts")] + unsafe fn init_lapic_bulk(partition: WHV_PARTITION_HANDLE) -> windows_result::Result<()> { + let mut state = vec![0u8; Self::LAPIC_STATE_MAX_SIZE as usize]; + let mut written: u32 = 0; + + unsafe { + WHvGetVirtualProcessorInterruptControllerState2( + partition, + 0, + state.as_mut_ptr() as *mut c_void, + Self::LAPIC_STATE_MAX_SIZE, + Some(&mut written), + )?; + } + state.truncate(written as usize); + + super::hw_interrupts::init_lapic_registers(&mut state); + + unsafe { + WHvSetVirtualProcessorInterruptControllerState2( + partition, + 0, + state.as_ptr() as *const c_void, + state.len() as u32, + )?; + } + + Ok(()) } /// Helper for setting arbitrary registers. Makes sure the same number @@ -224,85 +337,155 @@ impl VirtualMachine for WhpVm { // it sets the guest span, no other traces or spans must be setup in between these calls. #[cfg(feature = "trace_guest")] tc.setup_guest_trace(Span::current().context()); - unsafe { - WHvRunVirtualProcessor( - self.partition, - 0, - &mut exit_context as *mut _ as *mut c_void, - std::mem::size_of::() as u32, - ) - .map_err(|e| RunVcpuError::Unknown(e.into()))?; - } - let result = match exit_context.ExitReason { - WHvRunVpExitReasonX64IoPortAccess => unsafe { - let instruction_length = exit_context.VpContext._bitfield & 0xF; - let rip = exit_context.VpContext.Rip + instruction_length as u64; - self.set_registers(&[( - WHvX64RegisterRip, - Align16(WHV_REGISTER_VALUE { Reg64: rip }), - )]) - .map_err(|e| RunVcpuError::IncrementRip(e.into()))?; - VmExit::IoOut( - exit_context.Anonymous.IoPortAccess.PortNumber, - exit_context + + loop { + unsafe { + WHvRunVirtualProcessor( + self.partition, + 0, + &mut exit_context as *mut _ as *mut c_void, + std::mem::size_of::() as u32, + ) + .map_err(|e| RunVcpuError::Unknown(e.into()))?; + } + + match exit_context.ExitReason { + WHvRunVpExitReasonX64IoPortAccess => unsafe { + let instruction_length = exit_context.VpContext._bitfield & 0xF; + let rip = exit_context.VpContext.Rip + instruction_length as u64; + let port = exit_context.Anonymous.IoPortAccess.PortNumber; + let rax = exit_context.Anonymous.IoPortAccess.Rax; + let is_write = exit_context .Anonymous .IoPortAccess - .Rax - .to_le_bytes() - .to_vec(), - ) - }, - WHvRunVpExitReasonX64Halt => VmExit::Halt(), - WHvRunVpExitReasonMemoryAccess => { - let gpa = unsafe { exit_context.Anonymous.MemoryAccess.Gpa }; - let access_info = unsafe { - WHV_MEMORY_ACCESS_TYPE( - // 2 first bits are the access type, see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/memoryaccess#syntax - (exit_context.Anonymous.MemoryAccess.AccessInfo.AsUINT32 & 0b11) as i32, - ) - }; - let access_info = MemoryRegionFlags::try_from(access_info) - .map_err(|_| RunVcpuError::ParseGpaAccessInfo)?; - match access_info { - MemoryRegionFlags::READ => VmExit::MmioRead(gpa), - MemoryRegionFlags::WRITE => VmExit::MmioWrite(gpa), - _ => VmExit::Unknown("Unknown memory access type".to_string()), + .AccessInfo + .Anonymous + ._bitfield + & 1 + != 0; + + self.set_registers(&[( + WHvX64RegisterRip, + Align16(WHV_REGISTER_VALUE { Reg64: rip }), + )]) + .map_err(|e| RunVcpuError::IncrementRip(e.into()))?; + + // OutBAction::Halt always means "I'm done", regardless + // of whether a timer is active. + if is_write && port == OutBAction::Halt as u16 { + return Ok(VmExit::Halt()); + } + + #[cfg(feature = "hw-interrupts")] + { + if is_write { + let data = rax.to_le_bytes(); + if self.handle_hw_io_out(port, &data) { + continue; + } + } else if let Some(val) = super::hw_interrupts::handle_io_in(port) { + self.set_registers(&[( + WHvX64RegisterRax, + Align16(WHV_REGISTER_VALUE { Reg64: val }), + )]) + .map_err(|e| RunVcpuError::Unknown(e.into()))?; + continue; + } + } + + // Suppress unused variable warnings when hw-interrupts is disabled + let _ = is_write; + + return Ok(VmExit::IoOut(port, rax.to_le_bytes().to_vec())); + }, + WHvRunVpExitReasonX64Halt => { + // With software timer active, re-enter the guest. + // WHvRunVirtualProcessor will block until the timer + // thread injects an interrupt via WHvRequestInterrupt, + // waking the vCPU from HLT. + #[cfg(feature = "hw-interrupts")] + if self.timer_thread.is_some() { + continue; + } + return Ok(VmExit::Halt()); } - } - // Execution was cancelled by the host. - WHvRunVpExitReasonCanceled => VmExit::Cancelled(), - #[cfg(gdb)] - WHvRunVpExitReasonException => { - let exception = unsafe { exit_context.Anonymous.VpException }; - - // Get the DR6 register to see which breakpoint was hit - let dr6 = { - let names = [WHvX64RegisterDr6]; - let mut out: [Align16; 1] = unsafe { std::mem::zeroed() }; - unsafe { - WHvGetVirtualProcessorRegisters( - self.partition, - 0, - names.as_ptr(), - 1, - out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, + WHvRunVpExitReasonMemoryAccess => { + let gpa = unsafe { exit_context.Anonymous.MemoryAccess.Gpa }; + let access_info = unsafe { + WHV_MEMORY_ACCESS_TYPE( + (exit_context.Anonymous.MemoryAccess.AccessInfo.AsUINT32 & 0b11) as i32, ) - .map_err(|e| RunVcpuError::GetDr6(e.into()))?; + }; + let access_info = MemoryRegionFlags::try_from(access_info) + .map_err(|_| RunVcpuError::ParseGpaAccessInfo)?; + return match access_info { + MemoryRegionFlags::READ => Ok(VmExit::MmioRead(gpa)), + MemoryRegionFlags::WRITE => Ok(VmExit::MmioWrite(gpa)), + _ => Ok(VmExit::Unknown("Unknown memory access type".to_string())), + }; + } + // Execution was cancelled by the host. + WHvRunVpExitReasonCanceled => { + return Ok(VmExit::Cancelled()); + } + #[cfg(gdb)] + WHvRunVpExitReasonException => { + let exception = unsafe { exit_context.Anonymous.VpException }; + + // Get the DR6 register to see which breakpoint was hit + let dr6 = { + let names = [WHvX64RegisterDr6]; + let mut out: [Align16; 1] = + unsafe { std::mem::zeroed() }; + unsafe { + WHvGetVirtualProcessorRegisters( + self.partition, + 0, + names.as_ptr(), + 1, + out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, + ) + .map_err(|e| RunVcpuError::GetDr6(e.into()))?; + } + unsafe { out[0].0.Reg64 } + }; + + return Ok(VmExit::Debug { + dr6, + exception: exception.ExceptionType as u32, + }); + } + WHV_RUN_VP_EXIT_REASON(_) => { + let rip = exit_context.VpContext.Rip; + tracing::error!( + "WHP unknown exit reason {}: RIP={:#x}", + exit_context.ExitReason.0, + rip, + ); + if let Ok(regs) = self.regs() { + tracing::error!( + " RAX={:#x} RCX={:#x} RDX={:#x}", + regs.rax, + regs.rcx, + regs.rdx + ); } - unsafe { out[0].0.Reg64 } - }; - - VmExit::Debug { - dr6, - exception: exception.ExceptionType as u32, + if let Ok(sregs) = self.sregs() { + tracing::error!( + " CR0={:#x} CR4={:#x} EFER={:#x} APIC_BASE={:#x}", + sregs.cr0, + sregs.cr4, + sregs.efer, + sregs.apic_base + ); + } + return Ok(VmExit::Unknown(format!( + "Unknown exit reason '{}' at RIP={:#x}", + exit_context.ExitReason.0, rip + ))); } } - WHV_RUN_VP_EXIT_REASON(_) => VmExit::Unknown(format!( - "Unknown exit reason '{}'", - exit_context.ExitReason.0 - )), - }; - Ok(result) + } } fn regs(&self) -> std::result::Result { @@ -411,9 +594,30 @@ impl VirtualMachine for WhpVm { fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> std::result::Result<(), RegisterError> { let whp_regs: [(WHV_REGISTER_NAME, Align16); WHP_SREGS_NAMES_LEN] = sregs.into(); - self.set_registers(&whp_regs) - .map_err(|e| RegisterError::SetSregs(e.into()))?; - Ok(()) + + // When LAPIC emulation is active (always with hw-interrupts), + // skip writing APIC_BASE. The generic CommonSpecialRegisters + // defaults APIC_BASE to 0 which would globally disable the LAPIC. + // On some WHP hosts, host-side APIC register writes are blocked + // entirely (ACCESS_DENIED). + #[cfg(feature = "hw-interrupts")] + { + let filtered: Vec<_> = whp_regs + .iter() + .copied() + .filter(|(name, _)| *name != WHvX64RegisterApicBase) + .collect(); + self.set_registers(&filtered) + .map_err(|e| RegisterError::SetSregs(e.into()))?; + Ok(()) + } + + #[cfg(not(feature = "hw-interrupts"))] + { + self.set_registers(&whp_regs) + .map_err(|e| RegisterError::SetSregs(e.into()))?; + Ok(()) + } } fn debug_regs(&self) -> std::result::Result { @@ -762,8 +966,112 @@ impl DebuggableVm for WhpVm { } } +#[cfg(feature = "hw-interrupts")] +impl WhpVm { + /// Get the LAPIC state via the bulk interrupt-controller state API. + fn get_lapic_state(&self) -> windows_result::Result> { + let mut state = vec![0u8; Self::LAPIC_STATE_MAX_SIZE as usize]; + let mut written: u32 = 0; + + unsafe { + WHvGetVirtualProcessorInterruptControllerState2( + self.partition, + 0, + state.as_mut_ptr() as *mut c_void, + Self::LAPIC_STATE_MAX_SIZE, + Some(&mut written), + )?; + } + state.truncate(written as usize); + Ok(state) + } + + /// Set the LAPIC state via the bulk interrupt-controller state API. + fn set_lapic_state(&self, state: &[u8]) -> windows_result::Result<()> { + unsafe { + WHvSetVirtualProcessorInterruptControllerState2( + self.partition, + 0, + state.as_ptr() as *const c_void, + state.len() as u32, + ) + } + } + + /// Perform LAPIC EOI: clear the highest-priority in-service bit. + /// Called when the guest sends PIC EOI, since the LAPIC timer + /// delivers through the LAPIC and the guest only acknowledges via PIC. + fn do_lapic_eoi(&self) { + if let Ok(mut state) = self.get_lapic_state() { + super::hw_interrupts::lapic_eoi(&mut state); + let _ = self.set_lapic_state(&state); + } + } + + /// Stop the software timer thread if running. + fn stop_timer_thread(&mut self) { + if let Some(stop) = self.timer_stop.take() { + stop.store(true, Ordering::Relaxed); + } + if let Some(handle) = self.timer_thread.take() { + let _ = handle.join(); + } + } + + fn handle_hw_io_out(&mut self, port: u16, data: &[u8]) -> bool { + if port == OutBAction::PvTimerConfig as u16 { + if data.len() >= 4 { + let period_us = u32::from_le_bytes([data[0], data[1], data[2], data[3]]); + + // Stop any existing timer thread before (re-)configuring. + self.stop_timer_thread(); + + if period_us > 0 { + let partition_raw = self.partition.0; + let vector = super::hw_interrupts::TIMER_VECTOR; + let stop = Arc::new(AtomicBool::new(false)); + let stop_clone = stop.clone(); + let period = std::time::Duration::from_micros(period_us as u64); + + let handle = std::thread::spawn(move || { + while !stop_clone.load(Ordering::Relaxed) { + std::thread::sleep(period); + if stop_clone.load(Ordering::Relaxed) { + break; + } + let partition = WHV_PARTITION_HANDLE(partition_raw); + let interrupt = WHV_INTERRUPT_CONTROL { + _bitfield: 0, // Type=Fixed, DestMode=Physical, Trigger=Edge + Destination: 0, + Vector: vector, + }; + let _ = unsafe { + WHvRequestInterrupt( + partition, + &interrupt, + std::mem::size_of::() as u32, + ) + }; + } + }); + + self.timer_stop = Some(stop); + self.timer_thread = Some(handle); + } + } + return true; + } + let timer_active = self.timer_thread.is_some(); + super::hw_interrupts::handle_common_io_out(port, data, timer_active, || self.do_lapic_eoi()) + } +} + impl Drop for WhpVm { fn drop(&mut self) { + // Stop the software timer thread before tearing down the partition. + #[cfg(feature = "hw-interrupts")] + self.stop_timer_thread(); + // HyperlightVm::drop() calls set_dropped() before this runs. // set_dropped() ensures no WHvCancelRunVirtualProcessor calls are in progress // or will be made in the future, so it's safe to delete the partition. @@ -812,3 +1120,41 @@ unsafe fn try_load_whv_map_gpa_range2() -> windows_result::Result() as u32, + None, + ) + }; + assert!( + result.is_ok(), + "WHvGetCapability(Features) failed: {result:?}" + ); + let raw = unsafe { capability.Features.AsUINT64 }; + let has_lapic = raw & (1 << 1) != 0; // bit 1 = LocalApicEmulation + assert!( + has_lapic, + "This host does not support WHP LocalApicEmulation. \ + hw-interrupts requires Windows 11 22H2+ or a recent Windows Server build." + ); + } +} diff --git a/src/hyperlight_host/src/mem/shared_mem.rs b/src/hyperlight_host/src/mem/shared_mem.rs index 2cd3b8acc..8cd2d25a3 100644 --- a/src/hyperlight_host/src/mem/shared_mem.rs +++ b/src/hyperlight_host/src/mem/shared_mem.rs @@ -679,7 +679,22 @@ impl GuestSharedMemory { MemoryRegionType::Scratch => { MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE } - MemoryRegionType::Snapshot => MemoryRegionFlags::READ | MemoryRegionFlags::EXECUTE, + // For init-paging, the snapshot is read-only because guest page + // tables provide CoW semantics for writable pages. For + // non-init-paging there are no guest page tables, so the snapshot + // must be writable — otherwise writes (including the CPU setting + // the "Accessed" bit in GDT descriptors during segment loads) + // cause EPT violations that KVM retries forever. + MemoryRegionType::Snapshot => { + #[cfg(feature = "init-paging")] + { + MemoryRegionFlags::READ | MemoryRegionFlags::EXECUTE + } + #[cfg(not(feature = "init-paging"))] + { + MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE + } + } #[allow(clippy::panic)] // In the future, all the host side knowledge about memory // region types should collapse down to Snapshot vs diff --git a/src/hyperlight_host/src/sandbox/outb.rs b/src/hyperlight_host/src/sandbox/outb.rs index 76ec53cee..5d5bd91d8 100644 --- a/src/hyperlight_host/src/sandbox/outb.rs +++ b/src/hyperlight_host/src/sandbox/outb.rs @@ -233,6 +233,11 @@ pub(crate) fn handle_outb( OutBAction::TraceMemoryAlloc => trace_info.handle_trace_mem_alloc(regs, mem_mgr), #[cfg(feature = "mem_profile")] OutBAction::TraceMemoryFree => trace_info.handle_trace_mem_free(regs, mem_mgr), + // PvTimerConfig and Halt are handled at the hypervisor level + // (in run_vcpu) and should never reach this handler. + OutBAction::PvTimerConfig | OutBAction::Halt => Err(HandleOutbError::InvalidPort(format!( + "port {port:#x} should be handled at hypervisor level" + ))), } } #[cfg(test)] diff --git a/src/hyperlight_host/src/sandbox/snapshot.rs b/src/hyperlight_host/src/sandbox/snapshot.rs index f9a48c871..2329a6b7a 100644 --- a/src/hyperlight_host/src/sandbox/snapshot.rs +++ b/src/hyperlight_host/src/sandbox/snapshot.rs @@ -265,6 +265,7 @@ fn filtered_mappings<'a>( return None; } // neither does the mapping of the snapshot's own page tables + #[cfg(feature = "init-paging")] if mapping.virt_base >= hyperlight_common::layout::SNAPSHOT_PT_GVA_MIN as u64 && mapping.virt_base <= hyperlight_common::layout::SNAPSHOT_PT_GVA_MAX as u64 { diff --git a/src/hyperlight_host/src/sandbox/uninitialized.rs b/src/hyperlight_host/src/sandbox/uninitialized.rs index 2167dca43..2e76af1c4 100644 --- a/src/hyperlight_host/src/sandbox/uninitialized.rs +++ b/src/hyperlight_host/src/sandbox/uninitialized.rs @@ -29,9 +29,10 @@ use crate::func::host_functions::{HostFunction, register_host_function}; use crate::func::{ParameterTuple, SupportedReturnType}; #[cfg(feature = "build-metadata")] use crate::log_build_details; +use crate::mem::layout::SandboxMemoryLayout; use crate::mem::memory_region::{DEFAULT_GUEST_BLOB_MEM_FLAGS, MemoryRegionFlags}; use crate::mem::mgr::SandboxMemoryManager; -use crate::mem::shared_mem::ExclusiveSharedMemory; +use crate::mem::shared_mem::{ExclusiveSharedMemory, SharedMemory}; use crate::sandbox::SandboxConfiguration; use crate::{MultiUseSandbox, Result, new_error}; @@ -169,6 +170,43 @@ impl<'a> From> for GuestEnvironment<'a, '_> { } impl UninitializedSandbox { + /// Returns a host-side pointer to a specific guest physical address (GPA) + /// within the sandbox's shared memory region. + /// + /// This is the safe way to obtain host-side access to guest memory. + /// The method validates that the GPA falls within the sandbox's + /// allocated memory region before returning the corresponding host pointer. + /// + /// # Safety + /// + /// The returned pointer is valid as long as the sandbox (and its underlying + /// shared memory mapping) remains alive. Dereferencing the pointer requires + /// `unsafe` code and the caller must ensure proper synchronization. + pub fn guest_memory_ptr(&mut self, gpa: usize) -> Result<*mut u8> { + let base = SandboxMemoryLayout::BASE_ADDRESS; + let mem_size = self.mgr.shared_mem.mem_size(); + + if gpa < base { + return Err(new_error!( + "GPA {:#x} is below the sandbox base address {:#x}", + gpa, + base + )); + } + + let offset = gpa - base; + if offset >= mem_size { + return Err(new_error!( + "GPA {:#x} (offset {:#x}) is beyond sandbox memory size {:#x}", + gpa, + offset, + mem_size + )); + } + + Ok(unsafe { self.mgr.shared_mem.base_ptr().add(offset) }) + } + // Creates a new uninitialized sandbox from a pre-built snapshot. // Note that since memory configuration is part of the snapshot the only configuration // that can be changed (from the original snapshot) is the configuration defines the behaviour of diff --git a/src/hyperlight_host/tests/integration_test.rs b/src/hyperlight_host/tests/integration_test.rs index 80f997360..c39946096 100644 --- a/src/hyperlight_host/tests/integration_test.rs +++ b/src/hyperlight_host/tests/integration_test.rs @@ -1768,3 +1768,28 @@ fn interrupt_cancel_delete_race() { handle.join().unwrap(); } } + +/// Test that hardware timer interrupts are delivered to the guest via OutBAction::PvTimerConfig port. +/// +/// The guest function `TestTimerInterrupts`: +/// 1. Initializes the PIC (IRQ0 -> vector 0x20) +/// 2. Installs an IDT entry for vector 0x20 +/// 3. Arms the timer via OutBAction::PvTimerConfig port +/// 4. Enables interrupts and busy-waits +/// 5. Returns the number of timer interrupts received +/// +/// Requires the `hw-interrupts` feature on the host side. +#[test] +#[cfg(feature = "hw-interrupts")] +fn hw_timer_interrupts() { + with_rust_sandbox(|mut sbox| { + // 1ms period, up to 100M spin iterations (should fire well before that) + let count: i32 = sbox + .call("TestTimerInterrupts", (1000_i32, 100_000_000_i32)) + .unwrap(); + assert!( + count > 0, + "Expected at least one timer interrupt, got {count}" + ); + }); +} diff --git a/src/tests/rust_guests/dummyguest/src/main.rs b/src/tests/rust_guests/dummyguest/src/main.rs index 01063dbdb..2d9db910e 100644 --- a/src/tests/rust_guests/dummyguest/src/main.rs +++ b/src/tests/rust_guests/dummyguest/src/main.rs @@ -30,8 +30,16 @@ fn panic(_info: &PanicInfo) -> ! { } fn halt() { + // OutBAction::Halt = 108; using raw constant to avoid pulling in + // anyhow (via hyperlight_common's TryFrom impl) which requires alloc. unsafe { - asm!("hlt"); + asm!( + "out dx, eax", + "cli", + "hlt", + in("dx") 108u16, + in("eax") 0u32, + ); } } diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index 6d53c94c6..2b694be42 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -32,7 +32,7 @@ use alloc::{format, vec}; use core::alloc::Layout; use core::ffi::c_char; use core::hint::black_box; -use core::sync::atomic::{AtomicU64, Ordering}; +use core::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use hyperlight_common::flatbuffer_wrappers::function_call::{FunctionCall, FunctionCallType}; use hyperlight_common::flatbuffer_wrappers::function_types::{ @@ -499,6 +499,157 @@ fn outb_with_port(port: u32, value: u32) { } } +// ============================================================================= +// Hardware timer interrupt test infrastructure +// ============================================================================= + +/// Counter incremented by the timer interrupt handler. +static TIMER_IRQ_COUNT: AtomicU32 = AtomicU32::new(0); + +// Timer IRQ handler (vector 0x20 = IRQ0 after PIC remapping). +// Increments the global counter, sends PIC EOI, and returns from interrupt. +// +// This handler is intentionally minimal: it only touches RAX, uses `lock inc` +// for the atomic counter update, and sends a non-specific EOI to the master PIC. +// +// NOTE: global_asm! on x86_64 in Rust defaults to Intel syntax. +core::arch::global_asm!( + ".globl _timer_irq_handler", + "_timer_irq_handler:", + "push rax", + "lock inc dword ptr [rip + {counter}]", + "mov al, 0x20", + "out 0x20, al", + "pop rax", + "iretq", + counter = sym TIMER_IRQ_COUNT, +); + +unsafe extern "C" { + fn _timer_irq_handler(); +} + +/// IDT pointer structure for SIDT/LIDT instructions. +#[repr(C, packed)] +struct IdtPtr { + limit: u16, + base: u64, +} + +/// Test hardware timer interrupt delivery. +/// +/// This function: +/// 1. Initializes the PIC (remaps IRQ0 to vector 0x20) +/// 2. Installs an IDT entry for vector 0x20 pointing to `_timer_irq_handler` +/// 3. Programs PIT channel 0 as a rate generator at the requested period +/// 4. Arms the PV timer by writing the period to OutBAction::PvTimerConfig port (for MSHV/WHP) +/// 5. Enables interrupts (STI) and busy-waits for timer delivery +/// 6. Disables interrupts (CLI) and returns the interrupt count +/// +/// Parameters: +/// - `period_us`: timer period in microseconds (written to OutBAction::PvTimerConfig port) +/// - `max_spin`: maximum busy-wait iterations before giving up +/// +/// Returns the number of timer interrupts received. +#[guest_function("TestTimerInterrupts")] +fn test_timer_interrupts(period_us: i32, max_spin: i32) -> i32 { + // Reset counter + TIMER_IRQ_COUNT.store(0, Ordering::SeqCst); + + // 1) Initialize PIC — remap IRQ0 to vector 0x20 + unsafe { + // Master PIC + core::arch::asm!("out 0x20, al", in("al") 0x11u8, options(nomem, nostack)); // ICW1 + core::arch::asm!("out 0x21, al", in("al") 0x20u8, options(nomem, nostack)); // ICW2: base 0x20 + core::arch::asm!("out 0x21, al", in("al") 0x04u8, options(nomem, nostack)); // ICW3 + core::arch::asm!("out 0x21, al", in("al") 0x01u8, options(nomem, nostack)); // ICW4 + core::arch::asm!("out 0x21, al", in("al") 0xFEu8, options(nomem, nostack)); // IMR: unmask IRQ0 + + // Slave PIC + core::arch::asm!("out 0xA0, al", in("al") 0x11u8, options(nomem, nostack)); // ICW1 + core::arch::asm!("out 0xA1, al", in("al") 0x28u8, options(nomem, nostack)); // ICW2: base 0x28 + core::arch::asm!("out 0xA1, al", in("al") 0x02u8, options(nomem, nostack)); // ICW3 + core::arch::asm!("out 0xA1, al", in("al") 0x01u8, options(nomem, nostack)); // ICW4 + core::arch::asm!("out 0xA1, al", in("al") 0xFFu8, options(nomem, nostack)); // IMR: mask all + } + + // 2) Install IDT entry for vector 0x20 (timer interrupt) + let handler_addr = _timer_irq_handler as *const () as u64; + + // Read current IDT base via SIDT + let mut idtr = IdtPtr { limit: 0, base: 0 }; + unsafe { + core::arch::asm!( + "sidt [{}]", + in(reg) &mut idtr as *mut IdtPtr, + options(nostack, preserves_flags) + ); + } + + // Write a 16-byte IDT entry at vector 0x20 (offset = 0x20 * 16 = 0x200) + let entry_ptr = (idtr.base as usize + 0x20 * 16) as *mut u8; + unsafe { + // offset_low (bits 0-15 of handler) + core::ptr::write_volatile(entry_ptr as *mut u16, handler_addr as u16); + // selector: 0x08 = kernel code segment + core::ptr::write_volatile(entry_ptr.add(2) as *mut u16, 0x08); + // IST=0, reserved=0 + core::ptr::write_volatile(entry_ptr.add(4), 0); + // type_attr: 0x8E = interrupt gate, present, DPL=0 + core::ptr::write_volatile(entry_ptr.add(5), 0x8E); + // offset_mid (bits 16-31) + core::ptr::write_volatile(entry_ptr.add(6) as *mut u16, (handler_addr >> 16) as u16); + // offset_high (bits 32-63) + core::ptr::write_volatile(entry_ptr.add(8) as *mut u32, (handler_addr >> 32) as u32); + // reserved + core::ptr::write_volatile(entry_ptr.add(12) as *mut u32, 0); + } + + // 3) Program PIT channel 0 as rate generator (mode 2). + // Divisor = period_us * 1_193_182 / 1_000_000 (PIT oscillator is 1.193182 MHz). + // On KVM the in-kernel PIT handles these IO writes directly. + // On MSHV/WHP these ports are silently absorbed (timer is set via OutBAction::PvTimerConfig). + let divisor = ((period_us as u64) * 1_193_182 / 1_000_000).clamp(1, 0xFFFF) as u16; + unsafe { + // Command: channel 0, lobyte/hibyte access, mode 2 (rate generator) + core::arch::asm!("out 0x43, al", in("al") 0x34u8, options(nomem, nostack)); + // Channel 0 data: low byte of divisor + core::arch::asm!("out 0x40, al", in("al") (divisor & 0xFF) as u8, options(nomem, nostack)); + // Channel 0 data: high byte of divisor + core::arch::asm!("out 0x40, al", in("al") (divisor >> 8) as u8, options(nomem, nostack)); + } + + // 4) Arm timer: write period_us to OutBAction::PvTimerConfig port + unsafe { + core::arch::asm!( + "out dx, eax", + in("dx") hyperlight_common::outb::OutBAction::PvTimerConfig as u16, + in("eax") period_us as u32, + options(nomem, nostack, preserves_flags) + ); + } + + // 5) Enable interrupts and wait for at least one timer tick + unsafe { + core::arch::asm!("sti", options(nomem, nostack)); + } + + let max = max_spin as u32; + for _ in 0..max { + if TIMER_IRQ_COUNT.load(Ordering::SeqCst) > 0 { + break; + } + core::hint::spin_loop(); + } + + // 6) Disable interrupts and return count + unsafe { + core::arch::asm!("cli", options(nomem, nostack)); + } + + TIMER_IRQ_COUNT.load(Ordering::SeqCst) as i32 +} + static mut COUNTER: i32 = 0; #[guest_function("AddToStatic")] @@ -823,7 +974,14 @@ fn corrupt_output_size_prefix() -> i32 { buf[12..16].copy_from_slice(&[0u8; 4]); buf[16..24].copy_from_slice(&8_u64.to_le_bytes()); - core::arch::asm!("hlt", options(noreturn)); + core::arch::asm!( + "out dx, eax", + "cli", + "hlt", + in("dx") hyperlight_common::outb::OutBAction::Halt as u16, + in("eax") 0u32, + options(noreturn), + ); } } @@ -839,7 +997,14 @@ fn corrupt_output_back_pointer() -> i32 { buf[8..16].copy_from_slice(&[0u8; 8]); buf[16..24].copy_from_slice(&0xDEAD_u64.to_le_bytes()); - core::arch::asm!("hlt", options(noreturn)); + core::arch::asm!( + "out dx, eax", + "cli", + "hlt", + in("dx") hyperlight_common::outb::OutBAction::Halt as u16, + in("eax") 0u32, + options(noreturn), + ); } }