From dcee02c2fe379f7a22fd944a4e2783f5d4f63602 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 28 Jan 2026 18:40:55 +0100 Subject: [PATCH 1/4] misc: flake: init direnv Users with Nix and a shell with direnv integration will be able to automatically enter the Nix development shell. I forgot to add this in [0]. [0] https://github.com/cyberus-technology/cloud-hypervisor/pull/73 On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- .envrc | 1 + .reuse/dep5 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 .envrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..3550a30f2d --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.reuse/dep5 b/.reuse/dep5 index 20e0177c48..cfcec31a5e 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -7,6 +7,6 @@ Files: docs/*.md *.md Copyright: 2024 License: CC-BY-4.0 -Files: scripts/* test_data/* *.toml .git* fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock flake.nix flake.lock chv.nix +Files: scripts/* test_data/* *.toml .git* fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock flake.nix flake.lock chv.nix .envrc Copyright: 2024 License: Apache-2.0 From 36b49bc27a17b510ad3ee1a2705e97f7da31ee74 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 28 Jan 2026 21:21:41 +0100 Subject: [PATCH 2/4] misc: flake: add rustup On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- flake.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/flake.nix b/flake.nix index 49a97593d1..24b4671997 100644 --- a/flake.nix +++ b/flake.nix @@ -44,6 +44,7 @@ inputsFrom = builtins.attrValues self.packages; packages = with pkgs; [ gitlint + rustup ]; }; packages = From 84d03d31111c4803300acf037ccc8b2d402de154 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 27 Jan 2026 12:08:25 +0100 Subject: [PATCH 3/4] vmm: migration: streamline memory transmission in a single method This is the first commit in a series of improvements to `memory_copy_iterations()` and `struct MigrationState`. I verified everything in dozens of test runs with extensive logging, together with three colleagues (StefanK, PascalS, SebastianE). The series is a prerequisite for live migration statistics reporting. Previously, the initial transmission was done in a dedicated step and `memory_copy_iterations()` only took care of the delta transmission. This made aggregating metrics unnecessarily difficult. Therefore, everything is now handled gracefully by `memory_copy_iterations()` in a single place. Further, this consolidation makes perfect sense, as all memory transmission is now streamlined in one function. I had to adapt the iteration counter: Iteration 0: initial transmission. Iteration 1..n: delta transmission (done inside the function). Iteration n: final transmission (done outside the function). On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 58 ++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index d2c2ed648c..8fba860f68 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2019,6 +2019,10 @@ impl Vmm { } } + /// Performs memory copy iterations in pre-copy fashion. + /// + /// This transmits the initial VM memory as well as all VM memory delta transmissions while the + /// VM keeps running.
fn memory_copy_iterations( vm: &mut Vm, mem_send: &SendAdditionalConnections, @@ -2030,7 +2034,19 @@ impl Vmm { let mut bandwidth = 0.0; let mut iteration_table; + // We loop until we converge (target downtime is achievable). loop { + // Check if migration has timed out + // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check + if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { + warn!("Migration timed out after {migration_timeout:?}"); + Request::abandon().write_to(socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Migration timed out")), + )?; + } + // todo: check if auto-converge is enabled at all? if Self::can_increase_autoconverge_step(s) && vm.throttle_percent() < AUTO_CONVERGE_MAX { @@ -2046,22 +2062,13 @@ impl Vmm { // Update the start time of the iteration s.iteration_start_time = Instant::now(); - // Increment iteration counter - s.iteration += 1; - - // Check if migration has timed out - // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check - if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { - warn!("Migration timed out after {migration_timeout:?}"); - Request::abandon().write_to(socket)?; - Response::read_from(socket)?.ok_or_abandon( - socket, - MigratableError::MigrateSend(anyhow!("Migration timed out")), - )?; - } - - // Get the dirty page table - iteration_table = vm.dirty_log()?; + // In the first iteration (`0`), we transmit the whole memory. Starting with the + // second iteration (`1`), we start the delta transmission. + iteration_table = if s.iteration == 0 { + vm.memory_range_table()? + } else { + vm.dirty_log()? + }; // Update the pending size (amount of data to transfer) s.pending_size = iteration_table @@ -2075,8 +2082,8 @@ impl Vmm { s.threshold_size = bandwidth as u64 * migrate_downtime_limit.as_millis() as u64; } - // Enter the final stage of migration when the suspension conditions are met - if s.iteration > 1 && s.pending_size <= s.threshold_size { + // Enter the final stage of migration when the handover conditions are met + if s.iteration > 0 && s.pending_size <= s.threshold_size { break; } @@ -2104,11 +2111,15 @@ impl Vmm { s.current_dirty_pages * 1000 / s.iteration_cost_time.as_millis() as u64; } debug!( - "iteration {}: cost={}ms, throttle={}%", + "iteration {}: cost={}ms, throttle={}%, transmitted={}MiB", s.iteration, s.iteration_cost_time.as_millis(), - vm.throttle_percent() + vm.throttle_percent(), + s.current_dirty_pages * 4096 / 1024 / 1024 ); + + // Increment iteration counter + s.iteration += 1; } Ok(iteration_table) @@ -2122,11 +2133,6 @@ impl Vmm { ) -> result::Result<(), MigratableError> { let mem_send = SendAdditionalConnections::new(send_data_migration, &vm.guest_memory())?; - // Start logging dirty pages - vm.start_dirty_log()?; - - mem_send.send_memory(&vm.memory_range_table()?, socket)?; - // Define the maximum allowed downtime 2000 seconds(2000000 milliseconds) const MAX_MIGRATE_DOWNTIME: u64 = 2000000; @@ -2150,6 +2156,8 @@ impl Vmm { ))); } + // Start logging dirty pages + vm.start_dirty_log()?; let iteration_table = Self::memory_copy_iterations( vm, &mem_send, From 08e1456ad82470c10514f800bd52a0da037c052d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 27 Jan 2026 15:42:13 +0100 Subject: [PATCH 4/4] vmm: migration: refactor memory_copy_iterations() + MigrationState This improves the code quality of `struct MigrationState` and 
memory_copy_iterations(), which makes the code significantly easier to maintain. Further, I've added expected downtime calculation and dirty page rate calculation. The new names are much more descriptive. I also removed properties that didn't make sense. These changes have undergone intensive manual testing attended by several colleagues (PascalS, StefanK, SebastianE). For a reviewer, there is currently no easy way to check that things really work. PS: The old struct comes from an external contributor [0]. [0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7033 On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 216 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 149 insertions(+), 67 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 8fba860f68..c4620fa509 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -688,41 +688,81 @@ impl VmmVersionInfo { } } +/// Holds internal metrics about the ongoing migration. +/// +/// It is supposed to be updated on the fly. #[derive(Debug, Clone)] struct MigrationState { - current_dirty_pages: u64, - downtime: Duration, - downtime_start: Instant, + /* ---------------------------------------------- */ + /* Properties that are updated before the first iteration */ + /// The instant where the actual downtime of the VM began. + downtime_start_time: Instant, + /// The instant where the migration began. + migration_start_time: Instant, + + /* ---------------------------------------------- */ + /* Properties that are updated in every iteration */ + /// The iteration number. It is strictly monotonically increasing. iteration: u64, - iteration_cost_time: Duration, + /// The instant where the current iteration began. iteration_start_time: Instant, - mb_per_sec: f64, - pages_per_second: u64, - pending_size: u64, - start_time: Instant, - threshold_size: u64, - total_time: Duration, + /// The duration of the previous iteration. + iteration_duration: Duration, + /// The number of bytes that are to be transmitted in the current iteration. + bytes_to_transmit: u64, + /// `bytes_to_transmit` but as 4K pages. + pages_to_transmit: u64, + /// The instant where the transmission began. + /// This is always after `iteration_start_time`, and the resulting + /// `transmit_duration` is always shorter than `iteration_duration`. + transmit_start_time: Instant, + /// The duration of the transmission. + transmit_duration: Duration, + /// The measured throughput in bytes per second. + bytes_per_sec: f64, + /// The calculated downtime with respect to `bytes_to_transmit` and + /// `bytes_per_sec`. + calculated_downtime_duration: Option<Duration>, + /// Total amount of transferred bytes across all iterations. total_transferred_bytes: u64, - total_transferred_dirty_pages: u64, + /// `total_transferred_bytes` but as 4K pages. + total_transferred_pages: u64, + /// The dirty rate in pages per second (pps). + dirty_rate_pps: u64, + + /* ---------------------------------------------- */ + /* Properties that are updated after the last iteration */ + /// The actual measured downtime from the sender VMM's perspective. + downtime_duration: Duration, + /// Total duration of the migration. + migration_duration: Duration, } impl MigrationState { pub fn new() -> Self { Self { - current_dirty_pages: 0, - downtime: Duration::default(), - downtime_start: Instant::now(), + // Field will be overwritten later. + downtime_start_time: Instant::now(), + // Field will be overwritten later.
+ migration_start_time: Instant::now(), iteration: 0, - iteration_cost_time: Duration::default(), + // Field will be overwritten later. iteration_start_time: Instant::now(), - mb_per_sec: 0.0, - pages_per_second: 0, - pending_size: 0, - start_time: Instant::now(), - threshold_size: 0, - total_time: Duration::default(), + iteration_duration: Duration::default(), + bytes_to_transmit: 0, + pages_to_transmit: 0, + // Field will be overwritten later. + transmit_start_time: Instant::now(), + transmit_duration: Duration::default(), + bytes_per_sec: 0.0, + calculated_downtime_duration: None, total_transferred_bytes: 0, - total_transferred_dirty_pages: 0, + total_transferred_pages: 0, + // Field will be overwritten later. + dirty_rate_pps: 0, + downtime_duration: Duration::default(), + // Field will be overwritten later. + migration_duration: Duration::default(), } } } @@ -2031,14 +2071,17 @@ impl Vmm { migration_timeout: Duration, migrate_downtime_limit: Duration, ) -> result::Result<MemoryRangeTable, MigratableError> { - let mut bandwidth = 0.0; let mut iteration_table; // We loop until we converge (target downtime is achievable). loop { + // Update the start time of the iteration + s.iteration_start_time = Instant::now(); + // Check if migration has timed out // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check - if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { + if !migration_timeout.is_zero() && s.migration_start_time.elapsed() > migration_timeout + { warn!("Migration timed out after {migration_timeout:?}"); Request::abandon().write_to(socket)?; Response::read_from(socket)?.ok_or_abandon( @@ -2047,21 +2090,18 @@ )?; } - // todo: check if auto-converge is enabled at all? + // We always auto-converge. if Self::can_increase_autoconverge_step(s) && vm.throttle_percent() < AUTO_CONVERGE_MAX { let current_throttle = vm.throttle_percent(); let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); - log::info!("Increasing auto-converge: {new_throttle}%"); + info!("Increasing auto-converge: {new_throttle}%"); if new_throttle != current_throttle { vm.set_throttle_percent(new_throttle); } } - // Update the start time of the iteration - s.iteration_start_time = Instant::now(); - // In the first iteration (`0`), we transmit the whole memory. Starting with the // second iteration (`1`), we start the delta transmission. iteration_table = if s.iteration == 0 { @@ -2071,51 +2111,82 @@ }; // Update the pending size (amount of data to transfer) - s.pending_size = iteration_table + s.bytes_to_transmit = iteration_table .regions() .iter() .map(|range| range.length) .sum(); + s.pages_to_transmit = s.bytes_to_transmit.div_ceil(PAGE_SIZE as u64); - // Update thresholds - if bandwidth > 0.0 { - s.threshold_size = bandwidth as u64 * migrate_downtime_limit.as_millis() as u64; - } - - // Enter the final stage of migration when the handover conditions are met - if s.iteration > 0 && s.pending_size <= s.threshold_size { + // Unlikely fast path: nothing is left to transmit. + if s.bytes_to_transmit == 0 { break; } - // Update the number of dirty pages - s.total_transferred_bytes += s.pending_size; - s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); - s.total_transferred_dirty_pages += s.current_dirty_pages; + // Update metrics and exit the loop if the handover conditions are met. + if s.iteration > 0 { + // Refresh the dirty rate: how many pages have been dirtied since the last time we + // fetched the dirty log.
+ if s.iteration_duration > Duration::ZERO { + let dirty_rate_pps_f64 = + s.pages_to_transmit as f64 / (s.iteration_duration.as_secs_f64()); + s.dirty_rate_pps = dirty_rate_pps_f64.ceil() as u64; + } else { + s.dirty_rate_pps = 0; + } + + // Update expected downtime: + // Strictly speaking, this is the time to transmit the last + // memory chunk, not the actual downtime, which will be higher. + let transmission_time_s = if s.bytes_per_sec > 0.0 { + s.bytes_to_transmit as f64 / s.bytes_per_sec + } else { + 0.0 + }; + s.calculated_downtime_duration = Some(Duration::from_secs_f64(transmission_time_s)); + + // Exit the loop when the handover conditions are met + if let Some(downtime) = s.calculated_downtime_duration + && downtime <= migrate_downtime_limit + { + info!("Memory delta transmission stopping - cutoff condition reached!"); + info!( + "iteration:{},remaining:{}MiB,downtime(calc):{}ms,MiB/s:{:.2},throttle:{}%,dirty_rate:{}pps", + s.iteration, + s.bytes_to_transmit / 1024 / 1024, + s.calculated_downtime_duration + .expect("should have calculated downtime by now") + .as_millis(), + s.bytes_per_sec / 1024.0 / 1024.0, + vm.throttle_percent(), + s.dirty_rate_pps + ); + break; + } + } // Send the current dirty pages - let transfer_start = Instant::now(); + s.transmit_start_time = Instant::now(); mem_send.send_memory(&iteration_table, socket)?; - let transfer_time = transfer_start.elapsed().as_millis() as f64; + s.transmit_duration = s.transmit_start_time.elapsed(); + + s.total_transferred_bytes += s.bytes_to_transmit; + s.total_transferred_pages += s.pages_to_transmit; // Update bandwidth - if transfer_time > 0.0 && s.pending_size > 0 { - bandwidth = s.pending_size as f64 / transfer_time; - // Convert bandwidth to MB/s - s.mb_per_sec = (bandwidth * 1000.0) / (1024.0 * 1024.0); + if s.transmit_duration > Duration::ZERO && s.bytes_to_transmit > 0 { + s.bytes_per_sec = s.bytes_to_transmit as f64 / s.transmit_duration.as_secs_f64(); } - // Update iteration cost time - s.iteration_cost_time = s.iteration_start_time.elapsed(); - if s.iteration_cost_time.as_millis() > 0 { - s.pages_per_second = - s.current_dirty_pages * 1000 / s.iteration_cost_time.as_millis() as u64; - } - debug!( - "iteration {}: cost={}ms, throttle={}%, transmitted={}MiB", + s.iteration_duration = s.iteration_start_time.elapsed(); + info!( + "iteration:{},cost={}ms,throttle={}%,transmitted={}MiB,dirty_rate={}pps,MiB/s={:.2}", s.iteration, - s.iteration_cost_time.as_millis(), + s.iteration_duration.as_millis(), vm.throttle_percent(), - s.current_dirty_pages * 4096 / 1024 / 1024 + s.bytes_to_transmit / 1024 / 1024, + s.dirty_rate_pps, + s.bytes_per_sec / 1024.0 / 1024.0 ); // Increment iteration counter @@ -2168,11 +2239,11 @@ )?; info!("Entering downtime phase"); - s.downtime_start = Instant::now(); + s.downtime_start_time = Instant::now(); // End throttle thread - info!("stopping vcpu thread"); + info!("stopping vcpu throttling thread"); vm.stop_vcpu_throttling(); - info!("stopped vcpu thread"); + info!("stopped vcpu throttling thread"); info!("pausing VM"); vm.pause()?; info!("paused VM"); @@ -2181,11 +2252,17 @@ let mut final_table = vm.dirty_log()?; final_table.extend(iteration_table.clone()); mem_send.send_memory(&final_table, socket)?; + // Update statistics - s.pending_size = final_table.regions().iter().map(|range| range.length).sum(); - s.total_transferred_bytes += s.pending_size; - s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); - s.total_transferred_dirty_pages +=
s.current_dirty_pages; + s.bytes_to_transmit = final_table.regions().iter().map(|range| range.length).sum(); + s.pages_to_transmit = s.bytes_to_transmit.div_ceil(PAGE_SIZE as u64); + s.total_transferred_bytes += s.bytes_to_transmit; + s.total_transferred_pages += s.pages_to_transmit; + + info!( + "Memory migration finished: transmitted {} bytes in total", + s.total_transferred_bytes + ); // Stop logging dirty pages vm.stop_dirty_log()?; @@ -2334,7 +2411,7 @@ )?; // Record downtime - s.downtime = s.downtime_start.elapsed(); + s.downtime_duration = s.downtime_start_time.elapsed(); // Stop logging dirty pages if !send_data_migration.local { } // Record total migration time - s.total_time = s.start_time.elapsed(); + s.migration_duration = s.migration_start_time.elapsed(); - info!("Migration complete"); + info!( + "Migration complete: downtime: {:.3}s, total: {:.1}s, iterations: {}", + s.downtime_duration.as_secs_f64(), + s.migration_duration.as_secs_f64(), + s.iteration, + ); // Let every Migratable object know about the migration being complete vm.complete_migration()