@@ -688,41 +688,81 @@ impl VmmVersionInfo {
688688 }
689689}
690690
691+ /// Holds internal metrics about the ongoing migration.
692+ ///
693+ /// It is updated on the fly as the migration progresses.
691694#[ derive( Debug , Clone ) ]
692695struct MigrationState {
693- current_dirty_pages : u64 ,
694- downtime : Duration ,
695- downtime_start : Instant ,
696+ /* ---------------------------------------------- */
697+ /* Properties that are updated before the first iteration */
698+ /// The instant where the actual downtime of the VM began.
699+ downtime_start_time : Instant ,
700+ /// The instant where the migration began.
701+ migration_start_time : Instant ,
702+
703+ /* ---------------------------------------------- */
704+ /* Properties that are updated in every iteration */
705+ /// The iteration number. It is strictly monotonically increasing.
696706 iteration : u64 ,
697- iteration_cost_time : Duration ,
707+ /// The instant where the current iteration began.
698708 iteration_start_time : Instant ,
699- mb_per_sec : f64 ,
700- pages_per_second : u64 ,
701- pending_size : u64 ,
702- start_time : Instant ,
703- threshold_size : u64 ,
704- total_time : Duration ,
709+ /// The duration of the previous iteration.
710+ iteration_duration : Duration ,
711+ /// The number of bytes that are to be transmitted in the current iteration.
712+ bytes_to_transmit : u64 ,
713+ /// `bytes_to_transmit` but as 4K pages.
714+ pages_to_transmit : u64 ,
715+ /// The instant where the transmission began.
716+ /// This is at or after `iteration_start_time`, and the resulting
717+ /// `transmit_duration` is always shorter than `iteration_duration`.
718+ transmit_start_time : Instant ,
719+ /// The duration of the transmission.
720+ transmit_duration : Duration ,
721+ /// The measured throughput in bytes per sec.
722+ bytes_per_sec : f64 ,
723+ /// The calculated downtime with respect to `bytes_to_transmit` and
724+ /// `bytes_per_sec`.
725+ calculated_downtime_duration : Option < Duration > ,
726+ /// Total amount of transferred bytes across all iterations.
705727 total_transferred_bytes : u64 ,
706- total_transferred_dirty_pages : u64 ,
728+ /// `total_transferred_bytes` but as 4K pages.
729+ total_transferred_pages : u64 ,
730+ /// The dirty rate in pages per second (pps).
731+ dirty_rate_pps : u64 ,
732+
733+ /* ---------------------------------------------- */
734+ /* Properties that are updated after the last iteration */
735+ /// The actual measured downtime from the sender VMM perspective.
736+ downtime_duration : Duration ,
737+ /// Total duration of the migration.
738+ migration_duration : Duration ,
707739}
708740
709741impl MigrationState {
710742 pub fn new ( ) -> Self {
711743 Self {
712- current_dirty_pages : 0 ,
713- downtime : Duration :: default ( ) ,
714- downtime_start : Instant :: now ( ) ,
744+ // Field will be overwritten later.
745+ downtime_start_time : Instant :: now ( ) ,
746+ // Field will be overwritten later.
747+ migration_start_time : Instant :: now ( ) ,
715748 iteration : 0 ,
716- iteration_cost_time : Duration :: default ( ) ,
749+ // Field will be overwritten later.
717750 iteration_start_time : Instant :: now ( ) ,
718- mb_per_sec : 0.0 ,
719- pages_per_second : 0 ,
720- pending_size : 0 ,
721- start_time : Instant :: now ( ) ,
722- threshold_size : 0 ,
723- total_time : Duration :: default ( ) ,
751+ iteration_duration : Duration :: default ( ) ,
752+ bytes_to_transmit : 0 ,
753+ pages_to_transmit : 0 ,
754+ // Field will be overwritten later.
755+ transmit_start_time : Instant :: now ( ) ,
756+ transmit_duration : Duration :: default ( ) ,
757+ bytes_per_sec : 0.0 ,
758+ calculated_downtime_duration : None ,
724759 total_transferred_bytes : 0 ,
725- total_transferred_dirty_pages : 0 ,
760+ total_transferred_pages : 0 ,
761+ // Field will be overwritten later.
762+ dirty_rate_pps : 0 ,
763+ downtime_duration : Duration :: default ( ) ,
764+ // Field will be overwritten later.
765+ migration_duration : Duration :: default ( ) ,
726766 }
727767 }
728768}
@@ -2031,14 +2071,17 @@ impl Vmm {
20312071 migration_timeout : Duration ,
20322072 migrate_downtime_limit : Duration ,
20332073 ) -> result:: Result < MemoryRangeTable , MigratableError > {
2034- let mut bandwidth = 0.0 ;
20352074 let mut iteration_table;
20362075
20372076 // We loop until we converge (target downtime is achievable).
20382077 loop {
2078+ // Update the start time of the iteration
2079+ s. iteration_start_time = Instant :: now ( ) ;
2080+
20392081 // Check if migration has timed out
20402082 // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check
2041- if !migration_timeout. is_zero ( ) && s. start_time . elapsed ( ) > migration_timeout {
2083+ if !migration_timeout. is_zero ( ) && s. migration_start_time . elapsed ( ) > migration_timeout
2084+ {
20422085 warn ! ( "Migration timed out after {migration_timeout:?}" ) ;
20432086 Request :: abandon ( ) . write_to ( socket) ?;
20442087 Response :: read_from ( socket) ?. ok_or_abandon (
@@ -2047,21 +2090,18 @@ impl Vmm {
20472090 ) ?;
20482091 }
20492092
2050- // todo: check if auto-converge is enabled at all?
2093+ // We always autoconverge.
20512094 if Self :: can_increase_autoconverge_step ( s) && vm. throttle_percent ( ) < AUTO_CONVERGE_MAX
20522095 {
20532096 let current_throttle = vm. throttle_percent ( ) ;
20542097 let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE ;
20552098 let new_throttle = std:: cmp:: min ( new_throttle, AUTO_CONVERGE_MAX ) ;
2056- log :: info!( "Increasing auto-converge: {new_throttle}%" ) ;
2099+ info ! ( "Increasing auto-converge: {new_throttle}%" ) ;
20572100 if new_throttle != current_throttle {
20582101 vm. set_throttle_percent ( new_throttle) ;
20592102 }
20602103 }
20612104
2062- // Update the start time of the iteration
2063- s. iteration_start_time = Instant :: now ( ) ;
2064-
20652105 // In the first iteration (`0`), we transmit the whole memory. Starting with the
20662106 // second iteration (`1`), we start the delta transmission.
20672107 iteration_table = if s. iteration == 0 {
@@ -2071,51 +2111,82 @@ impl Vmm {
20712111 } ;
20722112
20732113 // Update the pending size (amount of data to transfer)
2074- s. pending_size = iteration_table
2114+ s. bytes_to_transmit = iteration_table
20752115 . regions ( )
20762116 . iter ( )
20772117 . map ( |range| range. length )
20782118 . sum ( ) ;
2119+ s. pages_to_transmit = s. bytes_to_transmit . div_ceil ( PAGE_SIZE as u64 ) ;
20792120
2080- // Update thresholds
2081- if bandwidth > 0.0 {
2082- s. threshold_size = bandwidth as u64 * migrate_downtime_limit. as_millis ( ) as u64 ;
2083- }
2084-
2085- // Enter the final stage of migration when the handover conditions are met
2086- if s. iteration > 0 && s. pending_size <= s. threshold_size {
2121+ // Unlikely happy-path.
2122+ if s. bytes_to_transmit == 0 {
20872123 break ;
20882124 }
20892125
2090- // Update the number of dirty pages
2091- s. total_transferred_bytes += s. pending_size ;
2092- s. current_dirty_pages = s. pending_size . div_ceil ( PAGE_SIZE as u64 ) ;
2093- s. total_transferred_dirty_pages += s. current_dirty_pages ;
2126+ // Update metrics and exit loop, if conditions are met.
2127+ if s. iteration > 0 {
2128+ // Refresh dirty rate: How many pages have been dirtied since the last time we
2129+ // fetched the dirty log.
2130+ if s. iteration_duration > Duration :: ZERO {
2131+ let dirty_rate_pps_f64 =
2132+ s. pages_to_transmit as f64 / ( s. iteration_duration . as_secs_f64 ( ) ) ;
2133+ s. dirty_rate_pps = dirty_rate_pps_f64. ceil ( ) as u64 ;
2134+ } else {
2135+ s. dirty_rate_pps = 0 ;
2136+ }
2137+
2138+ // Update expected downtime:
2139+ // Strictly speaking, this is the time to transmit the last
2140+ // memory chunk, not the actual downtime, which will be higher.
2141+ let transmission_time_s = if s. bytes_per_sec > 0.0 {
2142+ s. bytes_to_transmit as f64 / s. bytes_per_sec
2143+ } else {
2144+ 0.0
2145+ } ;
2146+ s. calculated_downtime_duration = Some ( Duration :: from_secs_f64 ( transmission_time_s) ) ;
2147+
2148+ // Exit the loop, when the handover conditions are met
2149+ if let Some ( downtime) = s. calculated_downtime_duration
2150+ && downtime <= migrate_downtime_limit
2151+ {
2152+ info ! ( "Memory delta transmission stopping - cutoff condition reached!" ) ;
2153+ info ! (
2154+ "iteration:{},remaining:{}MiB,downtime(calc):{}ms,mebibyte/s:{:.2},throttle:{}%,dirty_rate:{}pps" ,
2155+ s. iteration,
2156+ s. bytes_to_transmit / 1024 / 1024 ,
2157+ s. calculated_downtime_duration
2158+ . expect( "should have calculated downtime by now" )
2159+ . as_millis( ) ,
2160+ s. bytes_per_sec / 1024.0 / 1024.0 ,
2161+ vm. throttle_percent( ) ,
2162+ s. dirty_rate_pps
2163+ ) ;
2164+ break ;
2165+ }
2166+ }
20942167
20952168 // Send the current dirty pages
2096- let transfer_start = Instant :: now ( ) ;
2169+ s . transmit_start_time = Instant :: now ( ) ;
20972170 mem_send. send_memory ( & iteration_table, socket) ?;
2098- let transfer_time = transfer_start. elapsed ( ) . as_millis ( ) as f64 ;
2171+ s. transmit_duration = s. transmit_start_time . elapsed ( ) ;
2172+
2173+ s. total_transferred_bytes += s. bytes_to_transmit ;
2174+ s. total_transferred_pages += s. pages_to_transmit ;
20992175
21002176 // Update bandwidth
2101- if transfer_time > 0.0 && s. pending_size > 0 {
2102- bandwidth = s. pending_size as f64 / transfer_time;
2103- // Convert bandwidth to MB/s
2104- s. mb_per_sec = ( bandwidth * 1000.0 ) / ( 1024.0 * 1024.0 ) ;
2177+ if s. transmit_duration > Duration :: ZERO && s. bytes_to_transmit > 0 {
2178+ s. bytes_per_sec = s. bytes_to_transmit as f64 / s. transmit_duration . as_secs_f64 ( ) ;
21052179 }
21062180
2107- // Update iteration cost time
2108- s. iteration_cost_time = s. iteration_start_time . elapsed ( ) ;
2109- if s. iteration_cost_time . as_millis ( ) > 0 {
2110- s. pages_per_second =
2111- s. current_dirty_pages * 1000 / s. iteration_cost_time . as_millis ( ) as u64 ;
2112- }
2113- debug ! (
2114- "iteration {}: cost={}ms, throttle={}%, transmitted={}MiB" ,
2181+ s. iteration_duration = s. iteration_start_time . elapsed ( ) ;
2182+ info ! (
2183+ "iteration:{},cost={}ms,throttle={}%,transmitted={}MiB,dirty_rate={}pps,Mebibyte/s={:.2}" ,
21152184 s. iteration,
2116- s. iteration_cost_time . as_millis( ) ,
2185+ s. iteration_duration . as_millis( ) ,
21172186 vm. throttle_percent( ) ,
2118- s. current_dirty_pages * 4096 / 1024 / 1024
2187+ s. bytes_to_transmit / 1024 / 1024 ,
2188+ s. dirty_rate_pps,
2189+ s. bytes_per_sec / 1024.0 / 1024.0
21192190 ) ;
21202191
21212192 // Increment iteration counter
@@ -2168,11 +2239,11 @@ impl Vmm {
21682239 ) ?;
21692240
21702241 info ! ( "Entering downtime phase" ) ;
2171- s. downtime_start = Instant :: now ( ) ;
2242+ s. downtime_start_time = Instant :: now ( ) ;
21722243 // End throttle thread
2173- info ! ( "stopping vcpu thread" ) ;
2244+ info ! ( "stopping vcpu throttling thread" ) ;
21742245 vm. stop_vcpu_throttling ( ) ;
2175- info ! ( "stopped vcpu thread" ) ;
2246+ info ! ( "stopped vcpu throttling thread" ) ;
21762247 info ! ( "pausing VM" ) ;
21772248 vm. pause ( ) ?;
21782249 info ! ( "paused VM" ) ;
@@ -2181,11 +2252,17 @@ impl Vmm {
21812252 let mut final_table = vm. dirty_log ( ) ?;
21822253 final_table. extend ( iteration_table. clone ( ) ) ;
21832254 mem_send. send_memory ( & final_table, socket) ?;
2255+
21842256 // Update statistics
2185- s. pending_size = final_table. regions ( ) . iter ( ) . map ( |range| range. length ) . sum ( ) ;
2186- s. total_transferred_bytes += s. pending_size ;
2187- s. current_dirty_pages = s. pending_size . div_ceil ( PAGE_SIZE as u64 ) ;
2188- s. total_transferred_dirty_pages += s. current_dirty_pages ;
2257+ s. bytes_to_transmit = final_table. regions ( ) . iter ( ) . map ( |range| range. length ) . sum ( ) ;
2258+ s. pages_to_transmit = s. bytes_to_transmit . div_ceil ( PAGE_SIZE as u64 ) ;
2259+ s. total_transferred_bytes += s. bytes_to_transmit ;
2260+ s. total_transferred_pages += s. pages_to_transmit ;
2261+
2262+ info ! (
2263+ "Memory Migration finished: transmitted {} bytes in total" ,
2264+ s. total_transferred_bytes
2265+ ) ;
21892266
21902267 // Stop logging dirty pages
21912268 vm. stop_dirty_log ( ) ?;
@@ -2334,17 +2411,22 @@ impl Vmm {
23342411 ) ?;
23352412
23362413 // Record downtime
2337- s. downtime = s. downtime_start . elapsed ( ) ;
2414+ s. downtime_duration = s. downtime_start_time . elapsed ( ) ;
23382415
23392416 // Stop logging dirty pages
23402417 if !send_data_migration. local {
23412418 vm. stop_dirty_log ( ) ?;
23422419 }
23432420
23442421 // Record total migration time
2345- s. total_time = s. start_time . elapsed ( ) ;
2422+ s. migration_duration = s. migration_start_time . elapsed ( ) ;
23462423
2347- info ! ( "Migration complete" ) ;
2424+ info ! (
2425+ "Migration complete: downtime: {:.3}s, total: {:.1}s, iterations: {}" ,
2426+ s. downtime_duration. as_secs_f64( ) ,
2427+ s. migration_duration. as_secs_f64( ) ,
2428+ s. iteration,
2429+ ) ;
23482430
23492431 // Let every Migratable object know about the migration being complete
23502432 vm. complete_migration ( )
0 commit comments