Skip to content

Commit d8fd2bc

Browse files
committed
reuse a buffer for read/write-through-propolis-heap
This seems to have some effect, but not a lot: there is no improvement when I/Os are all the same size, and on the order of 5% better throughput when I/O sizes vary between 4 KiB, 32 KiB, and 256 KiB (and sizes in between).
1 parent 2dc6437 commit d8fd2bc

2 files changed

Lines changed: 86 additions & 24 deletions

File tree

lib/propolis/src/block/file.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,24 @@ impl SharedState {
5959
}
6060

6161
fn processing_loop(&self, wctx: SyncWorkerCtx) {
62+
// Conspire with gross hack in `preadv()` and `pwritev()`: there, we
63+
// want to give the OS a buffer backed by Propolis heap, rather than a
64+
// VMM segment, to avoid a slow fallback to lock pages in memory during
65+
// the I/O. We could either allocate buffers on-demand sized for the
66+
// I/O, or up front in the worker thread, here. On-demand allocation
67+
// works, but when I/O sizes are mixed the high malloc/free rates
68+
// (X00k/sec/process) get a bit busy on locks in libumem. So instead,
69+
// retain a buffer across I/Os, which is passed into `process_request()`
70+
// and on.
71+
//
72+
// This buffer is grown on-demand when handling an I/O because some
73+
// guests' maximum I/O size seems to be smaller than maximums we
74+
// indicate (for example, Linux seems to send NVMe I/Os 256 KiB or
75+
// smaller, even though we report we'll tolerate up to 1 MiB). In those
76+
// cases, sizing against an expected maximum will grossly oversize
77+
// buffers and increase memory pressure for no good reason.
78+
let mut scratch = Vec::new();
79+
6280
while let Some(dreq) = wctx.block_for_req() {
6381
let req = dreq.req();
6482
if self.info.read_only && req.op.is_write() {
@@ -74,7 +92,8 @@ impl SharedState {
7492
dreq.complete(block::Result::Failure);
7593
continue;
7694
};
77-
let res = match self.process_request(&req, &mem) {
95+
let res = match self.process_request(&req, &mem, Some(&mut scratch))
96+
{
7897
Ok(_) => block::Result::Success,
7998
Err(_) => block::Result::Failure,
8099
};
@@ -86,13 +105,14 @@ impl SharedState {
86105
&self,
87106
req: &block::Request,
88107
mem: &MemCtx,
108+
scratch: Option<&mut Vec<u8>>,
89109
) -> std::result::Result<(), &'static str> {
90110
match req.op {
91111
block::Operation::Read(off, len) => {
92112
let maps = req.mappings(mem).ok_or("mapping unavailable")?;
93113

94114
let nbytes = maps
95-
.preadv(self.fp.as_raw_fd(), off as i64)
115+
.preadv(self.fp.as_raw_fd(), off as i64, scratch)
96116
.map_err(|_| "io error")?;
97117
if nbytes != len {
98118
return Err("bad read length");
@@ -102,7 +122,7 @@ impl SharedState {
102122
let maps = req.mappings(mem).ok_or("bad guest region")?;
103123

104124
let nbytes = maps
105-
.pwritev(self.fp.as_raw_fd(), off as i64)
125+
.pwritev(self.fp.as_raw_fd(), off as i64, scratch)
106126
.map_err(|_| "io error")?;
107127
if nbytes != len {
108128
return Err("bad write length");

lib/propolis/src/vmm/mem.rs

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -748,10 +748,20 @@ unsafe impl Sync for SubMapping<'_> {}
748748

749749
pub trait MappingExt {
750750
/// preadv from `file` into multiple mappings
751-
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize>;
751+
fn preadv(
752+
&self,
753+
fd: RawFd,
754+
offset: i64,
755+
scratch: Option<&mut Vec<u8>>,
756+
) -> Result<usize>;
752757

753758
/// pwritev from multiple mappings to `file`
754-
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
759+
fn pwritev(
760+
&self,
761+
fd: RawFd,
762+
offset: i64,
763+
scratch: Option<&mut Vec<u8>>,
764+
) -> Result<usize>;
755765
}
756766

757767
// Gross hack alert: since the mappings below are memory regions backed by
@@ -792,8 +802,45 @@ fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
792802
})
793803
}
794804

805+
/// Ensure the maybe-provided buffer is ready for I/O of `desired_size`.
806+
///
807+
/// Returns `Some` with a slice with length at least as large as `desired_size`,
808+
/// if the desired size is in the range we're willing to use the I/O buffer
809+
/// proxying workaround.
810+
///
811+
/// If the scratch buffer is provided, was too small, but the desired size is
812+
/// acceptable, this function will grow the scratch buffer.
813+
fn prepare_scratch_buffer(
814+
scratch: Option<&mut Vec<u8>>,
815+
desired_size: usize,
816+
) -> Option<&mut [u8]> {
817+
scratch.and_then(|scratch| {
818+
if desired_size > MAPPING_IO_LIMIT_BYTES {
819+
// Even though the caller provided a buffer, we're not willing to
820+
// grow it to handle this I/O.
821+
return None;
822+
}
823+
824+
if scratch.len() < desired_size {
825+
// The provided buffer is smaller than necessary, but the needed
826+
// size is acceptable, so grow the buffer.
827+
//
828+
// Buffer growth is done lazily because experience shows that guests
829+
// may submit I/Os much smaller than the largest permitted by MDTS.
830+
scratch.resize(desired_size, 0);
831+
}
832+
833+
Some(scratch.as_mut_slice())
834+
})
835+
}
836+
795837
impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
796-
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
838+
fn preadv(
839+
&self,
840+
fd: RawFd,
841+
offset: i64,
842+
scratch: Option<&mut Vec<u8>>,
843+
) -> Result<usize> {
797844
if !self
798845
.as_ref()
799846
.iter()
@@ -808,16 +855,11 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
808855
let total_capacity = total_mapping_size(self.as_ref())?;
809856

810857
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
811-
if total_capacity <= MAPPING_IO_LIMIT_BYTES {
812-
// If we're motivated to avoid the zero-fill via
813-
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
814-
// probably avoid this gross hack entirely (see comment on
815-
// MAPPING_IO_LIMIT_BYTES).
816-
let mut buf = vec![0; total_capacity];
817-
858+
let scratch = prepare_scratch_buffer(scratch, total_capacity);
859+
if let Some(buf) = scratch {
818860
let iov = [iovec {
819861
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
820-
iov_len: buf.len(),
862+
iov_len: total_capacity,
821863
}];
822864

823865
let res = unsafe {
@@ -883,7 +925,12 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
883925
}
884926
}
885927

886-
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
928+
fn pwritev(
929+
&self,
930+
fd: RawFd,
931+
offset: i64,
932+
scratch: Option<&mut Vec<u8>>,
933+
) -> Result<usize> {
887934
if !self
888935
.as_ref()
889936
.iter()
@@ -898,14 +945,9 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
898945
let total_capacity = total_mapping_size(self.as_ref())?;
899946

900947
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
901-
let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
902-
// If we're motivated to avoid the zero-fill via
903-
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
904-
// probably avoid this gross hack entirely (see comment on
905-
// MAPPING_IO_LIMIT_BYTES).
906-
let mut buf = vec![0; total_capacity];
907-
908-
let mut remaining = buf.as_mut_slice();
948+
let scratch = prepare_scratch_buffer(scratch, total_capacity);
949+
let written = if let Some(buf) = scratch {
950+
let mut remaining = &mut buf[..];
909951
for mapping in self.as_ref().iter() {
910952
// Safety: guest physical memory is READ|WRITE, and won't become
911953
// unmapped as long as we hold a Mapping, which `self` implies.
@@ -928,7 +970,7 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
928970

929971
let iovs = [iovec {
930972
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
931-
iov_len: buf.len(),
973+
iov_len: total_capacity,
932974
}];
933975

934976
unsafe {

0 commit comments

Comments
 (0)