Skip to content

Commit b1ef353

Browse files
committed
feat: virtualize /proc/loadavg with per-sandbox EWMA tracking
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent d54570c commit b1ef353

3 files changed

Lines changed: 136 additions & 1 deletion

File tree

crates/sandlock-core/src/procfs.rs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,51 @@ pub(crate) fn generate_uptime(elapsed_secs: f64) -> Vec<u8> {
6868
format!("{:.2} 0.00\n", elapsed_secs.max(0.0)).into_bytes()
6969
}
7070

71+
// ============================================================
72+
// /proc/loadavg generator + EWMA tracker
73+
// ============================================================
74+
75+
/// Per-sandbox load-average tracker built on exponentially weighted moving
/// averages (EWMA), following the update rule of the Linux kernel
/// (kernel/sched/loadavg.c).
///
/// Holds the 1-, 5- and 15-minute averages. The decay constants below assume
/// that [`LoadAvg::sample`] is invoked once every 5 seconds.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LoadAvg {
    /// 1-minute load average.
    pub avg_1: f64,
    /// 5-minute load average.
    pub avg_5: f64,
    /// 15-minute load average.
    pub avg_15: f64,
}

// Per-sample decay factors for a 5-second sampling period:
// e^(-5/60), e^(-5/300), e^(-5/900).
const EXP_1: f64 = 0.9200444146293232; // e^(-1/12)
const EXP_5: f64 = 0.9834714538216174; // e^(-1/60)
const EXP_15: f64 = 0.9944598480048967; // e^(-1/180)

impl LoadAvg {
    /// Creates a tracker with all three averages at zero.
    ///
    /// Equivalent to [`LoadAvg::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Folds one sample of the current runnable process count into all three
    /// averages.
    ///
    /// Intended to be called every 5 seconds by the sampling task; the decay
    /// factors are only correct for that cadence.
    pub fn sample(&mut self, running: u32) {
        // u32 -> f64 is lossless, so `From` is used instead of an `as` cast.
        let r = f64::from(running);
        self.avg_1 = Self::decay(self.avg_1, EXP_1, r);
        self.avg_5 = Self::decay(self.avg_5, EXP_5, r);
        self.avg_15 = Self::decay(self.avg_15, EXP_15, r);
    }

    /// One EWMA step: keep `factor` of the old average and blend in the
    /// remaining `1 - factor` share of the new sample.
    fn decay(avg: f64, factor: f64, sample: f64) -> f64 {
        avg * factor + sample * (1.0 - factor)
    }
}
103+
104+
/// Generate /proc/loadavg from tracked EWMA values.
105+
/// Format: "avg1 avg5 avg15 running/total last_pid\n"
106+
pub(crate) fn generate_loadavg(load: &LoadAvg, running: u32, total: u32, last_pid: i32) -> Vec<u8> {
107+
format!(
108+
"{:.2} {:.2} {:.2} {}/{} {}\n",
109+
load.avg_1, load.avg_5, load.avg_15,
110+
running.max(1).min(total), total,
111+
last_pid.max(0),
112+
)
113+
.into_bytes()
114+
}
115+
71116
// /proc/meminfo generator
72117
// ============================================================
73118

@@ -243,6 +288,16 @@ pub(crate) async fn handle_proc_open(
243288
return inject_memfd(&content);
244289
}
245290

291+
// Virtualize /proc/loadavg when proc virtualization is active.
292+
if path == "/proc/loadavg" {
293+
let st = state.lock().await;
294+
let total = st.proc_pids.len() as u32;
295+
let running = st.proc_count;
296+
let last_pid = st.proc_pids.iter().max().copied().unwrap_or(0);
297+
let content = generate_loadavg(&st.load_avg, running, total, last_pid);
298+
return inject_memfd(&content);
299+
}
300+
246301
// Virtualize /proc/net/tcp and /proc/net/tcp6 when port_remap is active.
247302
if policy.port_remap && (path == "/proc/net/tcp" || path == "/proc/net/tcp6") {
248303
let is_v6 = path.ends_with('6');
@@ -703,6 +758,59 @@ mod tests {
703758
assert!(text.starts_with("0.00"));
704759
}
705760

761+
#[test]
fn test_loadavg_ewma() {
    // A fresh tracker reports zero on every horizon.
    let mut tracker = LoadAvg::new();
    assert_eq!(tracker.avg_1, 0.0);
    assert_eq!(tracker.avg_5, 0.0);
    assert_eq!(tracker.avg_15, 0.0);

    // Twelve samples at the 5-second cadence = 60s with 4 runnable processes.
    (0..12).for_each(|_| tracker.sample(4));

    // Shorter horizons react faster, so the averages are strictly ordered.
    assert!(tracker.avg_1 > tracker.avg_5);
    assert!(tracker.avg_5 > tracker.avg_15);
    // After a full minute at load 4 the 1-minute figure is well above zero.
    assert!(tracker.avg_1 > 2.0);
}
778+
#[test]
fn test_loadavg_ewma_decay() {
    let mut tracker = LoadAvg::new();

    // Phase 1: sixty samples under a steady load of 10.
    (0..60).for_each(|_| tracker.sample(10));
    let peak = tracker.avg_1;

    // Phase 2: sixty samples with nothing runnable.
    (0..60).for_each(|_| tracker.sample(0));

    assert!(tracker.avg_1 < peak * 0.1, "1-min avg should decay quickly");
}
792+
793+
#[test]
fn test_generate_loadavg() {
    let la = LoadAvg { avg_1: 1.23, avg_5: 0.45, avg_15: 0.12 };
    let info = generate_loadavg(&la, 3, 10, 42);
    let text = String::from_utf8(info).unwrap();
    // Pin the whole line: substring checks would not catch a wrong field
    // order, a wrong separator, or a missing trailing newline.
    assert_eq!(text, "1.23 0.45 0.12 3/10 42\n");
}
804+
805+
#[test]
fn test_generate_loadavg_zero_procs() {
    let la = LoadAvg::new();
    let info = generate_loadavg(&la, 0, 0, 0);
    let text = String::from_utf8(info).unwrap();
    // With no processes the running count is floored at 1 and then capped at
    // total (0), so the line reports 0/0. Pin the full output rather than a
    // substring.
    assert_eq!(text, "0.00 0.00 0.00 0/0 0\n");
}
813+
706814
#[test]
707815
fn test_build_dirent64() {
708816
let entry = build_dirent64(12345, 1, DT_DIR, "1234");

crates/sandlock-core/src/sandbox.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ pub struct Sandbox {
7474
pidfd: Option<OwnedFd>,
7575
notif_handle: Option<JoinHandle<()>>,
7676
throttle_handle: Option<JoinHandle<()>>,
77+
loadavg_handle: Option<JoinHandle<()>>,
7778
/// Capture pipe read ends — kept alive so the child doesn't get SIGPIPE.
7879
_stdout_read: Option<OwnedFd>,
7980
_stderr_read: Option<OwnedFd>,
@@ -134,6 +135,7 @@ impl Sandbox {
134135
pidfd: None,
135136
notif_handle: None,
136137
throttle_handle: None,
138+
loadavg_handle: None,
137139
_stdout_read: None,
138140
_stderr_read: None,
139141
cow_branch: None,
@@ -444,6 +446,9 @@ impl Sandbox {
444446
if let Some(h) = self.throttle_handle.take() {
445447
h.abort();
446448
}
449+
if let Some(h) = self.loadavg_handle.take() {
450+
h.abort();
451+
}
447452

448453
// Extract seccomp COW branch while we're still in async context
449454
// (can properly .lock().await the tokio Mutex). This avoids the
@@ -929,10 +934,27 @@ impl Sandbox {
929934
let sup_state = Arc::new(Mutex::new(sup_state));
930935
self.supervisor_state = Some(Arc::clone(&sup_state));
931936

937+
let has_proc_virt = notif_policy.has_proc_virt;
938+
932939
// Spawn notif supervisor
933940
self.notif_handle = Some(tokio::spawn(
934-
notif::supervisor(notif_fd, notif_policy, sup_state),
941+
notif::supervisor(notif_fd, notif_policy, Arc::clone(&sup_state)),
935942
));
943+
944+
// Spawn load average sampling task (every 5s, like the kernel)
945+
if has_proc_virt {
946+
let la_state = Arc::clone(&sup_state);
947+
self.loadavg_handle = Some(tokio::spawn(async move {
948+
let mut interval = tokio::time::interval(Duration::from_secs(5));
949+
interval.tick().await; // skip immediate first tick
950+
loop {
951+
interval.tick().await;
952+
let mut st = la_state.lock().await;
953+
let running = st.proc_count;
954+
st.load_avg.sample(running);
955+
}
956+
}));
957+
}
936958
}
937959

938960
// 15. Optionally spawn CPU throttle task
@@ -976,6 +998,9 @@ impl Drop for Sandbox {
976998
if let Some(h) = self.throttle_handle.take() {
977999
h.abort();
9781000
}
1001+
if let Some(h) = self.loadavg_handle.take() {
1002+
h.abort();
1003+
}
9791004

9801005
// COW cleanup based on exit status.
9811006
// Determine action once, then apply to whichever branch exists.

crates/sandlock-core/src/seccomp/notif.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ pub enum NetworkPolicy {
6464
/// Runtime state shared across notification handlers.
6565
pub struct SupervisorState {
6666
pub start_instant: std::time::Instant,
67+
pub load_avg: crate::procfs::LoadAvg,
6768
pub mem_used: u64,
6869
pub brk_bases: HashMap<i32, u64>,
6970
pub proc_count: u32,
@@ -118,6 +119,7 @@ impl SupervisorState {
118119
) -> Self {
119120
Self {
120121
start_instant: std::time::Instant::now(),
122+
load_avg: crate::procfs::LoadAvg::new(),
121123
mem_used: 0,
122124
brk_bases: HashMap::new(),
123125
proc_count: 0,

0 commit comments

Comments
 (0)