Skip to content

Commit 44c0ce8

Browse files
core: Add CPU and I/O resource limits to run-service
Extend the service resource limitation system beyond memory to include:
- CPU limits via cgroups v2 cpu.max (percentage of cores)
- I/O bandwidth limits via cgroups v2 io.max (read/write MB/s)

Service tuple format updated to: NAME,MEMORY_MB,CPU_PERCENT,IO_READ_MBPS,IO_WRITE_MBPS,COMMAND

Environment variables to disable limits:
- BLUEOS_DISABLE_RESOURCE_LIMITS: disables all limits
- BLUEOS_DISABLE_MEMORY_LIMIT: disables memory limit
- BLUEOS_DISABLE_CPU_LIMIT: disables CPU limit
- BLUEOS_DISABLE_IO_LIMIT: disables I/O limits
1 parent 9abca66 commit 44c0ce8

2 files changed

Lines changed: 170 additions & 65 deletions

File tree

core/run-service.sh

Lines changed: 106 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22

33
service_name=$1
44
service_command=$2
5-
memory_limit_mb=$3
5+
memory_limit_mb=${3:-0}
6+
cpu_limit_percent=${4:-0}
7+
io_read_mbps=${5:-0}
8+
io_write_mbps=${6:-0}
9+
610
memory_limit_bytes=$((memory_limit_mb * 1024 * 1024))
711
LOG_FILE="/var/logs/blueos/run-service.log"
812

@@ -11,8 +15,68 @@ CHILD_CGROUP="/sys/fs/cgroup/$DOCKER_CGROUP/$service_name"
1115
# Create a new cgroup for the service
1216
mkdir -p "$CHILD_CGROUP"
1317

14-
# Set memory limit for the cgroup
15-
echo "$memory_limit_bytes" > "$CHILD_CGROUP/memory.max"
18+
# Set memory limit for the cgroup (0 = no limit)
19+
if [ "$memory_limit_bytes" -gt 0 ]; then
20+
echo "$memory_limit_bytes" > "$CHILD_CGROUP/memory.max"
21+
fi
22+
23+
# Set CPU limit for the cgroup (0 = no limit)
24+
# cpu.max format: "QUOTA PERIOD" in microseconds
25+
# Example: "50000 100000" means 50% of one CPU core
26+
if [ "$cpu_limit_percent" -gt 0 ]; then
27+
CPU_PERIOD=100000
28+
CPU_QUOTA=$((cpu_limit_percent * CPU_PERIOD / 100))
29+
echo "$CPU_QUOTA $CPU_PERIOD" > "$CHILD_CGROUP/cpu.max"
30+
fi
31+
32+
# Set I/O limits for the cgroup (0 = no limit)
33+
# io.max format: "MAJOR:MINOR rbps=BYTES wbps=BYTES"
34+
if [ "$io_read_mbps" -gt 0 ] || [ "$io_write_mbps" -gt 0 ]; then
35+
# Get the major:minor of the actual block device
36+
# In Docker containers with overlay fs, we need to find the underlying block device
37+
# Note: cgroups v2 I/O limiting works on whole block devices, not partitions
38+
# So we use mmcblk0 (not mmcblk0p2), sda (not sda1), etc.
39+
ROOT_MAJOR=""
40+
ROOT_MINOR=""
41+
42+
# Scan for whole block devices (not partitions)
43+
# Glob patterns match whole disks only: mmcblk0 (not mmcblk0p1), sda (not sda1), etc.
44+
# Order: embedded (mmcblk), common (sd), NVMe, virtual (vd, xvd)
45+
for DEV in /dev/mmcblk[0-9] /dev/sd[a-z] /dev/nvme[0-9]n[0-9] /dev/vd[a-z] /dev/xvd[a-z]; do
46+
if [ -b "$DEV" ]; then
47+
ROOT_MAJOR=$(stat -c '%t' "$DEV" 2>/dev/null)
48+
ROOT_MINOR=$(stat -c '%T' "$DEV" 2>/dev/null)
49+
if [ -n "$ROOT_MAJOR" ] && [ -n "$ROOT_MINOR" ]; then
50+
# Convert from hex to decimal
51+
ROOT_MAJOR=$((16#$ROOT_MAJOR))
52+
ROOT_MINOR=$((16#$ROOT_MINOR))
53+
break
54+
fi
55+
fi
56+
done
57+
58+
# Skip I/O limiting if no valid block device found
59+
if [ -z "$ROOT_MAJOR" ] || [ -z "$ROOT_MINOR" ]; then
60+
echo "Warning: Could not find block device for I/O limiting"
61+
else
62+
IO_LIMIT_STR="$ROOT_MAJOR:$ROOT_MINOR"
63+
if [ "$io_read_mbps" -gt 0 ]; then
64+
IO_READ_BPS=$((io_read_mbps * 1024 * 1024))
65+
IO_LIMIT_STR="$IO_LIMIT_STR rbps=$IO_READ_BPS"
66+
fi
67+
if [ "$io_write_mbps" -gt 0 ]; then
68+
IO_WRITE_BPS=$((io_write_mbps * 1024 * 1024))
69+
IO_LIMIT_STR="$IO_LIMIT_STR wbps=$IO_WRITE_BPS"
70+
fi
71+
echo "$IO_LIMIT_STR" > "$CHILD_CGROUP/io.max"
72+
fi
73+
fi
74+
75+
# Check if any resource limit is enabled
76+
# Report whether at least one resource limit is configured for the service.
# Globals:   memory_limit_bytes, cpu_limit_percent, io_read_mbps,
#            io_write_mbps (read)
# Returns:   0 if any of them is greater than zero, 1 otherwise
has_any_limit() {
    local value
    for value in "$memory_limit_bytes" "$cpu_limit_percent" \
        "$io_read_mbps" "$io_write_mbps"; do
        # A value of 0 means "no limit" for every resource.
        if [ "$value" -gt 0 ]; then
            return 0
        fi
    done
    return 1
}
1680

1781
# find PIDs for all children of a given process
1882
findpids() {
@@ -24,49 +88,62 @@ findpids() {
2488
echo "$pid_list" | tr ' ' '\n' | sort -u | tr '\n' ' '
2589
}
2690

91+
# Move a process into the service cgroup so the configured limits apply to it.
# Globals:   CHILD_CGROUP (read) - directory of the service cgroup
# Arguments: $1 - PID of the process to move
# Returns:   0 when the PID was written or deliberately skipped; otherwise
#            the status of the failed write to cgroup.procs
add_to_cgroup() {
    local pid=$1

    # With no limit configured there is nothing to enforce, so the process
    # stays where it is. Checked first because it needs no process lookup.
    if ! has_any_limit; then
        return 0
    fi

    # Ignore an empty or non-numeric PID instead of handing it to the kernel.
    case "$pid" in
        '' | *[!0-9]*) return 0 ;;
    esac

    # Skip processes that are already gone. The /proc fallback covers a live
    # process that this shell is not permitted to signal.
    if ! kill -0 "$pid" 2> /dev/null && [ ! -d "/proc/$pid" ]; then
        return 0
    fi

    # Quoted so a cgroup path containing whitespace stays a single target.
    echo "$pid" > "$CHILD_CGROUP/cgroup.procs"
}
100+
101+
# Add current shell to cgroup FIRST so all children inherit limits
102+
add_to_cgroup $$
103+
104+
# Recursive function to find and add child processes to the cgroup
105+
# Add every process reported by findpids for the given parent to the service
# cgroup, logging each one.
# Globals:   service_name (read) - used only in the log message
# Arguments: $1 - PID whose child processes should be moved
# Outputs:   one "Adding child process ..." line per PID on stdout
add_child_processes_to_cgroup() {
    local parent_pid=$1
    # Kept local so the caller's variables of the same name are not clobbered.
    local descendants child

    descendants=$(findpids "$parent_pid")

    # findpids prints a whitespace-separated list, so word splitting here is
    # intentional.
    # shellcheck disable=SC2086
    for child in $descendants; do
        echo "Adding child process $child to cgroup $service_name"
        add_to_cgroup "$child"
    done
}
115+
27116
# Function to start the service and add its PIDs to the cgroup
28117
start_service() {
29118
# Start the service in the background
30119
eval "$service_command" &
31120
service_pid=$!
32121

33-
add_to_cgroup() {
34-
local pid=$1
35-
# Check if the process exists and memory limit is set
36-
if ! ps -p $pid > /dev/null || [ $memory_limit_bytes -eq 0 ]; then
37-
# process doesn't exist. presume it is already dead
38-
return
39-
fi
40-
echo $pid > $CHILD_CGROUP/cgroup.procs
41-
}
42-
43-
# Recursive function to find and add child processes to the cgroup
44-
add_child_processes_to_cgroup() {
45-
local parent_pid=$1
46-
# Find all child processes of the parent PID
47-
child_pids=$(findpids $parent_pid)
48-
# Add each child process to the cgroup
49-
for pid in $child_pids; do
50-
echo "Adding child process $pid to cgroup $service_name"
51-
add_to_cgroup $pid
52-
done
53-
}
54-
55122
add_to_cgroup $service_pid
56-
add_to_cgroup $$ # this is the PID of the current process
57123
add_child_processes_to_cgroup $service_pid
58124

59125
# Wait for the process to complete and capture its exit code
60126
wait $service_pid
61127
return $?
62128
}
63129

64-
# Continuously run the service, restarting if it stops or exceeds memory limit
130+
# Build limits description for logging
131+
# Print a one-line summary of the active resource limits for log messages.
# Globals:   memory_limit_mb, cpu_limit_percent, io_read_mbps,
#            io_write_mbps (read)
# Outputs:   space-terminated "key=value" entries for every limit above zero,
#            or "none" when no limit is set
get_limits_description() {
    local summary=""

    if [ "$memory_limit_mb" -gt 0 ]; then
        summary+="mem=${memory_limit_mb}MB "
    fi
    if [ "$cpu_limit_percent" -gt 0 ]; then
        summary+="cpu=${cpu_limit_percent}% "
    fi
    if [ "$io_read_mbps" -gt 0 ]; then
        summary+="io_r=${io_read_mbps}MB/s "
    fi
    if [ "$io_write_mbps" -gt 0 ]; then
        summary+="io_w=${io_write_mbps}MB/s "
    fi

    # An empty summary means every limit is disabled.
    echo "${summary:-none}"
}
140+
141+
# Continuously run the service, restarting if it stops or exceeds resource limits
65142
while true; do
66-
echo "Starting service: $service_command with memory limit: $memory_limit_bytes bytes "
143+
echo "Starting service: $service_command with limits: $(get_limits_description)"
67144
if ! start_service; then
68145
timestamp=$(date '+%Y-%m-%d %H:%M:%S')
69-
echo "$timestamp: Service ($service_command) exceeded memory limit or stopped. Restarting..." | tee -a "$LOG_FILE"
146+
echo "$timestamp: Service ($service_command) exceeded resource limit or stopped. Restarting..." | tee -a "$LOG_FILE"
70147
else
71148
echo "Service ($service_command) completed successfully."
72149
break

core/start-blueos-core

Lines changed: 64 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -107,37 +107,44 @@ find /usr/blueos/userdata -type f -exec chmod a+rw {} \;
107107
# and ~1min30s using this strategy.
108108
# From that 1min30s, the startup time is about ~25s, and originally, ~37s, meaning that the
109109
# remaining (~65 seconds) is the docker shutting down, and the Linux booting up.
110+
#
111+
# Service tuple format:
112+
# NAME,MEMORY_MB,CPU_PERCENT,IO_READ_MBPS,IO_WRITE_MBPS,COMMAND
113+
# - MEMORY_MB: Memory limit in MB (0 = no limit)
114+
# - CPU_PERCENT: CPU limit as percentage (100 = 1 core, 200 = 2 cores, 0 = no limit)
115+
# - IO_READ_MBPS: I/O read limit in MB/s (0 = no limit)
116+
# - IO_WRITE_MBPS: I/O write limit in MB/s (0 = no limit)
110117
PRIORITY_SERVICES=(
111-
'autopilot',0,"nice --19 $SERVICES_PATH/ardupilot_manager/main.py"
112-
'cable_guy',0,"$SERVICES_PATH/cable_guy/main.py"
113-
'video',0,"nice --19 mavlink-camera-manager --default-settings BlueROVUDP --mavlink tcpout:127.0.0.1:5777 --mavlink-system-id $MAV_SYSTEM_ID --mavlink-camera-component-id-range=100-105 --gst-feature-rank omxh264enc=0,v4l2h264enc=250,x264enc=260 --log-path /var/logs/blueos/services/mavlink-camera-manager --stun-server stun://stun.l.google.com:19302 --zenoh --verbose"
114-
'mavlink2rest',0,"mavlink2rest --connect=udpout:127.0.0.1:14001 --server [::]:6040 --system-id $MAV_SYSTEM_ID --component-id $MAV_COMPONENT_ID_ONBOARD_COMPUTER4"
118+
'autopilot',0,0,0,0,"nice --19 $SERVICES_PATH/ardupilot_manager/main.py"
119+
'cable_guy',0,0,0,0,"$SERVICES_PATH/cable_guy/main.py"
120+
'video',0,0,0,0,"nice --19 mavlink-camera-manager --default-settings BlueROVUDP --mavlink tcpout:127.0.0.1:5777 --mavlink-system-id $MAV_SYSTEM_ID --mavlink-camera-component-id-range=100-105 --gst-feature-rank omxh264enc=0,v4l2h264enc=250,x264enc=260 --log-path /var/logs/blueos/services/mavlink-camera-manager --stun-server stun://stun.l.google.com:19302 --zenoh --verbose"
121+
'mavlink2rest',0,0,0,0,"mavlink2rest --connect=udpout:127.0.0.1:14001 --server [::]:6040 --system-id $MAV_SYSTEM_ID --component-id $MAV_COMPONENT_ID_ONBOARD_COMPUTER4"
115122
)
116123

117124
SERVICES=(
118125
# These services are not prioritized because they are not fundamental for the vehicle to work
119-
'kraken',0,"nice -19 $BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/kraken/main.py"
120-
'wifi',0,"nice -19 $SERVICES_PATH/wifi/main.py --socket wlan0"
121-
'zenohd',0,"ZENOH_BACKEND_FS_ROOT=$TOOLS_PATH/zenoh zenohd -c $TOOLS_PATH/zenoh/blueos-zenoh.json5"
126+
'kraken',0,0,0,0,"nice -19 $BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/kraken/main.py"
127+
'wifi',0,0,0,0,"nice -19 $SERVICES_PATH/wifi/main.py --socket wlan0"
128+
'zenohd',0,0,0,0,"ZENOH_BACKEND_FS_ROOT=$TOOLS_PATH/zenoh zenohd -c $TOOLS_PATH/zenoh/blueos-zenoh.json5"
122129
# These services are not as important as the others
123-
'beacon',250,"$SERVICES_PATH/beacon/main.py"
124-
'bridget',0,"nice -19 $RUN_AS_REGULAR_USER_BEGIN $SERVICES_PATH/bridget/main.py $RUN_AS_REGULAR_USER_END"
125-
'commander',250,"$SERVICES_PATH/commander/main.py"
126-
'nmea_injector',250,"nice -19 $SERVICES_PATH/nmea_injector/main.py"
127-
'helper',250,"$BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/helper/main.py"
128-
'iperf3',250," iperf3 --server --port 5201"
129-
'linux2rest',250,"linux2rest --log-settings netstat=30,platform=10,serial-ports=10,cpu=10,disk=30,info=10,memory=10,network=10,process=60,temperature=10,unix-time-seconds=10,usb=60"
130-
'filebrowser',250,"nice -19 filebrowser --database /etc/filebrowser/filebrowser.db --baseurl /file-browser"
131-
'versionchooser',0,"$BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/versionchooser/main.py"
132-
'pardal',250,"nice -19 $SERVICES_PATH/pardal/main.py"
133-
'ping',0,"nice -19 $RUN_AS_REGULAR_USER_BEGIN $SERVICES_PATH/ping/main.py $RUN_AS_REGULAR_USER_END"
134-
'user_terminal',0,"cat /etc/motd"
135-
'ttyd',250,'nice -19 ttyd -p 8088 sh -c "/usr/bin/tmux attach -t user_terminal || /usr/bin/tmux new -s user_terminal"'
136-
'nginx',250,"nice -18 nginx -g \"daemon off;\" -c $TOOLS_PATH/nginx/nginx.conf"
137-
'bag_of_holding',250,"$SERVICES_PATH/bag_of_holding/main.py"
138-
'recorder',250,"blueos-recorder --recorder-path /usr/blueos/userdata/recorder"
139-
'recorder_extractor',250,"$SERVICES_PATH/recorder_extractor/main.py"
140-
'disk_usage',250,"$SERVICES_PATH/disk_usage/main.py"
130+
'beacon',250,0,0,0,"$SERVICES_PATH/beacon/main.py"
131+
'bridget',0,0,0,0,"nice -19 $RUN_AS_REGULAR_USER_BEGIN $SERVICES_PATH/bridget/main.py $RUN_AS_REGULAR_USER_END"
132+
'commander',250,0,0,0,"$SERVICES_PATH/commander/main.py"
133+
'nmea_injector',250,0,0,0,"nice -19 $SERVICES_PATH/nmea_injector/main.py"
134+
'helper',250,0,0,0,"$BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/helper/main.py"
135+
'iperf3',250,0,0,0," iperf3 --server --port 5201"
136+
'linux2rest',250,0,0,0,"linux2rest --log-settings netstat=30,platform=10,serial-ports=10,cpu=10,disk=30,info=10,memory=10,network=10,process=60,temperature=10,unix-time-seconds=10,usb=60"
137+
'filebrowser',250,0,0,0,"nice -19 filebrowser --database /etc/filebrowser/filebrowser.db --baseurl /file-browser"
138+
'versionchooser',0,0,0,0,"$BLUEOS_PYTHON_BIN_SECONDARY $SERVICES_PATH/versionchooser/main.py"
139+
'pardal',250,0,0,0,"nice -19 $SERVICES_PATH/pardal/main.py"
140+
'ping',0,0,0,0,"nice -19 $RUN_AS_REGULAR_USER_BEGIN $SERVICES_PATH/ping/main.py $RUN_AS_REGULAR_USER_END"
141+
'user_terminal',0,0,0,0,"cat /etc/motd"
142+
'ttyd',250,0,0,0,'nice -19 ttyd -p 8088 sh -c "/usr/bin/tmux attach -t user_terminal || /usr/bin/tmux new -s user_terminal"'
143+
'nginx',250,0,0,0,"nice -18 nginx -g \"daemon off;\" -c $TOOLS_PATH/nginx/nginx.conf"
144+
'bag_of_holding',250,0,0,0,"$SERVICES_PATH/bag_of_holding/main.py"
145+
'recorder',250,0,0,0,"blueos-recorder --recorder-path /usr/blueos/userdata/recorder"
146+
'recorder_extractor',250,0,0,0,"$SERVICES_PATH/recorder_extractor/main.py"
147+
'disk_usage',250,0,0,0,"$SERVICES_PATH/disk_usage/main.py"
141148
)
142149

143150
tmux -f /etc/tmux.conf start-server
@@ -147,10 +154,29 @@ function create_service {
147154
SESSION_NAME="$1:0"
148155
SERVICE_NAME="$1"
149156
local command="$2" # Store the command as a string
150-
local memory_limit_mb=$3
157+
local memory_limit_mb=${3:-0}
158+
local cpu_limit_percent=${4:-0}
159+
local io_read_mbps=${5:-0}
160+
local io_write_mbps=${6:-0}
161+
162+
if [ -n "${BLUEOS_DISABLE_RESOURCE_LIMITS}" ]; then
163+
memory_limit_mb=0
164+
cpu_limit_percent=0
165+
io_read_mbps=0
166+
io_write_mbps=0
167+
fi
151168

152169
if [ -n "${BLUEOS_DISABLE_MEMORY_LIMIT}" ]; then
153-
memory_limit_mb=$TOTAL_RAM_MB
170+
memory_limit_mb=0
171+
fi
172+
173+
if [ -n "${BLUEOS_DISABLE_CPU_LIMIT}" ]; then
174+
cpu_limit_percent=0
175+
fi
176+
177+
if [ -n "${BLUEOS_DISABLE_IO_LIMIT}" ]; then
178+
io_read_mbps=0
179+
io_write_mbps=0
154180
fi
155181

156182
# Check if the service is disabled
@@ -159,16 +185,18 @@ function create_service {
159185
tmux send-keys -t $SESSION_NAME "echo 'Service $1 is disabled'; sleep infinity" C-m
160186
return
161187
fi
162-
echo "Service: $NAME: $EXECUTABLE with memory limit: $memory_limit_mb MB"
188+
echo "Service: $SERVICE_NAME: mem=${memory_limit_mb}MB cpu=${cpu_limit_percent}% io_r=${io_read_mbps}MB/s io_w=${io_write_mbps}MB/s"
163189

164190
# Set all necessary environment variables for the new tmux session
165191
for NAME in $(compgen -v | grep -e MAV_ -e BLUEOS_); do
166192
VALUE=${!NAME}
167193
tmux setenv -t "$SESSION_NAME" -g "$NAME" "$VALUE"
168194
done
195+
# Pass DOCKER_CGROUP for cgroup path resolution in run-service
196+
tmux setenv -t "$SESSION_NAME" -g "DOCKER_CGROUP" "$DOCKER_CGROUP"
169197

170-
# Use run_service to start the service with the memory limit
171-
tmux send-keys -t $SESSION_NAME "run-service '$SERVICE_NAME' '$command' $memory_limit_mb " C-m
198+
# Use run-service to start the service with resource limits
199+
tmux send-keys -t $SESSION_NAME "run-service '$SERVICE_NAME' '$command' $memory_limit_mb $cpu_limit_percent $io_read_mbps $io_write_mbps" C-m
172200
}
173201

174202
SSH_USER=${SSH_USER:-pi}
@@ -220,24 +248,24 @@ prepare_cgroups() {
220248
cat $DOCKER_CGROUP_PATH/cgroup.procs
221249
fi
222250

223-
echo "Enabling subtree_control..."
224-
echo "+memory" > $DOCKER_CGROUP_PATH/cgroup.subtree_control && echo "subtree_control enabled"
251+
echo "Enabling subtree_control for memory, cpu, and io on container cgroup..."
252+
echo "+memory +cpu +io" > $DOCKER_CGROUP_PATH/cgroup.subtree_control && echo "subtree_control enabled on container cgroup"
225253
}
226254

227255
prepare_cgroups
228256

229257
echo "Starting high priority services.."
230258
for TUPLE in "${PRIORITY_SERVICES[@]}"; do
231-
IFS=',' read -r NAME MEMORY_LIMIT_MB EXECUTABLE <<< "$TUPLE"
232-
create_service "$NAME" "$EXECUTABLE" "$MEMORY_LIMIT_MB"
259+
IFS=',' read -r NAME MEMORY_MB CPU_PERCENT IO_READ_MBPS IO_WRITE_MBPS EXECUTABLE <<< "$TUPLE"
260+
create_service "$NAME" "$EXECUTABLE" "$MEMORY_MB" "$CPU_PERCENT" "$IO_READ_MBPS" "$IO_WRITE_MBPS"
233261
done
234262

235263
sleep 5
236264

237265
echo "Starting other services.."
238266
for TUPLE in "${SERVICES[@]}"; do
239-
IFS=',' read -r NAME MEMORY_LIMIT_MB EXECUTABLE <<< "$TUPLE"
240-
create_service "$NAME" "$EXECUTABLE" "$MEMORY_LIMIT_MB"
267+
IFS=',' read -r NAME MEMORY_MB CPU_PERCENT IO_READ_MBPS IO_WRITE_MBPS EXECUTABLE <<< "$TUPLE"
268+
create_service "$NAME" "$EXECUTABLE" "$MEMORY_MB" "$CPU_PERCENT" "$IO_READ_MBPS" "$IO_WRITE_MBPS"
241269
done
242270

243271
echo "BlueOS running!"

0 commit comments

Comments
 (0)