-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathdoRoCE.sh
More file actions
executable file
·454 lines (407 loc) · 17.9 KB
/
doRoCE.sh
File metadata and controls
executable file
·454 lines (407 loc) · 17.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
#!/bin/bash
# The MIT License (MIT)
#
# Copyright (c) 2020, NVIDIA CORPORATION
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
VERSION=0.98
# TODOs
#- multi-port

# Raw command line as given by the caller; written into the boot hook later on
inparams=$@

# What to do (0 = off, 1 = on)
run_once=0
uninst=0
set_default=0

# Fabric / feature selection (0 = off, 1 = on)
lossy=0
lossless=0
selective_repeat=0
rttcc=0

# Output control
verbose=0
debug=0

# Pre-selected answer for yes/no questions; empty means "ask"
y_n=''

# Devices to configure; stays empty unless '-d' is given
device_list=()
specific_devices_selected=0

# Tunable values
mtu_val=''
gix_val=3
tos_val=106

# Trust mode: name as passed to mlnx_qos, numeric value as written to the QPTS register
trust='dscp'
trust_val='2'
# Ask a yes/no question and report the answer through the return status.
#   $1      - question text (the " (Yy/Nn) " suffix is appended here)
#   y_n     - global; when non-empty it is used as the answer and nothing is read
#   returns - 0 for an answer starting with Y/y, 1 for N/n
# Any other answer prints "Yy/Nn" and asks again. If stdin is exhausted (EOF)
# the answer is taken as "no", so a non-interactive run cannot loop forever.
function yn_question ()
{
    local prompt=$1
    local yn=""
    while true ; do
        if [ -z "$y_n" ] ; then
            # read fails on EOF; an unterminated last line still lands in $yn
            if ! read -r -p "$prompt (Yy/Nn) " yn && [ -z "$yn" ] ; then
                return 1
            fi
        else
            yn=$y_n
        fi
        case $yn in
            [Yy]* ) return 0;;
            [Nn]* ) return 1;;
            * ) echo "Yy/Nn";;
        esac
    done
}
# Ask whether to carry on without the component named in $1.
# Returns 0 when the user agrees; otherwise prints "Exiting" and ends the
# script with exit status 0.
function yn_question_cont_wo ()
{
    text="Continue without $1? (not recommended)"
    if yn_question "$text" ; then
        return 0
    fi
    echo "Exiting"
    exit 0
}
# Run a shell command line as root and report on it.
#   $1      - human readable name, used in messages only
#   $2      - command line, executed through `sudo bash -c`
#   verbose - global; when 1, the command line and its output are printed on success
#   returns - 0 on success, otherwise the exit status of the command
# All messages go to stdout on purpose: callers may capture them with $(...).
# Working variables are local so a call never overwrites a caller's variables
# of the same name (e.g. `care`).
function run_cmd ()
{
    local cmd_name=$1
    local cmd_line=$2
    local care
    local err
    # stderr is merged in so the failure report shows everything the command said
    care=$(sudo bash -c "$cmd_line" 2>&1)
    err=$?
    if [ 0 -ne $err ] ; then
        echo "[E] Failed to run $cmd_name (err $err)"
        echo "[E] Failed command output:"
        echo "$cmd_line" ; echo "$care"
        return $err
    fi
    if [ 1 -eq "${verbose:-0}" ] ; then
        echo "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-"
        echo "[V] Running $cmd_name:"
        echo "$cmd_line"
        echo "$care"
    fi
    return 0
}
# Make sure configfs is mounted and the rdma_cm directory is present in it.
# Returns 0 when /sys/kernel/config/rdma_cm exists afterwards, 1 otherwise
# (including a failed mount or modprobe, each reported with an [E] line).
function mount_cm_configfs()
{
    # Mount configfs unless /proc/mounts already lists the mount point
    if ! sudo cat /proc/mounts | \grep /sys/kernel/config > /dev/null ; then
        sudo mount -t configfs none /sys/kernel/config || {
            echo "[E] Fail to mount configfs"
            return 1
        }
    fi
    # If modinfo knows a configfs module and /proc/modules does not list it, load it
    if sudo modinfo configfs &> /dev/null && ! cat /proc/modules | \grep configfs > /dev/null ; then
        sudo modprobe configfs || {
            echo "[E] Fail to modprobe configfs"
            return 1
        }
    fi
    [ -d /sys/kernel/config/rdma_cm ] || return 1
}
# Write the default RoCE TOS for RDMA-CM to every port of one RDMA device.
#   $1      - RDMA device name (for example mlx5_0)
#   tos_val - global; the value written to each port's default_roce_tos
#   returns - 1 if the device's configfs directory had to be created and that
#             failed, 0 otherwise
# If the directory /sys/kernel/config/rdma_cm/<dev> did not exist beforehand it
# is created for the duration of the call and removed again afterwards.
function set_cm_tos()
{
    local dev=$1
    local dev_path="/sys/kernel/config/rdma_cm/${dev}"
    local rem_after_set=0
    local port
    if [ ! -d "$dev_path" ] ; then
        rem_after_set=1
        # Without the directory there are no ports to write to, so stop here
        if ! run_cmd "create configfs dir for $dev" "mkdir $dev_path" ; then
            return 1
        fi
    fi
    for port in "${dev_path}"/ports/* ; do
        # An unmatched glob stays literal ("ports/*"); do not try to write to it
        [ -e "$port" ] || continue
        run_cmd "set TOS for $dev, port $(basename "$port")" "bash -c \"echo $tos_val > ${port}/default_roce_tos\""
    done
    if [ 1 -eq $rem_after_set ] ; then
        run_cmd "remove configfs dir for $dev" "rmdir $dev_path"
    fi
    return 0
}
# Start-up banner: version plus a note on the intended use of the script
printf '%s\n' \
    "" \
    " DoRoCE Version $VERSION" \
    "---------------------------" \
    " NOTE - this script aggregates steps described in the Mellanox-NVIDIA community" \
    " pages and provided as a reference for recipe implementation" \
    " " \
    " It is recommended for use during bring-up and that you implement only" \
    " required components for deployment in production environments" \
    ""
# Command-line parsing.
# Flags set their variable directly. The value options -d/-t/-m/-g only record
# which option is pending in p_arg; the next non-option argument is then taken
# as that option's value. Help and every parsing error end the script with
# exit status 5.
# p_arg is initialised here so a value inherited from the environment cannot
# make the first plain argument look like an option value.
p_arg=""
for arg in "$@"
do
    case "$arg" in
        -h|--help|--h)
            \echo ""
            \echo " DoRoCE script configures Mellanox-NVIDIA NICs for RoCE deployments"
            \echo ""
            \echo " Usage: ./doRoCE.sh (options)"
            \echo ""
            \echo " Options:"
            \echo " --run_once - don't install to driver boot process, only run configuration"
            \echo " --uninstall - remove from boot process, don't run configuration"
            \echo " -d <dev_a,dev_b...> - comma separated RDMA device list (for example: mlx5_0)"
            \echo " if '-d' not provided, tool will configure all found devices"
            \echo " -t <val> - set TOS value (default: $tos_val) DSCP=TOS>>2, PRIO=DSCP>>3"
            \echo " -m <val> - set MTU value (default: don't change)"
            \echo " -g <val> - set NCCL conf GID-index value (default: $gix_val)"
            \echo " -l / --lossless_opt - assume lossless configuration for performance optimizations (default: $lossless)"
            \echo " use this option if you configured a Mellanox-NVIDIA switch with \"roce\" command"
            \echo " -s / --lossy_buf - disable PFC, use single larger buffer for all traffic types (default: $lossy)"
            \echo " -c / --rttcc - force the usage of ZTR-RTTCC congestion control (default: nvconfig & ECE synchronization)"
            \echo " nvconfig parameter: ROCE_CC_LEGACY_DCQCN=False"
            \echo " this configuration is required on both ports for dual port NIC"
            \echo " -r / --selective_repeat - force the usage of Selective Repeat retransmission mechanism (default: nvconfig & ECE synchronization)"
            \echo " nvconfig parameter: RDMA_SELECTIVE_REPEAT_EN"
            \echo " -u / --debug - add debug prints"
            \echo " -v / --verbose - print commands and outputs"
            \echo " -y / --yes - ignore errors and proceed with what's available (default - ask)"
            \echo " -n / --no - exit on any missing component"
            \echo " -b / --back_to_def - restore OOB config (note - will not restore MTU, please set it manually)"
            \echo ""
            \echo " List of configurations performed:"
            \echo " - Installs the script (with selected parameters) to driver boot process"
            \echo " - Set trust mode to DSCP"
            \echo " - Enable/disable PFC on priority (TOS>>5) - aligns with default DSCP-to-Priority mapping"
            \echo " - Enable/disable lossless performance optimizations"
            \echo " - Set /etc/nccl.conf to TOS=106"
            \echo " note: conf files are set once, not on every boot"
            \echo " note: UCX uses UCX_IB_TRAFFIC_CLASS=106 by default. Change through command line, as conf file isn't supported yet"
            \echo " - Set IB VERB override to TOS=106"
            \echo " - Set RDMA-CM default TOS"
            \echo ""
            exit 5;
            ;;
        "--uninstall") uninst=1;;
        "--run_once") run_once=1;;
        "-l"|"--lossless_opt") lossless=1;;
        "-s"|"--lossy_buf") lossy=1;;
        "-c"|"--rttcc") rttcc=1;;
        "-r"|"--selective_repeat") selective_repeat=1;;
        "-u"|"--debug") debug=1;;
        "-v"|"--verbose") verbose=1;;
        "-y"|"--yes") y_n="y";;
        "-n"|"--no") y_n="n";;
        "-b"|"--back_to_def") set_default=1;;
        -d|-t|-m|-g)
            # Two value options in a row: the first one never received its value
            if [ -n "$p_arg" ] ; then
                echo "[E] Missing value for -$p_arg, see help (-h/--help)"
                exit 5
            fi
            p_arg=${arg#-}
            ;;
        *)
            case "$p_arg" in
                d) device_list=(${arg//,/ }) ; p_arg="" ;;
                t) tos_val="$arg" ; p_arg="" ;;
                m) mtu_val="$arg" ; p_arg="" ;;
                g) gix_val="$arg" ; p_arg="" ;;
                *) echo "[E] Unknown paramater, see help (-h/--help)" ; exit 5 ;;
            esac
            ;;
    esac
done
# A value option given as the very last argument has no value either
if [ -n "$p_arg" ] ; then
    echo "[E] Missing value for -$p_arg, see help (-h/--help)"
    exit 5
fi
# Where the boot-time hook lives and which process is expected to call it.
# rc.local is the fallback; the openibd post-start hook is used when both
# /etc/infiniband and /etc/init.d/openibd are present.
psh_caller="rc.local"
psh_path="/etc/rc.d/rc.local"
if [ -d "/etc/infiniband" ] && [ -f /etc/init.d/openibd ] ; then
    psh_caller="openibd"
    psh_path="/etc/infiniband/post-start-hook.sh"
fi
nccl_conf_path="/etc/nccl.conf"
# Clean-up for --uninstall and --back_to_def: drop the lines this script added
# to the NCCL conf file.
if [ 1 -eq $uninst ] || [ 1 -eq $set_default ] ; then
    echo "[I] Removing NCCL conf hook"
    [ -f $nccl_conf_path ] && run_cmd "Clear NCCL conf DSCP" "\sed -i -- '/doRoCE\|NCCL_IB_TC\|NCCL_IB_GID_INDEX/I d' $nccl_conf_path"
fi
# --uninstall additionally removes the boot hook entry and the installed copy,
# then stops: no device configuration is performed.
if [ 1 -eq $uninst ] ; then
    echo "[I] Removing script from boot process"
    [ -f $psh_path ] && run_cmd "Clear post-start hook" "sed -i -- '/doRoCE/ d' $psh_path"
    echo "[I] Removing script from /usr/bin"
    run_cmd "Remove script from /usr/bin" "rm -f /usr/bin/doRoCE.sh"
    exit 0
fi
# Nothing can be configured unless the mlx5_core driver directory is present
[ -d /sys/bus/pci/drivers/mlx5_core/ ] || {
    echo "[E] mlx5 driver is down, exiting"
    exit 6
}
# The lossless and lossy options exclude each other
if [ 1 -eq $lossless ] && [ 1 -eq $lossy ] ; then
    echo "[E] Lossy and lossless can't be configured at the same time, exiting"
    exit 7
fi
# --back_to_def overrides whatever else was requested on the command line
if [ 1 -eq $set_default ] ; then
    lossless=0 ; lossy=1
    rttcc=0 ; selective_repeat=0
    tos_val=0
    trust="pcp" ; trust_val=1
fi
# The priority is TOS>>5. pfc_cmd_mask selects that priority; pfc_set_mask
# carries the PFC state to apply to it (bit cleared when lossy is set).
pfc_cmd_mask=$(( 1 << ($tos_val >> 5) ))
pfc_set_mask=$(( (! $lossy) << ($tos_val >> 5) ))
[ 1 -eq $debug ] && printf "[D] PFC-MASK=0x%.2x\n" $pfc_set_mask
[ 1 -eq $debug ] && echo "[D] checking for mlxreg/mstreg"
# Take the first register tool that is found, either through `which` or as a
# file in /usr/bin; mlxreg is tried before mstreg.
mlxreg_cmd=""
for tool_name in mlxreg mstreg ; do
    if (which $tool_name >/dev/null 2>&1) || [ -f "/usr/bin/$tool_name" ] ; then
        mlxreg_cmd="$tool_name"
        break
    fi
done
if [ -z "$mlxreg_cmd" ] ; then
    echo "[E] Could not find mlxreg/mstreg tool in \$PATH"
    echo "to install: install MLNX_OFED, or:"
    echo " "
    echo "# git clone https://github.com/Mellanox/mstflint.git"
    echo "# cd mstflint"
    echo "# ./autogen.sh"
    echo "# ./configure --disable-inband --enable-adb-generic-tools"
    echo "# make"
    echo "# sudo make install"
    yn_question_cont_wo "PFC, trust layer and lossy fabric accelerations"
fi
[ 1 -eq $debug ] && echo "[D] checking for mlxconfig/mstconfig"
# Take the first configuration tool that is found, either through `which` or
# as a file in /usr/bin; mlxconfig is tried before mstconfig.
mlxconfig_cmd=""
for tool_name in mlxconfig mstconfig ; do
    if (which $tool_name >/dev/null 2>&1) || [ -f "/usr/bin/$tool_name" ] ; then
        mlxconfig_cmd="$tool_name"
        break
    fi
done
if [ -z "$mlxconfig_cmd" ] ; then
    echo "[E] Could not find mlxconfig/mstconfig tool in \$PATH"
    echo "to install: install MLNX_OFED, or:"
    echo " "
    echo "# git clone https://github.com/Mellanox/mstflint.git"
    echo "# cd mstflint"
    echo "# ./autogen.sh"
    echo "# ./configure --disable-inband --enable-adb-generic-tools"
    echo "# make"
    echo "# sudo make install"
    yn_question_cont_wo "PFC, trust layer and lossy fabric accelerations"
fi
[ 1 -eq $debug ] && echo "[D] checking for RDMA-CM configfs"
# Record whether the RDMA-CM configfs directory is usable; if it is not, the
# user decides whether to carry on without it
if mount_cm_configfs ; then
    cm_configfs_found=1
else
    cm_configfs_found=0
    yn_question_cont_wo "setting RDMA-CM default TOS"
fi
[ 1 -eq $debug ] && echo "[D] checking for mlnx_qos/lldptool"
# mlnx_qos counts as present when `which` finds it or it is a file in /usr/bin
if (which mlnx_qos >/dev/null 2>&1) || [ -f "/usr/bin/mlnx_qos" ] ; then
    mlnx_qos_found=1
else
    mlnx_qos_found=0
fi
# Install to /usr/bin and register in the boot hook.
# When the parent process is the boot hook caller itself, the script is
# already running from the hook, so it behaves as if --run_once was given.
PARENT_COMMAND=$(ps -o comm= $PPID)
if [ "$PARENT_COMMAND" = "$psh_caller" ] ; then run_once=1 ; fi
if [ 0 -eq $run_once ] ; then
    # $0 is quoted so a script path containing whitespace still resolves to one path
    if ! mypath=$(realpath "$0") ; then
        echo "[E] Could not determine current path, exiting"
        exit 5
    fi
    if [ "$mypath" != "/usr/bin/doRoCE.sh" ] ; then
        if [ 1 -eq $debug ] ; then echo "[D] Installing to /usr/bin" ; fi
        # The path is shell-escaped because run_cmd re-parses the line with bash -c
        run_cmd "Copy to /usr/bin" "sudo cp -f $(printf '%q' "$mypath") /usr/bin/doRoCE.sh && chmod a+x /usr/bin/doRoCE.sh"
    fi
    if [ 1 -eq $debug ] ; then echo "[D] Adding to OFED post-start-hook" ; fi
    # Remove any entry left by an earlier run before appending the current one
    if [ -f $psh_path ] ; then
        run_cmd "Clear post-start-hook" "sed -i -- '/doRoCE/I d' $psh_path"
    fi
    # The hook re-runs the script with the original parameters plus --yes, so
    # it never waits for an answer at boot
    run_cmd "Add post-start-hook" "echo -e \"# Added by doRoCE scirpt:\n/usr/bin/doRoCE.sh $inparams --yes >/dev/null\" >> $psh_path"
    if [ ! -x $psh_path ] ; then run_cmd "Set post-start-hook +x" "chmod a+x $psh_path" ; fi
fi
# Conf files are written only when the script was not started by the boot
# hook caller and --back_to_def was not requested
if [ "$PARENT_COMMAND" != "$psh_caller" ] && [ 1 -ne $set_default ] ; then
    # Set nccl.conf: drop lines from an earlier run, then append the current values
    [ 1 -eq $debug ] && echo "[D] setting NCCL conf"
    [ -f $nccl_conf_path ] && run_cmd "Clear NCCL conf DSCP" "\sed -i -- '/doRoCE\|NCCL_IB_TC\|NCCL_IB_GID_INDEX/I d' $nccl_conf_path"
    run_cmd "Add NCCL conf DSCP" "echo -e \"# Added by doRoCE scirpt:\nNCCL_IB_TC=$tos_val\nNCCL_IB_GID_INDEX=$gix_val\" >> $nccl_conf_path"
    #set ucx.conf - not supported by UCX yet!
    # Only a TOS other than 106 needs the reminder
    [ 106 -ne $tos_val ] && echo "[I] NOTE - for UCX, make sure to add to the command line: \"UCX_IB_TRAFFIC_CLASS=$tos_val\""
fi
# Without an explicit '-d' list, take every entry of /sys/class/infiniband
if [ -n "$device_list" ] ; then
    specific_devices_selected=1
else
    device_list+=( $(\ls /sys/class/infiniband/) )
fi
[ 1 -eq $debug ] && echo "[I] Device list: ${device_list[@]}"
# Per-device configuration.
# A device whose port 1 link layer is not Ethernet is skipped entirely. For
# every other device the steps are: PFC and trust mode, lossy accelerations,
# optional Selective Repeat and ZTR-RTTCC (register steps need mlxreg/mstreg),
# then MTU, verbs default traffic class, RDMA-CM default TOS and, for
# --back_to_def, global pause.
# An unsupported Selective Repeat or ZTR-RTTCC request skips only that feature;
# the remaining steps for the device are still applied.
for dev in "${device_list[@]}" ; do
    if [ 1 -eq $debug ] ; then echo "[D] Starting device $dev" ; fi
    # Get device info
    dev_linktype=$(\cat /sys/class/infiniband/${dev}/ports/1/link_layer)
    if [[ "Ethernet" != "$dev_linktype" ]] ; then
        echo "[I] Device $dev - link type $dev_linktype, skipping"
        continue
    fi
    bdf=$(\readlink /sys/class/infiniband/${dev}/device | \xargs basename)
    netdev=$(\ls /sys/class/infiniband/${dev}/device/net/ | \xargs basename)
    if [ 1 -eq $debug ] ; then echo "[D] Device $dev - bdf: $bdf, netdev: $netdev" ; fi
    if [ -n "$mlxreg_cmd" ] ; then
        # Configure PFC, trust mode
        if [ 1 -eq $mlnx_qos_found ] ; then
            # mlnx_qos takes one comma separated 0/1 flag per priority 0..7
            mlnx_qos_pfc_mask=""
            for i in {0..7} ; do
                mlnx_qos_pfc_mask+="$(( ($pfc_set_mask>>$i) & 0x1 ))"
                if [ 7 -ne $i ] ; then mlnx_qos_pfc_mask+="," ; fi
            done
            run_cmd mlnx_qos "mlnx_qos -i $netdev --trust=$trust --pfc=$mlnx_qos_pfc_mask"
        else
            run_cmd "Set trust DSCP" "$mlxreg_cmd -y -d $bdf --reg_name QPTS -i \"local_port=1\" --set \"trust_state=$trust_val\""
            run_cmd "Set PFC" "$mlxreg_cmd -y -d $bdf --reg_name PFCC -i \"local_port=1,pnat=0,dcbx_operation_type=0\" --set \"prio_mask_rx=${pfc_cmd_mask},prio_mask_tx=${pfc_cmd_mask},pfctx=${pfc_set_mask},pfcrx=${pfc_set_mask},pprx=0,pptx=0\""
        fi
        # Configure lossy accelerations: written as 0 when lossless or back-to-default was requested
        accl_val=$((1-($lossless || $set_default)))
        run_cmd "Set lossy optimizations" "$mlxreg_cmd -y -d $bdf --reg_name ROCE_ACCL --set \"roce_adp_retrans_en=$accl_val,roce_tx_window_en=$accl_val,roce_slow_restart_en=$accl_val\""
        # Get device type; Activate the following based on minimum device level; ConnectX-6 Dx onwards
        # dev_type is characters 1-2 of fw_ver, ga_release_version characters 4-5
        dev_fw_version=$(\cat /sys/class/infiniband/${dev}/fw_ver)
        dev_type=$(echo $dev_fw_version | cut -c -2)
        ga_release_version=$(echo $dev_fw_version | cut -c 4-5)
        # Set selective repeat
        if [ 1 -eq $selective_repeat ] ; then
            if ((22 > $dev_type)) ; then
                echo "[I] Device $dev - does not support Selective Repeat, skipping"
            else
                accl_val=$((1-$set_default))
                run_cmd "Set selective repeat" "$mlxreg_cmd -y -d $bdf --reg_name ROCE_ACCL --set \"selective_repeat_forced_en=$accl_val,adaptive_routing_forced_en=0\""
            fi
        fi
        # Set ZTR-RTTCC
        if [ 1 -eq $rttcc ] ; then
            rttcc_ok=1
            if ((22 > $dev_type)) ; then
                echo "[I] Device $dev - does not support ZTR-RTTCC, skipping"
                rttcc_ok=0
            elif ((37 > $ga_release_version)) ; then
                echo "[I] Use a newer GA (37 and above) for the below to work properly, skipping"
                rttcc_ok=0
            else
                # Configure ZTR-RTTCC congestion control
                echo "[I] Configure ZTR-RTTCC congestion control"
                if [[ "22" == "$dev_type" ]] ; then
                    echo "[I] Device $dev - is ConnectX-6 Dx, check if legacy DCQCN is enabled"
                    if [ -z "$mlxconfig_cmd" ] ; then
                        # No query tool was found, so the check cannot be made; carry on without it
                        echo "[I] Device $dev - mlxconfig/mstconfig not available, cannot check ROCE_CC_LEGACY_DCQCN"
                    else
                        # verify device is NOT in legacy DCQCN congestion control mode
                        mlxconfig_out=$($mlxconfig_cmd -d $bdf q ROCE_CC_LEGACY_DCQCN | grep ROCE_CC_LEGACY_DCQCN)
                        if [[ $mlxconfig_out == *"True"* ]] ; then
                            echo "[I] Device $dev - DCQCN congestion control in use, skipping"
                            echo "[I] disable ROCE_CC_LEGACY_DCQCN with mlxconfig and reset the device"
                            rttcc_ok=0
                        fi
                    fi
                fi
            fi
            if [ 1 -eq $rttcc_ok ] ; then
                run_cmd "Activate ZTR-RTTCC congestion control" "$mlxreg_cmd -y -d $bdf --reg_name PPCC --set \"cmd_type=2\" --indexes \"local_port=1,pnat=0,lp_msb=0,algo_slot=15,algo_param_index=0\""
                if [ 1 -eq $specific_devices_selected ] ; then
                    echo "[I] Activating ZTR-RTTCC is required on both ports for dual port NIC!"
                fi
            fi
        fi
    fi
    # Set MTU
    if [ -n "$mtu_val" ] ; then
        run_cmd "Set MTU" "ifconfig $netdev mtu $mtu_val"
    fi
    # Set verb default DSCP
    tc_filename="/sys/class/infiniband/${dev}/tc/1/traffic_class"
    if [ -f $tc_filename ] ; then
        run_cmd "Set verbs default DSCP" "echo $tos_val > ${tc_filename}"
    else
        echo "[E] Could not find $tc_filename, used to force verbs interface TCLASS"
        echo "[E] Make sure to configure TCLASS in your applications"
    fi
    # Set RDMA-CM - only when the configfs check found the rdma_cm directory
    if [ 1 -eq "${cm_configfs_found:-1}" ] ; then
        set_cm_tos $dev
    fi
    # if back_to_def - set global pause
    if [ 1 -eq $set_default ] ; then
        # The report from run_cmd is captured and shown only for exit status 1
        # NOTE(review): other non-zero statuses stay silent - confirm this is intended
        care=$(run_cmd "Back to default - set global pause" "ethtool -A $netdev rx on tx on")
        if [ 1 -eq $? ] ; then echo $care ; fi
    fi
    echo "[I] Device $dev - done"
done