-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.job
More file actions
153 lines (123 loc) · 4.19 KB
/
run.job
File metadata and controls
153 lines (123 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash
# SLURM batch script: sets up the software environment and then (further
# below) launches run.py with the selected dataset and method parameters.
#
# Resource request: 48h wall time, one task with one CPU and 32G of memory on
# partition "paula". stdout and stderr are written to slurm_logs/<job id>/.
# NOTE(review): presumably the slurm_logs/<job id>/ directory has to exist
# before the job starts - confirm that the submitting script creates it.
#SBATCH --time=48:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=32G
#SBATCH --job-name=user_privacy
#SBATCH --partition=paula
#SBATCH --output=slurm_logs/%j/stdout.out
#SBATCH --error=slurm_logs/%j/stderr.err
set -x # to print all the commands to stderr
# get needed modules and packages
# (the order of the module loads is kept as is; later modules may depend on
# the toolchain pulled in by earlier ones)
module load CUDA/11.8.0
module load dask/2022.1.0-foss-2021b # Python 3.9.6
module load matplotlib/3.4.3-foss-2021b
# Install packages to user directory to avoid permission issues with system packages
# The versions are pinned; --no-cache-dir stops pip from using its download cache.
# NOTE(review): no --user flag is passed, so the packages only end up in the
# user directory if pip falls back to it on its own - confirm this is intended.
pip install --no-cache-dir xgboost==2.0.3 h5py==3.11.0 seaborn==0.13.2 scikit-learn==1.2.2 fastparquet==2024.11.0 folktables==0.0.12 # numpy==1.21.3
# Fallback values used when no command line flag overrides them
# (see run_all for the parameters used in real runs).
DEFAULT_WORK_DIR="${PWD}/results/default"
DATASET='adult'
TRAIN_METHOD='original'
TEST_METHOD='original'
N_WORKERS=1
# Percentages are kept as one space-separated string.
PERCENTAGES='0.33 0.33 0.34'
# Boolean switches ("true" / "false").
GROUP_DUPLICATES=false
USE_GPU=false
FILTER_BY_RECORD_ID=true
CACHE_ONLY=false
# Default dataset download and processing location; run_all.sh can override
# it via the -i flag. If changed, also update for LLM runs.
DATA_DIR="${PWD}/data"
# Common paths.
CODE_DIR="${PWD}/src"
# Read the command line flags and overwrite the matching settings.
#   -w DIR   working directory          -d NAME  dataset
#   -t NAME  train method               -e NAME  test method
#   -s N     number of workers          -i DIR   data directory
#   -p STR   percentages string
#   -g       enable duplicate grouping  -n       enable GPU usage
#   -f       enable record-id filtering -c       cache only
# Flags that match no branch below are skipped without further handling.
while getopts 'w:d:t:e:s:i:gnfp:c' opt; do
  case "${opt}" in
    # options that carry a value
    w) WORK_DIR="${OPTARG}" ;;
    d) DATASET="${OPTARG}" ;;
    t) TRAIN_METHOD="${OPTARG}" ;;
    e) TEST_METHOD="${OPTARG}" ;;
    s) N_WORKERS="${OPTARG}" ;;
    i) DATA_DIR="${OPTARG}" ;;
    p) PERCENTAGES="${OPTARG}" ;;
    # plain switches
    g) GROUP_DUPLICATES=true ;;
    n) USE_GPU=true ;;
    # NOTE(review): FILTER_BY_RECORD_ID already defaults to true in this
    # script, so -f does not change anything - confirm that is intended.
    f) FILTER_BY_RECORD_ID=true ;;
    c) CACHE_ONLY=true ;;
  esac
done
# Resolve the working directory: when no working directory was given, or the
# given one is not an existing directory, fall back to DEFAULT_WORK_DIR and
# create it on demand.
if [[ -z "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
  echo "working directory ${WORK_DIR} does not exist. Creating and using: ${DEFAULT_WORK_DIR}"
  # Quoted so a path with spaces or glob characters stays one argument.
  mkdir -p -- "$DEFAULT_WORK_DIR"
  WORK_DIR=$DEFAULT_WORK_DIR
fi
# Timestamp of this invocation (not referenced elsewhere in the visible script).
TS=$(date '+%Y-%m-%d_%H:%M:%S')
# Results are stored per dataset below the working directory.
RUN_DIR="${WORK_DIR}/${DATASET}"
# Translate each boolean switch into the command line flag it enables.
# A switch that is not exactly "true" yields an empty string.

# duplicate grouping
case "$GROUP_DUPLICATES" in
  true) GROUP_ARGS="--group_duplicates" ;;
  *) GROUP_ARGS="" ;;
esac

# GPU usage
case "$USE_GPU" in
  true) GPU_ARGS="--use_gpu" ;;
  *) GPU_ARGS="" ;;
esac

# filtering by record id
case "$FILTER_BY_RECORD_ID" in
  true) FILTER_ARGS="--filter_by_record_id" ;;
  *) FILTER_ARGS="" ;;
esac

# cache-only mode
case "$CACHE_ONLY" in
  true) CACHE_ARGS="--cache_only" ;;
  *) CACHE_ARGS="" ;;
esac
# Track start time (epoch seconds for the runtime calculation, plus a
# human-readable form for the log).
start_time=$(date +%s)
start_time_human=$(date '+%Y-%m-%d %H:%M:%S')
# Build the run command as an array so every argument keeps its boundaries.
CMD=(
  python3 run.py
  --save_dir "$RUN_DIR"
  --data_dir "$DATA_DIR"
  --dataset "$DATASET"
  --train_method "$TRAIN_METHOD"
  --test_method "$TEST_METHOD"
  --n_workers "$N_WORKERS"
)
# The optional switches are empty strings when disabled; append only the ones
# that are set instead of relying on unquoted word splitting.
for optional_arg in "$GROUP_ARGS" "$GPU_ARGS" "$FILTER_ARGS" "$CACHE_ARGS"; do
  if [[ -n "$optional_arg" ]]; then
    CMD+=("$optional_arg")
  fi
done
# The percentages are handed over as one single argument.
CMD+=(--percentages "$PERCENTAGES")
# make the run
echo "START at $start_time_human"
# Quoted so a result path with spaces stays one argument.
mkdir -p -- "$RUN_DIR"
# echo $SLURM_JOB_ID > "${RUN_DIR}/slurm-job-id.txt" # added to results for each run instead
# [*] joins the array into one string for logging ([@] inside a larger
# string would expand to several words).
echo "Command: ${CMD[*]}"
export PYTHONUNBUFFERED=1 # to get all print output in log
cd "$PWD" && "${CMD[@]}"
# Make a failed run visible in the error log.
# NOTE(review): the script still carries on and ends with the status of its
# last command - decide whether the job should exit with run_status instead.
run_status=$?
if (( run_status != 0 )); then
  echo "Run command failed with exit status ${run_status}" >&2
fi
# Record when the run ended (epoch seconds and human-readable form).
end_time=$(date +%s)
end_time_human=$(date '+%Y-%m-%d %H:%M:%S')
# Split the elapsed seconds into hours, minutes and seconds.
runtime=$(( end_time - start_time ))
hours=$(( runtime / 3600 ))
minutes=$(( runtime / 60 % 60 ))
seconds=$(( runtime % 60 ))
printf 'FINISHED at %s\n' "$end_time_human"
printf 'Total runtime: %sh %sm %ss\n' "$hours" "$minutes" "$seconds"
# Add memory and general computation statistics output

#######################################
# Print the largest MaxRSS value read from stdin, converted to KB.
# Input:   one sacct MaxRSS value per line, e.g. "2048K", "512M", "1.50G";
#          surrounding blanks are ignored and empty lines are skipped.
# Outputs: the maximum as a whole number of KB on stdout, or nothing when no
#          value greater than zero was found.
# A value without a unit suffix is taken as KB, as the previous
# implementation did. NOTE(review): confirm the unit sacct uses in that case.
#######################################
largest_maxrss_kb() {
  LC_ALL=C awk '
    {
      gsub(/[ \t\r]/, "")
      if ($0 == "") next
      value = $0
      unit = substr(value, length(value))
      factor = 1
      if (unit == "M") factor = 1024
      else if (unit == "G") factor = 1024 * 1024
      else if (unit == "T") factor = 1024 * 1024 * 1024
      if (unit ~ /^[KMGT]$/) value = substr(value, 1, length(value) - 1)
      kb = value * factor
      if (kb > max) max = kb
    }
    END { if (max > 0) printf "%.0f\n", max }
  '
}

echo -e "=== MEMORY USAGE SUMMARY ==="
# sacct prints one row per job step. The first row is typically the job
# allocation itself, whose MaxRSS column is blank, so take the largest value
# over all rows instead of only looking at the first line.
maxrss_kb=""
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  maxrss_kb=$(sacct -j "$SLURM_JOB_ID" -o MaxRSS -n | largest_maxrss_kb)
fi
# Check if we got a value
if [[ -n "$maxrss_kb" ]]; then
  # Convert to MB and GB
  maxrss_mb=$(LC_ALL=C awk -v kb="$maxrss_kb" 'BEGIN { printf "%.2f", kb / 1024 }')
  maxrss_gb=$(LC_ALL=C awk -v kb="$maxrss_kb" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
  echo "Maximum memory used: $maxrss_kb KB = $maxrss_mb MB = $maxrss_gb GB"
else
  echo "Memory usage data not available"
fi
# Also show full job statistics
echo -e "\n=== DETAILED JOB STATISTICS ==="
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  sacct -j "$SLURM_JOB_ID" --format=JobID,JobName,MaxRSS,AveCPU,Elapsed,MaxDiskRead,MaxDiskWrite
else
  echo "Job statistics not available (SLURM_JOB_ID is not set)"
fi