-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstrat_3_dataset_LC_BC_PS_finetuning.sh
More file actions
144 lines (108 loc) · 3.84 KB
/
strat_3_dataset_LC_BC_PS_finetuning.sh
File metadata and controls
144 lines (108 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
# Strategy-3 fine-tuning dataset generation (LC / BC / PS execution-aware).
# Usage: strat_3_dataset_LC_BC_PS_finetuning.sh SEQUENCE_PATH NAME OUTPUT_PATH TRACE_PATH
# exit on errors
# set -e
# activate your environment
source .env/bin/activate
# Required positional arguments — ${n:?} aborts with a message when missing,
# instead of silently producing broken paths downstream.
SEQUENCE_PATH=${1:?"usage: $0 SEQUENCE_PATH NAME OUTPUT_PATH TRACE_PATH"}
NAME=${2:?"missing NAME (2nd argument)"}
OUTPUT_PATH=${3:?"missing OUTPUT_PATH (3rd argument)"}
TRACE_PATH=${4:?"missing TRACE_PATH (4th argument)"}
PIE_PATH="./pie/${NAME}.jsonl"
mkdir -p "$OUTPUT_PATH/tmp"
mkdir -p datasets
# Build the execution-aware formatted dataset; all paths quoted so the
# script survives spaces in user-supplied directories.
python src/data/s3_ft_exec_aware_format.py \
  --pie-path "$PIE_PATH" \
  --sequence-folder "$SEQUENCE_PATH" \
  --trace-path "$TRACE_PATH" \
  --output-path "$OUTPUT_PATH" \
  --suffix "$NAME"
#######################################
# Preprocess one dataset: extract C++ sources, strip comments with gcc,
# normalize style with clang-format, and rebuild a CSV dataset.
# Arguments:
#   $1 - dataset path (.jsonl)
#   $2 - temporary folder that will hold the generated cpp sources
#   $3 - csv output name (no extension); written to datasets/$3.csv
# Outputs:
#   datasets/$3.csv
#######################################
process_dataset() {
  local dataset_path=$1
  local tmp_folder=$2
  local csv_name=$3
  # max background jobs running in parallel
  local -r max_jobs=30
  local job_i=0
  local cpp_prog

  echo "Processing $dataset_path dataset"
  mkdir -p "$tmp_folder"
  python src/data/generate_c_sources.py \
    --dataset "$dataset_path" \
    --src-col source \
    --tgt-col target \
    --folder "$tmp_folder"

  # Strip comments (-fpreprocessed -dD -E -P keeps code, drops comments),
  # fanning out up to max_jobs gcc processes at a time. NUL-delimited find
  # output is safe for any filename (no word-splitting/globbing).
  while IFS= read -r -d '' cpp_prog; do
    ((job_i = job_i % max_jobs)); ((job_i++ == 0)) && wait
    gcc -fpreprocessed -dD -E -P "$cpp_prog" -o "${cpp_prog%.*}_processed.cpp" &
  done < <(find "./$tmp_folder" -type f -name '*.cpp' -print0)
  wait

  # Reformat the comment-stripped sources in place, same fan-out pattern.
  # (Processing order does not matter for clang-format -i.)
  job_i=0
  while IFS= read -r -d '' cpp_prog; do
    ((job_i = job_i % max_jobs)); ((job_i++ == 0)) && wait
    (
      echo "$cpp_prog"
      clang-format -i --style=llvm "$cpp_prog"
    ) &
  done < <(find "./$tmp_folder" -type f -name '*_processed.cpp' -print0)
  wait
  # small grace period so in-place edits are fully flushed before reading
  sleep 5

  python src/ft/remove_comments/create_c_sources.py \
    --folder "./$tmp_folder" \
    --src-suffix _src_processed.cpp \
    --tgt-suffix _tgt_processed.cpp \
    --full-dataset-path "$dataset_path" \
    --save-as "datasets/${csv_name}.csv"
}
# Preprocess source code removing comments and custom format
echo "Processing exec aware ${NAME} set"
process_dataset "$OUTPUT_PATH/execaware_${NAME}.jsonl" "tmp_execaware_${NAME}" "execaware_${NAME}_processed"

# Specialize the processed CSV into one dataset per execution-aware signal.
# Each entry is "output-name|source-column|label"; order and arguments match
# the original one-call-per-dataset sequence exactly.
specializations=(
  "LE_execaware_${NAME}|source_line_exec|LE"
  "LC_execaware_${NAME}|source_line_cov|LC"
  "BC_execaware_${NAME}|source_bran_cov|BC"
  "PS_execaware_${NAME}|source_prog_states|PS"
  "vanilla_format_${NAME}|source_vanilla|VANILLA FORMAT"
  "vanilla_${NAME}|source|VANILLA"
)
for spec in "${specializations[@]}"; do
  IFS='|' read -r out_name src_col label <<<"$spec"
  python src/data/s3_specialize.py \
    --dataset-path "datasets/execaware_${NAME}_processed.csv" \
    --output-path "datasets/${out_name}.csv" \
    --id-col id \
    --source-col "$src_col" \
    --target-col target
  echo "${label} dataset generated"
done
# echo "Processing vanilla ${NAME} set"
# process_dataset "$3/vanilla_${NAME}.jsonl" "tmp_vanilla_${NAME}" "vanilla_${NAME}_processed"
# python src/ft/ft_merge_traced_pie.py \
# --original-path /mnt/data/ExecAwarePT/dataset/exec_aware_ft/exec_aware_${NAME}.jsonl \
# --processed-path datasets/exec_aware_${NAME}_processed.csv \
# --output-file datasets/FT_exec_aware_${NAME}.csv
# cp datasets/vanilla_${NAME}_processed.csv datasets/FT_vanilla_${NAME}.csv
# # remove tmp files
# rm -f datasets/vanilla_${NAME}_processed.csv
# rm -f datasets/exec_aware_${NAME}_processed.csv
# generate datasets