@@ -35,7 +35,7 @@ mkdir -p "$TIME_LOG_DIR" "$METRICS_JSON_DIR"
 TIME_LOG="$TIME_LOG_DIR/${RUN_ID}.txt"
 METRICS_JSON="$METRICS_JSON_DIR/${RUN_ID}.json"
 METRICS_CSV="$LOGDIR/metrics.csv"
-METRICS_HEADER="run_id,timestamp,output_name,output_dir,exit_code_java,wall_seconds_java,user_seconds_java,sys_seconds_java,max_rss_kb_java,input_mapping_size_bytes,input_vcf_size_bytes,output_dir_size_bytes,output_triples,jar,mapping_file,output_path,combined_rdf_size_bytes,gzip_size_bytes,brotli_size_bytes,hdt_size_bytes,exit_code_gzip,exit_code_brotli,exit_code_hdt,wall_seconds_gzip,user_seconds_gzip,sys_seconds_gzip,max_rss_kb_gzip,wall_seconds_brotli,user_seconds_brotli,sys_seconds_brotli,max_rss_kb_brotli,wall_seconds_hdt,user_seconds_hdt,sys_seconds_hdt,max_rss_kb_hdt,compression_methods,hdt_source,gzip_on_hdt_size_bytes,brotli_on_hdt_size_bytes,exit_code_gzip_on_hdt,exit_code_brotli_on_hdt,wall_seconds_gzip_on_hdt,user_seconds_gzip_on_hdt,sys_seconds_gzip_on_hdt,max_rss_kb_gzip_on_hdt,wall_seconds_brotli_on_hdt,user_seconds_brotli_on_hdt,sys_seconds_brotli_on_hdt,max_rss_kb_brotli_on_hdt"
+METRICS_HEADER="run_id,timestamp,output_name,output_dir,exit_code_java,wall_seconds_java,user_seconds_java,sys_seconds_java,max_rss_kb_java,input_mapping_size_bytes,input_vcf_size_bytes,output_dir_size_bytes,output_triples,jar,mapping_file,output_path"


 # Return byte size for file or directory (GNU + BSD compatible).
@@ -76,6 +76,21 @@ stat_size() {

 have_gnu_time () { [[ -x /usr/bin/time ]] && /usr/bin/time --version > /dev/null 2>&1 ; }

+# Return stable content hash for duplicate part detection.
+hash_file_sha256 () {
+  local path="$1"
+  if command -v sha256sum > /dev/null 2>&1; then
+    sha256sum "$path" | awk '{print $1}'
+    return
+  fi
+  if command -v shasum > /dev/null 2>&1; then
+    shasum -a 256 "$path" | awk '{print $1}'
+    return
+  fi
+  # Last-resort fallback when SHA utilities are unavailable.
+  cksum "$path" | awk '{print $1":"$2}'
+}
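+# Usage sketch (hypothetical file name): PART_HASH=$(hash_file_sha256 "part-00000.nt")
+# yields a hex digest via sha256sum/shasum, or a "crc:bytes" pair from cksum.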
+
 # Count triples via non-comment RDF lines ending in '.'.
 count_triples_json () {
   local path="$1"
@@ -167,13 +182,30 @@ if [[ "$AGGREGATE_RDF" == "1" ]]; then
   PART_FILES=("$OUT_DIR/$OUT_NAME"/*.nt)
   if (( ${#PART_FILES[@]} > 0 )); then
     : > "$MERGED_NT"
+    # Defensive dedupe: some Spark/RMLStreamer runs can emit identical part
+    # files for the same dataset. Skip exact duplicate part payloads to avoid
+    # doubling every triple in the merged output.
+    SEEN_HASH_FILE="$OUT_DIR/$OUT_NAME/.seen_part_hashes.$$"
+    SEEN_MAP_FILE="$OUT_DIR/$OUT_NAME/.seen_part_hash_map.$$"
+    : > "$SEEN_HASH_FILE"
+    : > "$SEEN_MAP_FILE"
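+    # Scratch-file layout: SEEN_HASH_FILE keeps one digest per line for the
+    # grep -Fqx membership test; SEEN_MAP_FILE keeps "digest<TAB>path" rows so
+    # a duplicate warning can name the first file seen with that content.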
     for PART_NT in "${PART_FILES[@]}"; do
       if [[ "$PART_NT" == "$MERGED_NT" ]]; then
         continue
       fi
+      PART_HASH=$(hash_file_sha256 "$PART_NT")
+      if grep -Fqx "$PART_HASH" "$SEEN_HASH_FILE"; then
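+        # Recover the first path recorded for this digest from the tab-separated map.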
+        FIRST_SEEN=$(awk -F'\t' -v hash="$PART_HASH" '$1 == hash { print $2; exit }' "$SEEN_MAP_FILE")
+        echo "WARNING: skipping duplicate RDF part '$PART_NT' (same content as '$FIRST_SEEN')." >&2
+        rm -f "$PART_NT"
+        continue
+      fi
+      printf "%s\n" "$PART_HASH" >> "$SEEN_HASH_FILE"
+      printf "%s\t%s\n" "$PART_HASH" "$PART_NT" >> "$SEEN_MAP_FILE"
       cat "$PART_NT" >> "$MERGED_NT"
       rm -f "$PART_NT"
     done
+    rm -f "$SEEN_HASH_FILE" "$SEEN_MAP_FILE"
   else
     : > "$MERGED_NT"
   fi
@@ -265,13 +297,7 @@ csv_fields=(
   "$JAR"
   "$IN"
   "$OUTPUT_PATH"
-  "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
 )
-
-# Compression-related fields are initialized as empty from conversion step output.
-for _ in $(seq 1 13); do
-  csv_fields+=("")
-done
 ( IFS=,; echo "${csv_fields[*]}" ) >> "$METRICS_CSV"

 echo "Done."