cpphash/idmerge.sh at main · maartenSXM/cpphash · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
#!/usr/bin/env bash

# idmerge.sh: merge yaml common blocks with an id: tag

# Common blocks are merged backwards in the output by referencing tags
# specified in "id: <tag>" yaml lines. The first common block to
# specify a unique <tag> is the destination for subsequent references.
# Lines in subsequent blocks are merged at the end of the block
# that declared "id: <tag>" first. Array elements are each treated
# as common blocks, so it is possible to merge into an array element
# as long as it declares itself using "id: <tag>".

# Usage: idmerge.sh [-t <idTag>] <input.yaml >output.yaml

# After running idmerge.sh, it is advised to postprocess the yaml with
#   awk '/^[[:alnum:]_]/{print "---"} {print}' file.yaml | yq '... comments=""'
# (start a new yaml document at every map key and strip comments)
# before piping it into yq to merge sections using:

#   yq eval-all '. as $item ireduce ({}; . *+ $item)'

# Maarten's Law: "Everything is more complicated than it should be."
# For proof of Maarten's Law, uncomment the next line and run this script.

#   more idmerge.sh; exit

# Strict mode: abort on unhandled errors, treat use of an unset variable as
# an error (this catches variable-name typos early) and make a pipeline fail
# when any of its stages fails.

set -e
set -o nounset
set -o pipefail

declare -r me=${0##*/}  # basename of this script

# Help text. It is printed with printf '%b', which expands the \t and \n
# escapes embedded below. Inner double quotes must be escaped, otherwise
# they end the string early.

declare -r usage="$me: merge yaml common blocks with an id: tag

Usage: $me: [-q] [-p] [-m] [-h] [-t tag] [-o outfile] <file.yaml>\n
  -t|--tag\tItem tag that uniquely names what to merge. Defaults to \"id\".
  -o|--outfile\tFile to write to, else stdout.
  -q|--quiet\tDo not output operational feedback to stdout.
  -p|--parseinfo\tOutput input parser debug info to stderr
  -m|--mergeinfo\tOutput merge debug info to stderr
  -h|--help\tOutput this help.

<file.yaml>\tThe yaml file to merge, else stdin.

This script is from git repo github.com/maartenSXM/cpphash.
Note: This script does not vet arguments securely. Do not setuid or host it.
"

# Associative arrays (used further down) need bash 4 or newer.

if ((BASH_VERSINFO[0] < 4)); then
  echo "$me: *** bash must be at version 4 or greater. Install it. ***" >&2
  exit -1
fi

declare outfile=/dev/stdout  # output yaml file
declare idtoken="id"         # id tag for yaml item unique keys
declare -i chatinfo=1        # some operational feedback to stdout
declare -i parseinfo=0       # input parser debug output to stderr
declare -i mergeinfo=0       # merge engine debug output to stderr

# Parse leading options; the first word that is not a known option ends
# option parsing and is taken as the input file below.

while (($# > 0)); do
  case "$1" in
    -h|--help)
      printf '%b' "$usage"
      exit 0
      ;;
    -o|--out|--outfile)  # --out is kept for backward compatibility
      if (($# < 2)); then
        echo "$me: option $1 requires an argument" >&2
        exit -1
      fi
      outfile="$2"
      shift 2
      ;;
    -t|--tag)
      if (($# < 2)); then
        echo "$me: option $1 requires an argument" >&2
        exit -1
      fi
      idtoken="$2"
      shift 2
      ;;
    -q|--quiet)     chatinfo=0;  shift 1;;
    -p|--parseinfo) parseinfo=1; shift 1;;
    -m|--mergeinfo) mergeinfo=1; shift 1;;
    *) break;;
  esac
done

# The first remaining argument, if any, is the yaml file to read;
# otherwise read standard input.

if (($# > 0)); then
  declare -r infile="$1"
else
  declare -r infile=/dev/stdin
fi

# Tips for debugging:

# 1. This program outputs line numbers starting at 1.
#    To dump line numbers of a test yaml file, use nl with -ba so that
#    blank lines are numbered too. Eg: nl -ba test1.yaml

# 2. For large yaml test files, it is helpful to discard stdout and page
#    through stderr instead. That is done like this:
#      ./idmerge.sh <bigtest.yaml 2>&1 >/dev/null | more
#    It works because the shell processes redirections from left to right:
#    2>&1 first points stderr at the pipe, then >/dev/null redirects
#    only stdout.

# 3. Set parseinfo to one (option -p) to see how blocks are found.
#    Set mergeinfo to one (option -m) to see how blocks are stored and output.

# FWIW, this script does not use any subprocesses and runs entirely in bash.

# Global state shared by all functions below.

declare retval           # result slot: functions hand back a value through it
declare -i nlines=0      # number of yaml lines input
declare -i _nlines=0     # number of digits in nlines (e.g. 2 for 64)
declare -i nmapkeys=0    # monotonically incrementing map key number

declare -a lines         # all input lines, in input order

# Per-line bookkeeping, indexed by line number (0 .. nlines-1).

declare -ai skip         # 1 when the line must not be output
declare -ai visited      # 1 once the BFS has visited the line (avoids loops)
declare -a block_list    # space-separated block numbers to emit after the line

# Associative array indexed by block key. A block key has the form
# <mapkey>/<item>/... where each item is an array item name, an item
# name or an id: tag.

declare -Ai key_block    # block number of a key once seen, unset otherwise

# Block table filled in by the front-end parser. Each map key is scanned
# in sequence using breadth first search; the arrays below are indexed by
# block number.

declare -i nblocks=0     # number of blocks recorded so far

declare -a block_key     # assigned block key (/mapkey:/item:/item:/etc...)
declare -ai block_from   # first line number of the block
declare -ai block_to     # last line number of the block
declare -ai block_idline # line of the block's "id:" entry, zero if none

# Read-only patterns used to recognise special yaml lines.

readonly newdoc="---"
readonly is_array='^[[:blank:]]*- ([[:alnum:]: _]+)[[:blank:]]*$'
readonly is_id="^[[:blank:]]+$idtoken:[[:blank:]]+([[:alnum:]_]+)[[:blank:]]*\$"
readonly is_key='^[[:blank:]]*([[:alnum:]_]+):[[:blank:]]*$'
readonly is_mapkey='^([[:alnum:]_]+:)[[:blank:]]*$|^---$'
readonly is_comment='^[[:blank:]]*#.*$'
readonly is_blank='^[[:blank:]]*$'
readonly get_mapname='^([[:alnum:]_]+):.*$'

# This function reads the yaml input from $infile into the 'lines' array.

# read_lines: load the whole yaml input from $infile.
#
# Globals written: lines, skip, visited, block_list (one entry per input
#                  line), nlines, _nlines.
# Globals read:    infile, newdoc, is_comment, is_blank.
#
# Document separators, comment lines and blank lines are flagged in 'skip'
# so that they are neither parsed nor output.

read_lines () {
  declare -i n=0
  declare line=""

  # 'read' returns non-zero on a final line that lacks a trailing newline
  # but still fills in $line; the extra test keeps that line instead of
  # silently dropping it.

  while IFS='' read -r line || [[ -n "$line" ]]; do
    lines[$n]="$line"
    skip[$n]=0
    visited[$n]=0
    block_list[$n]=""
    if [[ "$line" == "$newdoc" || "$line" =~ $is_comment || \
          "$line" =~ $is_blank ]]; then
      skip[$n]=1
    fi
    n=n+1
  done < "$infile"

  # Keep an empty sentinel entry one past the last line: block scanning
  # may look at lines[nlines] when a block runs to the end of the input.

  lines[$n]=""

  nlines=$n		# number of lines of yaml
  _nlines=${#nlines}	# number of digits in nlines
}

# Worker function that outputs lines and marks them for skipping.

# _write_lines <line> <oldline>
#
# Emit line <line> on stdout, followed (recursively) by the lines of every
# block that was appended to it in block_list. <oldline> is zero when the
# line is output in place; otherwise the line is being moved by a merge and
# gets a trailing "was line N" comment.
#
# Globals read:    lines, block_list, block_from, block_to.
# Globals written: skip (every emitted line is flagged so it is output once).

_write_lines () {
  declare -i n=$1	# line number
  declare -i oldline=$2	# merging from line, or zero

  # block_list holds space-separated block numbers; the unquoted expansion
  # deliberately word-splits them into an array.
  declare -a -i blocks=(${block_list[$n]})
  declare -i nblocks=${#blocks[@]}   # local; shadows the global block count

  declare -i bl # block_list index
  declare -i b  # block index
  declare -i p  # column indentation alignment for merge comment
  declare -i l  # line cursor; must be local because this function recurses

  # If the line has been flagged to not be output, skip it.
  # This can happen when it is a merge block and the leading lines
  # are duplicates, since the block being merged into has those lines also.
  # It can also occur for lines in the merge block itself that have
  # already been output since they moved earlier.

  if ((skip[n] == 1)); then
    return
  fi

  # Output line $n.
  # oldline is 0 when the line is being simply output and not moving.

  if ((oldline == 0)); then
    printf '%s\n' "${lines[$n]}"
  else

    # Round comment column to next highest x10 after yaml line, min 40.

    p=$(( (${#lines[$n]} / 10 + 1) * 10 ))
    if ((p <= 40)); then
      p=40
    fi
    printf '%-*s # idmerge.sh: was line %d\n' \
      "$p" "${lines[$n]}" "$((oldline + 1))"
  fi

  skip[$n]=1		# a line can only be output once

  # Output the lines of any blocks appended to line $n while outputting
  # any blocks appended to their lines, in the process.

  for ((bl = 0; bl < nblocks; bl = bl + 1)); do
    b=${blocks[bl]}
    for ((l = block_from[b]; l <= block_to[b]; l = l + 1)); do
      _write_lines "$l" "$l"
    done
  done
}

# Output the input, while merging.
#
# The output file is opened exactly once for the whole run. Re-opening it
# with '>' for every line would truncate it each time and leave only the
# last line in a regular file.

write_lines () {
  declare -i n=0

  for ((n = 0; n < nlines; n = n + 1)); do
    _write_lines "$n" 0
  done >"$outfile"
}

# _record_work saves blocks in an associative array indexed by the block key.

# Before it saves the block it checks if the key already exists.
# If it does, then the block has already been seen and it is recorded
# as merge work. In other words, duplicate common blocks become merge work.
# _record_work also considers array items: duplicate array items are
# permitted, so by themselves they do not become merge work. However, if a
# duplicate array item is named by an id:, then it can become work. That is
# why array items have the id: in their key, if specified.

# _record_work <block>
#
# Register block number <block> under its key, or, when that key has been
# seen before, schedule the block to be merged at the end of the block that
# owns the key.
#
# Globals read:    block_key, block_from, block_to, block_idline, lines,
#                  is_id, is_array, is_key, is_mapkey, mergeinfo, chatinfo,
#                  idtoken, me.
# Globals written: key_block (first sighting), block_list and skip (merge).

_record_work () {
  declare -i block="$1"

  declare key="${block_key[$block]}"
  declare -i from=${block_from[$block]}
  declare -i to=${block_to[$block]}
  declare -i idline=${block_idline[$block]}

  declare -i n=0	# utility integer variable
  declare -i kb=0	# key block number
  declare -i dest	# where to merge to
  declare id=""		# <tag> of "id: <tag>" line

  # Grab <tag> from the "id: <tag>" line; it is only used in messages.

  if [[ "${lines[$idline]}" =~ $is_id ]]; then
    id="${BASH_REMATCH[1]}"
  fi

  # If the key has never been seen, save it and return.

  if [ -z "${key_block[$key]:-}" ]; then
    if ((mergeinfo)); then
      echo "*** Saved block $((block+1)) $((from+1))-$((to+1)) ($id)" >&2
    fi
    key_block[$key]=$block
    return
  fi

  # This is the second time we see this block. Merge it.

  kb=${key_block[$key]}
  dest=${block_to[$kb]}

  if ((mergeinfo)); then
    echo \
      "*** Found block $((kb+1)) for block $((block+1)) $((from+1))-$((to+1)) ($id)" >&2
  fi

  if ((chatinfo)); then
    echo \
  "$me: Moving lines $((from+1))-$((to+1)) to line $((dest+1)) ($idtoken: $id)" >&2
  fi

  # Skip the id: line since it is in the block being merged into.

  skip[$idline]=1

  if ((mergeinfo)); then
    echo "*** Skipping id line $((idline+1)) ($idtoken: $id)" >&2
  fi

  # If we are merging an array element, we will skip the from line too.

  if [[ "${lines[$from]}" =~ $is_array ]]; then
    if ((mergeinfo)) && ((skip[from] == 0)); then
      echo "*** Skipping array line $((from+1))" >&2
    fi
    skip[$from]=1
  fi

  # Record the merge work by appending the block number to the dest line.
  # Use our own argument here rather than a loop counter that happens to be
  # visible from the caller through dynamic scoping.

  block_list[$dest]="${block_list[$dest]} $block"

  # Skip the leading up section headers in the merged section since
  # they will come from the destination block being merged into.
  # Walk upwards until (and including) the enclosing map key line.

  n=from-1

  while ((n >= 0)); do
    if ((mergeinfo == 1 && skip[n] == 0)); then
      if [[ "${lines[$n]}" =~ $is_key ]]; then
        id="${BASH_REMATCH[1]}"
        echo "*** Skipping key line $((n+1)) ($id)" >&2
      else
        echo "*** Skipping line $((n+1))" >&2
      fi
    fi
    skip[$n]=1
    if [[ "${lines[$n]}" =~ $is_mapkey ]]; then
      break
    fi
    n=n-1
  done
}

# Debug function for when parseinfo is set to 1.

# dump_common_block <block>
#
# Debug helper: print the key and the numbered lines of block <block>
# (1-based, as shown to the user) on stderr.
#
# Globals read: block_key, block_from, block_to, lines.

dump_common_block () {
  declare -r -i b=$1
  declare -i l

  # Look the key up in the block table instead of relying on a variable
  # named 'key' being visible from whichever function called us.

  echo "*** Block $((b+1)) (${block_key[$b]})" >&2
  for ((l=block_from[b]; l <= block_to[b]; l=l+1)); do
    echo " $((l+1)): ${lines[$l]}" >&2
  done
}

# Debug function currently not called.

dump_common_blocks () {
  # Walk the block table from block 0 up to nblocks-1 and dump each entry.
  declare -i idx=0

  while ((idx < nblocks)); do
    dump_common_block $idx
    idx=$((idx + 1))
  done
}

# Get the previous line that is not skipped (i.e. not a --- or comment).

_get_prev_non_skip () {
  # Scan backwards from the line just above $1; hand the first line whose
  # skip flag is clear back in retval, or -1 when there is none.
  declare -i idx

  for ((idx = ($1) - 1; idx >= 0; idx = idx - 1)); do
    if ((skip[idx] == 0)); then
      retval=$idx
      return
    fi
  done
  retval=-1
}

# Get the next line that is not skipped.

_get_next_non_skip () {
  # Scan forwards from the line just below $1; hand the first line whose
  # skip flag is clear back in retval, or nlines when there is none.
  declare -i idx

  for ((idx = ($1) + 1; idx < nlines; idx = idx + 1)); do
    if ((skip[idx] == 0)); then
      retval=$idx
      return
    fi
  done
  retval=$nlines
}

# Get the first line in the file that is not skipped.

_get_first_non_skip () {
  # Searching forward from "line -1" makes line 0 the first candidate.
  # The result is left in retval by _get_next_non_skip.
  declare -r -i before_first=-1

  _get_next_non_skip "$before_first"
}

# Get the number of spaces before the first non-space character in the line.

_get_indentation () {
  # Hand back, in retval, the leading whitespace of line $1. For an array
  # item line two extra spaces are appended, so that "- " counts as
  # indentation.
  declare -i idx=$1
  declare text="${lines[$idx]}"
  declare lead="${text%%[![:space:]]*}"

  [[ "$text" =~ $is_array ]] && lead="$lead  "

  retval="$lead"
}

# Get the last line number of a common block.

_get_block_to () {
  # $1: first line of the block.
  # $2: 1 to span all items when the block is an array, 0 to stop at the
  #     next array item on the same indentation.
  # Leaves the block's last line number in retval.
  declare -i cur=$1
  declare -i spanItems=$2

  declare baseIndent	# indentation of the block's first line
  declare curIndent	# indentation of the line under inspection

  _get_indentation $cur
  baseIndent="$retval"

  _get_next_non_skip $cur
  cur=$retval

  while ((cur <= nlines)); do
    _get_indentation $cur
    curIndent="$retval"

    # A line indented less than the block's first line ends the block.
    [[ "$curIndent" < "$baseIndent" ]] && break

    # So does a sibling array item, unless all items are to be spanned.
    if ((spanItems == 0)) &&
       [[ "$curIndent" == "$baseIndent" && "${lines[$cur]}" =~ $is_array ]]
    then
      break
    fi

    _get_next_non_skip $cur
    cur=$retval
  done

  # The block ends on the last non-skipped line before the stop line;
  # retval from _get_prev_non_skip is passed back through to the caller.
  _get_prev_non_skip $cur
}

# _find_blocks(): Find all the common blocks in a block.

# A common block starts at a mapkey or when indentation moves right and
# ends at a next mapkey or when indentation moves left. Each array element
# is also a common block. Array elements start with "- " and end at a
# next "- " or when indentation moves left. Note that "- " is
# considered "  " by _get_indentation() for indentation comparisons.

# If this was a depth-first search, below would simply recurse. However,
# this is breadth-first search in order to retain ordering so _find_blocks()
# uses a queue to record blocks in the order of their start line.

# _find_blocks <first> <last> <key> <idTag>
#
# Scan lines <first>..<last> at the indentation of line <first>. Deeper
# indented runs and array items are queued and scanned recursively only
# after this level has been scanned. If an "id:" line is found at this
# level, the block is added to the block table (block_from, block_to,
# block_idline, block_key) under "<key>/<tag>".
#
# Result: retval is the line at which scanning of this level stopped.
# Globals written: visited, block_* arrays, nblocks, retval.

_find_blocks() {
  declare -i -r first=$1   # first line of the block to search in
  declare -i -r last=$2	   # last line of the block to search in
  declare key="$3"	   # key for this block
  declare idTag="$4"	   # default "id:" for this block

  declare -i n             # line currently being examined
  declare -i q             # index into the queue arrays
  declare -i prev=0        # previous non-skipped line, used as default tag
  declare -i idline=0      # line of this block's "id:" entry, zero if none

  declare blockSpaces=""   # indentation of line <first>
  declare lineSpaces=""    # indentation of line n
  declare skipSpaces=""    # not used in this function

  declare -i nqueue=0	    # number of blocks found inside this one
  declare -i -a queue_first # array of first lines of each inside block
  declare -i -a queue_last  # array of last  lines of each inside block
  declare -a queue_idTag    # default id:'s for each inside block

  _get_indentation "$first"; blockSpaces="$retval"

  ((parseinfo)) && echo \
    "*** Start _find_block $((first+1)) $((last+1)) \"$key\" \"$idTag\"" >&2

  n=$first

  while (( n <= last )); do

    _get_indentation $n; lineSpaces="$retval"

    # If we are at a next mapkey, end the current block.
    if (( n>first )); then
      if [[ "${lines[$n]}" =~ $is_mapkey ]]; then
        break
      fi
    fi

    # If indentation goes left, end the current block.
    # (The indentation strings are compared as strings.)

    if [[ "$lineSpaces" < "$blockSpaces" ]]; then
      ((parseinfo)) && echo "***: Indentation went left" >&2
      break
    fi

    # If indentation goes right, queue the inside block.

    if [[ "$lineSpaces" > "$blockSpaces" ]]; then
      queue_first[$nqueue]=$n
      # If this level is unindented, use the default tag "id: <none>".
      # Else use the previous non-skipped line as the default tag.
      if [[ "$blockSpaces" == "" ]]; then
        queue_idTag[$nqueue]="$idtoken: <none>"
      else
        _get_prev_non_skip $n; prev=$retval
        queue_idTag[$nqueue]="${lines[$prev]}"
      fi
      # The inside block spans all array items (second argument 1).
      _get_block_to $n 1; n=$retval
      queue_last[$nqueue]=$n

      ((parseinfo)) && echo \
        "*** Indentation went right $((queue_first[$nqueue]+1)) $((n+1))" >&2

      nqueue=nqueue+1

      # Resume scanning after the queued block.
      _get_next_non_skip $n; n=$retval
      continue
    fi

    # From here, the line is in the current block.

    # If this is the beginning of a list of array items, queue them each
    # for scanning one item at a time as a block since they may have
    # sub-blocks and also they may have an id: field to qualify its key.
    # The visited flag is set here so that, when the queued item is later
    # scanned with this line as <first>, the line is not queued again and
    # falls through to the code below instead.

    if [[ "${visited[$n]}" == "0" && "${lines[$n]}" =~ $is_array ]]; then
      visited[$n]=1
      queue_first[$nqueue]=$n
      queue_idTag[$nqueue]="${lines[$n]}"
      # A single array item only (second argument 0).
      _get_block_to $n 0; n=$retval
      queue_last[$nqueue]=$n

      ((parseinfo)) && echo \
	"*** Queued block $((queue_first[$nqueue]+1)) $((n+1))" >&2

      nqueue=nqueue+1

      _get_next_non_skip $n; n=$retval
      continue
    fi

    # If it has an id: line, record that in idline and let its tag
    # replace the default idTag passed in by the caller.

    if [[ "${lines[$n]}" =~ $is_id ]]; then
      idline=$n
      ((parseinfo)) && echo "*** Parsed ${BASH_REMATCH[1]} id at $((n+1))" >&2
      idTag="${BASH_REMATCH[1]}"
    fi

    ((parseinfo)) && echo "*** Queued line $((n+1))" >&2

    _get_next_non_skip $n; n=$retval
  done

  # Extend the key with this block's tag; queued inside blocks inherit it.

  if [[ "$idTag" != "" ]]; then
    key="$key/$idTag"
  fi

  # Only blocks with their own "id:" line are entered in the block table.
  # The block ends on the last non-skipped line before the stop line n.

  if ((idline != 0)); then
    _get_prev_non_skip $n;
    block_from[$nblocks]=$first
    block_to[$nblocks]=$retval
    block_idline[$nblocks]=idline
    block_key[$nblocks]="$key"

    ((parseinfo)) && \
	  dump_common_block $nblocks

    nblocks=nblocks+1
  fi

  # Now handle any queued blocks.

  for ((q=0; q < nqueue; q=q+1)); do
    _find_blocks ${queue_first[$q]} ${queue_last[$q]} \
		 "$key" "${queue_idTag[$q]}"
  done

  ((parseinfo)) && echo \
    "*** End _find_block $((first+1)) $((last+1)) \"$key\" \"${idTag}\"" >&2

  # Set last, so it overrides whatever the calls above left in retval.
  retval=$n
}

# Kick off _find_blocks() starting at each mapkey.

find_blocks() {
  # Every non-skipped line reached at this level must be a map key; scan
  # the section below each one with _find_blocks, which reports in retval
  # where the next section starts.
  declare -i n=0
  declare key=""

  _get_first_non_skip
  n=$retval

  until ((n >= nlines)); do
    if ! [[ "${lines[$n]}" =~ $is_mapkey ]]; then
      echo "$me: Error: line $((n+1)) is not a map key: ${lines[$n]}" >&2
      exit -1
    fi

    # The section key is the map key name prefixed with a slash.
    [[ "${lines[$n]}" =~ $get_mapname ]]
    key="/${BASH_REMATCH[1]}"
    nmapkeys=$((nmapkeys + 1))

    _find_blocks $n $((nlines-1)) "$key" ""
    n=$retval
  done
}

# For each block found by find_blocks(), check for merge blocks.

record_work () {
  # Feed every block number, in ascending order, to _record_work.
  # NOTE: the counter is deliberately still named 'b'. Bash locals are
  # dynamically scoped, so a callee can see it; renaming it could change
  # what such a callee reads.
  declare -i b=0

  while ((b < nblocks)); do
    _record_work "$b"
    b=$((b + 1))
  done
}

main () {
  # The four stages, run strictly in this order:
  #   read_lines   read the input into the 'lines' array
  #   find_blocks  find common blocks in 'lines' using breadth first search
  #   record_work  scan the blocks and identify merge 'work'
  #   write_lines  write the 'lines' array out while processing work
  declare stage

  for stage in read_lines find_blocks record_work write_lines; do
    "$stage"
  done

  exit 0	# Done. Voila!
}

main		# Run the pipeline; main() exits the script itself and never returns.