|
#!/usr/bin/env bash
# pnote: run a command and record its provenance (who/where/when, declared
# inputs and outputs) in a sqlite3 database.
PNOTEVER=0.1.0.20260313
set -euo pipefail

#######################################
# Print the help text.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout ($0 is used as the program name)
#######################################
usage(){
  cat <<HEREDOC
Provenance tracking for any command and any input/output graph.
Like datalad, but not linked to source control versioning

 USAGE:
   $0 -o test.txt -- touch test.txt
   $0 --stdout date.txt -- date
   $0 show path/to/file
 OPTIONS:
   -o|--output OUTPUT_FILE  output file, can repeat
   -i|--input INPUT_FILE    input file, can repeat
   -s|--stdout FILE         capture stdout to FILE
   -e|--ifneeded            only run if output(s) older than input(s)
                            mnemonic for 'e': *e*xisting skipped
                            or if hashes changed
   -d|--db DBPATH           specify sqlite3 db path. Alt set env PNOTEDB
                            if in a git repo, goes into <git root>/.pnote.sqlite3
                            otherwise \$PWD/.pnote.sqlite3
   -c|--sidecar             write to first output file sidecar .output.pnote
                            (not implemented yet)
   -n|--dryrun              same as setting DRYRUN=1. show don't do
TODO:
   * faketime for reproducibility?
PRIOR ART:
   * datalad
   * 3dNotes, niinote
   * cram (cli driven tests)
   * make
HEREDOC
}
# Show help and stop when called without arguments or with -h/--help.
# The pattern must accept two leading dashes: the no-argument default is
# '--help', which a plain '^-h' never matches.
if [[ "${1:--help}" =~ ^--?h ]]; then
  usage
  exit 0
fi
| 35 | + |
# SQL that initialises a provenance database.
#   prov - one row per command run
#   inf  - input files (with modification time) of a run
#   outf - declared output files of a run
#   meta - when/where/with which version the database was created
# Every statement is idempotent so the schema can be re-applied to a database
# that is only partly initialised without failing on 'table already exists'
# or adding a second meta row.
SCHEMA="
create table if not exists prov (
  hash text not null, -- user@host:pwd cmd
  cmd text not null,
  user text,
  host text,
  pwd text,
  verinfo text, -- git branch, commit, dirty or clean?
  stime datetime not null,
  etime datetime,
  status int);
create table if not exists inf (hash text, stime int, file text, modtime int);
create table if not exists outf (hash text, stime int, file text);
create table if not exists meta (inittime datetime, version text, inithost text);
insert into meta
  select datetime('now'), '$PNOTEVER', '$HOSTNAME'
  where not exists (select 1 from meta);"
| 51 | + |
#######################################
# Choose where provenance is recorded and publish the location in DBFILE.
# Globals:
#   DBFILE  (written) path of the selected store
#   PNOTEDB (read)    optional explicit database path
#   SCHEMA  (read)    SQL fed to sqlite3 when the database is created
# Arguments:
#   $1 - backend: 'db' or 'sidecar'
#   $2 - output file a sidecar is named after (default: $PWD)
# Outputs:
#   a note on stderr when a database is created;
#   the sidecar path on stdout for the 'sidecar' backend
# Returns:
#   non-zero for an unknown backend
#######################################
record_to(){
  declare -g DBFILE
  local backend="${1:?need provenance backend}"
  local cmd_output=${2:-$PWD}
  local db sidecar top
  case $backend in
    db)
      # PNOTEDB overrides, otherwise git root, otherwise PWD
      db="${PNOTEDB:-}"
      if [[ -z "$db" ]]; then
        # outside a repository git fails and complains on stderr;
        # silence it and fall back to the current directory
        top=$(git rev-parse --show-toplevel 2>/dev/null) || top=$PWD
        db=$top/.pnote.sqlite3
      fi

      if ! test -r "$db"; then
        echo "# NOTE: make '$db'" >&2
        # NOTE(review): 'dryrun' is not defined in this file; presumably an
        # external helper that runs (or only prints) its arguments - confirm
        dryrun sqlite3 "$db" <<< "$SCHEMA"
      fi
      DBFILE=$db
      ;;
    sidecar)
      sidecar=$(dirname "$cmd_output")/.$(basename "$cmd_output")
      # make sure we don't have a trailing slash to make this a directory
      # (-E is required: in a basic regex '+' is a literal plus sign)
      sidecar=$(sed -E 's:/+$::' <<< "$sidecar")
      echo "$sidecar"
      DBFILE=$sidecar
      ;;
    *)
      echo "ERROR: unknown provenance backend '$backend'" >&2
      return 1
      ;;
  esac
}
#######################################
# Insert the 'prov' row that marks the start of a command run.
# Globals:
#   DBFILE (read) database to write to
#   USER, HOSTNAME, PWD (read) stored alongside the command
# Arguments:
#   $1 - command hash
#   $2 - start time
#   $3 - command line as a single string
#######################################
record_start(){
  declare -g DBFILE
  local cmd_hash="$1"; shift
  local now="$1"; shift
  # declared separately so a printf failure is not hidden by 'local'
  local cmd
  cmd=$(printf "%q" "$1"); shift

  # A single quote inside a SQL string literal is written as two quotes.
  # Without this a working directory such as "it's here" breaks the insert.
  local q="'"
  local user=${USER:-} host=${HOSTNAME:-} cwd=$PWD
  local hash_sql=${cmd_hash//$q/$q$q}
  local user_sql=${user//$q/$q$q}
  local host_sql=${host//$q/$q$q}
  local cwd_sql=${cwd//$q/$q$q}
  local now_sql=${now//$q/$q$q}

  # the command itself is handed over as a bound parameter (:cmd) instead of
  # being spliced into the statement
  sqlite3 "$DBFILE" <<HERE
.param set :cmd "$cmd"
insert into prov
( hash, cmd, user, host, pwd, stime) values
('$hash_sql',:cmd,'$user_sql','$host_sql', '$cwd_sql','$now_sql');
HERE

}
#######################################
# Record the input files of a run in table 'inf'.
# Globals:
#   DBFILE (read) database to write to
# Arguments:
#   $1  - command hash
#   $2  - start time of the command
#   $3… - one 'MODTIME FILE' string per input (MODTIME is all digits)
# Outputs:
#   a message on stderr for every argument that is not 'MODTIME FILE';
#   such arguments are skipped
#######################################
record_in(){
  declare -g DBFILE
  local h=${1:?cmd hash from start}; shift
  local n=${1:?start time of command}; shift
  local entry_re='^([0-9]+) (.+)$'
  local q="'" m f
  # all other inputs are like 'moddate file'
  while [ $# -gt 0 ]; do
    if ! [[ $1 =~ $entry_re ]]; then
      echo "# INTERNAL ERROR: bad time/file name split on '$1'" >&2
      shift
      continue
    fi
    m=${BASH_REMATCH[1]}
    f=${BASH_REMATCH[2]}
    # double any single quote so file names like "it's.txt" stay inside
    # the SQL string literal
    sqlite3 "$DBFILE" \
      "insert into inf (hash, stime, file, modtime) values ('${h//$q/$q$q}','${n//$q/$q$q}','${f//$q/$q$q}','$m');"
    shift
  done
}
#######################################
# Record the declared output files of a run in table 'outf'.
# Globals:
#   DBFILE (read) database to write to
# Arguments:
#   $1  - command hash
#   $2  - start time of the command
#   $3… - output file names
#######################################
record_out(){
  declare -g DBFILE
  local h=${1:?cmd hash from start}; shift
  local n=${1:?start time of command}; shift
  local q="'" file
  for file in "$@"; do
    # double any single quote so the name cannot end the SQL string literal
    sqlite3 "$DBFILE" "insert into outf (hash, stime, file) values ('${h//$q/$q$q}','${n//$q/$q$q}','${file//$q/$q$q}');"
  done
}
| 118 | + |
#######################################
# Close the 'prov' row of a finished run: set end time and exit status.
# Globals:
#   DBFILE (read) database to write to
# Arguments:
#   $1 - command hash
#   $2 - start time used when the row was inserted
#   $3 - exit status of the command
#######################################
record_done(){
  local h=${1:?} n=${2:?} s=${3:?}
  # the row is identified by hash + start time; matching stime against the
  # exit status would never find it
  sqlite3 "$DBFILE" "
  update prov set
    etime=datetime('now'), status = '$s'
  where hash='$h' and stime='$n';"
}
| 126 | + |
#######################################
# List recorded runs whose output file matches a SQL LIKE pattern.
# Globals:
#   BACKEND (read)    passed to record_to
#   DBFILE  (written) set by record_to, then queried
# Arguments:
#   $1 - LIKE pattern matched against outf.file
# Outputs:
#   tab separated table with header: status, stime, etime, cmd
#######################################
show_db(){
  local pattern=${1:?show needs a file pattern}
  local q="'"
  # double single quotes so the pattern stays inside the SQL string literal
  local pattern_sql=${pattern//$q/$q$q}
  record_to "$BACKEND"
  # join on hash AND start time: the hash is the same for every run of the
  # same command, so hash alone pairs each run with every other run's outputs
  sqlite3 -separator $'\t' -header "$DBFILE" <<HERE
select status, o.stime, etime, cmd
  from prov p join outf o on p.hash=o.hash and p.stime=o.stime
  where file like '$pattern_sql';
HERE
}
| 133 | + |
# Option state with defaults.
_OUT=()        # declared output files (-o and the --stdout target)
_IN=()         # declared input files (-i)
stdout=        # file that captures the command's stdout; empty = no capture
ifneeded=0     # 1: skip the run when outputs are newer than inputs
BACKEND="db"   # provenance store handed to record_to

# Consume options until '--' or the first word that is not an option.
# Whatever is left in "$@" afterwards is the command to run.
while [ $# -ne 0 ]; do
  case "$1" in
    show)
      # ':?' gives a readable message instead of an unbound-variable error
      show_db "${2:?show must be followed by a file pattern}"
      exit ;;
    --) shift; break ;;
    -o|--output)
      _OUT+=("${2:?$1 must be followed by an output file}"); shift 2 ;;
    --stdout|-s)
      _OUT+=("${2:?$1 must be followed by an output file}")
      stdout=$2
      shift 2 ;;
    -i|--input)
      _IN+=("${2:?$1 must be followed by an input file}"); shift 2 ;;
    -e|--ifneeded) ifneeded=1; shift ;;
    -c|--sidecar)
      # diagnostics belong on stderr
      echo "sidecar not implemented yet" >&2; exit 1 ;;
    -n|--dryrun) DRYRUN=echo; shift ;;
    -d|--db) PNOTEDB=${2:?--db requires db path}; shift 2 ;;

    # die on unknown options
    -*)
      echo "ERROR: unknown option $1" >&2; exit 1 ;;

    # anything else is likely a command
    *) break ;;
  esac
done
| 165 | + |
#######################################
# Print 'MTIME FILE' for each file, with a stand-in time for missing ones.
# Arguments:
#   $1  - time to report for a missing file (e.g. 0), or the word 'error'
#         to stop at the first missing file
#   $2… - files to inspect
# Outputs:
#   one 'MTIME FILE' line per file on stdout (MTIME in epoch seconds)
# Returns:
#   1 if $1 is 'error' and a file is missing
#######################################
stat_exist(){
  local default=${1:?missing file number. '0' for oldest; 'error' to break}; shift
  local f
  for f in "$@"; do
    # GNU stat uses -c; BSD/macOS stat has no -c and takes -f instead
    if [[ -e "$f" ]] &&
       { stat -c "%Y %n" "$f" 2>/dev/null || stat -f "%m %N" "$f" 2>/dev/null; }; then
      continue
    fi

    if [[ $default == "error" ]]; then
      echo "ERROR: missing file '$f'" >&2
      return 1
    fi
    echo "$default $f"
  done
}
| 175 | + |
# ---- main: everything left in "$@" is the command to run ----
cmd_and_args=("$@")
if [[ ${#cmd_and_args[@]} -eq 0 ]]; then
  echo "ERROR: no command given" >&2
  exit 1
fi

# Identify the command by who/where/what. md5sum prints 'HASH  -';
# keep only the hash (no trailing blank).
cmd_hash=$(md5sum <<< "${USER:-}@$HOSTNAME:$PWD ${cmd_and_args[*]}")
cmd_hash=${cmd_hash%% *}
now=$(date +%s)

# Each element of in_times/out_times is 'MTIME FILE' as printed by stat_exist.
out_times=("$now") # default for no output: set as always up-to-date
in_times=()
if [[ ${#_IN[@]} -gt 0 ]]; then
  # newest first, so element 0 is the most recently modified input
  mapfile -t in_times < <(stat_exist error "${_IN[@]}" | sort -nr)
  if [[ ${#in_times[@]} -ne ${#_IN[@]} ]]; then
    echo "ERROR: not all input files exist" >&2
    exit 1
  fi
fi
if [[ ${#_OUT[@]} -gt 0 ]]; then
  # oldest first; a missing output is reported with time 0
  mapfile -t out_times < <(stat_exist 0 "${_OUT[@]}" | sort -n)
fi

# remove name, just keep the time; no input counts as time 0
newest_in=${in_times[0]:-0}
newest_in=${newest_in%% *}
# TODO: if no output but still want run if needed?
oldest_out=${out_times[0]:-0}
oldest_out=${oldest_out%% *}

# Nothing to do when every output is newer than every input.
# TODO: also check cmd_hash?
if (( ifneeded == 1 && newest_in < oldest_out )); then
  echo "# not rerunning: input '${in_times[0]:-}' $(( oldest_out - newest_in )) seconds older than output '${out_times[0]}'" >&2
  exit 0
fi

# dry run: show the command, record and run nothing
if [ -n "${DRYRUN:-}" ]; then
  echo "${cmd_and_args[*]}"
  exit 0
fi

# set DBFILE
record_to "$BACKEND"

# The ${arr[@]+...} form keeps 'set -u' from tripping over an empty array
# on older bash versions.
record_start "$cmd_hash" "$now" "${cmd_and_args[*]}"
record_in "$cmd_hash" "$now" ${in_times[@]+"${in_times[@]}"}
record_out "$cmd_hash" "$now" ${_OUT[@]+"${_OUT[@]}"}

# The command is allowed to fail: its status is recorded, not fatal.
cmd_status=0
if [ -n "$stdout" ]; then
  "${cmd_and_args[@]}" > "$stdout" || cmd_status=$?
else
  "${cmd_and_args[@]}" || cmd_status=$?
fi
record_done "$cmd_hash" "$now" "$cmd_status"
# NOTE(review): the script's exit status is that of record_done, not of the
# wrapped command - confirm whether cmd_status should be propagated.

# TODO: check all output files exist
0 commit comments