Skip to content

Commit f33954e

Browse files
committed
wip/feat(pnote): provenance tracker
1 parent 8cbe2a4 commit f33954e

1 file changed

Lines changed: 227 additions & 0 deletions

File tree

pnote

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
#!/usr/bin/env bash
# pnote version string; SCHEMA writes it into the db's meta table.
PNOTEVER=0.1.0.20260313
# Abort on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail
4+
#######################################
# Print the help text (usage examples, options, TODO, prior art).
# Globals:   $0 (read) - shown as the program name in the examples
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Unquoted delimiter on purpose: $0 is expanded, \$PWD stays literal.
  cat <<HEREDOC
Provenance tracking for any command and any input/output graph.
Like datalad, but not linked to source control versioning

USAGE:
  $0 -o test.txt -- touch test.txt
  $0 --stdout date.txt -- date
  $0 show path/to/file
OPTIONS:
  -o|--output OUTPUT_FILE  output, can repeat
  -i|--input INPUT_FILE    input file, can repeat
  -s|--stdout FILE         capture stdout to FILE
  -e|--ifneeded            only run if output(s) older than input(s)
                           mnemonic for 'e': *e*xisting skipped
                           or if hashes changed
  -d|--db DBPATH           specify sqlite3 db path. Alt set env PNOTEDB
                           if in a git repo, goes into <repo root>/.pnote.sqlite3
                           otherwise \$PWD/.pnote.sqlite3
  -c|--sidecar             write to first output file sidecar .output.pnote
  -n|--dryrun              same as setting DRYRUN=1. show don't do
TODO:
  * faketime for reproducibility?
PRIOR ART:
  * datalad
  * 3dNotes, niinote
  * cram (cli driven tests)
  * make
HEREDOC
}
34+
# Show help and stop when called with no arguments, with -h..., or with --help.
# "${1:--help}" is "$1" with the literal default "-help" when unset or empty,
# which is why a bare invocation lands in the -h* arm.
case "${1:--help}" in
  -h* | --help)
    usage
    exit 0
    ;;
esac
35+
36+
# SQL that initialises a new provenance database (fed to sqlite3 by record_to).
#   prov - one row per recorded command run
#   inf  - input files of a run with their modification time
#   outf - output files of a run
#   meta - creation time, pnote version and host of the database
# $PNOTEVER and $HOSTNAME are expanded here, when the string is assigned.
SCHEMA="
create table prov (
  hash text not null, -- user@host:pwd cmd
  cmd text not null,
  user text,
  host text,
  pwd text,
  verinfo text, -- git branch, commit, dirty or clean?
  stime datetime not null,
  etime datetime,
  status int);
create table inf (hash text, stime int, file text, modtime int);
create table outf (hash text, stime int, file text);
create table meta (inittime datetime, version text, inithost text);
insert into meta values (datetime('now'), '$PNOTEVER', '$HOSTNAME');"
51+
52+
#######################################
# Select where provenance is stored and publish that path in DBFILE.
# Globals:
#   DBFILE  (written) path of the chosen store
#   PNOTEDB (read)    explicit db path; overrides the default location
#   SCHEMA  (read)    SQL used to initialise a new db
# Arguments:
#   $1 - backend: 'db' or 'sidecar'
#   $2 - output path the sidecar name is derived from (default: $PWD)
# Outputs:
#   notes and errors on stderr; for 'sidecar' the sidecar path on stdout
# Returns:
#   non-zero for an unknown backend or when creating the db fails
#######################################
record_to() {
  declare -g DBFILE
  local backend="${1:?need provenance backend}"
  local cmd_output=${2:-$PWD}
  local db sidecar

  case "$backend" in
    db)
      # PNOTEDB overwrite, otherwise git root, otherwise PWD
      db="${PNOTEDB:-}"
      if [[ -z "$db" ]]; then
        # git's complaint about not being in a repository is expected here
        db=$(git rev-parse --show-toplevel 2>/dev/null || echo "$PWD")/.pnote.sqlite3
      fi

      if ! [[ -r "$db" ]]; then
        echo "# NOTE: make '$db'" >&2
        # 'dryrun' is a helper that is not defined in this file: go through
        # it when it is available, otherwise initialise the db directly.
        if command -v dryrun > /dev/null 2>&1; then
          dryrun sqlite3 "$db" <<< "$SCHEMA" || return
        else
          sqlite3 "$db" <<< "$SCHEMA" || return
        fi
      fi
      DBFILE=$db
      ;;
    sidecar)
      # hidden file next to the output: dir/.name
      sidecar=$(dirname "$cmd_output")/.$(basename "$cmd_output")
      # make sure we don't have a trailing slash to make this a directory
      while [[ "$sidecar" == */ ]]; do
        sidecar=${sidecar%/}
      done
      echo "$sidecar"
      DBFILE=$sidecar
      ;;
    *)
      echo "ERROR: unknown provenance backend '$backend'" >&2
      return 1
      ;;
  esac
}
77+
#######################################
# Insert the 'prov' row that marks the start of a command run.
# Globals:
#   DBFILE (read) sqlite3 database to write to
#   USER, HOSTNAME, PWD (read) stored alongside the command
# Arguments:
#   $1 - command hash
#   $2 - start time of the command
#   $3 - the command line as a single string
# Returns:
#   exit status of sqlite3
#######################################
record_start() {
  declare -g DBFILE
  local cmd_hash=${1:?cmd hash}
  local now=${2:?start time of command}
  local cmd=${3?command line to record}
  local user=${USER:-} host=${HOSTNAME:-} pwd=$PWD
  local sq="'"

  # Free-form text goes in as SQL string literals with embedded single quotes
  # doubled, so commands and directories containing quotes are stored verbatim.
  cmd=${cmd//$sq/$sq$sq}
  user=${user//$sq/$sq$sq}
  host=${host//$sq/$sq$sq}
  pwd=${pwd//$sq/$sq$sq}

  sqlite3 "$DBFILE" "insert into prov
  (hash, cmd, user, host, pwd, stime) values
  ('$cmd_hash','$cmd','$user','$host','$pwd','$now');"
}
91+
#######################################
# Record the input files of a command run in the 'inf' table.
# Globals:
#   DBFILE (read) sqlite3 database to write to
# Arguments:
#   $1  - command hash from start
#   $2  - start time of the command
#   $3+ - one 'modtime file' string per input (the format stat_exist prints)
# Outputs:
#   a diagnostic on stderr for every argument that is not 'modtime file'
#######################################
record_in() {
  declare -g DBFILE
  local h=${1:?cmd hash from start}
  shift
  local n=${1:?start time of command}
  shift
  local m f
  local sq="'"
  local re='^([0-9]+) (.+)$'

  # all other inputs are like 'moddate file'
  while [ $# -gt 0 ]; do
    if ! [[ $1 =~ $re ]]; then
      echo "# INTERNAL ERROR: bad time/file name split on '$1'" >&2
      shift
      continue
    fi
    m=${BASH_REMATCH[1]}
    f=${BASH_REMATCH[2]}
    # double embedded single quotes so the name is a valid SQL string literal
    f=${f//$sq/$sq$sq}
    sqlite3 "$DBFILE" \
      "insert into inf (hash, stime, file, modtime) values ('$h','$n','$f','$m');"
    shift
  done
}
109+
#######################################
# Record the output files of a command run in the 'outf' table.
# Globals:
#   DBFILE (read) sqlite3 database to write to
# Arguments:
#   $1  - command hash from start
#   $2  - start time of the command
#   $3+ - output file names, one row each
#######################################
record_out() {
  declare -g DBFILE
  local h=${1:?cmd hash from start}
  shift
  local n=${1:?start time of command}
  shift
  local file
  local sq="'"

  for file in "$@"; do
    # double embedded single quotes so the name is a valid SQL string literal
    file=${file//$sq/$sq$sq}
    sqlite3 "$DBFILE" "insert into outf (hash, stime, file) values ('$h','$n','$file');"
  done
}
118+
119+
#######################################
# Stamp the end time and exit status on the 'prov' row of a finished run.
# Globals:
#   DBFILE (read) sqlite3 database to write to
# Arguments:
#   $1 - command hash from start
#   $2 - start time of the command (identifies the row together with the hash)
#   $3 - exit status of the command
#######################################
record_done() {
  local h=${1:?cmd hash from start}
  local n=${2:?start time of command}
  local s=${3:?exit status of command}
  # The row is keyed by hash and start time; the status is only a value to
  # store and must not be part of the where clause.
  sqlite3 "$DBFILE" "
update prov set
  etime=datetime('now'), status = '$s'
  where hash='$h' and stime='$n';"
}
126+
127+
#######################################
# Print the recorded runs that list a matching file as output.
# Globals:
#   BACKEND (read)    backend handed to record_to
#   DBFILE  (read)    set by record_to, database that is queried
# Arguments:
#   $1 - file name; used as an SQL LIKE pattern, so % and _ are wildcards
# Outputs:
#   tab separated table with header: status, stime, etime, cmd
#######################################
show_db() {
  local pattern=${1:?show needs a file name (SQL LIKE pattern)}
  local sq="'"
  # double embedded single quotes so the pattern is a valid SQL string literal
  pattern=${pattern//$sq/$sq$sq}

  record_to "$BACKEND"
  # Join on hash AND start time: the hash alone is shared by every run of the
  # same command, which would pair each run with every other run's outputs.
  sqlite3 -separator $'\t' -header "$DBFILE" <<HERE
select status, o.stime, etime, cmd
  from prov p
  join outf o on p.hash=o.hash and p.stime=o.stime
  where file like '$pattern';
HERE
}
133+
134+
# Command-line state filled in by the option loop below.
_OUT=()       # declared output files (-o, plus the --stdout target)
_IN=()        # declared input files (-i)
stdout=       # file that receives the command's stdout; empty: no capture
ifneeded=0    # 1: skip the run when outputs are newer than inputs
BACKEND="db"  # provenance store handed to record_to

# Consume pnote's own options. Stops at '--' or at the first word that is
# not an option, leaving the command to run in "$@".
while [ $# -ne 0 ]; do
  case "$1" in
    show)
      # ${2:?} gives a readable error instead of set -u's "unbound variable"
      show_db "${2:?show must be followed by a file}"
      exit
      ;;
    --)
      shift
      break
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    -o | --output)
      _OUT+=("${2:?$1 must be followed by an output file}")
      shift 2
      ;;
    --stdout | -s)
      # the capture file is also an output of the command
      _OUT+=("${2:?$1 must be followed by an output file}")
      stdout=$2
      shift 2
      ;;
    -i | --input)
      _IN+=("${2:?$1 must be followed by an input file}")
      shift 2
      ;;
    -e | --ifneeded)
      ifneeded=1
      shift
      ;;
    -c | --sidecar)
      echo "sidecar not implemented yet" >&2
      exit 1
      ;;
    -n | --dryrun)
      DRYRUN=echo
      shift
      ;;
    -d | --db)
      PNOTEDB=${2:?--db requires db path}
      shift 2
      ;;

    # die on unknown options
    -*)
      echo "ERROR: unknown option $1" >&2
      exit 1
      ;;

    # anything else is likely a command
    *) break ;;
  esac
done
165+
166+
#######################################
# Print 'mtime name' for each file, substituting a default for missing ones.
# Arguments:
#   $1  - what to print as the time of a missing file (e.g. 0), or the word
#         'error' to stop at the first missing file instead
#   $2+ - files to inspect
# Outputs:
#   one 'epoch-seconds name' line per file on stdout; with 'error', a
#   message on stderr for the missing file
# Returns:
#   1 when $1 is 'error' and a file is missing, otherwise 0
#######################################
stat_exist() {
  local default="${1:?missing file number. '0' for oldest; 'error' to break}"
  shift
  local f

  for f in "$@"; do
    # %Y: modification time in seconds since the epoch, %n: file name
    if [[ -e "$f" ]] && stat -c "%Y %n" -- "$f"; then
      continue
    fi

    if [[ $default == "error" ]]; then
      echo "ERROR: missing file '$f'" >&2
      return 1
    fi
    echo "$default $f"
  done
}
175+
176+
# ---- main flow: decide whether to run, record the run, run the command ----

#cmd_and_args=("$(printf "%q " "$@")")
cmd_and_args=("$@")
if [ ${#cmd_and_args[@]} -eq 0 ]; then
  # without this guard an empty command would be "run" and recorded
  echo "ERROR: no command given. see -h for usage" >&2
  exit 1
fi

# Identify the command by who/where/what. md5sum prints 'hash  -'; keep only
# the hash itself, without any trailing blank.
cmd_hash=$(md5sum <<< "$USER@$HOSTNAME:$PWD ${cmd_and_args[*]}")
cmd_hash=${cmd_hash%% *}
now=$(date +%s)

# Each element of in_times/out_times is 'mtime name' as printed by stat_exist.
out_times=("$now") # default for no output: set as always up-to-date
in_times=()
if [ ${#_IN[@]} -gt 0 ]; then
  # newest first; stat_exist stops at a missing input, leaving the list short
  mapfile -t in_times < <(stat_exist error "${_IN[@]}" | sort -nr)
  if [ ${#in_times[@]} -ne ${#_IN[@]} ]; then
    echo "ERROR: not all input files exist" >&2
    exit 1
  fi
fi
if [ ${#_OUT[@]} -gt 0 ]; then
  # oldest first; a missing output counts as mtime 0 and so forces a run
  mapfile -t out_times < <(stat_exist 0 "${_OUT[@]}" | sort -n)
fi

# remove name, just keep the time; no input at all counts as time 0
newest_in=${in_times[0]:-0}
newest_in=${newest_in%% *}
# TODO: if no output but still want run if needed?
oldest_out=${out_times[0]:-0}
oldest_out=${oldest_out%% *}

# if newest input is older than oldest output, nothing to do
# TODO: also check cmd_hash is the same?
if [[ $ifneeded -eq 1 && $newest_in -lt $oldest_out ]]; then
  echo "# not rerunning: input '${in_times[0]:-}' $((oldest_out - newest_in)) seconds older than output '${out_times[0]}'" >&2
  exit 0
fi

# dry run: show the command, record nothing, run nothing
if [ -n "${DRYRUN:-}" ]; then
  echo "${cmd_and_args[*]}"
  exit 0
fi

# set DBFILE
record_to "$BACKEND"

record_start "$cmd_hash" "$now" "${cmd_and_args[*]}"
# ${arr[@]+...} keeps empty arrays from tripping set -u on older bash
record_in "$cmd_hash" "$now" ${in_times[@]+"${in_times[@]}"}
record_out "$cmd_hash" "$now" ${_OUT[@]+"${_OUT[@]}"}

set +e # allow the command to fail so its status can still be recorded
if [ -n "$stdout" ]; then
  "${cmd_and_args[@]}" > "$stdout"
else
  "${cmd_and_args[@]}"
fi
cmd_status=$?
set -e
record_done "$cmd_hash" "$now" "$cmd_status"

# TODO: check all output files exist

# hand the wrapped command's exit status on to the caller
exit "$cmd_status"

0 commit comments

Comments
 (0)