diff --git a/.github/workflows/fleettest.yml b/.github/workflows/fleettest.yml new file mode 100644 index 000000000..c3f66c771 --- /dev/null +++ b/.github/workflows/fleettest.yml @@ -0,0 +1,64 @@ +name: Test fleettest harness + +# Bitrot check for testsuite/fleettest.py (the developer fleet CI harness). +# fleettest is meant to be run by developers on a modern Ubuntu box, so this +# job runs only on ubuntu-latest: it stands up a one-host "fleet" of two +# targets that both ssh to localhost and runs a real fleettest pass against it. +# It does not run on the BSD/Solaris/macOS/Cygwin matrix. + +on: + push: + branches: [ master ] + paths: + - 'testsuite/fleettest.py' + - '.github/workflows/fleettest.yml' + pull_request: + branches: [ master ] + paths: + - 'testsuite/fleettest.py' + - '.github/workflows/fleettest.yml' + workflow_dispatch: + schedule: + - cron: '17 7 * * 1' + +jobs: + fleettest: + runs-on: ubuntu-latest + name: fleettest against localhost + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: prep + run: | + sudo apt-get update + sudo apt-get install -y gcc g++ gawk autoconf automake \ + acl libacl1-dev attr libattr1-dev liblz4-dev libzstd-dev libxxhash-dev \ + python3-cmarkgfm openssl rsync openssh-server + - name: set up ssh to localhost + run: | + mkdir -p ~/.ssh && chmod 700 ~/.ssh + ssh-keygen -t ed25519 -N '' -f ~/.ssh/id_ed25519 + cat ~/.ssh/id_ed25519.pub >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/authorized_keys + sudo systemctl start ssh || sudo service ssh start + # fleettest connects with `ssh -o BatchMode=yes localhost`, which won't + # answer a host-key prompt -- so pre-trust localhost in known_hosts. 
+ ssh-keyscan -H localhost 127.0.0.1 >> ~/.ssh/known_hosts 2>/dev/null + ssh -o BatchMode=yes -o ConnectTimeout=15 localhost 'echo ssh-to-localhost-ok' + - name: write localhost fleet config + run: | + cat > fleettest-ci.json <<'EOF' + { "targets": [ + { "name": "local-a", "ssh_host": "localhost", "workflow": "none.yml", + "configure_flags": [], "builddir": "rsync-citest-a", "privilege": "sudo" }, + { "name": "local-b", "ssh_host": "localhost", "workflow": "none.yml", + "configure_flags": [], "builddir": "rsync-citest-b", "privilege": "sudo" } + ] } + EOF + - name: fleettest --list (config sanity) + run: python3 testsuite/fleettest.py --fleet fleettest-ci.json --list + - name: run fleettest against localhost + # Two targets both on localhost exercise the parallel multi-target path + # and the per-run dir / port isolation; exit 0 iff every cell is OK. + run: python3 testsuite/fleettest.py --fleet fleettest-ci.json --timing diff --git a/testsuite/README.md b/testsuite/README.md index d3f87273a..295299dfe 100644 --- a/testsuite/README.md +++ b/testsuite/README.md @@ -133,6 +133,9 @@ cp testsuite/fleettest.json.example testsuite/fleettest.json # then edit # (or symlink it, or point elsewhere with --fleet PATH) ``` +The config is looked up in order: `~/.fleettest.json` first, then +`testsuite/fleettest.json`, unless overridden with `--fleet PATH`. + Each entry names an ssh host (`null` to run locally), the workflow it mirrors, and its configure flags, plus optional per-target settings (`make`, `privilege`, `env_prefix`, …). See the comments in `fleettest.json.example`. 
@@ -150,10 +153,26 @@ Run it from inside a checkout (it builds the current directory's HEAD; use ```sh python3 testsuite/fleettest.py # whole fleet, both transports python3 testsuite/fleettest.py --list # list configured targets -python3 testsuite/fleettest.py --targets NAME[,NAME] --clean +python3 testsuite/fleettest.py --targets NAME[,NAME] python3 testsuite/fleettest.py --fleet other.json --transport pipe +python3 testsuite/fleettest.py --timing # per-target wall-clock breakdown ``` +`--timing` adds a per-target breakdown after the report — total wall-clock plus +the push / build / pipe / tcp / nonroot phases, sorted slowest-first. Targets +run in parallel, so the whole run is gated by the slowest one; the phase columns +show whether that target's hold-up is the push, the build, or a test pass. + +Each run gets its own randomly-named build dir on every target +(`BUILDDIR-RUNID`), so two or three runs can share the same fleet without +interfering. The dir is removed when the run ends — on success or failure, and +best-effort on Ctrl-C/kill; pass `--keep` to retain it for inspection. A hard +kill (`SIGKILL`), or a signal arriving mid-push, can leave a stray +`BUILDDIR-RUNID` behind; sweep leftovers with +`python3 testsuite/fleettest.py --cleanup` (scope it with `--targets`, and only +run it when no other fleet runs are active, since it removes *all* matching run +dirs on the selected targets). + Each target must be provisioned with the build toolchain its workflow installs (autoconf, automake, a C compiler, perl, a python3 markdown module such as cmarkgfm or commonmark unless the flags pass `--disable-md2man`, and the dev diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py index 9478c1045..36ebfef74 100755 --- a/testsuite/fleettest.py +++ b/testsuite/fleettest.py @@ -12,15 +12,23 @@ list (matching the workflows), so only test FAILs matter there.
The fleet -- which machines, how to reach and build each -- is read from a JSON -config: fleettest.json next to this script, or --fleet PATH. Copy the bundled -fleettest.json.example to fleettest.json (or symlink it) and edit for your own -hosts; see testsuite/README.md and the comments in fleettest.json.example. +config: ~/.fleettest.json if present, else fleettest.json next to this script, +or --fleet PATH. Copy the bundled fleettest.json.example to either location (or +symlink it) and edit for your own hosts; see testsuite/README.md and the +comments in fleettest.json.example. Source = `git archive HEAD` of the rsync tree (the current directory, or --repo -PATH) -- source-only, no .o/binaries are ever pushed. Build is incremental by -default (each target's tree is kept in sync; native objects are preserved and -only changed files rebuild). Use --clean for a from-scratch build (recommended -on a target's first run). +PATH) -- source-only, no .o/binaries are ever pushed. + +Every run uses its own randomly-named build directory on each target +(BUILDDIR-RUNID), so two or three fleettest runs can share the same fleet +without interfering: each pushes, builds and tests in isolation. The run dir is +removed when the run ends -- on success or failure, and best-effort on +Ctrl-C/kill (pass --keep to retain it for inspection). A run that is hard-killed +(SIGKILL), or signalled mid-push, or whose ssh dies during cleanup can leave a +stray BUILDDIR-RUNID behind; sweep those with `fleettest.py --cleanup` +(optionally scoped with --targets). Because each +run starts from a fresh dir, every build is a full configure + build.
PROVISIONING: each target must have the build toolchain its workflow's prepare step installs -- the target regenerates its own configure/proto.h/man pages, so @@ -40,8 +48,9 @@ Usage (run from inside an rsync checkout, or pass --repo): python3 testsuite/fleettest.py # whole fleet, both transports python3 testsuite/fleettest.py --targets cygwin,freebsd - python3 testsuite/fleettest.py --transport pipe --clean - python3 testsuite/fleettest.py --no-push # reuse synced trees + python3 testsuite/fleettest.py --transport pipe + python3 testsuite/fleettest.py --keep # keep run dirs for inspection + python3 testsuite/fleettest.py --cleanup # sweep stray run dirs, exit python3 testsuite/fleettest.py --fleet my-fleet.json --list Exit 0 iff every selected (target x transport) cell is OK. @@ -50,11 +59,14 @@ from __future__ import annotations import argparse +import atexit import concurrent.futures import dataclasses import json import os import re +import secrets +import signal import subprocess import sys import tempfile @@ -68,9 +80,13 @@ REPO = Path.cwd() WORKFLOWS = REPO / ".github" / "workflows" -# Fleet config: fleettest.json next to this script, overridable with --fleet. -DEFAULT_CONFIG = Path(__file__).resolve().parent / "fleettest.json" -EXAMPLE_CONFIG = DEFAULT_CONFIG.with_name(DEFAULT_CONFIG.name + ".example") +# Fleet config (overridable with --fleet): ~/.fleettest.json is tried first, then +# fleettest.json next to this script. The example template sits next to the +# script too. +HOME_CONFIG = Path.home() / ".fleettest.json" +SCRIPT_CONFIG = Path(__file__).resolve().parent / "fleettest.json" +DEFAULT_CONFIGS = [HOME_CONFIG, SCRIPT_CONFIG] +EXAMPLE_CONFIG = SCRIPT_CONFIG.with_name(SCRIPT_CONFIG.name + ".example") # The pushed tree is source-only (git archive). 
Each target regenerates its own # build files, so --delete must NOT prune them: we exclude everything `make` @@ -104,7 +120,10 @@ class Target: privilege: str = "root" # "root" (already root) | "sudo" | "user" (plain, no sudo) pipe_jobs: int = 8 tcp_jobs: int = 8 - builddir: str = "rsync-citest" # relative to remote $HOME; absolute for local + # Base build-dir name (relative to remote $HOME; absolute for local). A + # per-run random suffix is appended (-> -) so concurrent + # fleettest runs don't share a tree; --cleanup sweeps leftover -*. + builddir: str = "rsync-citest" # When true, after the sudo runs, additionally run -- as the (non-root) ssh # user -- every test that declares `fleet_nonroot = True` (see # discover_nonroot_tests). Mirrors a workflow's non-root check step. @@ -176,7 +195,7 @@ def run_on(target: Target, script: str, timeout: int) -> CmdResult: return CmdResult(127, str(e)) -def push_argv(target: Target, staging: str, clean: bool) -> list[str]: +def push_argv(target: Target, staging: str) -> list[str]: # -rlpgoD = -a without -t: do NOT preserve mtimes. The host clock can be # hours AHEAD of a target, so preserved (commit-time) mtimes land "in the # future" there and rsync's `Makefile: Makefile.in config.status` rule @@ -354,6 +373,9 @@ class TargetResult: error: str = "" build_log: str = "" transports: dict[str, TransportResult] = dataclasses.field(default_factory=dict) + # Wall-clock seconds per phase (push/build/pipe/tcp/nonroot) plus "total"; + # populated for --timing. Phases run sequentially, so they sum to the total. 
+ timings: dict[str, float] = dataclasses.field(default_factory=dict) # --------------------------------------------------------------------------- @@ -371,6 +393,7 @@ def log(msg: str) -> None: def run_target(t: Target, args, staging: str) -> TargetResult: res = TargetResult(t.name) log(f"[{t.name}] start") + started = time.monotonic() if t.ssh_host: ping = run_on(t, "echo ok", timeout=25) @@ -380,20 +403,21 @@ def run_target(t: Target, args, staging: str) -> TargetResult: log(f"[{t.name}] UNREACHABLE") return res - if not args.no_push: - if args.clean: - bd = t.builddir - if bd and bd not in ("/", "~", os.path.expanduser("~")): - run_on(t, f'rm -rf {bd}', timeout=120) - push = subprocess.run(push_argv(t, staging, args.clean), - capture_output=True, text=True, timeout=600) - if push.returncode != 0: - res.pushed = False - res.error = f"push failed (rc={push.returncode}): {push.stderr.strip()[:300]}" - log(f"[{t.name}] PUSH-FAIL") - return res + # Always push: the run dir is freshly named per run, so there is no prior + # tree to reuse -- every run is a full configure + build. 
+ t0 = time.monotonic() + push = subprocess.run(push_argv(t, staging), + capture_output=True, text=True, timeout=600) + res.timings["push"] = time.monotonic() - t0 + if push.returncode != 0: + res.pushed = False + res.error = f"push failed (rc={push.returncode}): {push.stderr.strip()[:300]}" + log(f"[{t.name}] PUSH-FAIL") + return res + t0 = time.monotonic() b = run_on(t, build_script(t), timeout=1200) + res.timings["build"] = time.monotonic() - t0 res.build_ok = b.rc == 0 res.build_log = b.out if not res.build_ok: @@ -405,7 +429,9 @@ def run_target(t: Target, args, staging: str) -> TargetResult: jobs = (args.jobs if args.jobs else (t.tcp_jobs if transport == "tcp" else t.pipe_jobs)) cmd = test_script(t, transport, skip_csv, jobs) + t0 = time.monotonic() r = run_on(t, cmd, timeout=2400) + res.timings[transport] = time.monotonic() - t0 res.transports[transport] = parse_transport(transport, r, skip_csv is not None) log(f"[{t.name}] {transport} done " f"({'ok' if res.transports[transport].ok else 'ISSUE'})") @@ -413,10 +439,13 @@ def run_target(t: Target, args, staging: str) -> TargetResult: # Extra non-root pass (after the sudo runs) for targets that opt in, running # the tests that declare `fleet_nonroot = True` (discovered in main()). 
if t.nonroot and args.nonroot_tests: + t0 = time.monotonic() r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400) + res.timings["nonroot"] = time.monotonic() - t0 res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False) log(f"[{t.name}] nonroot done " f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})") + res.timings["total"] = time.monotonic() - started return res @@ -458,7 +487,7 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool ts = time.strftime("%Y-%m-%d %H:%M") print("\n" + "=" * 64) print(f"rsync fleet CI — branch {current_branch()} — {ts}") - print(f"source: HEAD build: {'clean' if args.clean else 'incremental'} " + print(f"source: HEAD run: {args.run_id} " f"transports: {','.join(args.transports)}") print("(A target's pipe skip-set is only enforced when its workflow sets " "RSYNC_EXPECT_SKIPPED; otherwise only FAILs matter. The 'nonroot' " @@ -541,6 +570,46 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool return all_ok +# Phase columns for --timing, in execution order (push -> build -> tests). +_TIMING_PHASES = ("push", "build", "pipe", "tcp", "nonroot") + + +def _fmt_dur(s: float) -> str: + if s < 60: + return f"{s:.0f}s" + m, sec = divmod(int(round(s)), 60) + return f"{m}m{sec:02d}s" + + +def print_timing(results: list[TargetResult]) -> None: + """Per-target wall-clock breakdown, slowest first. Targets run in parallel, + so the whole run is gated by the slowest one -- that's the hold-up; the + phase columns show whether it's push, build or the test passes.""" + timed = [r for r in results if r.timings] + if not timed: + return + phases = [p for p in _TIMING_PHASES if any(p in r.timings for r in timed)] + + def total(r: TargetResult) -> float: + # Failed-early targets have no "total"; sum the phases they did reach. 
+ return r.timings.get("total") or sum(r.timings.get(p, 0.0) for p in phases) + + timed.sort(key=total, reverse=True) + width = max([len("TARGET")] + [len(r.target) for r in timed]) + 2 + print("\n==== TIMING (slowest target first) ====") + print("TARGET".ljust(width) + "TOTAL".ljust(9) + + "".join(p.upper().ljust(9) for p in phases)) + for r in timed: + row = r.target.ljust(width) + _fmt_dur(total(r)).ljust(9) + for p in phases: + v = r.timings.get(p) + row += (_fmt_dur(v) if v is not None else "-").ljust(9) + print(row) + slow = timed[0] + print(f"hold-up: {slow.target} at {_fmt_dur(total(slow))} gates the run " + "(targets run in parallel)") + + def current_branch() -> str: try: return subprocess.run(["git", "-C", str(REPO), "rev-parse", @@ -550,6 +619,82 @@ def current_branch() -> str: return "?" +# --------------------------------------------------------------------------- +# run-dir cleanup +# --------------------------------------------------------------------------- + +# Targets whose per-run dir (t.builddir, already suffixed with the run_id) this +# process must remove on exit. Populated in main() once the run_id is applied. +_cleanup_targets: list[Target] = [] +_cleanup_lock = threading.Lock() +_cleanup_done = False + + +def _unsafe_builddir(path: str) -> bool: + """True if `path` is too broad to feed to `rm -rf` -- empty, root, $HOME, or + an absolute path sitting directly under / (e.g. /tmp). A real run dir is + always nested deeper, so this rejects an obvious builddir misconfiguration + before any destructive command is built.""" + p = (path or "").rstrip("/") + if p in ("", "/", "~") or os.path.expanduser(p) == os.path.expanduser("~"): + return True + return os.path.isabs(p) and os.path.dirname(p) == "/" + + +def cleanup_run() -> None: + """Best-effort `rm -rf` of this run's dir on every chosen target. Idempotent + (atexit + a signal handler may both call it). 
Each target removes only its + own - dir, so a concurrent run's dir is never touched.""" + global _cleanup_done + with _cleanup_lock: + if _cleanup_done or not _cleanup_targets: + return + _cleanup_done = True + targets = list(_cleanup_targets) + for t in targets: + if _unsafe_builddir(t.builddir): + continue + run_on(t, f'rm -rf -- {t.builddir}', timeout=60) + + +def _on_signal(signum, frame): + cleanup_run() + # Skip atexit/thread-join: worker threads' ssh calls can't be cancelled and + # would otherwise block exit until they return. The remote build/test simply + # errors out now that its dir is gone. + os._exit(130 if signum == signal.SIGINT else 143) + + +def cleanup_remnants(targets: list[Target]) -> int: + """--cleanup mode: remove every -* run dir on each target, reporting + what each removed. Returns a process exit code. Only suffixed run dirs are + swept -- a bare is left alone.""" + rc = 0 + for t in targets: + base = t.builddir + if _unsafe_builddir(base): + log(f"[{t.name}] skipped (unsafe builddir {base!r})") + continue + # Echo each match before removing it so the harness can report what + # went; an unmatched glob stays literal and is skipped by the -e test. 
+ script = (f'set -e\n' + f'for d in {base}-*; do\n' + f' [ -e "$d" ] || continue\n' + f' echo "$d"\n' + f' rm -rf -- "$d"\n' + f'done\n') + r = run_on(t, script, timeout=120) + removed = [ln for ln in r.out.splitlines() if ln.strip()] + if r.rc != 0: + rc = 1 + log(f"[{t.name}] cleanup error (rc={r.rc}): {r.out.strip()[:200]}") + elif removed: + log(f"[{t.name}] removed: {' '.join(removed)}") + else: + log(f"[{t.name}] nothing to remove") + return rc + + # --------------------------------------------------------------------------- # main # --------------------------------------------------------------------------- @@ -559,31 +704,41 @@ def main() -> int: ap = argparse.ArgumentParser(description="Fleet CI harness for rsync.") ap.add_argument("--targets", help="comma-separated subset (default: all)") ap.add_argument("--transport", choices=["pipe", "tcp", "both"], default="both") - ap.add_argument("--no-push", action="store_true", - help="reuse the already-synced tree on each target") - ap.add_argument("--clean", action="store_true", - help="wipe each builddir and reconfigure (recommended first run)") + ap.add_argument("--keep", action="store_true", + help="keep each run's build dir (default: remove it at exit)") + ap.add_argument("--cleanup", action="store_true", + help="remove stray -* run dirs on the targets, then exit") ap.add_argument("--jobs", type=int, help="override -j for both transports") + ap.add_argument("--timing", action="store_true", + help="report per-target wall-clock (push/build/test) to find " + "the slowest target") ap.add_argument("--repo", help="rsync source tree to build (default: cwd)") - ap.add_argument("--fleet", help="fleet config JSON " - "(default: fleettest.json next to this script)") + ap.add_argument("--fleet", help="fleet config JSON (default: ~/.fleettest.json, " + "else fleettest.json next to this script)") ap.add_argument("--list", action="store_true", help="list targets and exit") args = ap.parse_args() global REPO, WORKFLOWS REPO 
= Path(args.repo).resolve() if args.repo else Path.cwd() WORKFLOWS = REPO / ".github" / "workflows" - if not (REPO / "runtests.py").is_file(): + if not args.cleanup and not (REPO / "runtests.py").is_file(): print(f"{REPO} is not an rsync source tree (no runtests.py); " f"run from inside a checkout or pass --repo", file=sys.stderr) return 2 - config_path = Path(args.fleet).resolve() if args.fleet else DEFAULT_CONFIG - if not config_path.exists(): - print(f"no fleet config at {config_path}\n" - f"copy {EXAMPLE_CONFIG} to {DEFAULT_CONFIG} (or pass --fleet PATH)", - file=sys.stderr) - return 2 + if args.fleet: + config_path = Path(args.fleet).resolve() + if not config_path.exists(): + print(f"no fleet config at {config_path}", file=sys.stderr) + return 2 + else: + config_path = next((p for p in DEFAULT_CONFIGS if p.exists()), None) + if config_path is None: + tried = " or ".join(str(p) for p in DEFAULT_CONFIGS) + print(f"no fleet config found (looked for {tried})\n" + f"copy {EXAMPLE_CONFIG} to {SCRIPT_CONFIG} or {HOME_CONFIG} " + f"(or pass --fleet PATH)", file=sys.stderr) + return 2 fleet = load_fleet(config_path) if args.list: @@ -594,8 +749,6 @@ def main() -> int: f"pipe-skip={'set' if skip else 'unset'}") return 0 - args.transports = ["pipe", "tcp"] if args.transport == "both" else [args.transport] - chosen = fleet if args.targets: want = [s.strip() for s in args.targets.split(",") if s.strip()] @@ -607,6 +760,32 @@ def main() -> int: return 2 chosen = [by_name[w] for w in want] + if args.cleanup: + # Sweep every -* run dir on the selected targets. NB: this + # also removes dirs belonging to runs that are still in progress, so + # only run it when no other fleettest runs are active (or scope with + # --targets). + return cleanup_remnants(chosen) + + args.transports = ["pipe", "tcp"] if args.transport == "both" else [args.transport] + + # Give this run its own build dir on every target so concurrent runs don't + # collide: -. 
The base name is the prefix --cleanup globs. + args.run_id = secrets.token_hex(3) + for t in chosen: + t.builddir = f"{t.builddir}-{args.run_id}" + log(f"run {args.run_id}: build dir :{chosen[0].builddir} " + f"(removed at exit; --keep to retain)") + + # Remove each run dir when we exit -- success or failure, and best-effort on + # Ctrl-C/kill (a signal mid-push may still leave a remnant). SIGKILL can't be + # caught; `fleettest.py --cleanup` sweeps any such remnant. + if not args.keep: + _cleanup_targets.extend(chosen) + atexit.register(cleanup_run) + signal.signal(signal.SIGINT, _on_signal) + signal.signal(signal.SIGTERM, _on_signal) + # Stage committed HEAD (source-only). Each target regenerates its own # build files with its own toolchain -- exactly like the CI jobs, which # install autotools / python-markdown / dev-libs in their prepare step. @@ -641,6 +820,8 @@ def main() -> int: subprocess.run(["rm", "-rf", staging]) all_ok = print_report(results, args, fleet) + if args.timing: + print_timing(results) return 0 if all_ok else 1