From 2518c231358aa0c6c9858487fee6ea9080065269 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Tue, 19 May 2026 15:20:59 +0200 Subject: [PATCH 01/11] feat: optional cross-worker lock for shared adaptor repo Add `--repo-lock` / `WORKER_REPO_LOCK` to coordinate adaptor installs across multiple workers sharing one repo directory (e.g. an NFS mount or k8s PVC). Uses proper-lockfile per-adaptor plus a sentinel cache, so the cache-hit path stays lock-free. Off by default; requires --repo-dir. Lock retry ceiling (6 min) is set above STALE_MS (5 min) so a dead holder's stale window expires before waiters give up, making cold- start of N pods against an empty repo recoverable rather than fatal. --- .../distributed-adaptor-install-lock.md | 5 + packages/ws-worker/package.json | 2 + packages/ws-worker/src/start.ts | 17 +- packages/ws-worker/src/util/cli.ts | 8 + packages/ws-worker/src/util/repo-lock.ts | 151 ++++++ .../test/util/repo-lock-multiprocess.test.ts | 473 ++++++++++++++++++ .../ws-worker/test/util/repo-lock-worker.ts | 144 ++++++ .../ws-worker/test/util/repo-lock.test.ts | 178 +++++++ pnpm-lock.yaml | 33 ++ 9 files changed, 1010 insertions(+), 1 deletion(-) create mode 100644 .changeset/distributed-adaptor-install-lock.md create mode 100644 packages/ws-worker/src/util/repo-lock.ts create mode 100644 packages/ws-worker/test/util/repo-lock-multiprocess.test.ts create mode 100644 packages/ws-worker/test/util/repo-lock-worker.ts create mode 100644 packages/ws-worker/test/util/repo-lock.test.ts diff --git a/.changeset/distributed-adaptor-install-lock.md b/.changeset/distributed-adaptor-install-lock.md new file mode 100644 index 000000000..3704f8d59 --- /dev/null +++ b/.changeset/distributed-adaptor-install-lock.md @@ -0,0 +1,5 @@ +--- +'@openfn/ws-worker': minor +--- + +Add optional filesystem-based lock so multiple ws-workers can safely share a single repo directory (e.g. an NFS mount or k8s PVC). When `WORKER_REPO_LOCK=true` (or `--repo-lock`) is set alongside `WORKER_REPO_DIR`, adaptor installs are serialised across workers via a per-adaptor lockfile and a sentinel cache. The cache-hit path stays lock-free. diff --git a/packages/ws-worker/package.json b/packages/ws-worker/package.json index 25e8aa448..b7ea0f6d4 100644 --- a/packages/ws-worker/package.json +++ b/packages/ws-worker/package.json @@ -36,6 +36,7 @@ "koa": "^2.16.4", "koa-logger": "^3.2.1", "phoenix": "1.7.10", + "proper-lockfile": "^4.1.2", "ws": "^8.19.0" }, "devDependencies": { @@ -45,6 +46,7 @@ "@types/node": "^18.19.130", "@types/nodemon": "1.19.3", "@types/phoenix": "^1.6.7", + "@types/proper-lockfile": "^4.1.4", "@types/yargs": "^17.0.35", "ava": "5.3.1", "nodemon": "3.1.14", diff --git a/packages/ws-worker/src/start.ts b/packages/ws-worker/src/start.ts index 9d901c7af..cdb9bdc19 100644 --- a/packages/ws-worker/src/start.ts +++ b/packages/ws-worker/src/start.ts @@ -5,6 +5,7 @@ import createMockRTE from './mock/runtime-engine'; import createWorker, { ServerOptions } from './server'; import cli from './util/cli'; import getDefaultWorkloopConfig from './util/get-default-workloop-config'; +import { createLockedHandlers } from './util/repo-lock'; const args = cli(process.argv); @@ -108,7 +109,7 @@ if (args.mock) { engineReady(engine); }); } else { - const engineOptions = { + const engineOptions: any = { repoDir: args.repoDir, memoryLimitMb: args.runMemory, maxWorkers: effectiveCapacity, @@ -119,6 +120,20 @@ if (args.mock) { profile: args.profile, profilePollInterval: args.profilePollIntervalMs, }; + + if (args.repoLock) { + if (args.repoDir) { + logger.info( + 'Repo lock enabled: coordinating adaptor installs via filesystem lock at', + args.repoDir + ); + engineOptions.autoinstall = createLockedHandlers(args.repoDir); + } else { + logger.warn( + 'WARNING: --repo-lock set but --repo-dir is not; ignoring repo lock' + ); + } + } logger.debug('Creating runtime engine...'); logger.debug('Engine options:', engineOptions); diff --git a/packages/ws-worker/src/util/cli.ts b/packages/ws-worker/src/util/cli.ts index f8d27a126..fb81ee559 100644 --- a/packages/ws-worker/src/util/cli.ts +++ b/packages/ws-worker/src/util/cli.ts @@ -37,6 +37,7 @@ type Args = { profile?: boolean; profilePollIntervalMs?: number; repoDir?: string; + repoLock?: boolean; runMemory?: number; secret?: string; sentryDsn?: string; @@ -96,6 +97,7 @@ export default function parseArgs(argv: string[]): Args { WORKER_PROFILE_POLL_INTERVAL_MS, WORKER_PROFILE, WORKER_REPO_DIR, + WORKER_REPO_LOCK, WORKER_SECRET, WORKER_SENTRY_DSN, WORKER_SENTRY_ENV, @@ -129,6 +131,11 @@ export default function parseArgs(argv: string[]): Args { description: 'Path to the runtime repo (where modules will be installed). Env: WORKER_REPO_DIR', }) + .option('repo-lock', { + description: + 'Coordinate adaptor installs across multiple workers sharing one repo directory via a filesystem lock. Requires --repo-dir. Default false. Env: WORKER_REPO_LOCK', + type: 'boolean', + }) .option('monorepo-dir', { alias: 'm', description: @@ -306,6 +313,7 @@ export default function parseArgs(argv: string[]): Args { 'ws://localhost:4000/worker' ), repoDir: setArg(args.repoDir, WORKER_REPO_DIR), + repoLock: setArg(args.repoLock, WORKER_REPO_LOCK, false), batchLogs: setArg(args.batchLogs, WORKER_BATCH_LOGS, false), batchInterval: setArg(args.batchInterval, WORKER_BATCH_INTERVAL, 10), batchLimit: setArg(args.batchLimit, WORKER_BATCH_LIMIT, 50), diff --git a/packages/ws-worker/src/util/repo-lock.ts b/packages/ws-worker/src/util/repo-lock.ts new file mode 100644 index 000000000..a39d2dab0 --- /dev/null +++ b/packages/ws-worker/src/util/repo-lock.ts @@ -0,0 +1,151 @@ +import path from 'node:path'; +import { mkdir, stat, writeFile, rename } from 'node:fs/promises'; +import lockfile from 'proper-lockfile'; +import { + getAliasedName, + install as runtimeInstall, +} from '@openfn/runtime'; +import type { Logger } from '@openfn/logger'; + +// k8s CPU throttling can delay the heartbeat setTimeout; 5 min avoids false-stale on tight clusters +const STALE_MS = 5 * 60_000; +// Retry ceiling exceeds STALE_MS so a dead lock-holder's stale window expires before we give up +const MAX_LOCK_WAIT_MS = STALE_MS + 60_000; // 6 min +const LOCK_INTERVAL_MS = 2_000; +const LOCK_RETRY_OPTIONS = { + retries: MAX_LOCK_WAIT_MS / LOCK_INTERVAL_MS, // 180 × 2s = 360s + factor: 1, + minTimeout: LOCK_INTERVAL_MS, + maxTimeout: LOCK_INTERVAL_MS, +}; +const UPDATE_MS = 5_000; + +const sentinelPath = (repoDir: string, alias: string) => + path.join(repoDir, '.sentinels', `${alias}.done`); + +const lockTargetPath = (repoDir: string, alias: string) => + path.join(repoDir, '.locks', `${alias}.lock`); + +const nodeModulesPkgPath = (repoDir: string, alias: string) => + path.join(repoDir, 'node_modules', alias, 'package.json'); + +const fileExists = async (p: string) => { + try { + await stat(p); + return true; + } catch { + return false; + } +}; + +const ensureLockTarget = async (target: string) => { + await mkdir(path.dirname(target), { recursive: true }); + if (!(await fileExists(target))) { + try { + await writeFile(target, '', { flag: 'wx' }); + } catch (e: any) { + if (e.code !== 'EEXIST') throw e; + } + } +}; + +const writeSentinelAtomic = async (sentinel: string) => { + await mkdir(path.dirname(sentinel), { recursive: true }); + const tmp = `${sentinel}.tmp`; + await writeFile(tmp, ''); + await rename(tmp, sentinel); +}; + +export type InstallFn = ( + specifier: string, + repoDir: string, + logger: Logger +) => Promise; + +const defaultInstall: InstallFn = (specifier, repoDir, logger) => + runtimeInstall([specifier], repoDir, logger); + +export const createLockedHandlers = ( + repoDir: string, + installFn: InstallFn = defaultInstall +) => { + const locksDir = path.join(repoDir, '.locks'); + const sentinelsDir = path.join(repoDir, '.sentinels'); + + const ensureDirs = (async () => { + await mkdir(repoDir, { recursive: true }); + await mkdir(locksDir, { recursive: true }); + await mkdir(sentinelsDir, { recursive: true }); + })(); + + const handleIsInstalled = async ( + specifier: string, + _repoDir: string, + _logger: Logger + ): Promise => { + await ensureDirs; + const alias = getAliasedName(specifier); + const sentinel = sentinelPath(repoDir, alias); + const pkg = nodeModulesPkgPath(repoDir, alias); + if ((await fileExists(sentinel)) && (await fileExists(pkg))) { + return true; + } + return false; + }; + + const handleInstall = async ( + specifier: string, + _repoDir: string, + logger: Logger + ): Promise => { + await ensureDirs; + const alias = getAliasedName(specifier); + const sentinel = sentinelPath(repoDir, alias); + const target = lockTargetPath(repoDir, alias); + + await ensureLockTarget(target); + + logger.debug(`acquiring install lock for ${specifier}`); + let release: () => Promise; + try { + release = await lockfile.lock(target, { + retries: LOCK_RETRY_OPTIONS, + stale: STALE_MS, + update: UPDATE_MS, + realpath: false, + }); + } catch (e: any) { + if (e.code === 'ELOCKED') { + const waitSecs = MAX_LOCK_WAIT_MS / 1000; + throw new Error( + `Lock acquisition timed out after ${waitSecs}s waiting for ${alias}; another worker likely still installing (lock: ${target})` + ); + } + throw e; + } + logger.debug(`acquired install lock for ${specifier}`); + + try { + const pkg = nodeModulesPkgPath(repoDir, alias); + if ((await fileExists(sentinel)) && (await fileExists(pkg))) { + logger.debug( + `another worker installed ${specifier} while waiting for lock; skipping install` + ); + return; + } + + await installFn(specifier, repoDir, logger); + await writeSentinelAtomic(sentinel); + } finally { + try { + await release(); + } catch (e) { + logger.warn(`failed to release install lock for ${specifier}:`, e); + } + } + }; + + return { handleInstall, handleIsInstalled }; +}; + +export default createLockedHandlers; diff --git a/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts b/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts new file mode 100644 index 000000000..f755338a7 --- /dev/null +++ b/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts @@ -0,0 +1,473 @@ +/** + * Cross-process integration tests for repo-lock. + * + * Each test forks real child processes pointing at the same tmpdir to exercise + * the proper-lockfile filesystem lock path that in-process tests cannot reach. + * + * Worker behaviour is configured via env vars; orchestration uses IPC barriers + * so test ordering is deterministic (no setTimeout for control flow). + */ + +import test from 'ava'; +import path from 'node:path'; +import os from 'node:os'; +import { + mkdir, + mkdtemp, + rm, + stat, + utimes, + writeFile, +} from 'node:fs/promises'; +import { fork, type ChildProcess } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const WORKER_SCRIPT = path.resolve(__dirname, 'repo-lock-worker.ts'); + +// Node args to enable TypeScript via swc — uses the same register hook as ava.config.js +// (root ava.config.js also passes --experimental-vm-modules but that flag is harmless to omit here). +const EXEC_ARGV = ['--import=@swc-node/register/esm-register', '--no-warnings']; + +const SPECIFIER_A = '@openfn/language-http@6.5.0'; +const SPECIFIER_B = '@openfn/language-common@2.0.0'; +// Alias format produced by getAliasedName +const ALIAS_A = '@openfn/language-http_6.5.0'; +const ALIAS_B = '@openfn/language-common_2.0.0'; + +const fileExists = async (p: string) => { + try { + await stat(p); + return true; + } catch { + return false; + } +}; + +// ---- IPC helpers ---- + +interface WorkerMessage { + event: string; + [key: string]: unknown; +} + +/** + * Fork a worker process and return a handle with helpers. + */ +function spawnWorker( + env: Record +): { child: ChildProcess; messages: WorkerMessage[] } { + const messages: WorkerMessage[] = []; + + const child = fork(WORKER_SCRIPT, [], { + execArgv: EXEC_ARGV, + env: { ...process.env, ...env }, + // IPC channel is created automatically by fork + }); + + child.on('message', (msg: any) => { + messages.push(msg as WorkerMessage); + }); + + return { child, messages }; +} + +/** + * Wait for a specific event from a child process. + * Checks the buffered messages array first so events that arrived before this + * call is reached are not missed. + */ +function waitForEvent( + worker: { child: ChildProcess; messages: WorkerMessage[] }, + eventName: string, + timeoutMs = 8000 +): Promise { + // Check already-buffered messages first to avoid a hang if the event arrived + // before the listener was attached. + const buffered = worker.messages.find((m) => m.event === eventName); + if (buffered) return Promise.resolve(buffered); + + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error(`Timed out waiting for event '${eventName}'`)), + timeoutMs + ); + + const handler = (msg: any) => { + if (msg?.event === eventName) { + clearTimeout(timer); + worker.child.off('message', handler); + resolve(msg as WorkerMessage); + } + }; + + worker.child.on('message', handler); + }); +} + +/** + * Collect N 'done' messages from a set of children (order is irrelevant). + */ +function collectDone( + children: ChildProcess[], + count: number, + timeoutMs = 8000 +): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error(`Timed out waiting for ${count} done messages (got ${results.length})`)), + timeoutMs + ); + + const results: WorkerMessage[] = []; + + const handler = (msg: any) => { + if (msg?.event === 'done') { + results.push(msg); + if (results.length >= count) { + clearTimeout(timer); + for (const c of children) c.off('message', handler); + resolve(results); + } + } + }; + + for (const c of children) c.on('message', handler); + }); +} + +/** + * Wait for all children to send the 'ready' event, then return. + */ +function waitAllReady(children: ChildProcess[], timeoutMs = 8000): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error('Timed out waiting for all workers to be ready')), + timeoutMs + ); + + let remaining = children.length; + + for (const c of children) { + const handler = (msg: any) => { + if (msg?.event === 'ready') { + c.off('message', handler); + remaining--; + if (remaining === 0) { + clearTimeout(timer); + resolve(); + } + } + }; + c.on('message', handler); + } + }); +} + +/** Wait for a child to exit. */ +function waitExit(child: ChildProcess, timeoutMs = 8000): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error('Timed out waiting for child to exit')), + timeoutMs + ); + child.once('exit', (code) => { + clearTimeout(timer); + resolve(code); + }); + }); +} + +// ---- test lifecycle ---- + +test.beforeEach(async (t) => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'ws-worker-mp-lock-')); + t.context = { dir, children: [] as ChildProcess[] }; +}); + +test.afterEach.always(async (t) => { + const { dir, children } = t.context as { dir: string; children: ChildProcess[] }; + + // Await child exits before removing tmpdir to avoid a race between SIGTERM + // delivery and the rm -rf that follows. + await Promise.all( + children.map((c) => { + // Already gone (naturally exited or previously killed). + if (c.exitCode !== null || c.killed) return Promise.resolve(); + return new Promise((r) => { + c.once('exit', () => r()); + c.kill(); + }); + }) + ); + + await rm(dir, { recursive: true, force: true }); +}); + +function track(t: any, ...workers: ReturnType[]) { + const ctx = t.context as { children: ChildProcess[] }; + for (const w of workers) ctx.children.push(w.child); + return workers; +} + +// ---- Scenario 1: Race — exactly one install runs ---- + +test.serial('scenario 1: race — exactly one of N concurrent workers runs installFn', async (t) => { + const { dir } = t.context as { dir: string }; + + const env = { + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'race', + // 500ms: the window must outlast the slowest fork's swc-transpile startup time on CI + INSTALL_DELAY: '500', + }; + + const workers = [ + spawnWorker(env), + spawnWorker(env), + spawnWorker(env), + ]; + track(t, ...workers); + + // Wait until all three have imported the module and are ready. + await waitAllReady(workers.map((w) => w.child)); + + // Broadcast 'go' to all three simultaneously. + for (const { child } of workers) child.send({ event: 'go' }); + + // Collect all 'done' messages. + const dones = await collectDone(workers.map((w) => w.child), 3); + + const totalInstalls = dones.reduce((s, m) => s + ((m.installCount as number) ?? 0), 0); + const installStartEvents = workers.flatMap((w) => + w.messages.filter((m) => m.event === 'install-start') + ); + + t.is(totalInstalls, 1, 'exactly one worker should have run installFn'); + t.is(installStartEvents.length, 1, 'exactly one install-start event emitted'); + t.true( + await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), + 'sentinel must exist' + ); + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')), + 'node_modules pkg must exist' + ); +}); + +// ---- Scenario 2: Stale-lock recovery ---- + +test.serial('scenario 2: stale-lock recovery — new process steals a ghost lock', async (t) => { + const { dir } = t.context as { dir: string }; + + // Seed the ghost lock directory with an ancient mtime (simulates SIGKILL). + const lockTarget = path.join(dir, '.locks', `${ALIAS_A}.lock`); + await mkdir(path.dirname(lockTarget), { recursive: true }); + await writeFile(lockTarget, ''); + const ghostLockDir = `${lockTarget}.lock`; + await mkdir(ghostLockDir, { recursive: true }); + const ancient = new Date(Date.now() - 5 * 60_000 - 1000); + await utimes(ghostLockDir, ancient, ancient); + + const testStart = Date.now(); + + const w = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, w); + + const done = await waitForEvent(w, 'done'); + + t.falsy(done.error, `should complete without error, got: ${done.error}`); + t.is(done.installCount, 1, 'installFn should have run once'); + t.true( + await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), + 'sentinel must exist after stale recovery' + ); + + // The ghost lockdir should either be gone or have been replaced (newer mtime). + const ghostStillExists = await fileExists(ghostLockDir); + if (ghostStillExists) { + const ghostStat = await stat(ghostLockDir); + t.true( + ghostStat.mtimeMs >= testStart, + 'lockdir mtime must be newer than test start, proving proper-lockfile took ownership' + ); + } + // If it doesn't exist, proper-lockfile cleaned it up — also fine. +}); + +// ---- Scenario 3: Per-alias concurrency ---- + +test.serial('scenario 3: different aliases do not serialise', async (t) => { + const { dir } = t.context as { dir: string }; + + const wA = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + INSTALL_DELAY: '500', + }); + const wB = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_B, + WORKER_MODE: 'default', + INSTALL_DELAY: '500', + }); + track(t, wA, wB); + + const dones = await collectDone([wA.child, wB.child], 2); + + // Verify both succeeded. + for (const done of dones) { + t.falsy(done.error); + t.is(done.installCount, 1); + } + + t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`))); + t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_B}.done`))); + + // Confirm overlap: install-start events from A and B must be close in time. + const startA = wA.messages.find((m) => m.event === 'install-start')?.t as number; + const startB = wB.messages.find((m) => m.event === 'install-start')?.t as number; + t.truthy(startA, 'worker A should emit install-start'); + t.truthy(startB, 'worker B should emit install-start'); + + // Both should have started within half the install delay of each other. + // If locks were global, one would wait ~500ms for the other to finish first. + const timeDiff = Math.abs(startA - startB); + t.true( + timeDiff < 200, + `install-start times should be close (${timeDiff}ms apart); if >200ms the installs were serialised` + ); +}); + +// ---- Scenario 4: Install failure — no sentinel written, next process retries ---- + +test.serial('scenario 4: install failure — no sentinel; next worker retries cleanly', async (t) => { + const { dir } = t.context as { dir: string }; + + const wA = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + INSTALL_FAIL: '1', + }); + track(t, wA); + + const doneA = await waitForEvent(wA, 'done'); + await waitExit(wA.child); + + t.truthy(doneA.error, 'child A should report an error'); + t.false( + await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), + 'sentinel must not exist after failed install' + ); + t.false( + await fileExists(path.join(dir, '.locks', `${ALIAS_A}.lock.lock`)), + 'proper-lockfile lockdir must be released after install throws' + ); + + // Now fork a second worker that will succeed. + const wB = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, wB); + + const doneB = await waitForEvent(wB, 'done'); + + t.falsy(doneB.error); + t.is(doneB.installCount, 1, 'second worker should have run install'); + t.true( + await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), + 'sentinel must exist after successful second install' + ); +}); + +// ---- Scenario 5: Post-lock double-check — waiting process skips redundant install ---- + +test.serial('scenario 5: post-lock double-check — waiting process skips install after winner finishes', async (t) => { + const { dir } = t.context as { dir: string }; + + // Child A: acquires lock, signals parent (lock-acquired), waits for 'proceed', + // then seeds node_modules and releases. + const wA = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'double-check', + }); + track(t, wA); + + // Wait until A has the lock (but hasn't done any work yet). + await waitForEvent(wA, 'lock-acquired'); + + // Now fork B — it will try to acquire the same lock and queue behind A. + const wB = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, wB); + + // Wait until B has actually called handleInstall (entered the lock path). + // This guarantees B is blocking on lockfile.lock before A releases, so we + // are genuinely testing cross-process contention, not just the double-check + // on an uncontested lock. + await waitForEvent(wB, 'lock-attempting'); + + // Tell A to complete: it will seed node_modules, write sentinel, release lock. + wA.child.send({ event: 'proceed' }); + + await collectDone([wA.child, wB.child], 2); + + // Find each worker's done message from their own IPC message arrays. + const doneAMsg = wA.messages.find((m) => m.event === 'done')!; + const doneBMsg = wB.messages.find((m) => m.event === 'done')!; + + t.is(doneAMsg.installCount, 1, 'worker A should have run installFn once'); + t.true(doneBMsg.skipped as boolean, 'worker B should have skipped (in-lock double-check)'); + t.is(doneBMsg.installCount, 0, 'worker B installCount should be 0'); + + t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`))); + t.true(await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json'))); +}); + +// ---- Scenario 6: Partial state — pkg without sentinel triggers install ---- + +test.serial('scenario 6: partial state — pkg without sentinel triggers install', async (t) => { + const { dir } = t.context as { dir: string }; + + // Pre-seed node_modules but NOT the sentinel (simulates a crash after npm install + // but before writeSentinelAtomic). + const pkgDir = path.join(dir, 'node_modules', ALIAS_A); + await mkdir(pkgDir, { recursive: true }); + await writeFile(path.join(pkgDir, 'package.json'), '{}'); + // Also create the dirs that createLockedHandlers would normally create. + await mkdir(path.join(dir, '.sentinels'), { recursive: true }); + await mkdir(path.join(dir, '.locks'), { recursive: true }); + + const w = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'partial', + // installFn is a no-op; node_modules already exists, just needs sentinel. + INSTALL_NOOP: '1', + }); + track(t, w); + + const done = await waitForEvent(w, 'done'); + + t.false(done.wasInstalled as boolean, 'handleIsInstalled should return false without sentinel'); + t.is(done.installCount, 1, 'install should have run once'); + t.true(done.installedAfter as boolean, 'handleIsInstalled should return true after install'); + t.true( + await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), + 'sentinel must exist' + ); +}); diff --git a/packages/ws-worker/test/util/repo-lock-worker.ts b/packages/ws-worker/test/util/repo-lock-worker.ts new file mode 100644 index 000000000..2b8febc81 --- /dev/null +++ b/packages/ws-worker/test/util/repo-lock-worker.ts @@ -0,0 +1,144 @@ +/** + * Worker script for cross-process repo-lock tests. + * + * Config is supplied via environment variables: + * REPO_DIR - path to the shared tmpdir + * SPECIFIER - package specifier to install + * WORKER_MODE - one of: race | stale | per-alias | fail | double-check | partial + * INSTALL_DELAY - ms to sleep inside installFn (default 0) + * INSTALL_FAIL - if '1', installFn throws + * INSTALL_NOOP - if '1', installFn resolves immediately without seeding pkg + * + * IPC messages the worker sends to parent: + * { event: 'ready' } - worker is alive and module loaded + * { event: 'install-start', t: Date.now() } - installFn just started + * { event: 'lock-acquired' } - inside the lock, before calling installFn + * { event: 'lock-attempting', t: Date.now() } - about to call handleInstall (entering lock path) + * { event: 'done', installCount, skipped?, wasInstalled?, error? } + * + * IPC messages the worker expects from parent (for barriers): + * { event: 'go' } - start the handleInstall call (race scenario) + * { event: 'proceed' } - complete the installFn (double-check scenario) + */ + +import path from 'node:path'; +import { mkdir, writeFile } from 'node:fs/promises'; +import { createMockLogger } from '@openfn/logger'; +import { getAliasedName } from '@openfn/runtime'; +import { createLockedHandlers } from '../../src/util/repo-lock.js'; + +const repoDir = process.env.REPO_DIR!; +const specifier = process.env.SPECIFIER!; +const installDelay = parseInt(process.env.INSTALL_DELAY ?? '0', 10); +const installFail = process.env.INSTALL_FAIL === '1'; +const installNoop = process.env.INSTALL_NOOP === '1'; +const workerMode = process.env.WORKER_MODE ?? 'default'; + +const logger = createMockLogger(); +const alias = getAliasedName(specifier); + +const send = (msg: Record) => { + if (process.send) process.send(msg); +}; + +// Wait for a specific IPC message from the parent. +const waitForMessage = (eventName: string): Promise => + new Promise((resolve) => { + const handler = (msg: any) => { + if (msg?.event === eventName) { + process.off('message', handler); + resolve(); + } + }; + process.on('message', handler); + }); + +const seedModule = async () => { + const dir = path.join(repoDir, 'node_modules', alias); + await mkdir(dir, { recursive: true }); + await writeFile(path.join(dir, 'package.json'), '{}'); +}; + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +async function main() { + let installCount = 0; + + // In double-check mode the installFn waits for a 'proceed' message so the + // parent can guarantee ordering: A holds lock -> B queues -> A finishes -> B skips. + let installFn: (specifier: string, repoDir: string, logger: any) => Promise; + + if (workerMode === 'double-check') { + // Child A: acquire lock, signal parent, wait for 'proceed', then complete. + installFn = async () => { + installCount++; + // Register the listener before signalling the parent so 'proceed' cannot + // arrive before the handler is attached. + const proceedPromise = waitForMessage('proceed'); + send({ event: 'lock-acquired' }); + await proceedPromise; + await seedModule(); + if (installDelay > 0) await sleep(installDelay); + }; + } else { + installFn = async () => { + send({ event: 'install-start', t: Date.now() }); + installCount++; + if (installFail) { + throw new Error('npm exploded'); + } + if (installDelay > 0) await sleep(installDelay); + if (!installNoop) { + await seedModule(); + } + }; + } + + const { handleInstall, handleIsInstalled } = createLockedHandlers( + repoDir, + installFn + ); + + // Build the barrier promise BEFORE sending 'ready' so that if the parent + // replies instantly the message isn't missed before the listener is attached. + const goPromise = workerMode === 'race' ? waitForMessage('go') : null; + + // Signal parent that the module is imported and we are ready. + send({ event: 'ready' }); + + // In race mode, wait for the parent's 'go' before calling handleInstall. + // This ensures all workers start as close together as possible. + if (goPromise) { + await goPromise; + } + + if (workerMode === 'partial') { + // Scenario 6: check handleIsInstalled, then install, then check again. + const wasInstalled = await handleIsInstalled(specifier, repoDir, logger); + send({ event: 'lock-attempting', t: Date.now() }); + try { + await handleInstall(specifier, repoDir, logger); + } catch (e: any) { + send({ event: 'done', wasInstalled, installCount, error: e.message }); + return; + } + const installedAfter = await handleIsInstalled(specifier, repoDir, logger); + send({ event: 'done', wasInstalled, installCount, installedAfter }); + return; + } + + // Signal parent that this worker is about to enter the lock path. + send({ event: 'lock-attempting', t: Date.now() }); + + try { + await handleInstall(specifier, repoDir, logger); + const skipped = installCount === 0; + send({ event: 'done', installCount, skipped }); + } catch (e: any) { + send({ event: 'done', installCount, error: e.message }); + } +} + +main().catch((e) => { + send({ event: 'done', installCount: 0, error: e.message }); +}); diff --git a/packages/ws-worker/test/util/repo-lock.test.ts b/packages/ws-worker/test/util/repo-lock.test.ts new file mode 100644 index 000000000..3e00b7934 --- /dev/null +++ b/packages/ws-worker/test/util/repo-lock.test.ts @@ -0,0 +1,178 @@ +import test from 'ava'; +import path from 'node:path'; +import os from 'node:os'; +import { + mkdir, + mkdtemp, + rm, + writeFile, + stat, + utimes, +} from 'node:fs/promises'; +import { createMockLogger } from '@openfn/logger'; +import { getAliasedName } from '@openfn/runtime'; + +import { createLockedHandlers } from '../../src/util/repo-lock'; + +const logger = createMockLogger(); + +const SPECIFIER = '@openfn/language-http@6.5.0'; +const ALIAS = getAliasedName(SPECIFIER); + +const fileExists = async (p: string) => { + try { + await stat(p); + return true; + } catch { + return false; + } +}; + +const seedModule = async (repoDir: string, alias = ALIAS) => { + const dir = path.join(repoDir, 'node_modules', alias); + await mkdir(dir, { recursive: true }); + await writeFile(path.join(dir, 'package.json'), '{}'); +}; + +const seedSentinel = async (repoDir: string, alias = ALIAS) => { + const sentinel = path.join(repoDir, '.sentinels', `${alias}.done`); + await mkdir(path.dirname(sentinel), { recursive: true }); + await writeFile(sentinel, ''); +}; + +test.beforeEach(async (t) => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'ws-worker-repo-lock-')); + t.context = { dir }; +}); + +test.afterEach.always(async (t) => { + const { dir } = t.context as { dir: string }; + await rm(dir, { recursive: true, force: true }); +}); + +test('handleIsInstalled returns false when sentinel and node_modules are missing', async (t) => { + const { dir } = t.context as { dir: string }; + const { handleIsInstalled } = createLockedHandlers(dir); + t.false(await handleIsInstalled(SPECIFIER, dir, logger)); +}); + +test('handleIsInstalled returns true when both sentinel and node_modules exist', async (t) => { + const { dir } = t.context as { dir: string }; + await seedSentinel(dir); + await seedModule(dir); + const { handleIsInstalled } = createLockedHandlers(dir); + t.true(await handleIsInstalled(SPECIFIER, dir, logger)); +}); + +test('handleIsInstalled returns false when sentinel exists but node_modules was deleted', async (t) => { + const { dir } = t.context as { dir: string }; + await seedSentinel(dir); + const { handleIsInstalled } = createLockedHandlers(dir); + t.false(await handleIsInstalled(SPECIFIER, dir, logger)); +}); + +test('handleIsInstalled returns false when node_modules exists but sentinel is missing', async (t) => { + const { dir } = t.context as { dir: string }; + await seedModule(dir); + const { handleIsInstalled } = createLockedHandlers(dir); + t.false(await handleIsInstalled(SPECIFIER, dir, logger)); +}); + +test('handleInstall runs install and writes sentinel on success', async (t) => { + const { dir } = t.context as { dir: string }; + let installs = 0; + const installFn = async () => { + installs++; + await seedModule(dir); + }; + const { handleInstall, handleIsInstalled } = createLockedHandlers( + dir, + installFn + ); + await handleInstall(SPECIFIER, dir, logger); + + t.is(installs, 1); + t.true( + await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`)), + 'sentinel file should be written' + ); + t.true(await handleIsInstalled(SPECIFIER, dir, logger)); +}); + +test('concurrent handleInstall calls only invoke install once', async (t) => { + const { dir } = t.context as { dir: string }; + let installs = 0; + const installFn = async () => { + installs++; + // Simulate work so the second call definitely contends on the lock + await new Promise((r) => setTimeout(r, 100)); + await seedModule(dir); + }; + + const a = createLockedHandlers(dir, installFn); + const b = createLockedHandlers(dir, installFn); + + await Promise.all([ + a.handleInstall(SPECIFIER, dir, logger), + b.handleInstall(SPECIFIER, dir, logger), + ]); + + t.is(installs, 1, 'only one of the two contending installs should run'); + t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`))); +}); + +test('handleInstall recovers a stale lock left by a dead worker', async (t) => { + const { dir } = t.context as { dir: string }; + + // Manually create the proper-lockfile lock directory with an old mtime, + // simulating a worker that was SIGKILLed mid-install. + const lockTarget = path.join(dir, '.locks', `${ALIAS}.lock`); + await mkdir(path.dirname(lockTarget), { recursive: true }); + await writeFile(lockTarget, ''); + const ghostLockDir = `${lockTarget}.lock`; + await mkdir(ghostLockDir, { recursive: true }); + const ancient = new Date(Date.now() - 5 * 60_000); + await utimes(ghostLockDir, ancient, ancient); + + let installs = 0; + const installFn = async () => { + installs++; + await seedModule(dir); + }; + + const { handleInstall } = createLockedHandlers(dir, installFn); + await handleInstall(SPECIFIER, dir, logger); + + t.is(installs, 1); + t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`))); +}); + +test('handleInstall does not re-run install when sentinel was written by another worker while waiting', async (t) => { + const { dir } = t.context as { dir: string }; + + let installs = 0; + + // First worker holds the lock and writes the sentinel; second worker queues + // up behind it and should observe the sentinel inside the lock and skip. + const installFn = async () => { + installs++; + await new Promise((r) => setTimeout(r, 50)); + await seedModule(dir); + }; + + const a = createLockedHandlers(dir, installFn); + const b = createLockedHandlers(dir, async () => { + // If b ever calls install, count it — but it shouldn't. + installs++; + await seedModule(dir); + }); + + const aPromise = a.handleInstall(SPECIFIER, dir, logger); + // Give a a moment to grab the lock and start work. + await new Promise((r) => setTimeout(r, 10)); + const bPromise = b.handleInstall(SPECIFIER, dir, logger); + + await Promise.all([aPromise, bPromise]); + + t.is(installs, 1); +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fa9301662..c4f6cb035 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -759,6 +759,9 @@ importers: phoenix: specifier: 1.7.10 version: 1.7.10 + proper-lockfile: + specifier: ^4.1.2 + version: 4.1.2 ws: specifier: ^8.19.0 version: 8.19.0 @@ -781,6 +784,9 @@ importers: '@types/phoenix': specifier: ^1.6.7 version: 1.6.7 + '@types/proper-lockfile': + specifier: ^4.1.4 + version: 4.1.4 '@types/yargs': specifier: ^17.0.35 version: 17.0.35 @@ -2091,6 +2097,9 @@ packages: '@types/phoenix@1.6.7': resolution: {integrity: sha512-oN9ive//QSBkf19rfDv45M7eZPi0eEXylht2OLEXicu5b4KoQ1OzXIw+xDSGWxSxe1JmepRR/ZH283vsu518/Q==} + '@types/proper-lockfile@4.1.4': + resolution: {integrity: sha512-uo2ABllncSqg9F1D4nugVl9v93RmjxF6LJzQLMLDdPaXCUIDPeOJ21Gbqi43xNKzBi/WQ0Q0dICqufzQbMjipQ==} + '@types/qs@6.15.0': resolution: {integrity: sha512-JawvT8iBVWpzTrz3EGw9BTQFg3BQNmwERdKE22vlTxawwtbyUSlMppvZYKLZzB5zgACXdXxbD3m1bXaMqP/9ow==} @@ -3770,6 +3779,9 @@ packages: process-nextick-args@2.0.1: resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} + proper-lockfile@4.1.2: + resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==} + proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -3855,6 +3867,10 @@ packages: resolve-pkg-maps@1.0.0: resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + retry@0.12.0: + resolution: {integrity: sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==} + engines: {node: '>= 4'} + retry@0.13.1: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} @@ -3940,6 +3956,9 @@ packages: resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==} engines: {node: '>= 0.4'} + signal-exit@3.0.7: + resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + signal-exit@4.1.0: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} @@ -5685,6 +5704,10 @@ snapshots: '@types/phoenix@1.6.7': {} + '@types/proper-lockfile@4.1.4': + dependencies: + '@types/retry': 0.12.0 + '@types/qs@6.15.0': {} '@types/range-parser@1.2.7': {} @@ -7337,6 +7360,12 @@ snapshots: process-nextick-args@2.0.1: {} + proper-lockfile@4.1.2: + dependencies: + graceful-fs: 4.2.11 + retry: 0.12.0 + signal-exit: 3.0.7 + proxy-addr@2.0.7: dependencies: forwarded: 0.2.0 @@ -7433,6 +7462,8 @@ snapshots: resolve-pkg-maps@1.0.0: {} + retry@0.12.0: {} + retry@0.13.1: {} reusify@1.1.0: {} @@ -7567,6 +7598,8 @@ snapshots: side-channel-map: 1.0.1 side-channel-weakmap: 1.0.2 + signal-exit@3.0.7: {} + signal-exit@4.1.0: {} simple-update-notifier@2.0.0: From 0ccdc90f53e00f199bfa5dacea451265a5adb529 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Tue, 19 May 2026 15:37:55 +0200 Subject: [PATCH 02/11] refactor: tidy repo-lock helpers Drop redundant pre-check in ensureLockTarget (wx already throws EEXIST), parallelise fileExists pairs in handleIsInstalled and the post-lock double-check, collapse trivial if/return, drop unused default export, and fix the worker test harness' mode list comment plus an any-typed logger. --- packages/ws-worker/src/util/repo-lock.ts | 33 +++++++++---------- .../ws-worker/test/util/repo-lock-worker.ts | 6 ++-- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/packages/ws-worker/src/util/repo-lock.ts b/packages/ws-worker/src/util/repo-lock.ts index a39d2dab0..76e0447f9 100644 --- a/packages/ws-worker/src/util/repo-lock.ts +++ b/packages/ws-worker/src/util/repo-lock.ts @@ -13,7 +13,7 @@ const STALE_MS = 5 * 60_000; const MAX_LOCK_WAIT_MS = STALE_MS + 60_000; // 6 min const LOCK_INTERVAL_MS = 2_000; const LOCK_RETRY_OPTIONS = { - retries: MAX_LOCK_WAIT_MS / LOCK_INTERVAL_MS, // 180 × 2s = 360s + retries: MAX_LOCK_WAIT_MS / LOCK_INTERVAL_MS, factor: 1, minTimeout: LOCK_INTERVAL_MS, maxTimeout: LOCK_INTERVAL_MS, @@ -40,12 +40,10 @@ const fileExists = async (p: string) => { const ensureLockTarget = async (target: string) => { await mkdir(path.dirname(target), { recursive: true }); - if (!(await fileExists(target))) { - try { - await writeFile(target, '', { flag: 'wx' }); - } catch (e: any) { - if (e.code !== 'EEXIST') throw e; - } + try { + await writeFile(target, '', { flag: 'wx' }); + } catch (e: any) { + if (e.code !== 'EEXIST') throw e; } }; @@ -85,12 +83,11 @@ export const createLockedHandlers = ( ): Promise => { await ensureDirs; const alias = getAliasedName(specifier); - const sentinel = sentinelPath(repoDir, alias); - const pkg = nodeModulesPkgPath(repoDir, alias); - if ((await fileExists(sentinel)) && (await fileExists(pkg))) { - return true; - } - return false; + const [hasSentinel, hasPkg] = await Promise.all([ + fileExists(sentinelPath(repoDir, alias)), + fileExists(nodeModulesPkgPath(repoDir, alias)), + ]); + return hasSentinel && hasPkg; }; const handleInstall = async ( @@ -101,6 +98,7 @@ export const createLockedHandlers = ( await ensureDirs; const alias = getAliasedName(specifier); const sentinel = sentinelPath(repoDir, alias); + const pkg = nodeModulesPkgPath(repoDir, alias); const target = lockTargetPath(repoDir, alias); await ensureLockTarget(target); @@ -126,8 +124,11 @@ export const createLockedHandlers = ( logger.debug(`acquired install lock for ${specifier}`); try { - const pkg = nodeModulesPkgPath(repoDir, alias); - if ((await fileExists(sentinel)) && (await fileExists(pkg))) { + const [hasSentinel, hasPkg] = await Promise.all([ + fileExists(sentinel), + fileExists(pkg), + ]); + if (hasSentinel && hasPkg) { logger.debug( `another worker installed ${specifier} while waiting for lock; skipping install` ); @@ -147,5 +148,3 @@ export const createLockedHandlers = ( return { handleInstall, handleIsInstalled }; }; - -export default createLockedHandlers; diff --git a/packages/ws-worker/test/util/repo-lock-worker.ts b/packages/ws-worker/test/util/repo-lock-worker.ts index 2b8febc81..837e024fc 100644 --- a/packages/ws-worker/test/util/repo-lock-worker.ts +++ b/packages/ws-worker/test/util/repo-lock-worker.ts @@ -4,7 +4,7 @@ * Config is supplied via environment variables: * REPO_DIR - path to the shared tmpdir * SPECIFIER - package specifier to install - * WORKER_MODE - one of: race | stale | per-alias | fail | double-check | partial + * WORKER_MODE - one of: race | double-check | partial | default * INSTALL_DELAY - ms to sleep inside installFn (default 0) * INSTALL_FAIL - if '1', installFn throws * INSTALL_NOOP - if '1', installFn resolves immediately without seeding pkg @@ -23,7 +23,7 @@ import path from 'node:path'; import { mkdir, writeFile } from 'node:fs/promises'; -import { createMockLogger } from '@openfn/logger'; +import { createMockLogger, Logger } from '@openfn/logger'; import { getAliasedName } from '@openfn/runtime'; import { createLockedHandlers } from '../../src/util/repo-lock.js'; @@ -66,7 +66,7 @@ async function main() { // In double-check mode the installFn waits for a 'proceed' message so the // parent can guarantee ordering: A holds lock -> B queues -> A finishes -> B skips. - let installFn: (specifier: string, repoDir: string, logger: any) => Promise; + let installFn: (specifier: string, repoDir: string, logger: Logger) => Promise; if (workerMode === 'double-check') { // Child A: acquire lock, signal parent, wait for 'proceed', then complete. From 7fe2707d83e83d2f22f455f52f18c8428e30258b Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 09:10:01 +0200 Subject: [PATCH 03/11] refactor: move repo-lock into engine-multi Migrate the cross-worker adaptor install lock out of ws-worker and into the engine's autoinstall flow. The lock now lives next to the code that actually performs the install. - New `withInstallLock` primitive in engine-multi's util - `autoinstall.lockRepo` option (default on) wraps the install call - Strengthen `isInstalled` to also stat node_modules//package.json, catching half-installed adaptors left by a crashed worker - Drop the sentinel cache; npm's per-package extraction is atomic so the node_modules entry is itself proof of completion - ws-worker CLI flag renamed to `--no-repo-lock` (WORKER_NO_REPO_LOCK) - Rework tests to use tmpdirs + esmock'd runtime install, dropping the handleInstall/handleIsInstalled injection hooks entirely --- .../distributed-adaptor-install-lock.md | 3 +- packages/engine-multi/package.json | 5 +- packages/engine-multi/src/api/autoinstall.ts | 54 +- packages/engine-multi/src/util/repo-lock.ts | 69 ++ .../engine-multi/test/api/autoinstall.test.ts | 991 ++++++++++-------- .../engine-multi/test/api/execute.test.ts | 4 +- packages/engine-multi/test/engine.test.ts | 4 +- packages/engine-multi/test/errors.test.ts | 5 +- .../test/util/autoinstall-helpers.ts | 89 ++ .../test/util/repo-lock-worker.ts | 86 +- .../engine-multi/test/util/repo-lock.test.ts | 327 ++++++ packages/ws-worker/package.json | 3 +- packages/ws-worker/src/start.ts | 25 +- packages/ws-worker/src/util/cli.ts | 10 +- packages/ws-worker/src/util/repo-lock.ts | 150 --- packages/ws-worker/test/reasons.test.ts | 37 +- .../test/util/repo-lock-multiprocess.test.ts | 473 --------- .../ws-worker/test/util/repo-lock.test.ts | 178 ---- pnpm-lock.yaml | 24 +- 19 files changed, 1171 insertions(+), 1366 deletions(-) create mode 100644 packages/engine-multi/src/util/repo-lock.ts create mode 100644 packages/engine-multi/test/util/autoinstall-helpers.ts rename packages/{ws-worker => engine-multi}/test/util/repo-lock-worker.ts (52%) create mode 100644 packages/engine-multi/test/util/repo-lock.test.ts delete mode 100644 packages/ws-worker/src/util/repo-lock.ts delete mode 100644 packages/ws-worker/test/util/repo-lock-multiprocess.test.ts delete mode 100644 packages/ws-worker/test/util/repo-lock.test.ts diff --git a/.changeset/distributed-adaptor-install-lock.md b/.changeset/distributed-adaptor-install-lock.md index 3704f8d59..8bf07317c 100644 --- a/.changeset/distributed-adaptor-install-lock.md +++ b/.changeset/distributed-adaptor-install-lock.md @@ -1,5 +1,6 @@ --- +'@openfn/engine-multi': minor '@openfn/ws-worker': minor --- -Add optional filesystem-based lock so multiple ws-workers can safely share a single repo directory (e.g. an NFS mount or k8s PVC). When `WORKER_REPO_LOCK=true` (or `--repo-lock`) is set alongside `WORKER_REPO_DIR`, adaptor installs are serialised across workers via a per-adaptor lockfile and a sentinel cache. The cache-hit path stays lock-free. +Add a filesystem-based lock so multiple workers can safely share a single adaptor repo directory (e.g. an NFS mount or k8s PVC). The lock lives inside engine-multi's autoinstall and serialises installs across processes via a per-adaptor lockfile. It is enabled by default and can be disabled with `WORKER_NO_REPO_LOCK=true` (or `--no-repo-lock`) on ws-worker. The engine's `isInstalled` check has also been tightened to verify the adaptor's `node_modules` entry exists on disk, not just the repo `package.json` dependency entry — this catches half-installed adaptors left behind by a crashed worker. diff --git a/packages/engine-multi/package.json b/packages/engine-multi/package.json index 1e4a1f6c4..5a5883a5b 100644 --- a/packages/engine-multi/package.json +++ b/packages/engine-multi/package.json @@ -21,11 +21,14 @@ "@openfn/logger": "workspace:*", "@openfn/runtime": "workspace:*", "fast-safe-stringify": "^2.1.1", - "json-stream-stringify": "^3.1.6" + "json-stream-stringify": "^3.1.6", + "proper-lockfile": "^4.1.2" }, "devDependencies": { "@types/node": "^18.19.130", + "@types/proper-lockfile": "^4.1.4", "ava": "5.3.1", + "esmock": "^2.7.5", "tslib": "^2.8.1", "tsm": "^2.3.0", "tsup": "^8.5.1", diff --git a/packages/engine-multi/src/api/autoinstall.ts b/packages/engine-multi/src/api/autoinstall.ts index 6d351d9f2..038ee9ab0 100644 --- a/packages/engine-multi/src/api/autoinstall.ts +++ b/packages/engine-multi/src/api/autoinstall.ts @@ -1,3 +1,5 @@ +import path from 'node:path'; +import { stat } from 'node:fs/promises'; import { ensureRepo, getAliasedName, @@ -13,16 +15,12 @@ import type { Logger } from '@openfn/logger'; import { AUTOINSTALL_COMPLETE, AUTOINSTALL_ERROR } from '../events'; import { AutoinstallError } from '../errors'; import ExecutionContext from '../classes/ExecutionContext'; +import { withInstallLock } from '../util/repo-lock'; // none of these options should be on the plan actually export type AutoinstallOptions = { skipRepoValidation?: boolean; - handleInstall?(fn: string, repoDir: string, logger: Logger): Promise; - handleIsInstalled?( - fn: string, - repoDir: string, - logger: Logger - ): Promise; + lockRepo?: boolean; versionLookup?: (specifier: string) => Promise; }; @@ -60,16 +58,31 @@ const autoinstall = async (context: ExecutionContext): Promise => { adaptors: string[], onComplete: (err?: any) => void ) => { - // Check whether we still need to do any work + const useLock = autoinstallOptions.lockRepo !== false; + for (const a of adaptors) { const { name, version } = getNameAndVersion(a); - if (await isInstalledFn(a, repoDir, logger)) { + if (await isInstalled(a, repoDir, logger)) { continue; } const startTime = Date.now(); try { - await installFn(a, repoDir, logger); + if (useLock) { + // Re-check inside the lock so we skip work a peer worker + // completed while we were waiting. + await withInstallLock(repoDir, getAliasedName(a), logger, async () => { + if (await isInstalled(a, repoDir, logger)) { + logger.debug( + `another worker installed ${a} while waiting for lock; skipping` + ); + return; + } + await install(a, repoDir, logger); + }); + } else { + await install(a, repoDir, logger); + } const duration = Date.now() - startTime; logger.success(`autoinstalled ${a} in ${duration / 1000}s`); @@ -103,8 +116,6 @@ const autoinstall = async (context: ExecutionContext): Promise => { const { repoDir, whitelist } = options; const autoinstallOptions = options.autoinstall || {}; - const installFn = autoinstallOptions?.handleInstall || install; - const isInstalledFn = autoinstallOptions?.handleIsInstalled || isInstalled; const versionlookup = autoinstallOptions?.versionLookup || getLatestVersion; let didValidateRepo = false; @@ -166,7 +177,7 @@ const autoinstall = async (context: ExecutionContext): Promise => { version: v, }; - if (!(await isInstalledFn(resolvedAdaptorName, repoDir, logger))) { + if (!(await isInstalled(resolvedAdaptorName, repoDir, logger))) { adaptorsToLoad.push(resolvedAdaptorName); } } @@ -210,9 +221,17 @@ export default autoinstall; const install = (specifier: string, repoDir: string, logger: Logger) => runtimeInstall([specifier], repoDir, logger); -// The actual isInstalled function is not unit tested -// TODO this should probably all be handled (and tested) in @openfn/runtime -const isInstalled = async ( +const fileExists = async (p: string) => { + try { + await stat(p); + return true; + } catch { + return false; + } +}; + +// Exported for unit testing +export const isInstalled = async ( specifier: string, repoDir: string, logger: Logger @@ -232,7 +251,10 @@ const isInstalled = async ( const pkg = await loadRepoPkg(repoDir); if (pkg) { const { dependencies } = pkg; - return dependencies.hasOwnProperty(alias); + if (!dependencies.hasOwnProperty(alias)) { + return false; + } + return fileExists(path.join(repoDir, 'node_modules', alias, 'package.json')); } }; diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts new file mode 100644 index 000000000..4ed5f1ba9 --- /dev/null +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -0,0 +1,69 @@ +import path from 'node:path'; +import { mkdir, writeFile } from 'node:fs/promises'; +import lockfile from 'proper-lockfile'; +import type { Logger } from '@openfn/logger'; + +// k8s CPU throttling can delay the heartbeat setTimeout; 5 min avoids false-stale on tight clusters +const STALE_MS = 5 * 60_000; +// Retry ceiling exceeds STALE_MS so a dead lock-holder's stale window expires before we give up +const MAX_LOCK_WAIT_MS = STALE_MS + 60_000; +const LOCK_INTERVAL_MS = 2_000; +const UPDATE_MS = 5_000; + +const LOCK_RETRY_OPTIONS = { + retries: MAX_LOCK_WAIT_MS / LOCK_INTERVAL_MS, + factor: 1, + minTimeout: LOCK_INTERVAL_MS, + maxTimeout: LOCK_INTERVAL_MS, +}; + +const ensureLockTarget = async (target: string) => { + await mkdir(path.dirname(target), { recursive: true }); + try { + await writeFile(target, '', { flag: 'wx' }); + } catch (e: any) { + if (e.code !== 'EEXIST') throw e; + } +}; + +export const withInstallLock = async ( + repoDir: string, + alias: string, + logger: Logger, + fn: () => Promise +): Promise => { + const locksDir = path.join(repoDir, '.locks'); + const target = path.join(locksDir, `${alias}.lock`); + + await mkdir(locksDir, { recursive: true }); + await ensureLockTarget(target); + + logger.debug(`acquiring install lock for ${alias}`); + let release: () => Promise; + try { + release = await lockfile.lock(target, { + retries: LOCK_RETRY_OPTIONS, + stale: STALE_MS, + update: UPDATE_MS, + realpath: false, + }); + } catch (e: any) { + if (e.code === 'ELOCKED') { + throw new Error( + `Lock acquisition timed out after ${MAX_LOCK_WAIT_MS / 1000}s waiting for ${alias}; another worker likely still installing (lock: ${target})` + ); + } + throw e; + } + logger.debug(`acquired install lock for ${alias}`); + + try { + await fn(); + } finally { + try { + await release(); + } catch (e) { + logger.warn(`failed to release install lock for ${alias}:`, e); + } + } +}; diff --git a/packages/engine-multi/test/api/autoinstall.test.ts b/packages/engine-multi/test/api/autoinstall.test.ts index 36d550164..dfaab7218 100644 --- a/packages/engine-multi/test/api/autoinstall.test.ts +++ b/packages/engine-multi/test/api/autoinstall.test.ts @@ -2,26 +2,24 @@ import test from 'ava'; import { createMockLogger } from '@openfn/logger'; import type { ExecutionPlan, Job } from '@openfn/runtime'; -import autoinstall, { +import path from 'node:path'; +import { mkdir, writeFile, rm } from 'node:fs/promises'; + +import { AutoinstallOptions, identifyAdaptors, + isInstalled, } from '../../src/api/autoinstall'; import { AUTOINSTALL_COMPLETE, AUTOINSTALL_ERROR } from '../../src/events'; import ExecutionContext from '../../src/classes/ExecutionContext'; import whitelist from '../../src/whitelist'; - -type PackageJson = { - name: string; - [x: string]: any; -}; - -const mockIsInstalled = (pkg: PackageJson) => async (specifier: string) => { - const alias = specifier.split('@').join('_'); - return pkg.dependencies.hasOwnProperty(alias); -}; - -const mockHandleInstall = async (_specifier: string): Promise => - new Promise((r) => r()).then(); +import { + createTmpRepo, + removeTmpRepo, + markInstalled, + loadAutoinstall, + createInstallStub, +} from '../util/autoinstall-helpers'; const logger = createMockLogger(); @@ -30,13 +28,11 @@ const wait = (duration = 10) => setTimeout(resolve, duration); }); -const mockAutoinstallOpts = { - handleInstall: mockHandleInstall, - handleIsInstalled: mockIsInstalled, - versionLookup: async () => '2.0.0', -}; - +// Build an ExecutionContext for a freshly-loaded autoinstall module. repoDir +// must already exist on disk because the real isInstalled / install code paths +// read and write to it. const createContext = ( + repoDir: string, autoinstallOpts: AutoinstallOptions = {}, jobs?: Partial[], customWhitelist?: RegExp[] @@ -61,11 +57,12 @@ const createContext = ( options: { logger, whitelist: customWhitelist || whitelist, - repoDir: 'tmp/repo', - + repoDir, // @ts-ignore autoinstall: { - ...mockAutoinstallOpts, + skipRepoValidation: true, + lockRepo: false, + versionLookup: async () => '2.0.0', ...autoinstallOpts, }, }, @@ -75,50 +72,24 @@ test.afterEach(() => { logger._reset(); }); -test('Autoinstall basically works', async (t) => { - const autoinstallOpts = { - handleInstall: mockHandleInstall, - handleIsInstalled: async () => false, - }; - const context = createContext(autoinstallOpts); - - const paths = await autoinstall(context); - t.log(paths); - t.deepEqual(paths, { - '@openfn/language-common@1.0.0': { - path: 'tmp/repo/node_modules/@openfn/language-common_1.0.0', - version: '1.0.0', - }, - }); -}); - -test('mock is installed: should be installed', async (t) => { - const isInstalled = mockIsInstalled({ - name: 'repo', - dependencies: { - 'x_1.0.0': 'path', - }, - }); - - const result = await isInstalled('x@1.0.0'); - t.true(result); -}); - -test('mock is installed: should not be installed', async (t) => { - const isInstalled = mockIsInstalled({ - name: 'repo', - dependencies: { - 'x_1.0.0': 'path', - }, - }); - - const result = await isInstalled('x@1.0.1'); - t.false(result); -}); - -test('mock install: should return async', async (t) => { - await mockHandleInstall('x@1.0.0'); - t.true(true); +test.serial('Autoinstall basically works', async (t) => { + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); + + const paths = await mod.default(context); + t.deepEqual(paths, { + '@openfn/language-common@1.0.0': { + path: `${repoDir}/node_modules/@openfn/language-common_1.0.0`, + version: '1.0.0', + }, + }); + t.is(stub.calls.length, 1); + } finally { + await removeTmpRepo(repoDir); + } }); test('identifyAdaptors: pick out adaptors and remove duplicates', (t) => { @@ -148,492 +119,582 @@ test('identifyAdaptors: pick out adaptors and remove duplicates', (t) => { }); test.serial('autoinstall: handle @latest', async (t) => { - const jobs = [ - { - adaptors: ['x@latest'], - }, - ]; - - const context = createContext({}, jobs, [/x/]); + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + const jobs = [{ adaptors: ['x@latest'] }]; + const context = createContext(repoDir, {}, jobs, [/x/]); - const result = await autoinstall(context); + const result = await mod.default(context); - t.deepEqual(result, { - 'x@latest': { - path: 'tmp/repo/node_modules/x_2.0.0', - version: '2.0.0', - }, - }); + t.deepEqual(result, { + 'x@latest': { + path: `${repoDir}/node_modules/x_2.0.0`, + version: '2.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: handle @next', async (t) => { - const jobs = [ - { - adaptors: ['x@next'], - }, - ]; - - const context = createContext({}, jobs, [/x/]); + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + const jobs = [{ adaptors: ['x@next'] }]; + const context = createContext(repoDir, {}, jobs, [/x/]); - const result = await autoinstall(context); + const result = await mod.default(context); - t.deepEqual(result, { - 'x@next': { - path: 'tmp/repo/node_modules/x_2.0.0', - version: '2.0.0', - }, - }); + t.deepEqual(result, { + 'x@next': { + path: `${repoDir}/node_modules/x_2.0.0`, + version: '2.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); -// This doesn't do anything except check that the mocks are installed -test.serial('autoinstall: should call both mock functions', async (t) => { - let didCallIsInstalled = false; - let didCallInstall = true; - - const mockIsInstalled = async () => { - didCallIsInstalled = true; - return false; - }; - const mockInstall = async () => { - didCallInstall = true; - return; - }; - - const autoinstallOpts = { - handleInstall: mockInstall, - handleIsInstalled: mockIsInstalled, - }; - const context = createContext(autoinstallOpts); +test.serial('autoinstall: install is invoked exactly once per adaptor', async (t) => { + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - await autoinstall(context); + await mod.default(context); - t.true(didCallIsInstalled); - t.true(didCallInstall); + t.is(stub.calls.length, 1); + t.is(stub.calls[0], '@openfn/language-common@1.0.0'); + } finally { + await removeTmpRepo(repoDir); + } }); -// TODO -// error handling -// Queue for multiple installs -// Queue for multiple installs of the same version -// don't autoinstall if it's already there - test.serial( 'autoinstall: only call install once if there are two concurrent install requests', async (t) => { - let callCount = 0; - - const installed: Record = {}; - - const mockInstall = (name: string) => - new Promise((resolve) => { - installed[name] = true; - callCount++; - setTimeout(() => resolve(), 20); - }); - - const options = { - skipRepoValidation: true, - handleInstall: mockInstall, - handleIsInstalled: async (name: string) => name in installed, - }; - - const context = createContext(options); - - await Promise.all([autoinstall(context), autoinstall(context)]); - - t.is(callCount, 1); + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true, delayMs: 20 }); + const mod = await loadAutoinstall(stub.fn); + const c1 = createContext(repoDir); + const c2 = createContext(repoDir); + + await Promise.all([mod.default(c1), mod.default(c2)]); + + t.is(stub.calls.length, 1); + } finally { + await removeTmpRepo(repoDir); + } } ); test.serial('autoinstall: install in sequence', async (t) => { - const installed: Record = {}; - - const states: Record = {}; - - const mockInstall = (name: string) => - new Promise((resolve) => { - // Each time install is called, - // record the time the call was made - // and the install state - states[name] = { - time: new Date().getTime(), - installed: Object.keys(installed).map((s) => s.split('common@')[1]), - }; - installed[name] = true; - setTimeout(() => resolve(), 50); + const repoDir = await createTmpRepo(); + try { + const states: Record = {}; + const installed: string[] = []; + const stub = createInstallStub({ + seedOnInstall: true, + delayMs: 50, + onCall: (name) => { + states[name] = { + time: Date.now(), + installed: [...installed].map((s) => s.split('common@')[1]), + }; + installed.push(name); + }, }); - - const options = { - skipRepoValidation: true, - handleInstall: mockInstall, - handleIsInstalled: false, - } as any; - - const c1 = createContext(options, [ - { adaptors: ['@openfn/language-common@1'] }, - ]); - const c2 = createContext(options, [ - { adaptors: ['@openfn/language-common@2'] }, - ]); - const c3 = createContext(options, [ - { adaptors: ['@openfn/language-common@3'] }, - ]); - - autoinstall(c1); - await wait(1); - autoinstall(c2); - await wait(1); - await autoinstall(c3); - - const s1 = states['@openfn/language-common@1']; - const s2 = states['@openfn/language-common@2']; - const s3 = states['@openfn/language-common@3']; - - // Check that modules are installed in sequence - t.deepEqual(s1.installed, []); - t.deepEqual(s2.installed, ['1']); - t.deepEqual(s3.installed, ['1', '2']); - - // And check for a time gap between installs - t.true(s3.time - s2.time > 40); - t.true(s2.time - s1.time > 40); + const mod = await loadAutoinstall(stub.fn); + + const c1 = createContext(repoDir, {}, [ + { adaptors: ['@openfn/language-common@1'] }, + ]); + const c2 = createContext(repoDir, {}, [ + { adaptors: ['@openfn/language-common@2'] }, + ]); + const c3 = createContext(repoDir, {}, [ + { adaptors: ['@openfn/language-common@3'] }, + ]); + + mod.default(c1); + await wait(1); + mod.default(c2); + await wait(1); + await mod.default(c3); + + const s1 = states['@openfn/language-common@1']; + const s2 = states['@openfn/language-common@2']; + const s3 = states['@openfn/language-common@3']; + + // Check that modules are installed in sequence + t.deepEqual(s1.installed, []); + t.deepEqual(s2.installed, ['1']); + t.deepEqual(s3.installed, ['1', '2']); + + // And check for a time gap between installs + t.true(s3.time - s2.time > 40); + t.true(s2.time - s1.time > 40); + } finally { + await removeTmpRepo(repoDir); + } }); test('autoinstall: handle two seperate, non-overlapping installs', async (t) => { - const options = { - handleInstall: mockHandleInstall, - handleIsInstalled: async () => false, - }; - - const c1 = createContext(options, [ - { adaptors: ['@openfn/language-dhis2@1.0.0'] }, - ]); - const c2 = createContext(options, [ - { adaptors: ['@openfn/language-http@1.0.0'] }, - ]); - - const p1 = await autoinstall(c1); - t.deepEqual(p1, { - '@openfn/language-dhis2@1.0.0': { - path: 'tmp/repo/node_modules/@openfn/language-dhis2_1.0.0', - version: '1.0.0', - }, - }); + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + + const c1 = createContext(repoDir, {}, [ + { adaptors: ['@openfn/language-dhis2@1.0.0'] }, + ]); + const c2 = createContext(repoDir, {}, [ + { adaptors: ['@openfn/language-http@1.0.0'] }, + ]); + + const p1 = await mod.default(c1); + t.deepEqual(p1, { + '@openfn/language-dhis2@1.0.0': { + path: `${repoDir}/node_modules/@openfn/language-dhis2_1.0.0`, + version: '1.0.0', + }, + }); - const p2 = await autoinstall(c2); - t.deepEqual(p2, { - '@openfn/language-http@1.0.0': { - path: 'tmp/repo/node_modules/@openfn/language-http_1.0.0', - version: '1.0.0', - }, - }); + const p2 = await mod.default(c2); + t.deepEqual(p2, { + '@openfn/language-http@1.0.0': { + path: `${repoDir}/node_modules/@openfn/language-http_1.0.0`, + version: '1.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial( 'autoinstall: do not try to install blacklisted modules', async (t) => { - let callCount = 0; - - const mockInstall = () => - new Promise((resolve) => { - callCount++; - setTimeout(() => resolve(), 20); - }); - - const job = [ - { - adaptors: ['lodash@1.0.0'], - }, - ]; - - const options = { - skipRepoValidation: true, - handleInstall: mockInstall, - handleIsInstalled: async () => false, - }; + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); - const context = createContext(options, job); + const job = [{ adaptors: ['lodash@1.0.0'] }]; + const context = createContext(repoDir, {}, job); - await autoinstall(context); + await mod.default(context); - t.is(callCount, 0); + t.is(stub.calls.length, 0); + } finally { + await removeTmpRepo(repoDir); + } } ); test.serial('autoinstall: return a map to modules', async (t) => { - const jobs = [ - { - adaptors: ['@openfn/language-common@1.0.0'], - }, - { - adaptors: ['@openfn/language-http@1.0.0'], - }, - ]; + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); - const autoinstallOpts = { - skipRepoValidation: true, - handleInstall: async () => {}, - handleIsInstalled: async () => false, - }; - const context = createContext(autoinstallOpts, jobs); + const jobs = [ + { adaptors: ['@openfn/language-common@1.0.0'] }, + { adaptors: ['@openfn/language-http@1.0.0'] }, + ]; + const context = createContext(repoDir, {}, jobs); - const result = await autoinstall(context); + const result = await mod.default(context); - t.deepEqual(result, { - '@openfn/language-common@1.0.0': { - path: 'tmp/repo/node_modules/@openfn/language-common_1.0.0', - version: '1.0.0', - }, - '@openfn/language-http@1.0.0': { - path: 'tmp/repo/node_modules/@openfn/language-http_1.0.0', - version: '1.0.0', - }, - }); + t.deepEqual(result, { + '@openfn/language-common@1.0.0': { + path: `${repoDir}/node_modules/@openfn/language-common_1.0.0`, + version: '1.0.0', + }, + '@openfn/language-http@1.0.0': { + path: `${repoDir}/node_modules/@openfn/language-http_1.0.0`, + version: '1.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: write linker options back to the plan', async (t) => { - const jobs = [ - { - adaptors: ['@openfn/language-common@1.0.0'], - }, - { - adaptors: [ - '@openfn/language-common@2.0.0', - '@openfn/language-collections@1.0.0', - ], - }, - { - adaptors: ['@openfn/language-http@1.0.0'], - }, - ]; + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); - const autoinstallOpts = { - skipRepoValidation: true, - handleInstall: async () => {}, - handleIsInstalled: async () => false, - }; - const context = createContext(autoinstallOpts, jobs); + const jobs = [ + { adaptors: ['@openfn/language-common@1.0.0'] }, + { + adaptors: [ + '@openfn/language-common@2.0.0', + '@openfn/language-collections@1.0.0', + ], + }, + { adaptors: ['@openfn/language-http@1.0.0'] }, + ]; + const context = createContext(repoDir, {}, jobs); - await autoinstall(context); + await mod.default(context); - const [a, b, c] = context.state.plan.workflow.steps as Job[]; - t.deepEqual(a.linker, { - '@openfn/language-common': { - path: 'tmp/repo/node_modules/@openfn/language-common_1.0.0', - version: '1.0.0', - }, - }); - t.deepEqual(b.linker, { - '@openfn/language-common': { - path: 'tmp/repo/node_modules/@openfn/language-common_2.0.0', - version: '2.0.0', - }, - '@openfn/language-collections': { - path: 'tmp/repo/node_modules/@openfn/language-collections_1.0.0', - version: '1.0.0', - }, - }); - t.deepEqual(c.linker, { - '@openfn/language-http': { - path: 'tmp/repo/node_modules/@openfn/language-http_1.0.0', - version: '1.0.0', - }, - }); + const [a, b, c] = context.state.plan.workflow.steps as Job[]; + t.deepEqual(a.linker, { + '@openfn/language-common': { + path: `${repoDir}/node_modules/@openfn/language-common_1.0.0`, + version: '1.0.0', + }, + }); + t.deepEqual(b.linker, { + '@openfn/language-common': { + path: `${repoDir}/node_modules/@openfn/language-common_2.0.0`, + version: '2.0.0', + }, + '@openfn/language-collections': { + path: `${repoDir}/node_modules/@openfn/language-collections_1.0.0`, + version: '1.0.0', + }, + }); + t.deepEqual(c.linker, { + '@openfn/language-http': { + path: `${repoDir}/node_modules/@openfn/language-http_1.0.0`, + version: '1.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: support custom whitelist', async (t) => { - const whitelist = [/^y/]; - const jobs = [ - { + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + + const customWhitelist = [/^y/]; + const jobs = [ // will be ignored - adaptors: ['x@1.0.0'], - }, - { + { adaptors: ['x@1.0.0'] }, // will be installed - adaptors: ['y@1.0.0'], - }, - ]; - - const autoinstallOpts = { - skipRepoValidation: true, - handleInstall: async () => {}, - handleIsInstalled: async () => false, - }; - const context = createContext(autoinstallOpts, jobs, whitelist); + { adaptors: ['y@1.0.0'] }, + ]; + const context = createContext(repoDir, {}, jobs, customWhitelist); - const result = await autoinstall(context); + const result = await mod.default(context); - t.deepEqual(result, { - 'y@1.0.0': { - path: 'tmp/repo/node_modules/y_1.0.0', - version: '1.0.0', - }, - }); + t.deepEqual(result, { + 'y@1.0.0': { + path: `${repoDir}/node_modules/y_1.0.0`, + version: '1.0.0', + }, + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: emit an event on completion', async (t) => { - let event: any; - const jobs = [ - { - adaptors: ['@openfn/language-common@1.0.0'], - version: '1.0.0', - }, - ]; + const repoDir = await createTmpRepo(); + try { + let event: any; + const stub = createInstallStub({ seedOnInstall: true, delayMs: 50 }); + const mod = await loadAutoinstall(stub.fn); - const autoinstallOpts = { - skipRepoValidation: true, - handleInstall: async () => new Promise((done) => setTimeout(done, 50)), - handleIsInstalled: async () => false, - } as any; - const context = createContext(autoinstallOpts, jobs); + const jobs = [ + { adaptors: ['@openfn/language-common@1.0.0'], version: '1.0.0' }, + ]; + const context = createContext(repoDir, {}, jobs); - context.on(AUTOINSTALL_COMPLETE, (evt) => { - event = evt; - }); + context.on(AUTOINSTALL_COMPLETE, (evt) => { + event = evt; + }); - await autoinstall(context); + await mod.default(context); - t.truthy(event); - t.is(event.module, '@openfn/language-common'); - t.is(event.version, '1.0.0'); - // Duration could be anything really as timeout is only loose - but so long as it's - // more than 10ms that implies it's called handleInstall and returned a reasonable value - t.assert(event.duration >= 10); + t.truthy(event); + t.is(event.module, '@openfn/language-common'); + t.is(event.version, '1.0.0'); + t.assert(event.duration >= 10); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: throw on error', async (t) => { - const mockIsInstalled = async () => false; - const mockInstall = async () => { - throw new Error('err'); - }; - - const autoinstallOpts = { - handleInstall: mockInstall, - handleIsInstalled: mockIsInstalled, - }; - const context = createContext(autoinstallOpts); + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ throws: new Error('err') }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - await t.throwsAsync(() => autoinstall(context), { - name: 'AutoinstallError', - message: 'Error installing @openfn/language-common@1.0.0: err', - }); + await t.throwsAsync(() => mod.default(context), { + name: 'AutoinstallError', + message: 'Error installing @openfn/language-common@1.0.0: err', + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: throw on error twice if pending', async (t) => { - return new Promise((done) => { - let callCount = 0; - let errCount = 0; - const mockIsInstalled = async () => false; - const mockInstall = async () => { - callCount++; - return new Promise((_resolve, reject) => { - setTimeout(() => reject(new Error('err')), 10); + const repoDir = await createTmpRepo(); + try { + return await new Promise(async (done) => { + let callCount = 0; + let errCount = 0; + const stub = createInstallStub({ + delayMs: 10, + onCall: () => { + callCount++; + }, + throws: () => new Error('err'), }); - }; - - const autoinstallOpts = { - handleInstall: mockInstall, - handleIsInstalled: mockIsInstalled, - } as any; - const context = createContext(autoinstallOpts); - - autoinstall(context).catch(assertCatches); - - autoinstall(context).catch(assertCatches); - - function assertCatches(e: any) { - t.is(e.name, 'AutoinstallError'); - errCount += 1; - if (errCount === 2) { - t.is(callCount, 2); - t.pass('threw twice!'); - done(); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); + + mod.default(context).catch(assertCatches); + mod.default(context).catch(assertCatches); + + function assertCatches(e: any) { + t.is(e.name, 'AutoinstallError'); + errCount += 1; + if (errCount === 2) { + t.is(callCount, 2); + t.pass('threw twice!'); + done(); + } } - } - }); + }); + } finally { + await removeTmpRepo(repoDir); + } }); test.serial('autoinstall: emit on error', async (t) => { - let evt: any; - const mockIsInstalled = async () => false; - const mockInstall = async () => { - throw new Error('err'); - }; + const repoDir = await createTmpRepo(); + try { + let evt: any; + const stub = createInstallStub({ throws: new Error('err') }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - const autoinstallOpts = { - handleInstall: mockInstall, - handleIsInstalled: mockIsInstalled, - }; - const context = createContext(autoinstallOpts); + context.on(AUTOINSTALL_ERROR, (e) => { + evt = e; + }); - context.on(AUTOINSTALL_ERROR, (e) => { - evt = e; - }); + try { + await mod.default(context); + } catch (e) { + // do nothing + } - try { - await autoinstall(context); - } catch (e) { - // do nothing + t.is(evt.module, '@openfn/language-common'); + t.is(evt.version, '1.0.0'); + t.is(evt.message, 'err'); + t.true(!isNaN(evt.duration)); + } finally { + await removeTmpRepo(repoDir); } - - t.is(evt.module, '@openfn/language-common'); - t.is(evt.version, '1.0.0'); - t.is(evt.message, 'err'); - t.true(!isNaN(evt.duration)); }); test.serial('autoinstall: throw twice in a row', async (t) => { - let callCount = 0; + const repoDir = await createTmpRepo(); + try { + let callCount = 0; + const stub = createInstallStub({ + delayMs: 1, + onCall: () => { + callCount++; + }, + throws: () => new Error('err'), + }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - const mockIsInstalled = async () => false; - const mockInstall = async () => { - callCount++; - return new Promise((_resolve, reject) => { - setTimeout(() => reject(new Error('err')), 1); + await t.throwsAsync(() => mod.default(context), { + name: 'AutoinstallError', + message: 'Error installing @openfn/language-common@1.0.0: err', }); - }; + t.is(callCount, 1); - const autoinstallOpts = { - handleInstall: mockInstall, - handleIsInstalled: mockIsInstalled, - } as any; - const context = createContext(autoinstallOpts); + await t.throwsAsync(() => mod.default(context), { + name: 'AutoinstallError', + message: 'Error installing @openfn/language-common@1.0.0: err', + }); + t.is(callCount, 2); + } finally { + await removeTmpRepo(repoDir); + } +}); - await t.throwsAsync(() => autoinstall(context), { - name: 'AutoinstallError', - message: 'Error installing @openfn/language-common@1.0.0: err', - }); - t.is(callCount, 1); +test.serial('write versions to context', async (t) => { + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true }); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - await t.throwsAsync(() => autoinstall(context), { - name: 'AutoinstallError', - message: 'Error installing @openfn/language-common@1.0.0: err', - }); - t.is(callCount, 2); + await mod.default(context); + + // @ts-ignore + t.deepEqual(context.versions['@openfn/language-common'], ['1.0.0']); + } finally { + await removeTmpRepo(repoDir); + } }); -test('write versions to context', async (t) => { - const autoinstallOpts = { - handleInstall: mockHandleInstall, - handleIsInstalled: async () => false, - }; - const context = createContext(autoinstallOpts); +test.serial( + "write versions to context even if we don't install", + async (t) => { + const repoDir = await createTmpRepo(); + try { + // Pre-seed installed state so isInstalled returns true and install is skipped. + await markInstalled(repoDir, '@openfn/language-common@1.0.0'); + const stub = createInstallStub(); + const mod = await loadAutoinstall(stub.fn); + const context = createContext(repoDir); - await autoinstall(context); + await mod.default(context); - // @ts-ignore - t.deepEqual(context.versions['@openfn/language-common'], ['1.0.0']); -}); + t.is(stub.calls.length, 0); + // @ts-ignore + t.deepEqual(context.versions['@openfn/language-common'], ['1.0.0']); + } finally { + await removeTmpRepo(repoDir); + } + } +); -test("write versions to context even if we don't install", async (t) => { - const autoinstallOpts = { - handleInstall: mockHandleInstall, - handleIsInstalled: async () => true, - }; - const context = createContext(autoinstallOpts); +// ---- strengthened isInstalled: also checks node_modules//package.json ---- - await autoinstall(context); +const SPECIFIER = '@openfn/language-http@6.5.0'; +const ALIAS = '@openfn/language-http_6.5.0'; - // @ts-ignore - t.deepEqual(context.versions['@openfn/language-common'], ['1.0.0']); -}); +const writeRepoPkg = async (dir: string, deps: Record) => { + await writeFile( + path.join(dir, 'package.json'), + JSON.stringify({ name: 'repo', dependencies: deps }) + ); +}; + +const seedNodeModule = async (dir: string, alias = ALIAS) => { + const modDir = path.join(dir, 'node_modules', alias); + await mkdir(modDir, { recursive: true }); + await writeFile(path.join(modDir, 'package.json'), '{}'); +}; + +test.serial( + 'isInstalled returns false when alias is in repo deps but node_modules pkg is missing', + async (t) => { + const dir = await createTmpRepo(); + try { + await writeRepoPkg(dir, { [ALIAS]: '6.5.0' }); + const result = await isInstalled(SPECIFIER, dir, logger); + t.false(result as boolean); + } finally { + await rm(dir, { recursive: true, force: true }); + } + } +); + +test.serial( + 'isInstalled returns true when alias is in deps AND node_modules pkg exists', + async (t) => { + const dir = await createTmpRepo(); + try { + await writeRepoPkg(dir, { [ALIAS]: '6.5.0' }); + await seedNodeModule(dir); + const result = await isInstalled(SPECIFIER, dir, logger); + t.true(result as boolean); + } finally { + await rm(dir, { recursive: true, force: true }); + } + } +); + +test.serial( + 'isInstalled returns false when alias is not in repo deps', + async (t) => { + const dir = await createTmpRepo(); + try { + await writeRepoPkg(dir, {}); + // node_modules exists but dep entry doesn't — half-installed. + await seedNodeModule(dir); + const result = await isInstalled(SPECIFIER, dir, logger); + t.false(result as boolean); + } finally { + await rm(dir, { recursive: true, force: true }); + } + } +); + +// ---- concurrent autoinstall: lock + in-lock re-check ---- + +// Drives autoinstall against a real tmp repoDir with the lock engaged. +// Spawns two concurrent install attempts; the lock + in-lock isInstalled +// re-check ensures only one actually invokes install. +test.serial( + 'autoinstall: lock + in-lock re-check skips duplicate installFn invocations', + async (t) => { + const repoDir = await createTmpRepo(); + try { + const stub = createInstallStub({ seedOnInstall: true, delayMs: 50 }); + const mod = await loadAutoinstall(stub.fn); + + // Build contexts inline so we can set lockRepo: true (default is false in + // createContext to keep unrelated tests fast). + const makeContext = () => + new ExecutionContext({ + state: { + id: 'x', + status: 'pending', + plan: { + workflow: { + steps: [ + { + adaptors: ['@openfn/language-common@1.0.0'], + expression: '.', + }, + ], + }, + options: {}, + }, + input: {}, + }, + logger, + // @ts-ignore + callWorker: async () => {}, + options: { + logger, + whitelist, + repoDir, + // @ts-ignore + autoinstall: { + skipRepoValidation: true, + // lock on by default — exercise the lock path here + }, + }, + }); + + const c1 = makeContext(); + const c2 = makeContext(); + + await Promise.all([mod.default(c1), mod.default(c2)]); + + t.is(stub.calls.length, 1, 'install should run exactly once across both contexts'); + } finally { + await removeTmpRepo(repoDir); + } + } +); diff --git a/packages/engine-multi/test/api/execute.test.ts b/packages/engine-multi/test/api/execute.test.ts index 8ec2a9ea6..95188b1e4 100644 --- a/packages/engine-multi/test/api/execute.test.ts +++ b/packages/engine-multi/test/api/execute.test.ts @@ -62,8 +62,8 @@ const plan = { const options = { autoinstall: { - handleInstall: async () => {}, - handleIsInstalled: async () => false, + // No adaptors in these plans, so autoinstall has nothing to do. + skipRepoValidation: true, }, } as Partial; diff --git a/packages/engine-multi/test/engine.test.ts b/packages/engine-multi/test/engine.test.ts index a48e13046..8a07153b2 100644 --- a/packages/engine-multi/test/engine.test.ts +++ b/packages/engine-multi/test/engine.test.ts @@ -16,7 +16,9 @@ const options = { logger, repoDir: '.', // doesn't matter for the mock autoinstall: { - handleIsInstalled: async () => true, + // No adaptors are used in these tests so the install path is never hit; + // skipping repo validation keeps autoinstall from touching the cwd. + skipRepoValidation: true, }, }; diff --git a/packages/engine-multi/test/errors.test.ts b/packages/engine-multi/test/errors.test.ts index b904939f7..d4e8e3fe0 100644 --- a/packages/engine-multi/test/errors.test.ts +++ b/packages/engine-multi/test/errors.test.ts @@ -15,8 +15,9 @@ test.before(async () => { logger, repoDir: path.resolve('./test/__repo__'), autoinstall: { - // disable autoinstall - handleIsInstalled: async () => true, + // Fixture repo has the helper adaptor pre-installed; skip repo validation + // so autoinstall never tries to rewrite the fixture's package.json. + skipRepoValidation: true, }, maxWorkers: 1, memoryLimitMb: 200, diff --git a/packages/engine-multi/test/util/autoinstall-helpers.ts b/packages/engine-multi/test/util/autoinstall-helpers.ts new file mode 100644 index 000000000..65a67c6ab --- /dev/null +++ b/packages/engine-multi/test/util/autoinstall-helpers.ts @@ -0,0 +1,89 @@ +import path from 'node:path'; +import os from 'node:os'; +import { mkdir, mkdtemp, rm, writeFile, readFile } from 'node:fs/promises'; +import esmock from 'esmock'; +import * as runtime from '@openfn/runtime'; +import { getAliasedName, getNameAndVersion } from '@openfn/runtime'; + +// Helpers for autoinstall tests. We exercise the real `isInstalled` by +// seeding a tmp repoDir on disk; the engine's autoinstall reads the repo's +// package.json and stats node_modules//package.json. + +export const createTmpRepo = async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'engine-autoinstall-')); + await writeFile( + path.join(dir, 'package.json'), + JSON.stringify({ name: 'repo', dependencies: {} }) + ); + return dir; +}; + +export const removeTmpRepo = (dir: string) => + rm(dir, { recursive: true, force: true }); + +// Mark a specifier as installed by writing the dep into package.json and +// seeding node_modules//package.json. The alias matches @openfn/runtime's +// getAliasedName output. +export const markInstalled = async (repoDir: string, specifier: string) => { + const alias = getAliasedName(specifier); + const pkgPath = path.join(repoDir, 'package.json'); + const raw = await readFile(pkgPath, 'utf8'); + const pkg = JSON.parse(raw); + pkg.dependencies = pkg.dependencies || {}; + const { version } = getNameAndVersion(specifier); + pkg.dependencies[alias] = version || '*'; + await writeFile(pkgPath, JSON.stringify(pkg)); + const modDir = path.join(repoDir, 'node_modules', alias); + await mkdir(modDir, { recursive: true }); + await writeFile(path.join(modDir, 'package.json'), '{}'); +}; + +export type InstallImpl = ( + specifiers: string[], + repoDir: string, + logger: any +) => Promise; + +export type AutoinstallModule = typeof import('../../src/api/autoinstall'); + +// Load autoinstall with @openfn/runtime's install swapped out so tests can +// control install behaviour without touching npm. Each call returns a fresh +// module so module-level queue/busy state is isolated between tests. +export const loadAutoinstall = async ( + installImpl: InstallImpl +): Promise => + esmock('../../src/api/autoinstall.ts', { + '@openfn/runtime': { + ...runtime, + install: installImpl, + }, + }); + +// Returns a stubbed install + counters. Pass `seedOnInstall: true` to write +// node_modules//package.json + repo dep entry so the real `isInstalled` +// returns true after the stub runs. +export const createInstallStub = (opts: { + seedOnInstall?: boolean; + delayMs?: number; + onCall?: (specifier: string) => void; + throws?: Error | (() => Error); +} = {}) => { + const calls: string[] = []; + const fn: InstallImpl = async (specifiers, repoDir) => { + for (const s of specifiers) { + calls.push(s); + opts.onCall?.(s); + if (opts.delayMs) { + await new Promise((r) => setTimeout(r, opts.delayMs)); + } + if (opts.throws) { + throw typeof opts.throws === 'function' ? opts.throws() : opts.throws; + } + if (opts.seedOnInstall) { + await markInstalled(repoDir, s); + } + } + }; + return { fn, calls }; +}; + diff --git a/packages/ws-worker/test/util/repo-lock-worker.ts b/packages/engine-multi/test/util/repo-lock-worker.ts similarity index 52% rename from packages/ws-worker/test/util/repo-lock-worker.ts rename to packages/engine-multi/test/util/repo-lock-worker.ts index 837e024fc..f05537c56 100644 --- a/packages/ws-worker/test/util/repo-lock-worker.ts +++ b/packages/engine-multi/test/util/repo-lock-worker.ts @@ -1,31 +1,36 @@ /** - * Worker script for cross-process repo-lock tests. + * Worker script for cross-process repo-lock tests in engine-multi. * - * Config is supplied via environment variables: + * Drives `withInstallLock` directly (the new public lock surface) rather than + * any wrapping handler. The install marker is `node_modules//package.json` + * — npm's per-package extraction is atomic so its presence is sufficient proof + * of a successful install. + * + * Config (env vars): * REPO_DIR - path to the shared tmpdir * SPECIFIER - package specifier to install * WORKER_MODE - one of: race | double-check | partial | default - * INSTALL_DELAY - ms to sleep inside installFn (default 0) - * INSTALL_FAIL - if '1', installFn throws - * INSTALL_NOOP - if '1', installFn resolves immediately without seeding pkg + * INSTALL_DELAY - ms to sleep inside the install fn (default 0) + * INSTALL_FAIL - if '1', the install fn throws + * INSTALL_NOOP - if '1', resolves without seeding node_modules * - * IPC messages the worker sends to parent: - * { event: 'ready' } - worker is alive and module loaded - * { event: 'install-start', t: Date.now() } - installFn just started - * { event: 'lock-acquired' } - inside the lock, before calling installFn - * { event: 'lock-attempting', t: Date.now() } - about to call handleInstall (entering lock path) - * { event: 'done', installCount, skipped?, wasInstalled?, error? } + * IPC out -> parent: + * { event: 'ready' } - module loaded + * { event: 'install-start', t } - inside install fn + * { event: 'lock-acquired' } - inside lock, before work (double-check) + * { event: 'lock-attempting', t } - about to acquire lock + * { event: 'done', installCount, skipped?, wasInstalled?, installedAfter?, error? } * - * IPC messages the worker expects from parent (for barriers): - * { event: 'go' } - start the handleInstall call (race scenario) - * { event: 'proceed' } - complete the installFn (double-check scenario) + * IPC in <- parent: + * { event: 'go' } - start the lock attempt (race) + * { event: 'proceed' } - complete the install fn (double-check) */ import path from 'node:path'; -import { mkdir, writeFile } from 'node:fs/promises'; -import { createMockLogger, Logger } from '@openfn/logger'; +import { mkdir, writeFile, stat } from 'node:fs/promises'; +import { createMockLogger } from '@openfn/logger'; import { getAliasedName } from '@openfn/runtime'; -import { createLockedHandlers } from '../../src/util/repo-lock.js'; +import { withInstallLock } from '../../src/util/repo-lock.js'; const repoDir = process.env.REPO_DIR!; const specifier = process.env.SPECIFIER!; @@ -41,7 +46,6 @@ const send = (msg: Record) => { if (process.send) process.send(msg); }; -// Wait for a specific IPC message from the parent. const waitForMessage = (eventName: string): Promise => new Promise((resolve) => { const handler = (msg: any) => { @@ -59,21 +63,26 @@ const seedModule = async () => { await writeFile(path.join(dir, 'package.json'), '{}'); }; +const moduleInstalled = async () => { + try { + await stat(path.join(repoDir, 'node_modules', alias, 'package.json')); + return true; + } catch { + return false; + } +}; + const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); async function main() { let installCount = 0; - // In double-check mode the installFn waits for a 'proceed' message so the - // parent can guarantee ordering: A holds lock -> B queues -> A finishes -> B skips. - let installFn: (specifier: string, repoDir: string, logger: Logger) => Promise; + let installFn: () => Promise; if (workerMode === 'double-check') { - // Child A: acquire lock, signal parent, wait for 'proceed', then complete. + // Acquire lock, signal parent, wait for 'proceed', then complete. installFn = async () => { installCount++; - // Register the listener before signalling the parent so 'proceed' cannot - // arrive before the handler is attached. const proceedPromise = waitForMessage('proceed'); send({ event: 'lock-acquired' }); await proceedPromise; @@ -94,44 +103,43 @@ async function main() { }; } - const { handleInstall, handleIsInstalled } = createLockedHandlers( - repoDir, - installFn - ); + // Wrap fn with the same double-check pattern autoinstall uses inside the lock: + // re-check installed state before running the install fn. + const lockedInstall = async () => { + await withInstallLock(repoDir, alias, logger, async () => { + if (await moduleInstalled()) { + return; + } + await installFn(); + }); + }; - // Build the barrier promise BEFORE sending 'ready' so that if the parent - // replies instantly the message isn't missed before the listener is attached. const goPromise = workerMode === 'race' ? waitForMessage('go') : null; - // Signal parent that the module is imported and we are ready. send({ event: 'ready' }); - // In race mode, wait for the parent's 'go' before calling handleInstall. - // This ensures all workers start as close together as possible. if (goPromise) { await goPromise; } if (workerMode === 'partial') { - // Scenario 6: check handleIsInstalled, then install, then check again. - const wasInstalled = await handleIsInstalled(specifier, repoDir, logger); + const wasInstalled = await moduleInstalled(); send({ event: 'lock-attempting', t: Date.now() }); try { - await handleInstall(specifier, repoDir, logger); + await lockedInstall(); } catch (e: any) { send({ event: 'done', wasInstalled, installCount, error: e.message }); return; } - const installedAfter = await handleIsInstalled(specifier, repoDir, logger); + const installedAfter = await moduleInstalled(); send({ event: 'done', wasInstalled, installCount, installedAfter }); return; } - // Signal parent that this worker is about to enter the lock path. send({ event: 'lock-attempting', t: Date.now() }); try { - await handleInstall(specifier, repoDir, logger); + await lockedInstall(); const skipped = installCount === 0; send({ event: 'done', installCount, skipped }); } catch (e: any) { diff --git a/packages/engine-multi/test/util/repo-lock.test.ts b/packages/engine-multi/test/util/repo-lock.test.ts new file mode 100644 index 000000000..2b54aa3e5 --- /dev/null +++ b/packages/engine-multi/test/util/repo-lock.test.ts @@ -0,0 +1,327 @@ +/** + * Cross-process integration tests for withInstallLock. + * + * Each test forks real child processes pointing at the same tmpdir to exercise + * the proper-lockfile filesystem lock path that in-process tests cannot reach. + * IPC barriers keep ordering deterministic. + */ + +import test from 'ava'; +import path from 'node:path'; +import os from 'node:os'; +import { mkdir, mkdtemp, rm, stat } from 'node:fs/promises'; +import { fork, type ChildProcess } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const WORKER_SCRIPT = path.resolve(__dirname, 'repo-lock-worker.ts'); + +const EXEC_ARGV = ['--import=@swc-node/register/esm-register', '--no-warnings']; + +const SPECIFIER_A = '@openfn/language-http@6.5.0'; +const ALIAS_A = '@openfn/language-http_6.5.0'; + +const fileExists = async (p: string) => { + try { + await stat(p); + return true; + } catch { + return false; + } +}; + +interface WorkerMessage { + event: string; + [key: string]: unknown; +} + +function spawnWorker( + env: Record +): { child: ChildProcess; messages: WorkerMessage[] } { + const messages: WorkerMessage[] = []; + + const child = fork(WORKER_SCRIPT, [], { + execArgv: EXEC_ARGV, + env: { ...process.env, ...env }, + }); + + child.on('message', (msg: any) => { + messages.push(msg as WorkerMessage); + }); + + return { child, messages }; +} + +function waitForEvent( + worker: { child: ChildProcess; messages: WorkerMessage[] }, + eventName: string, + timeoutMs = 8000 +): Promise { + const buffered = worker.messages.find((m) => m.event === eventName); + if (buffered) return Promise.resolve(buffered); + + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error(`Timed out waiting for event '${eventName}'`)), + timeoutMs + ); + const handler = (msg: any) => { + if (msg?.event === eventName) { + clearTimeout(timer); + worker.child.off('message', handler); + resolve(msg as WorkerMessage); + } + }; + worker.child.on('message', handler); + }); +} + +function collectDone( + children: ChildProcess[], + count: number, + timeoutMs = 8000 +): Promise { + return new Promise((resolve, reject) => { + const results: WorkerMessage[] = []; + const timer = setTimeout( + () => + reject( + new Error( + `Timed out waiting for ${count} done messages (got ${results.length})` + ) + ), + timeoutMs + ); + const handler = (msg: any) => { + if (msg?.event === 'done') { + results.push(msg); + if (results.length >= count) { + clearTimeout(timer); + for (const c of children) c.off('message', handler); + resolve(results); + } + } + }; + for (const c of children) c.on('message', handler); + }); +} + +function waitAllReady(children: ChildProcess[], timeoutMs = 8000): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => reject(new Error('Timed out waiting for all workers to be ready')), + timeoutMs + ); + let remaining = children.length; + for (const c of children) { + const handler = (msg: any) => { + if (msg?.event === 'ready') { + c.off('message', handler); + remaining--; + if (remaining === 0) { + clearTimeout(timer); + resolve(); + } + } + }; + c.on('message', handler); + } + }); +} + +test.beforeEach(async (t) => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'engine-multi-mp-lock-')); + t.context = { dir, children: [] as ChildProcess[] }; +}); + +test.afterEach.always(async (t) => { + const { dir, children } = t.context as { + dir: string; + children: ChildProcess[]; + }; + await Promise.all( + children.map((c) => { + if (c.exitCode !== null || c.killed) return Promise.resolve(); + return new Promise((r) => { + c.once('exit', () => r()); + c.kill(); + }); + }) + ); + await rm(dir, { recursive: true, force: true }); +}); + +function track(t: any, ...workers: ReturnType[]) { + const ctx = t.context as { children: ChildProcess[] }; + for (const w of workers) ctx.children.push(w.child); + return workers; +} + +// Scenario: race — multiple workers attempt the lock at the same time. +test.serial( + 'race: exactly one of N concurrent workers runs installFn', + async (t) => { + const { dir } = t.context as { dir: string }; + + const env = { + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'race', + INSTALL_DELAY: '500', + }; + + const workers = [spawnWorker(env), spawnWorker(env), spawnWorker(env)]; + track(t, ...workers); + + await waitAllReady(workers.map((w) => w.child)); + for (const { child } of workers) child.send({ event: 'go' }); + + const dones = await collectDone(workers.map((w) => w.child), 3); + + const totalInstalls = dones.reduce( + (s, m) => s + ((m.installCount as number) ?? 0), + 0 + ); + const installStartEvents = workers.flatMap((w) => + w.messages.filter((m) => m.event === 'install-start') + ); + + t.is(totalInstalls, 1, 'exactly one worker should have run installFn'); + t.is(installStartEvents.length, 1, 'exactly one install-start emitted'); + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')), + 'node_modules pkg must exist' + ); + } +); + +// Scenario: double-check — A holds the lock, B waits, A finishes, B re-checks and skips. +test.serial( + 'double-check: waiting worker skips install after winner finishes', + async (t) => { + const { dir } = t.context as { dir: string }; + + const wA = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'double-check', + }); + track(t, wA); + + await waitForEvent(wA, 'lock-acquired'); + + const wB = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, wB); + + await waitForEvent(wB, 'lock-attempting'); + + wA.child.send({ event: 'proceed' }); + + await collectDone([wA.child, wB.child], 2); + + const doneA = wA.messages.find((m) => m.event === 'done')!; + const doneB = wB.messages.find((m) => m.event === 'done')!; + + t.is(doneA.installCount, 1, 'worker A should have run installFn once'); + t.true(doneB.skipped as boolean, 'worker B should have skipped (re-check)'); + t.is(doneB.installCount, 0, 'worker B installCount should be 0'); + + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')) + ); + } +); + +// Scenario: partial — node_modules pkg absent triggers install; present afterwards. +test.serial( + 'partial: missing node_modules pkg triggers install; present afterwards', + async (t) => { + const { dir } = t.context as { dir: string }; + + await mkdir(path.join(dir, '.locks'), { recursive: true }); + + const w = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'partial', + }); + track(t, w); + + const done = await waitForEvent(w, 'done'); + + t.false( + done.wasInstalled as boolean, + 'should not be installed before the lock runs' + ); + t.is(done.installCount, 1, 'install should have run once'); + t.true( + done.installedAfter as boolean, + 'should be installed once install fn has seeded node_modules' + ); + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')) + ); + } +); + +// Scenario: default — single worker, no contention, just works. +test.serial('default: single worker installs cleanly', async (t) => { + const { dir } = t.context as { dir: string }; + + const w = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, w); + + const done = await waitForEvent(w, 'done'); + + t.falsy(done.error, `should complete without error, got: ${done.error}`); + t.is(done.installCount, 1, 'install should have run once'); + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')) + ); +}); + +// Sanity check: install failure releases the lockdir so retries are possible. +test.serial( + 'failure: install error releases the lock for a retry', + async (t) => { + const { dir } = t.context as { dir: string }; + + const wA = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + INSTALL_FAIL: '1', + }); + track(t, wA); + + const doneA = await waitForEvent(wA, 'done'); + t.truthy(doneA.error, 'child A should report an error'); + t.false( + await fileExists(path.join(dir, '.locks', `${ALIAS_A}.lock.lock`)), + 'proper-lockfile lockdir must be released after install throws' + ); + + const wB = spawnWorker({ + REPO_DIR: dir, + SPECIFIER: SPECIFIER_A, + WORKER_MODE: 'default', + }); + track(t, wB); + + const doneB = await waitForEvent(wB, 'done'); + t.falsy(doneB.error); + t.is(doneB.installCount, 1, 'second worker should have run install'); + t.true( + await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')) + ); + } +); diff --git a/packages/ws-worker/package.json b/packages/ws-worker/package.json index b7ea0f6d4..17b2466b0 100644 --- a/packages/ws-worker/package.json +++ b/packages/ws-worker/package.json @@ -36,7 +36,6 @@ "koa": "^2.16.4", "koa-logger": "^3.2.1", "phoenix": "1.7.10", - "proper-lockfile": "^4.1.2", "ws": "^8.19.0" }, "devDependencies": { @@ -46,9 +45,9 @@ "@types/node": "^18.19.130", "@types/nodemon": "1.19.3", "@types/phoenix": "^1.6.7", - "@types/proper-lockfile": "^4.1.4", "@types/yargs": "^17.0.35", "ava": "5.3.1", + "esmock": "^2.7.5", "nodemon": "3.1.14", "sentry-testkit": "^6.2.2", "tslib": "^2.8.1", diff --git a/packages/ws-worker/src/start.ts b/packages/ws-worker/src/start.ts index cdb9bdc19..0b301902d 100644 --- a/packages/ws-worker/src/start.ts +++ b/packages/ws-worker/src/start.ts @@ -5,7 +5,6 @@ import createMockRTE from './mock/runtime-engine'; import createWorker, { ServerOptions } from './server'; import cli from './util/cli'; import getDefaultWorkloopConfig from './util/get-default-workloop-config'; -import { createLockedHandlers } from './util/repo-lock'; const args = cli(process.argv); @@ -109,7 +108,14 @@ if (args.mock) { engineReady(engine); }); } else { - const engineOptions: any = { + const lockRepo = !args.noRepoLock; + if (lockRepo && !args.repoDir) { + logger.warn( + 'WARNING: repo lock is enabled but --repo-dir is not set; lock will be a no-op' + ); + } + + const engineOptions = { repoDir: args.repoDir, memoryLimitMb: args.runMemory, maxWorkers: effectiveCapacity, @@ -119,21 +125,8 @@ if (args.mock) { workerValidationRetries: args.engineValidationRetries, profile: args.profile, profilePollInterval: args.profilePollIntervalMs, + autoinstall: { lockRepo }, }; - - if (args.repoLock) { - if (args.repoDir) { - logger.info( - 'Repo lock enabled: coordinating adaptor installs via filesystem lock at', - args.repoDir - ); - engineOptions.autoinstall = createLockedHandlers(args.repoDir); - } else { - logger.warn( - 'WARNING: --repo-lock set but --repo-dir is not; ignoring repo lock' - ); - } - } logger.debug('Creating runtime engine...'); logger.debug('Engine options:', engineOptions); diff --git a/packages/ws-worker/src/util/cli.ts b/packages/ws-worker/src/util/cli.ts index fb81ee559..f9218b071 100644 --- a/packages/ws-worker/src/util/cli.ts +++ b/packages/ws-worker/src/util/cli.ts @@ -37,7 +37,7 @@ type Args = { profile?: boolean; profilePollIntervalMs?: number; repoDir?: string; - repoLock?: boolean; + noRepoLock?: boolean; runMemory?: number; secret?: string; sentryDsn?: string; @@ -97,7 +97,7 @@ export default function parseArgs(argv: string[]): Args { WORKER_PROFILE_POLL_INTERVAL_MS, WORKER_PROFILE, WORKER_REPO_DIR, - WORKER_REPO_LOCK, + WORKER_NO_REPO_LOCK, WORKER_SECRET, WORKER_SENTRY_DSN, WORKER_SENTRY_ENV, @@ -131,9 +131,9 @@ export default function parseArgs(argv: string[]): Args { description: 'Path to the runtime repo (where modules will be installed). Env: WORKER_REPO_DIR', }) - .option('repo-lock', { + .option('no-repo-lock', { description: - 'Coordinate adaptor installs across multiple workers sharing one repo directory via a filesystem lock. Requires --repo-dir. Default false. Env: WORKER_REPO_LOCK', + 'Disable the cross-worker filesystem lock that coordinates adaptor installs when multiple workers share a single --repo-dir. The lock is on by default. Env: WORKER_NO_REPO_LOCK', type: 'boolean', }) .option('monorepo-dir', { @@ -313,7 +313,7 @@ export default function parseArgs(argv: string[]): Args { 'ws://localhost:4000/worker' ), repoDir: setArg(args.repoDir, WORKER_REPO_DIR), - repoLock: setArg(args.repoLock, WORKER_REPO_LOCK, false), + noRepoLock: setArg(args.noRepoLock, WORKER_NO_REPO_LOCK, false), batchLogs: setArg(args.batchLogs, WORKER_BATCH_LOGS, false), batchInterval: setArg(args.batchInterval, WORKER_BATCH_INTERVAL, 10), batchLimit: setArg(args.batchLimit, WORKER_BATCH_LIMIT, 50), diff --git a/packages/ws-worker/src/util/repo-lock.ts b/packages/ws-worker/src/util/repo-lock.ts deleted file mode 100644 index 76e0447f9..000000000 --- a/packages/ws-worker/src/util/repo-lock.ts +++ /dev/null @@ -1,150 +0,0 @@ -import path from 'node:path'; -import { mkdir, stat, writeFile, rename } from 'node:fs/promises'; -import lockfile from 'proper-lockfile'; -import { - getAliasedName, - install as runtimeInstall, -} from '@openfn/runtime'; -import type { Logger } from '@openfn/logger'; - -// k8s CPU throttling can delay the heartbeat setTimeout; 5 min avoids false-stale on tight clusters -const STALE_MS = 5 * 60_000; -// Retry ceiling exceeds STALE_MS so a dead lock-holder's stale window expires before we give up -const MAX_LOCK_WAIT_MS = STALE_MS + 60_000; // 6 min -const LOCK_INTERVAL_MS = 2_000; -const LOCK_RETRY_OPTIONS = { - retries: MAX_LOCK_WAIT_MS / LOCK_INTERVAL_MS, - factor: 1, - minTimeout: LOCK_INTERVAL_MS, - maxTimeout: LOCK_INTERVAL_MS, -}; -const UPDATE_MS = 5_000; - -const sentinelPath = (repoDir: string, alias: string) => - path.join(repoDir, '.sentinels', `${alias}.done`); - -const lockTargetPath = (repoDir: string, alias: string) => - path.join(repoDir, '.locks', `${alias}.lock`); - -const nodeModulesPkgPath = (repoDir: string, alias: string) => - path.join(repoDir, 'node_modules', alias, 'package.json'); - -const fileExists = async (p: string) => { - try { - await stat(p); - return true; - } catch { - return false; - } -}; - -const ensureLockTarget = async (target: string) => { - await mkdir(path.dirname(target), { recursive: true }); - try { - await writeFile(target, '', { flag: 'wx' }); - } catch (e: any) { - if (e.code !== 'EEXIST') throw e; - } -}; - -const writeSentinelAtomic = async (sentinel: string) => { - await mkdir(path.dirname(sentinel), { recursive: true }); - const tmp = `${sentinel}.tmp`; - await writeFile(tmp, ''); - await rename(tmp, sentinel); -}; - -export type InstallFn = ( - specifier: string, - repoDir: string, - logger: Logger -) => Promise; - -const defaultInstall: InstallFn = (specifier, repoDir, logger) => - runtimeInstall([specifier], repoDir, logger); - -export const createLockedHandlers = ( - repoDir: string, - installFn: InstallFn = defaultInstall -) => { - const locksDir = path.join(repoDir, '.locks'); - const sentinelsDir = path.join(repoDir, '.sentinels'); - - const ensureDirs = (async () => { - await mkdir(repoDir, { recursive: true }); - await mkdir(locksDir, { recursive: true }); - await mkdir(sentinelsDir, { recursive: true }); - })(); - - const handleIsInstalled = async ( - specifier: string, - _repoDir: string, - _logger: Logger - ): Promise => { - await ensureDirs; - const alias = getAliasedName(specifier); - const [hasSentinel, hasPkg] = await Promise.all([ - fileExists(sentinelPath(repoDir, alias)), - fileExists(nodeModulesPkgPath(repoDir, alias)), - ]); - return hasSentinel && hasPkg; - }; - - const handleInstall = async ( - specifier: string, - _repoDir: string, - logger: Logger - ): Promise => { - await ensureDirs; - const alias = getAliasedName(specifier); - const sentinel = sentinelPath(repoDir, alias); - const pkg = nodeModulesPkgPath(repoDir, alias); - const target = lockTargetPath(repoDir, alias); - - await ensureLockTarget(target); - - logger.debug(`acquiring install lock for ${specifier}`); - let release: () => Promise; - try { - release = await lockfile.lock(target, { - retries: LOCK_RETRY_OPTIONS, - stale: STALE_MS, - update: UPDATE_MS, - realpath: false, - }); - } catch (e: any) { - if (e.code === 'ELOCKED') { - const waitSecs = MAX_LOCK_WAIT_MS / 1000; - throw new Error( - `Lock acquisition timed out after ${waitSecs}s waiting for ${alias}; another worker likely still installing (lock: ${target})` - ); - } - throw e; - } - logger.debug(`acquired install lock for ${specifier}`); - - try { - const [hasSentinel, hasPkg] = await Promise.all([ - fileExists(sentinel), - fileExists(pkg), - ]); - if (hasSentinel && hasPkg) { - logger.debug( - `another worker installed ${specifier} while waiting for lock; skipping install` - ); - return; - } - - await installFn(specifier, repoDir, logger); - await writeSentinelAtomic(sentinel); - } finally { - try { - await release(); - } catch (e) { - logger.warn(`failed to release install lock for ${specifier}:`, e); - } - } - }; - - return { handleInstall, handleIsInstalled }; -}; diff --git a/packages/ws-worker/test/reasons.test.ts b/packages/ws-worker/test/reasons.test.ts index 605ff497a..681518445 100644 --- a/packages/ws-worker/test/reasons.test.ts +++ b/packages/ws-worker/test/reasons.test.ts @@ -1,5 +1,9 @@ import test from 'ava'; -import createRTE from '@openfn/engine-multi'; +import esmock from 'esmock'; +import * as runtime from '@openfn/runtime'; +import path from 'node:path'; +import os from 'node:os'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; import { createMockLogger } from '@openfn/logger'; import type { ExitReason } from '@openfn/lexicon/lightning'; @@ -18,23 +22,38 @@ import { ExecutionPlan } from '@openfn/runtime'; let engine: any; let logger: any; +let repoDir: string; test.before(async () => { logger = createMockLogger(); - // logger = createLogger(null, { level: 'debug' }); - // Note: this is the REAL engine, not a mock - engine = await createRTE({ - maxWorkers: 1, - logger, - autoinstall: { - handleIsInstalled: async () => false, - handleInstall: () => + repoDir = await mkdtemp(path.join(os.tmpdir(), 'ws-worker-reasons-')); + await writeFile( + path.join(repoDir, 'package.json'), + JSON.stringify({ name: 'repo', dependencies: {} }) + ); + + // Load the real engine but swap @openfn/runtime's install with a fast-failing + // stub. This exercises the engine's autoinstall path without touching npm. + const engineModule: any = await esmock('@openfn/engine-multi', { + '@openfn/runtime': { + ...runtime, + install: () => new Promise((_resolve, reject) => { setTimeout(() => reject(new Error('not the way to amarillo')), 1); }), }, }); + + engine = await engineModule.default({ + maxWorkers: 1, + logger, + repoDir, + }); +}); + +test.after.always(async () => { + if (repoDir) await rm(repoDir, { recursive: true, force: true }); }); test.after(async () => engine.destroy()); diff --git a/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts b/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts deleted file mode 100644 index f755338a7..000000000 --- a/packages/ws-worker/test/util/repo-lock-multiprocess.test.ts +++ /dev/null @@ -1,473 +0,0 @@ -/** - * Cross-process integration tests for repo-lock. - * - * Each test forks real child processes pointing at the same tmpdir to exercise - * the proper-lockfile filesystem lock path that in-process tests cannot reach. - * - * Worker behaviour is configured via env vars; orchestration uses IPC barriers - * so test ordering is deterministic (no setTimeout for control flow). - */ - -import test from 'ava'; -import path from 'node:path'; -import os from 'node:os'; -import { - mkdir, - mkdtemp, - rm, - stat, - utimes, - writeFile, -} from 'node:fs/promises'; -import { fork, type ChildProcess } from 'node:child_process'; -import { fileURLToPath } from 'node:url'; - -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -const WORKER_SCRIPT = path.resolve(__dirname, 'repo-lock-worker.ts'); - -// Node args to enable TypeScript via swc — uses the same register hook as ava.config.js -// (root ava.config.js also passes --experimental-vm-modules but that flag is harmless to omit here). -const EXEC_ARGV = ['--import=@swc-node/register/esm-register', '--no-warnings']; - -const SPECIFIER_A = '@openfn/language-http@6.5.0'; -const SPECIFIER_B = '@openfn/language-common@2.0.0'; -// Alias format produced by getAliasedName -const ALIAS_A = '@openfn/language-http_6.5.0'; -const ALIAS_B = '@openfn/language-common_2.0.0'; - -const fileExists = async (p: string) => { - try { - await stat(p); - return true; - } catch { - return false; - } -}; - -// ---- IPC helpers ---- - -interface WorkerMessage { - event: string; - [key: string]: unknown; -} - -/** - * Fork a worker process and return a handle with helpers. - */ -function spawnWorker( - env: Record -): { child: ChildProcess; messages: WorkerMessage[] } { - const messages: WorkerMessage[] = []; - - const child = fork(WORKER_SCRIPT, [], { - execArgv: EXEC_ARGV, - env: { ...process.env, ...env }, - // IPC channel is created automatically by fork - }); - - child.on('message', (msg: any) => { - messages.push(msg as WorkerMessage); - }); - - return { child, messages }; -} - -/** - * Wait for a specific event from a child process. - * Checks the buffered messages array first so events that arrived before this - * call is reached are not missed. - */ -function waitForEvent( - worker: { child: ChildProcess; messages: WorkerMessage[] }, - eventName: string, - timeoutMs = 8000 -): Promise { - // Check already-buffered messages first to avoid a hang if the event arrived - // before the listener was attached. - const buffered = worker.messages.find((m) => m.event === eventName); - if (buffered) return Promise.resolve(buffered); - - return new Promise((resolve, reject) => { - const timer = setTimeout( - () => reject(new Error(`Timed out waiting for event '${eventName}'`)), - timeoutMs - ); - - const handler = (msg: any) => { - if (msg?.event === eventName) { - clearTimeout(timer); - worker.child.off('message', handler); - resolve(msg as WorkerMessage); - } - }; - - worker.child.on('message', handler); - }); -} - -/** - * Collect N 'done' messages from a set of children (order is irrelevant). - */ -function collectDone( - children: ChildProcess[], - count: number, - timeoutMs = 8000 -): Promise { - return new Promise((resolve, reject) => { - const timer = setTimeout( - () => reject(new Error(`Timed out waiting for ${count} done messages (got ${results.length})`)), - timeoutMs - ); - - const results: WorkerMessage[] = []; - - const handler = (msg: any) => { - if (msg?.event === 'done') { - results.push(msg); - if (results.length >= count) { - clearTimeout(timer); - for (const c of children) c.off('message', handler); - resolve(results); - } - } - }; - - for (const c of children) c.on('message', handler); - }); -} - -/** - * Wait for all children to send the 'ready' event, then return. - */ -function waitAllReady(children: ChildProcess[], timeoutMs = 8000): Promise { - return new Promise((resolve, reject) => { - const timer = setTimeout( - () => reject(new Error('Timed out waiting for all workers to be ready')), - timeoutMs - ); - - let remaining = children.length; - - for (const c of children) { - const handler = (msg: any) => { - if (msg?.event === 'ready') { - c.off('message', handler); - remaining--; - if (remaining === 0) { - clearTimeout(timer); - resolve(); - } - } - }; - c.on('message', handler); - } - }); -} - -/** Wait for a child to exit. */ -function waitExit(child: ChildProcess, timeoutMs = 8000): Promise { - return new Promise((resolve, reject) => { - const timer = setTimeout( - () => reject(new Error('Timed out waiting for child to exit')), - timeoutMs - ); - child.once('exit', (code) => { - clearTimeout(timer); - resolve(code); - }); - }); -} - -// ---- test lifecycle ---- - -test.beforeEach(async (t) => { - const dir = await mkdtemp(path.join(os.tmpdir(), 'ws-worker-mp-lock-')); - t.context = { dir, children: [] as ChildProcess[] }; -}); - -test.afterEach.always(async (t) => { - const { dir, children } = t.context as { dir: string; children: ChildProcess[] }; - - // Await child exits before removing tmpdir to avoid a race between SIGTERM - // delivery and the rm -rf that follows. - await Promise.all( - children.map((c) => { - // Already gone (naturally exited or previously killed). - if (c.exitCode !== null || c.killed) return Promise.resolve(); - return new Promise((r) => { - c.once('exit', () => r()); - c.kill(); - }); - }) - ); - - await rm(dir, { recursive: true, force: true }); -}); - -function track(t: any, ...workers: ReturnType[]) { - const ctx = t.context as { children: ChildProcess[] }; - for (const w of workers) ctx.children.push(w.child); - return workers; -} - -// ---- Scenario 1: Race — exactly one install runs ---- - -test.serial('scenario 1: race — exactly one of N concurrent workers runs installFn', async (t) => { - const { dir } = t.context as { dir: string }; - - const env = { - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'race', - // 500ms: the window must outlast the slowest fork's swc-transpile startup time on CI - INSTALL_DELAY: '500', - }; - - const workers = [ - spawnWorker(env), - spawnWorker(env), - spawnWorker(env), - ]; - track(t, ...workers); - - // Wait until all three have imported the module and are ready. - await waitAllReady(workers.map((w) => w.child)); - - // Broadcast 'go' to all three simultaneously. - for (const { child } of workers) child.send({ event: 'go' }); - - // Collect all 'done' messages. - const dones = await collectDone(workers.map((w) => w.child), 3); - - const totalInstalls = dones.reduce((s, m) => s + ((m.installCount as number) ?? 0), 0); - const installStartEvents = workers.flatMap((w) => - w.messages.filter((m) => m.event === 'install-start') - ); - - t.is(totalInstalls, 1, 'exactly one worker should have run installFn'); - t.is(installStartEvents.length, 1, 'exactly one install-start event emitted'); - t.true( - await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), - 'sentinel must exist' - ); - t.true( - await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json')), - 'node_modules pkg must exist' - ); -}); - -// ---- Scenario 2: Stale-lock recovery ---- - -test.serial('scenario 2: stale-lock recovery — new process steals a ghost lock', async (t) => { - const { dir } = t.context as { dir: string }; - - // Seed the ghost lock directory with an ancient mtime (simulates SIGKILL). - const lockTarget = path.join(dir, '.locks', `${ALIAS_A}.lock`); - await mkdir(path.dirname(lockTarget), { recursive: true }); - await writeFile(lockTarget, ''); - const ghostLockDir = `${lockTarget}.lock`; - await mkdir(ghostLockDir, { recursive: true }); - const ancient = new Date(Date.now() - 5 * 60_000 - 1000); - await utimes(ghostLockDir, ancient, ancient); - - const testStart = Date.now(); - - const w = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'default', - }); - track(t, w); - - const done = await waitForEvent(w, 'done'); - - t.falsy(done.error, `should complete without error, got: ${done.error}`); - t.is(done.installCount, 1, 'installFn should have run once'); - t.true( - await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), - 'sentinel must exist after stale recovery' - ); - - // The ghost lockdir should either be gone or have been replaced (newer mtime). - const ghostStillExists = await fileExists(ghostLockDir); - if (ghostStillExists) { - const ghostStat = await stat(ghostLockDir); - t.true( - ghostStat.mtimeMs >= testStart, - 'lockdir mtime must be newer than test start, proving proper-lockfile took ownership' - ); - } - // If it doesn't exist, proper-lockfile cleaned it up — also fine. -}); - -// ---- Scenario 3: Per-alias concurrency ---- - -test.serial('scenario 3: different aliases do not serialise', async (t) => { - const { dir } = t.context as { dir: string }; - - const wA = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'default', - INSTALL_DELAY: '500', - }); - const wB = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_B, - WORKER_MODE: 'default', - INSTALL_DELAY: '500', - }); - track(t, wA, wB); - - const dones = await collectDone([wA.child, wB.child], 2); - - // Verify both succeeded. - for (const done of dones) { - t.falsy(done.error); - t.is(done.installCount, 1); - } - - t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`))); - t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_B}.done`))); - - // Confirm overlap: install-start events from A and B must be close in time. - const startA = wA.messages.find((m) => m.event === 'install-start')?.t as number; - const startB = wB.messages.find((m) => m.event === 'install-start')?.t as number; - t.truthy(startA, 'worker A should emit install-start'); - t.truthy(startB, 'worker B should emit install-start'); - - // Both should have started within half the install delay of each other. - // If locks were global, one would wait ~500ms for the other to finish first. - const timeDiff = Math.abs(startA - startB); - t.true( - timeDiff < 200, - `install-start times should be close (${timeDiff}ms apart); if >200ms the installs were serialised` - ); -}); - -// ---- Scenario 4: Install failure — no sentinel written, next process retries ---- - -test.serial('scenario 4: install failure — no sentinel; next worker retries cleanly', async (t) => { - const { dir } = t.context as { dir: string }; - - const wA = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'default', - INSTALL_FAIL: '1', - }); - track(t, wA); - - const doneA = await waitForEvent(wA, 'done'); - await waitExit(wA.child); - - t.truthy(doneA.error, 'child A should report an error'); - t.false( - await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), - 'sentinel must not exist after failed install' - ); - t.false( - await fileExists(path.join(dir, '.locks', `${ALIAS_A}.lock.lock`)), - 'proper-lockfile lockdir must be released after install throws' - ); - - // Now fork a second worker that will succeed. - const wB = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'default', - }); - track(t, wB); - - const doneB = await waitForEvent(wB, 'done'); - - t.falsy(doneB.error); - t.is(doneB.installCount, 1, 'second worker should have run install'); - t.true( - await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), - 'sentinel must exist after successful second install' - ); -}); - -// ---- Scenario 5: Post-lock double-check — waiting process skips redundant install ---- - -test.serial('scenario 5: post-lock double-check — waiting process skips install after winner finishes', async (t) => { - const { dir } = t.context as { dir: string }; - - // Child A: acquires lock, signals parent (lock-acquired), waits for 'proceed', - // then seeds node_modules and releases. - const wA = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'double-check', - }); - track(t, wA); - - // Wait until A has the lock (but hasn't done any work yet). - await waitForEvent(wA, 'lock-acquired'); - - // Now fork B — it will try to acquire the same lock and queue behind A. - const wB = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'default', - }); - track(t, wB); - - // Wait until B has actually called handleInstall (entered the lock path). - // This guarantees B is blocking on lockfile.lock before A releases, so we - // are genuinely testing cross-process contention, not just the double-check - // on an uncontested lock. - await waitForEvent(wB, 'lock-attempting'); - - // Tell A to complete: it will seed node_modules, write sentinel, release lock. - wA.child.send({ event: 'proceed' }); - - await collectDone([wA.child, wB.child], 2); - - // Find each worker's done message from their own IPC message arrays. - const doneAMsg = wA.messages.find((m) => m.event === 'done')!; - const doneBMsg = wB.messages.find((m) => m.event === 'done')!; - - t.is(doneAMsg.installCount, 1, 'worker A should have run installFn once'); - t.true(doneBMsg.skipped as boolean, 'worker B should have skipped (in-lock double-check)'); - t.is(doneBMsg.installCount, 0, 'worker B installCount should be 0'); - - t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`))); - t.true(await fileExists(path.join(dir, 'node_modules', ALIAS_A, 'package.json'))); -}); - -// ---- Scenario 6: Partial state — pkg without sentinel triggers install ---- - -test.serial('scenario 6: partial state — pkg without sentinel triggers install', async (t) => { - const { dir } = t.context as { dir: string }; - - // Pre-seed node_modules but NOT the sentinel (simulates a crash after npm install - // but before writeSentinelAtomic). - const pkgDir = path.join(dir, 'node_modules', ALIAS_A); - await mkdir(pkgDir, { recursive: true }); - await writeFile(path.join(pkgDir, 'package.json'), '{}'); - // Also create the dirs that createLockedHandlers would normally create. - await mkdir(path.join(dir, '.sentinels'), { recursive: true }); - await mkdir(path.join(dir, '.locks'), { recursive: true }); - - const w = spawnWorker({ - REPO_DIR: dir, - SPECIFIER: SPECIFIER_A, - WORKER_MODE: 'partial', - // installFn is a no-op; node_modules already exists, just needs sentinel. - INSTALL_NOOP: '1', - }); - track(t, w); - - const done = await waitForEvent(w, 'done'); - - t.false(done.wasInstalled as boolean, 'handleIsInstalled should return false without sentinel'); - t.is(done.installCount, 1, 'install should have run once'); - t.true(done.installedAfter as boolean, 'handleIsInstalled should return true after install'); - t.true( - await fileExists(path.join(dir, '.sentinels', `${ALIAS_A}.done`)), - 'sentinel must exist' - ); -}); diff --git a/packages/ws-worker/test/util/repo-lock.test.ts b/packages/ws-worker/test/util/repo-lock.test.ts deleted file mode 100644 index 3e00b7934..000000000 --- a/packages/ws-worker/test/util/repo-lock.test.ts +++ /dev/null @@ -1,178 +0,0 @@ -import test from 'ava'; -import path from 'node:path'; -import os from 'node:os'; -import { - mkdir, - mkdtemp, - rm, - writeFile, - stat, - utimes, -} from 'node:fs/promises'; -import { createMockLogger } from '@openfn/logger'; -import { getAliasedName } from '@openfn/runtime'; - -import { createLockedHandlers } from '../../src/util/repo-lock'; - -const logger = createMockLogger(); - -const SPECIFIER = '@openfn/language-http@6.5.0'; -const ALIAS = getAliasedName(SPECIFIER); - -const fileExists = async (p: string) => { - try { - await stat(p); - return true; - } catch { - return false; - } -}; - -const seedModule = async (repoDir: string, alias = ALIAS) => { - const dir = path.join(repoDir, 'node_modules', alias); - await mkdir(dir, { recursive: true }); - await writeFile(path.join(dir, 'package.json'), '{}'); -}; - -const seedSentinel = async (repoDir: string, alias = ALIAS) => { - const sentinel = path.join(repoDir, '.sentinels', `${alias}.done`); - await mkdir(path.dirname(sentinel), { recursive: true }); - await writeFile(sentinel, ''); -}; - -test.beforeEach(async (t) => { - const dir = await mkdtemp(path.join(os.tmpdir(), 'ws-worker-repo-lock-')); - t.context = { dir }; -}); - -test.afterEach.always(async (t) => { - const { dir } = t.context as { dir: string }; - await rm(dir, { recursive: true, force: true }); -}); - -test('handleIsInstalled returns false when sentinel and node_modules are missing', async (t) => { - const { dir } = t.context as { dir: string }; - const { handleIsInstalled } = createLockedHandlers(dir); - t.false(await handleIsInstalled(SPECIFIER, dir, logger)); -}); - -test('handleIsInstalled returns true when both sentinel and node_modules exist', async (t) => { - const { dir } = t.context as { dir: string }; - await seedSentinel(dir); - await seedModule(dir); - const { handleIsInstalled } = createLockedHandlers(dir); - t.true(await handleIsInstalled(SPECIFIER, dir, logger)); -}); - -test('handleIsInstalled returns false when sentinel exists but node_modules was deleted', async (t) => { - const { dir } = t.context as { dir: string }; - await seedSentinel(dir); - const { handleIsInstalled } = createLockedHandlers(dir); - t.false(await handleIsInstalled(SPECIFIER, dir, logger)); -}); - -test('handleIsInstalled returns false when node_modules exists but sentinel is missing', async (t) => { - const { dir } = t.context as { dir: string }; - await seedModule(dir); - const { handleIsInstalled } = createLockedHandlers(dir); - t.false(await handleIsInstalled(SPECIFIER, dir, logger)); -}); - -test('handleInstall runs install and writes sentinel on success', async (t) => { - const { dir } = t.context as { dir: string }; - let installs = 0; - const installFn = async () => { - installs++; - await seedModule(dir); - }; - const { handleInstall, handleIsInstalled } = createLockedHandlers( - dir, - installFn - ); - await handleInstall(SPECIFIER, dir, logger); - - t.is(installs, 1); - t.true( - await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`)), - 'sentinel file should be written' - ); - t.true(await handleIsInstalled(SPECIFIER, dir, logger)); -}); - -test('concurrent handleInstall calls only invoke install once', async (t) => { - const { dir } = t.context as { dir: string }; - let installs = 0; - const installFn = async () => { - installs++; - // Simulate work so the second call definitely contends on the lock - await new Promise((r) => setTimeout(r, 100)); - await seedModule(dir); - }; - - const a = createLockedHandlers(dir, installFn); - const b = createLockedHandlers(dir, installFn); - - await Promise.all([ - a.handleInstall(SPECIFIER, dir, logger), - b.handleInstall(SPECIFIER, dir, logger), - ]); - - t.is(installs, 1, 'only one of the two contending installs should run'); - t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`))); -}); - -test('handleInstall recovers a stale lock left by a dead worker', async (t) => { - const { dir } = t.context as { dir: string }; - - // Manually create the proper-lockfile lock directory with an old mtime, - // simulating a worker that was SIGKILLed mid-install. - const lockTarget = path.join(dir, '.locks', `${ALIAS}.lock`); - await mkdir(path.dirname(lockTarget), { recursive: true }); - await writeFile(lockTarget, ''); - const ghostLockDir = `${lockTarget}.lock`; - await mkdir(ghostLockDir, { recursive: true }); - const ancient = new Date(Date.now() - 5 * 60_000); - await utimes(ghostLockDir, ancient, ancient); - - let installs = 0; - const installFn = async () => { - installs++; - await seedModule(dir); - }; - - const { handleInstall } = createLockedHandlers(dir, installFn); - await handleInstall(SPECIFIER, dir, logger); - - t.is(installs, 1); - t.true(await fileExists(path.join(dir, '.sentinels', `${ALIAS}.done`))); -}); - -test('handleInstall does not re-run install when sentinel was written by another worker while waiting', async (t) => { - const { dir } = t.context as { dir: string }; - - let installs = 0; - - // First worker holds the lock and writes the sentinel; second worker queues - // up behind it and should observe the sentinel inside the lock and skip. - const installFn = async () => { - installs++; - await new Promise((r) => setTimeout(r, 50)); - await seedModule(dir); - }; - - const a = createLockedHandlers(dir, installFn); - const b = createLockedHandlers(dir, async () => { - // If b ever calls install, count it — but it shouldn't. - installs++; - await seedModule(dir); - }); - - const aPromise = a.handleInstall(SPECIFIER, dir, logger); - // Give a a moment to grab the lock and start work. - await new Promise((r) => setTimeout(r, 10)); - const bPromise = b.handleInstall(SPECIFIER, dir, logger); - - await Promise.all([aPromise, bPromise]); - - t.is(installs, 1); -}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c4f6cb035..caeeb8a25 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -435,13 +435,22 @@ importers: json-stream-stringify: specifier: ^3.1.6 version: 3.1.6 + proper-lockfile: + specifier: ^4.1.2 + version: 4.1.2 devDependencies: '@types/node': specifier: ^18.19.130 version: 18.19.130 + '@types/proper-lockfile': + specifier: ^4.1.4 + version: 4.1.4 ava: specifier: 5.3.1 version: 5.3.1 + esmock: + specifier: ^2.7.5 + version: 2.7.5 tslib: specifier: ^2.8.1 version: 2.8.1 @@ -759,9 +768,6 @@ importers: phoenix: specifier: 1.7.10 version: 1.7.10 - proper-lockfile: - specifier: ^4.1.2 - version: 4.1.2 ws: specifier: ^8.19.0 version: 8.19.0 @@ -784,15 +790,15 @@ importers: '@types/phoenix': specifier: ^1.6.7 version: 1.6.7 - '@types/proper-lockfile': - specifier: ^4.1.4 - version: 4.1.4 '@types/yargs': specifier: ^17.0.35 version: 17.0.35 ava: specifier: 5.3.1 version: 5.3.1 + esmock: + specifier: ^2.7.5 + version: 2.7.5 nodemon: specifier: 3.1.14 version: 3.1.14 @@ -2884,6 +2890,10 @@ packages: resolution: {integrity: sha512-U1suiZ2oDVWv4zPO56S0NcR5QriEahGtdN2OR6FiOG4WJvcjBVFB0qI4+eKoWFH483PKGuLuu6V8Z4T5g63UVA==} engines: {node: '>=6'} + esmock@2.7.5: + resolution: {integrity: sha512-jKwL7yYpVOERalCllSnPur59s9M0gV5BKijtmOKclqDMuhqdS7DT/a7cODjz6w1XusE0wAaHBTrK+zgab/ENgw==} + engines: {node: '>=14.16.0'} + esprima@1.2.5: resolution: {integrity: sha512-S9VbPDU0adFErpDai3qDkjq8+G05ONtKzcyNrPKg/ZKa+tf879nX2KexNU95b31UoTJjRLInNBHHHjFPoCd7lQ==} engines: {node: '>=0.4.0'} @@ -6488,6 +6498,8 @@ snapshots: esm@3.2.25: optional: true + esmock@2.7.5: {} + esprima@1.2.5: {} esprima@4.0.1: {} From e52257e3a5a852df0fad2c06453a66154adec13e Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 12:36:04 +0200 Subject: [PATCH 04/11] refactor: remove dead pending map from autoinstall The pending map was left behind by a 2023 restructure that replaced per-adaptor promise dedup with the module-level queue + busy flag. Nothing writes to it; the delete on error is a no-op. --- packages/engine-multi/src/api/autoinstall.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/engine-multi/src/api/autoinstall.ts b/packages/engine-multi/src/api/autoinstall.ts index 038ee9ab0..b813935d8 100644 --- a/packages/engine-multi/src/api/autoinstall.ts +++ b/packages/engine-multi/src/api/autoinstall.ts @@ -24,8 +24,6 @@ export type AutoinstallOptions = { versionLookup?: (specifier: string) => Promise; }; -const pending: Record> = {}; - let busy = false; const queue: Array<{ adaptors: string[]; callback: (err?: any) => void }> = []; @@ -92,8 +90,6 @@ const autoinstall = async (context: ExecutionContext): Promise => { duration, }); } catch (e: any) { - delete pending[a]; - logger.error(`ERROR autoinstalling ${a}: ${e.message}`); logger.error(e); const duration = Date.now() - startTime; From 66c1ea7d8247f88a77dc6e29d14ae8593abcdde2 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 12:36:37 +0200 Subject: [PATCH 05/11] refactor: guard install-lock alias against path traversal Defence-in-depth: refuse aliases containing `..` or absolute paths before composing the .locks/.lock path. Upstream whitelist filtering already restricts specifiers, so this guards against a future bug there becoming a path-traversal write. --- packages/engine-multi/src/util/repo-lock.ts | 6 ++++ .../test/util/repo-lock.unit.test.ts | 32 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 packages/engine-multi/test/util/repo-lock.unit.test.ts diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts index 4ed5f1ba9..dfabc6f9f 100644 --- a/packages/engine-multi/src/util/repo-lock.ts +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -32,6 +32,12 @@ export const withInstallLock = async ( logger: Logger, fn: () => Promise ): Promise => { + // Defence-in-depth: refuse aliases that could escape the .locks directory. + // Upstream whitelist filtering should make this unreachable. + if (alias.includes('..') || path.isAbsolute(alias)) { + throw new Error(`Invalid alias for install lock: ${alias}`); + } + const locksDir = path.join(repoDir, '.locks'); const target = path.join(locksDir, `${alias}.lock`); diff --git a/packages/engine-multi/test/util/repo-lock.unit.test.ts b/packages/engine-multi/test/util/repo-lock.unit.test.ts new file mode 100644 index 000000000..0759e0d2e --- /dev/null +++ b/packages/engine-multi/test/util/repo-lock.unit.test.ts @@ -0,0 +1,32 @@ +/** + * In-process unit tests for withInstallLock guards. + * + * The sibling `repo-lock.test.ts` exercises the real filesystem lock via + * forked workers; these tests cover input validation that never reaches the + * lock acquisition path. + */ + +import test from 'ava'; +import { createMockLogger } from '@openfn/logger'; + +import { withInstallLock } from '../../src/util/repo-lock'; + +const logger = createMockLogger(); + +const shouldNotRun = async () => { + throw new Error('install fn must not run when alias is rejected'); +}; + +test('rejects alias containing ".."', async (t) => { + await t.throwsAsync( + () => withInstallLock('/tmp/repo', '../evil', logger, shouldNotRun), + { message: /Invalid alias for install lock/ } + ); +}); + +test('rejects absolute-path alias', async (t) => { + await t.throwsAsync( + () => withInstallLock('/tmp/repo', '/etc/passwd', logger, shouldNotRun), + { message: /Invalid alias for install lock/ } + ); +}); From 09503ae4ed610156004214a78dadd8709fcd1919 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 13:21:04 +0200 Subject: [PATCH 06/11] feat: log install-lock contention at info level Try the lock with retries: 0 first. On ELOCKED, log one info line so k8s operators can see why a worker is sat waiting on cold start, then retry with the full budget. Uncontended acquisitions stay silent. --- packages/engine-multi/src/util/repo-lock.ts | 29 +++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts index dfabc6f9f..efc4a4de6 100644 --- a/packages/engine-multi/src/util/repo-lock.ts +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -46,21 +46,28 @@ export const withInstallLock = async ( logger.debug(`acquiring install lock for ${alias}`); let release: () => Promise; + + const lockOpts = { stale: STALE_MS, update: UPDATE_MS, realpath: false }; + try { - release = await lockfile.lock(target, { - retries: LOCK_RETRY_OPTIONS, - stale: STALE_MS, - update: UPDATE_MS, - realpath: false, - }); + release = await lockfile.lock(target, { ...lockOpts, retries: 0 }); } catch (e: any) { - if (e.code === 'ELOCKED') { - throw new Error( - `Lock acquisition timed out after ${MAX_LOCK_WAIT_MS / 1000}s waiting for ${alias}; another worker likely still installing (lock: ${target})` - ); + if (e.code !== 'ELOCKED') throw e; + + logger.info(`waiting for install lock on \`${alias}\` (another worker is installing)`); + + try { + release = await lockfile.lock(target, { ...lockOpts, retries: LOCK_RETRY_OPTIONS }); + } catch (e2: any) { + if (e2.code === 'ELOCKED') { + throw new Error( + `Lock acquisition timed out after ${MAX_LOCK_WAIT_MS / 1000}s waiting for ${alias}; another worker likely still installing (lock: ${target})` + ); + } + throw e2; } - throw e; } + logger.debug(`acquired install lock for ${alias}`); try { From 5509997b34ad3b90402dc5219b5e364493accd90 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 12:43:32 +0200 Subject: [PATCH 07/11] fix: --no-repo-lock flag was silently ignored Yargs auto-negates --no-foo into { foo: false }, so declaring the option as 'no-repo-lock' meant --no-repo-lock produced { repoLock: false } and left args.noRepoLock undefined. The downstream lockRepo check thus stayed true and the flag did nothing. Rename the option to its positive form 'repo-lock' (default true). yargs' auto-negation now correctly sets repoLock: false when --no-repo-lock is passed. WORKER_NO_REPO_LOCK env var still disables the lock. Also add defence-in-depth path-traversal guard on the lock alias. --- packages/ws-worker/src/start.ts | 2 +- packages/ws-worker/src/util/cli.ts | 12 ++++++---- packages/ws-worker/test/util/cli.test.ts | 28 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/packages/ws-worker/src/start.ts b/packages/ws-worker/src/start.ts index 0b301902d..5a0f4eeb5 100644 --- a/packages/ws-worker/src/start.ts +++ b/packages/ws-worker/src/start.ts @@ -108,7 +108,7 @@ if (args.mock) { engineReady(engine); }); } else { - const lockRepo = !args.noRepoLock; + const lockRepo = args.repoLock !== false; if (lockRepo && !args.repoDir) { logger.warn( 'WARNING: repo lock is enabled but --repo-dir is not set; lock will be a no-op' diff --git a/packages/ws-worker/src/util/cli.ts b/packages/ws-worker/src/util/cli.ts index f9218b071..12af50b2e 100644 --- a/packages/ws-worker/src/util/cli.ts +++ b/packages/ws-worker/src/util/cli.ts @@ -37,7 +37,7 @@ type Args = { profile?: boolean; profilePollIntervalMs?: number; repoDir?: string; - noRepoLock?: boolean; + repoLock?: boolean; runMemory?: number; secret?: string; sentryDsn?: string; @@ -131,9 +131,9 @@ export default function parseArgs(argv: string[]): Args { description: 'Path to the runtime repo (where modules will be installed). Env: WORKER_REPO_DIR', }) - .option('no-repo-lock', { + .option('repo-lock', { description: - 'Disable the cross-worker filesystem lock that coordinates adaptor installs when multiple workers share a single --repo-dir. The lock is on by default. Env: WORKER_NO_REPO_LOCK', + 'Enable a cross-worker filesystem lock that coordinates adaptor installs when multiple workers share a single --repo-dir. On by default; pass --no-repo-lock to disable. Env: WORKER_NO_REPO_LOCK=true also disables it.', type: 'boolean', }) .option('monorepo-dir', { @@ -313,7 +313,11 @@ export default function parseArgs(argv: string[]): Args { 'ws://localhost:4000/worker' ), repoDir: setArg(args.repoDir, WORKER_REPO_DIR), - noRepoLock: setArg(args.noRepoLock, WORKER_NO_REPO_LOCK, false), + repoLock: + args.repoLock ?? + (WORKER_NO_REPO_LOCK !== undefined + ? !(WORKER_NO_REPO_LOCK === 'true' || WORKER_NO_REPO_LOCK === '1') + : true), batchLogs: setArg(args.batchLogs, WORKER_BATCH_LOGS, false), batchInterval: setArg(args.batchInterval, WORKER_BATCH_INTERVAL, 10), batchLimit: setArg(args.batchLimit, WORKER_BATCH_LIMIT, 50), diff --git a/packages/ws-worker/test/util/cli.test.ts b/packages/ws-worker/test/util/cli.test.ts index 698008dc8..a2edd2b98 100644 --- a/packages/ws-worker/test/util/cli.test.ts +++ b/packages/ws-worker/test/util/cli.test.ts @@ -77,6 +77,34 @@ test('cli should handle boolean options correctly', (t) => { t.is(args.mock, true); }); +test('cli should default repoLock to true', (t) => { + const args = cli('pnpm start'.split(' ')); + t.is(args.repoLock, true); +}); + +test('cli should disable repoLock when --no-repo-lock is passed', (t) => { + const args = cli('pnpm start --no-repo-lock'.split(' ')); + t.is(args.repoLock, false); +}); + +test('cli should disable repoLock when WORKER_NO_REPO_LOCK=true', (t) => { + process.env.WORKER_NO_REPO_LOCK = 'true'; + const args = cli('pnpm start'.split(' ')); + t.is(args.repoLock, false); +}); + +test('cli should keep repoLock enabled when WORKER_NO_REPO_LOCK=false', (t) => { + process.env.WORKER_NO_REPO_LOCK = 'false'; + const args = cli('pnpm start'.split(' ')); + t.is(args.repoLock, true); +}); + +test('cli flag should override WORKER_NO_REPO_LOCK env var', (t) => { + process.env.WORKER_NO_REPO_LOCK = 'true'; + const args = cli('pnpm start --repo-lock'.split(' ')); + t.is(args.repoLock, true); +}); + test('cli should configure sentry directly', (t) => { const argv = 'pnpm start --sentry-dsn abc --sentry-env local'.split(' '); const args = cli(argv); From 81eefaba49d16f20d6640f4d45822e0c98d10b87 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 14:59:07 +0200 Subject: [PATCH 08/11] refactor: drop redundant mkdir in withInstallLock ensureLockTarget already creates the parent directory recursively, so the explicit mkdir of locksDir was dead work. --- packages/engine-multi/src/util/repo-lock.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts index efc4a4de6..2f1056ab3 100644 --- a/packages/engine-multi/src/util/repo-lock.ts +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -41,7 +41,6 @@ export const withInstallLock = async ( const locksDir = path.join(repoDir, '.locks'); const target = path.join(locksDir, `${alias}.lock`); - await mkdir(locksDir, { recursive: true }); await ensureLockTarget(target); logger.debug(`acquiring install lock for ${alias}`); From f5c4abfb273597081d54966d5a55099e2e718f39 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 14:59:15 +0200 Subject: [PATCH 09/11] refactor: only reject '..' as a path component in install-lock alias The previous substring check would also reject innocuous names like 'foo..bar'. Split on path separators so only '..' as a real path component is rejected. --- packages/engine-multi/src/util/repo-lock.ts | 2 +- .../test/util/repo-lock.unit.test.ts | 21 ++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts index 2f1056ab3..2f09da037 100644 --- a/packages/engine-multi/src/util/repo-lock.ts +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -34,7 +34,7 @@ export const withInstallLock = async ( ): Promise => { // Defence-in-depth: refuse aliases that could escape the .locks directory. // Upstream whitelist filtering should make this unreachable. - if (alias.includes('..') || path.isAbsolute(alias)) { + if (alias.split(/[/\\]/).includes('..') || path.isAbsolute(alias)) { throw new Error(`Invalid alias for install lock: ${alias}`); } diff --git a/packages/engine-multi/test/util/repo-lock.unit.test.ts b/packages/engine-multi/test/util/repo-lock.unit.test.ts index 0759e0d2e..a39e4cd12 100644 --- a/packages/engine-multi/test/util/repo-lock.unit.test.ts +++ b/packages/engine-multi/test/util/repo-lock.unit.test.ts @@ -1,12 +1,14 @@ /** - * In-process unit tests for withInstallLock guards. + * In-process unit tests for withInstallLock alias validation. * - * The sibling `repo-lock.test.ts` exercises the real filesystem lock via - * forked workers; these tests cover input validation that never reaches the - * lock acquisition path. + * The sibling `repo-lock.test.ts` exercises the real cross-process filesystem + * lock via forked workers; these tests focus on the alias guard. */ import test from 'ava'; +import path from 'node:path'; +import os from 'node:os'; +import { mkdtemp, rm } from 'node:fs/promises'; import { createMockLogger } from '@openfn/logger'; import { withInstallLock } from '../../src/util/repo-lock'; @@ -17,11 +19,20 @@ const shouldNotRun = async () => { throw new Error('install fn must not run when alias is rejected'); }; -test('rejects alias containing ".."', async (t) => { +test('rejects ".." as a path component but allows it as a substring', async (t) => { await t.throwsAsync( () => withInstallLock('/tmp/repo', '../evil', logger, shouldNotRun), { message: /Invalid alias for install lock/ } ); + + const dir = await mkdtemp(path.join(os.tmpdir(), 'engine-multi-lock-unit-')); + t.teardown(() => rm(dir, { recursive: true, force: true })); + + let ran = false; + await withInstallLock(dir, 'foo..bar', logger, async () => { + ran = true; + }); + t.true(ran); }); test('rejects absolute-path alias', async (t) => { From 0d0fd3ba4355ed72390c03518b4941d7651ad995 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 15:27:14 +0200 Subject: [PATCH 10/11] refactor: pin per-call options on autoinstall queue entry Hoist processQueue and doAutoinstall to module scope and pass context/repoDir/logger/useLock through the queue entry instead of relying on the first caller's closure. Removes a latent foot-gun if anyone later adds per-run autoinstall/repoDir overrides or proxies AUTOINSTALL_COMPLETE/ERROR through createWorkflowEvents. --- packages/engine-multi/src/api/autoinstall.ts | 159 ++++++++++--------- 1 file changed, 83 insertions(+), 76 deletions(-) diff --git a/packages/engine-multi/src/api/autoinstall.ts b/packages/engine-multi/src/api/autoinstall.ts index b813935d8..f02167c86 100644 --- a/packages/engine-multi/src/api/autoinstall.ts +++ b/packages/engine-multi/src/api/autoinstall.ts @@ -24,93 +24,92 @@ export type AutoinstallOptions = { versionLookup?: (specifier: string) => Promise; }; -let busy = false; +// Per-entry options are pinned at enqueue time so the worker that drains the +// queue doesn't accidentally use the *first* caller's closure. See PR #1416. +type QueueEntry = { + adaptors: string[]; + callback: (err?: any) => void; + context: ExecutionContext; + repoDir: string; + logger: Logger; + useLock: boolean; +}; -const queue: Array<{ adaptors: string[]; callback: (err?: any) => void }> = []; +let busy = false; +const queue: QueueEntry[] = []; + +const processQueue = async () => { + const next = queue.shift(); + if (next) { + busy = true; + await doAutoinstall(next); + processQueue(); + } else { + busy = false; + } +}; -const enqueue = (adaptors: string[]) => - new Promise((resolve) => { - queue.push({ adaptors, callback: resolve }); - }); +const doAutoinstall = async (entry: QueueEntry) => { + const { adaptors, callback, context, repoDir, logger, useLock } = entry; -// Install any modules for an Execution Plan that are not already installed -// This will enforce a queue ensuring only one module is installed at a time -// This fixes https://github.com/OpenFn/kit/issues/503 -const autoinstall = async (context: ExecutionContext): Promise => { - // TODO not a huge fan of these functions in the closure, but it's ok for now - const processQueue = async () => { - const next = queue.shift(); - if (next) { - busy = true; - const { adaptors, callback } = next; - await doAutoinstall(adaptors, callback); - processQueue(); - } else { - // do nothing - busy = false; + for (const a of adaptors) { + const { name, version } = getNameAndVersion(a); + if (await isInstalled(a, repoDir, logger)) { + continue; } - }; - - // This will actually do the autoinstall for an run (all adaptors) - const doAutoinstall = async ( - adaptors: string[], - onComplete: (err?: any) => void - ) => { - const useLock = autoinstallOptions.lockRepo !== false; - - for (const a of adaptors) { - const { name, version } = getNameAndVersion(a); - if (await isInstalled(a, repoDir, logger)) { - continue; - } - const startTime = Date.now(); - try { - if (useLock) { - // Re-check inside the lock so we skip work a peer worker - // completed while we were waiting. - await withInstallLock(repoDir, getAliasedName(a), logger, async () => { - if (await isInstalled(a, repoDir, logger)) { - logger.debug( - `another worker installed ${a} while waiting for lock; skipping` - ); - return; - } - await install(a, repoDir, logger); - }); - } else { + const startTime = Date.now(); + try { + if (useLock) { + // Re-check inside the lock so we skip work a peer worker + // completed while we were waiting. + await withInstallLock(repoDir, getAliasedName(a), logger, async () => { + if (await isInstalled(a, repoDir, logger)) { + logger.debug( + `another worker installed ${a} while waiting for lock; skipping` + ); + return; + } await install(a, repoDir, logger); - } - - const duration = Date.now() - startTime; - logger.success(`autoinstalled ${a} in ${duration / 1000}s`); - context.emit(AUTOINSTALL_COMPLETE, { - module: name, - version: version!, - duration, - }); - } catch (e: any) { - logger.error(`ERROR autoinstalling ${a}: ${e.message}`); - logger.error(e); - const duration = Date.now() - startTime; - context.emit(AUTOINSTALL_ERROR, { - module: name, - version: version!, - duration, - message: e.message || e.toString(), }); - - // Abort on the first error - return onComplete(new AutoinstallError(a, e)); + } else { + await install(a, repoDir, logger); } + + const duration = Date.now() - startTime; + logger.success(`autoinstalled ${a} in ${duration / 1000}s`); + context.emit(AUTOINSTALL_COMPLETE, { + module: name, + version: version!, + duration, + }); + } catch (e: any) { + logger.error(`ERROR autoinstalling ${a}: ${e.message}`); + logger.error(e); + const duration = Date.now() - startTime; + context.emit(AUTOINSTALL_ERROR, { + module: name, + version: version!, + duration, + message: e.message || e.toString(), + }); + + // Abort on the first error + return callback(new AutoinstallError(a, e)); } - onComplete(); - }; + } + callback(); +}; +// Install any modules for an Execution Plan that are not already installed +// This will enforce a queue ensuring only one module is installed at a time +// This fixes https://github.com/OpenFn/kit/issues/503 +const autoinstall = async (context: ExecutionContext): Promise => { const { logger, state, options } = context; const { plan } = state; const { repoDir, whitelist } = options; const autoinstallOptions = options.autoinstall || {}; + const useLock = autoinstallOptions.lockRepo !== false; const versionlookup = autoinstallOptions?.versionLookup || getLatestVersion; @@ -131,7 +130,7 @@ const autoinstall = async (context: ExecutionContext): Promise => { const adaptors = Array.from(identifyAdaptors(plan)); const paths: ModulePaths = {}; - const adaptorsToLoad = []; + const adaptorsToLoad: string[] = []; for (const a of adaptors) { // Ensure that this is not blacklisted if (whitelist && !whitelist.find((r) => r.exec(a))) { @@ -192,8 +191,16 @@ const autoinstall = async (context: ExecutionContext): Promise => { } if (adaptorsToLoad.length) { - // Add this to the queue - const p = enqueue(adaptorsToLoad); + const p = new Promise((resolve) => { + queue.push({ + adaptors: adaptorsToLoad, + callback: resolve, + context, + repoDir, + logger, + useLock, + }); + }); if (!busy) { processQueue(); From dc8f04ff595658b649df1eb4d59f5d0cf3c04f80 Mon Sep 17 00:00:00 2001 From: Stuart Corbishley Date: Fri, 22 May 2026 15:35:24 +0200 Subject: [PATCH 11/11] style: apply prettier to repo-lock and autoinstall Pure whitespace; line-wrap long template strings and object literals so `pnpm test:format` (CI) passes. --- packages/engine-multi/src/api/autoinstall.ts | 4 +++- packages/engine-multi/src/util/repo-lock.ts | 13 ++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/packages/engine-multi/src/api/autoinstall.ts b/packages/engine-multi/src/api/autoinstall.ts index f02167c86..4d186b5e7 100644 --- a/packages/engine-multi/src/api/autoinstall.ts +++ b/packages/engine-multi/src/api/autoinstall.ts @@ -257,7 +257,9 @@ export const isInstalled = async ( if (!dependencies.hasOwnProperty(alias)) { return false; } - return fileExists(path.join(repoDir, 'node_modules', alias, 'package.json')); + return fileExists( + path.join(repoDir, 'node_modules', alias, 'package.json') + ); } }; diff --git a/packages/engine-multi/src/util/repo-lock.ts b/packages/engine-multi/src/util/repo-lock.ts index 2f09da037..527fef193 100644 --- a/packages/engine-multi/src/util/repo-lock.ts +++ b/packages/engine-multi/src/util/repo-lock.ts @@ -53,14 +53,21 @@ export const withInstallLock = async ( } catch (e: any) { if (e.code !== 'ELOCKED') throw e; - logger.info(`waiting for install lock on \`${alias}\` (another worker is installing)`); + logger.info( + `waiting for install lock on \`${alias}\` (another worker is installing)` + ); try { - release = await lockfile.lock(target, { ...lockOpts, retries: LOCK_RETRY_OPTIONS }); + release = await lockfile.lock(target, { + ...lockOpts, + retries: LOCK_RETRY_OPTIONS, + }); } catch (e2: any) { if (e2.code === 'ELOCKED') { throw new Error( - `Lock acquisition timed out after ${MAX_LOCK_WAIT_MS / 1000}s waiting for ${alias}; another worker likely still installing (lock: ${target})` + `Lock acquisition timed out after ${ + MAX_LOCK_WAIT_MS / 1000 + }s waiting for ${alias}; another worker likely still installing (lock: ${target})` ); } throw e2;