diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index c7221b2..1745cd1 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -92,6 +92,24 @@ func run(args []string, sigCtx context.Context) int { return 2 } + // CheckSingleInstance is the deterministic backstop to the + // docs/architecture.md § Single-instance constraint prose half (#64). + // Refuses to start on multi-instance-capable platforms (today: Fly.io, + // signalled by FLY_APP_NAME) unless the operator asserts single-instance + // intent via PYRYCODE_RELAY_SINGLE_INSTANCE=1. Runs after CheckEnvConfig + // so a typo'd bypass fails with the structured config error rather than + // being silently treated as unset; runs before CheckInsecureListenInProduction + // because a deploy-shape misconfiguration is more fundamental than a + // production-mode flag misconfiguration. See + // docs/specs/architecture/65-startup-multi-instance-check.md § Wiring. + if err := relay.CheckSingleInstance(os.Getenv); err != nil { + logger.Error("refusing to start: multi-instance deploy detected", + "err", err, + "bypass_env_var", "PYRYCODE_RELAY_SINGLE_INSTANCE", + "fix", "set PYRYCODE_RELAY_SINGLE_INSTANCE=1 in the deploy manifest (e.g. fly.toml [env]) to assert single-instance intent; see docs/architecture.md § Single-instance constraint") + return 2 + } + if err := relay.CheckInsecureListenInProduction(*insecureListen, os.Getenv); err != nil { logger.Error("refusing to start: production-mode misconfiguration", "err", err, diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index 18d0eef..982b9d9 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [Single-instance startup self-check](features/single-instance-check.md) — deterministic backstop to `docs/architecture.md § Single-instance constraint` (#64). `relay.CheckSingleInstance(getenv)` in `internal/relay/single_instance.go` refuses to start (exit 2, `ErrMultiInstanceDeployDetected`) when the relay detects it is running on a multi-instance-capable platform (today: Fly.io, signalled by `FLY_APP_NAME` non-empty) AND the operator has not asserted single-instance intent via `PYRYCODE_RELAY_SINGLE_INSTANCE=1`. Decision order: bypass exact `"1"` wins unconditionally; else `FLY_APP_NAME` non-empty returns the sentinel; else nil. Sentinel message contains both AC-mandated substrings (`multi-instance deploy detected` + `PYRYCODE_RELAY_SINGLE_INSTANCE`) so `grep -i` over boot logs hits regardless of structured-field rendering. Wired in `cmd/pyrycode-relay/main.go` between `CheckEnvConfig` (so a typo'd bypass like `=true` fails with the structured per-key config error before reaching here) and `CheckInsecureListenInProduction` (because deploy-shape misconfiguration is more fundamental than production-mode flag misconfiguration). `envContracts` in `env_config.go` gains a row for `PYRYCODE_RELAY_SINGLE_INSTANCE` with the same exact-`"1"`-or-unset shape as `envProductionMode`; the env-name constant lives in `single_instance.go` and is referenced by the registry row (single-source-of-truth invariant). The bypass constant carries `// #nosec G101 -- env var name, not a credential` to silence gosec's literal-credential false positive. **Heuristic, not adversarial**: catches "operator deployed to Fly and never read `architecture.md`"; does NOT catch "operator ran `fly scale count 3` after configuring the bypass in `fly.toml [env]`" — once the bypass propagates via the manifest, all replicas inherit it. The ticket Technical Notes precluded a Fly Machines API client or DNS-against-`.internal` mechanism that could catch runtime scale-out; honest framing is "explicit deploy-time gate, not a runtime instance count". Fly.io only today; K8s/ECS/Lambda signals deferred until a non-Fly substrate is considered (speculative signals would false-positive in local-dev shells). Deterministic half of the belt-and-suspenders pair shipped jointly with #64's prose; both halves agree on the literal env var name (`PYRYCODE_RELAY_SINGLE_INSTANCE`) as the shared contract surface. Out of scope: updating `fly.toml [env]` to set the bypass (rides with `docs/deploy.md` bootstrap procedure); reconciling `architecture.md`'s "NOT recommended for production" framing with the deployed reality that the bypass must be set permanently in the Fly manifest (separate XS doc-only follow-up — code is the contract source). Tests: 7-row table matrix in `single_instance_test.go` covering AC #5(a)/(b)/(c) plus bypass-set-no-signal no-op plus three non-`"1"` bypass values (`"true"`, `"0"`, `" 1"`) to lock that `CheckSingleInstance` is strict-equality independently of `CheckEnvConfig`; AC-substring assertion as its own canary test; new row in the `env_config_test.go` malformed-value matrix locks the registry entry (#65). - [Graceful shutdown on SIGTERM](features/graceful-shutdown.md) — `SIGTERM` / `SIGINT` triggers a bounded drain: stop accepting new connections, emit `1001 StatusGoingAway` on every live WS conn, give peers a configurable window (default 10s) to ack, then force-close and exit. Closes the "graceful-shutdown signal handler" gap that #47, #57, and #60 deferred to this ticket. Three pieces: (a) `func (r *Registry) Snapshot() []Conn` — freshly-allocated slice of every registered binary + phone conn, RLock-scoped, nil on empty, mirrors `PhonesFor`'s shape; preserves ADR-0003's passive-registry contract (slow work happens outside the lock). (b) `func relay.Shutdown(ctx, logger, reg, servers...) error` in `internal/relay/shutdown.go` — concurrently invokes `srv.Shutdown(ctx)` on each `*http.Server` (closes listeners synchronously, waits for non-hijacked handlers) AND fans `CloseWithCode(StatusGoingAway, "shutting down")` out across `reg.Snapshot()` (one goroutine per conn; concurrent fan-out is load-bearing because `nhooyr.io/websocket.Conn.Close` blocks up to 5s per conn for the peer's reciprocal handshake — serial iteration would scale linearly in conn count and blow past the deadline at the second stuck peer). A type-assertion seam against the private `gracefulCloser` interface (`CloseWithCode(websocket.StatusCode, string)`) lets `*WSConn` (#7, ADR-0007) take the code-aware path while test fakes and any future narrow-`Conn` impls fall back to plain `Close()`. Returns `nil` on clean drain, `ctx.Err()` on deadline expiry — and on the deadline path explicitly iterates `srv.Close()` per server because `http.Server.Shutdown` does NOT internally force-close its listeners after ctx expiry (the unit test asserts wall-clock < 300ms after a 50ms deadline specifically to pin this). Errors from `srv.Shutdown` are logged but never abort the drain — every server gets a Shutdown call, every conn gets a close call. (c) `cmd/pyrycode-relay/main.go` refactor splitting `main` into a testable `run(args []string, sigCtx context.Context) int` entry point invoked from a one-line `main` (`os.Exit(run(os.Args[1:], signalContextFor(SIGTERM, SIGINT)))`); the split exists so `main_e2e_test.go` can drive the full boot+drain sequence with a synthetic `context.Context` instead of forking a subprocess and sending real signals. `runServers(sigCtx, logger, reg, servers, listen)` is the single sink for both `--insecure-listen` and `--domain` modes — launches one goroutine per `*http.Server` running the `listen` closure (`ListenAndServe` or `ListenAndServeTLS("", "")` for the autocert HTTPS server), forwards the first non-`ErrServerClosed` listener error to a 1-slot buffered channel (first error wins, additional non-blocking sends drop), `select`s on `<-sigCtx.Done()` vs `<-listenerErr`, then builds `context.WithTimeout(_, drainDeadline)` and calls `relay.Shutdown(...)`. Exit codes: 0 for signal-triggered drain (clean operator action), 1 for listener-error-triggered drain (process supervisor restart) — preserves today's `os.Exit(1)` semantics on listener errors while enabling clean shutdown on signals. `drainDeadline = 10 * time.Second` declared as a `const` at the top of `main.go` per the established policy-values-at-wiring-site convention (#21, #60, #79); leaves ~5s of headroom beyond a single close-handshake. The five pre-existing `os.Exit(1)` calls inside listener goroutines collapse into the buffered-error channel — and `defer limiter.Close()` (introduced by #47) now actually runs on every shutdown path. Shutdown ordering is structural, not enforced by a barrier: the listener-close is in flight before the snapshot is taken, so no new WS handshake can complete between snapshot and close. Handler goroutines are NOT joined by the helper — `CloseWithCode` unblocks the handler's `Read`, the handler unwinds through its existing `defer reg.ScheduleReleaseServer / wsconn.Close` chain, and anything still alive at process exit is reaped by the kernel. A second SIGTERM during drain is ignored (`NotifyContext` cancels its context once); SIGKILL bypasses Go entirely. Empty registry is a no-op (`Snapshot()` returns nil, fan-out has zero goroutines, helper returns nil promptly). Tests: `shutdown_test.go` (empty-registry, all-conns-get-StatusGoingAway, deadline-expiry-with-wall-clock-bound, real-`*WSConn`-idempotency under race), `registry_test.go` (Snapshot empty/membership/isolation/race-freedom), `main_e2e_test.go` (full boot + WS dial + cancel-sigCtx + assert `websocket.CloseStatus(err) == StatusGoingAway` + assert exit 0 within drainDeadline+2s). Out of scope: protocol-level reconnect/resume coordination (lives in `protocol-mobile.md`), in-flight frame persistence (relay is stateless), `--drain-deadline` CLI flag (10s constant for v1; promote when ops experience demands), SIGHUP config reload (no runtime-reloadable config exists). (#31). - [Upgrade-attempt and register-failure counters](features/upgrade-and-register-failure-counters.md) — two CounterVecs at the `/v1/server` + `/v1/client` upgrade call sites so operators can distinguish "flood of malformed headers" from "flood of 4409 conflicts" from "flood of 4429 phone-cap rejections" from "flood of rate-limited clients" without log-grepping. `pyrycode_relay_ws_upgrade_attempts_total{endpoint, outcome}` (`endpoint` ∈ `{server, client}`, `outcome` ∈ `{accept, reject_headers, reject_409, reject_404, reject_429, reject_rate_limit}`) is incremented exactly once per terminal decision; `pyrycode_relay_register_failures_total{kind}` (`kind` ∈ `{no_server, server_in_use, phones_at_cap, ratelimit}`) co-increments in lockstep with the matching `outcome=reject_*` cell. All 16 series are pre-bound via `WithLabelValues` in `NewUpgradeMetrics(reg prometheus.Registerer)` — including the three structurally unreachable cells (`server×reject_404`, `server×reject_429`, `client×reject_409`) which stay at 0 by construction — so ops dashboards see a stable scrape shape rather than series flickering into existence on first occurrence. Nine event-named, nil-receiver-safe methods (`ServerAccept`, `ServerHeaderReject`, `ServerIDConflict`, `ServerRateLimitDeny`, `ClientAccept`, `ClientHeaderReject`, `ClientNoServer`, `ClientPhonesAtCap`, `ClientRateLimitDeny`); the five composite methods are the lockstep contract — a single call bumps both the endpoint×outcome and kind cells, so tests cannot accidentally advance one without the other. **Two load-bearing seam choices**: (a) handler-parameter threading (`*UpgradeMetrics` as the final arg to `ServerHandler` / `ClientHandler`) instead of #58's hooks-on-`Registry`, because three of the five terminal sites (header-reject, success, rate-limit-deny) have NO Registry call to hook through — fan-out is 7 mechanical `, nil` test appends, under the 10-call-site red line; (b) `WrapServerRateLimitDeny` / `WrapClientRateLimitDeny` as a status-code observer wrapper stacked OUTSIDE the rate-limit middleware (#47), because adding a callback parameter to `NewRateLimitMiddleware` would cascade across 11 call sites — HTTP 429 in either pipe is unambiguously the rate-limit deny signal (header-gate writes 400, success writes 101, WS close codes 4404/4409/4429 travel inside the upgraded socket). The wrapper's `statusObserverResponseWriter` **must implement `http.Hijacker`** because the wrapped pipe terminates in `websocket.Accept` which type-asserts the writer for hijacking; without forwarding, the upgrade fails with HTTP 500 — a constraint that applies to any future middleware wrapping a WS upgrade. Increments fire structurally tight to the deciding branch (between `http.Error`/`logger.Info` and `return`) so "did this code path execute?" is one statement from "did the counter increment?". All three label values (`endpoint`, `outcome`, `kind`) are hard-coded compile-time constants set per event-named method — none derived from request headers, request paths, or any other attacker-influenced input; cardinality is exactly 16 regardless of traffic. Registered against the private `metricsReg` per ADR-0008 § Scope of use; package-level `TestMetricsRegistry_NoGlobalRegistrarLeak` plus a local sibling `TestUpgradeMetrics_NoGlobalRegistrarLeak` enforce no `DefaultRegisterer` leak structurally. The defensive `wsconn.Close()` non-`Is`-matched branches in both handlers stay MUST-NOT-increment — a future error variant would be a new outcome and belongs in a follow-up ticket. Out of scope: latency histograms for the upgrade handshake, a second HTTP-429 source on the upgrade pipes (would invalidate the observer's filter), source-IP-labelled attempt counters (would carry attacker-influenced label values) (#57). - [Per-IP rate-limit middleware (`/v1/server` + `/v1/client`)](features/rate-limit-middleware.md) — HTTP middleware that throttles WS upgrade attempts per source IP **before** `websocket.Accept` runs, before the registry is touched, before the mobile/binary token header is read. One new exported symbol: `relay.NewRateLimitMiddleware(limiter *IPRateLimiter, logger *slog.Logger, trustForwardedFor bool) func(http.Handler) http.Handler` — returns the conventional composition shape, matching `EnforceHost`. Per request: extract `ip := ClientIP(r, trustForwardedFor)`, deny on `ip == ""` (empty-IP guard runs **before** `Allow` because the limiter's `Allow("")` is a normal map key, not a deny — loud-failure rule: refuse un-attributable traffic rather than admit it un-throttled), deny on `!limiter.Allow(ip)`, otherwise `next.ServeHTTP`. Both deny paths emit one `logger.Warn("rate_limited", "remote", strconv.Quote(ip))` line — single allowlisted key, `strconv.Quote` defends against control-byte log injection, the empty-IP case renders `remote="\"\""` so operators can distinguish the two deny causes by the value. 429 body is empty (`http.Error(w, "", http.StatusTooManyRequests)` — matches the existing `BadRequest` shape in `server_endpoint.go` / `client_endpoint.go`, no internal state leaked). New `--trust-x-forwarded-for` CLI flag (default `false`, Usage carries the explicit spoofing warning) is threaded once into the factory. Policy lives at the wiring site in `cmd/pyrycode-relay/main.go`: `rateLimitRefillEvery = 6s`, `rateLimitBurst = 20`, `rateLimitEvictionInterval = 5min` → ~10 sustained attempts/IP/min with 20-attempt burst headroom; `5min` sweep is well above `burst*refillEvery = 120s`. One shared `*IPRateLimiter` is applied to both `mux.Handle` registrations — a misbehaving IP retrying against either endpoint shares one bucket; `/healthz` is intentionally NOT wrapped (must remain pollable from monitoring). `defer limiter.Close()` is best-effort (current `os.Exit` listener path skips defers; graceful shutdown out of scope). Concurrency safety fully delegated to the limiter; middleware is stateless, no new locks, no new goroutines. Mobile/binary `X-Pyrycode-Token` structurally not read on this code path. Why middleware not constructor injection: keeps frame-routing handlers ignorant of admission control, makes `/healthz`'s exemption fall out for free, mirrors `EnforceHost`. Closes the third leg of `docs/threat-model.md` § *DoS resistance* alongside #29 (per-frame size cap) and #30 (per-server-id phone cap); the "token-bucket rate limit on /v1/server and /v1/client" line moves from *Future hardening* to v1 mitigation. Out of scope: connection-count cap (different threat surface — *attempt rate* vs *resident count*), CIDR-aware trusted-proxy chain (today's flag is flat all-or-nothing), multi-instance shared-state (would need Redis), rate-limit metrics counter (future ticket parallel to #58), adaptive policy, graceful-shutdown signal handler (#47). diff --git a/docs/knowledge/codebase/65.md b/docs/knowledge/codebase/65.md new file mode 100644 index 0000000..90855e0 --- /dev/null +++ b/docs/knowledge/codebase/65.md @@ -0,0 +1,37 @@ +# Ticket #65 — startup self-check refuses to run as multi-instance deploy + +Deterministic backstop to the stochastic prose half landed in #64. New boot-time `Check*` helper in `internal/relay/single_instance.go` that refuses to start (exit 2) when the relay detects it is running on a multi-instance-capable platform (today: Fly.io, signalled by `FLY_APP_NAME`) and the operator has not asserted single-instance intent via `PYRYCODE_RELAY_SINGLE_INSTANCE=1`. Closes the "operator runs `fly scale count 3` and silently breaks server-id routing" failure mode against the in-memory per-process registry. + +## Implementation + +- **New file `internal/relay/single_instance.go` (~50 lines).** Two unexported env-name constants (`envSingleInstanceBypass = "PYRYCODE_RELAY_SINGLE_INSTANCE"`, `envFlyAppName = "FLY_APP_NAME"`), exported sentinel `ErrMultiInstanceDeployDetected` whose message contains both AC-mandated substrings (`multi-instance deploy detected` and `PYRYCODE_RELAY_SINGLE_INSTANCE`), and `CheckSingleInstance(getenv func(string) string) error`. Decision order: bypass exact `"1"` wins → else `FLY_APP_NAME` non-empty → sentinel; else nil. The bypass constant carries `// #nosec G101 -- env var name, not a credential` to silence gosec's literal-credential heuristic — same false-positive class as #80's existing env constants. +- **`internal/relay/env_config.go`** — one new row appended to `envContracts` for the bypass env var, referencing the constant declared in `single_instance.go` (single-source-of-truth invariant preserved). Same shape as `envProductionMode`: `required: false`, validator accepts exact `"1"` or unset. A `=true` typo therefore fails `CheckEnvConfig` with a structured `*ErrInvalidConfig{Key, Reason}` *before* `CheckSingleInstance` reads the value. +- **`cmd/pyrycode-relay/main.go`** — `CheckSingleInstance(os.Getenv)` wired between `CheckEnvConfig` and `CheckInsecureListenInProduction`. Error-log block mirrors the established refuse-to-start pattern (`logger.Error` + literal substring in the message + structured `bypass_env_var` / `fix` fields + `return 2`). The wiring comment names both ordering constraints inline so a future reordering touches a load-bearing comment, not silent code. +- **Tests.** `internal/relay/single_instance_test.go` — 7-row table matrix exercising AC #5(a)/(b)/(c), the bypass-set-no-signal no-op, and three non-`"1"` bypass values (`"true"`, `"0"`, `" 1"`) to lock that `CheckSingleInstance` itself is strict-equality even though `CheckEnvConfig` would catch them earlier. Two separate tests assert (a) the message contains both AC-mandated substrings and (b) the sentinel is `errors.Is`-branchable. `internal/relay/env_config_test.go` — one new test (`TestCheckEnvConfig_SingleInstanceBypassMalformedValue`) locks the new envContracts row. Re-uses `fakeGetenv` from `production_test.go` and `fakeLookup` from `env_config_test.go` — no new test helpers. + +## Patterns established / reinforced + +- **Belt-and-suspenders means different fabric, demonstrated end-to-end.** #64 shipped the stochastic advisory (prose in `docs/architecture.md`); #65 ships the deterministic enforcement (refuse-to-start). Both halves share the same env var name (`PYRYCODE_RELAY_SINGLE_INSTANCE`) but landed in two PRs across two tickets. The merge-second-fixes-it contract (codified in #64's note) held: #64 landed first, the env var name was already pinned, #65 implements without renaming. +- **Env-var-name constants live in their feature file, not in `env_config.go`.** `envSingleInstanceBypass` is declared in `single_instance.go`; the env-config registry row references it. This is the same invariant `envProductionMode` follows (declared in `production.go`, used from `env_config.go`). New env vars added to the registry must declare their literal in the feature file, not at the registry row. +- **AC-substring assertions as their own test.** `TestErrMultiInstanceDeployDetected_MessageContainsACSubstrings` is a one-purpose canary that locks the error message's grep-anchor substrings against future prose rewrites. Cheap, mechanical, survives any future reword that does not break the substring contract. Same shape as #64's literal-grep AC checks at the docs side. + +## Lessons learned + +- **`// #nosec G101` is the right tool for env-var-name constants that look like credentials.** The bypass env var name `PYRYCODE_RELAY_SINGLE_INSTANCE` tripped gosec G101 ("Potential hardcoded credentials") on the security-scan job — the same false-positive shape `envProductionMode` carries. The fix is the inline `// #nosec G101 -- env var name, not a credential` comment, not a global rule disable. Shipped as a follow-up commit on this branch; bake the comment in on the first commit next time an env-var-name constant is introduced. +- **Heuristic-detection limits belong in the docs, not as a code TODO.** The check fires on `FLY_APP_NAME` presence — sufficient to catch "operator deployed to Fly and never read `architecture.md`", insufficient to catch "operator ran `fly scale count 3` after configuring the bypass in `fly.toml [env]`". The latter is in the same trust class as the operator deleting the binary. Documenting that explicit scope boundary in the feature doc (rather than a code-side `TODO: detect scale-out at runtime`) prevents a future contributor from "fixing" the gap by adding the Fly Machines API client the ticket forbids. +- **A code spec can create tension with a prior doc spec; that tension is a follow-up ticket, not a code change.** `docs/architecture.md` § Single-instance constraint (landed in #64) calls the bypass "NOT recommended for production". With `FLY_APP_NAME` as the signal, the production Fly deploy must set the bypass permanently for the relay to start. The architect's open-question note flagged this as a separate doc-only follow-up against `architecture.md`. Resisting the urge to silently rewrite the prose in the same PR kept the diff scoped and left the reconciliation visible. + +## Out of scope (filed as follow-ups by the spec) + +- **Updating `fly.toml` `[env]` to set the bypass.** Production Fly deploy will refuse to start until the manifest is updated, but the existing `__REGION__` / `__DOMAIN__` placeholders fail first-deploy regardless. Rides alongside the bootstrap procedure in `docs/deploy.md`. +- **Reconciling `architecture.md`'s "NOT recommended for production" framing.** Separate XS doc-only ticket; this code ticket is the contract source, the doc follows. +- **Other multi-instance-capable platforms** (K8s, ECS, Lambda). Defer until a non-Fly substrate is considered; speculative signals create false positives in unusual local-dev shells. + +## Cross-links + +- [Single-instance startup self-check (feature doc)](../features/single-instance-check.md) — evergreen behaviour, wiring, threat model. +- [Architecture § Single-instance constraint (v1)](../../architecture.md#single-instance-constraint-v1) — prose half (#64). +- [Codebase note #64](64.md) — the doc-only stochastic half this ticket pairs with. +- [Spec: 65-startup-multi-instance-check](../../specs/architecture/65-startup-multi-instance-check.md) — architect's design (file structure, decision order, wiring rationale, test matrix). +- [`internal/relay/single_instance.go`](../../../internal/relay/single_instance.go) — implementation. +- [`cmd/pyrycode-relay/main.go`](../../../cmd/pyrycode-relay/main.go) — wiring site (between `CheckEnvConfig` and `CheckInsecureListenInProduction`). diff --git a/docs/knowledge/features/env-config-validator.md b/docs/knowledge/features/env-config-validator.md index 7cf9a50..61acd01 100644 --- a/docs/knowledge/features/env-config-validator.md +++ b/docs/knowledge/features/env-config-validator.md @@ -4,7 +4,7 @@ Structured, table-driven validation of every env var the relay reads at boot. In ## What it catches -The contract table is the single source of truth for "what env vars the relay reads". Today it has one row (`PYRYCODE_RELAY_PRODUCTION`); future env-var reads register here. For each entry the validator distinguishes three states: +The contract table is the single source of truth for "what env vars the relay reads". Today it has two rows (`PYRYCODE_RELAY_PRODUCTION` from #80; `PYRYCODE_RELAY_SINGLE_INSTANCE` from #65, same exact-`"1"`-or-unset shape); future env-var reads register here. For each entry the validator distinguishes three states: - **Unset.** If `required: true`, returns `*ErrInvalidConfig{Reason: "missing"}`. If `required: false`, skipped. - **Set-and-non-empty.** Runs the per-key `validate(value)`; on `error`, returns `*ErrInvalidConfig{Reason: "malformed-value: "}`. @@ -32,7 +32,7 @@ Mixing the two seam shapes is intentional; do not refactor `IsProductionMode` to ## Wiring (`cmd/pyrycode-relay/main.go`) -After flag-parse, **before** `CheckInsecureListenInProduction` (#77) and `CheckCapabilities` (#79): +After flag-parse, **before** `CheckSingleInstance` (#65), `CheckInsecureListenInProduction` (#77), `CheckRunningAsRoot` (#78) and `CheckCapabilities` (#79): ```go if err := relay.CheckEnvConfig(os.LookupEnv); err != nil { @@ -49,7 +49,7 @@ if err := relay.CheckEnvConfig(os.LookupEnv); err != nil { } ``` -- **Ordering is load-bearing.** `CheckInsecureListenInProduction` consults `IsProductionMode`, which treats every non-`"1"` value as "not production". If the operator wrote `PYRYCODE_RELAY_PRODUCTION=true`, the insecure-listen guard returns nil and a plaintext production listener boots. Running `CheckEnvConfig` first catches the malformed value and the dangerous path is never reached. Code review enforces. +- **Ordering is load-bearing.** Downstream `Check*` helpers (`CheckInsecureListenInProduction`, `CheckSingleInstance`) treat every non-`"1"` value as "not set" — silent fall-through, not refusal. If the operator wrote `PYRYCODE_RELAY_PRODUCTION=true`, the insecure-listen guard would return nil and a plaintext production listener would boot; similarly `PYRYCODE_RELAY_SINGLE_INSTANCE=true` would let a multi-instance deploy through. Running `CheckEnvConfig` first catches malformed values and the dangerous paths are never reached. Code review enforces. - **Exit code 2** matches the sibling boot-time refusals; exit 1 is reserved for runtime failures. - **`errors.As` extraction at the log site**, not in the validator. The structured fields are `err`, `env_var` (the key), and `reason` (the per-key prose). The fallback branch (no `errors.As` hit) is defensive against a future refactor that wraps the error in something opaque; under the current design it is unreachable. - **No `value` log field.** The reason string already echoes the malformed value (`got "true"`) where safe; the `env_var` field carries the name only. Future entries whose values could carry secrets must keep raw bytes out of the reason string — describe the shape violation in operator-facing terms instead (`"expected hex-encoded 32-byte value"`, not `got %q`). @@ -67,7 +67,9 @@ Code review enforces "every env-var read in the binary is registered here" by re ## Behaviour matrix (today) -| `PYRYCODE_RELAY_PRODUCTION` | `CheckEnvConfig` result | +Both registered env vars share the same `exact "1" or unset` shape, so the matrix is identical for either key: + +| Value (applies to `PYRYCODE_RELAY_PRODUCTION` and `PYRYCODE_RELAY_SINGLE_INSTANCE`) | `CheckEnvConfig` result | |---|---| | unset | nil | | `"1"` | nil | @@ -76,11 +78,12 @@ Code review enforces "every env-var read in the binary is registered here" by re ## Why not a `Config` struct (yet) -A natural extension is a typed `relay.Config` populated from the env at boot, validated as a single step, and threaded through the binary. **Deferred.** The relay has one env var today; the ticket body promises "future env-var additions register here", which means the contract table is the registry, not a typed struct. When env-var count crosses ~3 *and* a downstream caller wants a typed snapshot, a follow-up ticket consolidates. Doing it now is premature. +A natural extension is a typed `relay.Config` populated from the env at boot, validated as a single step, and threaded through the binary. **Deferred.** The relay has two env vars today; the ticket body promises "future env-var additions register here", which means the contract table is the registry, not a typed struct. When env-var count crosses ~3 *and* a downstream caller wants a typed snapshot, a follow-up ticket consolidates. Doing it now is premature. ## Cross-links - [Production-mode contract & `--insecure-listen` startup refusal](production-mode.md) — first env-var contract (#77); the malformed-value cases listed there are now caught by `CheckEnvConfig` before `IsProductionMode` is consulted. +- [Single-instance startup self-check](single-instance-check.md) — second env-var contract (#65); same exact-`"1"`-or-unset shape, second consumer of the registry. - [Linux capability allowlist](capability-allowlist.md) — sibling boot-time refusal (#79); same wiring shape, runs after env-config validation. - [Codebase note #80](../codebase/80.md) — implementation summary and lessons. - [Codebase note #77](../codebase/77.md) — env-var contract precedent (`PYRYCODE_RELAY_PRODUCTION`). diff --git a/docs/knowledge/features/production-mode.md b/docs/knowledge/features/production-mode.md index 958abd0..1dc8053 100644 --- a/docs/knowledge/features/production-mode.md +++ b/docs/knowledge/features/production-mode.md @@ -25,7 +25,7 @@ The check takes a `func(string) string` rather than calling `os.Getenv` directly ## Wiring (`cmd/pyrycode-relay/main.go`) -Boot-time check ordering, top to bottom: `CheckEnvConfig` (#80) → `CheckInsecureListenInProduction` (#77) → `CheckRunningAsRoot` (#78) → `CheckCapabilities` (#79). Each `if err != nil` branch logs at error level and `os.Exit(2)`s; no listener has been opened yet at any point in the sequence. +Boot-time check ordering, top to bottom: `CheckEnvConfig` (#80) → `CheckSingleInstance` (#65) → `CheckInsecureListenInProduction` (#77) → `CheckRunningAsRoot` (#78) → `CheckCapabilities` (#79). Each `if err != nil` branch logs at error level and `os.Exit(2)`s; no listener has been opened yet at any point in the sequence. ```go if err := relay.CheckInsecureListenInProduction(*insecureListen, os.Getenv); err != nil { @@ -84,7 +84,7 @@ The checks are fail-closed: if any precondition trips, the relay refuses to boot ## Out of scope (deferred) -- **No `Config` struct.** A bundled `relay.Config` with `Validate()` returning a multi-error is a natural extension once ~5 startup checks exist. With four today (`CheckEnvConfig`, `CheckInsecureListenInProduction`, `CheckRunningAsRoot`, `CheckCapabilities`), the wiring boilerplate is approaching the cost threshold; a follow-up ticket will consolidate when it crosses. +- **No `Config` struct.** A bundled `relay.Config` with `Validate()` returning a multi-error is a natural extension once ~5 startup checks exist. With five today (`CheckEnvConfig`, `CheckSingleInstance`, `CheckInsecureListenInProduction`, `CheckRunningAsRoot`, `CheckCapabilities`), the wiring boilerplate has reached the cost threshold; a follow-up ticket will consolidate. - **No fork-exec integration test on `main.go`.** The `main` wiring is observable via package-level unit tests; one `if err != nil { os.Exit(2) }` block does not warrant a binary-spawning test. - **No `IsRunningAsRoot` bool helper.** A second consumer hasn't appeared (no log line branches on it, no metric labels it); exporting it today would be dead surface area and a temptation to call it without the production-mode guard tomorrow. Inline the predicate inside `CheckRunningAsRoot` until a second consumer materialises. diff --git a/docs/knowledge/features/single-instance-check.md b/docs/knowledge/features/single-instance-check.md new file mode 100644 index 0000000..64d2637 --- /dev/null +++ b/docs/knowledge/features/single-instance-check.md @@ -0,0 +1,88 @@ +# Single-instance startup self-check + +Boot-time refusal that fires when the relay detects it is running on a multi-instance-capable platform (today: Fly.io) and the operator has not asserted single-instance intent via `PYRYCODE_RELAY_SINGLE_INSTANCE=1`. Deterministic backstop to the [`docs/architecture.md` § Single-instance constraint](../../architecture.md#single-instance-constraint-v1) prose half (#64). Closes the foot-gun where `fly scale count 3` silently breaks server-id routing because the connection registry (`internal/relay/registry.go`) is in-memory per process — two replicas hold disjoint server-id tables and a phone routed to the wrong replica closes with `4404` even though the binary is online. + +## Contract + +- **Bypass env var:** `PYRYCODE_RELAY_SINGLE_INSTANCE`. The literal name is pinned by `docs/architecture.md` (the doc and the code agree on the wire). Exact string `"1"` bypasses; anything else is treated as not set. +- **Multi-instance signal:** `FLY_APP_NAME` non-empty. Fly's platform sets this on every machine in an app, which is also the substrate where `fly scale count > 1` is a one-line command. Presence is sufficient evidence we are on a multi-instance-capable platform; the value is not inspected. +- **Decision order:** bypass wins unconditionally. If `PYRYCODE_RELAY_SINGLE_INSTANCE=1`, return nil even when the platform signal is present. Otherwise, if the platform signal is present, return `ErrMultiInstanceDeployDetected`. Otherwise nil. + +## API (`internal/relay/single_instance.go`) + +- `CheckSingleInstance(getenv func(string) string) error` — exported helper. Signature mirrors `CheckInsecureListenInProduction` / `CheckRunningAsRoot`. Pure: two env reads, one string comparison, no goroutines, no I/O. +- `ErrMultiInstanceDeployDetected` — exported sentinel, branchable via `errors.Is`. Message: `"relay: multi-instance deploy detected; set PYRYCODE_RELAY_SINGLE_INSTANCE=1 to bypass"`. Both the AC-mandated substring `multi-instance deploy detected` AND the literal env-var name `PYRYCODE_RELAY_SINGLE_INSTANCE` are present in the message so `grep -i 'multi-instance deploy detected'` over boot logs hits regardless of structured-field rendering. +- `envSingleInstanceBypass` / `envFlyAppName` (unexported) — the env-var-name constants. The bypass constant carries a `// #nosec G101` comment to silence gosec's "looks like a credential" false-positive on the literal string. + +## Wiring (`cmd/pyrycode-relay/main.go`) + +Boot-time check ordering, top to bottom: + +1. `CheckEnvConfig` (#80) +2. **`CheckSingleInstance` (#65)** — this check +3. `CheckInsecureListenInProduction` (#77) +4. `CheckRunningAsRoot` (#78) +5. `CheckCapabilities` (#79) + +Position rationale: + +- **After `CheckEnvConfig`** so a typo'd bypass (`=true`, `=0`, `= 1`) fails with the structured per-key config error rather than being silently treated as "not set" and tripping the multi-instance refusal with a misleading log line. +- **Before `CheckInsecureListenInProduction`** because a deploy-shape misconfiguration (multi-instance) is more fundamental than a production-mode misconfiguration (insecure-listen-in-prod). If both fire, the operator should see the multi-instance error first; fixing it may not be possible without changing the deploy substrate, while insecure-listen is a flag flip. + +Error-log shape (mirrors the established refuse-to-start pattern): + +```go +if err := relay.CheckSingleInstance(os.Getenv); err != nil { + logger.Error("refusing to start: multi-instance deploy detected", + "err", err, + "bypass_env_var", "PYRYCODE_RELAY_SINGLE_INSTANCE", + "fix", "set PYRYCODE_RELAY_SINGLE_INSTANCE=1 in the deploy manifest (e.g. fly.toml [env]) to assert single-instance intent; see docs/architecture.md § Single-instance constraint") + return 2 +} +``` + +Exit code 2 matches the sibling boot-time refusals (config-rejected-at-boot vs. exit 1 for runtime failures). + +## Env-config registry row + +`envContracts` in `internal/relay/env_config.go` gains a row for `PYRYCODE_RELAY_SINGLE_INSTANCE` — `required: false`, validator accepts exact `"1"` or unset, rejects everything else. Same shape as `envProductionMode`. The row references the unexported constant declared in `single_instance.go` so the literal env var name lives in exactly one place. A `=true` typo therefore fails `CheckEnvConfig` with a structured per-key error before `CheckSingleInstance` reads the value. + +## Behaviour matrix + +| `PYRYCODE_RELAY_SINGLE_INSTANCE` | `FLY_APP_NAME` | Result | +|---|---|---| +| unset | unset | nil (common single-instance case) | +| unset | set | `ErrMultiInstanceDeployDetected` → exit 2 | +| `"1"` | unset | nil (bypass is benign when the check would have passed) | +| `"1"` | set | nil (bypass asserts intent) | +| anything else (`"true"`, `"0"`, `" 1"`, …) | any | caught earlier by `CheckEnvConfig` (exit 2 with structured per-key error) | + +The "anything else" row is the defence-in-depth case: `CheckSingleInstance` treats every non-`"1"` value as "not set" even if `CheckEnvConfig` were ever to be reordered or removed. The matrix's bottom row reflects today's wiring; the underlying contract is "exact-`"1"` bypass" at both layers. + +## Scope and limits + +- **Heuristic, not adversarial.** The check catches "operator deployed to Fly and never read `architecture.md`". It does NOT catch "operator ran `fly scale count 3` after configuring the bypass in `fly.toml [env]`" — once the bypass propagates via the manifest, all replicas inherit it. The constraint to use only env vars exposed by the host (no Fly Machines API client, no DNS query against `.internal`) precludes the only mechanism that could catch the scale-out case at runtime. The honest framing: this is an *explicit deploy-time gate*, not a runtime instance count. +- **Fly.io only today.** Only `FLY_APP_NAME` is in the signal set. Kubernetes (`KUBERNETES_SERVICE_HOST`), ECS (`ECS_CONTAINER_METADATA_URI`), AWS Lambda (`AWS_LAMBDA_FUNCTION_NAME`) would be candidates if the relay is ever ported. Signal-set is intentionally narrow — adding signals on speculation would create false positives for operators running the binary in unusual local-dev shells. Defer until a non-Fly substrate is actually considered. +- **No new dependencies.** No Fly Machines API client, no platform-specific SDK; the detection mechanism is local env-var reads only, per ticket Technical Notes. + +## Threat model alignment + +The check reads process env vars set by the deploy substrate (Fly's platform sets `FLY_APP_NAME`; the operator sets the bypass). Both are out-of-band relative to the relay's internet-exposed surface — a remote attacker on `/v1/server` or `/v1/client` cannot influence either value. The bypass is fail-open by design: the threat model assumes an adversary capable of setting process env vars already has shell access to the deploy host, at which point the multi-instance refusal is not the load-bearing control. The bypass is an operator-discipline tool, not a security boundary. + +No env-var *values* are interpolated into the log line — only the literal bypass-env-var name and the sentinel error. Log-injection is structurally impossible at this site. + +## Out of scope (deferred) + +- **`fly.toml [env] PYRYCODE_RELAY_SINGLE_INSTANCE = "1"` block.** Once this ticket lands, the production Fly deploy will refuse to start until the manifest sets the bypass. The `__REGION__` / `__DOMAIN__` placeholders in `fly.toml` would fail the first deploy regardless; the bootstrap procedure in `docs/deploy.md` handles both at once. +- **Reconciling `docs/architecture.md`'s "NOT recommended for production" framing.** That section, written before this code spec, characterises the bypass as an emergency / migration-window tool. With `FLY_APP_NAME` as the multi-instance signal, the bypass must be set *permanently* in the Fly manifest for the production deploy to start at all. The framing reconciliation is a separate doc-only follow-up ticket against `docs/architecture.md § Single-instance constraint`; this code-side ticket landed first as the contract source. +- **Shared-registry / sticky-session multi-instance support.** Tracked in `docs/architecture.md § What multi-instance would require`; this ticket is the v1 single-instance backstop. + +## Cross-links + +- [Architecture § Single-instance constraint (v1)](../../architecture.md#single-instance-constraint-v1) — the doc half (#64); shares the `PYRYCODE_RELAY_SINGLE_INSTANCE` env var contract. +- [Production-mode contract & startup refusals](production-mode.md) — sibling boot-time refusals (#77 / #78); the `func(string) string` getter shape and exact-`"1"` contract this check mirrors. +- [Env-var config validator](env-config-validator.md) — #80; polices the `=true` / `=0` / `= 1` typo cases before this check reads the value. +- [Listener port allowlist](listener-port-allowlist.md) — sibling boot-time refusal (#81); same exit-2-and-structured-log shape. +- [Codebase note #65](../codebase/65.md) — per-ticket implementation detail. +- [`internal/relay/single_instance.go`](../../../internal/relay/single_instance.go) — implementation. +- [Spec: 65-startup-multi-instance-check](../../specs/architecture/65-startup-multi-instance-check.md) — architect's design. diff --git a/docs/specs/architecture/65-startup-multi-instance-check.md b/docs/specs/architecture/65-startup-multi-instance-check.md new file mode 100644 index 0000000..1419c58 --- /dev/null +++ b/docs/specs/architecture/65-startup-multi-instance-check.md @@ -0,0 +1,107 @@ +# Spec: startup self-check refuses to run as multi-instance deploy + +Ticket: [#65](https://github.com/pyrycode/pyrycode-relay/issues/65). Size XS. Split from #39. Sibling of merged #64 (the doc half of the belt-and-suspenders pair). + +## Files to read first + +- `internal/relay/production.go` (whole file, 78 lines) — the canonical "boot-time Check\*" helper shape this spec mirrors. Lines 5–9 show the `envXxx` constant convention (env var name lives in `internal/relay`, not duplicated at the call site); lines 19–32 show the exact-`"1"` getter contract; lines 41–46 / 72–77 show the `Check*(getenv …) error` signature with the injected-seam rationale. +- `internal/relay/env_config.go` (whole file, 93 lines) — the env-var registry. Every new env var the relay reads at boot registers a row here; the spec adds one row for `PYRYCODE_RELAY_SINGLE_INSTANCE` so a typo like `=true` fails the boot config check before the multi-instance check runs. +- `internal/relay/production_test.go:1-45` — the value-matrix table pattern and the `fakeGetenv` helper (line 8–10) that the new test file reuses verbatim. The new test file lives in the same package (`relay`), so the helper is in scope without re-declaration. +- `cmd/pyrycode-relay/main.go:77-101` — the existing Check\* wiring band, between `CheckEnvConfig` (line 82) and `CheckInsecureListenInProduction` (line 95). The new check slots in between these two. The error-log shape (logger.Error + `"refusing to start: …"` + structured fields + `return 2`) is the established pattern. +- `docs/architecture.md:48-63` — the bypass env var contract pinned by sibling #64. The literal env-var name `PYRYCODE_RELAY_SINGLE_INSTANCE` and the literal value `1` are non-negotiable inputs to this spec. Note also the "intended uses" framing (emergency rollback, migration windows) and the "NOT recommended for production" line — this spec creates tension with that framing; see § Open questions. +- `fly.toml` (whole file, 74 lines) — the deploy manifest. The check fires when `FLY_APP_NAME` (set by the Fly platform on every machine in the app) is non-empty. Note also `auto_start_machines = false` / `min_machines_running = 1` (lines 40–42, 53–55) — the platform-side controls this check backs up. +- `docs/specs/architecture/64-single-instance-architecture-doc.md` — the doc-half spec that landed `architecture.md § Single-instance constraint`. Read only for tone and cross-link patterns; this spec implements the code half and references that section, it does not extend it. + +## Context + +`fly scale count 3` is a one-line operator (or AI-agent) command. The connection registry (`internal/relay/registry.go`) is in-memory per process, so two replicas hold two disjoint server-id tables. A phone connected to replica A with a server-id claimed on replica B closes with `4404` even though the binary is online — and nothing in the relay logs identifies "the binary is on the other replica" as the cause, because no replica knows the others exist. + +Sibling #64 ships the stochastic guard: prose in `docs/architecture.md` that tells operators and AI agents "v1 is single-instance, here's why, here's the bypass env var". This ticket ships the deterministic backstop: a boot-time refusal that fires when the relay detects it is running on a platform where multi-instance scale-out is a one-command operation (today: Fly.io) and the operator has not explicitly asserted single-instance intent via the bypass env var. + +The check is heuristic, not adversarial. It catches "operator deployed to Fly and never read `architecture.md`"; it does **not** catch "operator ran `fly scale count 3` after configuring the bypass in `fly.toml [env]`" — once the bypass propagates via the manifest, all replicas inherit it. The constraint to use **only env vars exposed by the host** (per Technical Notes in the ticket body) precludes the only mechanism that could catch the scale-out case at runtime (a DNS query against `.internal` or a Fly Machines API call). The honest framing: this check forces an *explicit deploy-time gate*, not a runtime instance count. + +Related: #39 (parent), #64 (doc half, merged), #77 / #78 / #80 (sibling boot-time Check\* helpers that established the helper / wiring / env-registry pattern this spec extends). + +## Design + +Two new files in `internal/relay/`. One ~10-line addition to `cmd/pyrycode-relay/main.go`. One row added to `envContracts` in `internal/relay/env_config.go`. No existing code is refactored. + +### New file: `internal/relay/single_instance.go` + +Declares: + +- `envSingleInstanceBypass = "PYRYCODE_RELAY_SINGLE_INSTANCE"` — package-internal constant, the literal bypass env var name pinned by `docs/architecture.md`. +- `envFlyAppName = "FLY_APP_NAME"` — package-internal constant, the multi-instance-platform signal. Fly's platform sets this on every machine in the app; presence is sufficient evidence that we are on a substrate where `fly scale count > 1` is a one-line command. +- `ErrMultiInstanceDeployDetected` — exported sentinel error. Its `Error()` string MUST contain the substring `multi-instance deploy detected` AND the literal env-var name `PYRYCODE_RELAY_SINGLE_INSTANCE` (AC #2). Recommended message: `"relay: multi-instance deploy detected; set PYRYCODE_RELAY_SINGLE_INSTANCE=1 to bypass"`. +- `CheckSingleInstance(getenv func(string) string) error` — exported helper. Signature mirrors `CheckInsecureListenInProduction` / `CheckRunningAsRoot` from `production.go`. Returns `ErrMultiInstanceDeployDetected` when the bypass is **not** set AND `FLY_APP_NAME` is non-empty; returns `nil` otherwise. + +Decision order in `CheckSingleInstance`: + +1. If `getenv(envSingleInstanceBypass) == "1"` → return `nil`. Bypass wins unconditionally. (The exact-`"1"` contract matches `IsProductionMode`; any other value of the bypass env — including `"true"`, `"0"`, leading/trailing whitespace — is treated as "not set" and the registry walk in `CheckEnvConfig` will have already rejected such values before this function runs.) +2. Else if `getenv(envFlyAppName) != ""` → return `ErrMultiInstanceDeployDetected`. +3. Else → return `nil`. + +No new exported types beyond the sentinel error. No new packages. + +### Edit: `internal/relay/env_config.go` + +Add one row to `envContracts` for the bypass env var. The contract is "exact `"1"` or unset" — same shape as `envProductionMode`. This means `PYRYCODE_RELAY_SINGLE_INSTANCE=true` fails the boot config check (in `CheckEnvConfig`) before `CheckSingleInstance` reads the value, catching the obvious typo class. + +The new row references the bypass-env constant declared in `single_instance.go` (`envSingleInstanceBypass`); the env-config file does **not** re-declare the literal string. This keeps the single-source-of-truth-for-env-var-names invariant the package already enforces (`envProductionMode` lives in `production.go`, used from `env_config.go`). + +### Edit: `cmd/pyrycode-relay/main.go` + +Insert `CheckSingleInstance` between the existing `CheckEnvConfig` (lines 82–93) and `CheckInsecureListenInProduction` (line 95). Wiring order rationale: + +- **After `CheckEnvConfig`** so a typo'd bypass (`=true`, `=0`, `= 1`) fails with the structured config error (clear per-key remediation) rather than silently being treated as "not set" and tripping the multi-instance refusal with a misleading log line. +- **Before `CheckInsecureListenInProduction`** because a deploy-shape misconfiguration (multi-instance) is more fundamental than a production-mode misconfiguration (insecure-listen-in-prod). If both fire, the operator should see the multi-instance error first; fixing it may not be possible without changing the deploy substrate, while insecure-listen is a flag flip. + +Error-log shape (mirrors the existing `CheckInsecureListenInProduction` block at lines 95–101): + +- Level: `ERROR` (AC #2). +- Message: `"refusing to start: multi-instance deploy detected"` — the AC-mandated substring lives in the message, not just in the structured `err` field, so a `grep -i 'multi-instance deploy detected'` over the boot logs hits regardless of whether the operator's log-rendering tool surfaces structured fields. +- Structured fields: `"err"` (the sentinel error), `"bypass_env_var"` (the literal string `"PYRYCODE_RELAY_SINGLE_INSTANCE"`), `"fix"` (a one-line remediation pointing to `docs/architecture.md § Single-instance constraint`). +- Return value: `2` (matches the established refuse-to-start exit-code convention; see `main.go` lines 74, 92, 100, 116, 123). + +### Concurrency model + +None. `CheckSingleInstance` is pure — no goroutines, no I/O, no allocation beyond the sentinel error. Called once at boot before any goroutine is started. + +### Error handling + +`ErrMultiInstanceDeployDetected` is a single exported sentinel, branchable via `errors.Is`. No structured fields (cf. `*ErrInvalidConfig`) — the failure has no per-instance variance worth surfacing, and the bypass instructions are universal. + +## Testing strategy + +New file: `internal/relay/single_instance_test.go`. Same package (`relay`), reuses `fakeGetenv` from `production_test.go` without redeclaration. + +Test cases (table-driven, AC #5 maps directly to the first three; the last two are guard-rails): + +- **Multi-instance signal present, bypass unset → `ErrMultiInstanceDeployDetected`.** (`FLY_APP_NAME = "pyrycode-relay"`, no `PYRYCODE_RELAY_SINGLE_INSTANCE`.) Asserts `errors.Is(err, ErrMultiInstanceDeployDetected)` AND `strings.Contains(err.Error(), "multi-instance deploy detected")` AND `strings.Contains(err.Error(), "PYRYCODE_RELAY_SINGLE_INSTANCE")`. AC #5(a). +- **Bypass set to `"1"`, signal present → `nil`.** (`FLY_APP_NAME = "pyrycode-relay"`, `PYRYCODE_RELAY_SINGLE_INSTANCE = "1"`.) AC #5(b). +- **No signal, no bypass → `nil`.** (Empty env.) AC #5(c). +- **Bypass set, signal absent → `nil`.** (Bypass is benign when the check would have passed anyway. Documents the no-op case so a future refactor can't silently flip the precedence.) +- **Bypass set to a non-`"1"` value, signal present → `ErrMultiInstanceDeployDetected`.** Documents that only the exact string `"1"` bypasses. The matrix should include at least one row from the production-mode equivalence set (e.g. `"true"`, `"0"`, `" 1"`) — the actual typo'd-value failures are caught earlier by `CheckEnvConfig`, but `CheckSingleInstance` is the ultimate decision site and must not be lenient. + +The bypass-env-config-validation contract (typo catches like `=true`) is already exercised by the existing `TestCheckEnvConfig_MalformedValueReturnsStructuredError` matrix. Add at minimum one row to that matrix exercising `PYRYCODE_RELAY_SINGLE_INSTANCE=true` to lock the new envContracts entry. No new test file for the env-config side. + +No new e2e test. The boot-refusal path is exercised by the helper-level test; the wiring is a one-line call site whose correctness is reviewable by code inspection (cf. the established practice for `CheckRunningAsRoot` / `CheckCapabilities` — no e2e for the helper-level Check\* boot refusals). + +## Security review + +Categories walked: + +- **Trust boundaries.** The check reads process env vars set by the deploy substrate (Fly's platform sets `FLY_APP_NAME`; the operator sets `PYRYCODE_RELAY_SINGLE_INSTANCE`). Both are *out-of-band* relative to the relay's internet-exposed surface — a remote attacker on `/v1/server` or `/v1/client` cannot influence either value. No new attack surface introduced by this ticket. +- **Bypass abuse.** The bypass env var is fail-open: setting it skips the check. The threat model assumes an adversary capable of setting process env vars already has shell access to the deploy host, at which point the multi-instance refusal is not the load-bearing control. The bypass is a deliberate operator-discipline tool, not a security boundary. +- **Log injection.** The error log emits two literal strings (`"refusing to start: …"` and `"PYRYCODE_RELAY_SINGLE_INSTANCE"`) plus the sentinel `err`. No env-var *values* are interpolated into the log line. (Compare `CheckEnvConfig` which logs the offending key and reason — `cfgErr.Key` / `cfgErr.Reason` are constructed from the registry, not from raw env values, so the same property holds. This spec adds only the literal bypass-env-var name.) +- **Resource exhaustion.** Two env-var reads, one string comparison, one allocation (`errors.New` runs once at package init for the sentinel). Boot only, no per-request cost. +- **Denial of service via misconfiguration.** A malicious operator (with deploy access) could set `FLY_APP_NAME=x` on a non-Fly host to crash boot. This is in the same trust class as the operator deleting the binary — not a relay-side concern. +- **Wire-protocol surface.** None. No new HTTP handler, no new WebSocket frame parser, no new TLS-config branch. + +Verdict: **PASS.** The change is a boot-time refusal helper with no new network or wire-protocol surface; all inputs are operator-controlled out-of-band env vars; the bypass semantics are explicit and aligned with the documented operational contract. + +## Open questions + +- **Tension with `architecture.md`'s "NOT recommended for production" framing.** The architecture doc, written before this code spec, characterises `PYRYCODE_RELAY_SINGLE_INSTANCE=1` as an emergency / migration-window bypass and warns against setting it permanently. This spec, by making `FLY_APP_NAME` presence the multi-instance signal, requires the bypass to be set permanently in `fly.toml [env]` for the production Fly deploy to start. The framing should be reconciled — the bypass is operationally an *explicit single-instance assertion* on multi-instance-capable platforms, not just an emergency override. **Recommended follow-up:** a separate XS doc-ticket against `docs/architecture.md § Single-instance constraint` that rewrites the "Intended uses" / "NOT recommended for production" lines to reflect the deployed reality. This ticket should land first (the code is the contract; the doc follows). **Out of scope for this ticket.** +- **Updating `fly.toml` to set the bypass in `[env]`.** Once this ticket lands, the production Fly deploy will refuse to start until `fly.toml` adds an `[env] PYRYCODE_RELAY_SINGLE_INSTANCE = "1"` block. There is no production deploy today (the `__REGION__` / `__DOMAIN__` placeholders in `fly.toml` would fail the first deploy regardless), so the update can ride alongside the bootstrap procedure documented in `docs/deploy.md`. **Recommended follow-up:** include the `[env]` block in the same PR or a follow-up XS ticket against `fly.toml` + `docs/deploy.md`. **Out of scope for this ticket** — this spec is scoped to the binary-side check. +- **Other multi-instance-capable platforms.** Only `FLY_APP_NAME` is checked today. Kubernetes-style deploys (`KUBERNETES_SERVICE_HOST`), ECS (`ECS_CONTAINER_METADATA_URI`), and AWS Lambda (`AWS_LAMBDA_FUNCTION_NAME`) would be candidates if the relay is ever ported. The signal-set is intentionally narrow — adding signals on speculation would create false positives for operators running the binary in unusual local-dev shells. **Defer until a non-Fly substrate is actually considered.** diff --git a/internal/relay/env_config.go b/internal/relay/env_config.go index 9918b3a..ef03eff 100644 --- a/internal/relay/env_config.go +++ b/internal/relay/env_config.go @@ -60,6 +60,16 @@ var envContracts = []envContract{ return fmt.Errorf("expected %q or unset, got %q", "1", v) }, }, + { + name: envSingleInstanceBypass, + required: false, + validate: func(v string) error { + if v == "1" { + return nil + } + return fmt.Errorf("expected %q or unset, got %q", "1", v) + }, + }, } // CheckEnvConfig walks envContracts and returns *ErrInvalidConfig on the diff --git a/internal/relay/env_config_test.go b/internal/relay/env_config_test.go index c34032b..87e974f 100644 --- a/internal/relay/env_config_test.go +++ b/internal/relay/env_config_test.go @@ -81,6 +81,33 @@ func TestCheckEnvConfig_MalformedValueReturnsStructuredError(t *testing.T) { } } +func TestCheckEnvConfig_SingleInstanceBypassMalformedValue(t *testing.T) { + t.Parallel() + + // Lock the envContracts row added for PYRYCODE_RELAY_SINGLE_INSTANCE so + // a typo'd bypass (e.g. =true) fails CheckEnvConfig with a structured + // per-key error before CheckSingleInstance reads the value. See + // docs/specs/architecture/65-startup-multi-instance-check.md. + env := map[string]string{envSingleInstanceBypass: "true"} + err := CheckEnvConfig(fakeLookup(env)) + if err == nil { + t.Fatal("CheckEnvConfig with bypass=true returned nil, want error") + } + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Errorf("err %v should satisfy errors.Is(err, ErrInvalidConfigSentinel)", err) + } + var cfgErr *ErrInvalidConfig + if !errors.As(err, &cfgErr) { + t.Fatalf("err %v should satisfy errors.As(err, &*ErrInvalidConfig)", err) + } + if cfgErr.Key != envSingleInstanceBypass { + t.Errorf("got Key=%q, want %q", cfgErr.Key, envSingleInstanceBypass) + } + if !strings.HasPrefix(cfgErr.Reason, "malformed-value: ") { + t.Errorf("got Reason=%q, want prefix %q", cfgErr.Reason, "malformed-value: ") + } +} + func TestCheckEnvConfig_MissingRequiredKey(t *testing.T) { t.Parallel() diff --git a/internal/relay/single_instance.go b/internal/relay/single_instance.go new file mode 100644 index 0000000..6e02164 --- /dev/null +++ b/internal/relay/single_instance.go @@ -0,0 +1,50 @@ +package relay + +import "errors" + +// envSingleInstanceBypass is the env var that asserts explicit operator +// intent to run a single instance on a multi-instance-capable platform. +// Contract is "1" means assert; anything else is treated as unset. Pinned +// by docs/architecture.md § Single-instance constraint. +const envSingleInstanceBypass = "PYRYCODE_RELAY_SINGLE_INSTANCE" // #nosec G101 -- env var name, not a credential + +// envFlyAppName is the Fly.io platform-set signal: non-empty on every +// machine in a Fly app, which is also the substrate where +// `fly scale count > 1` is a one-line operator command. Presence is the +// heuristic that we are on a multi-instance-capable platform. +const envFlyAppName = "FLY_APP_NAME" + +// ErrMultiInstanceDeployDetected is returned by CheckSingleInstance when +// the relay detects it is running on a multi-instance-capable platform +// (today: Fly.io) and the operator has not asserted single-instance intent +// via PYRYCODE_RELAY_SINGLE_INSTANCE=1. +// +// The connection registry is in-memory per process, so two replicas hold +// disjoint server-id tables and phones routed to the wrong replica close +// with 4404 even though the binary is online. The deterministic backstop +// to the docs/architecture.md prose half lives here. +var ErrMultiInstanceDeployDetected = errors.New("relay: multi-instance deploy detected; set PYRYCODE_RELAY_SINGLE_INSTANCE=1 to bypass") + +// CheckSingleInstance returns ErrMultiInstanceDeployDetected when the +// bypass env var is not exactly "1" AND a multi-instance-capable platform +// signal is present (FLY_APP_NAME non-empty). Returns nil otherwise. +// Intended to be called from main after flag parse, before any listener +// is started. +// +// Bypass wins unconditionally: when PYRYCODE_RELAY_SINGLE_INSTANCE=1 the +// platform signal is not consulted. Only the exact string "1" bypasses; +// other values (e.g. "true", " 1") are rejected earlier by CheckEnvConfig +// via the env-config registry, but CheckSingleInstance treats them as +// "not set" defensively. +// +// getenv is the env-var lookup function; pass os.Getenv at the call site, +// an injected func in tests. +func CheckSingleInstance(getenv func(string) string) error { + if getenv(envSingleInstanceBypass) == "1" { + return nil + } + if getenv(envFlyAppName) != "" { + return ErrMultiInstanceDeployDetected + } + return nil +} diff --git a/internal/relay/single_instance_test.go b/internal/relay/single_instance_test.go new file mode 100644 index 0000000..4c39bb1 --- /dev/null +++ b/internal/relay/single_instance_test.go @@ -0,0 +1,112 @@ +package relay + +import ( + "errors" + "strings" + "testing" +) + +func TestCheckSingleInstance_Matrix(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + env map[string]string + wantSentinel bool + }{ + { + name: "multi-instance signal present, bypass unset returns sentinel", + env: map[string]string{ + envFlyAppName: "pyrycode-relay", + }, + wantSentinel: true, + }, + { + name: "bypass exact 1, signal present returns nil", + env: map[string]string{ + envFlyAppName: "pyrycode-relay", + envSingleInstanceBypass: "1", + }, + wantSentinel: false, + }, + { + name: "no signal, no bypass returns nil", + env: map[string]string{}, + wantSentinel: false, + }, + { + name: "bypass set, signal absent returns nil", + env: map[string]string{ + envSingleInstanceBypass: "1", + }, + wantSentinel: false, + }, + { + name: "bypass true (non-exact) with signal returns sentinel", + env: map[string]string{ + envFlyAppName: "pyrycode-relay", + envSingleInstanceBypass: "true", + }, + wantSentinel: true, + }, + { + name: "bypass 0 with signal returns sentinel", + env: map[string]string{ + envFlyAppName: "pyrycode-relay", + envSingleInstanceBypass: "0", + }, + wantSentinel: true, + }, + { + name: "bypass leading-space with signal returns sentinel", + env: map[string]string{ + envFlyAppName: "pyrycode-relay", + envSingleInstanceBypass: " 1", + }, + wantSentinel: true, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := CheckSingleInstance(fakeGetenv(tc.env)) + if tc.wantSentinel { + if !errors.Is(err, ErrMultiInstanceDeployDetected) { + t.Errorf("got err %v, want errors.Is(err, ErrMultiInstanceDeployDetected)", err) + } + } else { + if err != nil { + t.Errorf("got err %v, want nil", err) + } + } + }) + } +} + +func TestErrMultiInstanceDeployDetected_MessageContainsACSubstrings(t *testing.T) { + t.Parallel() + + msg := ErrMultiInstanceDeployDetected.Error() + if !strings.Contains(msg, "multi-instance deploy detected") { + t.Errorf("error message %q must contain %q (AC #2)", msg, "multi-instance deploy detected") + } + if !strings.Contains(msg, "PYRYCODE_RELAY_SINGLE_INSTANCE") { + t.Errorf("error message %q must contain bypass env var name %q (AC #2)", msg, "PYRYCODE_RELAY_SINGLE_INSTANCE") + } +} + +func TestErrMultiInstanceDeployDetected_IsBranchable(t *testing.T) { + t.Parallel() + + if !errors.Is(ErrMultiInstanceDeployDetected, ErrMultiInstanceDeployDetected) { + t.Fatal("ErrMultiInstanceDeployDetected should be errors.Is itself") + } + + env := map[string]string{envFlyAppName: "pyrycode-relay"} + err := CheckSingleInstance(fakeGetenv(env)) + if !errors.Is(err, ErrMultiInstanceDeployDetected) { + t.Errorf("returned error %v should satisfy errors.Is(err, ErrMultiInstanceDeployDetected)", err) + } +}