diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index 675a7d8..47cea7e 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -7,6 +7,7 @@ package main import ( + "errors" "flag" "fmt" "log/slog" @@ -42,6 +43,24 @@ func main() { os.Exit(2) } + // Boot-time env-var validation runs BEFORE CheckInsecureListenInProduction + // so that a typo like PYRYCODE_RELAY_PRODUCTION=true cannot slip through + // IsProductionMode's silent-non-production fallback and reach the + // insecure-listen guard with an unvalidated value. See + // docs/specs/architecture/80-validate-env-vars-at-boot.md § Wiring order. + if err := relay.CheckEnvConfig(os.LookupEnv); err != nil { + var cfgErr *relay.ErrInvalidConfig + if errors.As(err, &cfgErr) { + logger.Error("refusing to start: invalid env-var config", + "err", err, + "env_var", cfgErr.Key, + "reason", cfgErr.Reason) + } else { + logger.Error("refusing to start: invalid env-var config", "err", err) + } + os.Exit(2) + } + if err := relay.CheckInsecureListenInProduction(*insecureListen, os.Getenv); err != nil { logger.Error("refusing to start: production-mode misconfiguration", "err", err, diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index e94d057..66b87ae 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [Env-var config validator (boot-time refusal)](features/env-config-validator.md) — table-driven validation of every env var the relay reads at boot. Single source of truth is the unexported `envContracts []envContract` registry in `internal/relay/env_config.go`; each row carries `name`, `required` bool, and an inline `validate func(string) error`. `CheckEnvConfig(lookup func(string) (string, bool)) error` walks the registry and returns the structured `*ErrInvalidConfig{Key, Reason}` on the first failure (`Reason` is `"missing"` or `"malformed-value: "`); the package-level sentinel `ErrInvalidConfigSentinel` is matched via a custom `Is` method (not `Unwrap`, which would double-print the message prefix) so `errors.Is(err, ErrInvalidConfigSentinel)` and `errors.As(err, &cfgErr)` form a dual contract. The `func(string) (string, bool)` (= `os.LookupEnv` shape) getter coexists with #77's `func(string) string` getter — the presence bit is necessary here to distinguish "missing-but-required" from "present-but-empty", semantically inert for `IsProductionMode`'s exact-`"1"` match. **Ordering is load-bearing**: wired in `main.go` BEFORE `CheckInsecureListenInProduction` so a typo like `PYRYCODE_RELAY_PRODUCTION=true` cannot slip through `IsProductionMode`'s silent-non-production fallback and reach the insecure-listen guard with an unvalidated value. Today's registry has one row (`PYRYCODE_RELAY_PRODUCTION`, optional-but-format-validated); future env-var reads register here at code-review time. `checkEnvConfigWith(lookup, contracts)` is the parameterised inner used by the `required: true` test case (today's production table has no required entries). Exit 2 = config-rejected-at-boot, matching the sibling refusals (#9, #77, #79) (#80). - [Linux capability allowlist (boot-time refusal)](features/capability-allowlist.md) — relay parses `/proc/self/status`'s `CapEff:` hex mask at boot and refuses to start (exit 2) if any bit is set outside `AllowedCapabilities` (currently `{CAP_NET_BIND_SERVICE}` only, motivated by autocert binding `:80`/`:443` from uid 65532 in the distroless image). Exported sentinel `ErrUnexpectedCapability` is branchable via `errors.Is`; the wrapped error names every offending bit symbolically (`CAP_SYS_ADMIN (bit 21)` or `bit 63` for unknown), lists the allowlist contents, and embeds the operator fix string. `CapEff` only — `CapPrm/CapBnd/CapInh` would broaden false-positives (legitimate K8s default policy grants wide CapBnd) without adding load-bearing protection (relay never `capset(2)`s). Linux/non-Linux split at compile time via the new `_.go` / `_other.go` build-tag convention (see ADR-0009); non-Linux GOOS logs one skip line and returns nil. Unconditional — no production-mode gating, no env-var bypass, because stray capabilities are never legitimate. Reader-boundary test seam (`func() (string, error)`) exercises the parse + mask check end-to-end without touching real `/proc`. Joins the boot-time-refusal sentinel family (#9, #77, #79; future #78) (#79). - [Production-mode contract & `--insecure-listen` startup refusal](features/production-mode.md) — `PYRYCODE_RELAY_PRODUCTION=1` env-var contract (exact-string match, lazy read via injected getter, mirrors `PYRYCODE_RELAY_SINGLE_INSTANCE` shape from #64/#65) plus the first boot-time check that consumes it: `relay.CheckInsecureListenInProduction` returns the exported `ErrInsecureListenInProduction` sentinel (branchable via `errors.Is`) when production mode is on AND `--insecure-listen` is set, wired into `cmd/pyrycode-relay/main.go` after flag-parse with `os.Exit(2)` (config-rejected-at-boot, distinct from runtime-failure exit 1) and structured log fields naming the env var (name only, never value) and a one-line `fix` listing both valid resolutions. `IsProductionMode` exported so sibling startup checks (#78 = uid-0) compose on the same predicate without re-reading the env var. Test seam is a `func(string) string` getter (smallest possible — no interface, no struct, no package-level var) so the 2×2 AC matrix and the value-space matrix run under `t.Parallel()` + `-race` without mutating process env (#77). - [Fly.io deploy](features/fly-deploy.md) — production host wiring: `fly.toml` declares TCP-passthrough on `:80`/`:443` (no Fly HTTP proxy, no Fly-managed certs) so TLS keeps terminating in the relay via autocert (#9), persistent Fly volume `relay_autocert` mounted at `/var/lib/relay/autocert`, and a single-machine hard cap encoded via `min_machines_running=1` + `auto_start_machines=false` + `auto_stop_machines="off"` + `[deploy] strategy="immediate"` (Fly Apps v2 has no `max_machines` key; the in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check from #65 is the backstop). CI `deploy` job in `.github/workflows/ci.yml` runs `flyctl deploy --remote-only` on push to `main`, gated by branch-condition + `needs: [test, security, image-scan]` + `permissions: contents: read` so `FLY_API_TOKEN` is structurally unreachable from PR code; `superfly/flyctl-actions/setup-flyctl` pinned by commit SHA with `# Tracks:` comment (same convention as #68 / #41). Dedicated IPv4 is required (not optional) for autocert's HTTP-01 challenge; TCP passthrough preserves the real socket peer IP that #34's rate limiter reads. `__REGION__` / `__DOMAIN__` ship as placeholders that fail loud on first deploy (#38). diff --git a/docs/knowledge/codebase/80.md b/docs/knowledge/codebase/80.md new file mode 100644 index 0000000..9d6fd73 --- /dev/null +++ b/docs/knowledge/codebase/80.md @@ -0,0 +1,52 @@ +# Ticket #80 — validate required env vars at boot, fail-fast on missing/malformed + +Adds a boot-time, table-driven validator over an explicit contract registry of env vars the relay reads. A misconfigured deploy manifest with a typo like `PYRYCODE_RELAY_PRODUCTION=true` now fails the deploy's health check at boot (`os.Exit(2)`) with a structured log line naming the key and shape violation, rather than silently being treated as "not production" and reaching `CheckInsecureListenInProduction` (#77) with an unvalidated value. Split from #42; sibling of #77 (`--insecure-listen` guard) and #79 (Linux capability allowlist). + +## Implementation + +- **`internal/relay/env_config.go` (new, 92 lines)** — four exported names plus the unexported registry: + - `ErrInvalidConfigSentinel` — package-level sentinel; matched by `*ErrInvalidConfig.Is`. + - `ErrInvalidConfig{Key, Reason string}` — structured error. The AC's literal type name; `Sentinel` suffix on the variable disambiguates the two without departing from the AC. `Is(target)` method matches the sentinel without `Unwrap` (which would double-print the message prefix). + - `CheckEnvConfig(lookup func(string) (string, bool)) error` — entry point; walks `envContracts` and returns the first failure. Calls `checkEnvConfigWith(lookup, envContracts)`. + - `checkEnvConfigWith(lookup, contracts)` (unexported) — parameterised version so tests exercise the `required: true` branch with a synthetic table. + - `envContract{name, required, validate}` + `envContracts []envContract` — single-source-of-truth registry. Today one row: `PYRYCODE_RELAY_PRODUCTION` with `required: false` and an inline `func(v string) error` that returns nil only on exact-`"1"` match. +- **`internal/relay/env_config_test.go` (new, 138 lines)** — five `t.Parallel()`-safe tests; never mutate process env: + - `TestCheckEnvConfig_ValidEnvReturnsNil` — two rows (unset, exact `"1"`). + - `TestCheckEnvConfig_MalformedValueReturnsStructuredError` — seven rows (`"true"`, `"0"`, `"yes"`, `" 1"`, `"1 "`, `"PRODUCTION"`, `""`). Asserts `errors.Is`, `errors.As`, `Key`, and `Reason` prefix + substring. Mirrors the `IsProductionMode` value matrix from #77 so a future refactor to `strconv.ParseBool` fails on both tests at once. + - `TestCheckEnvConfig_MissingRequiredKey` — uses `checkEnvConfigWith` with a synthetic `required: true` row (today's `envContracts` has none). + - `TestCheckEnvConfig_IgnoresUnregisteredKeys` — locks in "polices declared contract, not arbitrary env". + - `TestErrInvalidConfigSentinel_IsBranchable` — `errors.Is` + `errors.As` dual contract. +- **`cmd/pyrycode-relay/main.go` (+19 lines, +1 import `errors`)** — call inserted between the flag-mutex guard (line 41-44) and the existing `CheckInsecureListenInProduction` (line 64). `errors.As` extraction at the log site populates `env_var` and `reason` structured fields; the no-`As` fallback branch is defensive (unreachable under the current design). + +## Acceptance criteria — verification map + +- **AC-1** (exported `ErrInvalidConfig` exposes Key + Reason; `errors.As` for fields, `errors.Is` against sentinel): `env_config.go:13-36`, locked in by `TestErrInvalidConfigSentinel_IsBranchable`. +- **AC-2** (check function iterates an explicit in-package contract table, returns `ErrInvalidConfig` on first failure): `env_config.go:74-92`. +- **AC-3** (contract table covers every env var read at merge time): `envContracts` on `env_config.go:52-63`. Grep verification: `grep -rn "os\.Getenv\|os\.LookupEnv" cmd/ internal/` returns `PYRYCODE_RELAY_PRODUCTION` only (via the `IsProductionMode` getter argument). One row, complete. +- **AC-4** (wired in `main.go` after flag parse, before listener, fail-fast with structured log naming key + reason): `main.go:46-62`. Runs before `CheckInsecureListenInProduction` and `CheckCapabilities`. +- **AC-5** (unit tests for valid / missing-required / malformed / unregistered; test seam not process env): `env_config_test.go:16-138`. + +## Patterns established + +- **Contract-table-as-registry shape.** A package-level `[]envContract` slice where each row carries the env-var name, a `required` bool, and an inline shape `validate`. The registry is read-only at runtime, append-only at code-review time. Use this shape any time a binary reads >1 env var; for one var, the #77 single-helper-per-var shape is still fine. The natural next step (a typed `Config` struct populated from the registry) is deferred until env-var count + downstream-consumer pressure motivates it. +- **`os.LookupEnv`-shape seam for presence-aware checks.** Distinguishing "missing-but-required" from "present-but-empty" requires the presence bit. The `func(string) (string, bool)` shape carries it. Coexists with #77's `func(string) string` shape — pick the seam by whether the check needs the presence bit, not by repo-wide consistency. Mixing the two intentionally is correct. +- **Sentinel + structured error pair, dual-contract.** `ErrXxxSentinel` (the matchable identity) + `ErrXxx struct` (the field-carrier) with an `Is(target) bool` method on the struct. `errors.Is(err, ErrXxxSentinel)` matches any instance regardless of fields; `errors.As(err, &xxxErr)` extracts the fields. The `Is`-method-not-`Unwrap` choice avoids double-printing the sentinel's message prefix in the wrapped form. Apply for any future field-carrying boot-time-refusal error. +- **`checkXxxWith(lookup, table)` test seam for table-driven checks.** When the production table is a package-level var, expose an unexported parameterised variant so tests inject a synthetic table without mutating package state. Same shape as `caps_test.go`'s reader-boundary seam (#79). +- **Boot-time-refusal sentinel family grows.** Now five: `ErrCacheDirInsecure` (#9), `ErrInsecureListenInProduction` (#77), `ErrUnexpectedCapability` (#79), `ErrInvalidConfigSentinel` (#80), future `#78`. Same wiring shape: `if err := relay.CheckXxx(...); err != nil { logger.Error(...); os.Exit(2) }`. The boilerplate is starting to repeat; a follow-up ticket may consolidate into `relay.Config.Validate()` returning a multi-error once #78 lands and the count is six — not before. + +## Lessons learned + +- **The validator runs BEFORE the consumer of the value.** This is the architectural decision of the ticket. `CheckInsecureListenInProduction` reads `PYRYCODE_RELAY_PRODUCTION` via `IsProductionMode`, which silently accepts every non-`"1"` value as "not production". If `CheckEnvConfig` runs *after* `CheckInsecureListenInProduction`, the typo `PYRYCODE_RELAY_PRODUCTION=true` causes the insecure-listen guard to pass (because `IsProductionMode` returned false), the plaintext listener is allowed to boot, and only then does `CheckEnvConfig` fire — but by that time the dangerous code path has already been validated as safe. The ordering is load-bearing; the wiring comment in `main.go:46-50` documents it; code review enforces it. General rule for boot-time validators: validate the input *before* any consumer reads it, otherwise the consumer's failure mode is the very thing the validator exists to prevent. +- **Departing from Go naming convention is sometimes the right call.** Go convention is `XxxError` for struct types and `ErrXxx` for sentinel variables (cf. `os.PathError` vs `os.ErrNotExist`). The AC explicitly named the struct `ErrInvalidConfig`. Renaming it to `InvalidConfigError` would have been more idiomatic but would have caused grep-confusion between ticket discussion and code for every future reader. Resolution: keep the struct as `ErrInvalidConfig`, name the sentinel `ErrInvalidConfigSentinel` to disambiguate. Naming conflicts are best resolved by adding a suffix to the less-load-bearing name, not by renaming the AC-fixed one. (The deviation cost is one extra word in the sentinel; the alternative cost is permanent ticket/code drift.) +- **First-failure-only is sufficient for boot-time refusal.** A multi-error variant (collect all failures, surface together) is a natural design instinct but adds no value for the current threat model: the operator fixes one misconfiguration, redeploys, sees the next failure, fixes it. The fail-fast loop is "fix, redeploy, fix, redeploy" — the multi-error path would be "fix all, redeploy", which saves at most a few redeploy cycles per onboarding. Defer the multi-error consolidation until env-var count grows enough that the redeploy cost is real (per #77's "do not pre-build that abstraction" guidance). +- **`errors.Is` method instead of `Unwrap` for sentinel pairs.** If `*ErrInvalidConfig.Unwrap()` returned `ErrInvalidConfigSentinel`, then `fmt.Errorf("…: %w", cfgErr)` would print `…: relay: invalid env-var config: KEY: REASON: relay: invalid env-var config` — the sentinel's message appears twice. The custom `Is(target) bool` method gives the same `errors.Is` semantics without the message duplication. Use this shape any time a struct error needs to match a sentinel without being a wrapper. +- **Echoing raw env-var values in error messages is safe today, but a forward-looking trap.** The `PYRYCODE_RELAY_PRODUCTION` validator's reason includes `got %q` because the legitimate value is the literal `"1"` and any other operator-supplied bytes are misconfiguration data, not secret data. For future env vars that could carry tokens, keys, or other secrets, the validator's reason string must describe the shape violation in operator-facing terms without echoing the raw value (the architect security-review noted this; codifying it here for future entries). The structured log line in `main.go` already keeps the raw value out via the `env_var` (name only) and `reason` (validator's prose) fields — do not add a `value` field. + +## Cross-links + +- [Env-var config validator (feature)](../features/env-config-validator.md) — operator/contributor-facing doc covering contract, API, wiring, and "how to add a new env var". +- [Production-mode contract & `--insecure-listen` startup refusal](../features/production-mode.md) — #77, the first env-var contract; `IsProductionMode` is the silent-non-production consumer this ticket defends. +- [Codebase note #77](77.md) — env-var contract precedent (`PYRYCODE_RELAY_PRODUCTION`); same single-helper shape that scaled into this ticket's registry. +- [Codebase note #79](79.md) — sibling boot-time refusal (Linux capability allowlist); same wiring shape, runs after env-config validation. +- [Architecture spec](../../specs/architecture/80-env-config-validator.md) — the design that landed. +- [#42 — parent ticket](https://github.com/pyrycode/pyrycode-relay/issues/42) — split into #77 / #78 / #79 / #80. diff --git a/docs/knowledge/features/env-config-validator.md b/docs/knowledge/features/env-config-validator.md new file mode 100644 index 0000000..7cf9a50 --- /dev/null +++ b/docs/knowledge/features/env-config-validator.md @@ -0,0 +1,87 @@ +# Env-var config validator (boot-time refusal) + +Structured, table-driven validation of every env var the relay reads at boot. Introduced by #80. The validator polices an explicit in-package contract table — one row per env var, with a `required` flag and a per-key shape validator — and aborts the process with `os.Exit(2)` and a structured log line on the first failure. Sits in front of the `IsProductionMode` consumer (#77) so a typo like `PYRYCODE_RELAY_PRODUCTION=true` cannot slip through the production-mode helper's silent-non-production fallback. + +## What it catches + +The contract table is the single source of truth for "what env vars the relay reads". Today it has one row (`PYRYCODE_RELAY_PRODUCTION`); future env-var reads register here. For each entry the validator distinguishes three states: + +- **Unset.** If `required: true`, returns `*ErrInvalidConfig{Reason: "missing"}`. If `required: false`, skipped. +- **Set-and-non-empty.** Runs the per-key `validate(value)`; on `error`, returns `*ErrInvalidConfig{Reason: "malformed-value: "}`. +- **Set-but-empty.** Treated as "present" — runs the validator. Per-key validators decide whether empty is meaningful; today's `PYRYCODE_RELAY_PRODUCTION` validator rejects empty as malformed. + +Unregistered env vars (anything not in `envContracts`) are ignored. The validator polices its declared contract, not arbitrary process env — a future refactor that walks `os.Environ()` for unknown keys would break this invariant silently and is explicitly forbidden by a unit test. + +## API (`internal/relay/env_config.go`) + +- `CheckEnvConfig(lookup func(string) (string, bool)) error` — public entry point. Pass `os.LookupEnv` at the call site; tests inject a closure over a `map[string]string`. Returns nil on full pass, `*ErrInvalidConfig` on the first failure. +- `ErrInvalidConfig{Key, Reason string}` — structured error. `Key` is the env-var name; `Reason` is operator-facing prose (`"missing"` or `"malformed-value: "`). +- `ErrInvalidConfigSentinel` — package-level sentinel. `*ErrInvalidConfig.Is(target)` matches against it, so `errors.Is(err, ErrInvalidConfigSentinel)` returns true for any instance regardless of `Key` / `Reason`. Use `errors.As(err, &cfgErr)` to extract the fields. +- `envContract{name, required, validate}` (unexported) — one row of the registry. +- `envContracts` (unexported, package-level var) — the registry. Read-only at runtime; new env-var reads append here at code-review time. +- `checkEnvConfigWith(lookup, contracts)` (unexported) — same body as `CheckEnvConfig` parameterised by a table. Exists so tests can exercise the `required: true` branch without mutating the production `envContracts` slice. + +### Getter shape: `func(string) (string, bool)`, not `func(string) string` + +The validator's getter has the `os.LookupEnv` signature (returning a presence bit), not the `os.Getenv` signature that `IsProductionMode` uses. The two seams coexist: + +- `IsProductionMode` (#77) wants `func(string) string` because exact-`"1"` match has no use for the presence bit (anything-not-`"1"` collapses to "not production"). +- `CheckEnvConfig` (#80) wants `func(string) (string, bool)` because distinguishing "missing-but-required" from "present-but-empty" is semantically necessary for the required-key case, and `os.Getenv` cannot make that distinction. + +Mixing the two seam shapes is intentional; do not refactor `IsProductionMode` to the `LookupEnv` shape for consistency's sake without a motivating failure. + +## Wiring (`cmd/pyrycode-relay/main.go`) + +After flag-parse, **before** `CheckInsecureListenInProduction` (#77) and `CheckCapabilities` (#79): + +```go +if err := relay.CheckEnvConfig(os.LookupEnv); err != nil { + var cfgErr *relay.ErrInvalidConfig + if errors.As(err, &cfgErr) { + logger.Error("refusing to start: invalid env-var config", + "err", err, + "env_var", cfgErr.Key, + "reason", cfgErr.Reason) + } else { + logger.Error("refusing to start: invalid env-var config", "err", err) + } + os.Exit(2) +} +``` + +- **Ordering is load-bearing.** `CheckInsecureListenInProduction` consults `IsProductionMode`, which treats every non-`"1"` value as "not production". If the operator wrote `PYRYCODE_RELAY_PRODUCTION=true`, the insecure-listen guard returns nil and a plaintext production listener boots. Running `CheckEnvConfig` first catches the malformed value and the dangerous path is never reached. Code review enforces. +- **Exit code 2** matches the sibling boot-time refusals; exit 1 is reserved for runtime failures. +- **`errors.As` extraction at the log site**, not in the validator. The structured fields are `err`, `env_var` (the key), and `reason` (the per-key prose). The fallback branch (no `errors.As` hit) is defensive against a future refactor that wraps the error in something opaque; under the current design it is unreachable. +- **No `value` log field.** The reason string already echoes the malformed value (`got "true"`) where safe; the `env_var` field carries the name only. Future entries whose values could carry secrets must keep raw bytes out of the reason string — describe the shape violation in operator-facing terms instead (`"expected hex-encoded 32-byte value"`, not `got %q`). + +## Adding a new env var + +When a future ticket adds an `os.Getenv` / `os.LookupEnv` read under `cmd/pyrycode-relay/` or `internal/relay/`: + +1. Add an entry to `envContracts` with `name`, `required`, and an inline `validate func(string) error`. +2. The validator handles presence/absence; `validate` only sees the raw value when the var is set. +3. Decide `required` per env var. Today `PYRYCODE_RELAY_PRODUCTION` is `required: false` (unset means non-production — legitimate for dev environments); the validator only fires on a non-`"1"` set value. +4. For secret-bearing env vars, the validator's error must not echo the raw value into the reason string (it surfaces in the log line). + +Code review enforces "every env-var read in the binary is registered here" by re-running the grep at landing. + +## Behaviour matrix (today) + +| `PYRYCODE_RELAY_PRODUCTION` | `CheckEnvConfig` result | +|---|---| +| unset | nil | +| `"1"` | nil | +| `""` (set-but-empty) | `*ErrInvalidConfig{Key, Reason: "malformed-value: expected \"1\" or unset, got \"\""}` | +| anything else (`"true"`, `"0"`, `"yes"`, `" 1"`, …) | `*ErrInvalidConfig{Key, Reason: "malformed-value: expected \"1\" or unset, got \"…\""}` | + +## Why not a `Config` struct (yet) + +A natural extension is a typed `relay.Config` populated from the env at boot, validated as a single step, and threaded through the binary. **Deferred.** The relay has one env var today; the ticket body promises "future env-var additions register here", which means the contract table is the registry, not a typed struct. When env-var count crosses ~3 *and* a downstream caller wants a typed snapshot, a follow-up ticket consolidates. Doing it now is premature. + +## Cross-links + +- [Production-mode contract & `--insecure-listen` startup refusal](production-mode.md) — first env-var contract (#77); the malformed-value cases listed there are now caught by `CheckEnvConfig` before `IsProductionMode` is consulted. +- [Linux capability allowlist](capability-allowlist.md) — sibling boot-time refusal (#79); same wiring shape, runs after env-config validation. +- [Codebase note #80](../codebase/80.md) — implementation summary and lessons. +- [Codebase note #77](../codebase/77.md) — env-var contract precedent (`PYRYCODE_RELAY_PRODUCTION`). +- [`internal/relay/env_config.go`](../../../internal/relay/env_config.go) — implementation. diff --git a/docs/knowledge/features/production-mode.md b/docs/knowledge/features/production-mode.md index b63a371..c8a4a90 100644 --- a/docs/knowledge/features/production-mode.md +++ b/docs/knowledge/features/production-mode.md @@ -49,6 +49,8 @@ if err := relay.CheckInsecureListenInProduction(*insecureListen, os.Getenv); err Autocert (`--domain`) is not inspected — the contract is purely about plaintext-in-prod. Setting `--domain` with `PYRYCODE_RELAY_PRODUCTION=1` is the happy path. +Since #80 the matrix's "not `"1"`" row only describes the *post-validation* state: a malformed `PYRYCODE_RELAY_PRODUCTION` value (`"true"`, `"yes"`, `" 1"`, `"PRODUCTION"`, etc.) is now caught by [`CheckEnvConfig`](env-config-validator.md) *before* `CheckInsecureListenInProduction` is consulted, so the relay refuses to boot at the env-validation stage rather than silently treating the typo as "not production". `IsProductionMode`'s strict-`"1"` contract is unchanged; the validator simply ensures no other value ever reaches it. + ## Threat model alignment `pyrycode/pyrycode/docs/protocol-mobile.md` § Security model assumes TLS for all production traffic. This is the in-binary enforcement of that assumption *when production mode is explicitly tagged*. The complement — a CI / deploy-manifest check that prod manifests actually set `PYRYCODE_RELAY_PRODUCTION=1` — is out of scope and the responsibility of the deploy layer. @@ -66,3 +68,4 @@ The check is fail-closed: if the env var is on AND plaintext is requested, the r - [`internal/relay/tls.go`](../../../internal/relay/tls.go) — `ErrCacheDirInsecure` is the canonical boot-time-refusal sentinel this one models. - [Single-instance constraint (v1)](../../architecture.md#single-instance-constraint-v1) — sibling env-var contract (`PYRYCODE_RELAY_SINGLE_INSTANCE`), shape precedent. - [Codebase ticket note #77](../codebase/77.md) — per-ticket implementation detail. +- [Env-var config validator](env-config-validator.md) — #80; the boot-time validator that polices the malformed-value cases listed in the contract, running before `CheckInsecureListenInProduction` so a typo can never reach `IsProductionMode`. diff --git a/docs/specs/architecture/80-env-config-validator.md b/docs/specs/architecture/80-env-config-validator.md new file mode 100644 index 0000000..1a2297b --- /dev/null +++ b/docs/specs/architecture/80-env-config-validator.md @@ -0,0 +1,351 @@ +# Spec: validate required env vars at boot, fail-fast on missing/malformed + +Ticket: [#80](https://github.com/pyrycode/pyrycode-relay/issues/80). Size S. Split from #42. Blocked-by #77 (now merged). + +## Files to read first + +- `internal/relay/production.go` (whole file, 47 lines) — the existing env-var helper. Lines 5–9 define the `envProductionMode` constant; lines 30–32 show the exact-`"1"` contract; lines 41–46 show the boot-time-refusal helper shape. The new validator polices the same env var, so the constant is reused (not duplicated). +- `internal/relay/production_test.go` (whole file, 119 lines) — the test style. Lines 8–10 define the `fakeGetenv` helper this spec mirrors (adapted to the `LookupEnv` shape). Lines 12–45 are the value-matrix table pattern to copy; lines 47–105 are the 2×2 matrix shape to copy for the validator's missing-vs-malformed-vs-valid cases. +- `cmd/pyrycode-relay/main.go:30-58` — the wiring location. The new validator slots between line 43 (the `--domain` xor `--insecure-listen` flag-validation guard) and line 45 (`CheckInsecureListenInProduction`). This ordering is load-bearing — see § Wiring order below. +- `internal/relay/caps.go:10-22` — the canonical "sentinel + struct error" precedent in this package. Mirror the doc-comment shape (intent, deploy-time framing, "branchable via errors.Is"). +- `internal/relay/caps_test.go:108-155` — the `errors.Is`-branchable test idiom for sentinel-style errors. +- `docs/specs/architecture/77-refuse-insecure-in-production.md:23-89` — the sibling design that introduced `PYRYCODE_RELAY_PRODUCTION`. The "exact `"1"` match, not truthy parsing" rule and the injected-getter seam are this ticket's foundation. The validator extends, not replaces, that contract. + +## Context + +The relay reads exactly one env var today (`PYRYCODE_RELAY_PRODUCTION`, landed in #77) and reads it through `IsProductionMode(getenv)` which uses an exact-`"1"` match. Anything else — `"true"`, `"yes"`, `" 1"`, `"PRODUCTION"` — is silently treated as **non-production**. That is the failure mode this ticket exists to catch: an operator who writes `PYRYCODE_RELAY_PRODUCTION=true` in a deploy manifest gets a relay that runs in non-production mode, with no signal that the operator's intent was different from the relay's behaviour. The `--insecure-listen` guard from #77 does not fire, plaintext serving is allowed, and the operator only notices when traffic is already on the wire in cleartext. + +This ticket adds a boot-time structured validator that runs over an explicit contract table of env-var shapes and aborts with a per-key reason on the first failure. After this ticket, `PYRYCODE_RELAY_PRODUCTION=true` is a `*ErrInvalidConfig{Key: "PYRYCODE_RELAY_PRODUCTION", Reason: "malformed-value: expected \"1\" or unset, got \"true\""}` at boot, the process exits 2, and the deploy fails its health check rather than running in the wrong mode. + +The contract table is the *single source of truth* for "what env vars the relay reads". Every future env-var addition registers an entry; the architect/code-review pair enforces this at landing time. Today the table is one row (`PYRYCODE_RELAY_PRODUCTION`) — that's the entire population of `os.Getenv` / `os.LookupEnv` call sites in `cmd/pyrycode-relay/` and `internal/relay/` at this ticket's merge time (verified by grep, see § Contract table population below). + +Related: #9 (boot-time refusal precedent, `ErrCacheDirInsecure`); #77 (introduced `PYRYCODE_RELAY_PRODUCTION`); #79 (sibling boot-time check, `ErrUnexpectedCapability`); #42 (parent — split into #77 / #78 / #80). + +## Design + +Two new files: `internal/relay/env_config.go` and `internal/relay/env_config_test.go`. One ~7-line addition to `cmd/pyrycode-relay/main.go`. No existing code is refactored. + +### Structured error type and sentinel + +```go +// ErrInvalidConfigSentinel is the package-level sentinel that every +// *ErrInvalidConfig matches against via errors.Is. Detect any +// env-var validation failure with errors.Is(err, ErrInvalidConfigSentinel); +// extract the offending key and reason with errors.As(err, &cfgErr) where +// cfgErr is *ErrInvalidConfig. +var ErrInvalidConfigSentinel = errors.New("relay: invalid env-var config") + +// ErrInvalidConfig is the structured error returned by CheckEnvConfig when +// a registered env var is missing-but-required or present-but-malformed. +// It exposes the offending env-var name and a human-readable reason so the +// caller can log a per-key remediation message at boot. +// +// Branchable via errors.Is(err, ErrInvalidConfigSentinel). Fields are +// extracted via errors.As(err, &cfgErr). +type ErrInvalidConfig struct { + Key string // env-var name (e.g. "PYRYCODE_RELAY_PRODUCTION") + Reason string // human-readable shape violation +} + +func (e *ErrInvalidConfig) Error() string { + return fmt.Sprintf("%s: %s: %s", ErrInvalidConfigSentinel, e.Key, e.Reason) +} + +// Is matches against the package-level sentinel so errors.Is(err, ErrInvalidConfigSentinel) +// returns true for any *ErrInvalidConfig instance regardless of Key / Reason. +func (e *ErrInvalidConfig) Is(target error) bool { + return target == ErrInvalidConfigSentinel +} +``` + +Three decisions worth naming: + +1. **Type name follows the AC literally.** The AC says "an exported structured error type `ErrInvalidConfig`". Go convention would normally use `InvalidConfigError` for the struct type and reserve `Err*` for sentinel variables (cf. `os.PathError` vs `os.ErrNotExist`). The AC explicitly conflates the two names; the cost of departing is grep-confusion for future agents reading the ticket vs the code. Keeping the type name `ErrInvalidConfig` and naming the sentinel `ErrInvalidConfigSentinel` resolves the AC literally while keeping the two values distinguishable in caller code. +2. **`Is(target) bool` on the type, not `Unwrap()`.** `Unwrap` would mean `*ErrInvalidConfig`'s message is layered on top of the sentinel's message, which double-prints the prefix. A custom `Is` method gives `errors.Is(err, ErrInvalidConfigSentinel) == true` without the message duplication. (`errors.As` works on the type regardless of `Is` / `Unwrap`.) +3. **Reason is a plain string, not a typed enum.** Reasons today are `"missing"` and `"malformed-value: …"`. A typed enum would lock the categories at the type level, but it would also force every future validator to extend the enum. The AC's examples (`missing`, `malformed-value: expected "1", got "true"`) treat the reason as operator-facing prose; a string carries that directly. If two reasons later need machine branching, the upgrade path is `errors.As` + a method on the type, not an enum break. + +### Contract table + +```go +// envContract is one row in the env-var registry. The validator iterates +// the registry in order at boot; each entry specifies the env-var name, +// whether the relay refuses to start when the var is unset, and a per-key +// shape validator. Validators receive the raw value (never an empty +// "present-but-empty" case — that branch is handled by the registry walk). +type envContract struct { + name string + required bool + validate func(value string) error +} + +// envContracts is the single source of truth for "what env vars the relay +// reads at boot". Every new env-var read added under cmd/pyrycode-relay +// or internal/relay must register an entry here. Code review enforces. +var envContracts = []envContract{ + { + name: envProductionMode, // "PYRYCODE_RELAY_PRODUCTION" + required: false, + validate: func(v string) error { + if v == "1" { + return nil + } + return fmt.Errorf("expected %q or unset, got %q", "1", v) + }, + }, +} +``` + +Three decisions: + +1. **Inline validators, no separate `envvalidate` package.** Per the ticket's Technical Notes. Validators today are one-liners; pulling them out would invert the locality without buying anything until there are several with shared logic. +2. **Optional-but-format-validated for `PYRYCODE_RELAY_PRODUCTION`.** Per the ticket body's explicit guidance ("`PYRYCODE_RELAY_PRODUCTION` is expected to be optional-but-format-validated"). Operationally: unset means non-production, which is a legitimate state for dev environments. The validator only fires when the var is *set* to a non-`"1"` value — that is the silent-typo class the ticket targets. +3. **`required: false` + validator never sees an empty value.** The check function distinguishes "unset" (skip), "set-but-empty-and-required" (fail with `missing`), and "set-and-non-empty" (run validator). A validator does not need to special-case the empty-string branch; this keeps each per-key validator focused on shape, not presence. + +### Check function and getter shape + +```go +// CheckEnvConfig walks envContracts and returns *ErrInvalidConfig on the +// first failure. Returns nil on full pass. Intended to be called from main +// after flag parse, before any listener is started. +// +// lookup is the env-var lookup function with the os.LookupEnv signature: +// it returns (value, present). Pass os.LookupEnv at the call site; tests +// inject a func built from a map[string]string. Env values not registered +// in envContracts are ignored (the validator polices its declared contract, +// not arbitrary process env). +func CheckEnvConfig(lookup func(string) (string, bool)) error { + for _, c := range envContracts { + val, present := lookup(c.name) + if !present { + if c.required { + return &ErrInvalidConfig{Key: c.name, Reason: "missing"} + } + continue + } + if err := c.validate(val); err != nil { + return &ErrInvalidConfig{Key: c.name, Reason: "malformed-value: " + err.Error()} + } + } + return nil +} +``` + +Two decisions: + +1. **`func(string) (string, bool)` not `func(string) string`.** The AC is explicit: "Env values are injected via a test seam (a `func(string) (string, bool)`-shaped getter)". The Technical Notes section of the ticket says "same shape #77 uses" — that's the `func(string) string` (os.Getenv) shape, which conflicts with the AC. **The AC wins**: distinguishing "missing-but-required" from "present-but-empty" is semantically necessary for the required-key case, and `os.Getenv` cannot make that distinction (it returns `""` for both). `os.LookupEnv`'s shape carries the bit. The existing `IsProductionMode` keeps its `func(string) string` getter because exact-`"1"` match doesn't need the presence bit; the two seams coexist without conflict. +2. **First-failure, not collected.** Returning on the first failure is sufficient for this ticket's threat model (a single misconfigured key blocks boot; the operator fixes it and re-deploys; the next failure is then surfaced). A multi-error variant is a future-ticket move if and when the table grows large enough that "fix one, redeploy, fix next" becomes a real cost. + +### Wiring order in `cmd/pyrycode-relay/main.go` + +Insert immediately before the existing `CheckInsecureListenInProduction` call (currently line 45), after the `--domain` xor `--insecure-listen` flag-validation guard (line 43): + +```go +if err := relay.CheckEnvConfig(os.LookupEnv); err != nil { + var cfgErr *relay.ErrInvalidConfig + if errors.As(err, &cfgErr) { + logger.Error("refusing to start: invalid env-var config", + "err", err, + "env_var", cfgErr.Key, + "reason", cfgErr.Reason) + } else { + logger.Error("refusing to start: invalid env-var config", "err", err) + } + os.Exit(2) +} +``` + +Four details: + +- **Order matters: env-config check runs BEFORE `CheckInsecureListenInProduction`.** This is the core architectural decision of the ticket. `CheckInsecureListenInProduction` consults `IsProductionMode`, which silently accepts any non-`"1"` value as "non-production". If the operator wrote `PYRYCODE_RELAY_PRODUCTION=true`, `CheckInsecureListenInProduction` returns nil and a plaintext production listener boots. Running `CheckEnvConfig` first catches the malformed value, exits 2, and the dangerous code path is never reached. Code review must verify this ordering at landing. +- **Exit code 2**, matching the adjacent boot-time refusal checks (`CheckInsecureListenInProduction` at line 50, `CheckCapabilities` at line 57). Exit 1 is reserved for runtime failures (listener died, autocert failed); exit 2 is configuration-rejected-at-boot. Ops dashboards can split "deploy never started" from "deploy started and crashed" on this distinction. +- **`errors.As` extraction at the log site**, not in the validator. The validator returns `error`; the caller decides how to structure the log line. The extracted `cfgErr.Key` populates the `env_var` field directly so the operator does not have to grep the error message. The `reason` field carries the per-key shape violation. The fallback branch (no `errors.As` hit) is defensive against a future refactor that wraps the error in something `errors.As` can't see through — under the current design it is unreachable. +- **No `fix` field.** The #77 wiring uses a `fix` field with a static remediation string because the failure mode is one specific misconfiguration. For `CheckEnvConfig` the remediation is per-key (every env var has its own fix), and `reason` already carries the per-key shape ("expected `\"1\"` or unset, got `\"true\"`") that tells the operator what to flip. Adding a separate `fix` field would duplicate. + +The `errors` import is added to `cmd/pyrycode-relay/main.go` (not currently imported there). + +### Contract table population at merge time + +Verified by grep against `main` at spec-write time: + +``` +$ grep -rn "os\.Getenv\|os\.LookupEnv" cmd/ internal/ +cmd/pyrycode-relay/main.go:45: if err := relay.CheckInsecureListenInProduction(*insecureListen, os.Getenv); err != nil { +internal/relay/production.go:28: // The env var is read on every call via getenv. Pass os.Getenv at call sites; +internal/relay/production.go:39: // getenv is the env-var lookup function; pass os.Getenv at the call site, +``` + +The only read of an env var (the `main.go:45` line passes `os.Getenv` *as a function value* into `CheckInsecureListenInProduction`, which calls it with `envProductionMode` inside `IsProductionMode`) is `PYRYCODE_RELAY_PRODUCTION`. The two `production.go` matches are doc-comment references. The table has one row. + +If a sibling ticket (e.g. #81) lands an additional env-var read between now and #80's merge, the developer extends `envContracts` to cover it. Code review enforces the "every env var registered" invariant by re-running the grep. + +### Why not a `Config` struct (yet) + +A natural extension is to build a `relay.Config` struct populated from the env at boot, validated as a single step, and threaded through the rest of the binary. **Deferred.** Today the relay has one env var; the ticket body promises "future env-var additions register here", which means the contract table is the registry, not a typed struct. When env-var count crosses ~3 *and* a downstream caller wants a typed snapshot (rather than a per-call `getenv` closure), a follow-up ticket consolidates. Doing it now is premature. + +## Concurrency model + +None. `CheckEnvConfig` runs on the main goroutine after `flag.Parse` and before any listener is started. No locks, no channels, no goroutines. + +## Error handling + +`CheckEnvConfig` returns either `nil` (full pass) or `*ErrInvalidConfig` (first failure). No wrapping, no retry, no fallback. The main wiring exits 2 on any non-nil return — boot-time refusal is total. + +The error message format (`"relay: invalid env-var config: : "`) is stable and tested. Downstream callers that branch on this failure mode use `errors.Is(err, ErrInvalidConfigSentinel)`; callers that need the offending key use `errors.As(err, &cfgErr)` and read `cfgErr.Key`. + +## Testing strategy + +All tests live in `internal/relay/env_config_test.go`, are `t.Parallel()`-safe, and never call `os.Setenv` / `t.Setenv`. Helper for building the fake lookup (adapted from `production_test.go`'s `fakeGetenv` to the `LookupEnv` shape): + +```go +func fakeLookup(env map[string]string) func(string) (string, bool) { + return func(k string) (string, bool) { + v, ok := env[k] + return v, ok + } +} +``` + +### Test 1 — Valid env returns nil (AC.a) + +`PYRYCODE_RELAY_PRODUCTION` set to `"1"` and `PYRYCODE_RELAY_PRODUCTION` unset are both valid (the var is optional). Two cases in one table: + +| `PYRYCODE_RELAY_PRODUCTION` | want | +|---|---| +| unset | `nil` | +| `"1"` | `nil` | + +### Test 2 — Malformed value returns `*ErrInvalidConfig` with key + reason (AC.c) + +Table over the malformed-value space. Each row: set `PYRYCODE_RELAY_PRODUCTION` to a non-`"1"`, non-empty value; assert: +- `errors.Is(err, ErrInvalidConfigSentinel)` is true +- `errors.As(err, &cfgErr)` is true +- `cfgErr.Key == "PYRYCODE_RELAY_PRODUCTION"` +- `cfgErr.Reason` starts with `"malformed-value: "` + +Cases: + +| value | reason contains | +|---|---| +| `"true"` | `expected "1" or unset, got "true"` | +| `"0"` | `expected "1" or unset, got "0"` | +| `"yes"` | `expected "1" or unset, got "yes"` | +| `" 1"` | `expected "1" or unset, got " 1"` | +| `"1 "` | `expected "1" or unset, got "1 "` | +| `"PRODUCTION"` | `expected "1" or unset, got "PRODUCTION"` | + +The cases mirror the `IsProductionMode` value matrix from `production_test.go:21-29`. The doubling is intentional: if a future refactor relaxes the `"1"`-exact rule to `strconv.ParseBool`, both tests fail at once, making the contract violation impossible to miss. + +### Test 3 — Missing required key returns `*ErrInvalidConfig` with `Reason: "missing"` (AC.b) + +Today no env var is `required: true`, so this case is not exercisable through the production `envContracts` table. To still test the code path, the test injects a *test-local* `envContracts` table via a small package-private seam — the simplest version is a `checkEnvConfigWith` helper that takes the table as an argument: + +```go +// in env_config.go, package-private +func checkEnvConfigWith(lookup func(string) (string, bool), contracts []envContract) error { + // body identical to CheckEnvConfig's loop +} + +func CheckEnvConfig(lookup func(string) (string, bool)) error { + return checkEnvConfigWith(lookup, envContracts) +} +``` + +Then in `env_config_test.go`: + +```go +func TestCheckEnvConfig_MissingRequiredKey(t *testing.T) { + t.Parallel() + contracts := []envContract{{name: "X", required: true, validate: func(string) error { return nil }}} + err := checkEnvConfigWith(fakeLookup(map[string]string{}), contracts) + var cfgErr *ErrInvalidConfig + if !errors.As(err, &cfgErr) { + t.Fatalf("err = %v, want *ErrInvalidConfig", err) + } + if cfgErr.Key != "X" || cfgErr.Reason != "missing" { + t.Errorf("got Key=%q Reason=%q, want Key=%q Reason=%q", cfgErr.Key, cfgErr.Reason, "X", "missing") + } + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Error("err should satisfy errors.Is(err, ErrInvalidConfigSentinel)") + } +} +``` + +This is the same shape as `caps_test.go:108-155` (exercise the inner check function with a synthetic input the outer wrapper can't construct). + +### Test 4 — Unregistered env vars do NOT fail the validator (AC.d) + +```go +func TestCheckEnvConfig_IgnoresUnregisteredKeys(t *testing.T) { + t.Parallel() + env := map[string]string{ + "FOO_BAR_BAZ": "garbage", + "PYRYCODE_UNKNOWN": "anything", + "PATH": "/usr/bin", + // PYRYCODE_RELAY_PRODUCTION intentionally absent (optional, unset is valid) + } + if err := CheckEnvConfig(fakeLookup(env)); err != nil { + t.Errorf("got err %v, want nil — unregistered keys must be ignored", err) + } +} +``` + +This locks in the "validator polices its declared contract, not arbitrary process env" invariant. Without it a future refactor that walks `os.Environ()` for unknown keys would break the contract silently. + +### Test 5 — Sentinel is branchable + +```go +func TestErrInvalidConfigSentinel_IsBranchable(t *testing.T) { + t.Parallel() + err := &ErrInvalidConfig{Key: "X", Reason: "missing"} + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Error("*ErrInvalidConfig should satisfy errors.Is(_, ErrInvalidConfigSentinel)") + } + var cfgErr *ErrInvalidConfig + if !errors.As(error(err), &cfgErr) { + t.Error("*ErrInvalidConfig should satisfy errors.As(_, &*ErrInvalidConfig)") + } +} +``` + +Locks in the `errors.Is` / `errors.As` dual contract at the type level so future refactors of the `Is` method can't silently break detection. + +### What is NOT tested + +- The `main.go` wiring. Same rationale as #77: `main` is a coordination function whose behaviour is observable through the unit tests on the package it calls. The `errors.As` extraction in `main` is locally verifiable from the diff; an integration test that fork-execs the binary to assert exit code 2 is over-engineering for one `if err != nil { os.Exit(2) }` block. +- The structured log line's exact field names. The AC requires "a structured log line that names the offending key and the reason"; the spec's wiring satisfies that literally. Asserting on slog handler output requires capturing the handler, which is friction for an operator-facing prose contract. +- The composition of `CheckEnvConfig` + `CheckInsecureListenInProduction`. They are independent checks called in sequence; the per-check tests cover each branch. The ordering invariant ("env-config check runs first") is enforced by code review of `main.go`, not by a unit test. + +## Open questions + +None. The error type name is fixed by the AC; the sentinel suffix (`Sentinel`) is the smallest naming convention that resolves the AC's collision between "type" and "sentinel" without departing from the AC's literal type name. The getter shape (`(string, bool)`) is fixed by the AC. The contract table is one row at merge time; the population mechanism (grep on landing) is documented above. + +## Security review + +**Verdict:** PASS + +**Findings:** + +- **[Trust boundaries]** No findings. The only data crossing into the validator is operator-controlled env vars (not network-attacker-controlled). The validator is a fail-closed gate: anything not exactly matching a registered shape aborts boot. The boundary is explicit (the `envContracts` table in `internal/relay/env_config.go`); downstream code does not consume env-var values directly — it consumes typed helpers (`IsProductionMode`) whose inputs have already been validated by the time `main` reaches them. **Important ordering invariant:** the validator must run *before* `CheckInsecureListenInProduction` so that `IsProductionMode`'s silent-non-production fallback for non-`"1"` values cannot be reached with an unvalidated env. The spec's § Wiring order section is explicit about this and code review must enforce it. + +- **[Tokens, secrets, credentials]** N/A. The ticket creates no tokens, reads no secrets, writes no credential material. Env vars in the contract table carry configuration signals (`PYRYCODE_RELAY_PRODUCTION="1"`), not credentials. **Forward-looking caveat for the contract table:** if a future env var carries a secret (token, key), its validator must *not* echo the value into the reason string the way the `PYRYCODE_RELAY_PRODUCTION` validator does today (`got %q`). The reason string is logged at boot, so for secret-bearing env vars the validator should describe the shape violation in operator-facing terms ("expected hex-encoded 32-byte value", not "got %q"). Today's only entry is safe because the legitimate value is the literal string `"1"`. The developer must not extend the log line in `main.go` to include the raw `os.LookupEnv` value either — the structured fields are `err` (which includes the reason as composed by the validator), `env_var` (the name, not the value), and `reason` (the validator's prose). No `value` field. + +- **[File operations]** N/A. No file is read, written, statted, opened, or unlinked. No path concatenation. + +- **[Subprocess / external command execution]** N/A. No `exec.Command`, no `os.StartProcess`, no shell. + +- **[Cryptographic primitives]** N/A. No RNG, no hash, no comparison of attacker-controlled values to secrets. The string equality `v == "1"` inside the `PYRYCODE_RELAY_PRODUCTION` validator is operator-controlled-to-operator-controlled (an env var value vs a literal constant) — no timing-side-channel surface. + +- **[Network & I/O]** No findings. The validator runs *before* any listener is opened. The whole purpose of the ticket is to prevent a listener from starting in a misconfigured state (specifically, to prevent `--insecure-listen` from being honoured under a typo'd production-mode env). Unrelated network code (`http.Server` timeout configuration in `main.go:84-104`, autocert setup, the `tls.go` machinery) is unchanged. + +- **[Error messages, logs, telemetry]** One **SHOULD FIX** noted inline above (Tokens category): the developer must not add a log field that echoes the raw env-var value into a structured log line. The spec's wiring already avoids this (the log fields are `err`, `env_var`, `reason`). For today's one entry (`PYRYCODE_RELAY_PRODUCTION`) the reason itself echoes the value via `got %q`, which is acceptable because the legitimate value is `"1"` and any operator-supplied bytes there are misconfiguration data, not secret data. The rule for future entries is "validator's reason describes shape, not raw value, if the env var could carry a secret". + + No findings for log injection: slog with the default text handler escapes control characters in field values; an attacker (operator) who put a newline into `PYRYCODE_RELAY_PRODUCTION` would get a `\n`-escaped field, not a forged log line. + +- **[Concurrency]** N/A. The validator runs on the main goroutine before any goroutine is spawned. No shared mutable state. The `envContracts` table is a package-level `var` written once at init and read-only thereafter; nothing in the spec mutates it at runtime, and code review must keep it read-only (a runtime extension API would invert the trust model). + +- **[Threat model alignment]** No findings. `pyrycode/pyrycode/docs/protocol-mobile.md` § Security model and `docs/threat-model.md` both assume the relay is correctly configured for its deployment role (production with TLS; dev with plaintext for local loops). This ticket is the in-binary enforcement of "configuration is well-formed". The complement (a CI / deploy-manifest check that *required* env vars exist in production manifests) is out of scope and tracked separately; today no env var is `required: true` so that future work hooks in naturally when the first required key is introduced. + +The validator's adversarial surface is small: it parses no untrusted bytes (env vars are operator-controlled and bounded by the OS env page size, typically 128 KiB total per process), produces no externally-visible artifacts beyond a structured log line, and aborts the process on any contract violation. The fail-closed default is the correct stance for a boot-time gate. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-13 diff --git a/internal/relay/env_config.go b/internal/relay/env_config.go new file mode 100644 index 0000000..9918b3a --- /dev/null +++ b/internal/relay/env_config.go @@ -0,0 +1,92 @@ +package relay + +import ( + "errors" + "fmt" +) + +// ErrInvalidConfigSentinel is the package-level sentinel that every +// *ErrInvalidConfig matches against via errors.Is. Detect any env-var +// validation failure with errors.Is(err, ErrInvalidConfigSentinel); extract +// the offending key and reason with errors.As(err, &cfgErr) where cfgErr is +// *ErrInvalidConfig. +var ErrInvalidConfigSentinel = errors.New("relay: invalid env-var config") + +// ErrInvalidConfig is the structured error returned by CheckEnvConfig when a +// registered env var is missing-but-required or present-but-malformed. It +// exposes the offending env-var name and a human-readable reason so the +// caller can log a per-key remediation message at boot. +// +// Branchable via errors.Is(err, ErrInvalidConfigSentinel). Fields are +// extracted via errors.As(err, &cfgErr). +type ErrInvalidConfig struct { + Key string // env-var name (e.g. "PYRYCODE_RELAY_PRODUCTION") + Reason string // human-readable shape violation +} + +func (e *ErrInvalidConfig) Error() string { + return fmt.Sprintf("%s: %s: %s", ErrInvalidConfigSentinel, e.Key, e.Reason) +} + +// Is matches against the package-level sentinel so errors.Is(err, +// ErrInvalidConfigSentinel) returns true for any *ErrInvalidConfig instance +// regardless of Key / Reason. +func (e *ErrInvalidConfig) Is(target error) bool { + return target == ErrInvalidConfigSentinel +} + +// envContract is one row in the env-var registry. The validator iterates the +// registry in order at boot; each entry specifies the env-var name, whether +// the relay refuses to start when the var is unset, and a per-key shape +// validator. validate receives the raw value when the var is present and +// non-empty; the registry walk handles the unset and present-but-empty cases. +type envContract struct { + name string + required bool + validate func(value string) error +} + +// envContracts is the single source of truth for "what env vars the relay +// reads at boot". Every new env-var read added under cmd/pyrycode-relay or +// internal/relay must register an entry here. Code review enforces. +var envContracts = []envContract{ + { + name: envProductionMode, + required: false, + validate: func(v string) error { + if v == "1" { + return nil + } + return fmt.Errorf("expected %q or unset, got %q", "1", v) + }, + }, +} + +// CheckEnvConfig walks envContracts and returns *ErrInvalidConfig on the +// first failure. Returns nil on full pass. Intended to be called from main +// after flag parse, before any listener is started. +// +// lookup is the env-var lookup function with the os.LookupEnv signature: it +// returns (value, present). Pass os.LookupEnv at the call site; tests inject +// a func built from a map[string]string. Env values not registered in +// envContracts are ignored (the validator polices its declared contract, not +// arbitrary process env). +func CheckEnvConfig(lookup func(string) (string, bool)) error { + return checkEnvConfigWith(lookup, envContracts) +} + +func checkEnvConfigWith(lookup func(string) (string, bool), contracts []envContract) error { + for _, c := range contracts { + val, present := lookup(c.name) + if !present { + if c.required { + return &ErrInvalidConfig{Key: c.name, Reason: "missing"} + } + continue + } + if err := c.validate(val); err != nil { + return &ErrInvalidConfig{Key: c.name, Reason: "malformed-value: " + err.Error()} + } + } + return nil +} diff --git a/internal/relay/env_config_test.go b/internal/relay/env_config_test.go new file mode 100644 index 0000000..c34032b --- /dev/null +++ b/internal/relay/env_config_test.go @@ -0,0 +1,138 @@ +package relay + +import ( + "errors" + "strings" + "testing" +) + +func fakeLookup(env map[string]string) func(string) (string, bool) { + return func(k string) (string, bool) { + v, ok := env[k] + return v, ok + } +} + +func TestCheckEnvConfig_ValidEnvReturnsNil(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + env map[string]string + }{ + {name: "production unset is valid", env: map[string]string{}}, + {name: "production exact 1 is valid", env: map[string]string{envProductionMode: "1"}}, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if err := CheckEnvConfig(fakeLookup(tc.env)); err != nil { + t.Errorf("CheckEnvConfig(%v) = %v, want nil", tc.env, err) + } + }) + } +} + +func TestCheckEnvConfig_MalformedValueReturnsStructuredError(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + value string + wantReasonSub string + }{ + {name: "true is malformed", value: "true", wantReasonSub: `expected "1" or unset, got "true"`}, + {name: "zero is malformed", value: "0", wantReasonSub: `expected "1" or unset, got "0"`}, + {name: "yes is malformed", value: "yes", wantReasonSub: `expected "1" or unset, got "yes"`}, + {name: "leading space is malformed", value: " 1", wantReasonSub: `expected "1" or unset, got " 1"`}, + {name: "trailing space is malformed", value: "1 ", wantReasonSub: `expected "1" or unset, got "1 "`}, + {name: "uppercase PRODUCTION is malformed", value: "PRODUCTION", wantReasonSub: `expected "1" or unset, got "PRODUCTION"`}, + {name: "empty string is malformed", value: "", wantReasonSub: `expected "1" or unset, got ""`}, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + env := map[string]string{envProductionMode: tc.value} + err := CheckEnvConfig(fakeLookup(env)) + if err == nil { + t.Fatalf("CheckEnvConfig with value %q returned nil, want error", tc.value) + } + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Errorf("err %v should satisfy errors.Is(err, ErrInvalidConfigSentinel)", err) + } + var cfgErr *ErrInvalidConfig + if !errors.As(err, &cfgErr) { + t.Fatalf("err %v should satisfy errors.As(err, &*ErrInvalidConfig)", err) + } + if cfgErr.Key != envProductionMode { + t.Errorf("got Key=%q, want %q", cfgErr.Key, envProductionMode) + } + if !strings.HasPrefix(cfgErr.Reason, "malformed-value: ") { + t.Errorf("got Reason=%q, want prefix %q", cfgErr.Reason, "malformed-value: ") + } + if !strings.Contains(cfgErr.Reason, tc.wantReasonSub) { + t.Errorf("got Reason=%q, want contains %q", cfgErr.Reason, tc.wantReasonSub) + } + }) + } +} + +func TestCheckEnvConfig_MissingRequiredKey(t *testing.T) { + t.Parallel() + + contracts := []envContract{ + { + name: "PYRYCODE_TEST_REQUIRED", + required: true, + validate: func(string) error { return nil }, + }, + } + err := checkEnvConfigWith(fakeLookup(map[string]string{}), contracts) + if err == nil { + t.Fatal("checkEnvConfigWith with missing required key returned nil, want error") + } + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Errorf("err %v should satisfy errors.Is(err, ErrInvalidConfigSentinel)", err) + } + var cfgErr *ErrInvalidConfig + if !errors.As(err, &cfgErr) { + t.Fatalf("err %v should satisfy errors.As(err, &*ErrInvalidConfig)", err) + } + if cfgErr.Key != "PYRYCODE_TEST_REQUIRED" { + t.Errorf("got Key=%q, want %q", cfgErr.Key, "PYRYCODE_TEST_REQUIRED") + } + if cfgErr.Reason != "missing" { + t.Errorf("got Reason=%q, want %q", cfgErr.Reason, "missing") + } +} + +func TestCheckEnvConfig_IgnoresUnregisteredKeys(t *testing.T) { + t.Parallel() + + env := map[string]string{ + "FOO_BAR_BAZ": "garbage", + "PYRYCODE_UNKNOWN": "anything", + "PATH": "/usr/bin", + // PYRYCODE_RELAY_PRODUCTION intentionally absent (optional, unset is valid). + } + if err := CheckEnvConfig(fakeLookup(env)); err != nil { + t.Errorf("CheckEnvConfig with unregistered keys = %v, want nil", err) + } +} + +func TestErrInvalidConfigSentinel_IsBranchable(t *testing.T) { + t.Parallel() + + err := &ErrInvalidConfig{Key: "X", Reason: "missing"} + if !errors.Is(err, ErrInvalidConfigSentinel) { + t.Error("*ErrInvalidConfig should satisfy errors.Is(_, ErrInvalidConfigSentinel)") + } + var cfgErr *ErrInvalidConfig + if !errors.As(error(err), &cfgErr) { + t.Error("*ErrInvalidConfig should satisfy errors.As(_, &*ErrInvalidConfig)") + } +}