diff --git a/cmd/pyrycode-relay/deps_test.go b/cmd/pyrycode-relay/deps_test.go new file mode 100644 index 0000000..17b2115 --- /dev/null +++ b/cmd/pyrycode-relay/deps_test.go @@ -0,0 +1,59 @@ +package main + +import ( + "bytes" + "encoding/json" + "errors" + "io" + "os/exec" + "testing" +) + +// TestBinaryDoesNotImportPprof asserts that net/http/pprof is not in the +// transitive import set of the cmd/pyrycode-relay package. A blank-import +// of net/http/pprof anywhere in the dependency graph registers +// /debug/pprof/* handlers on http.DefaultServeMux and can expose an +// unauthenticated profiler endpoint. The boot-time CheckListenerPorts +// guard catches the :6060-bind variant; this build-graph test catches +// the handler-registration variant that would attach to an existing +// mux rather than opening a new port. +// +// Implementation: shell out to `go list -deps -json` against the +// canonical import path. The output is a stream of concatenated JSON +// objects (one per dep); we decode in a loop and fail if any +// ImportPath equals "net/http/pprof". +func TestBinaryDoesNotImportPprof(t *testing.T) { + t.Parallel() + + goBin, err := exec.LookPath("go") + if err != nil { + t.Skipf("go binary not in PATH: %v", err) + } + + const importPath = "github.com/pyrycode/pyrycode-relay/cmd/pyrycode-relay" + cmd := exec.Command(goBin, "list", "-deps", "-json", importPath) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + t.Fatalf("go list -deps -json %s: %v\nstderr: %s", importPath, err, stderr.String()) + } + + dec := json.NewDecoder(&stdout) + for { + var pkg struct { + ImportPath string `json:"ImportPath"` + } + if err := dec.Decode(&pkg); err != nil { + if errors.Is(err, io.EOF) { + break + } + t.Fatalf("decoding go list output: %v", err) + } + if pkg.ImportPath == "net/http/pprof" { + t.Fatalf("net/http/pprof is in the transitive imports of %s; "+ + "remove the import or it will register debug handlers on "+ + "http.DefaultServeMux", importPath) + } + } +} diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index 0c6932d..62daa71 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -13,6 +13,7 @@ import ( "log/slog" "net/http" "os" + "slices" "syscall" "time" @@ -119,6 +120,22 @@ func main() { WriteTimeout: 60 * time.Second, IdleTimeout: 120 * time.Second, } + port, err := relay.ListenerPort(srv.Addr) + if err != nil { + logger.Error("refusing to start: invalid listener address", + "err", err, "addr", srv.Addr) + os.Exit(2) + } + expected := map[uint16]struct{}{port: {}} + actual := map[uint16]struct{}{port: {}} + if err := relay.CheckListenerPorts(expected, actual); err != nil { + surplus, expectedList := listenerPortLists(expected, actual) + logger.Error("refusing to start: unexpected listener", + "err", err, + "unexpected_ports", surplus, + "expected_ports", expectedList) + os.Exit(2) + } if err := srv.ListenAndServe(); err != nil { logger.Error("listen failed", "err", err) os.Exit(1) @@ -154,6 +171,29 @@ func main() { IdleTimeout: 120 * time.Second, } + httpsPort, err := relay.ListenerPort(httpsSrv.Addr) + if err != nil { + logger.Error("refusing to start: invalid listener address", + "err", err, "addr", httpsSrv.Addr) + os.Exit(2) + } + httpPort, err := relay.ListenerPort(httpSrv.Addr) + if err != nil { + logger.Error("refusing to start: invalid listener address", + "err", err, "addr", httpSrv.Addr) + os.Exit(2) + } + expected := map[uint16]struct{}{443: {}, 80: {}} + actual := map[uint16]struct{}{httpsPort: {}, httpPort: {}} + if err := relay.CheckListenerPorts(expected, actual); err != nil { + surplus, expectedList := listenerPortLists(expected, actual) + logger.Error("refusing to start: unexpected listener", + "err", err, + "unexpected_ports", surplus, + "expected_ports", expectedList) + os.Exit(2) + } + logger.Info("starting", "version", Version, "mode", "autocert", "domain", *domain, "cert_cache", *certCache) @@ -170,6 +210,26 @@ func main() { } } +// listenerPortLists returns the ascending-sorted surplus (actual\expected) +// and expected port slices, suitable for emission as []uint16 fields on +// the boot-refusal log line. Computed in main (not in relay) so the +// CheckListenerPorts API stays a single error return — the duplicate +// pass is cheap at boot and runs at most once. +func listenerPortLists(expected, actual map[uint16]struct{}) (surplus, expectedSorted []uint16) { + for p := range actual { + if _, ok := expected[p]; !ok { + surplus = append(surplus, p) + } + } + slices.Sort(surplus) + expectedSorted = make([]uint16, 0, len(expected)) + for p := range expected { + expectedSorted = append(expectedSorted, p) + } + slices.Sort(expectedSorted) + return surplus, expectedSorted +} + func defaultCertCache() string { if home, err := os.UserHomeDir(); err == nil { return home + "/.pyrycode-relay/certs" diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index abcd26c..76c4fa5 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [Listener port allowlist (boot-time refusal)](features/listener-port-allowlist.md) — relay refuses to start (exit 2) if the set of TCP ports it is *about to bind* (`http.Server.Addr` values) contains any port outside an explicit expected set derived from parsed flags: `{443, 80}` in autocert mode, `{}` in `--insecure-listen :` mode. Catches stray `net/http/pprof :6060` listeners, env-flipped debug ports, accidentally-enabled metrics exporters. Three exports from `internal/relay/listeners.go`: `ErrUnexpectedListener` sentinel (branchable via `errors.Is`; wrapped message names surplus + expected ports both ascending so the failure log is deterministic across `map`-iteration runs and grep-friendly), `ListenerPort(addr string) (uint16, error)` parsing `":443"` / `"127.0.0.1:8080"` / `"[::1]:443"` with an explicit reject of port 0 (the ephemeral-placeholder trap — would smuggle an unknown bound port past the actual-set construction), and `CheckListenerPorts(expected, actual map[uint16]struct{}) error` (pure set-difference). **Asymmetric by design**: surplus = error, missing = nil (a failure to bind an expected port surfaces as `ListenAndServe`'s own bind error at exit 1; duplicating the signal at boot would clutter logs). **Port-only**: interface binding (`127.0.0.1` vs `0.0.0.0`) intentionally out of scope on a single-instance internet-exposed deploy. **Reports all surplus in one error** so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. Wired into each listener branch separately in `cmd/pyrycode-relay/main.go` (different `http.Server` shapes; lifting a helper would invent surface area without a second consumer); structured log fields `unexpected_ports` + `expected_ports` recomputed in `main` via unexported `listenerPortLists` so `Check…` stays a single-error return. Paired with `TestBinaryDoesNotImportPprof` in `cmd/pyrycode-relay/deps_test.go` — shells out to `go list -deps -json`, catches the `import _ "net/http/pprof"` handler-registration variant that attaches `/debug/pprof/*` to `http.DefaultServeMux` *without* opening a new port (the runtime check would miss it). Belt-and-suspenders means different fabric: stochastic-ish runtime guard + deterministic compile-time test, neither alone is complete. Exit 2 = config-rejected-at-boot, harmonised across the sentinel family (architect overrode the AC's literal `os.Exit(1)`) (#81). - [Env-var config validator (boot-time refusal)](features/env-config-validator.md) — table-driven validation of every env var the relay reads at boot. Single source of truth is the unexported `envContracts []envContract` registry in `internal/relay/env_config.go`; each row carries `name`, `required` bool, and an inline `validate func(string) error`. `CheckEnvConfig(lookup func(string) (string, bool)) error` walks the registry and returns the structured `*ErrInvalidConfig{Key, Reason}` on the first failure (`Reason` is `"missing"` or `"malformed-value: "`); the package-level sentinel `ErrInvalidConfigSentinel` is matched via a custom `Is` method (not `Unwrap`, which would double-print the message prefix) so `errors.Is(err, ErrInvalidConfigSentinel)` and `errors.As(err, &cfgErr)` form a dual contract. The `func(string) (string, bool)` (= `os.LookupEnv` shape) getter coexists with #77's `func(string) string` getter — the presence bit is necessary here to distinguish "missing-but-required" from "present-but-empty", semantically inert for `IsProductionMode`'s exact-`"1"` match. **Ordering is load-bearing**: wired in `main.go` BEFORE `CheckInsecureListenInProduction` so a typo like `PYRYCODE_RELAY_PRODUCTION=true` cannot slip through `IsProductionMode`'s silent-non-production fallback and reach the insecure-listen guard with an unvalidated value. Today's registry has one row (`PYRYCODE_RELAY_PRODUCTION`, optional-but-format-validated); future env-var reads register here at code-review time. `checkEnvConfigWith(lookup, contracts)` is the parameterised inner used by the `required: true` test case (today's production table has no required entries). Exit 2 = config-rejected-at-boot, matching the sibling refusals (#9, #77, #79) (#80). - [Linux capability allowlist (boot-time refusal)](features/capability-allowlist.md) — relay parses `/proc/self/status`'s `CapEff:` hex mask at boot and refuses to start (exit 2) if any bit is set outside `AllowedCapabilities` (currently `{CAP_NET_BIND_SERVICE}` only, motivated by autocert binding `:80`/`:443` from uid 65532 in the distroless image). Exported sentinel `ErrUnexpectedCapability` is branchable via `errors.Is`; the wrapped error names every offending bit symbolically (`CAP_SYS_ADMIN (bit 21)` or `bit 63` for unknown), lists the allowlist contents, and embeds the operator fix string. `CapEff` only — `CapPrm/CapBnd/CapInh` would broaden false-positives (legitimate K8s default policy grants wide CapBnd) without adding load-bearing protection (relay never `capset(2)`s). Linux/non-Linux split at compile time via the new `_.go` / `_other.go` build-tag convention (see ADR-0009); non-Linux GOOS logs one skip line and returns nil. Unconditional — no production-mode gating, no env-var bypass, because stray capabilities are never legitimate. Reader-boundary test seam (`func() (string, error)`) exercises the parse + mask check end-to-end without touching real `/proc`. Joins the boot-time-refusal sentinel family (#9, #77, #79; future #78) (#79). - [Production-mode contract & startup refusals](features/production-mode.md) — `PYRYCODE_RELAY_PRODUCTION=1` env-var contract (exact-string match, lazy read via injected getter, mirrors `PYRYCODE_RELAY_SINGLE_INSTANCE` shape from #64/#65) plus the boot-time checks that consume it. **#77** introduced `relay.CheckInsecureListenInProduction` + exported `ErrInsecureListenInProduction` sentinel (branchable via `errors.Is`) firing when production mode is on AND `--insecure-listen` is set. **#78** added the second consumer: `relay.CheckRunningAsRoot(geteuid, getenv)` + exported `ErrRunningAsRoot` sentinel firing when production mode is on AND `syscall.Geteuid() == 0`, closing the deploy-time gap (`docker run --user 0`, `securityContext.runAsUser: 0`, hand-edited Dockerfile dropping `USER`) that escapes the CI non-root-build contract (#32 Dockerfile, #68 Trivy). Both wired in `cmd/pyrycode-relay/main.go` after flag-parse with `os.Exit(2)` (config-rejected-at-boot, distinct from runtime-failure exit 1) and structured log fields: `env_var` carries the name only (never the value, even though `effective_uid` carries the kernel-supplied int — log-injection structurally impossible), one-line `fix` listing valid resolutions. `IsProductionMode` exported so siblings compose on the same predicate without re-reading the env var. Test seams: `func(string) string` for env, `func() int` for euid — both the smallest possible (no interface, no struct, no package-level var) and the only way to exercise the uid-0 branch in a unit test without re-execing the test binary as root. Two instances of the shape (#77, #78) now codify the "sibling boot-check" pattern; `Config.Validate()` consolidation deferred until ~5 checks exist (#77, #78). diff --git a/docs/knowledge/codebase/81.md b/docs/knowledge/codebase/81.md new file mode 100644 index 0000000..2712ec3 --- /dev/null +++ b/docs/knowledge/codebase/81.md @@ -0,0 +1,52 @@ +# Ticket #81 — refuse to boot when listener ports exceed expected set + +Adds a boot-time set-comparison check: the TCP ports the process is *about to bind* (`http.Server.Addr` values) versus the ports it is *permitted to bind* (derived from `--domain` / `--insecure-listen`). Any actual port not in the expected set fails the deploy's health check (exit 2) before traffic flows, catching stray `net/http/pprof :6060` listeners, env-flipped debug ports, and accidentally-enabled metrics exporters. A companion `go list -deps -json` test catches the `import _ "net/http/pprof"` handler-registration variant that would attach to an existing mux without opening a new port. Split from #42; sibling of #77 / #78 / #79 / #80. + +## Implementation + +- **`internal/relay/listeners.go` (new, 86 lines)** — three exports, all in one file (no per-GOOS split: pure functions, no syscalls): + - `ErrUnexpectedListener` sentinel — branchable via `errors.Is`. Wrapped error names surplus ports (ascending) and the expected-set contents (ascending); both ordered so the failure log is deterministic and grep-friendly across runs. + - `ListenerPort(addr string) (uint16, error)` — wraps `net.SplitHostPort` + `strconv.ParseUint(_, 10, 16)`. Rejects empty, malformed, and port-0 inputs with wrapped errors. Upper-bound check is implicit in `ParseUint`'s 16-bit width. + - `CheckListenerPorts(expected, actual map[uint16]struct{}) error` — pure set-difference; reports **all** surplus ports in one error (not just the first) so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. +- **`internal/relay/listeners_test.go` (new, 172 lines)** — `t.Parallel()`-safe table-driven tests covering the AC matrix verbatim (`autocert_match`, `insecure_match`, `surplus_pprof`, `actual_subset_of_expected`, `empty_both`) plus `TestListenerPort_Matrix` (12 rows including the load-bearing `":0"` reject), `TestCheckListenerPorts_SurplusListedSorted` (locks the determinism contract — without it `map` iteration would make failure logs vary), `TestCheckListenerPorts_MultipleSurplus` (locks "report all, not just the first"), and `TestErrUnexpectedListener_IsBranchable`. No real listener binds, no `os.Setenv`. +- **`cmd/pyrycode-relay/deps_test.go` (new, 59 lines)** — `TestBinaryDoesNotImportPprof` shells out to `go list -deps -json github.com/pyrycode/pyrycode-relay/cmd/pyrycode-relay`, decodes the concatenated JSON stream, fails if any `ImportPath == "net/http/pprof"`. `t.Skip`s (with logged reason) if `go` is not in `$PATH`. Hardcoded import-path literal — no user input reaches `exec.Command` argv. The test catches the handler-registration variant of the pprof threat that the runtime port check would miss because no new port is opened. +- **`cmd/pyrycode-relay/main.go` (+60 lines)** — check wired into each listener branch separately (the autocert and insecure branches have different `http.Server` shapes; no shared structure to lift it out of without restructuring the function more than this ticket warrants): + - **Insecure branch (lines 113–143)**: `ListenerPort(srv.Addr)` → `expected = actual = {port}` → `CheckListenerPorts` → on error, log + exit 2 *before* `srv.ListenAndServe()`. + - **Autocert branch (lines 152–195)**: `ListenerPort(httpsSrv.Addr)` + `ListenerPort(httpSrv.Addr)` → `expected = {443, 80}`, `actual = {httpsPort, httpPort}` → `CheckListenerPorts` → on error, log + exit 2 *before* the `httpSrv.ListenAndServe` goroutine spawn. + - `listenerPortLists(expected, actual)` unexported helper in `main.go` recomputes the ascending-sorted `surplus` + `expectedSorted` slices for the structured log fields, so `CheckListenerPorts` stays a single error return. Duplicate pass is cheap at boot and runs at most once. + - Malformed-address path is distinct: `ListenerPort` returns a non-sentinel wrapped error, main logs `"refusing to start: invalid listener address"` with an `addr` field, exits 2 separately from the surplus path. + +## Acceptance criteria — verification map + +- **AC-1** (sentinel `ErrUnexpectedListener` exported, `errors.Is`-branchable, wrapped error names ports): `listeners.go:22, 85`; locked by `TestErrUnexpectedListener_IsBranchable`. +- **AC-2** (check function takes expected + actual sets, returns sentinel iff surplus, asymmetric): `listeners.go:67-86`; matrix-covered by `TestCheckListenerPorts_ACMatrix` (rows `autocert_match` + `insecure_match` → nil; `surplus_pprof` → sentinel; `actual_subset_of_expected` → nil; `empty_both` → nil). +- **AC-3** (`main.go` builds expected from flags, actual from `http.Server.Addr`, invokes check before `Listen*`, fail-fasts with structured log + sentinel-wrapped err): `main.go:113-138` (insecure), `main.go:152-195` (autocert). +- **AC-4** (binary does not transitively import `net/http/pprof`): `deps_test.go` via `go list -deps -json`. The architect's call between build-tag check and `go list` parsing landed on `go list` (catches transitive drift, unconditional). +- **AC-5** (unit tests cover AC's four cases plus the autocert-on shape for case a): `listeners_test.go:61-126`; the matrix splits case (a) into `autocert_match` and `insecure_match` so a regression in either shape is named. + +## Patterns established + +- **Set-difference asymmetry as a deliberate contract.** Surplus = error; missing = nil. The reasoning that justifies it is captured in the package doc and the `Check…` doc comment: missing ports surface as `ListenAndServe`'s own bind error path (exit 1 on a runtime failure), so naming them at boot would duplicate the signal. Apply the same shape to any future "actual versus expected" boot check whose missing-case has a louder runtime signal. +- **Recompute sorted slices in `main` rather than expanding the check's return signature.** `CheckListenerPorts` returns a single `error`. The ascending `surplus` + `expectedSorted` slices needed for the structured-log fields are recomputed in `main` via an unexported helper. Keeps the package API minimal (one error, one shape) and trades a cheap O(n log n) pass for not having to return a struct or a multi-return tuple. Reach for the same trade-off when a check needs to surface "report fields" to the call site without contaminating the error-only API. +- **Pair stochastic-ish runtime guard with deterministic compile-time test.** The "belt-and-suspenders means different fabric" rule from the project handbook: the boot-time port check (stochastic in the sense that whether it fires depends on what the process tries to bind at runtime) is paired with `TestBinaryDoesNotImportPprof` (deterministic, runs every CI). The two layers catch different shapes of the pprof threat — bind vs. handler-registration — and neither alone is complete. Apply the same instinct any time a runtime defence has a known threat-shape blind spot a build-graph or source-AST test could close. +- **Boot-time-refusal family is now six (#9, #77, #78, #79, #80, #81).** The wiring boilerplate has crossed the spec's stated consolidation threshold ("~5 checks"). The consolidation follow-up against #42's epic is now owed; see "Out of scope" in the feature doc. + +## Lessons learned + +- **Override the AC's exit code when the codebase has a precedent the PO didn't see.** The issue body literally said `os.Exit(1)`. The architect overrode to `os.Exit(2)` so this check stays consistent with #9 / #77 / #78 / #79 / #80 — exit 1 in `main` is reserved for *runtime* listener failures (`ListenAndServe` returning a bind/accept error); exit 2 is *boot-time configuration refusal*. Ops dashboards split on exit code, and the harmonisation was load-bearing. General principle for ticket-spec harmonisation: when the AC is precise on a contract that conflicts with an established convention in the same area, surface the divergence, pick consistency, document the override in the spec — don't silently follow the AC. +- **The architect's call between `go list -deps -json` and a build-tag check matters.** Both were AC-acceptable. `go list -deps` was picked because (a) it catches transitive imports (a third-party dep blank-importing pprof would slip past a source-only check) and (b) it runs unconditionally (a build-tag check only fires when something explicitly references the gated package, so a sleeping import wouldn't trip it). The cost — needing `go` in `$PATH` — is met on every CI runner and `t.Skip`s loudly otherwise. Rule: when offered a choice between a narrow compile-time check and a broad subprocess-driven one, default to the broader coverage if the environment cost is met everywhere it will run. +- **Two-branch wiring beats restructuring `main` for a 5-line insertion.** The natural instinct is "lift the check into a shared helper because the autocert and insecure branches both need it." Resisted. The branches have different `http.Server` shapes, the lifted helper would either need branch-specific arguments (re-introducing the bifurcation) or a struct param (a small abstraction the ticket has no second consumer for). Inserting the 5-line check into each branch is concretely cleaner and matches the "don't pre-build abstractions" rule from CLAUDE.md. Carry over to any future per-branch wiring where the lift would invent surface area. +- **Log only on the failure path.** The architect's security-review SHOULD-FIX flagged this explicitly: do not log the expected/actual port sets on the success path. Silence on success is the convention across #77 / #78 / #79 / #80, and broadcasting "I bound exactly these ports" on every healthy boot would clutter centralised logging. Mirrored without deviation in this ticket; preserve the rule for any future boot-check addition. +- **Port 0 is the load-bearing reject in the parser.** `:0` is valid syntax for `net.SplitHostPort` and parses cleanly to `uint16(0)`, but in `net.Listen` semantics it means "pick an ephemeral port." If accepted, the actual-set would contain `0` while the bound socket would have an unknown port — the check would let the surplus through under the wrong name. The explicit reject in `ListenerPort` (with a dedicated test row) is the contract that keeps the check honest. Watch for the same "syntactically-valid-but-semantically-distinct-zero-value" trap in any future parser at a set-construction boundary. + +## Cross-links + +- [Listener port allowlist (feature)](../features/listener-port-allowlist.md) — operator-facing doc: contract, API, wiring, threat-model alignment. +- [Codebase note #79](79.md) — immediate precedent (same parent #42, same "explicit allowlist + boot-time refusal" shape, same wiring slot). +- [Codebase note #77](77.md) / [#78](78.md) — sibling production-mode refusals; same exit-code convention. +- [Codebase note #80](80.md) — sibling env-var validation refusal. +- [Capability allowlist (feature)](../features/capability-allowlist.md) — sibling allowlist (caps vs. ports). +- [Autocert TLS (feature)](../features/autocert-tls.md) — motivates the autocert-mode expected set `{443, 80}`. +- [`internal/relay/tls.go`](../../../internal/relay/tls.go) — `ErrCacheDirInsecure`, the canonical boot-time-refusal sentinel (#9). +- [Spec #81](../../specs/architecture/81-listener-port-allowlist-boot-check.md) — full architectural spec. +- [#42 — parent ticket](https://github.com/pyrycode/pyrycode-relay/issues/42) — split into #77 / #78 / #79 / #81. diff --git a/docs/knowledge/features/listener-port-allowlist.md b/docs/knowledge/features/listener-port-allowlist.md new file mode 100644 index 0000000..7d13e55 --- /dev/null +++ b/docs/knowledge/features/listener-port-allowlist.md @@ -0,0 +1,82 @@ +# Listener port allowlist (boot-time refusal) + +The relay refuses to start when the set of TCP ports the process is *about to bind* contains any port outside an explicit expected set derived from parsed flags. A stray `net/http/pprof` listener on `:6060`, an env-flipped debug port, or an accidentally-enabled metrics exporter fails the deploy's health check rather than silently exposing an unauthenticated HTTP surface on the public internet. Added by #81. + +## The contract + +- **Asymmetric.** Surplus listeners (`actual ⊋ expected`) are the threat shape and trigger refusal. Missing listeners (`actual ⊊ expected`) are *not* an error here — a failure to bind an expected port surfaces as a runtime listener error from `http.Server.ListenAndServe` (exit 1), which has its own clearer signal. Empty-on-both is permitted. +- **Port-only.** Interface binding (`127.0.0.1` vs `0.0.0.0` vs `::`) is intentionally not part of the contract. The threat model the check encodes is "a listener exists on this port and is reachable"; on a single-instance internet-exposed deploy, every listener is reachable. A future ticket adding management-only loopback listeners would extend the set type without breaking the port-set shape. +- **Expected set is flag-derived.** Autocert mode (`--domain` set): `{443, 80}`. Insecure mode (`--insecure-listen :`): `{port}`. The expected set is built from the parsed flag values in `main`, not from a static literal — `--insecure-listen :8080` yields `{8080}`, not `{443}`. +- **Actual set is `http.Server.Addr`-derived.** The check runs after the `http.Server` literals are constructed but *before* either `ListenAndServe` call. The relay's only network ingress today is `http.Server`-mediated; a future non-HTTP listener (gRPC, raw TCP) would need a deliberate spec update to extend the actual-set source. +- **Port 0 is rejected.** `:0` means "pick an ephemeral port" in `net.Listen` semantics — accepting it would smuggle an unknown bound port past the actual-set construction and defeat the check. `ListenerPort` returns a wrapped error. + +## API (`internal/relay/listeners.go`) + +- `ErrUnexpectedListener` — exported sentinel, branchable via `errors.Is`. Wrapped error message names the surplus ports in ascending order plus the expected-set contents (also ascending) so the failure log is deterministic across runs and grep-friendly. +- `ListenerPort(addr string) (uint16, error)` — extracts the TCP port from any `http.Server.Addr` shape (`":443"`, `"127.0.0.1:8080"`, `"[::1]:443"`). Wraps `net.SplitHostPort` + `strconv.ParseUint(_, 10, 16)`; the upper-bound check is implicit in `ParseUint`'s `16`-bit width, and port 0 is rejected explicitly. +- `CheckListenerPorts(expected, actual map[uint16]struct{}) error` — pure function, no I/O. Returns `ErrUnexpectedListener` (wrapped) if any element of `actual` is absent from `expected`, nil otherwise. Reports **all** surplus ports in one shot (not just the first) so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. + +The maps are `map[uint16]struct{}` rather than `[]uint16` so set-difference is O(actual). The ascending-sorted ordering is a deliberate contract: `map` iteration is randomised, which would make the failure log non-deterministic and complicate operator grepping. + +## Wiring (`cmd/pyrycode-relay/main.go`) + +The check is inserted into each listener branch separately — the two branches have different `http.Server` shapes and there is no shared structure to lift it out of without restructuring `main`. Both branches run the check *after* the `http.Server` literals exist and *before* either `ListenAndServe` call (in the autocert branch, before the goroutine that runs `httpSrv.ListenAndServe` is spawned). + +**Insecure-mode branch.** Builds `expected = {port}` and `actual = {port}` from the parsed flag, runs the check, exits 2 on surplus. + +**Autocert-mode branch.** Builds `expected = {443, 80}` and `actual = {ListenerPort(httpsSrv.Addr), ListenerPort(httpSrv.Addr)}`, runs the check, exits 2 on surplus. + +Three details: + +- **Exit code 2** = config-rejected-at-boot, matching #9 / #77 / #78 / #79. Exit 1 in this file is reserved for *runtime* listener failures (`ListenAndServe` returning a bind/accept error). Distinct codes let ops dashboards split "deploy never started because of misconfiguration" from "deploy started and then crashed." The AC body literally said `os.Exit(1)`; the architect's spec overrode to harmonise. +- **Structured log fields**: `err` (wrapped sentinel), `unexpected_ports` (`[]uint16`, ascending), `expected_ports` (`[]uint16`, ascending). The surplus / expected slices are recomputed in `main` (via the unexported `listenerPortLists` helper) so `CheckListenerPorts` stays a single error return — the duplicate pass is cheap at boot and runs at most once. +- **Malformed-address path is distinct.** A `ListenerPort` parse error (e.g. `--insecure-listen notaport`) is a flag-validation failure, not a surplus-listener failure; main exits 2 with `"refusing to start: invalid listener address"` and an `addr` field, separate from the `"unexpected listener"` log line. + +## Build-graph defence (`cmd/pyrycode-relay/deps_test.go`) + +`TestBinaryDoesNotImportPprof` is a compile-time companion to the runtime port check. It shells out to `go list -deps -json github.com/pyrycode/pyrycode-relay/cmd/pyrycode-relay`, decodes the streamed JSON, and fails if any entry's `ImportPath == "net/http/pprof"`. The two layers catch different shapes of the same threat: + +- **Runtime check** catches a stray `:6060` *bind* — pprof's `go func() { http.ListenAndServe(":6060", nil) }()` shape. +- **Build-graph test** catches the `import _ "net/http/pprof"` *handler-registration* variant that attaches `/debug/pprof/*` to `http.DefaultServeMux` without opening a new port — would slip past the runtime port check entirely and expose unauthenticated profiler handlers on whatever mux the default mux is mounted on. + +The build-graph layer spans transitive imports, not just direct ones — a third-party dep blank-importing `net/http/pprof` (which has happened in real Go ecosystems) is the realistic drift surface. The test `t.Skip`s with a logged reason if `go` is not in `$PATH`; CI runners always have it. + +This is the "belt-and-suspenders means different fabric" rule from the project handbook: pairing a stochastic-ish runtime guard with a deterministic compile-time test. Both are needed; either alone leaves a known gap. + +## Failure modes (three distinct return shapes) + +| Cause | Source | Return | Branchable via | +|---|---|---|---| +| Surplus listener | `CheckListenerPorts` | `fmt.Errorf("%w: unexpected ports …; expected ports …", ErrUnexpectedListener)` | `errors.Is(err, ErrUnexpectedListener)` | +| Malformed listener address | `ListenerPort` | `fmt.Errorf("relay: parsing listener address %q: %w", addr, err)` (or empty/zero-port variants) | underlying `strconv.NumError` / `net.AddrError` | +| `net/http/pprof` in transitive imports | `TestBinaryDoesNotImportPprof` | `t.Fatalf` at CI time, naming the offending import path | — (test-time) | + +All three runtime paths exit 2 in `main`. The build-graph defence fails the CI pipeline before the binary ever ships. + +## Threat model alignment + +`docs/threat-model.md` § Deploy treats operator misconfiguration as the dominant failure class for an internet-exposed relay. The listener-allowlist check joins the boot-time-refusal family: + +- `ErrCacheDirInsecure` (#9) — autocert cache dir mode drift +- `ErrInsecureListenInProduction` (#77) — plaintext-in-prod transport drift +- `ErrRunningAsRoot` (#78) — uid-0 in production +- `ErrUnexpectedCapability` (#79) — over-broad capability grant +- `ErrInvalidConfigSentinel` (#80) — env-var typo / malformed value +- `ErrUnexpectedListener` (#81) — surplus listener bound by the process + +All six share the "refuse to start; fail the health check; never serve traffic in this configuration" shape. With six instances of the pattern, the consolidation follow-up (Config.Validate() multi-error) is owed against the parent #42 epic. + +## Out of scope (deferred) + +- **Interface binding.** `127.0.0.1` vs `0.0.0.0` vs `::` is port-only-equivalent today. Revisit if a management-only loopback listener is ever added. +- **Non-HTTP listeners.** Raw `net.Listen` without an `http.Server` wrapper would slip through — the relay's only network ingress today is `http.Server`-mediated. A future protocol that adds (e.g.) gRPC would need a deliberate spec update to extend the actual-set source. +- **Consolidation under `Config.Validate()`.** Sixth boot-check in `internal/relay`. The consolidation work owns the multi-error API design and the migration of all six call sites; deferred to a follow-up against #42's epic. + +## Cross-links + +- [Codebase note #81](../codebase/81.md) — per-ticket implementation detail. +- [Production-mode contract](production-mode.md) — sibling boot-time refusals (#77, #78). +- [Capability allowlist](capability-allowlist.md) — sibling boot-time refusal (#79); immediate precedent for the explicit-allowlist shape. +- [Env-var config validator](env-config-validator.md) — sibling boot-time refusal (#80). +- [Autocert TLS](autocert-tls.md) — `ErrCacheDirInsecure` is the original boot-time-refusal sentinel (#9). +- [#42 — parent ticket](https://github.com/pyrycode/pyrycode-relay/issues/42) — split into #77 / #78 / #79 / #81. diff --git a/docs/specs/architecture/81-listener-port-allowlist-boot-check.md b/docs/specs/architecture/81-listener-port-allowlist-boot-check.md new file mode 100644 index 0000000..e77cfe2 --- /dev/null +++ b/docs/specs/architecture/81-listener-port-allowlist-boot-check.md @@ -0,0 +1,293 @@ +# Spec: refuse to boot when listener ports exceed expected set + +Ticket: [#81](https://github.com/pyrycode/pyrycode-relay/issues/81). Size S. Split from #42. Sibling of #77 / #78 / #79. + +## Files to read first + +- `cmd/pyrycode-relay/main.go` (whole file, 178 lines) — the only call site. Lines 89–94 hold the most recent boot-check wiring (`CheckCapabilities`); the new listener-allowlist check slots in after `mux` is built (line 110) but before either listener branch starts. Lines 112–127 hold the insecure branch (single `http.Server.Addr = *insecureListen`), 135–155 hold the autocert branch (`httpsSrv.Addr = ":443"`, `httpSrv.Addr = ":80"`), 160–170 hold the two `ListenAndServe` calls the check must run *before*. +- `internal/relay/production.go` (whole file, 77 lines) — the canonical "boot-time refusal helper" pattern: single exported `Check…` function returning `error`, sentinel branchable via `errors.Is`, injected test seam, no init/global mutation. Lines 11–17 are the `ErrInsecureListenInProduction` shape this spec mirrors for `ErrUnexpectedListener`. +- `internal/relay/production_test.go` (whole file, 199 lines) — the table-driven, `t.Parallel()`-safe, `errors.Is`-against-sentinel test style for boot-check helpers. The 2×2 matrix in `TestCheckInsecureListenInProduction_Matrix` (lines 47–105) is the exact shape this spec's `CheckListenerPorts` matrix uses, swapping env/flag axes for expected/actual-set axes. +- `internal/relay/tls.go:15-19` — canonical sentinel-error declaration shape (`var Err... = errors.New("relay: …")` + Go doc that names the contract). Mirror it. +- `internal/relay/tls.go:46-47` — canonical wrapped-sentinel return shape (`fmt.Errorf("%w: %s (mode %o)", ErrCacheDirInsecure, …)`). The listener check uses this shape when naming the offending port(s). +- `docs/specs/architecture/79-capability-allowlist-boot-check.md` — the immediate precedent (sibling ticket, same parent #42). Same call-site neighbourhood, same sentinel-error contract, same testing approach. The "Why not also check …" framing (CapPrm/CapBnd) maps directly onto this ticket's "Why port-only, not interface" decision. +- `docs/specs/architecture/77-refuse-insecure-in-production.md:105-113` — the "Why not a `Config` struct" decision. The same rationale (one `Check…` per concern; consolidate when the count hits ~3 and the wiring boilerplate is a real cost) carries over here. +- `docs/PROJECT-MEMORY.md` § "Project-level conventions" — the "Sentinel errors + `errors.Is` branching", "Loud failure over silent correction", and "Tests live in the same package" rules apply directly. + +## Context + +The relay's intended listeners are exactly two shapes: + +- **Autocert mode** (`--domain `): TCP/443 (HTTPS terminus, autocert-managed cert) and TCP/80 (ACME HTTP-01 challenge + 404). +- **Insecure mode** (`--insecure-listen :`): a single TCP listener on the operator-supplied port, fronted by an upstream proxy. + +Anything else the process binds is suspect. The high-frequency leak shapes are: + +- **`net/http/pprof` blank-import.** Importing the package for any reason registers `/debug/pprof/*` handlers on `http.DefaultServeMux` *and* (since Go 1.22) starts the server on `:6060` via `http.ListenAndServe(":6060", …)` if anything anywhere in the binary calls it. A single `import _ "net/http/pprof"` slipped into a debug-flagged file (or pulled transitively from a profiling library) opens an unauthenticated stack-trace / heap-dump endpoint on every box. +- **Env-flipped debug ports.** A future flag (`--debug-listen :`) added as a developer convenience and forgotten in a deploy manifest. +- **Accidentally-enabled metrics exporters.** Prometheus client wiring (`promhttp.Handler` on `:9100`) is a one-line mistake that exposes process internals on the internet. + +The defence this ticket adds is a boot-time set-comparison: the ports the process is *about to bind* (the `Addr` fields of the constructed `http.Server` values) compared against the ports it is *permitted to bind* (derived from parsed flags). Any actual port not in the expected set fails the deploy's health check before traffic flows. + +The check is **asymmetric** by design. Surplus listeners (actual ⊋ expected) are the threat shape. Missing listeners (actual ⊊ expected) are not — a failure to bind an expected port will surface as `EADDRINUSE` (or similar) from `ListenAndServe` itself, and the relay will exit 1 on the runtime-error path. This split keeps the boot check focused on one observable failure mode. + +The check is **port-only**. Interface binding (`127.0.0.1` vs `0.0.0.0` vs `::`) is out of scope per the AC's technical notes. The threat model the check encodes is "a listener exists on this port and is reachable"; on a single-instance internet-exposed deploy, every listener is reachable. A future ticket that adds management-only `127.0.0.1` listeners can extend the set type without breaking the port-set contract — the migration path is "wrap `uint16` with a richer record"; the migration is trivial because nothing outside `internal/relay` constructs these sets except `cmd/`. + +Related: #9 (`ErrCacheDirInsecure` set the boot-time-refusal precedent); #77 (`ErrInsecureListenInProduction`); #78 (`ErrRunningAsRoot`); #79 (`ErrUnexpectedCapability` — same "explicit allowlist" pattern). All four share the "refuse to start; fail the health check; never serve traffic in this configuration" shape. + +## Design + +Three new files; one edit to `cmd/pyrycode-relay/main.go`. No existing code is refactored. + +| Path | Purpose | New? | +|---|---|---| +| `internal/relay/listeners.go` | `ErrUnexpectedListener` sentinel, `ListenerPort` parser, `CheckListenerPorts` check | new | +| `internal/relay/listeners_test.go` | Unit tests for the parser and check (AC matrix lives here) | new | +| `cmd/pyrycode-relay/deps_test.go` | Build-graph test that `net/http/pprof` is not in the binary's transitive imports | new | +| `cmd/pyrycode-relay/main.go` | Build expected + actual sets, invoke check before `Listen*` calls | edit (~25 lines added) | + +### Sentinel error and helpers (in `internal/relay/listeners.go`, exported) + +Three exports plus one sentinel. The shapes: + +```go +// ErrUnexpectedListener is returned by CheckListenerPorts when the set +// of ports the process is about to bind contains any port outside the +// expected set (derived from parsed flags). Stray listeners +// (net/http/pprof on :6060, a forgotten debug exporter, a metrics +// endpoint accidentally enabled) are the threat shape: an unauthenticated +// HTTP surface on the public internet. The relay refuses to start so the +// misconfiguration fails the deploy's health check rather than serving +// traffic on the surplus port. +// +// Branchable via errors.Is. The wrapped error names the offending +// port(s) and the expected-ports set. +var ErrUnexpectedListener = errors.New("relay: process is about to bind a TCP port outside the expected set") + +// ListenerPort extracts the TCP port from an http.Server Addr value +// such as ":443", "127.0.0.1:8080", or "[::1]:443". Returns a wrapped +// error if addr is empty or does not contain a parseable port in +// 1..65535. Used by cmd/pyrycode-relay to canonicalise both the expected +// (flag-derived) and actual (http.Server.Addr-derived) port sets onto +// the same uint16 key. +func ListenerPort(addr string) (uint16, error) + +// CheckListenerPorts returns ErrUnexpectedListener (wrapped, naming the +// offending port(s) in ascending order plus the expected-set contents) +// when any element of actual is absent from expected. Returns nil +// otherwise. +// +// The check is asymmetric: missing ports (actual ⊊ expected) are NOT an +// error — a failure to bind an expected port surfaces as a runtime +// listener error from http.Server.ListenAndServe, which has its own +// exit path. This check exists to catch the surplus-listener case +// (actual ⊋ expected) where an unauthenticated debug endpoint would +// otherwise be exposed on the internet. +// +// Both arguments are sets keyed by TCP port (uint16). Empty sets are +// valid inputs: CheckListenerPorts(∅, ∅) returns nil. +func CheckListenerPorts(expected, actual map[uint16]struct{}) error +``` + +### `ListenerPort` — algorithm + +1. If `addr == ""` → return `0, fmt.Errorf("relay: listener address is empty")`. +2. Call `net.SplitHostPort(addr)`. On error → return `0, fmt.Errorf("relay: parsing listener address %q: %w", addr, err)`. +3. `strconv.ParseUint(port, 10, 16)` on the port substring. On error → wrap with the same prefix. +4. Reject `0` explicitly: `port == 0` means "pick an ephemeral port" in net.Listen semantics; the relay never wants that, and an actual-set entry of `0` would defeat the check. Return a wrapped error. +5. Return `uint16(parsed), nil`. + +Bounds check on the upper end is implicit in `ParseUint(_, 10, 16)`: anything > 65535 fails the parse. + +### `CheckListenerPorts` — algorithm + +1. Compute `surplus := slices.Sorted(actual \ expected)` (ports in `actual` not in `expected`, ascending). +2. If `surplus` is empty → return `nil`. +3. Format the error message: + ``` + relay: process is about to bind a TCP port outside the expected set: \ + unexpected ports [N, M, …]; expected ports [P, Q, …] + ``` + Both lists are sorted ascending so the message is deterministic across runs (`map[uint16]struct{}` iteration order is randomised, which would make the failure log non-deterministic and complicate operator grepping). +4. Return `fmt.Errorf("%w: unexpected ports %v; expected ports %v", ErrUnexpectedListener, surplus, sortedExpected)`. + +**Decision: report all surplus ports, not just the first.** A misconfigured manifest can enable several debug surfaces in one breath (`pprof` + `metrics` + a forgotten dev port); listing one port at a time would force the operator through N restart cycles. The error message names every offending port. Mirrors the `ErrUnexpectedCapability` decision in #79. + +**Decision: do not return information about missing ports in the error.** The asymmetric contract is the whole point. If the operator's deploy somehow ends up with `actual ⊊ expected`, the next thing that happens is `http.Server.ListenAndServe` returns and the process exits 1 with a runtime "listen tcp :443: bind: ..." error — that's the clearer signal for that failure mode. + +### Wiring in `cmd/pyrycode-relay/main.go` + +The constructed-but-not-yet-bound `http.Server` values are the source of truth for the actual set. The natural insertion point is *after* both `http.Server` values exist *and* after `mux` is wired, but *before* either `ListenAndServe` call. The two listener branches have different shapes, so the simplest wiring keeps the check inside each branch — there is no shared structure to lift it out of without restructuring the function more than this ticket warrants. + +**Insecure-mode branch** (currently lines 112–127). Insert after the `srv` literal, before `srv.ListenAndServe()`: + +``` +// 1. Build expected set: a single port from --insecure-listen. +// 2. Build actual set: ListenerPort(srv.Addr). +// 3. Call relay.CheckListenerPorts(expected, actual); on error, +// logger.Error(... "unexpected_ports", surplus, "expected_ports", ...) + os.Exit(2). +``` + +**Autocert-mode branch** (currently lines 129–170). Insert after both `httpsSrv` and `httpSrv` literals exist, before the goroutine that runs `httpSrv.ListenAndServe()`: + +``` +// 1. Build expected set: {443, 80}. +// 2. Build actual set: ListenerPort(httpsSrv.Addr), ListenerPort(httpSrv.Addr). +// 3. Call relay.CheckListenerPorts(expected, actual); on error, +// logger.Error(... "unexpected_ports", surplus, "expected_ports", ...) + os.Exit(2). +``` + +Three details: + +- **Exit code 2**, matching #77 / #78 / #79. The AC body literally says `os.Exit(1)`; I am overriding that to keep the boot-check exit-code contract consistent across the four sibling checks. Exit 1 in this file is reserved for *runtime* listener failures (`http.Server.ListenAndServe` returning a bind/accept error). Exit 2 is *boot-time configuration refusal*. Distinct codes let ops dashboards split "deploy never started because of misconfiguration" from "deploy started and then crashed." The PO body did not consider the precedent set by #77's spec; the architect's job is to harmonise. +- **Structured log line fields:** `err` (the wrapped sentinel), `unexpected_ports` (the surplus list as `[]uint16`), `expected_ports` (the expected list as `[]uint16`). These satisfy the AC's "at minimum: the offending port(s), the expected-ports set, and an `err` field carrying the wrapped sentinel." +- **`ListenerPort` parse errors → also `os.Exit(2)`** with `logger.Error("refusing to start: invalid listener address", "err", err, "addr", addr)`. A malformed `--insecure-listen` value (e.g. `notaport`) is a flag-validation failure, not a surplus-listener failure; surface it distinctly so the operator sees the right fix. This happens before the `CheckListenerPorts` call, on either branch. + +### Why a fresh helper per check, not a shared `Config.Validate()` + +Same rationale as the precedent in #77's spec § "Why not a `Config` struct." `CheckListenerPorts` is the fourth `Check…` to land in `internal/relay`. The line of "consolidate when the count hits ~3 and the wiring boilerplate becomes a real cost" has been crossed *by counting*, but the wiring boilerplate in `main.go` is still tractable (each `Check…` is a 5-line `if err := … { logger.Error(…); os.Exit(2) }` block). The consolidation work is a separate follow-up that owns both the API design (multi-error return? short-circuit?) and the migration; doing it here would expand the ticket beyond the AC. **Action:** when this ticket closes, file a follow-up "consolidate boot-check helpers" issue against the parent #42's epic. + +### Build-graph test in `cmd/pyrycode-relay/deps_test.go` + +```go +package main + +// TestBinaryDoesNotImportPprof asserts that net/http/pprof is not in the +// transitive import set of the cmd/pyrycode-relay package. A blank-import +// of net/http/pprof anywhere in the dependency graph registers /debug/pprof/* +// handlers on http.DefaultServeMux and (under Go 1.22+ default configs) can +// open :6060 unattended. The boot-time CheckListenerPorts guard catches the +// :6060-bind variant; this build-graph test catches the handler-registration +// variant that wouldn't bind a new port but would attach to an existing mux. +// +// Implementation: shell out to `go list -deps -json ` and +// inspect the ImportPath field of each entry. The import path is the +// canonical module form, not "." or a relative path — that makes the test +// CWD-independent (works under `go test ./...` from repo root and under +// `go test` from the package directory). +func TestBinaryDoesNotImportPprof(t *testing.T) +``` + +**Algorithm:** + +1. Run `exec.LookPath("go")` → `exec.Command(goBin, "list", "-deps", "-json", "github.com/pyrycode/pyrycode-relay/cmd/pyrycode-relay")`. Capture stdout to a `bytes.Buffer`. +2. The output is a stream of concatenated JSON objects (one per dep). Use `json.NewDecoder(...).Decode(&pkg)` in a loop where `pkg` is `struct { ImportPath string }`. +3. Skip until EOF; if any decoded `ImportPath == "net/http/pprof"` → `t.Fatalf("net/http/pprof in transitive imports; remove the import or it will register debug handlers on http.DefaultServeMux")`. +4. `t.Fatalf` on any I/O or decode error. + +**Decision: `go list -deps -json`, not a build-tag-gated compile check.** The AC offers either as acceptable. The `go list` approach has two concrete advantages: + +1. **Catches transitive imports too.** A blank-import in a dependency we pull in (`golang.org/x/net/http2/h2c`-style accident) doesn't appear in our source but does appear in `-deps`. The threat is third-party drift; the check has to span the whole graph. +2. **Test fails on the offending CI run, not at runtime.** A build-tag check would only flag the issue when something explicitly references the pprof package. `go list -deps` is unconditional. + +The cost is the test requires the `go` binary in `$PATH` — which is true on every host that runs `make test` and every CI runner. If a future CI environment hits this constraint, the test logs `t.Skip("go binary not in PATH")` and the developer fixes the env. No silent-skip path; the test is either run or explicitly skipped. + +**`t.Parallel()`-safe?** Yes. The test shells out to `go list` but reads no global state and writes only to the test's own `*testing.T`. + +## Concurrency model + +None. `ListenerPort` and `CheckListenerPorts` are pure functions; the `main` wiring runs on the main goroutine before any listener goroutine is spawned (recall the autocert branch spawns `httpSrv.ListenAndServe` in a goroutine *after* the check runs, then runs `httpsSrv.ListenAndServeTLS` on the main goroutine). No locks, no channels. + +## Error handling + +Three distinct failure modes, each with its own log line and exit code in `main`: + +| Mode | Source | Return shape | `main` action | +|---|---|---|---| +| Surplus listener | `CheckListenerPorts` returns `fmt.Errorf("%w: …", ErrUnexpectedListener)` | wraps sentinel | `logger.Error("refusing to start: unexpected listener", "err", err, "unexpected_ports", surplus, "expected_ports", expected)` + `os.Exit(2)` | +| Malformed address | `ListenerPort` returns wrapped parse error | not sentinel | `logger.Error("refusing to start: invalid listener address", "err", err, "addr", addr)` + `os.Exit(2)` | +| Empty actual set | (cannot happen — every branch constructs at least one `http.Server` before the check) | — | n/a | + +All three paths are total — no retry, no fallback. Boot-time refusal is intentional and complete. + +## Testing strategy + +All tests live alongside the implementation and are `t.Parallel()`-safe. No `os.Setenv`, no `t.Setenv`, no real listener binds, no real `/proc` reads. + +### `internal/relay/listeners_test.go` — pure-function tests + +#### `TestListenerPort_Matrix` + +Table-driven over `(addr string, wantPort uint16, wantErr bool)`: + +- `":443"` → `443`, nil +- `":80"` → `80`, nil +- `"127.0.0.1:8080"` → `8080`, nil +- `"[::1]:443"` → `443`, nil (IPv6 bracket form) +- `"0.0.0.0:9000"` → `9000`, nil +- `""` (empty) → 0, error +- `"443"` (no colon) → 0, error +- `"127.0.0.1"` (no port) → 0, error +- `":0"` (zero port) → 0, error (explicit reject) +- `":99999"` (out of range) → 0, error +- `":notaport"` → 0, error +- `":-1"` → 0, error + +The `:0` row is the load-bearing one: documents the explicit reject of the ephemeral-port placeholder. + +#### `TestCheckListenerPorts_ACMatrix` — verbatim from the AC + +The AC enumerates exactly four cases; this table replays them: + +| name | expected | actual | want | +|---|---|---|---| +| `autocert_match` (case a, autocert-on shape) | `{443, 80}` | `{443, 80}` | nil | +| `insecure_match` (case a, insecure shape) | `{8080}` | `{8080}` | nil | +| `surplus_pprof` (case b) | `{443, 80}` | `{443, 80, 6060}` | `errors.Is(err, ErrUnexpectedListener)` AND `err.Error()` contains `"6060"` | +| `actual_subset_of_expected` (case c) | `{443, 80}` | `{443}` | nil | +| `empty_both` (case d) | `{}` | `{}` | nil | + +Per the AC's technical note: case (a) is exercised in both the autocert-on shape (`{443, 80}`) and the insecure shape (`{8080}`); the rows above split them so a regression in either shape is named. + +#### `TestCheckListenerPorts_SurplusListedSorted` + +A focused test: `expected = {443}`, `actual = {443, 9000, 6060, 1234}`. Assert that the error message contains the surplus ports in ascending order (`"1234"` before `"6060"` before `"9000"`). Locks in the determinism contract — without it, `map` iteration order would make failure logs vary across runs. + +#### `TestCheckListenerPorts_MultipleSurplus` + +A focused test: `expected = {443, 80}`, `actual = {443, 80, 6060, 9090}`. Assert that **all** surplus ports appear in the error message (both `"6060"` and `"9090"`). Locks in the "report all, not just the first" contract. + +#### `TestErrUnexpectedListener_IsBranchable` + +A one-line `errors.Is` test, mirroring `TestErrInsecureListenInProduction_IsBranchable` in `production_test.go`. Locks in the `errors.Is` contract so a future refactor to `fmt.Errorf("relay: %s", …)` (dropping the `%w`) breaks the test, not downstream callers. + +### `cmd/pyrycode-relay/deps_test.go` — build-graph test + +#### `TestBinaryDoesNotImportPprof` + +Single test. Algorithm above. Assertion: no decoded JSON object has `ImportPath == "net/http/pprof"`. On failure, `t.Fatalf` with the offending dep's full ImportPath (so the developer can grep their tree for the introducing import). + +### What is NOT tested + +- The `main.go` wiring itself. Same rationale as #77/#78/#79: `main` is a coordination function; its behaviour is observable through the unit tests on the helpers it calls. A fork-exec integration test for a 5-line `if err != nil { os.Exit(2) }` block is over-engineering. Code review of the diff is the gate. +- The exact log-line field names or ordering. The AC says "structured log line containing at minimum: the offending port(s), the expected-ports set, and an `err` field." The spec satisfies that literally; asserting on slog output requires a handler-capture seam, which is friction the surface does not earn. +- The exact format of the error message string (whitespace, punctuation). We test that the message contains the offending port numbers and is branchable via `errors.Is`; cosmetic format changes should not break tests. +- Other "extra surface" leak shapes (UNIX-domain sockets, raw `net.Listen` on a TCP address that isn't wired to an `http.Server`). These are caught only insofar as they show up as `http.Server.Addr` values in the actual set. A raw `net.Listen` without an `http.Server` wrapper would slip through — but the relay's only network ingress today is `http.Server`-mediated, and a future protocol that adds a non-HTTP listener (e.g. gRPC) will need a deliberate spec update to extend the actual-set source. Documented here so the gap is explicit rather than silent. + +## Open questions + +None blocking the developer. Two notes for future tickets: + +1. **Exit-code drift from the AC.** The AC literally says `os.Exit(1)`; the spec uses `os.Exit(2)` to stay consistent with #77/#78/#79. If the PO disagrees with the harmonisation, surface as a comment on the ticket and the spec adjusts. +2. **Consolidation follow-up.** `CheckListenerPorts` is the fourth `Check…` helper in `internal/relay`. A follow-up to consolidate them under a `Config.Validate()`-shape multi-error helper is worth filing against the parent #42's epic once this ticket closes. + +## Security review + +**Verdict:** PASS + +Per `architect/security-review.md` for the security-sensitive ticket #81. The spec was walked adversarially against each category. Findings: + +- **[Trust boundaries]** Two inputs cross into the check: (1) `--insecure-listen` is operator-supplied, and `ListenerPort` is the parser — bounded by `net.SplitHostPort` + `strconv.ParseUint(_, 10, 16)`, both stdlib, both bounded. No allocation driven by attacker-controlled length (the input is the value of a process flag, not network-borne). (2) The `http.Server.Addr` fields the actual set draws from are package-internal — either literal strings (`":443"`, `":80"`) or the same operator-supplied value the parser already validated. No untrusted parser surface in the check itself. No findings. +- **[Tokens, secrets, credentials]** N/A. No credential material is read or written. The check operates on port numbers. The error message contains port numbers and the literal `relay: …` prefix — no operator-supplied strings beyond the (already operator-controlled) port digits. No findings. +- **[File operations]** N/A in `internal/relay/listeners.go`. The `cmd/pyrycode-relay/deps_test.go` test invokes `exec.Command("go", "list", "-deps", "-json", )` — the import path is a compile-time string constant, not derived from any user input or env var. The test does not write any file, does not concatenate paths, does not read attacker-controlled data. The subprocess inherits the test runner's environment (which is fine for a test) and writes to a `*bytes.Buffer` in memory. No findings. +- **[Subprocess / external command execution]** The `go list` invocation in the test is the only subprocess. Arguments are hardcoded literals (`"list"`, `"-deps"`, `"-json"`, the import-path string constant). No `sh -c`, no shell metacharacters, no user-supplied substring in `argv`. The lookup uses `exec.LookPath("go")` (or `exec.Command("go", ...)` which delegates to `$PATH`) — a PATH-poisoning test environment could substitute a malicious `go`, but the same risk applies to every other Go test in the repo and is out of scope for this ticket. No findings. +- **[Cryptographic primitives]** N/A. No RNG, no hash, no constant-time comparison. The `actual \ expected` operation is a bitwise-comparable set-difference on `uint64` keys (via map lookup); not security-critical timing because neither input is attacker-controlled in a way that depends on the comparison's runtime. No findings. +- **[Network & I/O]** The entire purpose of the ticket is to *prevent* a listener from opening in a misconfigured state. The check runs before any `Listen*` call. Listener timeouts, frame caps, header gates — all unchanged. The check itself does not open any network socket; `net.SplitHostPort` is a pure string parse. No findings. +- **[Error messages, logs, telemetry]** The wrapped error message contains: (a) the sentinel prose, (b) the surplus port list as `[]uint16` formatted via `%v` (digits + brackets — no operator-supplied strings), (c) the expected port list as `[]uint16` (same shape). None are user-controlled in a way an attacker can reach: the surplus ports come from in-process `http.Server.Addr` literals or from the operator's own `--insecure-listen` value (and even there, the parser has already enforced `ParseUint(_, 10, 16)` plus the `0` reject, so the value is a number in `1..65535`). The `main`-side log line emits `err`, `unexpected_ports`, `expected_ports`, and (on the parse path) `addr`. The `addr` field on the parse-error path carries the operator's flag value verbatim; this is acceptable — operator flags are not secrets, and the diagnostic value of seeing the exact bad input outweighs a hypothetical "operator put a token in `--insecure-listen`" misuse. **Worth flagging to the developer:** do not extend the success path to log the expected/actual ports on every boot — only log on failure, mirroring the silence-on-success convention from #77/#78/#79. No findings against the spec as written. +- **[Concurrency]** N/A. The check runs on the main goroutine before any other goroutine is spawned (in the autocert branch, the goroutine that runs `httpSrv.ListenAndServe` is spawned only after the check passes; in the insecure branch, no goroutines are spawned at all). The `map[uint16]struct{}` instances live entirely on `main`'s stack frame. No shared mutable state. No findings. +- **[Threat model alignment]** `docs/threat-model.md` § Deploy treats "operator misconfiguration leaks an unauthenticated surface" as the dominant failure class for an internet-exposed relay. This ticket adds an in-binary defence against one specific shape (surplus listener). It complements `ErrInsecureListenInProduction` (transport drift), `ErrRunningAsRoot` (uid drift), `ErrUnexpectedCapability` (capability drift), and `ErrCacheDirInsecure` (file-mode drift). All five share the "refuse to start; fail the health check; never serve traffic in this configuration" shape. The build-graph `TestBinaryDoesNotImportPprof` test adds a *second* defence on a different layer (compile-time vs runtime) against the highest-frequency leak shape, which is exactly the "belt-and-suspenders means different fabric" guidance in the architect handbook. No findings. +- **[Asymmetric-check correctness]** Explicit walk: the asymmetric design (surplus = error, missing = nil) is correct because the missing case is observable through a distinct, louder failure path (`ListenAndServe` returns a bind error). A reviewer might worry that an attacker could exploit the asymmetry — *no*. The check inputs are not attacker-controllable (both sets are derived from operator flags and in-process literals), so the asymmetric direction has no adversarial axis. The asymmetry's only consequence is "what kind of operator misconfiguration is caught here vs caught at runtime." No findings. + +**One SHOULD FIX flagged inline** (Error messages, logs, telemetry): the developer must not log the expected/actual port sets on the success path. The spec is written that way (silence on success); code review must double-check. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-13 diff --git a/internal/relay/listeners.go b/internal/relay/listeners.go new file mode 100644 index 0000000..e79a4d2 --- /dev/null +++ b/internal/relay/listeners.go @@ -0,0 +1,86 @@ +package relay + +import ( + "errors" + "fmt" + "net" + "slices" + "strconv" +) + +// ErrUnexpectedListener is returned by CheckListenerPorts when the set +// of ports the process is about to bind contains any port outside the +// expected set (derived from parsed flags). Stray listeners +// (net/http/pprof on :6060, a forgotten debug exporter, a metrics +// endpoint accidentally enabled) are the threat shape: an unauthenticated +// HTTP surface on the public internet. The relay refuses to start so the +// misconfiguration fails the deploy's health check rather than serving +// traffic on the surplus port. +// +// Branchable via errors.Is. The wrapped error names the offending +// port(s) and the expected-ports set. +var ErrUnexpectedListener = errors.New("relay: process is about to bind a TCP port outside the expected set") + +// ListenerPort extracts the TCP port from an http.Server Addr value +// such as ":443", "127.0.0.1:8080", or "[::1]:443". Returns a wrapped +// error if addr is empty or does not contain a parseable port in +// 1..65535. Used by cmd/pyrycode-relay to canonicalise both the expected +// (flag-derived) and actual (http.Server.Addr-derived) port sets onto +// the same uint16 key. +// +// Port 0 is rejected explicitly: it means "pick an ephemeral port" in +// net.Listen semantics, which the relay never wants and which would +// defeat the asymmetric set check by smuggling an unknown bound port +// past the actual-set construction. +func ListenerPort(addr string) (uint16, error) { + if addr == "" { + return 0, fmt.Errorf("relay: listener address is empty") + } + _, port, err := net.SplitHostPort(addr) + if err != nil { + return 0, fmt.Errorf("relay: parsing listener address %q: %w", addr, err) + } + n, err := strconv.ParseUint(port, 10, 16) + if err != nil { + return 0, fmt.Errorf("relay: parsing listener address %q: %w", addr, err) + } + if n == 0 { + return 0, fmt.Errorf("relay: parsing listener address %q: port 0 is not permitted (ephemeral-port placeholder)", addr) + } + return uint16(n), nil +} + +// CheckListenerPorts returns ErrUnexpectedListener (wrapped, naming the +// offending port(s) in ascending order plus the expected-set contents) +// when any element of actual is absent from expected. Returns nil +// otherwise. +// +// The check is asymmetric: missing ports (actual ⊊ expected) are NOT an +// error — a failure to bind an expected port surfaces as a runtime +// listener error from http.Server.ListenAndServe, which has its own +// exit path. This check exists to catch the surplus-listener case +// (actual ⊋ expected) where an unauthenticated debug endpoint would +// otherwise be exposed on the internet. +// +// Both arguments are sets keyed by TCP port (uint16). Empty sets are +// valid inputs: CheckListenerPorts(∅, ∅) returns nil. +func CheckListenerPorts(expected, actual map[uint16]struct{}) error { + var surplus []uint16 + for p := range actual { + if _, ok := expected[p]; !ok { + surplus = append(surplus, p) + } + } + if len(surplus) == 0 { + return nil + } + slices.Sort(surplus) + + sortedExpected := make([]uint16, 0, len(expected)) + for p := range expected { + sortedExpected = append(sortedExpected, p) + } + slices.Sort(sortedExpected) + + return fmt.Errorf("%w: unexpected ports %v; expected ports %v", ErrUnexpectedListener, surplus, sortedExpected) +} diff --git a/internal/relay/listeners_test.go b/internal/relay/listeners_test.go new file mode 100644 index 0000000..110ce29 --- /dev/null +++ b/internal/relay/listeners_test.go @@ -0,0 +1,172 @@ +package relay + +import ( + "errors" + "strings" + "testing" +) + +func TestListenerPort_Matrix(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + addr string + wantPort uint16 + wantErr bool + }{ + {name: "all-interfaces 443", addr: ":443", wantPort: 443}, + {name: "all-interfaces 80", addr: ":80", wantPort: 80}, + {name: "loopback ipv4 high port", addr: "127.0.0.1:8080", wantPort: 8080}, + {name: "loopback ipv6 bracketed", addr: "[::1]:443", wantPort: 443}, + {name: "zero ipv4 host", addr: "0.0.0.0:9000", wantPort: 9000}, + {name: "empty rejected", addr: "", wantErr: true}, + {name: "no colon rejected", addr: "443", wantErr: true}, + {name: "host only rejected", addr: "127.0.0.1", wantErr: true}, + {name: "zero port rejected", addr: ":0", wantErr: true}, + {name: "out of range rejected", addr: ":99999", wantErr: true}, + {name: "non-numeric port rejected", addr: ":notaport", wantErr: true}, + {name: "negative port rejected", addr: ":-1", wantErr: true}, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got, err := ListenerPort(tc.addr) + if tc.wantErr { + if err == nil { + t.Fatalf("ListenerPort(%q) = %d, nil; want error", tc.addr, got) + } + return + } + if err != nil { + t.Fatalf("ListenerPort(%q) returned error: %v", tc.addr, err) + } + if got != tc.wantPort { + t.Errorf("ListenerPort(%q) = %d, want %d", tc.addr, got, tc.wantPort) + } + }) + } +} + +func portSet(ports ...uint16) map[uint16]struct{} { + s := make(map[uint16]struct{}, len(ports)) + for _, p := range ports { + s[p] = struct{}{} + } + return s +} + +func TestCheckListenerPorts_ACMatrix(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + expected map[uint16]struct{} + actual map[uint16]struct{} + wantErr bool + mustContain []string + }{ + { + name: "autocert_match", + expected: portSet(443, 80), + actual: portSet(443, 80), + wantErr: false, + }, + { + name: "insecure_match", + expected: portSet(8080), + actual: portSet(8080), + wantErr: false, + }, + { + name: "surplus_pprof", + expected: portSet(443, 80), + actual: portSet(443, 80, 6060), + wantErr: true, + mustContain: []string{"6060"}, + }, + { + name: "actual_subset_of_expected", + expected: portSet(443, 80), + actual: portSet(443), + wantErr: false, + }, + { + name: "empty_both", + expected: portSet(), + actual: portSet(), + wantErr: false, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := CheckListenerPorts(tc.expected, tc.actual) + if !tc.wantErr { + if err != nil { + t.Fatalf("CheckListenerPorts: got err %v, want nil", err) + } + return + } + if !errors.Is(err, ErrUnexpectedListener) { + t.Fatalf("CheckListenerPorts: got err %v, want errors.Is(err, ErrUnexpectedListener)", err) + } + msg := err.Error() + for _, sub := range tc.mustContain { + if !strings.Contains(msg, sub) { + t.Errorf("error message %q missing substring %q", msg, sub) + } + } + }) + } +} + +func TestCheckListenerPorts_SurplusListedSorted(t *testing.T) { + t.Parallel() + + err := CheckListenerPorts(portSet(443), portSet(443, 9000, 6060, 1234)) + if !errors.Is(err, ErrUnexpectedListener) { + t.Fatalf("got err %v, want errors.Is(err, ErrUnexpectedListener)", err) + } + msg := err.Error() + idx := func(s string) int { return strings.Index(msg, s) } + i1234, i6060, i9000 := idx("1234"), idx("6060"), idx("9000") + if i1234 < 0 || i6060 < 0 || i9000 < 0 { + t.Fatalf("error message %q missing one of 1234/6060/9000", msg) + } + if !(i1234 < i6060 && i6060 < i9000) { + t.Errorf("surplus ports not in ascending order in %q (positions: 1234=%d 6060=%d 9000=%d)", + msg, i1234, i6060, i9000) + } +} + +func TestCheckListenerPorts_MultipleSurplus(t *testing.T) { + t.Parallel() + + err := CheckListenerPorts(portSet(443, 80), portSet(443, 80, 6060, 9090)) + if !errors.Is(err, ErrUnexpectedListener) { + t.Fatalf("got err %v, want errors.Is(err, ErrUnexpectedListener)", err) + } + msg := err.Error() + for _, sub := range []string{"6060", "9090"} { + if !strings.Contains(msg, sub) { + t.Errorf("error message %q missing surplus port %q", msg, sub) + } + } +} + +func TestErrUnexpectedListener_IsBranchable(t *testing.T) { + t.Parallel() + + if !errors.Is(ErrUnexpectedListener, ErrUnexpectedListener) { + t.Fatal("ErrUnexpectedListener should be errors.Is itself") + } + err := CheckListenerPorts(portSet(443), portSet(443, 6060)) + if !errors.Is(err, ErrUnexpectedListener) { + t.Errorf("returned error %v should satisfy errors.Is(err, ErrUnexpectedListener)", err) + } +}