diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index 62daa71..923996d 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -28,6 +28,7 @@ func main() { domain = flag.String("domain", "", "Public domain for Let's Encrypt cert issuance (required unless --insecure-listen is set).") certCache = flag.String("cert-cache", defaultCertCache(), "Directory for autocert's TLS certificate cache.") insecureListen = flag.String("insecure-listen", "", "Listen address for plain HTTP (e.g. :8080). Disables autocert; use only when fronted by a reverse proxy.") + metricsListen = flag.String("metrics-listen", "127.0.0.1:9090", "Listen address for the /metrics endpoint. Must be a loopback IP literal (e.g. 127.0.0.1:9090, [::1]:9090). Empty disables.") showVersion = flag.Bool("version", false, "Print version and exit.") ) flag.Parse() @@ -97,6 +98,21 @@ func main() { startedAt := time.Now() reg := relay.NewRegistry() + metricsReg := relay.NewMetricsRegistry() + relay.NewConnectionsMetrics(metricsReg, reg) + + metricsMux := http.NewServeMux() + metricsMux.Handle("/metrics", relay.NewMetricsHandler(metricsReg)) + + metricsSrv, err := relay.NewMetricsServer(*metricsListen, metricsMux) + if err != nil { + logger.Error("refusing to start: invalid --metrics-listen address", + "err", err, + "value", *metricsListen, + "fix", "use a loopback IP literal such as 127.0.0.1:9090 or [::1]:9090, or pass --metrics-listen= to disable") + os.Exit(2) + } + // maxFrameBytes: 256 KiB per-frame read cap. Derivation: // docs/specs/architecture/29-wsconn-read-limit.md (≤50-message // message_chunk envelope + routing wrapper, headroom for outliers, @@ -128,6 +144,16 @@ func main() { } expected := map[uint16]struct{}{port: {}} actual := map[uint16]struct{}{port: {}} + if metricsSrv != nil { + mp, err := relay.ListenerPort(metricsSrv.Addr) + if err != nil { + logger.Error("refusing to start: invalid listener address", + "err", err, "addr", metricsSrv.Addr) + os.Exit(2) + } + expected[mp] = struct{}{} + actual[mp] = struct{}{} + } if err := relay.CheckListenerPorts(expected, actual); err != nil { surplus, expectedList := listenerPortLists(expected, actual) logger.Error("refusing to start: unexpected listener", @@ -136,6 +162,15 @@ func main() { "expected_ports", expectedList) os.Exit(2) } + if metricsSrv != nil { + logger.Info("starting metrics listener", "listen", metricsSrv.Addr) + go func() { + if err := metricsSrv.ListenAndServe(); err != nil { + logger.Error("metrics listener failed", "err", err) + os.Exit(1) + } + }() + } if err := srv.ListenAndServe(); err != nil { logger.Error("listen failed", "err", err) os.Exit(1) @@ -185,6 +220,16 @@ func main() { } expected := map[uint16]struct{}{443: {}, 80: {}} actual := map[uint16]struct{}{httpsPort: {}, httpPort: {}} + if metricsSrv != nil { + mp, err := relay.ListenerPort(metricsSrv.Addr) + if err != nil { + logger.Error("refusing to start: invalid listener address", + "err", err, "addr", metricsSrv.Addr) + os.Exit(2) + } + expected[mp] = struct{}{} + actual[mp] = struct{}{} + } if err := relay.CheckListenerPorts(expected, actual); err != nil { surplus, expectedList := listenerPortLists(expected, actual) logger.Error("refusing to start: unexpected listener", @@ -197,6 +242,16 @@ func main() { logger.Info("starting", "version", Version, "mode", "autocert", "domain", *domain, "cert_cache", *certCache) + if metricsSrv != nil { + logger.Info("starting metrics listener", "listen", metricsSrv.Addr) + go func() { + if err := metricsSrv.ListenAndServe(); err != nil { + logger.Error("metrics listener failed", "err", err) + os.Exit(1) + } + }() + } + go func() { if err := httpSrv.ListenAndServe(); err != nil { logger.Error("http-01 listener failed", "err", err) diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index 76c4fa5..512eafa 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,13 +4,14 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [Metrics listener (localhost-only)](features/metrics-listener.md) — separate `*http.Server` for `/metrics`, bound to a loopback IP literal (default `127.0.0.1:9090`); kept off the internet-exposed public listener that serves `/healthz` + `/v1/{server,client}` because metric values leak operational state. Two exports + one sentinel in `internal/relay/metrics_listen.go`: `ErrNonLoopbackBind` (branchable via `errors.Is`), `CheckLoopbackBind(addr)` (pure validator — `net.SplitHostPort` → `ListenerPort` for the port-0 / range / format rejects inherited from #81 → `net.ParseIP(host).IsLoopback()`; hostnames including `localhost:9090` are rejected even when they currently resolve to loopback — the DNS-time TOCTOU window is closed structurally, not by re-resolving), and `NewMetricsServer(addr, h)` (opt-out-aware: `addr == ""` → `(nil, nil)`, validator failure → `(nil, err)`, otherwise an `*http.Server` with the public listener's four timeouts duplicated literally so either listener can drift independently in a future ticket). Wired into both listener branches in `cmd/pyrycode-relay/main.go`: a goroutine launch mirroring the autocert mode's http-01 listener pattern (`os.Exit(1)` on `ListenAndServe` failure — loud-fail-over-silent because a relay booted with metrics enabled but silently not serving them would mislead operator scrapes), the metrics port joins **both** the `expected` and `actual` sets of #81's allowlist (declared secondary listener — must land on both sides of the asymmetric check). Empty-flag opt-out is structural via `metricsSrv != nil` guards at every reference site — no repeated `if *metricsListen == "" {}` branches. TLS / authn deliberately out of scope (loopback IS the defence); same-host adversary in scope but not a defence target; graceful shutdown deferred to #31. Three tests in `metrics_listen_test.go` — 12-row validator matrix, 3-row constructor matrix with timeouts pinned to literal values (not a shared constant), and an end-to-end happy-path that drives validator + constructor + `net.Listen("tcp", "127.0.0.1:0")` + actual `http.Get` round-trip by exploiting `http.Server.Serve(l)` ignoring `Addr` once a listener is supplied (#60). - [Listener port allowlist (boot-time refusal)](features/listener-port-allowlist.md) — relay refuses to start (exit 2) if the set of TCP ports it is *about to bind* (`http.Server.Addr` values) contains any port outside an explicit expected set derived from parsed flags: `{443, 80}` in autocert mode, `{}` in `--insecure-listen :` mode. Catches stray `net/http/pprof :6060` listeners, env-flipped debug ports, accidentally-enabled metrics exporters. Three exports from `internal/relay/listeners.go`: `ErrUnexpectedListener` sentinel (branchable via `errors.Is`; wrapped message names surplus + expected ports both ascending so the failure log is deterministic across `map`-iteration runs and grep-friendly), `ListenerPort(addr string) (uint16, error)` parsing `":443"` / `"127.0.0.1:8080"` / `"[::1]:443"` with an explicit reject of port 0 (the ephemeral-placeholder trap — would smuggle an unknown bound port past the actual-set construction), and `CheckListenerPorts(expected, actual map[uint16]struct{}) error` (pure set-difference). **Asymmetric by design**: surplus = error, missing = nil (a failure to bind an expected port surfaces as `ListenAndServe`'s own bind error at exit 1; duplicating the signal at boot would clutter logs). **Port-only**: interface binding (`127.0.0.1` vs `0.0.0.0`) intentionally out of scope on a single-instance internet-exposed deploy. **Reports all surplus in one error** so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. Wired into each listener branch separately in `cmd/pyrycode-relay/main.go` (different `http.Server` shapes; lifting a helper would invent surface area without a second consumer); structured log fields `unexpected_ports` + `expected_ports` recomputed in `main` via unexported `listenerPortLists` so `Check…` stays a single-error return. Paired with `TestBinaryDoesNotImportPprof` in `cmd/pyrycode-relay/deps_test.go` — shells out to `go list -deps -json`, catches the `import _ "net/http/pprof"` handler-registration variant that attaches `/debug/pprof/*` to `http.DefaultServeMux` *without* opening a new port (the runtime check would miss it). Belt-and-suspenders means different fabric: stochastic-ish runtime guard + deterministic compile-time test, neither alone is complete. Exit 2 = config-rejected-at-boot, harmonised across the sentinel family (architect overrode the AC's literal `os.Exit(1)`) (#81). - [Env-var config validator (boot-time refusal)](features/env-config-validator.md) — table-driven validation of every env var the relay reads at boot. Single source of truth is the unexported `envContracts []envContract` registry in `internal/relay/env_config.go`; each row carries `name`, `required` bool, and an inline `validate func(string) error`. `CheckEnvConfig(lookup func(string) (string, bool)) error` walks the registry and returns the structured `*ErrInvalidConfig{Key, Reason}` on the first failure (`Reason` is `"missing"` or `"malformed-value: "`); the package-level sentinel `ErrInvalidConfigSentinel` is matched via a custom `Is` method (not `Unwrap`, which would double-print the message prefix) so `errors.Is(err, ErrInvalidConfigSentinel)` and `errors.As(err, &cfgErr)` form a dual contract. The `func(string) (string, bool)` (= `os.LookupEnv` shape) getter coexists with #77's `func(string) string` getter — the presence bit is necessary here to distinguish "missing-but-required" from "present-but-empty", semantically inert for `IsProductionMode`'s exact-`"1"` match. **Ordering is load-bearing**: wired in `main.go` BEFORE `CheckInsecureListenInProduction` so a typo like `PYRYCODE_RELAY_PRODUCTION=true` cannot slip through `IsProductionMode`'s silent-non-production fallback and reach the insecure-listen guard with an unvalidated value. Today's registry has one row (`PYRYCODE_RELAY_PRODUCTION`, optional-but-format-validated); future env-var reads register here at code-review time. `checkEnvConfigWith(lookup, contracts)` is the parameterised inner used by the `required: true` test case (today's production table has no required entries). Exit 2 = config-rejected-at-boot, matching the sibling refusals (#9, #77, #79) (#80). - [Linux capability allowlist (boot-time refusal)](features/capability-allowlist.md) — relay parses `/proc/self/status`'s `CapEff:` hex mask at boot and refuses to start (exit 2) if any bit is set outside `AllowedCapabilities` (currently `{CAP_NET_BIND_SERVICE}` only, motivated by autocert binding `:80`/`:443` from uid 65532 in the distroless image). Exported sentinel `ErrUnexpectedCapability` is branchable via `errors.Is`; the wrapped error names every offending bit symbolically (`CAP_SYS_ADMIN (bit 21)` or `bit 63` for unknown), lists the allowlist contents, and embeds the operator fix string. `CapEff` only — `CapPrm/CapBnd/CapInh` would broaden false-positives (legitimate K8s default policy grants wide CapBnd) without adding load-bearing protection (relay never `capset(2)`s). Linux/non-Linux split at compile time via the new `_.go` / `_other.go` build-tag convention (see ADR-0009); non-Linux GOOS logs one skip line and returns nil. Unconditional — no production-mode gating, no env-var bypass, because stray capabilities are never legitimate. Reader-boundary test seam (`func() (string, error)`) exercises the parse + mask check end-to-end without touching real `/proc`. Joins the boot-time-refusal sentinel family (#9, #77, #79; future #78) (#79). - [Production-mode contract & startup refusals](features/production-mode.md) — `PYRYCODE_RELAY_PRODUCTION=1` env-var contract (exact-string match, lazy read via injected getter, mirrors `PYRYCODE_RELAY_SINGLE_INSTANCE` shape from #64/#65) plus the boot-time checks that consume it. **#77** introduced `relay.CheckInsecureListenInProduction` + exported `ErrInsecureListenInProduction` sentinel (branchable via `errors.Is`) firing when production mode is on AND `--insecure-listen` is set. **#78** added the second consumer: `relay.CheckRunningAsRoot(geteuid, getenv)` + exported `ErrRunningAsRoot` sentinel firing when production mode is on AND `syscall.Geteuid() == 0`, closing the deploy-time gap (`docker run --user 0`, `securityContext.runAsUser: 0`, hand-edited Dockerfile dropping `USER`) that escapes the CI non-root-build contract (#32 Dockerfile, #68 Trivy). Both wired in `cmd/pyrycode-relay/main.go` after flag-parse with `os.Exit(2)` (config-rejected-at-boot, distinct from runtime-failure exit 1) and structured log fields: `env_var` carries the name only (never the value, even though `effective_uid` carries the kernel-supplied int — log-injection structurally impossible), one-line `fix` listing valid resolutions. `IsProductionMode` exported so siblings compose on the same predicate without re-reading the env var. Test seams: `func(string) string` for env, `func() int` for euid — both the smallest possible (no interface, no struct, no package-level var) and the only way to exercise the uid-0 branch in a unit test without re-execing the test binary as root. Two instances of the shape (#77, #78) now codify the "sibling boot-check" pattern; `Config.Validate()` consolidation deferred until ~5 checks exist (#77, #78). - [Fly.io deploy](features/fly-deploy.md) — production host wiring: `fly.toml` declares TCP-passthrough on `:80`/`:443` (no Fly HTTP proxy, no Fly-managed certs) so TLS keeps terminating in the relay via autocert (#9), persistent Fly volume `relay_autocert` mounted at `/var/lib/relay/autocert`, and a single-machine hard cap encoded via `min_machines_running=1` + `auto_start_machines=false` + `auto_stop_machines="off"` + `[deploy] strategy="immediate"` (Fly Apps v2 has no `max_machines` key; the in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check from #65 is the backstop). CI `deploy` job in `.github/workflows/ci.yml` runs `flyctl deploy --remote-only` on push to `main`, gated by branch-condition + `needs: [test, security, image-scan]` + `permissions: contents: read` so `FLY_API_TOKEN` is structurally unreachable from PR code; `superfly/flyctl-actions/setup-flyctl` pinned by commit SHA with `# Tracks:` comment (same convention as #68 / #41). Dedicated IPv4 is required (not optional) for autocert's HTTP-01 challenge; TCP passthrough preserves the real socket peer IP that #34's rate limiter reads. `__REGION__` / `__DOMAIN__` ship as placeholders that fail loud on first deploy (#38). - [Connection-count gauges](features/connection-count-gauges.md) — `pyrycode_relay_connected_binaries` and `pyrycode_relay_connected_phones` exposed via a pull-based `prometheus.Collector` reading `Registry.Counts()` on each scrape; zero edits to `registry.go`; scalar (no labels) by design — `{server="..."}` would carry the attacker-influenced `x-pyrycode-server` header onto the metrics surface, which threat-model § Log hygiene forbids; stale grace-expiry fires can't move the gauge because the pointer-identity guard (ADR-0006) keeps the maps unchanged and the gauge IS the map size; race-tested against 16 mutator goroutines + a tight-loop scraper under `-race`. First collector wired into the #59 seam (#61). -- [Metrics registry (scaffolding)](features/metrics-registry.md) — private `*prometheus.Registry` + `NewMetricsHandler` factory wrapping `promhttp.HandlerFor` (text format only; OpenMetrics off; `HandlerOpts.Registry: reg` keeps `promhttp_metric_handler_*` off `DefaultRegisterer`). Seam shape for siblings: per-concern collector struct in its own file, constructed by a helper taking `prometheus.Registerer` (no mega-struct, no package-level vars) — first instantiated by #61's `connectionsCollector`. Listener still pending (#60). Structural defence against default-registry leaks via `TestMetricsRegistry_NoGlobalRegistrarLeak` (#59). +- [Metrics registry (scaffolding)](features/metrics-registry.md) — private `*prometheus.Registry` + `NewMetricsHandler` factory wrapping `promhttp.HandlerFor` (text format only; OpenMetrics off; `HandlerOpts.Registry: reg` keeps `promhttp_metric_handler_*` off `DefaultRegisterer`). Seam shape for siblings: per-concern collector struct in its own file, constructed by a helper taking `prometheus.Registerer` (no mega-struct, no package-level vars) — first instantiated by #61's `connectionsCollector`. Listener landed in #60 (see [Metrics listener](features/metrics-listener.md)). Structural defence against default-registry leaks via `TestMetricsRegistry_NoGlobalRegistrarLeak` (#59). - [Docker image](features/docker-image.md) — portable OCI artifact: multi-stage `Dockerfile` builds a fully-static binary (`CGO_ENABLED=0`, `-trimpath -s -w`) into `distroless/static-debian12:nonroot`; both base images digest-pinned with `# Tracks:` comments; exposes `:80`/`:443` and declares `/var/lib/relay/autocert` volume; host-specific wiring (TLS policy, ports, volumes, healthcheck) is #38's problem (#32). PR-time Trivy CVE scan against the just-built image lives in CI as the `image-scan` job, fails on **fixable** CRITICAL/HIGH only (`ignore-unfixed: true`), action pinned by commit SHA with `# Tracks: ` comment mirroring the Dockerfile pin convention; intentional overlap with `govulncheck` (source-reachability vs. shipped-artifact) (#68). Both scanners are also re-run daily against `main` via `.github/workflows/security-scan.yml` (cron + `workflow_dispatch`) so disclosed CVEs against unchanged deps surface within ≤24h rather than staying invisible until the next bump (#72); a red cron run also opens a `security-sensitive`-labelled GitHub issue via the workflow's `file-issue` job (artifact-handoff privilege split keeps `issues: write` off the scanners and out of workflow scope; deterministic-title dedup via `gh issue list --search 'in:title …'`) so regressions land as tracked work-items rather than passive Actions rows (#73). - [Binary-side frame forwarder](features/binary-forwarder.md) — per-binary read pump: unwraps each inbound routing envelope, linear-scans `PhonesFor(serverID)` for `env.ConnID`, writes `env.Frame` verbatim to that phone; opaque inner bytes; synchronous (handler discards the return); diverges from #25 in error policy — unknown `conn_id`, malformed envelope, phone `Send` error all log+continue (a single bad frame never tears down the binary); replaced `/v1/server`'s `CloseRead` placeholder (#26). - [WebSocket heartbeat](features/heartbeat.md) — per-conn goroutine on both endpoints sends RFC 6455 ping every 30s; closes with `1011 "heartbeat timeout"` if no pong within 30s. Detects half-open TCP within 60s; ctx-cancel exit path leaves close to the handler defer (#7). diff --git a/docs/knowledge/codebase/60.md b/docs/knowledge/codebase/60.md new file mode 100644 index 0000000..3885123 --- /dev/null +++ b/docs/knowledge/codebase/60.md @@ -0,0 +1,56 @@ +# Ticket #60 — localhost-only `/metrics` listener with bind-address validation + +Wires a second `http.Server` on a loopback IP literal (default `127.0.0.1:9090`) for Prometheus scraping. Refuses to start (exit 2) if `--metrics-listen` is anything other than `:` or empty; an empty value disables the listener entirely. Consumes the #59 metrics seam and the #61 `NewConnectionsMetrics` constructor; extends #81's listener-port allowlist by adding the metrics port to both the expected and actual sets at each wiring site. + +## Implementation + +- **`internal/relay/metrics_listen.go` (new, 96 lines)** — two exports and one sentinel, no goroutines, no I/O: + - `ErrNonLoopbackBind` — branchable via `errors.Is`. Returned by both `CheckLoopbackBind` and `NewMetricsServer` for any failure of the loopback-IP-literal rule (hostnames, non-loopback IPs, `:9090`-style empty host). + - `CheckLoopbackBind(addr string) error` — pure validator. `net.SplitHostPort` → `ListenerPort` (port-0 and range check inherited from #81) → `net.ParseIP(host)` → `IsLoopback()`. Hostnames are refused even when they currently resolve to loopback; the docstring restates the DNS-time TOCTOU rationale so the next contributor does not "fix" the validator by adding `net.LookupHost`. Empty addr returns a non-nil error as defence in depth (the structural opt-out lives in `NewMetricsServer`). + - `NewMetricsServer(addr string, h http.Handler) (*http.Server, error)` — opt-out-aware constructor. `addr == ""` → `(nil, nil)`; `CheckLoopbackBind` failure → `(nil, err)`; otherwise an `*http.Server` with the four timeouts duplicated from the public listener (`ReadHeaderTimeout: 10s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`) — not exported as constants because each listener may drift in a future ticket. +- **`internal/relay/metrics_listen_test.go` (new, 187 lines)** — three tests, all `t.Parallel()`-safe: + - `TestCheckLoopbackBind_Matrix` — 12-row matrix; nil for IPv4 + IPv6 loopback and 127.0.0.0/8 high address; sentinel-wrapped error for non-loopback IPs, hostnames, empty host; non-sentinel error for port-0, out-of-range port, missing port, empty addr. + - `TestNewMetricsServer_Matrix` — three rows covering the opt-out, the sentinel-wrapped reject, and the happy path (timeouts pinned to literal values, not to a shared constant). + - `TestMetricsServer_EndToEnd_HappyPath` — validator + constructor + `net.Listen("tcp", "127.0.0.1:0")` + `http.Server.Serve(l)` + actual `http.Get` round-trip. The factory is called with `127.0.0.1:9090` (validator passes); the listener uses an ephemeral port (`http.Server.Serve` ignores `Addr` once a listener is supplied), sidestepping the port-0 rule in `ListenerPort` without bypassing the factory. Status 200 + Prometheus text-format `Content-Type` prefix + non-empty body containing the registered probe counter. +- **`cmd/pyrycode-relay/main.go` (+55 lines)** — flag plus three localised wiring edits: + - **Flag (line 31):** `--metrics-listen` defaults to `127.0.0.1:9090`; help text names the IP-literal contract and the empty opt-out. + - **Construction (lines 101–114):** `metricsReg := NewMetricsRegistry()` → `NewConnectionsMetrics(metricsReg, reg)` (#61) → mux with `/metrics` → `NewMetricsServer(*metricsListen, metricsMux)`. A non-nil error fails boot with exit 2, structured log carries `err`, `value=`, and an operator fix string listing both the loopback-IP-literal example and the `--metrics-listen=` disable shape. + - **Port-allowlist integration (insecure: lines 147–156; autocert: lines 223–232):** `if metricsSrv != nil`, parse `metricsSrv.Addr` with `ListenerPort` and add the port to **both** the `expected` and `actual` sets passed to `CheckListenerPorts` (#81). The metrics port is a declared listener; it must land on both sides of the asymmetric check or #81 would refuse to start. + - **Goroutine launch (insecure: lines 165–173; autocert: lines 245–253):** `go metricsSrv.ListenAndServe()` placed before the foreground public listener call (mirroring the existing http-01 goroutine pattern in autocert mode). On error the goroutine logs `"metrics listener failed"` and calls `os.Exit(1)` — same shape as the http-01 listener. A relay that booted with metrics enabled but is silently not serving them would mislead operator scrapes; loud failure over silent correction. + +## Acceptance criteria — verification map + +- **AC-1** (`--metrics-listen` accepts default `127.0.0.1:9090`; empty disables structurally — no listener, no goroutine, no error): `main.go:31` (flag), `main.go:107` (`NewMetricsServer` returns `(nil, nil)` on empty), `main.go:147 / 223` (port-set entry guarded by `metricsSrv != nil`), `main.go:165 / 245` (goroutine launch guarded the same way). `TestNewMetricsServer_Matrix/empty_addr_returns_nil_nil_opt-out`. +- **AC-2** (separate `http.Server` with explicit timeouts matching the public listener; graceful-shutdown pattern mirrored — both pending #31): `metrics_listen.go:87-94`. `TestNewMetricsServer_Matrix/loopback_addr_returns_configured_server` pins all four timeouts. +- **AC-3** (bind-address validation: `net.ParseIP` + `IsLoopback`; hostnames rejected; non-loopback or unparseable host fails boot loudly): `metrics_listen.go:39-58`. `TestCheckLoopbackBind_Matrix` covers all 12 branches; `errors.Is(_, ErrNonLoopbackBind)` asserted on the loopback-rule rejections. +- **AC-4(a)** (`GET /metrics` returns 200 + Prometheus text `Content-Type`): `TestMetricsServer_EndToEnd_HappyPath` — real `net.Listen` + real `http.Get`. +- **AC-4(b)** (non-loopback addresses fail boot): `TestCheckLoopbackBind_Matrix/{ipv4_all-interfaces,ipv4_private,ipv6_non-loopback}_rejected` + `TestNewMetricsServer_Matrix/non-loopback_addr_returns_wrapped_ErrNonLoopbackBind`. +- **AC-4(c)** (empty `--metrics-listen` results in no listener bound and does not error): `TestNewMetricsServer_Matrix/empty_addr_returns_nil_nil_opt-out`; the structural skip in `main.go` is by the `metricsSrv != nil` guard at every reference site, not by a flag-test (matches the spec's "opt-out is structural, not flag-tested" rule). +- **AC-5** (`make vet`, `make test -race`, `make build` clean; docs entries): vet/test/build verified in the developer run; `docs/knowledge/codebase/60.md` (this file) + `docs/knowledge/INDEX.md` updated in the doc phase. + +## Patterns established + +- **"Opt-out is structural, not flag-tested."** `NewMetricsServer("", h)` returns `(nil, nil)` and every caller checks `srv != nil` before referencing the metrics listener. The empty-flag path is invisible to the rest of `main` — no `if *metricsListen == "" {}` branches outside the constructor, no second flag. Carry this shape to any future operator-toggleable secondary listener: the disable lives in the constructor's nil-return contract, not in repeated flag conditionals at every reference site. +- **TOCTOU-proof validation: refuse the entire shape, do not re-resolve.** Hostnames are rejected at validation time even when they currently point to loopback, because `net.LookupHost`-then-`net.Listen` has two resolver hits and an attacker (or just a /etc/hosts edit, or a resolver reconfigure) can flip the answer between them. The docstring on `CheckLoopbackBind` restates the rationale so the next contributor does not "fix" the validator by adding a lookup. Generalises to any rule of the form " must satisfy property P at bind/use time": validate the **shape**, not the **resolved value**. +- **Listener-allowlist integration for declared secondary listeners.** A new listener that the operator deliberately enables must add its port to **both** the `expected` and `actual` sets passed to `CheckListenerPorts` (#81). Not adding to `expected` makes #81 refuse boot (surplus listener). Not adding to `actual` would let a typo flag through if the listener silently failed to bind. The pattern is the same in both the insecure and autocert branches; carry to any next-secondary-listener ticket. +- **Per-listener timeouts are duplicated, not shared.** The metrics listener's `Read{Header,}Timeout` / `WriteTimeout` / `IdleTimeout` are copied from the public listener literally rather than extracted into a shared constant. Reasoning: each listener's threat model differs (public is internet-exposed; metrics is loopback-only). Today they happen to match because the public listener's policy is the safest default for a new listener; a future ticket may need to drift one independently. Avoid the false-economy of "DRY this constant" — the duplication is a stability hedge against a refactor that would silently couple two listener policies. + +## Lessons learned + +- **Use `net.ParseIP` + `IsLoopback`, never `net.LookupHost`, for bind validation.** A hostname's resolution at validation time is not the resolution at bind time — two syscalls, different code paths, can return different answers. The only TOCTOU-proof defence is to refuse the hostname shape at parse time. Documented in the `CheckLoopbackBind` docstring; carry to every future " must satisfy P" validator. +- **`http.Server.Serve(l)` ignores `Addr` once a listener is supplied — exploit this in tests.** The happy-path end-to-end test needed an ephemeral port (`127.0.0.1:0`) but also needed to drive the real factory (which refuses port 0 via `ListenerPort`). The resolution: construct with `127.0.0.1:9090` (validator passes), then `net.Listen("tcp", "127.0.0.1:0")` and pass the listener into `srv.Serve(l)`. The test exercises every code path main.go runs at boot, without bypassing the factory. Reach for the same shape any time a factory has a "syntactically forbidden in production" rule that conflicts with an ergonomic test pattern. +- **Pin timeout assertions to literal values, not to a shared constant.** The metrics-listener test asserts `srv.ReadHeaderTimeout == 10 * time.Second`, not `srv.ReadHeaderTimeout == publicListenerReadHeaderTimeout`. Reason: the spec explicitly says each listener can drift independently in a future ticket. If the test had imported a shared constant, a future "tighten the metrics listener's WriteTimeout" ticket would have to also touch the public listener's constant or weaken the assertion — neither is clean. Literal values are the trade-off that keeps the divergence cheap. Apply the same to any "today they match but may diverge" assertion. +- **The wiring slice is a perfectly natural place to absorb the consumer-side concerns of three sibling tickets.** This ticket consumes the seam from #59, the collector from #61, and extends the allowlist from #81. The instinct to factor "metrics-listener wiring helpers" was resisted: each integration is a 2–5 line edit at a known anchor; lifting them would invent surface area without a second consumer. Generalises to "wiring tickets in a feature family should remain thin glue, not abstraction-introducing". + +## Cross-links + +- [Metrics listener (feature)](../features/metrics-listener.md) — operator-facing doc: contract, API, threat-model alignment. +- [Metrics registry (feature)](../features/metrics-registry.md) — the seam #60 consumes (#59). +- [Connection-count gauges (feature)](../features/connection-count-gauges.md) — the only collector wired here today (#61). +- [Listener port allowlist (feature)](../features/listener-port-allowlist.md) — the boot-time set check that #60 extends (#81). +- [Codebase note #59](59.md) — registry + handler scaffolding. +- [Codebase note #61](61.md) — `NewConnectionsMetrics` constructor. +- [Codebase note #81](81.md) — listener-port allowlist this ticket extends. +- [ADR-0008](../decisions/0008-prometheus-client-adoption.md) — Prometheus client adoption + scope-of-use rules. +- [Spec #60](../../specs/architecture/60-metrics-listener.md) — full architectural spec. +- [#56 — parent ticket](https://github.com/pyrycode/pyrycode-relay/issues/56) — split into #59 / #60 / #61 / #57 / #58. diff --git a/docs/knowledge/features/connection-count-gauges.md b/docs/knowledge/features/connection-count-gauges.md index 94ea54e..0d23ebc 100644 --- a/docs/knowledge/features/connection-count-gauges.md +++ b/docs/knowledge/features/connection-count-gauges.md @@ -41,7 +41,7 @@ The constant-cardinality property also keeps the `/metrics` response size delta ## Wiring -The collector lives in the registry's state for the process lifetime once `NewConnectionsMetrics(mreg, registry)` is called at startup. The wiring call site is `cmd/pyrycode-relay/main.go` next to `relay.NewRegistry()` and `relay.NewMetricsRegistry()`, alongside the listener wiring tracked under #60. Until #60 merges, the collector is exercised only by `metrics_connections_test.go`. +The collector lives in the registry's state for the process lifetime once `NewConnectionsMetrics(mreg, registry)` is called at startup. The wiring call site is `cmd/pyrycode-relay/main.go` next to `relay.NewRegistry()` and `relay.NewMetricsRegistry()`, alongside the [localhost-only `/metrics` listener](metrics-listener.md) wired in #60. ## Concurrency diff --git a/docs/knowledge/features/metrics-listener.md b/docs/knowledge/features/metrics-listener.md new file mode 100644 index 0000000..5c6a721 --- /dev/null +++ b/docs/knowledge/features/metrics-listener.md @@ -0,0 +1,98 @@ +# Metrics listener — localhost-only `/metrics` + +The relay serves `/metrics` on a **separate** `http.Server` bound to a loopback IP literal (default `127.0.0.1:9090`). It is intentionally not multiplexed onto the public listener that serves `/healthz` and `/v1/{server,client}`: Prometheus metrics leak operational state (active-connection counts indicate whether anyone is using the relay; future upgrade/register counters reveal traffic patterns), and the relay is internet-exposed. Operators reach `/metrics` via SSH tunnel or sidecar. + +The contract has three guarantees: the bind address must be a loopback **IP literal** (hostnames refused), an empty value is a structural opt-out (no listener bound, no goroutine started, no error), and any other value fails boot loudly with exit 2. + +## Contract + +- Flag: `--metrics-listen` (default `127.0.0.1:9090`). + - Loopback IP literal + port → listener bound (`127.0.0.1:9090`, `127.0.0.5:1234`, `[::1]:9090` all accepted; whole `127.0.0.0/8` is loopback). + - Empty string → no listener, no goroutine, no port in the allowlist set (#81). Structural opt-out. + - Anything else → `os.Exit(2)` with a structured log naming the offending value and the fix string. +- Hostnames (including `localhost:9090`) are refused even when they currently resolve to a loopback IP — see the TOCTOU rationale below. +- Port 0 is refused (inherited from `ListenerPort`, #81). The ephemeral-port shape would smuggle an unknown bound port past #81's listener-port allowlist. + +## API + +Package `internal/relay` (`metrics_listen.go`): + +```go +var ErrNonLoopbackBind = errors.New("relay: metrics listener bind address must be a loopback IP literal") + +func CheckLoopbackBind(addr string) error +func NewMetricsServer(addr string, h http.Handler) (*http.Server, error) +``` + +- `CheckLoopbackBind` is the pure validator. Reuses `ListenerPort` for the port parse (1..65535, port 0 rejected) so the rule the rest of the codebase already obeys extends here without duplication. Returns wrapped `ErrNonLoopbackBind` for the loopback-rule failures (hostnames, non-loopback IPs, empty host) — branchable via `errors.Is`. Other shapes (port out of range, missing port, empty addr) return plain wrapped errors. +- `NewMetricsServer` is the opt-out-aware constructor: + - `addr == ""` → `(nil, nil)`. The caller checks `srv != nil` before launching the goroutine or adding the port to the listener allowlist. + - `CheckLoopbackBind(addr)` fails → `(nil, err)`. + - otherwise → `*http.Server` with the public listener's timeout policy (`ReadHeaderTimeout: 10s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`) — duplicated rather than shared so either listener can drift if a future ticket has reason; today they match because that is the safest default. + +The handler argument is `http.Handler`, not `*http.ServeMux`: the caller chooses whether to wire a bare `NewMetricsHandler(reg)` or wrap it in a mux. `cmd/pyrycode-relay/main.go` wraps in a `ServeMux` so `/metrics` is distinguishable from a future sibling path on the same listener. + +## Why hostnames are refused (load-bearing) + +A hostname resolves at validation time (the `net.LookupHost` a "helpful" validator would call) and again at bind time (the kernel's resolver hit inside `net.Listen`). Between the two, DNS rebinding, an `/etc/hosts` race, or a resolver reconfigure can make a hostname that validated as loopback bind to a non-loopback IP. The only TOCTOU-proof defence is to refuse the entire shape: the validator parses `net.SplitHostPort`'s host portion with `net.ParseIP` and rejects anything that fails to parse as an IP literal. + +The docstring on `CheckLoopbackBind` restates this rationale so the next contributor does not "fix" the validator by adding `net.LookupHost`. + +## Wiring (`cmd/pyrycode-relay/main.go`) + +Three localised edits, both listener modes: + +1. **Construction (lines 101–114):** build the metrics registry, register collectors (today just `NewConnectionsMetrics`, #61; sibling tickets #57 / #58 append here), wrap `NewMetricsHandler` in a `ServeMux`, and call `NewMetricsServer(*metricsListen, metricsMux)`. A non-nil error fails boot with exit 2 and a structured log carrying `err`, `value=`, and the operator fix string. +2. **Port-allowlist integration (insecure: lines 147–156; autocert: lines 223–232):** when `metricsSrv != nil`, parse its `Addr` via `ListenerPort` and add the port to **both** the `expected` and `actual` sets passed to `CheckListenerPorts` (#81). The metrics port is a declared listener, not a surplus one — it has to land on both sides of the asymmetric check. +3. **Goroutine launch (insecure: lines 165–173; autocert: lines 245–253):** `go metricsSrv.ListenAndServe()`. On error the goroutine logs `"metrics listener failed"` and calls `os.Exit(1)` — same shape as the autocert mode's http-01 listener. A relay that booted with metrics enabled but is silently not serving them would mislead operator scrapes into thinking the relay is healthy when its metrics surface is gone; loud failure over silent correction. + +## Threat model alignment + +- **Trust boundary defended structurally, not by runbook.** The bind shape is the gate. Loopback validation + IP-literal-only is enforced before any listener starts; no fallback, no warn-and-continue. +- **Same-host adversary is in scope but not a defence target.** A hostile process on the same host can scrape `/metrics` — that is the threat model's accepted shape. Off-host exposure is structurally impossible without a kernel-level routing bypass. +- **TLS deliberately out of scope.** Loopback is the entire defence; TLS would add no marginal security and would require autocert / static-cert plumbing not justified by the threat model. +- **No authentication / authorisation on `/metrics`.** Same rationale. +- **DoS:** `http.Server` timeouts cap scrape duration. `promhttp.HandlerFor`'s `MaxRequestsInFlight` is left at the library default (unbounded); acceptable because the loopback gate caps the threat population to same-host processes. A future ticket can cap it if a same-host operational-process bug causes scrape storms. +- **Log hygiene:** boot-refusal log lines emit the flag value (operator input, not network input) and the wrapped error. No token, no PII, no metric labels carry attacker-influenced values (the only collector wired here, `NewConnectionsMetrics`, is label-less by design — see [Connection-count gauges](connection-count-gauges.md)). +- **gosec G114** (`http.Server` without `ReadHeaderTimeout`): mitigated by `NewMetricsServer` setting all four timeouts explicitly. + +## Concurrency + +One new goroutine: the metrics listener's `srv.ListenAndServe()`. Started only after both flag validation and `CheckLoopbackBind` have succeeded. Exits only on `ListenAndServe` returning an error; the error path is `logger.Error` + `os.Exit(1)`. No coordination with the public listener's goroutines — they are independent. + +`promhttp.HandlerFor` is documented safe for concurrent calls; `prometheus.Registry` uses internal locking. `NewConnectionsMetrics` reads `Registry.Counts()` on every scrape (#61's pull-based design), which already takes the right lock. No shared state is introduced. + +## Failure modes + +| Failure | Surface | Recovery | +| --- | --- | --- | +| `--metrics-listen` rejected by `CheckLoopbackBind` | `logger.Error` + `os.Exit(2)` (config-rejected-at-boot) | Operator fixes the flag | +| Metrics port collides with public listener | `ListenAndServe` returns `bind: address already in use` | Goroutine logs + `os.Exit(1)`. `CheckListenerPorts` does not catch this — both ports are in the expected set; the OS catches it at bind time | +| `ListenAndServe` returns mid-run (fd exhaustion, network reconfigure) | Goroutine logs + `os.Exit(1)` | Process restart | +| `--metrics-listen=""` | Structural skip — no listener, no goroutine, no port-set entry | n/a (opt-out) | + +## Testing + +`internal/relay/metrics_listen_test.go`: + +- `TestCheckLoopbackBind_Matrix` — 12-row table covering every branch: IPv4 loopback (default + `/8` high address), IPv6 loopback (bracketed), non-loopback IPv4 / IPv6 (sentinel-wrapped), private IPs, hostname (`localhost:9090` rejected — the security-design anchor), empty host (`:9090` rejected), port-0, out-of-range port, missing port, empty addr. Sentinel-rule failures asserted with `errors.Is(err, ErrNonLoopbackBind)`; other failures asserted non-nil only. +- `TestNewMetricsServer_Matrix` — three rows: empty addr → `(nil, nil)`; non-loopback → `(nil, err)` with `errors.Is(_, ErrNonLoopbackBind)`; loopback → `(srv, nil)` with all four timeouts pinned to literal values (not a shared constant — the spec says they can drift). +- `TestMetricsServer_EndToEnd_HappyPath` — drives validator + constructor + `net.Listen` + actual HTTP round-trip over an ephemeral port. Constructs the server at `127.0.0.1:9090` (validator passes), then serves on `net.Listen("tcp", "127.0.0.1:0")` because `http.Server.Serve` ignores `Addr` once a listener is supplied — sidesteps the port-0 rule in `ListenerPort` without bypassing the factory. Asserts status 200 and Prometheus text-format `Content-Type` prefix; body parsing is `metrics_test.go`'s job (#59). + +`make vet`, `make test -race`, `make build` clean. + +## What this deliberately does NOT do + +- TLS on the metrics listener — loopback is the entire defence (see *Threat model*). +- Authentication on `/metrics` — same rationale. +- Graceful shutdown of either listener — the public listener has none either; SIGTERM-driven shutdown is open ticket #31, which will retrofit both listeners together. Structuring `metricsSrv` as a top-level local in `main` keeps that retrofit a localised edit. +- A `--metrics-listen` unix-socket form — out of scope; would need its own threat-model review (file permissions, peer authentication). +- A second `/metrics` collector beyond #61's `NewConnectionsMetrics` — siblings #57 / #58 append more `NewXxxMetrics(metricsReg, …)` calls at the wiring site this ticket establishes. + +## Related + +- [Metrics registry (scaffolding)](metrics-registry.md) — the `*prometheus.Registry` + `NewMetricsHandler` factory this listener consumes (#59). +- [Connection-count gauges](connection-count-gauges.md) — the only collector wired here today (#61). +- [Listener port allowlist (boot-time refusal)](listener-port-allowlist.md) — the metrics port joins both the `expected` and `actual` sets (#81). +- [ADR-0008: Adopt `github.com/prometheus/client_golang`](../decisions/0008-prometheus-client-adoption.md) — scope-of-use rules. +- [Threat model](../../threat-model.md) — log-hygiene and DoS posture that this listener inherits. diff --git a/docs/knowledge/features/metrics-registry.md b/docs/knowledge/features/metrics-registry.md index fb63043..ad7c85a 100644 --- a/docs/knowledge/features/metrics-registry.md +++ b/docs/knowledge/features/metrics-registry.md @@ -2,7 +2,7 @@ The relay holds a private `*prometheus.Registry` and serves it over a Prometheus-format `/metrics` handler. The registry is constructed per process; sibling tickets register counters / gauges / histograms against it. `prometheus.DefaultRegisterer` is never touched. -This page documents the scaffolding only — what the seam is, why it has the shape it does, and where the rules are recorded. The `/metrics` listener wiring lives in #60. +This page documents the scaffolding only — what the seam is, why it has the shape it does, and where the rules are recorded. The `/metrics` listener wiring lives in #60; see [Metrics listener (localhost-only)](metrics-listener.md). The first collector to plug into the seam landed in #61: see [Connection-count gauges](connection-count-gauges.md) for the pull-based `connectionsCollector` exposing `pyrycode_relay_connected_{binaries,phones}` over `Registry.Counts()`. Sibling counter/histogram tickets (#57, #58) follow. @@ -80,7 +80,7 @@ Structurally out of bounds: ## What this deliberately does NOT do - No counters, no histograms here — siblings (#57, #58, future) own those. The first gauge collector (`connectionsCollector`) landed in #61 as a pull-based reader of `Registry.Counts()`; see [Connection-count gauges](connection-count-gauges.md). -- No listener — `/metrics` is not bound to a `mux` in this ticket. #60 owns the listener (flag, bind-address validation, `http.Server` timeouts). +- No listener — `/metrics` was not bound to a `mux` in #59. The listener (flag, loopback bind-address validation, `http.Server` timeouts) landed in #60 and is documented at [Metrics listener](metrics-listener.md). - No process / Go-runtime collectors. - No `ErrorLog` wiring from `promhttp` into `slog`. - No content negotiation (OpenMetrics opt-in is a separate decision). diff --git a/docs/specs/architecture/60-metrics-listener.md b/docs/specs/architecture/60-metrics-listener.md new file mode 100644 index 0000000..9a7de23 --- /dev/null +++ b/docs/specs/architecture/60-metrics-listener.md @@ -0,0 +1,570 @@ +# Spec — localhost-only /metrics listener with bind-address validation (#60) + +## Files to read first + +- `cmd/pyrycode-relay/main.go:26-211` — entire `main` body. Two things to + extract: + 1. The existing public-listener pattern (timeouts at lines 118-121 / 156-159 + / 168-172, goroutine launch at lines 200-205 for the http-01 listener, + bare `ListenAndServe` at line 139 for insecure mode). The metrics + listener mirrors these exactly. + 2. The `CheckListenerPorts` wiring at lines 123-138 (insecure mode) and + 174-195 (autocert mode). The metrics port MUST land in both the + `expected` and `actual` sets when the metrics listener is enabled, or + `#81`'s boot-time port allowlist refuses to start. +- `internal/relay/listeners.go:35-86` — `ListenerPort` extracts a `uint16` + from an `http.Server.Addr` string; `CheckListenerPorts` is asymmetric + (surplus is the failure mode, missing is not). Use `ListenerPort` for the + metrics addr the same way main.go already uses it for `:443`/`:80`/insecure. +- `internal/relay/metrics.go:21-51` — `NewMetricsRegistry()` and + `NewMetricsHandler(reg)` from #59's scaffolding. Consume both. Do NOT add + new exported surface to this file — the seam is fixed. +- `internal/relay/metrics_connections.go:37-51` — `NewConnectionsMetrics(reg, + src)` from #61. The wiring site (this ticket) calls it once at boot + against the relay's `*Registry` (constructed at `main.go:98`). Future + sibling tickets (#57, #58) will add similar `NewXxxMetrics` calls + alongside. +- `docs/specs/architecture/59-metrics-scaffolding.md:165-169` — the + "package-vars-vs-struct" pick and the implication for this ticket: each + per-concern collector is a constructor call at the wiring site. Adding + one collector here (`NewConnectionsMetrics`) is the pattern; #57 and #58 + will append. +- `docs/specs/architecture/81-listener-port-allowlist-boot-check.md` (whole + file) — the asymmetric `expected ⊋ actual ⇒ refuse boot` semantics this + ticket extends. The metrics port joins the expected set; that is the + whole interaction. +- `docs/PROJECT-MEMORY.md` line 27 — *"Loud failure over silent correction."* + Bind-address validation refuses to start on non-loopback / hostname / bad + port. No silent fallback to localhost; no "warn and continue". +- `docs/threat-model.md` § *Log hygiene* (extends to metric labels per #59's + spec) — relevant for the security review below: `/metrics` labels are an + exfiltration channel. The collectors wired in this ticket + (`NewConnectionsMetrics`) carry no labels by design. + +## Context + +The relay is internet-exposed. `/metrics` exposes operational state (active +connection counts indicate whether anyone is using the relay; future +upgrade/register counters reveal traffic patterns). Putting `/metrics` on +the same listener as `/healthz` (#10) and `/v1/{server,client}` would +publish that state to anyone who can reach the public listener. + +The ticket body locks the exposure decision: `/metrics` runs on a separate +**localhost-only** listener (default `127.0.0.1:9090`). Operators scrape +via SSH tunnel or sidecar. The bind-address validation (loopback IP +literals only, hostnames refused) is part of the contract — not the +operator's responsibility to get right via runbook discipline. + +This ticket is the wiring slice. #59 landed the registry and handler +factory; #61 landed `NewConnectionsMetrics`. #57 and #58 are concurrent +sibling tickets adding more collectors; they will append more +`NewXxxMetrics(metricsReg, …)` calls at the wiring site this ticket +establishes, but they are out of scope here — whatever is in the registry +at scrape time is what gets served. + +## Design + +Two artefacts: a new helper file in `internal/relay/` and edits to +`cmd/pyrycode-relay/main.go`. No edits to `internal/relay/metrics.go` — +the #59 seam is fixed. + +### 1. `internal/relay/metrics_listen.go` (new) + +Two exported functions and one sentinel error. Total ≤60 production lines +including doc comments. + +```go +package relay + +// ErrNonLoopbackBind is returned by CheckLoopbackBind / NewMetricsServer +// when --metrics-listen names a host that does not parse as a loopback +// IP literal. The relay refuses to start so a misconfiguration +// (typo, copy-paste from a non-loopback bind doc, accidental "0.0.0.0") +// does not silently publish operational state on the internet. +var ErrNonLoopbackBind = errors.New("relay: metrics listener bind address must be a loopback IP literal") +``` + +**`CheckLoopbackBind(addr string) error`** — pure validator. + +- Behaviour contract (full enumeration; the test file is the executable spec): + - `addr == ""` → returns an error naming the empty case + (caller short-circuits *before* calling; this is a defence-in-depth + return rather than a feature). The empty-disable path lives in main.go + structurally, not in this helper. + - `addr` not parseable by `net.SplitHostPort` → wrapped error naming + the input. + - port portion not a valid TCP port (per `ListenerPort` semantics: + 1..65535, no port 0) → wrapped error from `ListenerPort` so the + surrounding pattern (port-0 refusal is already documented in + listeners.go) extends to this listener. + - host portion does not parse with `net.ParseIP` (hostname, empty, + malformed) → wrapped `ErrNonLoopbackBind` with a message naming the + offending value and the rule ("hostnames are rejected to avoid + DNS-time TOCTOU; provide a loopback IP literal"). + - parsed IP, `IsLoopback() == false` → wrapped `ErrNonLoopbackBind` + naming the offending IP. + - parsed IP, `IsLoopback() == true` → return nil. Both IPv4 (`127.0.0.0/8`) + and IPv6 (`::1`) loopback addresses are accepted. + +- The hostname-refusal rationale (DNS TOCTOU) is the load-bearing security + design choice — restate it in the docstring so the next reader does not + "fix" the validator by adding `net.LookupHost`. A hostname can resolve + to a loopback IP at validation time and a non-loopback IP at bind time + (DNS rebinding, `/etc/hosts` race, resolver reconfigure). The only safe + validation is to refuse the entire shape. + +- Reuse `ListenerPort` for the port parse: it already wraps the same + error families and refuses port 0. Do NOT duplicate the port logic. + +**`NewMetricsServer(addr string, h http.Handler) (*http.Server, error)`** — +opt-out-aware constructor. + +- Behaviour contract: + - `addr == ""` → return `(nil, nil)`. This is the operator opt-out + path: no listener, no goroutine, no error. Caller checks + `srv != nil` to decide whether to enter the wiring branch. + - `CheckLoopbackBind(addr)` returns non-nil → return `(nil, err)`. Caller + logs and exits. + - otherwise → return an `*http.Server` with: + - `Addr: addr` + - `Handler: h` + - `ReadHeaderTimeout: 10 * time.Second` + - `ReadTimeout: 60 * time.Second` + - `WriteTimeout: 60 * time.Second` + - `IdleTimeout: 120 * time.Second` + - Timeout values are copied from the public listener (`main.go:118-121`). + Centralised here so a future change to either listener's timeout + policy does not silently diverge — but they are NOT exported as + constants (each listener can drift independently if a future ticket + has reason; today they match because that is the safest default). + - The handler argument is `http.Handler`, not `http.ServeMux` — the + caller chooses to wire either a bare `NewMetricsHandler(reg)` or a + mux. Current main.go uses a `ServeMux` so the path `/metrics` is + distinguishable from a future `/healthz` on this listener; that + decision lives in main.go, not here. + +### 2. `cmd/pyrycode-relay/main.go` edits + +Three localised edits. All other code in `main` is untouched. + +**Edit A — flag definition.** Add to the `flag.String` block at the top +of `main`: + +```go +metricsListen = flag.String("metrics-listen", "127.0.0.1:9090", + "Listen address for the /metrics endpoint. Must be a loopback IP "+ + "literal (e.g. 127.0.0.1:9090, [::1]:9090). Empty disables.") +``` + +**Edit B — metrics-listener wiring block.** After the existing +`startedAt` / `reg := relay.NewRegistry()` lines (~line 98) and before +the `mux := http.NewServeMux()` for the public listener (~line 106), +insert a block of the shape: + +```go +metricsReg := relay.NewMetricsRegistry() +relay.NewConnectionsMetrics(metricsReg, reg) + +metricsMux := http.NewServeMux() +metricsMux.Handle("/metrics", relay.NewMetricsHandler(metricsReg)) + +metricsSrv, err := relay.NewMetricsServer(*metricsListen, metricsMux) +if err != nil { + logger.Error("refusing to start: invalid --metrics-listen address", + "err", err, + "value", *metricsListen, + "fix", "use a loopback IP literal such as 127.0.0.1:9090 "+ + "or [::1]:9090, or pass --metrics-listen= to disable") + os.Exit(2) +} +``` + +- `metricsSrv` is a local in `main`, kept in scope for the goroutine + launch below and for the port-allowlist set construction. +- When `*metricsListen == ""` the constructor returns `(nil, nil)` and + no error path runs — `metricsSrv` is nil thereafter; the goroutine + launch and the port-set inclusion both short-circuit on `metricsSrv == + nil`. The opt-out is structural, not flag-tested. + +**Edit C — port-allowlist integration and goroutine launch.** Two +sub-edits, one per public-listener mode. + +*Insecure mode (`*insecureListen != ""` branch, ~line 113):* + +- After `port, err := relay.ListenerPort(srv.Addr)` and BEFORE the + `expected := map[uint16]struct{}{port: {}}` line, parse the metrics + port (only if `metricsSrv != nil`) and add it to the expected/actual + sets: + + ```go + expected := map[uint16]struct{}{port: {}} + actual := map[uint16]struct{}{port: {}} + if metricsSrv != nil { + mp, err := relay.ListenerPort(metricsSrv.Addr) + if err != nil { /* same os.Exit(2) shape as the existing block */ } + expected[mp] = struct{}{} + actual[mp] = struct{}{} + } + // existing CheckListenerPorts call unchanged + ``` + +- BEFORE `srv.ListenAndServe()`, launch the metrics listener in a + goroutine (only if `metricsSrv != nil`): + + ```go + if metricsSrv != nil { + logger.Info("starting metrics listener", "listen", metricsSrv.Addr) + go func() { + if err := metricsSrv.ListenAndServe(); err != nil { + logger.Error("metrics listener failed", "err", err) + os.Exit(1) + } + }() + } + ``` + +*Autocert mode (the second branch, ~line 146 onwards):* + +- Same shape. The existing `expected := map[uint16]struct{}{443: {}, 80: + {}}` and `actual := …` lines extend by the metrics port the same way. +- The metrics goroutine launches alongside the existing `go func()` + for the http-01 listener (~line 200), before `httpsSrv.ListenAndServeTLS`. + +**Why a goroutine and not foreground:** the existing public listener +runs foreground (`ListenAndServe` blocks `main`); the http-01 listener +in autocert mode runs in a goroutine. The metrics listener is a +secondary listener, never the main exit point — goroutine is the right +choice. If `ListenAndServe` returns (typically because the port is +already bound or the address is rejected by the OS), the goroutine +calls `os.Exit(1)` to surface the failure rather than serving the public +listener with a missing metrics surface. + +**Why no graceful shutdown wiring here:** the existing public listener +has none — process exits on signal naturally. #31 (open) will retrofit +SIGTERM-driven shutdown for both listeners; structuring `metricsSrv` as +a top-level local in `main` makes that retrofit a localised edit. This +ticket does not anticipate #31's design. + +### 3. `internal/relay/metrics_listen_test.go` (new) + +Table-driven coverage of the validator + constructor + an end-to-end +happy-path roundtrip. Three test functions; total ~120 lines. + +- **`TestCheckLoopbackBind_Matrix`** — table test covering every branch + of `CheckLoopbackBind`. Cases: + - `127.0.0.1:9090` → nil + - `127.0.0.5:1234` → nil (whole `127.0.0.0/8` is loopback) + - `[::1]:9090` → nil + - `0.0.0.0:9090` → wraps `ErrNonLoopbackBind` + - `192.168.1.10:9090` → wraps `ErrNonLoopbackBind` + - `[2001:db8::1]:9090` → wraps `ErrNonLoopbackBind` + - `localhost:9090` → wraps `ErrNonLoopbackBind` (hostname, not an IP literal — the security-design rationale) + - `127.0.0.1:0` → error (port 0 refusal flows from `ListenerPort`) + - `127.0.0.1:99999` → error (out-of-range from `ListenerPort`) + - `127.0.0.1` → error (no port, `net.SplitHostPort` fails) + - `""` → error (empty addr; defence-in-depth case) + - `:9090` → wraps `ErrNonLoopbackBind` (empty host doesn't parse as IP) + + Use `errors.Is(err, ErrNonLoopbackBind)` to branch on the loopback-rule + failures specifically; other error cases assert non-nil only. + +- **`TestNewMetricsServer_Matrix`** — three rows: + - `addr=""` → returns `(nil, nil)` + - `addr="0.0.0.0:9090"` → returns `(nil, err)` with `errors.Is(err, ErrNonLoopbackBind)` + - `addr="127.0.0.1:9090"` → returns `(srv, nil)` with `srv.Addr == "127.0.0.1:9090"`, all four timeouts matching the public-listener policy literally (do NOT compare against a shared constant — the spec says they can drift independently; assert on the actual `time.Second` values). + +- **`TestMetricsServer_EndToEnd_HappyPath`** — satisfies AC (a) against an + actual listener, not just `httptest.NewRecorder`. Scenario: + - Construct `reg := NewMetricsRegistry()`, register one trivial counter + (so the response body is non-empty — the assertion is on + `Content-Type` and status, not on the counter value, but a non-empty + body shakes out any "handler is a no-op" regression). + - `mux := http.NewServeMux(); mux.Handle("/metrics", NewMetricsHandler(reg))`. + - `srv, _ := NewMetricsServer("127.0.0.1:0", mux)`. + - Open a listener with `net.Listen("tcp", srv.Addr)` and call + `go srv.Serve(l)` (not `ListenAndServe`, so the ephemeral port is + knowable through `l.Addr()`). `t.Cleanup(func(){ srv.Close() })`. + - `http.Get("http://" + l.Addr().String() + "/metrics")`. + - Assert status == 200 and `Content-Type` matches the Prometheus text + format (`text/plain; version=0.0.4; charset=utf-8`). Body is not + parsed — that is `metrics_test.go`'s job (#59). + + This test is the AC (a) anchor. It exercises the validator + constructor + + actual `net.Listen` + actual HTTP round-trip — the path main.go runs + at boot, minus the flag plumbing. + +**Note on the `127.0.0.1:0` happy-path test.** `0` is normally rejected +by `CheckLoopbackBind` (port-0 refusal inherited from `ListenerPort`). +The test uses `127.0.0.1:0` precisely so the validator rejects it — +which means the happy-path test cannot construct the server through +`NewMetricsServer("127.0.0.1:0", …)`. Two acceptable resolutions; the +spec lets the developer pick: + + 1. **Preferred.** Construct the server with `NewMetricsServer("127.0.0.1:9090", mux)` (validator passes), then overwrite `srv.Addr = ""` and pass `l` from `net.Listen("tcp", "127.0.0.1:0")` into `srv.Serve(l)`. `http.Server.Serve` ignores `Addr` once a listener is supplied. + 2. Bypass `NewMetricsServer` and assemble the `*http.Server` directly in the test (timeouts copied from the spec). This trades coverage of `NewMetricsServer` for ergonomics; do this only if option 1's `srv.Addr = ""` step reads worse to a reviewer than constructing inline. + + Either is fine; the AC is about the listener+handler round-trip, not + about which factory function the test calls. + +### 4. Test-time port collision + +The happy-path end-to-end test uses `127.0.0.1:0` to pick an ephemeral +port — no collision with anything the developer's machine might already +bind to. Tests do not use the default `127.0.0.1:9090` literal. + +### 5. Out of scope + +- TLS for the metrics listener. Loopback-only is the entire defence; + TLS would add no marginal security and would require autocert / + static-cert plumbing not justified by the threat model. +- Authentication / authorisation on `/metrics`. Same rationale: the + bind shape is the gate. +- A second `/metrics` collector beyond the one wired by #61 + (`NewConnectionsMetrics`). #57 and #58 land theirs on this same wiring + site by appending more constructor calls — that is the seam this + ticket establishes, not work this ticket does. +- Graceful shutdown of either listener. #31's job. +- The `docs/knowledge/codebase/60.md` summary entry and the + `docs/knowledge/INDEX.md` append. Both are documentation-phase + territory (architect's "Never Update" rule); the AC is satisfied at + the ticket level by the documentation pass, not by this developer + run. Recorded so the developer does not chase an AC item that is not + theirs. + +## Concurrency model + +One new goroutine: the metrics listener's `srv.ListenAndServe()` call. +Lifecycle: + +- Started after both flag validation and `CheckLoopbackBind` have + succeeded (so the goroutine never starts with a bad config). +- Exits only on `ListenAndServe` returning an error. The error handling + is `logger.Error` + `os.Exit(1)` — same shape as the existing http-01 + goroutine in autocert mode. Treating a metrics-listener failure as a + process-fatal event is intentional: a relay that booted with metrics + enabled but is silently not serving them would mislead operator + scrapes into thinking the relay is healthy when its metrics surface + is gone. Loud failure over silent correction. +- No coordination with the public listener's goroutine. They are + independent. + +Concurrency on the handler side: `promhttp.HandlerFor` is documented +safe for concurrent calls; `prometheus.Registry` uses internal locking. +`NewConnectionsMetrics` reads `Registry.Counts()` on every scrape (#61's +pull-based design), which already holds the right lock on the relay's +`*Registry`. No shared state introduced here. + +## Error handling + +| Failure mode | Surface | Recovery | +| --- | --- | --- | +| `--metrics-listen` rejected by `CheckLoopbackBind` | `logger.Error` + `os.Exit(2)` (config-error, not runtime) | Operator fixes the flag | +| `--metrics-listen` port collides with public listener | `srv.ListenAndServe` returns `bind: address already in use` (OS-level) | `os.Exit(1)` from the metrics goroutine. Loud-failure-over-silent. The CheckListenerPorts pre-check does NOT catch this because both ports are in the expected set; the OS catches it at bind time. Acceptable. | +| `--metrics-listen` port not in expected set when port allowlist check runs | impossible by construction (we add the metrics port to both expected and actual sets in the same block) | n/a | +| `metricsSrv.ListenAndServe` returns mid-run (network reconfigure, fd exhaustion) | metrics goroutine logs + `os.Exit(1)` | Process restart | +| `--metrics-listen=""` | structural skip; no listener, no goroutine, no port-set entry | n/a — opt-out | + +## Testing strategy + +- **`internal/relay/metrics_listen_test.go`** (above) is the gate. It + satisfies ACs (a), (b), and (c) at the package level. +- **No tests in `cmd/pyrycode-relay/`** are added by this ticket. The + main-package logic is a thin wiring layer over the tested helpers; + asserting it would require either a `go test -tags integration` style + process-launch harness or a refactor of `main` into a testable shape + neither of which the AC asks for. `deps_test.go` is unaffected. +- **`make test -race`** runs the new tests under the race detector. The + end-to-end test starts a goroutine (`srv.Serve(l)`); the race detector + surfaces any data race in the (third-party) handler stack. No relay- + side shared state to exercise. +- **No change to e2e tests.** `/metrics` is not on the public-listener + surface that the e2e harness drives. + +## Open questions + +1. **Should the metrics port also pass through `CheckRunningAsRoot` / + `CheckCapabilities` / `CheckEnvConfig`?** No — those are + process-wide pre-flight checks already gating both listeners; they + run before any `metricsSrv` construction. No interaction. +2. **Should `--metrics-listen` accept a unix-domain socket path (e.g. + `/var/run/pyrycode-relay.sock`)?** Out of scope. The ticket body + names IP literals only. If a future operator wants unix-socket + metrics, that is a separate ticket with its own threat-model review + (file-permission surface, peer authentication, etc.). +3. **Should the metrics goroutine attempt to register at a different + port if the configured port is busy?** No — that would be silent + correction. Refuse to start. + +## Security review (security-sensitive ticket) + +### Mindset + +Re-reading the spec as an adversary against the relay process. Default +verdict FAIL until each applicable category below has either a concrete +finding or an explicit design decision that makes the category not +applicable. The label is the gate; I do not skip the pass because the +ticket "feels small". + +### 1. Trust boundaries + +- **Inbound trust boundary introduced:** yes. A new HTTP listener binds + to `127.0.0.1:9090` (default) and accepts unauthenticated GET requests + for `/metrics`. The boundary defence is the bind shape — loopback IP + literals only, hostnames refused, non-loopback IPs refused, port 0 + refused, all enforced at boot before the listener starts. +- **Why hostnames are refused (load-bearing):** a hostname resolves at + validation time and again at bind time (different syscall paths). + Between the two, DNS rebinding, `/etc/hosts` race, or resolver + reconfigure can make a hostname that validated as loopback bind to a + non-loopback IP. Refusing the entire shape is the only TOCTOU-proof + defence. Documented in the `CheckLoopbackBind` docstring and in the + spec's *Design* § 1 so a future contributor does not "fix" the + validator by adding `net.LookupHost`. +- **Internal boundary unchanged:** the `prometheus.Registry` private to + the relay (per ADR-0008 § Scope of use) is not touched by this ticket. + `NewMetricsRegistry()` is called once at the wiring site; + `DefaultRegisterer` is never referenced. + +### 2. Tokens, secrets, credentials + +- The relay's only handled secret is `X-Pyrycode-Token` at `/v1/client` + (#5). This ticket adds no code path that touches a token. +- The metrics labels emitted by the only collector wired here + (`NewConnectionsMetrics`, #61) are label-less by design — `pyrycode_relay_connected_binaries` + and `pyrycode_relay_connected_phones` are scalar gauges. The + `{server=""}` shape that #61's security + review forbade is structurally absent. Sibling tickets (#57, #58) that + add labelled collectors must enforce the same rule in their own specs; + this ticket's wiring carries no labels and therefore no exfiltration + channel. The constraint is recorded in #59's spec for sibling + architects. + +### 3. File operations + +- None. The metrics listener performs no file I/O. The `--metrics-listen` + flag is parsed as a string, not as a path. + +### 4. Subprocess / external command execution + +- None. + +### 5. Cryptographic primitives + +- None. The metrics listener is plaintext HTTP on loopback. TLS is + intentionally out of scope (see *Out of scope* § 5) — loopback is + the defence, not TLS. + +### 6. Network & I/O + +- **Listener exposure:** localhost-only by validation. A hostile + process on the same host can scrape `/metrics` — that is in-scope + for the threat model: same-host adversary is assumed to have at + least equal privilege to the relay process and metrics is not a + defence boundary against them. +- **Off-host exposure:** structurally impossible without a kernel-level + routing bypass. `net.ParseIP(host).IsLoopback()` is the gate; the + kernel enforces that loopback-bound sockets are not reachable from + non-loopback paths. +- **DoS via scrape flooding:** the listener uses the same timeouts as + the public listener (`ReadHeaderTimeout: 10s`, `ReadTimeout: 60s`, + `WriteTimeout: 60s`, `IdleTimeout: 120s`). `promhttp.HandlerFor`'s + `MaxRequestsInFlight` defaults to zero (unbounded) — acceptable + because the loopback gate caps the threat population to same-host + processes. A future ticket can cap it if a same-host + operational-process bug causes scrape storms, but the present threat + model does not justify a cap. +- **`promhttp.HandlerOpts.Timeout`:** zero (no per-handler timeout). + The `http.Server.WriteTimeout` is the upper bound on scrape duration. + No exposure here. +- **gosec G114 (`http.Server` without ReadHeaderTimeout):** mitigated. + `ReadHeaderTimeout: 10 * time.Second` is set explicitly in + `NewMetricsServer`. +- **Port-allowlist interaction (#81):** the metrics port joins the + `expected` and `actual` sets at the wiring site. A typo'd flag (e.g. + `--metrics-listen=127.0.0.1:6060`) would put a debug-shaped port in + the expected set — that's the operator's choice and the boot check + cannot second-guess it. The asymmetric check catches *surplus* + listeners (a stray import bound to `:6060` that the operator did not + declare); a deliberately-declared metrics port at `:6060` is not + surplus and not caught. Acceptable; operators are accountable for + the flag value within the loopback constraint. + +### 7. Error messages, logs, telemetry + +- **Boot-failure log line on `CheckLoopbackBind` rejection:** logs + `value=` and the wrapped error. The flag value is + operator input, not attacker input — `--metrics-listen` is a + command-line flag, not a network-supplied string. Logging it is safe. + No token / no secret / no PII flows through this path. +- **Metrics-goroutine `ListenAndServe failed` log:** the error from + `ListenAndServe` is library-internal (`bind: address already in use`, + `accept: too many open files`, etc.). No relay state, no token, no + PII. +- **`promhttp` self-instrumentation counters** (registered by + `HandlerOpts.Registry` per #59) — published on the loopback `/metrics` + surface. Same trust boundary as the rest of the metrics; not a leak + to non-loopback callers. +- **No new label-bearing metrics in this ticket.** The label-as-channel + rule recorded in #59 propagates here automatically: nothing this + ticket emits has labels. + +### 8. Concurrency + +- **One new goroutine** (`go srv.ListenAndServe()`); no shared mutable + state with the public listener's goroutines. The `prometheus.Registry` + the metrics goroutine reads is constructed once before either listener + starts; `NewConnectionsMetrics` registers a `Collector` whose + `Collect` method reads `Registry.Counts()` (already locked by #61). + No new lock, no new channel, no goroutine-leak vector — `os.Exit(1)` + on listener failure tears the process down before any leak matters. +- **Test-side goroutine** in `TestMetricsServer_EndToEnd_HappyPath`: + `srv.Serve(l)`. `t.Cleanup(func(){ srv.Close() })` ensures the + goroutine exits between subtests. `go test -race` covers the handler + stack's internal concurrency. + +### 9. Threat model alignment + +- **`docs/threat-model.md` § Log hygiene:** unchanged by this ticket; + no new log lines carry user-controlled input (the flag value is + operator-controlled, not network-controlled). +- **`docs/threat-model.md` § DoS:** the metrics scrape surface is + loopback-only. Same-host scrape storms are bounded by the + `http.Server` timeouts. No internet-reachable surface added. +- **`docs/threat-model.md` § Supply chain:** unchanged. No new direct + dep — `prometheus/client_golang` came in with #59; this ticket + consumes it. +- **`docs/threat-model.md` § TLS:** unchanged. The public listener's + TLS surface (`:443` in autocert mode) is untouched. +- **Protocol spec** (`pyrycode/pyrycode/docs/protocol-mobile.md`): + unaffected. `/metrics` is operational, not on the wire protocol. + +### Findings + +- **[Trust boundaries]** No MUST FIX. The boundary is defended + structurally by `CheckLoopbackBind`. Hostname refusal is the + load-bearing TOCTOU defence; it is documented in the validator's + docstring and asserted by `TestCheckLoopbackBind_Matrix`. +- **[Tokens]** No findings — no token-carrying code path is touched. +- **[File operations]** No findings — none performed. +- **[Subprocess]** No findings — none performed. +- **[Cryptographic primitives]** No findings — TLS deliberately out + of scope; loopback is the defence. +- **[Network & I/O]** No MUST FIX. Loopback validation + + per-listener timeouts cover the cases the threat model specifies. + `MaxRequestsInFlight` left at default; acceptable on loopback. +- **[Errors / logs / telemetry]** No findings — operator-controlled + flag value is the only thing logged from this ticket's new paths. +- **[Concurrency]** No findings — one new goroutine, no shared + mutable state, `os.Exit(1)` on failure. +- **[Threat model]** No findings unique to this ticket. + +### Verdict + +**PASS.** No MUST FIX findings. The design's load-bearing security +choice (refuse hostnames, not just non-loopback IPs) is structurally +defended in `CheckLoopbackBind` and asserted by the matrix test. +Operator-misconfiguration paths fail loudly at boot rather than serving +on the wrong interface. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-13 diff --git a/internal/relay/metrics_listen.go b/internal/relay/metrics_listen.go new file mode 100644 index 0000000..c72af76 --- /dev/null +++ b/internal/relay/metrics_listen.go @@ -0,0 +1,95 @@ +package relay + +import ( + "errors" + "fmt" + "net" + "net/http" + "time" +) + +// ErrNonLoopbackBind is returned by CheckLoopbackBind and NewMetricsServer +// when --metrics-listen names a host that does not parse as a loopback IP +// literal. The relay refuses to start so a misconfiguration (typo, +// copy-paste from a non-loopback bind doc, accidental "0.0.0.0") does not +// silently publish operational state on the internet-exposed surface. +// +// Branchable via errors.Is. +var ErrNonLoopbackBind = errors.New("relay: metrics listener bind address must be a loopback IP literal") + +// CheckLoopbackBind validates a --metrics-listen address. It returns nil +// only if addr parses as : with the host portion being a +// loopback IP (IPv4 127.0.0.0/8 or IPv6 ::1) and the port satisfying +// ListenerPort's rules (1..65535, port 0 rejected). +// +// Hostnames are deliberately refused even when they resolve to a loopback +// IP at validation time. A hostname resolves at validation time and again +// at bind time (different syscalls); between the two, DNS rebinding, an +// /etc/hosts race, or a resolver reconfigure can make a hostname that +// validated as loopback bind to a non-loopback IP. Refusing the entire +// shape is the only TOCTOU-proof defence. Do not "fix" this validator by +// adding net.LookupHost — that reintroduces the very race the IP-literal +// rule exists to close. +// +// The empty-addr case returns a non-nil error as defence in depth: the +// operator opt-out (--metrics-listen="") is structurally handled in +// NewMetricsServer before this function runs, so a caller that reaches +// CheckLoopbackBind with an empty string has already lost the opt-out +// shape and should fail. +func CheckLoopbackBind(addr string) error { + if addr == "" { + return fmt.Errorf("relay: metrics listener bind address is empty") + } + host, _, err := net.SplitHostPort(addr) + if err != nil { + return fmt.Errorf("relay: parsing metrics listener address %q: %w", addr, err) + } + if _, err := ListenerPort(addr); err != nil { + return err + } + ip := net.ParseIP(host) + if ip == nil { + return fmt.Errorf("%w: %q has a non-IP-literal host (hostnames are rejected to avoid DNS-time TOCTOU; provide a loopback IP literal such as 127.0.0.1 or [::1])", ErrNonLoopbackBind, addr) + } + if !ip.IsLoopback() { + return fmt.Errorf("%w: %q resolves to non-loopback IP %s", ErrNonLoopbackBind, addr, ip) + } + return nil +} + +// NewMetricsServer constructs the *http.Server that serves /metrics on +// the operator-supplied loopback address. The opt-out path (addr == "") +// returns (nil, nil): the caller checks srv != nil before launching the +// goroutine, adding the port to the listener-allowlist set, or otherwise +// referencing the metrics listener — no listener, no goroutine, no error. +// +// Validation order: addr == "" is the structural opt-out and short- +// circuits before CheckLoopbackBind runs. Otherwise CheckLoopbackBind +// gates the construction; its error is returned verbatim so callers can +// branch on errors.Is(err, ErrNonLoopbackBind). +// +// Timeouts match the public listener (cmd/pyrycode-relay/main.go) so the +// metrics surface has the same DoS-resistance shape. They are duplicated +// here rather than exported as constants — each listener may drift +// independently if a future ticket has reason; today they match because +// that is the safest default. +// +// The handler argument is http.Handler, not *http.ServeMux: the caller +// chooses whether to wire the bare NewMetricsHandler(reg) result or wrap +// it in a mux so /metrics is distinguishable from a future sibling path. +func NewMetricsServer(addr string, h http.Handler) (*http.Server, error) { + if addr == "" { + return nil, nil + } + if err := CheckLoopbackBind(addr); err != nil { + return nil, err + } + return &http.Server{ + Addr: addr, + Handler: h, + ReadHeaderTimeout: 10 * time.Second, + ReadTimeout: 60 * time.Second, + WriteTimeout: 60 * time.Second, + IdleTimeout: 120 * time.Second, + }, nil +} diff --git a/internal/relay/metrics_listen_test.go b/internal/relay/metrics_listen_test.go new file mode 100644 index 0000000..cdbfaeb --- /dev/null +++ b/internal/relay/metrics_listen_test.go @@ -0,0 +1,187 @@ +package relay + +import ( + "errors" + "io" + "net" + "net/http" + "strings" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +func TestCheckLoopbackBind_Matrix(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + addr string + wantErr bool + wantNonLoop bool + }{ + {name: "ipv4 loopback default port", addr: "127.0.0.1:9090"}, + {name: "ipv4 loopback /8 high address", addr: "127.0.0.5:1234"}, + {name: "ipv6 loopback bracketed", addr: "[::1]:9090"}, + + {name: "ipv4 all-interfaces rejected", addr: "0.0.0.0:9090", wantErr: true, wantNonLoop: true}, + {name: "ipv4 private rejected", addr: "192.168.1.10:9090", wantErr: true, wantNonLoop: true}, + {name: "ipv6 non-loopback rejected", addr: "[2001:db8::1]:9090", wantErr: true, wantNonLoop: true}, + {name: "hostname rejected", addr: "localhost:9090", wantErr: true, wantNonLoop: true}, + {name: "empty host rejected", addr: ":9090", wantErr: true, wantNonLoop: true}, + + {name: "port 0 rejected", addr: "127.0.0.1:0", wantErr: true}, + {name: "port out of range rejected", addr: "127.0.0.1:99999", wantErr: true}, + {name: "no port rejected", addr: "127.0.0.1", wantErr: true}, + {name: "empty addr rejected", addr: "", wantErr: true}, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := CheckLoopbackBind(tc.addr) + if !tc.wantErr { + if err != nil { + t.Fatalf("CheckLoopbackBind(%q) returned error: %v", tc.addr, err) + } + return + } + if err == nil { + t.Fatalf("CheckLoopbackBind(%q) = nil; want error", tc.addr) + } + if tc.wantNonLoop && !errors.Is(err, ErrNonLoopbackBind) { + t.Errorf("CheckLoopbackBind(%q) error = %v; want errors.Is(_, ErrNonLoopbackBind)", tc.addr, err) + } + }) + } +} + +func TestNewMetricsServer_Matrix(t *testing.T) { + t.Parallel() + + t.Run("empty addr returns (nil, nil) opt-out", func(t *testing.T) { + t.Parallel() + srv, err := NewMetricsServer("", http.NotFoundHandler()) + if err != nil { + t.Fatalf("NewMetricsServer(\"\", _) error = %v; want nil", err) + } + if srv != nil { + t.Fatalf("NewMetricsServer(\"\", _) srv = %+v; want nil (opt-out)", srv) + } + }) + + t.Run("non-loopback addr returns wrapped ErrNonLoopbackBind", func(t *testing.T) { + t.Parallel() + srv, err := NewMetricsServer("0.0.0.0:9090", http.NotFoundHandler()) + if err == nil { + t.Fatalf("NewMetricsServer(\"0.0.0.0:9090\", _) error = nil; want error") + } + if !errors.Is(err, ErrNonLoopbackBind) { + t.Errorf("NewMetricsServer(\"0.0.0.0:9090\", _) error = %v; want errors.Is(_, ErrNonLoopbackBind)", err) + } + if srv != nil { + t.Errorf("NewMetricsServer(\"0.0.0.0:9090\", _) srv = %+v; want nil", srv) + } + }) + + t.Run("loopback addr returns configured server", func(t *testing.T) { + t.Parallel() + srv, err := NewMetricsServer("127.0.0.1:9090", http.NotFoundHandler()) + if err != nil { + t.Fatalf("NewMetricsServer(\"127.0.0.1:9090\", _) error = %v; want nil", err) + } + if srv == nil { + t.Fatalf("NewMetricsServer(\"127.0.0.1:9090\", _) srv = nil; want non-nil") + } + if srv.Addr != "127.0.0.1:9090" { + t.Errorf("srv.Addr = %q; want %q", srv.Addr, "127.0.0.1:9090") + } + // Timeouts pinned to literal values: the spec says the metrics + // listener and the public listener share a policy today but may + // drift independently in a future ticket; assert on the values, + // not on a shared constant. + if got, want := srv.ReadHeaderTimeout, 10*time.Second; got != want { + t.Errorf("ReadHeaderTimeout = %v; want %v", got, want) + } + if got, want := srv.ReadTimeout, 60*time.Second; got != want { + t.Errorf("ReadTimeout = %v; want %v", got, want) + } + if got, want := srv.WriteTimeout, 60*time.Second; got != want { + t.Errorf("WriteTimeout = %v; want %v", got, want) + } + if got, want := srv.IdleTimeout, 120*time.Second; got != want { + t.Errorf("IdleTimeout = %v; want %v", got, want) + } + }) +} + +// TestMetricsServer_EndToEnd_HappyPath drives the wired listener over a +// real loopback TCP socket: validator + constructor + net.Listen + actual +// HTTP round-trip. This is the AC (a) anchor for #60 — httptest.NewRecorder +// is already exercised by metrics_test.go (#59); this test catches +// regressions in the path main.go runs at boot. +// +// The server is constructed with the default loopback address (which +// passes CheckLoopbackBind) and then served on an ephemeral port via +// net.Listen("tcp", "127.0.0.1:0"); http.Server.Serve ignores Addr once a +// listener is supplied, so the port-0 rule in ListenerPort does not +// conflict with the test's need for an ephemeral port. +func TestMetricsServer_EndToEnd_HappyPath(t *testing.T) { + t.Parallel() + + reg := NewMetricsRegistry() + c := prometheus.NewCounter(prometheus.CounterOpts{ + Name: "relay_test_listen_counter_total", + Help: "Sole purpose: ensure /metrics body is non-empty for the end-to-end test.", + }) + reg.MustRegister(c) + c.Inc() + + mux := http.NewServeMux() + mux.Handle("/metrics", NewMetricsHandler(reg)) + + srv, err := NewMetricsServer("127.0.0.1:9090", mux) + if err != nil { + t.Fatalf("NewMetricsServer: %v", err) + } + + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("net.Listen: %v", err) + } + t.Cleanup(func() { _ = srv.Close() }) + + serveErr := make(chan error, 1) + go func() { + serveErr <- srv.Serve(l) + }() + + resp, err := http.Get("http://" + l.Addr().String() + "/metrics") + if err != nil { + t.Fatalf("GET /metrics: %v", err) + } + t.Cleanup(func() { _ = resp.Body.Close() }) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d; want 200", resp.StatusCode) + } + if got := resp.Header.Get("Content-Type"); !strings.HasPrefix(got, textFormatPrefix) { + t.Errorf("Content-Type = %q; want prefix %q", got, textFormatPrefix) + } + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + if !strings.Contains(string(body), "relay_test_listen_counter_total 1") { + t.Errorf("body missing registered counter; got:\n%s", body) + } + + if err := srv.Close(); err != nil { + t.Errorf("srv.Close: %v", err) + } + if err := <-serveErr; err != nil && !errors.Is(err, http.ErrServerClosed) { + t.Errorf("Serve returned: %v", err) + } +}