Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions cmd/pyrycode-relay/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func main() {
domain = flag.String("domain", "", "Public domain for Let's Encrypt cert issuance (required unless --insecure-listen is set).")
certCache = flag.String("cert-cache", defaultCertCache(), "Directory for autocert's TLS certificate cache.")
insecureListen = flag.String("insecure-listen", "", "Listen address for plain HTTP (e.g. :8080). Disables autocert; use only when fronted by a reverse proxy.")
metricsListen = flag.String("metrics-listen", "127.0.0.1:9090", "Listen address for the /metrics endpoint. Must be a loopback IP literal (e.g. 127.0.0.1:9090, [::1]:9090). Empty disables.")
showVersion = flag.Bool("version", false, "Print version and exit.")
)
flag.Parse()
Expand Down Expand Up @@ -97,6 +98,21 @@ func main() {
startedAt := time.Now()
reg := relay.NewRegistry()

metricsReg := relay.NewMetricsRegistry()
relay.NewConnectionsMetrics(metricsReg, reg)

metricsMux := http.NewServeMux()
metricsMux.Handle("/metrics", relay.NewMetricsHandler(metricsReg))

metricsSrv, err := relay.NewMetricsServer(*metricsListen, metricsMux)
if err != nil {
logger.Error("refusing to start: invalid --metrics-listen address",
"err", err,
"value", *metricsListen,
"fix", "use a loopback IP literal such as 127.0.0.1:9090 or [::1]:9090, or pass --metrics-listen= to disable")
os.Exit(2)
}

// maxFrameBytes: 256 KiB per-frame read cap. Derivation:
// docs/specs/architecture/29-wsconn-read-limit.md (≤50-message
// message_chunk envelope + routing wrapper, headroom for outliers,
Expand Down Expand Up @@ -128,6 +144,16 @@ func main() {
}
expected := map[uint16]struct{}{port: {}}
actual := map[uint16]struct{}{port: {}}
if metricsSrv != nil {
mp, err := relay.ListenerPort(metricsSrv.Addr)
if err != nil {
logger.Error("refusing to start: invalid listener address",
"err", err, "addr", metricsSrv.Addr)
os.Exit(2)
}
expected[mp] = struct{}{}
actual[mp] = struct{}{}
}
if err := relay.CheckListenerPorts(expected, actual); err != nil {
surplus, expectedList := listenerPortLists(expected, actual)
logger.Error("refusing to start: unexpected listener",
Expand All @@ -136,6 +162,15 @@ func main() {
"expected_ports", expectedList)
os.Exit(2)
}
if metricsSrv != nil {
logger.Info("starting metrics listener", "listen", metricsSrv.Addr)
go func() {
if err := metricsSrv.ListenAndServe(); err != nil {
logger.Error("metrics listener failed", "err", err)
os.Exit(1)
}
}()
}
if err := srv.ListenAndServe(); err != nil {
logger.Error("listen failed", "err", err)
os.Exit(1)
Expand Down Expand Up @@ -185,6 +220,16 @@ func main() {
}
expected := map[uint16]struct{}{443: {}, 80: {}}
actual := map[uint16]struct{}{httpsPort: {}, httpPort: {}}
if metricsSrv != nil {
mp, err := relay.ListenerPort(metricsSrv.Addr)
if err != nil {
logger.Error("refusing to start: invalid listener address",
"err", err, "addr", metricsSrv.Addr)
os.Exit(2)
}
expected[mp] = struct{}{}
actual[mp] = struct{}{}
}
if err := relay.CheckListenerPorts(expected, actual); err != nil {
surplus, expectedList := listenerPortLists(expected, actual)
logger.Error("refusing to start: unexpected listener",
Expand All @@ -197,6 +242,16 @@ func main() {
logger.Info("starting", "version", Version, "mode", "autocert",
"domain", *domain, "cert_cache", *certCache)

if metricsSrv != nil {
logger.Info("starting metrics listener", "listen", metricsSrv.Addr)
go func() {
if err := metricsSrv.ListenAndServe(); err != nil {
logger.Error("metrics listener failed", "err", err)
os.Exit(1)
}
}()
}

go func() {
if err := httpSrv.ListenAndServe(); err != nil {
logger.Error("http-01 listener failed", "err", err)
Expand Down
3 changes: 2 additions & 1 deletion docs/knowledge/INDEX.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o

## Features

- [Metrics listener (localhost-only)](features/metrics-listener.md) — separate `*http.Server` for `/metrics`, bound to a loopback IP literal (default `127.0.0.1:9090`); kept off the internet-exposed public listener that serves `/healthz` + `/v1/{server,client}` because metric values leak operational state. Two exports + one sentinel in `internal/relay/metrics_listen.go`: `ErrNonLoopbackBind` (branchable via `errors.Is`), `CheckLoopbackBind(addr)` (pure validator — `net.SplitHostPort` → `ListenerPort` for the port-0 / range / format rejects inherited from #81 → `net.ParseIP(host).IsLoopback()`; hostnames including `localhost:9090` are rejected even when they currently resolve to loopback — the DNS-time TOCTOU window is closed structurally, not by re-resolving), and `NewMetricsServer(addr, h)` (opt-out-aware: `addr == ""` → `(nil, nil)`, validator failure → `(nil, err)`, otherwise an `*http.Server` with the public listener's four timeouts duplicated literally so either listener can drift independently in a future ticket). Wired into both listener branches in `cmd/pyrycode-relay/main.go`: a goroutine launch mirroring the autocert mode's http-01 listener pattern (`os.Exit(1)` on `ListenAndServe` failure — loud-fail-over-silent because a relay booted with metrics enabled but silently not serving them would mislead operator scrapes), the metrics port joins **both** the `expected` and `actual` sets of #81's allowlist (declared secondary listener — must land on both sides of the asymmetric check). Empty-flag opt-out is structural via `metricsSrv != nil` guards at every reference site — no repeated `if *metricsListen == "" {}` branches. TLS / authn deliberately out of scope (loopback IS the defence); same-host adversary in scope but not a defence target; graceful shutdown deferred to #31. 
Three tests in `metrics_listen_test.go` — a 12-row validator matrix, a 3-row constructor matrix with timeouts pinned to literal values (not a shared constant), and an end-to-end happy-path test that drives validator + constructor + `net.Listen("tcp", "127.0.0.1:0")` + an actual `http.Get` round-trip, relying on `http.Server.Serve(l)` ignoring `Addr` once a listener is supplied (#60).
- [Listener port allowlist (boot-time refusal)](features/listener-port-allowlist.md) — relay refuses to start (exit 2) if the set of TCP ports it is *about to bind* (`http.Server.Addr` values) contains any port outside an explicit expected set derived from parsed flags: `{443, 80}` in autocert mode, `{<port>}` in `--insecure-listen :<port>` mode. Catches stray `net/http/pprof :6060` listeners, env-flipped debug ports, accidentally-enabled metrics exporters. Three exports from `internal/relay/listeners.go`: `ErrUnexpectedListener` sentinel (branchable via `errors.Is`; wrapped message names surplus + expected ports both ascending so the failure log is deterministic across `map`-iteration runs and grep-friendly), `ListenerPort(addr string) (uint16, error)` parsing `":443"` / `"127.0.0.1:8080"` / `"[::1]:443"` with an explicit reject of port 0 (the ephemeral-placeholder trap — would smuggle an unknown bound port past the actual-set construction), and `CheckListenerPorts(expected, actual map[uint16]struct{}) error` (pure set-difference). **Asymmetric by design**: surplus = error, missing = nil (a failure to bind an expected port surfaces as `ListenAndServe`'s own bind error at exit 1; duplicating the signal at boot would clutter logs). **Port-only**: interface binding (`127.0.0.1` vs `0.0.0.0`) intentionally out of scope on a single-instance internet-exposed deploy. **Reports all surplus in one error** so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. Wired into each listener branch separately in `cmd/pyrycode-relay/main.go` (different `http.Server` shapes; lifting a helper would invent surface area without a second consumer); structured log fields `unexpected_ports` + `expected_ports` recomputed in `main` via unexported `listenerPortLists` so `Check…` stays a single-error return. 
Paired with `TestBinaryDoesNotImportPprof` in `cmd/pyrycode-relay/deps_test.go` — shells out to `go list -deps -json`, catches the `import _ "net/http/pprof"` handler-registration variant that attaches `/debug/pprof/*` to `http.DefaultServeMux` *without* opening a new port (the runtime check would miss it). Belt-and-suspenders here means two different fabrics: a stochastic-ish runtime guard + a deterministic compile-time test; neither alone is complete. Exit 2 = config-rejected-at-boot, harmonised across the sentinel family (architect overrode the AC's literal `os.Exit(1)`) (#81).
- [Env-var config validator (boot-time refusal)](features/env-config-validator.md) — table-driven validation of every env var the relay reads at boot. Single source of truth is the unexported `envContracts []envContract` registry in `internal/relay/env_config.go`; each row carries `name`, `required` bool, and an inline `validate func(string) error`. `CheckEnvConfig(lookup func(string) (string, bool)) error` walks the registry and returns the structured `*ErrInvalidConfig{Key, Reason}` on the first failure (`Reason` is `"missing"` or `"malformed-value: <err>"`); the package-level sentinel `ErrInvalidConfigSentinel` is matched via a custom `Is` method (not `Unwrap`, which would double-print the message prefix) so `errors.Is(err, ErrInvalidConfigSentinel)` and `errors.As(err, &cfgErr)` form a dual contract. The `func(string) (string, bool)` (= `os.LookupEnv` shape) getter coexists with #77's `func(string) string` getter — the presence bit is necessary here to distinguish "missing-but-required" from "present-but-empty", semantically inert for `IsProductionMode`'s exact-`"1"` match. **Ordering is load-bearing**: wired in `main.go` BEFORE `CheckInsecureListenInProduction` so a typo like `PYRYCODE_RELAY_PRODUCTION=true` cannot slip through `IsProductionMode`'s silent-non-production fallback and reach the insecure-listen guard with an unvalidated value. Today's registry has one row (`PYRYCODE_RELAY_PRODUCTION`, optional-but-format-validated); future env-var reads register here at code-review time. `checkEnvConfigWith(lookup, contracts)` is the parameterised inner used by the `required: true` test case (today's production table has no required entries). Exit 2 = config-rejected-at-boot, matching the sibling refusals (#9, #77, #79) (#80).
- [Linux capability allowlist (boot-time refusal)](features/capability-allowlist.md) — relay parses `/proc/self/status`'s `CapEff:` hex mask at boot and refuses to start (exit 2) if any bit is set outside `AllowedCapabilities` (currently `{CAP_NET_BIND_SERVICE}` only, motivated by autocert binding `:80`/`:443` from uid 65532 in the distroless image). Exported sentinel `ErrUnexpectedCapability` is branchable via `errors.Is`; the wrapped error names every offending bit symbolically (`CAP_SYS_ADMIN (bit 21)` or `bit 63` for unknown), lists the allowlist contents, and embeds the operator fix string. `CapEff` only — `CapPrm/CapBnd/CapInh` would broaden false-positives (legitimate K8s default policy grants wide CapBnd) without adding load-bearing protection (relay never `capset(2)`s). Linux/non-Linux split at compile time via the new `_<goos>.go` / `_other.go` build-tag convention (see ADR-0009); non-Linux GOOS logs one skip line and returns nil. Unconditional — no production-mode gating, no env-var bypass, because stray capabilities are never legitimate. Reader-boundary test seam (`func() (string, error)`) exercises the parse + mask check end-to-end without touching real `/proc`. Joins the boot-time-refusal sentinel family (#9, #77, #79; future #78) (#79).
- [Production-mode contract & startup refusals](features/production-mode.md) — `PYRYCODE_RELAY_PRODUCTION=1` env-var contract (exact-string match, lazy read via injected getter, mirrors `PYRYCODE_RELAY_SINGLE_INSTANCE` shape from #64/#65) plus the boot-time checks that consume it. **#77** introduced `relay.CheckInsecureListenInProduction` + exported `ErrInsecureListenInProduction` sentinel (branchable via `errors.Is`) firing when production mode is on AND `--insecure-listen` is set. **#78** added the second consumer: `relay.CheckRunningAsRoot(geteuid, getenv)` + exported `ErrRunningAsRoot` sentinel firing when production mode is on AND `syscall.Geteuid() == 0`, closing the deploy-time gap (`docker run --user 0`, `securityContext.runAsUser: 0`, hand-edited Dockerfile dropping `USER`) that escapes the CI non-root-build contract (#32 Dockerfile, #68 Trivy). Both wired in `cmd/pyrycode-relay/main.go` after flag-parse with `os.Exit(2)` (config-rejected-at-boot, distinct from runtime-failure exit 1) and structured log fields: `env_var` carries the name only (never the value, even though `effective_uid` carries the kernel-supplied int — log-injection structurally impossible), one-line `fix` listing valid resolutions. `IsProductionMode` exported so siblings compose on the same predicate without re-reading the env var. Test seams: `func(string) string` for env, `func() int` for euid — both the smallest possible (no interface, no struct, no package-level var) and the only way to exercise the uid-0 branch in a unit test without re-execing the test binary as root. Two instances of the shape (#77, #78) now codify the "sibling boot-check" pattern; `Config.Validate()` consolidation deferred until ~5 checks exist (#77, #78).
- [Fly.io deploy](features/fly-deploy.md) — production host wiring: `fly.toml` declares TCP-passthrough on `:80`/`:443` (no Fly HTTP proxy, no Fly-managed certs) so TLS keeps terminating in the relay via autocert (#9), persistent Fly volume `relay_autocert` mounted at `/var/lib/relay/autocert`, and a single-machine hard cap encoded via `min_machines_running=1` + `auto_start_machines=false` + `auto_stop_machines="off"` + `[deploy] strategy="immediate"` (Fly Apps v2 has no `max_machines` key; the in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check from #65 is the backstop). CI `deploy` job in `.github/workflows/ci.yml` runs `flyctl deploy --remote-only` on push to `main`, gated by branch-condition + `needs: [test, security, image-scan]` + `permissions: contents: read` so `FLY_API_TOKEN` is structurally unreachable from PR code; `superfly/flyctl-actions/setup-flyctl` pinned by commit SHA with `# Tracks:` comment (same convention as #68 / #41). Dedicated IPv4 is required (not optional) for autocert's HTTP-01 challenge; TCP passthrough preserves the real socket peer IP that #34's rate limiter reads. `__REGION__` / `__DOMAIN__` ship as placeholders that fail loud on first deploy (#38).
- [Connection-count gauges](features/connection-count-gauges.md) — `pyrycode_relay_connected_binaries` and `pyrycode_relay_connected_phones` exposed via a pull-based `prometheus.Collector` reading `Registry.Counts()` on each scrape; zero edits to `registry.go`; scalar (no labels) by design — `{server="..."}` would carry the attacker-influenced `x-pyrycode-server` header onto the metrics surface, which threat-model § Log hygiene forbids; stale grace-expiry fires can't move the gauge because the pointer-identity guard (ADR-0006) keeps the maps unchanged and the gauge IS the map size; race-tested against 16 mutator goroutines + a tight-loop scraper under `-race`. First collector wired into the #59 seam (#61).
- [Metrics registry (scaffolding)](features/metrics-registry.md) — private `*prometheus.Registry` + `NewMetricsHandler` factory wrapping `promhttp.HandlerFor` (text format only; OpenMetrics off; `HandlerOpts.Registry: reg` keeps `promhttp_metric_handler_*` off `DefaultRegisterer`). Seam shape for siblings: per-concern collector struct in its own file, constructed by a helper taking `prometheus.Registerer` (no mega-struct, no package-level vars) — first instantiated by #61's `connectionsCollector`. Listener still pending (#60). Structural defence against default-registry leaks via `TestMetricsRegistry_NoGlobalRegistrarLeak` (#59).
- [Metrics registry (scaffolding)](features/metrics-registry.md) — private `*prometheus.Registry` + `NewMetricsHandler` factory wrapping `promhttp.HandlerFor` (text format only; OpenMetrics off; `HandlerOpts.Registry: reg` keeps `promhttp_metric_handler_*` off `DefaultRegisterer`). Seam shape for siblings: per-concern collector struct in its own file, constructed by a helper taking `prometheus.Registerer` (no mega-struct, no package-level vars) — first instantiated by #61's `connectionsCollector`. Listener landed in #60 (see [Metrics listener](features/metrics-listener.md)). Structural defence against default-registry leaks via `TestMetricsRegistry_NoGlobalRegistrarLeak` (#59).
- [Docker image](features/docker-image.md) — portable OCI artifact: multi-stage `Dockerfile` builds a fully-static binary (`CGO_ENABLED=0`, `-trimpath -s -w`) into `distroless/static-debian12:nonroot`; both base images digest-pinned with `# Tracks:` comments; exposes `:80`/`:443` and declares `/var/lib/relay/autocert` volume; host-specific wiring (TLS policy, ports, volumes, healthcheck) is #38's problem (#32). PR-time Trivy CVE scan against the just-built image lives in CI as the `image-scan` job, fails on **fixable** CRITICAL/HIGH only (`ignore-unfixed: true`), action pinned by commit SHA with `# Tracks: <tag>` comment mirroring the Dockerfile pin convention; intentional overlap with `govulncheck` (source-reachability vs. shipped-artifact) (#68). Both scanners are also re-run daily against `main` via `.github/workflows/security-scan.yml` (cron + `workflow_dispatch`) so disclosed CVEs against unchanged deps surface within ≤24h rather than staying invisible until the next bump (#72); a red cron run also opens a `security-sensitive`-labelled GitHub issue via the workflow's `file-issue` job (artifact-handoff privilege split keeps `issues: write` off the scanners and out of workflow scope; deterministic-title dedup via `gh issue list --search 'in:title …'`) so regressions land as tracked work-items rather than passive Actions rows (#73).
- [Binary-side frame forwarder](features/binary-forwarder.md) — per-binary read pump: unwraps each inbound routing envelope, linear-scans `PhonesFor(serverID)` for `env.ConnID`, writes `env.Frame` verbatim to that phone; opaque inner bytes; synchronous (handler discards the return); diverges from #25 in error policy — unknown `conn_id`, malformed envelope, phone `Send` error all log+continue (a single bad frame never tears down the binary); replaced `/v1/server`'s `CloseRead` placeholder (#26).
- [WebSocket heartbeat](features/heartbeat.md) — per-conn goroutine on both endpoints sends RFC 6455 ping every 30s; closes with `1011 "heartbeat timeout"` if no pong within 30s. Detects half-open TCP within 60s; ctx-cancel exit path leaves close to the handler defer (#7).
Expand Down
Loading