diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index 6fa5389..c7221b2 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -182,10 +182,20 @@ func run(args []string, sigCtx context.Context) int { if *insecureListen != "" { logger.Info("starting", "version", Version, "mode", "insecure", "listen", *insecureListen) + // ReadHeaderTimeout (5s) bounds the pre-upgrade WebSocket + // handshake window — the gap between TCP accept and full HTTP + // request-header receipt. After header parse, + // nhooyr.io/websocket.Accept hijacks the connection and + // ReadTimeout/WriteTimeout no longer apply; post-upgrade slow + // peers are covered by per-frame deadlines (#15) and heartbeat + // ping/pong (#7). Caps slow-loris exposure on the internet- + // exposed surface (docs/threat-model.md § DoS resistance). + // Lives at each wiring site per the policy-values-in-main + // convention (docs/PROJECT-MEMORY.md "Project-level conventions"). srv := &http.Server{ Addr: *insecureListen, Handler: mux, - ReadHeaderTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, ReadTimeout: 60 * time.Second, WriteTimeout: 60 * time.Second, IdleTimeout: 120 * time.Second, @@ -236,7 +246,7 @@ func run(args []string, sigCtx context.Context) int { Addr: ":443", Handler: relay.EnforceHost(*domain, mux), TLSConfig: relay.TLSConfig(mgr), - ReadHeaderTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, ReadTimeout: 60 * time.Second, WriteTimeout: 60 * time.Second, IdleTimeout: 120 * time.Second, @@ -248,7 +258,7 @@ func run(args []string, sigCtx context.Context) int { // would 302 GET/HEAD to HTTPS; the AC requires explicit 404 for // non-challenge traffic. Handler: mgr.HTTPHandler(http.NotFoundHandler()), - ReadHeaderTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, ReadTimeout: 60 * time.Second, WriteTimeout: 60 * time.Second, IdleTimeout: 120 * time.Second, diff --git a/docs/knowledge/codebase/35.md b/docs/knowledge/codebase/35.md new file mode 100644 index 0000000..bdaf91c --- /dev/null +++ b/docs/knowledge/codebase/35.md @@ -0,0 +1,59 @@ +# Ticket #35 — WS upgrade handshake timeout: cap pre-upgrade window at 5s + +`http.Server.ReadHeaderTimeout` was 10s at every relay listener. That is the only bound on a peer that opens TCP and dribbles WS-upgrade request headers byte-by-byte; `ReadTimeout` / `WriteTimeout` stop applying once `websocket.Accept` hijacks the connection, so the 10s window was the full slow-loris exposure on the internet-exposed surface. This ticket tightens the value to 5s at every wiring site — three `http.Server` literals in `cmd/pyrycode-relay/main.go` and one in `internal/relay/metrics_listen.go`'s `NewMetricsServer` — plus a behavioural test pinning the `net/http` contract the relay depends on. + +## Implementation + +- **Four wiring sites move together.** `ReadHeaderTimeout: 10 * time.Second` → `5 * time.Second` at: + - `cmd/pyrycode-relay/main.go:198` — insecure-listen `srv`. + - `cmd/pyrycode-relay/main.go:249` — autocert TLS `httpsSrv`. + - `cmd/pyrycode-relay/main.go:261` — autocert HTTP-01 `httpSrv`. + - `internal/relay/metrics_listen.go:90` — `NewMetricsServer` return literal. +- **Metrics listener moves with the public listeners on purpose.** `metrics_listen.go`'s function comment ("Timeouts match the public listener (cmd/pyrycode-relay/main.go) so the metrics surface has the same DoS-resistance shape … today they match because that is the safest default") makes the four-site invariant load-bearing. Loopback-only metrics traffic has no slow-loris threat, but leaving the value at 10s would silently break the invariant the comment encodes and force a future contributor to re-discover whether the divergence was intentional. +- **Documenting the bound at the wiring site.** A new comment paragraph above the first occurrence (insecure-listen `srv` in `main.go`) states the value, what it bounds (pre-upgrade window — the gap between TCP accept and full HTTP header receipt), the references to the threat model and the post-upgrade complements (#15 per-frame deadlines, #7 heartbeat), and the wiring-site convention from `docs/PROJECT-MEMORY.md`. The autocert-mode `httpsSrv` / `httpSrv` carry the literal without their own comment — they sit visibly next to the commented insecure-mode `srv` in the same file. `NewMetricsServer`'s existing function-level comment block already covers its site. +- **No factory, no shared `const`.** Each of the four sites keeps its own literal. The inline pattern is load-bearing for the wiring-site convention: an operator reading the constructor sees the policy value next to the listener that enforces it. Promoting to a shared symbol would gain DRY but lose locality — the rule the AC names ("policy values live at the wiring site") explicitly prefers locality. +- **Why 5s.** Inside the AC's "~5 seconds" band, 5s is the choice because (a) it is identical to nhooyr's per-conn close-handshake timeout already documented in `docs/lessons.md` and used as the basis for `drainDeadline` headroom — keeping the relay's slow-peer bounds at one consistent number reduces operator surprise; (b) it comfortably exceeds round-trip + header-write time on the realistic worst case (mobile peer over LTE, ~500ms-1s RTT, a few hundred bytes of headers), so legitimate clients are not at risk of intermittent boot failures; (c) 3s would be tighter but starts to feel like a tuning value, while 5s leaves observable margin without leaving the resource window meaningfully open. Not a tuning parameter for this ticket; should not be revisited unless an operational signal demands it. +- **No new code path, no new goroutines, no new logs, no new metrics.** `ReadHeaderTimeout` is enforced by `net/http`'s connection-state machine before the registered handler is invoked. When it fires, the underlying `net.Conn` is closed and no handler runs. No log line, no metric — matches today's behaviour at 10s. A peer that never sends a full request is indistinguishable from a peer that disconnected mid-write, and we already chose not to log either case. Visibility into pre-upgrade timeouts as a slow-loris indicator is deferred to a future metrics-pipeline ticket against `internal/relay/metrics_upgrade.go`. +- **Test (`internal/relay/slow_upgrade_test.go`, new, ~93 lines):** `TestHTTPServerReadHeaderTimeoutClosesSlowPeer` pins the `net/http` contract this ticket depends on. Constructs a bare `*http.Server` (not `httptest.NewServer`, which cannot set `ReadHeaderTimeout`) on a `net.Listen("tcp", "127.0.0.1:0")` listener, with `ReadHeaderTimeout: 150ms` to keep the test fast and other timeouts (`ReadTimeout` / `WriteTimeout` / `IdleTimeout`) deliberately generous so the test cannot accidentally pass by tripping the wrong timeout. A slow client `Dial`s, writes a partial request-header block (`"GET /v1/server HTTP/1.1\r\nHost: x\r\n"` — no terminating `\r\n\r\n`), then `Read`s with a deadline of `ReadHeaderTimeout + 500ms` slack. The server must close the connection: `Read` returns either `io.EOF` or a connection-reset-style `net.Error` whose `Timeout()` is false (the client-side deadline error is explicitly distinguished and fails the test loudly — "server did not close the connection"). An `atomic.Int64` guards the handler: if it ever increments, the test fails because that signals `ReadHeaderTimeout` did not fire before the handler ran. +- **What the test does and does not prove.** Does prove: the standard library's `ReadHeaderTimeout` contract — the thing the relay depends on — actually closes the connection at the configured deadline. If a future Go version regressed this, or if a future refactor accidentally zeroed the field, the test catches it. Does not prove: that `main.go` sets the value to 5s. That is verified by reading the diff (or `grep ReadHeaderTimeout cmd/pyrycode-relay/main.go internal/relay/metrics_listen.go`) at code review time, same as every other wiring-site literal (`drainDeadline`, `maxFrameBytes`, the rate-limit constants in `main.go`, the timeouts in `metrics_listen.go`). +- **`internal/relay/metrics_listen_test.go`** — one literal updated in the constructor matrix's pinned-timeout assertion (10s → 5s). The test pins the timeout values intentionally rather than reading from a shared symbol, so an accidental drift surfaces as a unit-test failure. + +## Acceptance criteria — verification map + +- **AC-1** (pre-upgrade stall closed within ~5s): `ReadHeaderTimeout: 5 * time.Second` at every wiring site. Behavioural test `TestHTTPServerReadHeaderTimeoutClosesSlowPeer` proves the contract at a short value; the production value is verified by code review at each literal. +- **AC-2** (applies on `/v1/server` and `/v1/client`): the bound lives on `http.Server`, before the per-route handler dispatch — it covers every path on the listener (`/healthz`, `/v1/server`, `/v1/client`) by construction. +- **AC-3** (applies in both `--insecure-listen` and `--domain` modes): three `http.Server` literals in `main.go` move together (insecure `srv`, autocert `httpsSrv`, autocert `httpSrv`); the metrics-listener literal moves with them. +- **AC-4** (focused slow-handshake test in `internal/relay`): `internal/relay/slow_upgrade_test.go` — bare `http.Server` on loopback, partial-header dial, `Read` returns within deadline + slack. +- **AC-5** (5s literal at the wiring site, not buried in handler internals): each of the four sites carries the literal directly; no helper, no shared `const`. Matches the policy-values-in-main convention documented in `docs/PROJECT-MEMORY.md`. +- **AC-6** (`make vet`, `make test -race`, `make build` clean): no new dependencies, no new exported symbols, no concurrency changes. + +## Patterns reinforced (not new) + +- **Policy values live at the wiring site.** Each of the three `http.Server` literals in `main.go` and the one in `metrics_listen.go` continues to carry its own value. The local-comment-+-literal shape (mirroring `drainDeadline`'s comment block at `main.go:25-31`) is the established way the codebase documents a wiring-site policy. +- **Behavioural test of a stdlib contract + code-review of the literal.** The combination is how this codebase already validates `drainDeadline` (#31), `maxFrameBytes` (#15), the rate-limit constants (#47), and the existing `metrics_listen.go` timeouts (#60). The slow-handshake test fits the same shape — it pins the `net/http` contract the production literal depends on, while the literal itself is verified at review. +- **Multi-site invariants need every site to move.** The metrics-listener literal moves with the public-listener literals because the function's own comment makes the symmetry load-bearing. Carry forward: when a comment says "this value matches site X for reason Y", changing site X mechanically obligates this site too — the alternative is silently invalidating the comment. + +## Lessons learned + +- **`ReadTimeout` does not cover the upgrade window the way intuition suggests.** It is tempting to read `ReadTimeout: 60s` as "the whole request must finish in 60s" — that would already bound slow-loris. But `nhooyr.io/websocket.Accept` calls `http.ResponseWriter`'s `Hijack`, and once a connection is hijacked, `net/http` stops applying `ReadTimeout` / `WriteTimeout`. So between TCP accept and the moment the upgrade completes, `ReadHeaderTimeout` is the only stdlib-level bound. The post-upgrade window is then handled by per-frame deadlines (#15) and heartbeat (#7), which are application-level concerns living above the hijacked socket. Three layers, three distinct surfaces — none of them subsumes the others. +- **A behavioural test that runs the production handler would be misleading here.** A natural first instinct is "spin up a real `relay.ServerHandler`, dial slowly, assert close" — but that path requires `websocket.Accept` against a real upgrade, and the bound this ticket tests fires *before* the upgrade even starts. Testing the production handler would exercise more code without exercising the deciding line. The bare-`http.Server` shape is the minimal coverage of the thing actually being defended. +- **Handler-never-runs is the right negative signal.** The `atomic.Int64` handler-call counter is small but load-bearing: if the test setup ever lets the handler run, the assertion ("server closed the connection") could still pass spuriously via a different code path (e.g. a handler-side `r.Body` read with its own deadline). Counting the handler invocations makes "the timeout fired before the handler ran" an explicit, testable claim rather than an implicit assumption. +- **A short test timeout exercises the same contract as the production one.** Tests do not need the production value (5s) to verify the production contract. `net/http`'s `ReadHeaderTimeout` enforcement is the same code path at any positive duration; using 150ms keeps the test under the parallel-tests budget without weakening the assertion. Same shape as `TestShutdown_DeadlineExpiryReturnsCtxErr` (#31), which uses a 50ms drain deadline against the helper's 10s production constant. +- **Non-EOF, non-timeout close errors are valid.** Different kernels surface a server-side close as `io.EOF`, `ECONNRESET`, `EPIPE`, or `read: connection reset by peer` depending on TCP race timing. The test treats EOF as the canonical signal and logs (but does not fail on) other non-timeout errors, which keeps it robust across darwin/linux/CI runners without weakening the assertion (`netErr.Timeout()` being false is the load-bearing predicate — anything else means the server, not the client, ended the read). + +## Cross-links + +- [Threat model § DoS resistance](../../threat-model.md) — updated to cite the new 5s value and to list all four `http.Server` instances (insecure, autocert TLS, autocert HTTP-01, metrics listener). +- [Feature: Autocert TLS](../features/autocert-tls.md) — server-timeouts table updated to 5s. +- [Feature: Metrics listener](../features/metrics-listener.md) — constructor timeout policy updated to 5s; function comment's "match the public listener" invariant preserved. +- [Codebase note #31](31.md) — `drainDeadline` is the established pattern for a policy literal documented at the wiring site; this ticket reuses the same shape. +- [Codebase note #15](15.md) — per-frame deadlines that cover the post-upgrade slow-peer window. +- [Codebase note #7](7.md) — heartbeat ping/pong that also covers the post-upgrade slow-peer window. +- [Codebase note #50](50.md) — per-IP attempt-rate cap; the layer above this one in the DoS-resistance stack. + +## Out of scope (named in spec, deferred) + +- Per-IP connection rate limiting — covered by #50. +- Post-upgrade slow peers — covered by #15 (per-frame deadlines) and #7 (heartbeat). +- A metric counting `ReadHeaderTimeout` firings — no operational signal currently demands it; future work belongs against `internal/relay/metrics_upgrade.go`. +- Tightening `ReadTimeout`, `WriteTimeout`, or `IdleTimeout` — those govern post-header request lifecycle and are not the slow-loris surface this ticket addresses. diff --git a/docs/knowledge/features/autocert-tls.md b/docs/knowledge/features/autocert-tls.md index 77dd43c..5fbc272 100644 --- a/docs/knowledge/features/autocert-tls.md +++ b/docs/knowledge/features/autocert-tls.md @@ -75,12 +75,12 @@ Both `httpsSrv` and `httpSrv` mirror the insecure path's timeouts: | Setting | Value | |---|---| -| `ReadHeaderTimeout` | 10s | +| `ReadHeaderTimeout` | 5s | | `ReadTimeout` | 60s | | `WriteTimeout` | 60s | | `IdleTimeout` | 120s | -`ReadHeaderTimeout` bounds slow-loris on both ports and keeps `gosec` G114 quiet. +`ReadHeaderTimeout` bounds slow-loris on the pre-upgrade window on both ports and keeps `gosec` G114 quiet. Tightened from 10s to 5s in #35; post-upgrade slow peers are covered by per-frame deadlines (#15) and heartbeat (#7). ## TLS version diff --git a/docs/knowledge/features/metrics-listener.md b/docs/knowledge/features/metrics-listener.md index d3bf9b3..7fcf16a 100644 --- a/docs/knowledge/features/metrics-listener.md +++ b/docs/knowledge/features/metrics-listener.md @@ -28,7 +28,7 @@ func NewMetricsServer(addr string, h http.Handler) (*http.Server, error) - `NewMetricsServer` is the opt-out-aware constructor: - `addr == ""` → `(nil, nil)`. The caller checks `srv != nil` before launching the goroutine or adding the port to the listener allowlist. - `CheckLoopbackBind(addr)` fails → `(nil, err)`. - - otherwise → `*http.Server` with the public listener's timeout policy (`ReadHeaderTimeout: 10s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`) — duplicated rather than shared so either listener can drift if a future ticket has reason; today they match because that is the safest default. + - otherwise → `*http.Server` with the public listener's timeout policy (`ReadHeaderTimeout: 5s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`) — duplicated rather than shared so either listener can drift if a future ticket has reason; today they match because that is the safest default. (`ReadHeaderTimeout` was 10s at first wiring; #35 tightened all four wiring sites to 5s together to keep the invariant the function's comment encodes.) The handler argument is `http.Handler`, not `*http.ServeMux`: the caller chooses whether to wire a bare `NewMetricsHandler(reg)` or wrap it in a mux. `cmd/pyrycode-relay/main.go` wraps in a `ServeMux` so `/metrics` is distinguishable from a future sibling path on the same listener. diff --git a/docs/specs/architecture/35-ws-upgrade-handshake-timeout.md b/docs/specs/architecture/35-ws-upgrade-handshake-timeout.md new file mode 100644 index 0000000..15702e2 --- /dev/null +++ b/docs/specs/architecture/35-ws-upgrade-handshake-timeout.md @@ -0,0 +1,124 @@ +# Spec: WS upgrade handshake timeout — bound slow-loris exposure (#35) + +## Files to read first + +- `cmd/pyrycode-relay/main.go:185-255` — the three `http.Server` literals (`insecure-listen`, `httpsSrv`, `httpSrv`). Each carries `ReadHeaderTimeout: 10 * time.Second` (lines 188, 239, 251). These are the wiring sites the AC references. +- `internal/relay/metrics_listen.go:60-95` — fourth `http.Server` literal (line 90). The function's own comment ("Timeouts match the public listener (cmd/pyrycode-relay/main.go) so the metrics surface has the same DoS-resistance shape … today they match because that is the safest default") makes a coordinated change across all four sites mandatory; leaving this one at 10s would silently break the invariant the comment encodes. +- `cmd/pyrycode-relay/main.go:25-31` — the `drainDeadline` const, shown here as the established pattern for a policy literal documented at the wiring site (small comment + constant in `main`). +- `docs/threat-model.md:32-36` — § "DoS resistance" cites the current `ReadHeaderTimeout: 10s` value verbatim. The documentation phase will update this; the developer does not touch it. +- `internal/relay/server_endpoint_test.go:18-30` and `internal/relay/heartbeat_test.go:41-55` — `httptest.NewServer` patterns. The new slow-handshake test does **not** use `httptest.NewServer` (it cannot set `ReadHeaderTimeout`); it constructs a bare `http.Server` on a `net.Listen("tcp", "127.0.0.1:0")` listener. Read these so the test stylistically matches the package: t.Helper, `t.Cleanup` for teardown, `defer srv.Close()` patterns. +- `docs/threat-model.md` (whole file, scan only) — confirms the threat model places slow-loris under "DoS resistance" and ties to `http.Server` timeout fields; the spec inherits that framing. + +## Context + +The relay is internet-exposed. Per the threat model (`docs/threat-model.md` § DoS resistance), slow-loris mitigation rests on the `http.Server` timeout fields. Today every `http.Server` instance the relay constructs carries `ReadHeaderTimeout: 10 * time.Second`. That window is long enough that a single peer dribbling upgrade-request bytes can pin a socket + serving goroutine for ten seconds with negligible bandwidth cost; multiplied across attempts that survive the per-IP rate limit (~10/min/IP from #50), it remains a meaningful resource amplifier. + +After header parse completes, `nhooyr.io/websocket.Accept` writes the 101 response and hijacks the connection. From that point on, `ReadTimeout`/`WriteTimeout` no longer apply — post-upgrade slow peers are covered by per-frame deadlines (#15) and heartbeat ping/pong (#7). This ticket bounds *only* the pre-upgrade window: the gap between TCP accept and full HTTP header receipt. + +The fix is to tighten `ReadHeaderTimeout` from 10s to 5s at every site. No new code path, no factory, no exported helper. + +## Design + +### Edit set (production code) + +Four literal-value changes, no structural changes: + +1. `cmd/pyrycode-relay/main.go:188` (insecure listener `srv`) — `ReadHeaderTimeout: 10 * time.Second` → `5 * time.Second`. +2. `cmd/pyrycode-relay/main.go:239` (autocert TLS `httpsSrv`) — same change. +3. `cmd/pyrycode-relay/main.go:251` (autocert HTTP-01 `httpSrv`) — same change. +4. `internal/relay/metrics_listen.go:90` (`NewMetricsServer` return literal) — same change. + +The metrics-listener change is required because `metrics_listen.go`'s function comment (lines 70-75) explicitly states the timeouts mirror the public listener as a "safest default"; tightening the public listeners without tightening this one silently violates the invariant the comment encodes. Loopback-only metrics traffic has no slow-loris threat, but the test surface and "duplicated, identical" rationale stay intact only if all four sites move together. + +### Documenting the bound + +Add a short comment above the first occurrence in `main.go` (the insecure-listen branch's `srv`) — single paragraph, in the same shape as the existing `drainDeadline` comment block at lines 25-31. Contents: + +- States the value (5s) and what it bounds (pre-upgrade WebSocket handshake window). +- Cites the threat model and the post-upgrade complements (#15, #7). +- Names the wiring-site convention (per `docs/PROJECT-MEMORY.md` "Project-level conventions"). + +Do **not** factor a shared `const` or helper. Each of the four sites continues to carry its own literal, exactly as today; the inline pattern is load-bearing for the wiring-site convention the AC names. The metrics-listen function already has its own comment explaining why the value is duplicated rather than imported — leave that comment as-is. + +The autocert-mode `httpsSrv` and `httpSrv` don't need a new comment; they are visibly identical to the insecure-mode `srv` they sit next to. The metrics file's existing comment block already covers its site. + +### Choice of 5s (not 3s, not 10s) + +The AC says "~5 seconds". Inside that band, 5s is chosen because: + +- It is identical to nhooyr's per-conn close-handshake timeout already documented in `docs/lessons.md` and used as the basis for `drainDeadline` headroom — keeping the relay's slow-peer bounds at one consistent number reduces operator surprise. +- It comfortably exceeds round-trip + header-write time on the realistic worst-case path (mobile peer over LTE, ~500ms RTT, a few hundred bytes of headers), so legitimate clients are not at risk of intermittent boot failures. +- 3s would be tighter but starts to feel like a tuning value; 5s leaves observable margin without leaving the resource window meaningfully open. + +The value is not a tuning parameter for this ticket and should not be revisited unless an operational signal demands it. + +## Concurrency model + +No change. The `http.Server` runs the same accept loop; `ReadHeaderTimeout` is enforced by `net/http`'s connection-state machine before the registered `http.Handler` is invoked. No new goroutines, no new channels, no new shutdown sequence interaction. The `relay.Shutdown` path (#31) is untouched. + +## Error handling + +When `ReadHeaderTimeout` fires, `net/http` closes the underlying `net.Conn`. No handler runs, no log line is emitted by relay code, no metric is incremented — this matches today's behaviour at 10s and is the correct floor: a peer that never sends a full request is indistinguishable from a peer that disconnected mid-write, and we already chose not to log either case (the existing 10s timeout produces no log line today and there is no signal that this is a gap worth filling). If a future ticket wants visibility into pre-upgrade timeouts as a slow-loris indicator, that is a metrics-pipeline ticket against `internal/relay/metrics_upgrade.go`, not this one. + +## Testing strategy + +Add one test file, `internal/relay/slow_upgrade_test.go` (new), one test function. Stylistically mirrors `internal/relay/heartbeat_test.go`. + +Test shape (bullet-pointed scenario, not a function body): + +- **Setup.** Construct a bare `http.Server` (not `httptest.NewServer`) with: + - `Handler`: any minimal `http.HandlerFunc` (e.g. one that writes 200 OK). The handler **should never run** in this test; if it does, the test fails because that means `ReadHeaderTimeout` did not fire. + - `ReadHeaderTimeout`: a short value (~150ms) to keep the test fast. The test verifies the `http.Server` *contract* — that `ReadHeaderTimeout` actually closes a peer that stalls header writes — at any value. The production literal (5s) is verified by code review, consistent with how every other inline policy literal in `main.go` is verified. + - Other timeouts (`ReadTimeout`, `WriteTimeout`, `IdleTimeout`): set generously (seconds) so the test cannot accidentally pass via the wrong timeout. +- **Listener.** `net.Listen("tcp", "127.0.0.1:0")`, then `go srv.Serve(ln)`. `t.Cleanup` closes the server. +- **Slow client.** `net.Dial("tcp", ln.Addr().String())`. Write `"GET /v1/server HTTP/1.1\r\nHost: x\r\n"` — note the deliberate missing `\r\n\r\n` terminator. Do not write more. +- **Assertion.** Within `ReadHeaderTimeout + slack` (e.g. 500ms total), a `conn.Read` on the client side returns either EOF or a connection-reset/closed error. Use `conn.SetReadDeadline(time.Now().Add(slack))` to bound the test's own wait. +- **Negative-shape check.** If the handler ever runs (use an atomic counter), fail with a clear message — that's the signal that the timeout did not fire and something is wrong with the test setup. + +The test sits in `package relay` (same-package test convention, per PROJECT-MEMORY.md). It does not exercise `nhooyr.io/websocket` at all — the bound being tested is purely at the `http.Server` layer, before the upgrade handshake even starts. Pulling websocket in would add complexity without adding coverage. + +### What this test does and does not prove + +- **Does** prove: the standard library's `ReadHeaderTimeout` contract — the thing the relay depends on — actually closes the connection at the configured deadline. If a future Go version regressed this, or if a future refactor accidentally zeroed the field, the test catches it. +- **Does not** prove: that `main.go` sets the value to 5s. That is verified by reading the diff (or `grep ReadHeaderTimeout cmd/pyrycode-relay/main.go internal/relay/metrics_listen.go`) at code review time, same as every other wiring-site literal. + +Combining "behavioural test of the contract" + "code-review verification of the literal" is the established way this codebase validates wiring-site policy values (see how `drainDeadline`, `maxFrameBytes`, the rate-limit constants in `main.go`, and the timeouts in `metrics_listen.go` are all guarded). + +## CI invariants + +`make vet`, `make test -race`, `make build` all clean — same as every other ticket. No new dependencies. No new exported symbols. + +## Documentation impact (not the developer's job) + +`docs/threat-model.md:32-36` currently cites `ReadHeaderTimeout: 10s`. The documentation phase will update this to `5s` and tweak the surrounding sentence. The developer does not modify `docs/threat-model.md`; the architect does not modify it either (it is in the read-only set for both phases). + +## Out of scope + +- Per-IP connection rate limiting — owned by #50. +- Post-upgrade slow peers (idle frames, dribbled payloads) — owned by #15 (per-frame deadlines) and #7 (heartbeat). +- Adding a metric for `ReadHeaderTimeout` firings — no operational signal currently demands it; see Error handling above. +- Tightening `ReadTimeout`, `WriteTimeout`, or `IdleTimeout` — those govern post-header request lifecycle and are not the slow-loris surface this ticket addresses. + +## Open questions + +None. + +## Security review + +**Verdict:** PASS + +**Findings:** + +- [Trust boundaries] No finding. The relevant boundary is TCP → `http.Server` header parser; `ReadHeaderTimeout` is enforced by `net/http` *before* any handler runs, so a slow-loris attempt never crosses into application code. Tightening the deadline narrows the time-budget for crossing the boundary without changing which code holds trusted vs untrusted data. +- [Tokens, secrets, credentials] No finding. The pre-upgrade window this ticket bounds carries no application credentials; `X-Pyrycode-Token` is parsed by the post-upgrade handlers (`/v1/server`, `/v1/client`) after `websocket.Accept`, outside the window. Shortening `ReadHeaderTimeout` from 10s → 5s does not change any token-handling path. +- [File operations] N/A — no filesystem code added or modified. Autocert cert-cache paths untouched. +- [Subprocess / external command execution] N/A — no subprocesses involved. +- [Cryptographic primitives] N/A — no crypto added or modified. `relay.TLSConfig(mgr)` is wired identically to today. +- [Network & I/O] No finding — this is the category the ticket is *for*. All four `http.Server` instances retain their full timeout set (`ReadHeaderTimeout`, `ReadTimeout`, `WriteTimeout`, `IdleTimeout`) and `MaxHeaderBytes` (Go default, 1 MiB) is unchanged. Legitimate-client risk: worst-case mobile RTT (~1-2s on saturated LTE) plus a few hundred bytes of upgrade headers fits well inside the new 5s bound; the residual margin (3-4s) is large relative to realistic header-write time, so the change does not introduce a flakiness vector for normal peers. Loopback metrics path is even further from the edge (microsecond-scale RTT). Combined coverage: this ticket bounds per-attempt time at the pre-upgrade layer; #50 bounds attempts/IP/minute; #15 and #7 bound post-upgrade slow peers. The three layers are independent and additive. +- [Error messages, logs, telemetry] No finding. `ReadHeaderTimeout` firing produces no log line, no metric — matches today's 10s behaviour. The visibility gap (no signal when a slow-loris probe is dropped) is explicitly OUT OF SCOPE: the spec's "Out of scope" section names it and points future work at `internal/relay/metrics_upgrade.go`. No information is leaked in the no-log case because nothing is logged. +- [Concurrency] No finding. No new goroutines, no new locks, no change to the `relay.Shutdown` sequence (#31). The connection-close path on timeout is standard-library-owned. +- [Threat model alignment] No finding for design. The spec correctly names `docs/threat-model.md` § DoS resistance as the cited reference; that doc currently quotes the 10s value verbatim and will be updated by the documentation phase. Post-upgrade slow-peer threats (#15, #7) and per-IP attempt-rate cap (#50) are explicitly named as out of scope with ticket refs. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-13 + diff --git a/docs/threat-model.md b/docs/threat-model.md index a75372b..357b7c7 100644 --- a/docs/threat-model.md +++ b/docs/threat-model.md @@ -33,7 +33,7 @@ Each threat below records four fields: **Severity:** `medium`. -**v1 mitigation:** Slow-loris is mitigated by the `http.Server` timeouts in `cmd/pyrycode-relay/main.go:53-95` (`ReadHeaderTimeout: 10s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`), applied identically to the autocert HTTP listener and the TLS listener. A per-IP token-bucket rate limit fronts the `/v1/server` and `/v1/client` upgrade paths (`cmd/pyrycode-relay/main.go` policy block, `internal/relay/ratelimit_middleware.go`): ~10 attempts/IP/minute steady-state, burst 20, eviction sweep every 5 minutes. Over-cap attempts receive `429 Too Many Requests` before `websocket.Accept` runs; the wrapped registry is never touched on a deny. The middleware extracts the source IP via `relay.ClientIP`; `X-Forwarded-For` is honoured only when the operator opts in with `--trust-x-forwarded-for` (default `false`). `/healthz` is intentionally unwrapped — it must remain pollable from monitoring without throttling. Bandwidth amplification is structurally absent: the relay forwards 1:1, so there is no amplification factor an attacker can lever. Connection-count caps (per-IP and global, distinct from the attempt-rate cap) are deferred. +**v1 mitigation:** Slow-loris is mitigated by the `http.Server` timeouts in `cmd/pyrycode-relay/main.go` (`ReadHeaderTimeout: 5s`, `ReadTimeout: 60s`, `WriteTimeout: 60s`, `IdleTimeout: 120s`), applied identically to the insecure listener, the autocert TLS listener, the autocert HTTP-01 listener, and the loopback metrics listener (`internal/relay/metrics_listen.go`). `ReadHeaderTimeout` bounds the pre-upgrade window — the gap between TCP accept and full HTTP header receipt — at 5s; after `websocket.Accept` hijacks the connection, post-upgrade slow peers are bounded by per-frame deadlines (#15) and the heartbeat ping/pong (#7). A per-IP token-bucket rate limit fronts the `/v1/server` and `/v1/client` upgrade paths (`cmd/pyrycode-relay/main.go` policy block, `internal/relay/ratelimit_middleware.go`): ~10 attempts/IP/minute steady-state, burst 20, eviction sweep every 5 minutes. Over-cap attempts receive `429 Too Many Requests` before `websocket.Accept` runs; the wrapped registry is never touched on a deny. The middleware extracts the source IP via `relay.ClientIP`; `X-Forwarded-For` is honoured only when the operator opts in with `--trust-x-forwarded-for` (default `false`). `/healthz` is intentionally unwrapped — it must remain pollable from monitoring without throttling. Bandwidth amplification is structurally absent: the relay forwards 1:1, so there is no amplification factor an attacker can lever. Connection-count caps (per-IP and global, distinct from the attempt-rate cap) are deferred. **Residual risk:** The rate limit caps the *attempt rate* per source IP but not the *concurrent connection count*: a slow attacker that opens connections at or below the bucket refill rate can still pin file descriptors and RAM. The single-instance limiter is per-process — a future multi-instance deployment behind L4 load balancing would need shared state to converge on a global limit. `X-Forwarded-For` is binary trust (all-or-nothing) — a CIDR-aware trusted-proxy chain is not implemented. diff --git a/internal/relay/metrics_listen.go b/internal/relay/metrics_listen.go index c72af76..34d9f20 100644 --- a/internal/relay/metrics_listen.go +++ b/internal/relay/metrics_listen.go @@ -87,7 +87,7 @@ func NewMetricsServer(addr string, h http.Handler) (*http.Server, error) { return &http.Server{ Addr: addr, Handler: h, - ReadHeaderTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, ReadTimeout: 60 * time.Second, WriteTimeout: 60 * time.Second, IdleTimeout: 120 * time.Second, diff --git a/internal/relay/metrics_listen_test.go b/internal/relay/metrics_listen_test.go index cdbfaeb..e04f396 100644 --- a/internal/relay/metrics_listen_test.go +++ b/internal/relay/metrics_listen_test.go @@ -102,7 +102,7 @@ func TestNewMetricsServer_Matrix(t *testing.T) { // listener and the public listener share a policy today but may // drift independently in a future ticket; assert on the values, // not on a shared constant. - if got, want := srv.ReadHeaderTimeout, 10*time.Second; got != want { + if got, want := srv.ReadHeaderTimeout, 5*time.Second; got != want { t.Errorf("ReadHeaderTimeout = %v; want %v", got, want) } if got, want := srv.ReadTimeout, 60*time.Second; got != want { diff --git a/internal/relay/slow_upgrade_test.go b/internal/relay/slow_upgrade_test.go new file mode 100644 index 0000000..a3a22e1 --- /dev/null +++ b/internal/relay/slow_upgrade_test.go @@ -0,0 +1,93 @@ +package relay + +import ( + "errors" + "io" + "net" + "net/http" + "sync/atomic" + "testing" + "time" +) + +// TestHTTPServerReadHeaderTimeoutClosesSlowPeer verifies the standard +// library contract this ticket (#35) depends on: an http.Server with a +// finite ReadHeaderTimeout closes a peer that opens TCP and dribbles +// (or never completes) the request-header block. The production +// literal (5s) is verified by code review at each wiring site +// (cmd/pyrycode-relay/main.go and internal/relay/metrics_listen.go); +// this test pins the behavioural contract at a short value so a future +// Go regression or accidental zeroing of the field would fail CI. +func TestHTTPServerReadHeaderTimeoutClosesSlowPeer(t *testing.T) { + t.Parallel() + + const readHeaderTimeout = 150 * time.Millisecond + + var handlerCalls atomic.Int64 + srv := &http.Server{ + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handlerCalls.Add(1) + w.WriteHeader(http.StatusOK) + }), + ReadHeaderTimeout: readHeaderTimeout, + // Other timeouts deliberately generous so the test cannot + // accidentally pass by tripping the wrong timeout. + ReadTimeout: 5 * time.Second, + WriteTimeout: 5 * time.Second, + IdleTimeout: 5 * time.Second, + } + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("net.Listen: %v", err) + } + t.Cleanup(func() { _ = srv.Close() }) + + go func() { + _ = srv.Serve(ln) + }() + + conn, err := net.Dial("tcp", ln.Addr().String()) + if err != nil { + t.Fatalf("net.Dial: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + // Write a partial request-header block: opening line + Host header, + // but deliberately NO terminating "\r\n\r\n". The server is now + // waiting for the rest of the headers; ReadHeaderTimeout should fire. + if _, err := conn.Write([]byte("GET /v1/server HTTP/1.1\r\nHost: x\r\n")); err != nil { + t.Fatalf("conn.Write: %v", err) + } + + // Bound our wait at ReadHeaderTimeout + slack. A correctly-behaving + // http.Server closes the connection at the deadline; our Read returns + // EOF (or a reset-style error on some kernels). + slack := 500 * time.Millisecond + if err := conn.SetReadDeadline(time.Now().Add(readHeaderTimeout + slack)); err != nil { + t.Fatalf("conn.SetReadDeadline: %v", err) + } + + buf := make([]byte, 64) + start := time.Now() + n, err := conn.Read(buf) + elapsed := time.Since(start) + if err == nil { + t.Fatalf("conn.Read returned (n=%d, err=nil) for slow peer; want connection closed by server", n) + } + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + t.Fatalf("conn.Read hit client-side deadline after %v (timeout=%v + slack=%v); server did not close the connection", + elapsed, readHeaderTimeout, slack) + } + if !errors.Is(err, io.EOF) { + // Non-EOF, non-timeout errors (e.g. ECONNRESET) are also a valid + // signal that the server closed the connection. Log for context + // but do not fail. + t.Logf("conn.Read closed with non-EOF error after %v: %v", elapsed, err) + } + + if got := handlerCalls.Load(); got != 0 { + t.Fatalf("handler was invoked %d time(s); ReadHeaderTimeout did not fire before the handler ran", got) + } +}