diff --git a/.openapi-generator-ignore b/.openapi-generator-ignore index 94ee40c..3f045e6 100644 --- a/.openapi-generator-ignore +++ b/.openapi-generator-ignore @@ -7,10 +7,12 @@ setup.py # at the package root (outside hotdata/api and hotdata/models) so the generator # never emits or overwrites them, but they are listed here as the source of # truth for "hand-maintained, don't touch": _auth.py (JWT exchange), arrow.py -# (Arrow IPC result fetch), query.py (429 retry + truncation auto-follow, #688). +# (Arrow IPC result fetch), query.py (429 retry + truncation auto-follow, #688), +# _retry.py (pre-response connection-reset retry on all methods, #118). hotdata/_auth.py hotdata/arrow.py hotdata/query.py +hotdata/_retry.py # Hand-written test for the patched ApiClient.close()/context-manager behavior # (re-applied by scripts/patch_api_client_close.py). It lives in the generated diff --git a/.openapi-generator-templates/configuration.mustache b/.openapi-generator-templates/configuration.mustache index 0b6ac91..8bf64be 100644 --- a/.openapi-generator-templates/configuration.mustache +++ b/.openapi-generator-templates/configuration.mustache @@ -433,6 +433,13 @@ conf = {{packageName}}.Configuration( self.safe_chars_for_path_param = '' """Safe chars for path_param """ + # Default to a retry policy that transparently retries pre-response + # connection resets (stale pooled keep-alive connections) on every + # method, including POST (#118). Passing an explicit `retries` (int or + # urllib3.Retry) overrides it entirely. + if retries is None: + from {{packageName}}._retry import default_retry + retries = default_retry() self.retries = retries """Retry configuration """ diff --git a/CHANGELOG.md b/CHANGELOG.md index 959656a..294c4a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- `Configuration` now defaults to a retry policy that transparently retries + pre-response connection resets (stale pooled keep-alive connections, e.g. + `ProtocolError('Connection aborted.', ConnectionResetError)`) on **every** + method, including `POST`. Such a reset happens before the request reaches the + server, so retrying on a fresh connection cannot double-execute. Read timeouts + and status retries stay idempotent-only. Pass an explicit `retries` to + override (#118). + ## [0.4.1] - 2026-06-17 ### Changed diff --git a/hotdata/_retry.py b/hotdata/_retry.py new file mode 100644 index 0000000..5af8790 --- /dev/null +++ b/hotdata/_retry.py @@ -0,0 +1,112 @@ +"""Transparent retry of pre-response connection resets (#118). + +A pooled keep-alive connection can be reused after an intermediary (load +balancer / reverse proxy) has already dropped it for exceeding its idle +timeout. The client is not notified, so the next request that reuses that +socket fails immediately with:: + + urllib3.exceptions.ProtocolError: + ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')) + +urllib3 classifies *every* :class:`~urllib3.exceptions.ProtocolError` as a +*read* error (see ``Retry._is_read_error``) — it conservatively assumes the +server may have begun processing the request. Read retries are gated by +``allowed_methods``, which does not include ``POST``, so a ``POST`` that hits a +dead pooled connection is re-raised to the caller instead of being retried. + +But a reset that happens *before any response bytes arrive* means the request +never reached the server — the connection was already gone when urllib3 tried +to send. That is safe to retry on **any** method, ``POST`` included, because the +server did no work and a retry cannot double-execute. + +:class:`ConnectionResetRetry` reclassifies exactly that case — a +``ProtocolError`` whose underlying cause is a connection-level ``OSError`` — as +a *connection* error. urllib3's ``Retry.increment`` handles connection errors +before read errors and without the ``allowed_methods`` gate, so the reset is +retried on a fresh connection regardless of method. Read **timeouts** +(:class:`~urllib3.exceptions.ReadTimeoutError`) and status retries are left +untouched and stay idempotent-only, so a ``POST`` that may have reached the +server is never blindly replayed. +""" + +from __future__ import annotations + +from urllib3.exceptions import ProtocolError, ProxyError +from urllib3.util.retry import Retry + +# Bounded attempts for the transparent connection-reset retry. Stale-pool resets +# clear on the first fresh connection, so a small ceiling is plenty; it also +# bounds the blast radius if a host is genuinely refusing connections. +DEFAULT_TOTAL_RETRIES = 3 + +# A small exponential backoff between attempts. The first retry on a fresh +# connection almost always succeeds, so the delay stays sub-second; it exists +# only to avoid hammering a host that is briefly flapping. +DEFAULT_BACKOFF_FACTOR = 0.1 + + +def _is_pre_response_connection_reset(error: BaseException) -> bool: + """True for a connection reset/abort that occurred before any response. + + urllib3 wraps a low-level socket failure raised while sending the request as + ``ProtocolError("Connection aborted.", )``, where ```` is the + originating :class:`OSError` (``ConnectionResetError``, + ``ConnectionAbortedError``, ``BrokenPipeError``, …). Those builtin + connection errors all subclass :class:`ConnectionError`, so the cause's type + is a precise, message-independent signal that the failure was at the socket + layer before a response existed — distinct from a read **timeout**, which + urllib3 raises as a ``ReadTimeoutError`` (not a ``ProtocolError``) and which + we deliberately leave method-gated. + """ + if isinstance(error, ProxyError): + error = error.original_error + if not isinstance(error, ProtocolError): + return False + cause = error.args[1] if len(error.args) > 1 else None + return isinstance(cause, ConnectionError) + + +class ConnectionResetRetry(Retry): + """A :class:`urllib3.util.retry.Retry` that retries pre-response connection + resets on any HTTP method, while leaving read/status retries idempotent-only. + + Behaves exactly like ``Retry`` except a connection reset/abort that happened + before any response bytes were received (see + :func:`_is_pre_response_connection_reset`) is treated as a *connection* + error rather than a *read* error. urllib3's ``increment`` retries connection + errors without consulting ``allowed_methods``, so the reset is retried on a + fresh connection for ``POST`` too — safe because the server did no work. + """ + + def _is_connection_error(self, err: Exception) -> bool: + if super()._is_connection_error(err): + return True + return _is_pre_response_connection_reset(err) + + +def default_retry() -> ConnectionResetRetry: + """The SDK's default retry policy. + + Matches urllib3's defaults (idempotent-only read/status retries, a bounded + total, no retry on response status codes) and adds transparent retry of + pre-response connection resets on every method. ``Configuration`` installs + this when the caller does not supply their own ``retries``. + """ + return ConnectionResetRetry( + total=DEFAULT_TOTAL_RETRIES, + backoff_factor=DEFAULT_BACKOFF_FACTOR, + # Don't retry on response status codes by default: a status code means + # the request reached the server, and 429 admission-shedding already has + # dedicated handling in hotdata.query. Connection-reset retry is purely a + # transport-layer concern. + status=0, + raise_on_status=False, + ) + + +__all__ = [ + "ConnectionResetRetry", + "DEFAULT_BACKOFF_FACTOR", + "DEFAULT_TOTAL_RETRIES", + "default_retry", +] diff --git a/hotdata/configuration.py b/hotdata/configuration.py index 1759006..6e7f789 100644 --- a/hotdata/configuration.py +++ b/hotdata/configuration.py @@ -321,6 +321,13 @@ def __init__( self.safe_chars_for_path_param = '' """Safe chars for path_param """ + # Default to a retry policy that transparently retries pre-response + # connection resets (stale pooled keep-alive connections) on every + # method, including POST (#118). Passing an explicit `retries` (int or + # urllib3.Retry) overrides it entirely. + if retries is None: + from hotdata._retry import default_retry + retries = default_retry() self.retries = retries """Retry configuration """ diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..0bae58d --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,210 @@ +"""Tests for transparent pre-response connection-reset retry (#118). + +A pooled keep-alive connection reused after an intermediary has dropped it for +idle-timeout fails immediately with:: + + ProtocolError('Connection aborted.', ConnectionResetError(54, 'reset by peer')) + +before the request reaches the server. That is safe to retry on any method, +including POST, because the server did no work. These tests pin three things: + +* the reset is retried on a fresh connection regardless of method (the fix); +* read **timeouts** stay idempotent-only — a POST that may have reached the + server is never blindly replayed; +* ``Configuration`` installs this policy by default but an explicit ``retries`` + still overrides it. + +The end-to-end tests drive urllib3's real ``HTTPConnectionPool.urlopen`` retry +machinery (the layer below ``RESTClientObject``, where the retry actually +happens) with ``_make_request`` stubbed to fail once then succeed. +""" + +from __future__ import annotations + +from unittest import mock + +import pytest +from urllib3 import HTTPConnectionPool, HTTPResponse +from urllib3.exceptions import ( + ConnectTimeoutError, + MaxRetryError, + ProtocolError, + ProxyError, + ReadTimeoutError, +) +from urllib3.util.retry import Retry + +from hotdata._retry import ( + ConnectionResetRetry, + _is_pre_response_connection_reset, + default_retry, +) +from hotdata.configuration import Configuration + + +def _reset_error() -> ProtocolError: + """The exact error urllib3 raises for a reset on a stale pooled socket.""" + return ProtocolError( + "Connection aborted.", ConnectionResetError(54, "Connection reset by peer") + ) + + +def _read_timeout() -> ReadTimeoutError: + """A read timeout — raised after the request was sent, so retrying it on a + non-idempotent method risks double execution.""" + pool = HTTPConnectionPool("example.invalid", 80) + return ReadTimeoutError(pool, "/v1/query", "read timed out") + + +def _drive(method: str, error: Exception, retry: Retry, *, fail_times: int = 1): + """Run ``method`` through a real urllib3 pool whose ``_make_request`` raises + ``error`` for the first ``fail_times`` attempts, then returns 200. + + Returns ``(result_or_exception, make_request_call_count)``. ``_get_conn`` / + ``_put_conn`` are stubbed so no socket is opened; only the retry control flow + is exercised. + """ + pool = HTTPConnectionPool("example.invalid", 80) + calls = {"n": 0} + + def fake_make_request(conn, m, url, **kw): + calls["n"] += 1 + if calls["n"] <= fail_times: + raise error + return HTTPResponse( + body=b"", status=200, headers={}, preload_content=False, request_method=m + ) + + with mock.patch.object(pool, "_get_conn", return_value=mock.Mock()), \ + mock.patch.object(pool, "_put_conn"), \ + mock.patch.object(pool, "_make_request", side_effect=fake_make_request): + try: + resp = pool.urlopen(method, "/v1/query", retries=retry, release_conn=False) + return resp, calls["n"] + except Exception as exc: # noqa: BLE001 - the test inspects the type + return exc, calls["n"] + + +# -- the bug: urllib3's default reraises a POST reset ---------------------------- + + +def test_stock_urllib3_retry_does_not_retry_post_reset(): + """Reproduces #118: the stock policy retries GET but reraises POST.""" + get_result, get_calls = _drive("GET", _reset_error(), Retry(total=3)) + assert isinstance(get_result, HTTPResponse) and get_calls == 2 + + post_result, post_calls = _drive("POST", _reset_error(), Retry(total=3)) + assert isinstance(post_result, ProtocolError) + assert post_calls == 1 # never retried + + +# -- the fix: pre-response resets retry on every method -------------------------- + + +@pytest.mark.parametrize("method", ["POST", "PUT", "PATCH", "DELETE", "GET"]) +def test_connection_reset_retried_on_all_methods(method): + result, calls = _drive(method, _reset_error(), default_retry()) + assert isinstance(result, HTTPResponse) and result.status == 200 + assert calls == 2 # failed once, succeeded on the fresh connection + + +def test_reset_retry_is_bounded_and_eventually_raises(): + """The reset retry is bounded by ``total``; once exhausted it surfaces.""" + retry = ConnectionResetRetry(total=2, backoff_factor=0) + result, calls = _drive("POST", _reset_error(), retry, fail_times=99) + assert isinstance(result, MaxRetryError) + assert calls == 3 # initial attempt + 2 retries + + +# -- read/status retries stay idempotent-only ------------------------------------ + + +def test_read_timeout_not_retried_on_post(): + """A read timeout may mean the server began processing — POST must not + replay it, so the idempotent-only gate is preserved.""" + result, calls = _drive("POST", _read_timeout(), default_retry()) + assert isinstance(result, ReadTimeoutError) + assert calls == 1 + + +def test_read_timeout_still_retried_on_get(): + result, calls = _drive("GET", _read_timeout(), default_retry()) + assert isinstance(result, HTTPResponse) and calls == 2 + + +def test_default_retry_does_not_retry_status_codes(): + """A 503 means the request reached the server; the default does not retry + status codes (429 has dedicated handling in hotdata.query).""" + assert default_retry().status == 0 + assert default_retry().status_forcelist in (None, frozenset(), set()) + + +# -- the pre-response detector --------------------------------------------------- + + +@pytest.mark.parametrize( + "cause", + [ + ConnectionResetError(54, "reset by peer"), + ConnectionAbortedError(53, "aborted"), + BrokenPipeError(32, "broken pipe"), + ], +) +def test_detector_matches_connection_level_causes(cause): + assert _is_pre_response_connection_reset(ProtocolError("Connection aborted.", cause)) + + +def test_detector_unwraps_proxy_error(): + inner = ProtocolError("Connection aborted.", ConnectionResetError(54, "reset")) + assert _is_pre_response_connection_reset(ProxyError("proxy", inner)) + + +@pytest.mark.parametrize( + "error", + [ + ProtocolError("Response ended prematurely"), # no wrapped cause + ProtocolError("boom", ValueError("not a socket error")), + ValueError("unrelated"), + ], +) +def test_detector_rejects_non_connection_errors(error): + assert not _is_pre_response_connection_reset(error) + + +def test_classifier_treats_reset_as_connection_error(): + retry = ConnectionResetRetry(total=3) + # Reclassified: a reset is a connection error (not method-gated)... + assert retry._is_connection_error(_reset_error()) + # ...while a connect timeout is still a connection error (super's behavior)... + assert retry._is_connection_error(ConnectTimeoutError()) + # ...and a read timeout is not. + assert not retry._is_connection_error(_read_timeout()) + + +# -- Configuration wiring -------------------------------------------------------- + + +def test_configuration_installs_reset_retry_by_default(): + cfg = Configuration() + assert isinstance(cfg.retries, ConnectionResetRetry) + + +def test_explicit_retries_overrides_default(): + cfg = Configuration(retries=0) + assert cfg.retries == 0 + + custom = Retry(total=7) + cfg2 = Configuration(retries=custom) + assert cfg2.retries is custom + + +def test_default_retry_flows_into_rest_pool_args(): + """The installed policy must actually reach urllib3's pool manager.""" + from hotdata.rest import RESTClientObject + + cfg = Configuration() + with mock.patch("urllib3.PoolManager") as pm: + RESTClientObject(cfg) + _, kwargs = pm.call_args + assert kwargs["retries"] is cfg.retries + assert isinstance(kwargs["retries"], ConnectionResetRetry)