Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .openapi-generator-ignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ setup.py
# at the package root (outside hotdata/api and hotdata/models) so the generator
# never emits or overwrites them, but they are listed here as the source of
# truth for "hand-maintained, don't touch": _auth.py (JWT exchange), arrow.py
# (Arrow IPC result fetch), query.py (429 retry + truncation auto-follow, #688).
# (Arrow IPC result fetch), query.py (429 retry + truncation auto-follow, #688),
# _retry.py (pre-response connection-reset retry on all methods, #118).
hotdata/_auth.py
hotdata/arrow.py
hotdata/query.py
hotdata/_retry.py

# Hand-written test for the patched ApiClient.close()/context-manager behavior
# (re-applied by scripts/patch_api_client_close.py). It lives in the generated
Expand Down
7 changes: 7 additions & 0 deletions .openapi-generator-templates/configuration.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,13 @@ conf = {{packageName}}.Configuration(
self.safe_chars_for_path_param = ''
"""Safe chars for path_param
"""
# Default to a retry policy that transparently retries pre-response
# connection resets (stale pooled keep-alive connections) on every
# method, including POST (#118). Passing an explicit `retries` (int or
# urllib3.Retry) overrides it entirely.
if retries is None:
from {{packageName}}._retry import default_retry
retries = default_retry()
self.retries = retries
"""Retry configuration
"""
Expand Down
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- `Configuration` now defaults to a retry policy that transparently retries
pre-response connection resets (stale pooled keep-alive connections, e.g.
`ProtocolError('Connection aborted.', ConnectionResetError)`) on **every**
method, including `POST`. Such a reset happens before the request reaches the
server, so retrying on a fresh connection cannot double-execute. Read timeouts
and status retries stay idempotent-only. Pass an explicit `retries` to
override (#118).

## [0.4.1] - 2026-06-17

### Changed
Expand Down
112 changes: 112 additions & 0 deletions hotdata/_retry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Transparent retry of pre-response connection resets (#118).

A pooled keep-alive connection can be reused after an intermediary (load
balancer / reverse proxy) has already dropped it for exceeding its idle
timeout. The client is not notified, so the next request that reuses that
socket fails immediately with::

urllib3.exceptions.ProtocolError:
('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

urllib3 classifies *every* :class:`~urllib3.exceptions.ProtocolError` as a
*read* error (see ``Retry._is_read_error``) — it conservatively assumes the
server may have begun processing the request. Read retries are gated by
``allowed_methods``, which does not include ``POST``, so a ``POST`` that hits a
dead pooled connection is re-raised to the caller instead of being retried.

But a reset that happens *before any response bytes arrive* means the request
never reached the server — the connection was already gone when urllib3 tried
to send. That is safe to retry on **any** method, ``POST`` included, because the
server did no work and a retry cannot double-execute.

:class:`ConnectionResetRetry` reclassifies exactly that case — a
``ProtocolError`` whose underlying cause is a connection-level ``OSError`` — as
a *connection* error. urllib3's ``Retry.increment`` handles connection errors
before read errors and without the ``allowed_methods`` gate, so the reset is
retried on a fresh connection regardless of method. Read **timeouts**
(:class:`~urllib3.exceptions.ReadTimeoutError`) and status retries are left
untouched and stay idempotent-only, so a ``POST`` that may have reached the
server is never blindly replayed.
"""

from __future__ import annotations

from urllib3.exceptions import ProtocolError, ProxyError
from urllib3.util.retry import Retry

# Bounded attempts for the transparent connection-reset retry. Stale-pool resets
# clear on the first fresh connection, so a small ceiling is plenty; it also
# bounds the blast radius if a host is genuinely refusing connections.
DEFAULT_TOTAL_RETRIES = 3

# A small exponential backoff between attempts. The first retry on a fresh
# connection almost always succeeds, so the delay stays sub-second; it exists
# only to avoid hammering a host that is briefly flapping.
DEFAULT_BACKOFF_FACTOR = 0.1


def _is_pre_response_connection_reset(error: BaseException) -> bool:
"""True for a connection reset/abort that occurred before any response.

urllib3 wraps a low-level socket failure raised while sending the request as
``ProtocolError("Connection aborted.", <cause>)``, where ``<cause>`` is the
originating :class:`OSError` (``ConnectionResetError``,
``ConnectionAbortedError``, ``BrokenPipeError``, …). Those builtin
connection errors all subclass :class:`ConnectionError`, so the cause's type
is a precise, message-independent signal that the failure was at the socket
layer before a response existed — distinct from a read **timeout**, which
urllib3 raises as a ``ReadTimeoutError`` (not a ``ProtocolError``) and which
we deliberately leave method-gated.
"""
if isinstance(error, ProxyError):
error = error.original_error
if not isinstance(error, ProtocolError):
return False
cause = error.args[1] if len(error.args) > 1 else None
return isinstance(cause, ConnectionError)


class ConnectionResetRetry(Retry):
"""A :class:`urllib3.util.retry.Retry` that retries pre-response connection
resets on any HTTP method, while leaving read/status retries idempotent-only.

Behaves exactly like ``Retry`` except a connection reset/abort that happened
before any response bytes were received (see
:func:`_is_pre_response_connection_reset`) is treated as a *connection*
error rather than a *read* error. urllib3's ``increment`` retries connection
errors without consulting ``allowed_methods``, so the reset is retried on a
fresh connection for ``POST`` too — safe because the server did no work.
"""

def _is_connection_error(self, err: Exception) -> bool:
if super()._is_connection_error(err):
return True
return _is_pre_response_connection_reset(err)


def default_retry() -> ConnectionResetRetry:
"""The SDK's default retry policy.

Matches urllib3's defaults (idempotent-only read/status retries, a bounded
total, no retry on response status codes) and adds transparent retry of
pre-response connection resets on every method. ``Configuration`` installs
this when the caller does not supply their own ``retries``.
"""
return ConnectionResetRetry(
total=DEFAULT_TOTAL_RETRIES,
backoff_factor=DEFAULT_BACKOFF_FACTOR,
# Don't retry on response status codes by default: a status code means
# the request reached the server, and 429 admission-shedding already has
# dedicated handling in hotdata.query. Connection-reset retry is purely a
# transport-layer concern.
status=0,
raise_on_status=False,
)


__all__ = [
"ConnectionResetRetry",
"DEFAULT_BACKOFF_FACTOR",
"DEFAULT_TOTAL_RETRIES",
"default_retry",
]
7 changes: 7 additions & 0 deletions hotdata/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,13 @@ def __init__(
self.safe_chars_for_path_param = ''
"""Safe chars for path_param
"""
# Default to a retry policy that transparently retries pre-response
# connection resets (stale pooled keep-alive connections) on every
# method, including POST (#118). Passing an explicit `retries` (int or
# urllib3.Retry) overrides it entirely.
if retries is None:
from hotdata._retry import default_retry
retries = default_retry()
self.retries = retries
"""Retry configuration
"""
Expand Down
210 changes: 210 additions & 0 deletions tests/test_retry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""Tests for transparent pre-response connection-reset retry (#118).

A pooled keep-alive connection reused after an intermediary has dropped it for
idle-timeout fails immediately with::

ProtocolError('Connection aborted.', ConnectionResetError(54, 'reset by peer'))

before the request reaches the server. That is safe to retry on any method,
including POST, because the server did no work. These tests pin three things:

* the reset is retried on a fresh connection regardless of method (the fix);
* read **timeouts** stay idempotent-only — a POST that may have reached the
server is never blindly replayed;
* ``Configuration`` installs this policy by default but an explicit ``retries``
still overrides it.

The end-to-end tests drive urllib3's real ``HTTPConnectionPool.urlopen`` retry
machinery (the layer below ``RESTClientObject``, where the retry actually
happens) with ``_make_request`` stubbed to fail once then succeed.
"""

from __future__ import annotations

from unittest import mock

import pytest
from urllib3 import HTTPConnectionPool, HTTPResponse
from urllib3.exceptions import (
ConnectTimeoutError,
MaxRetryError,
ProtocolError,
ProxyError,
ReadTimeoutError,
)
from urllib3.util.retry import Retry

from hotdata._retry import (
ConnectionResetRetry,
_is_pre_response_connection_reset,
default_retry,
)
from hotdata.configuration import Configuration


def _reset_error() -> ProtocolError:
"""The exact error urllib3 raises for a reset on a stale pooled socket."""
return ProtocolError(
"Connection aborted.", ConnectionResetError(54, "Connection reset by peer")
)


def _read_timeout() -> ReadTimeoutError:
"""A read timeout — raised after the request was sent, so retrying it on a
non-idempotent method risks double execution."""
pool = HTTPConnectionPool("example.invalid", 80)
return ReadTimeoutError(pool, "/v1/query", "read timed out")


def _drive(method: str, error: Exception, retry: Retry, *, fail_times: int = 1):
"""Run ``method`` through a real urllib3 pool whose ``_make_request`` raises
``error`` for the first ``fail_times`` attempts, then returns 200.

Returns ``(result_or_exception, make_request_call_count)``. ``_get_conn`` /
``_put_conn`` are stubbed so no socket is opened; only the retry control flow
is exercised.
"""
pool = HTTPConnectionPool("example.invalid", 80)
calls = {"n": 0}

def fake_make_request(conn, m, url, **kw):
calls["n"] += 1
if calls["n"] <= fail_times:
raise error
return HTTPResponse(
body=b"", status=200, headers={}, preload_content=False, request_method=m
)

with mock.patch.object(pool, "_get_conn", return_value=mock.Mock()), \
mock.patch.object(pool, "_put_conn"), \
mock.patch.object(pool, "_make_request", side_effect=fake_make_request):
try:
resp = pool.urlopen(method, "/v1/query", retries=retry, release_conn=False)
return resp, calls["n"]
except Exception as exc: # noqa: BLE001 - the test inspects the type
return exc, calls["n"]


# -- the bug: urllib3's default reraises a POST reset ----------------------------


def test_stock_urllib3_retry_does_not_retry_post_reset():
"""Reproduces #118: the stock policy retries GET but reraises POST."""
get_result, get_calls = _drive("GET", _reset_error(), Retry(total=3))
assert isinstance(get_result, HTTPResponse) and get_calls == 2

post_result, post_calls = _drive("POST", _reset_error(), Retry(total=3))
assert isinstance(post_result, ProtocolError)
assert post_calls == 1 # never retried


# -- the fix: pre-response resets retry on every method --------------------------


@pytest.mark.parametrize("method", ["POST", "PUT", "PATCH", "DELETE", "GET"])
def test_connection_reset_retried_on_all_methods(method):
result, calls = _drive(method, _reset_error(), default_retry())
assert isinstance(result, HTTPResponse) and result.status == 200
assert calls == 2 # failed once, succeeded on the fresh connection


def test_reset_retry_is_bounded_and_eventually_raises():
"""The reset retry is bounded by ``total``; once exhausted it surfaces."""
retry = ConnectionResetRetry(total=2, backoff_factor=0)
result, calls = _drive("POST", _reset_error(), retry, fail_times=99)
assert isinstance(result, MaxRetryError)
assert calls == 3 # initial attempt + 2 retries


# -- read/status retries stay idempotent-only ------------------------------------


def test_read_timeout_not_retried_on_post():
"""A read timeout may mean the server began processing — POST must not
replay it, so the idempotent-only gate is preserved."""
result, calls = _drive("POST", _read_timeout(), default_retry())
assert isinstance(result, ReadTimeoutError)
assert calls == 1


def test_read_timeout_still_retried_on_get():
result, calls = _drive("GET", _read_timeout(), default_retry())
assert isinstance(result, HTTPResponse) and calls == 2


def test_default_retry_does_not_retry_status_codes():
"""A 503 means the request reached the server; the default does not retry
status codes (429 has dedicated handling in hotdata.query)."""
assert default_retry().status == 0
assert default_retry().status_forcelist in (None, frozenset(), set())


# -- the pre-response detector ---------------------------------------------------


@pytest.mark.parametrize(
"cause",
[
ConnectionResetError(54, "reset by peer"),
ConnectionAbortedError(53, "aborted"),
BrokenPipeError(32, "broken pipe"),
],
)
def test_detector_matches_connection_level_causes(cause):
assert _is_pre_response_connection_reset(ProtocolError("Connection aborted.", cause))


def test_detector_unwraps_proxy_error():
inner = ProtocolError("Connection aborted.", ConnectionResetError(54, "reset"))
assert _is_pre_response_connection_reset(ProxyError("proxy", inner))


@pytest.mark.parametrize(
"error",
[
ProtocolError("Response ended prematurely"), # no wrapped cause
ProtocolError("boom", ValueError("not a socket error")),
ValueError("unrelated"),
],
)
def test_detector_rejects_non_connection_errors(error):
assert not _is_pre_response_connection_reset(error)


def test_classifier_treats_reset_as_connection_error():
retry = ConnectionResetRetry(total=3)
# Reclassified: a reset is a connection error (not method-gated)...
assert retry._is_connection_error(_reset_error())
# ...while a connect timeout is still a connection error (super's behavior)...
assert retry._is_connection_error(ConnectTimeoutError())
# ...and a read timeout is not.
assert not retry._is_connection_error(_read_timeout())


# -- Configuration wiring --------------------------------------------------------


def test_configuration_installs_reset_retry_by_default():
cfg = Configuration()
assert isinstance(cfg.retries, ConnectionResetRetry)


def test_explicit_retries_overrides_default():
cfg = Configuration(retries=0)
assert cfg.retries == 0

custom = Retry(total=7)
cfg2 = Configuration(retries=custom)
assert cfg2.retries is custom


def test_default_retry_flows_into_rest_pool_args():
"""The installed policy must actually reach urllib3's pool manager."""
from hotdata.rest import RESTClientObject

cfg = Configuration()
with mock.patch("urllib3.PoolManager") as pm:
RESTClientObject(cfg)
_, kwargs = pm.call_args
assert kwargs["retries"] is cfg.retries
assert isinstance(kwargs["retries"], ConnectionResetRetry)