From ac4da09af57dd6ec987a77ee00485798000b1309 Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 15:12:36 -0600 Subject: [PATCH 1/7] Add option 3 to connect down policy to count inactive connections as failures --- doc/admin-guide/files/records.yaml.en.rst | 10 +++++++--- src/proxy/http/HttpSM.cc | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index 74aba11c225..2232c0b3fd7 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -1833,9 +1833,13 @@ Origin Server Connect Attempts .. ts:cv:: CONFIG proxy.config.http.connect.down.policy INT 2 :overridable: - Controls what origin server connection failures contribute to marking a server down. When set to 2, any connection failure during the TCP and TLS - handshakes will contribute to marking the server down. When set to 1, only TCP handshake failures will contribute to marking a server down. - When set to 0, no connection failures will be used towards marking a server down. + Controls what origin server connection failures contribute to marking a server down. + When set to ``2``, any connection failure during the TCP and TLS handshakes will + contribute to marking the server down. When set to ``1``, only TCP handshake failures + will contribute to marking a server down. When set to ``0``, no connection failures + will be used towards marking a server down. When set to ``3``, all failures covered + by ``2`` plus transaction inactive timeouts (server goes silent after connection is + established) will contribute to marking a server down. .. ts:cv:: CONFIG proxy.config.http.server_max_connections INT 0 :reloadable: diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index 1082b3a0815..bb0f191df59 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -4671,12 +4671,18 @@ HttpSM::track_connect_fail() const bool retval = false; if (t_state.current.server->had_connect_fail()) { // What does our policy say? - if (t_state.txn_conf->connect_down_policy == 2) { // Any connection error through TLS handshake + if (t_state.txn_conf->connect_down_policy == 2 || + t_state.txn_conf->connect_down_policy == 3) { // Any connection error through TLS handshake retval = true; } else if (t_state.txn_conf->connect_down_policy == 1) { // Any connection error through TCP retval = t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED; } } + // Policy 3 additionally marks the server down on transaction inactive timeout, + // even when had_connect_fail() is false (connect_result was cleared at CONNECTION_ALIVE). + if (!retval && t_state.txn_conf->connect_down_policy == 3) { + retval = (t_state.current.state == HttpTransact::INACTIVE_TIMEOUT); + } return retval; } From 4fb1f50b12cf75994747e8bb2d473d83afb3ef49 Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 17:05:44 -0600 Subject: [PATCH 2/7] Add option 3 to connect down policy to count inactive connections as failures --- src/proxy/http/HttpSM.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index bb0f191df59..3604cc29600 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -4681,7 +4681,7 @@ HttpSM::track_connect_fail() const // Policy 3 additionally marks the server down on transaction inactive timeout, // even when had_connect_fail() is false (connect_result was cleared at CONNECTION_ALIVE). if (!retval && t_state.txn_conf->connect_down_policy == 3) { - retval = (t_state.current.state == HttpTransact::INACTIVE_TIMEOUT); + retval = (t_state.current.server->state == HttpTransact::INACTIVE_TIMEOUT); } return retval; } From 828e9dcbe1f87a3944f67f016b73f954efb8c3b4 Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 18:05:48 -0600 Subject: [PATCH 3/7] Add option 3 to connect down policy to count inactive connections as failures --- .../connect_down_policy_3.test.py | 110 ++++++++++++++++++ .../replay/inactive_timeout.replay.yaml | 38 ++++++ 2 files changed, 148 insertions(+) create mode 100644 tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py create mode 100644 tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml diff --git a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py new file mode 100644 index 00000000000..52621e390a8 --- /dev/null +++ b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py @@ -0,0 +1,110 @@ +''' +Verify proxy.config.http.connect.down.policy=3 marks the origin down on +transaction inactive timeout (server goes silent after connection is established), +and that policy=2 does not mark the origin down for the same scenario. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +Test.Summary = ''' +Verify connect.down.policy=3 marks the origin down on inactive timeout after +connection is established, and policy=2 does not. +''' + +REPLAY_FILE = "replay/inactive_timeout.replay.yaml" + +# Inactivity timeout in seconds. The replay server-response delay (10s) is +# intentionally longer so ATS fires the timeout before the server replies. +INACTIVITY_TIMEOUT = 3 + + +class ConnectDownPolicy3Test: + """ + Test that policy=3 marks the origin server down when the server goes silent + and ATS fires an INACTIVE_TIMEOUT. + + Sequence: + 1. ATS connects to the origin and sends the request. + 2. The origin delays its response beyond INACTIVITY_TIMEOUT. + 3. ATS fires VC_EVENT_INACTIVITY_TIMEOUT, calling track_connect_fail(). + 4. Under policy=3 track_connect_fail() returns true → mark_host_failure(). + 5. With connect_attempts_rr_retries=1, one failure is enough to mark the + host down, which writes a "marking down" entry to error.log. + """ + + def __init__(self, policy, expect_mark_down): + self._policy = policy + self._expect_mark_down = expect_mark_down + self._name = f"policy{policy}" + self._server = Test.MakeVerifierServerProcess(f"server-{self._name}", REPLAY_FILE) + self._configure_trafficserver() + + def _configure_trafficserver(self): + self._ts = Test.MakeATSProcess(f"ts-{self._name}", enable_cache=False) + + self._ts.Disk.remap_config.AddLine(f"map / http://127.0.0.1:{self._server.Variables.http_port}/") + + self._ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'hostdb|http', + # Use policy under test. + 'proxy.config.http.connect.down.policy': self._policy, + # No retries so the timeout triggers mark-down immediately. + 'proxy.config.http.connect_attempts_max_retries': 0, + # One failure is enough to mark the host down and write to error.log. + 'proxy.config.http.connect_attempts_rr_retries': 1, + # Short server-side inactivity timeout so the test runs quickly. + 'proxy.config.http.transaction_no_activity_timeout_out': INACTIVITY_TIMEOUT, + # Keep the host marked down long enough to verify. + 'proxy.config.hostdb.fail.timeout': 60, + }) + + def _test_inactive_timeout(self): + tr = Test.AddTestRun(f"policy={self._policy}: inactive timeout triggers 504") + tr.Processes.Default.StartBefore(self._server) + tr.Processes.Default.StartBefore(self._ts) + tr.AddVerifierClientProcess(f"client-{self._name}", REPLAY_FILE, http_ports=[self._ts.Variables.port]) + + def _test_mark_down(self): + if self._expect_mark_down: + # Wait for error.log to appear then verify it contains the mark-down entry. + tr = Test.AddTestRun(f"policy={self._policy}: check error.log for mark-down") + tr.Processes.Default.Command = ( + os.path.join(Test.Variables.AtsTestToolsDir, 'condwait') + ' 60 1 -f ' + + os.path.join(self._ts.Variables.LOGDIR, 'error.log')) + self._ts.Disk.error_log.Content = Testers.ContainsExpression( + "marking down", f"policy={self._policy}: origin should be marked down after inactive timeout") + else: + # For policy=2 the host should not be marked down, so error.log should + # not exist. Verify by checking traffic.out has no mark-down message. + tr = Test.AddTestRun(f"policy={self._policy}: verify no mark-down") + tr.Processes.Default.Command = "true" + self._ts.Disk.traffic_out.Content = Testers.ExcludesExpression( + "marking down", f"policy={self._policy}: origin should NOT be marked down after inactive timeout") + + def run(self): + self._test_inactive_timeout() + self._test_mark_down() + + +# Policy 3: inactive timeout SHOULD mark the origin down. +ConnectDownPolicy3Test(policy=3, expect_mark_down=True).run() + +# Policy 2: inactive timeout should NOT mark the origin down. +ConnectDownPolicy3Test(policy=2, expect_mark_down=False).run() diff --git a/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml b/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml new file mode 100644 index 00000000000..75bfd8f93eb --- /dev/null +++ b/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +meta: + version: "1.0" + +sessions: +- transactions: + - client-request: + method: "GET" + version: "1.1" + url: /inactive/timeout/test + headers: + fields: + - [ Host, example.com ] + - [ uuid, 1 ] + + # Server accepts the connection but goes silent (delay longer than inactivity timeout). + server-response: + status: 200 + delay: 10s + + # ATS should time out waiting for the server response and return 504. + proxy-response: + status: 504 From 630d596b2ee1be9811c3dcbc37485c2a6186c7cf Mon Sep 17 00:00:00 2001 From: Vijay Mamidi Date: Fri, 27 Feb 2026 18:19:38 -0600 Subject: [PATCH 4/7] Update src/proxy/http/HttpSM.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/proxy/http/HttpSM.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index 3604cc29600..a947ad13a71 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -4672,7 +4672,7 @@ HttpSM::track_connect_fail() const if (t_state.current.server->had_connect_fail()) { // What does our policy say? if (t_state.txn_conf->connect_down_policy == 2 || - t_state.txn_conf->connect_down_policy == 3) { // Any connection error through TLS handshake + t_state.txn_conf->connect_down_policy == 3) { // Policy 2: any connection error during TCP or TLS handshake; Policy 3: same plus inactive timeout below retval = true; } else if (t_state.txn_conf->connect_down_policy == 1) { // Any connection error through TCP retval = t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED; From 0361f3cd5484248adc54181da6d0f3dabadacb35 Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 18:26:03 -0600 Subject: [PATCH 5/7] Add option 3 to connect down policy to count inactive connections as failures --- .../connect_down_policy/connect_down_policy_3.test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py index 52621e390a8..a55da312d0a 100644 --- a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py +++ b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py @@ -91,11 +91,12 @@ def _test_mark_down(self): self._ts.Disk.error_log.Content = Testers.ContainsExpression( "marking down", f"policy={self._policy}: origin should be marked down after inactive timeout") else: - # For policy=2 the host should not be marked down, so error.log should - # not exist. Verify by checking traffic.out has no mark-down message. - tr = Test.AddTestRun(f"policy={self._policy}: verify no mark-down") - tr.Processes.Default.Command = "true" - self._ts.Disk.traffic_out.Content = Testers.ExcludesExpression( + # Pre-create error.log so ExcludesExpression can read it even if ATS + # never writes to it (policy=2 should not mark the host down). + error_log_path = os.path.join(self._ts.Variables.LOGDIR, 'error.log') + tr = Test.AddTestRun(f"policy={self._policy}: verify no mark-down in error.log") + tr.Processes.Default.Command = f"touch {error_log_path}" + self._ts.Disk.error_log.Content = Testers.ExcludesExpression( "marking down", f"policy={self._policy}: origin should NOT be marked down after inactive timeout") def run(self): From 449466bf266e1ad2a7f9e01816722cf355119ebf Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 21:41:14 -0600 Subject: [PATCH 6/7] Add option 3 to connect down policy to count inactive connections as failures --- .../connect_down_policy_3.test.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py index a55da312d0a..f4f5d9681ab 100644 --- a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py +++ b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py @@ -43,8 +43,8 @@ class ConnectDownPolicy3Test: 2. The origin delays its response beyond INACTIVITY_TIMEOUT. 3. ATS fires VC_EVENT_INACTIVITY_TIMEOUT, calling track_connect_fail(). 4. Under policy=3 track_connect_fail() returns true → mark_host_failure(). - 5. With connect_attempts_rr_retries=1, one failure is enough to mark the - host down, which writes a "marking down" entry to error.log. + 5. With connect_attempts_rr_retries=0, increment_fail_count marks the host + down immediately (fcount >= 0), writing a "marking down" entry to error.log. """ def __init__(self, policy, expect_mark_down): @@ -65,10 +65,12 @@ def _configure_trafficserver(self): 'proxy.config.diags.debug.tags': 'hostdb|http', # Use policy under test. 'proxy.config.http.connect.down.policy': self._policy, - # No retries so the timeout triggers mark-down immediately. + # No connection retries — the single timeout failure is sufficient. 'proxy.config.http.connect_attempts_max_retries': 0, - # One failure is enough to mark the host down and write to error.log. - 'proxy.config.http.connect_attempts_rr_retries': 1, + # Set rr_retries=0 so it does not exceed max_retries (which would + # emit a Warning). With max_retries=0, increment_fail_count marks + # the host down on the first failure (fcount >= 0 is always true). + 'proxy.config.http.connect_attempts_rr_retries': 0, # Short server-side inactivity timeout so the test runs quickly. 'proxy.config.http.transaction_no_activity_timeout_out': INACTIVITY_TIMEOUT, # Keep the host marked down long enough to verify. From 609eb8b988f21c913ff0a27fc3fcb4ddd45e2240 Mon Sep 17 00:00:00 2001 From: vmamidi Date: Fri, 27 Feb 2026 21:53:07 -0600 Subject: [PATCH 7/7] Add option 3 to connect down policy to count inactive connections as failures --- src/proxy/http/HttpSM.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index a947ad13a71..3604cc29600 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -4672,7 +4672,7 @@ HttpSM::track_connect_fail() const if (t_state.current.server->had_connect_fail()) { // What does our policy say? if (t_state.txn_conf->connect_down_policy == 2 || - t_state.txn_conf->connect_down_policy == 3) { // Policy 2: any connection error during TCP or TLS handshake; Policy 3: same plus inactive timeout below + t_state.txn_conf->connect_down_policy == 3) { // Any connection error through TLS handshake retval = true; } else if (t_state.txn_conf->connect_down_policy == 1) { // Any connection error through TCP retval = t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED;