diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index 74aba11c225..2232c0b3fd7 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -1833,9 +1833,13 @@ Origin Server Connect Attempts .. ts:cv:: CONFIG proxy.config.http.connect.down.policy INT 2 :overridable: - Controls what origin server connection failures contribute to marking a server down. When set to 2, any connection failure during the TCP and TLS - handshakes will contribute to marking the server down. When set to 1, only TCP handshake failures will contribute to marking a server down. - When set to 0, no connection failures will be used towards marking a server down. + Controls what origin server connection failures contribute to marking a server down. + When set to ``2``, any connection failure during the TCP and TLS handshakes will + contribute to marking the server down. When set to ``1``, only TCP handshake failures + will contribute to marking a server down. When set to ``0``, no connection failures + will be used towards marking a server down. When set to ``3``, all failures covered + by ``2`` plus transaction inactive timeouts (server goes silent after connection is + established) will contribute to marking a server down. .. ts:cv:: CONFIG proxy.config.http.server_max_connections INT 0 :reloadable: diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index 1082b3a0815..3604cc29600 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -4671,12 +4671,18 @@ HttpSM::track_connect_fail() const bool retval = false; if (t_state.current.server->had_connect_fail()) { // What does our policy say? 
- if (t_state.txn_conf->connect_down_policy == 2) { // Any connection error through TLS handshake + if (t_state.txn_conf->connect_down_policy == 2 || + t_state.txn_conf->connect_down_policy == 3) { // Any connection error through TLS handshake retval = true; } else if (t_state.txn_conf->connect_down_policy == 1) { // Any connection error through TCP retval = t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED; } } + // Policy 3 additionally marks the server down on transaction inactive timeout, + // even when had_connect_fail() is false (connect_result was cleared at CONNECTION_ALIVE). + if (!retval && t_state.txn_conf->connect_down_policy == 3) { + retval = (t_state.current.server->state == HttpTransact::INACTIVE_TIMEOUT); + } return retval; } diff --git a/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py new file mode 100644 index 00000000000..f4f5d9681ab --- /dev/null +++ b/tests/gold_tests/connect_down_policy/connect_down_policy_3.test.py @@ -0,0 +1,113 @@ +''' +Verify proxy.config.http.connect.down.policy=3 marks the origin down on +transaction inactive timeout (server goes silent after connection is established), +and that policy=2 does not mark the origin down for the same scenario. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +Test.Summary = ''' +Verify connect.down.policy=3 marks the origin down on inactive timeout after +connection is established, and policy=2 does not. +''' + +REPLAY_FILE = "replay/inactive_timeout.replay.yaml" + +# Inactivity timeout in seconds. The replay server-response delay (10s) is +# intentionally longer so ATS fires the timeout before the server replies. +INACTIVITY_TIMEOUT = 3 + + +class ConnectDownPolicy3Test: + """ + Test that policy=3 marks the origin server down when the server goes silent + and ATS fires an INACTIVE_TIMEOUT. + + Sequence: + 1. ATS connects to the origin and sends the request. + 2. The origin delays its response beyond INACTIVITY_TIMEOUT. + 3. ATS fires VC_EVENT_INACTIVITY_TIMEOUT, calling track_connect_fail(). + 4. Under policy=3 track_connect_fail() returns true → mark_host_failure(). + 5. With connect_attempts_rr_retries=0, increment_fail_count marks the host + down immediately (fcount >= 0), writing a "marking down" entry to error.log. 
+ """ + + def __init__(self, policy, expect_mark_down): + self._policy = policy + self._expect_mark_down = expect_mark_down + self._name = f"policy{policy}" + self._server = Test.MakeVerifierServerProcess(f"server-{self._name}", REPLAY_FILE) + self._configure_trafficserver() + + def _configure_trafficserver(self): + self._ts = Test.MakeATSProcess(f"ts-{self._name}", enable_cache=False) + + self._ts.Disk.remap_config.AddLine(f"map / http://127.0.0.1:{self._server.Variables.http_port}/") + + self._ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'hostdb|http', + # Use policy under test. + 'proxy.config.http.connect.down.policy': self._policy, + # No connection retries — the single timeout failure is sufficient. + 'proxy.config.http.connect_attempts_max_retries': 0, + # Set rr_retries=0 so it does not exceed max_retries (which would + # emit a Warning). With rr_retries=0, increment_fail_count marks + # the host down on the first failure (fcount >= 0 is always true). + 'proxy.config.http.connect_attempts_rr_retries': 0, + # Short server-side inactivity timeout so the test runs quickly. + 'proxy.config.http.transaction_no_activity_timeout_out': INACTIVITY_TIMEOUT, + # Keep the host marked down long enough to verify. + 'proxy.config.hostdb.fail.timeout': 60, + }) + + def _test_inactive_timeout(self): + tr = Test.AddTestRun(f"policy={self._policy}: inactive timeout triggers 504") + tr.Processes.Default.StartBefore(self._server) + tr.Processes.Default.StartBefore(self._ts) + tr.AddVerifierClientProcess(f"client-{self._name}", REPLAY_FILE, http_ports=[self._ts.Variables.port]) + + def _test_mark_down(self): + if self._expect_mark_down: + # Wait for error.log to appear then verify it contains the mark-down entry. 
+ tr = Test.AddTestRun(f"policy={self._policy}: check error.log for mark-down") + tr.Processes.Default.Command = ( + os.path.join(Test.Variables.AtsTestToolsDir, 'condwait') + ' 60 1 -f ' + + os.path.join(self._ts.Variables.LOGDIR, 'error.log')) + self._ts.Disk.error_log.Content = Testers.ContainsExpression( + "marking down", f"policy={self._policy}: origin should be marked down after inactive timeout") + else: + # Pre-create error.log so ExcludesExpression can read it even if ATS + # never writes to it (policy=2 should not mark the host down). + error_log_path = os.path.join(self._ts.Variables.LOGDIR, 'error.log') + tr = Test.AddTestRun(f"policy={self._policy}: verify no mark-down in error.log") + tr.Processes.Default.Command = f"touch {error_log_path}" + self._ts.Disk.error_log.Content = Testers.ExcludesExpression( + "marking down", f"policy={self._policy}: origin should NOT be marked down after inactive timeout") + + def run(self): + self._test_inactive_timeout() + self._test_mark_down() + + +# Policy 3: inactive timeout SHOULD mark the origin down. +ConnectDownPolicy3Test(policy=3, expect_mark_down=True).run() + +# Policy 2: inactive timeout should NOT mark the origin down. +ConnectDownPolicy3Test(policy=2, expect_mark_down=False).run() diff --git a/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml b/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml new file mode 100644 index 00000000000..75bfd8f93eb --- /dev/null +++ b/tests/gold_tests/connect_down_policy/replay/inactive_timeout.replay.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +meta: + version: "1.0" + +sessions: +- transactions: + - client-request: + method: "GET" + version: "1.1" + url: /inactive/timeout/test + headers: + fields: + - [ Host, example.com ] + - [ uuid, 1 ] + + # Server accepts the connection but goes silent (delay longer than inactivity timeout). + server-response: + status: 200 + delay: 10s + + # ATS should time out waiting for the server response and return 504. + proxy-response: + status: 504