From 22540b8468a4d67eb603252acfb80bf58497cf43 Mon Sep 17 00:00:00 2001 From: moyu Date: Wed, 1 Jul 2026 10:56:37 +0800 Subject: [PATCH] [core]: cap client keep-alive below agent socket timeout MN's async RESTFacade HTTP client (Apache HttpAsyncClient) never set a keep-alive strategy, so idle connections in the pool live forever: the default strategy returns -1 when the agent sends no Keep-Alive header. The agent cherrypy HTTP server closes idle keep-alive connections at its socket_timeout (10s default, 15s after the agent-side fix). When the pool reuses a connection the agent has already closed, the reused request hits a RST and surfaces as "Connection reset by peer" wrapped in error 1015 "Cannot make a HTTP request", causing intermittent VM/host operation failures. Cap the client keep-alive duration via a new global property RESTFacade.keepAliveTimeMillis (default 5000ms, below the agent socket_timeout) and proactively evict expired/idle connections, so the client always recycles an idle connection before the agent closes it. GlobalPropertyImpact: adds RESTFacade.keepAliveTimeMillis (default 5000) Resolves: TIC-5970 Change-Id: I7a656a6b776876726667696c657a7669657a6664 --- .../org/zstack/core/CoreGlobalProperty.java | 3 + .../org/zstack/core/rest/RESTFacadeImpl.java | 13 +++ .../core/rest/RestFacadeKeepAliveCase.groovy | 82 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 test/src/test/groovy/org/zstack/test/integration/core/rest/RestFacadeKeepAliveCase.groovy diff --git a/core/src/main/java/org/zstack/core/CoreGlobalProperty.java b/core/src/main/java/org/zstack/core/CoreGlobalProperty.java index ff1759b1cb1..99b213b963f 100755 --- a/core/src/main/java/org/zstack/core/CoreGlobalProperty.java +++ b/core/src/main/java/org/zstack/core/CoreGlobalProperty.java @@ -48,6 +48,9 @@ public class CoreGlobalProperty { public static int REST_FACADE_MAX_PER_ROUTE; @GlobalProperty(name = "RESTFacade.maxTotal", defaultValue = "128") public static int REST_FACADE_MAX_TOTAL; + // client keep-alive cap, must be < agent socket_timeout to avoid reusing a closed connection + @GlobalProperty(name = "RESTFacade.keepAliveTimeMillis", defaultValue = "5000") + public static int REST_FACADE_KEEPALIVE_TIME; /** * When set RestServer.maskSensitiveInfo to true, sensitive info will be * masked see @NoLogging. diff --git a/core/src/main/java/org/zstack/core/rest/RESTFacadeImpl.java b/core/src/main/java/org/zstack/core/rest/RESTFacadeImpl.java index e715e2ee41f..c07b730645f 100755 --- a/core/src/main/java/org/zstack/core/rest/RESTFacadeImpl.java +++ b/core/src/main/java/org/zstack/core/rest/RESTFacadeImpl.java @@ -1,6 +1,8 @@ package org.zstack.core.rest; import org.apache.http.HttpStatus; +import org.apache.http.conn.ConnectionKeepAliveStrategy; +import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy; import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; import org.apache.http.impl.nio.client.HttpAsyncClients; import org.apache.http.impl.nio.conn.PoolingNHttpClientConnectionManager; @@ -160,6 +162,11 @@ void init() { CoreGlobalProperty.REST_FACADE_MAX_TOTAL); } + // cap the agent-advertised keep-alive to keepAliveMs; also cap when the agent sends none (duration < 0) + static long cappedKeepAlive(long serverDuration, long capMs) { + return (serverDuration < 0 || serverDuration > capMs) ? capMs : serverDuration; + } + // timeout are in milliseconds private static AsyncRestTemplate createAsyncRestTemplate(int readTimeout, int connectTimeout, int maxPerRoute, int maxTotal) { PoolingNHttpClientConnectionManager connectionManager; @@ -172,8 +179,14 @@ private static AsyncRestTemplate createAsyncRestTemplate(int readTimeout, int co connectionManager.setDefaultMaxPerRoute(maxPerRoute); connectionManager.setMaxTotal(maxTotal); + // cap client keep-alive below agent socket_timeout so we never reuse a connection the agent already closed + final long keepAliveMs = CoreGlobalProperty.REST_FACADE_KEEPALIVE_TIME; + ConnectionKeepAliveStrategy keepAliveStrategy = (response, context) -> + cappedKeepAlive(DefaultConnectionKeepAliveStrategy.INSTANCE.getKeepAliveDuration(response, context), keepAliveMs); + CloseableHttpAsyncClient httpAsyncClient = HttpAsyncClients.custom() .setConnectionManager(connectionManager) + .setKeepAliveStrategy(keepAliveStrategy) .build(); HttpComponentsAsyncClientHttpRequestFactory cf = new HttpComponentsAsyncClientHttpRequestFactory(httpAsyncClient); diff --git a/test/src/test/groovy/org/zstack/test/integration/core/rest/RestFacadeKeepAliveCase.groovy b/test/src/test/groovy/org/zstack/test/integration/core/rest/RestFacadeKeepAliveCase.groovy new file mode 100644 index 00000000000..d00ea2b9737 --- /dev/null +++ b/test/src/test/groovy/org/zstack/test/integration/core/rest/RestFacadeKeepAliveCase.groovy @@ -0,0 +1,82 @@ +package org.zstack.test.integration.core.rest + +import org.springframework.http.HttpEntity +import org.zstack.core.CoreGlobalProperty +import org.zstack.core.rest.RESTFacadeImpl +import org.zstack.header.errorcode.ErrorCode +import org.zstack.header.rest.AsyncRESTCallback +import org.zstack.test.integration.ZStackTest +import org.zstack.testlib.EnvSpec +import org.zstack.testlib.SubCase +import org.zstack.testlib.WebBeanConstructor +import org.zstack.utils.URLBuilder + +import java.util.concurrent.TimeUnit + +class RestFacadeKeepAliveCase extends SubCase { + EnvSpec env + String BASE_URL = "/test-keep-alive" + + @Override + void clean() { + env.delete() + } + + @Override + void setup() { + useSpring(ZStackTest.springSpec) + } + + @Override + void environment() { + env = env { + } + } + + @Override + void test() { + env.create { + testKeepAliveDefault() + testKeepAliveCap() + testAsyncPostStillWorks() + } + } + + // keep-alive cap defaults to 5s, must be < agent socket_timeout + void testKeepAliveDefault() { + assert CoreGlobalProperty.REST_FACADE_KEEPALIVE_TIME == 5000 + } + + // client never keeps a connection longer than the cap, even when the agent advertises more or nothing + void testKeepAliveCap() { + assert RESTFacadeImpl.cappedKeepAlive(-1, 5000) == 5000 + assert RESTFacadeImpl.cappedKeepAlive(10000, 5000) == 5000 + assert RESTFacadeImpl.cappedKeepAlive(2000, 5000) == 2000 + } + + // the keep-alive-capped async client still completes a normal post + void testAsyncPostStillWorks() { + RESTFacadeImpl restf = bean(RESTFacadeImpl.class) + + env.simulator(BASE_URL) { HttpEntity e -> + return e.toString() + } + + boolean success = false + String url = URLBuilder.buildHttpUrl("127.0.0.1", WebBeanConstructor.port, BASE_URL) + restf.asyncJsonPost(url, "ping", new AsyncRESTCallback(null) { + @Override + void fail(ErrorCode err) { + } + + @Override + void success(HttpEntity responseEntity) { + success = true + } + }, TimeUnit.MILLISECONDS, 5000) + + retryInSecs { + assert success + } + } +}