/*
 * Patch overview (three files: KVMHost.java, KVMHostUtils.java, new KVMHostUtilsTest.java).
 *
 * KVMHost.java: the fail(ErrorCode) callback shown in the first hunk no longer fails unconditionally.
 * When KVMHostUtils.shouldContinueReconnectOnAnsibleFailure(info.isNewAdded(), errorCode) returns true,
 * it logs a warning carrying the host uuid, management IP and the error, calls trigger.next() and returns;
 * otherwise it still calls trigger.fail(errorCode). The enclosing flow step is outside this hunk.
 *
 * KVMHostUtils.java:
 *  - shouldContinueReconnectOnAnsibleFailure(isNewAdded, errorCode): true only when isNewAdded is false
 *    and isLibvirtSocketMaskSystemdTimeout(errorCode) is true.
 *  - isLibvirtSocketMaskSystemdTimeout(errorCode): lower-cases (Locale.ROOT) the collected error text and
 *    requires all of "systemctl mask", "libvirtd.socket", "org.freedesktop.systemd1", "timed out", plus
 *    either "failed to get properties" or "failed to activate service".
 *  - collectErrorText(errorCode): walks errorCode and its getCause() chain, appending each non-null
 *    getDetails() and getDescription() followed by a newline; a null errorCode yields an empty string,
 *    so the matcher returns false for it.
 *  - appendIfNotNull(sb, text): appends text plus a newline only when text is not null.
 *
 * KVMHostUtilsTest.java: one test feeds the sample "systemctl mask ... timed out" details and expects
 * true for isNewAdded=false and false for isNewAdded=true; the other feeds a "systemctl restart libvirtd"
 * failure and expects false.
 */
diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java index ab8c29640f3..fc0bb962f9a 100755 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java @@ -5803,6 +5803,13 @@ public void success(Boolean run) { @Override public void fail(ErrorCode errorCode) { + /* Tolerated failure: for a host that is not newly added, a systemd D-Bus timeout while masking the libvirt sockets calls trigger.next() instead of trigger.fail(errorCode). */ if (KVMHostUtils.shouldContinueReconnectOnAnsibleFailure(info.isNewAdded(), errorCode)) { + logger.warn(String.format( + "kvm ansible failed to mask libvirt sockets because systemd dbus timed out on existing host[uuid:%s, ip:%s], continue reconnect and verify kvmagent, error: %s", + self.getUuid(), self.getManagementIp(), errorCode)); + trigger.next(); + return; + } trigger.fail(errorCode); } }); diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java index cf6d16e7560..3b9979caeb3 100644 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java @@ -2,6 +2,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.zstack.core.db.Q; +import org.zstack.header.errorcode.ErrorCode; import org.zstack.header.network.l2.*; import org.zstack.header.tag.SystemTagVO; import org.zstack.header.tag.SystemTagVO_; @@ -13,6 +14,7 @@ import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Set; /** @@ -21,6 +23,37 @@ public class KVMHostUtils { private static final CLogger logger = CLoggerImpl.getLogger(KVMHostUtils.class); + public static boolean shouldContinueReconnectOnAnsibleFailure(boolean isNewAdded, ErrorCode errorCode) { + return !isNewAdded && isLibvirtSocketMaskSystemdTimeout(errorCode); + } + + public static boolean isLibvirtSocketMaskSystemdTimeout(ErrorCode errorCode) { + String errorText = collectErrorText(errorCode).toLowerCase(Locale.ROOT); + return 
errorText.contains("systemctl mask") + && errorText.contains("libvirtd.socket") + && errorText.contains("org.freedesktop.systemd1") + && errorText.contains("timed out") + && (errorText.contains("failed to get properties") + || errorText.contains("failed to activate service")); + } + + private static String collectErrorText(ErrorCode errorCode) { + StringBuilder sb = new StringBuilder(); + ErrorCode cursor = errorCode; + while (cursor != null) { + appendIfNotNull(sb, cursor.getDetails()); + appendIfNotNull(sb, cursor.getDescription()); + cursor = cursor.getCause(); + } + return sb.toString(); + } + + private static void appendIfNotNull(StringBuilder sb, String text) { + if (text != null) { + sb.append(text).append('\n'); + } + } + /** * Get normalized bridge name for l2 network, which at most has 15 chars. * - if l2 network has L2_BRIDGE_NAME tag, then return it's value directly; diff --git a/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java new file mode 100644 index 00000000000..2e0e06c7dfc --- /dev/null +++ b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java @@ -0,0 +1,25 @@ +package org.zstack.test.kvm; + +import org.junit.Assert; +import org.junit.Test; +import org.zstack.header.errorcode.ErrorCode; +import org.zstack.kvm.KVMHostUtils; + +public class KVMHostUtilsTest { + @Test + public void zstac86349_continueReconnectOnLibvirtSocketMaskSystemdTimeout() { + ErrorCode error = new ErrorCode(); + error.setDetails("[HOST: 192.168.51.12] ERROR: run shell command: systemctl mask libvirtd.socket libvirtd-ro.socket libvirtd-admin.socket libvirtd-tls.socket libvirtd-tcp.socket failed! 
stderr: Failed to get properties: Failed to activate service 'org.freedesktop.systemd1': timed out (service_start_timeout=25000ms)"); + + Assert.assertTrue(KVMHostUtils.shouldContinueReconnectOnAnsibleFailure(false, error)); + Assert.assertFalse(KVMHostUtils.shouldContinueReconnectOnAnsibleFailure(true, error)); + } + + @Test + public void zstac86349_doNotContinueReconnectOnOtherAnsibleFailures() { + ErrorCode error = new ErrorCode(); + error.setDetails("[HOST: 192.168.51.12] ERROR: run shell command: systemctl restart libvirtd failed! stderr: Job for libvirtd.service failed"); + + Assert.assertFalse(KVMHostUtils.shouldContinueReconnectOnAnsibleFailure(false, error)); + } +}