Skip to content

Commit c37f1d2

Browse files
committed
Switches halt logic to being time based
Switches the halt logic to be time-based, since the number of attempts is based on general.rpc.timeout.
1 parent 2a85a39 commit c37f1d2

2 files changed

Lines changed: 35 additions & 7 deletions

File tree

  • core/src/main/java/org/apache/accumulo/core/conf
  • server/manager/src/main/java/org/apache/accumulo/manager

core/src/main/java/org/apache/accumulo/core/conf/Property.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -450,8 +450,9 @@ public enum Property {
450450
"The number of threads used to run fault-tolerant executions (FATE)."
451451
+ " These are primarily table operations like merge.",
452452
"1.4.3"),
453-
MANAGER_TSERVER_HALT_ATTEMPTS("manager.tservers.halt.attempts", "0", PropertyType.COUNT,
454-
"Allows the manager to force tserver halting by setting the max number of attempted tserver halt "
453+
MANAGER_TSERVER_HALT_DURATION("manager.tservers.halt.grace.period", "0",
454+
PropertyType.TIMEDURATION,
455+
"Allows the manager to force tserver halting by setting the max duration of time spent attempting to halt a tserver "
455456
+ " requests before deleting the tserver's zlock. A value of zero (default) disables this feature.",
456457
"2.1.5"),
457458
@Deprecated(since = "2.1.0")

server/manager/src/main/java/org/apache/accumulo/manager/Manager.java

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@
109109
import org.apache.accumulo.core.trace.TraceUtil;
110110
import org.apache.accumulo.core.util.HostAndPort;
111111
import org.apache.accumulo.core.util.Retry;
112+
import org.apache.accumulo.core.util.Timer;
112113
import org.apache.accumulo.core.util.threads.ThreadPools;
113114
import org.apache.accumulo.core.util.threads.Threads;
114115
import org.apache.accumulo.manager.metrics.BalancerMetrics;
@@ -195,7 +196,7 @@ public class Manager extends AbstractServer implements LiveTServerSet.Listener,
195196
final AuditedSecurityOperation security;
196197
final Map<TServerInstance,AtomicInteger> badServers =
197198
Collections.synchronizedMap(new HashMap<>());
198-
final Map<TServerInstance,AtomicInteger> tserverHaltRpcAttempts =
199+
final Map<TServerInstance,GracefulHaltTimer> tserverHaltRpcAttempts =
199200
Collections.synchronizedMap(new HashMap<>());
200201
final Set<TServerInstance> serversToShutdown = Collections.synchronizedSet(new HashSet<>());
201202
final Migrations migrations = new Migrations();
@@ -1143,6 +1144,30 @@ private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
11431144

11441145
}
11451146

1147+
/**
1148+
* This class tracks details about the haltRPCs used
1149+
*/
1150+
private static class GracefulHaltTimer {
1151+
1152+
Duration maxHaltGraceDuration;
1153+
Timer timer;
1154+
1155+
public GracefulHaltTimer(AccumuloConfiguration config) {
1156+
timer = null;
1157+
maxHaltGraceDuration =
1158+
Duration.ofMillis(config.getTimeInMillis(Property.MANAGER_TSERVER_HALT_DURATION));
1159+
}
1160+
1161+
public void startTimer() {
1162+
timer = Timer.startNew();
1163+
}
1164+
1165+
public boolean shouldForceHalt() {
1166+
return maxHaltGraceDuration.toMillis() != 0 && timer != null
1167+
&& timer.hasElapsed(maxHaltGraceDuration);
1168+
}
1169+
}
1170+
11461171
private SortedMap<TServerInstance,TabletServerStatus>
11471172
gatherTableInformation(Set<TServerInstance> currentServers) {
11481173
final long rpcTimeout = getConfiguration().getTimeInMillis(Property.GENERAL_RPC_TIMEOUT);
@@ -1153,7 +1178,7 @@ private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
11531178
final SortedMap<TServerInstance,TabletServerStatus> result = new ConcurrentSkipListMap<>();
11541179
final RateLimiter shutdownServerRateLimiter = RateLimiter.create(MAX_SHUTDOWNS_PER_SEC);
11551180
final int maxTserverRpcHaltAttempts =
1156-
getConfiguration().getCount(Property.MANAGER_TSERVER_HALT_ATTEMPTS);
1181+
getConfiguration().getCount(Property.MANAGER_TSERVER_HALT_DURATION);
11571182
final boolean forceHaltingEnabled = maxTserverRpcHaltAttempts != 0;
11581183
for (TServerInstance serverInstance : currentServers) {
11591184
final TServerInstance server = serverInstance;
@@ -1195,9 +1220,9 @@ private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
11951220
> MAX_BAD_STATUS_COUNT) {
11961221
if (shutdownServerRateLimiter.tryAcquire()) {
11971222
log.warn("attempting to stop {}", server);
1198-
if (forceHaltingEnabled
1199-
&& (tserverHaltRpcAttempts.computeIfAbsent(server, s -> new AtomicInteger(0))
1200-
.incrementAndGet() > maxTserverRpcHaltAttempts)) {
1223+
var gracefulHaltTimer = tserverHaltRpcAttempts.computeIfAbsent(server,
1224+
s -> new GracefulHaltTimer(getConfiguration()));
1225+
if (gracefulHaltTimer.shouldForceHalt()) {
12011226
log.warn("tserver {} is not responding to halt requests, deleting zlock", server);
12021227
var zk = getContext().getZooReaderWriter();
12031228
var iid = getContext().getInstanceID();
@@ -1221,6 +1246,8 @@ private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
12211246
log.trace("error attempting to halt tablet server {}", server, e1);
12221247
} catch (Exception e2) {
12231248
log.info("error talking to troublesome tablet server {}", server, e2);
1249+
} finally {
1250+
gracefulHaltTimer.startTimer();
12241251
}
12251252
}
12261253
} else {

0 commit comments

Comments
 (0)