|
19 | 19 |
|
20 | 20 | import static org.apache.hadoop.hdds.protocol.DatanodeDetails.Port.Name.HTTP; |
21 | 21 | import static org.apache.hadoop.hdds.protocol.DatanodeDetails.Port.Name.HTTPS; |
| 22 | +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NODES_KEY; |
22 | 23 | import static org.apache.hadoop.hdds.utils.HddsServerUtil.getRemoteUser; |
23 | 24 | import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmSecurityClientWithMaxRetry; |
24 | 25 | import static org.apache.hadoop.ozone.OzoneConfigKeys.HDDS_DATANODE_PLUGINS_KEY; |
|
34 | 35 |
|
35 | 36 | import com.google.common.annotations.VisibleForTesting; |
36 | 37 | import com.google.common.base.Preconditions; |
| 38 | +import com.google.common.collect.Sets; |
37 | 39 | import java.io.File; |
38 | 40 | import java.io.IOException; |
| 41 | +import java.net.InetSocketAddress; |
39 | 42 | import java.util.Arrays; |
| 43 | +import java.util.Collection; |
40 | 44 | import java.util.HashMap; |
| 45 | +import java.util.HashSet; |
41 | 46 | import java.util.List; |
42 | 47 | import java.util.Map; |
43 | 48 | import java.util.Objects; |
| 49 | +import java.util.Set; |
44 | 50 | import java.util.concurrent.Callable; |
45 | 51 | import java.util.concurrent.ConcurrentHashMap; |
46 | 52 | import java.util.concurrent.atomic.AtomicBoolean; |
| 53 | +import java.util.stream.Collectors; |
| 54 | +import java.util.stream.Stream; |
47 | 55 | import javax.management.ObjectName; |
| 56 | +import org.apache.commons.lang3.StringUtils; |
| 57 | +import org.apache.commons.lang3.tuple.Pair; |
48 | 58 | import org.apache.hadoop.conf.Configurable; |
49 | 59 | import org.apache.hadoop.hdds.DatanodeVersion; |
50 | 60 | import org.apache.hadoop.hdds.HddsConfigKeys; |
51 | 61 | import org.apache.hadoop.hdds.HddsUtils; |
52 | 62 | import org.apache.hadoop.hdds.cli.GenericCli; |
53 | 63 | import org.apache.hadoop.hdds.cli.HddsVersionProvider; |
| 64 | +import org.apache.hadoop.hdds.conf.ConfigurationSource; |
54 | 65 | import org.apache.hadoop.hdds.conf.OzoneConfiguration; |
55 | 66 | import org.apache.hadoop.hdds.conf.ReconfigurationHandler; |
56 | 67 | import org.apache.hadoop.hdds.protocol.DatanodeDetails; |
|
72 | 83 | import org.apache.hadoop.ozone.container.common.DatanodeLayoutStorage; |
73 | 84 | import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; |
74 | 85 | import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine; |
| 86 | +import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine.DatanodeStates; |
| 87 | +import org.apache.hadoop.ozone.container.common.statemachine.SCMConnectionManager; |
| 88 | +import org.apache.hadoop.ozone.container.common.statemachine.StateContext; |
75 | 89 | import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.DeleteBlocksCommandHandler; |
76 | 90 | import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil; |
77 | 91 | import org.apache.hadoop.ozone.container.common.volume.HddsVolume; |
@@ -124,6 +138,7 @@ public class HddsDatanodeService extends GenericCli implements Callable<Void>, S |
124 | 138 | private HddsDatanodeClientProtocolServer clientProtocolServer; |
125 | 139 | private OzoneAdmins admins; |
126 | 140 | private ReconfigurationHandler reconfigurationHandler; |
| 141 | + private String scmServiceId; |
127 | 142 |
|
128 | 143 | // No-arg constructor for DataNode PluginService; performs no initialization. |
129 | 144 | public HddsDatanodeService() { } |
@@ -207,6 +222,7 @@ public void start(OzoneConfiguration configuration) { |
207 | 222 | start(); |
208 | 223 | } |
209 | 224 |
|
| 225 | + @SuppressWarnings("methodlength") |
210 | 226 | public void start() { |
211 | 227 | serviceRuntimeInfo = new DNMXBeanImpl(HddsVersionInfo.HDDS_VERSION_INFO) { |
212 | 228 | @Override |
@@ -294,6 +310,12 @@ public String getNamespace() { |
294 | 310 | .register(REPLICATION_STREAMS_LIMIT_KEY, |
295 | 311 | this::reconfigReplicationStreamsLimit); |
296 | 312 |
|
| 313 | + scmServiceId = HddsUtils.getScmServiceId(conf); |
| 314 | + if (scmServiceId != null) { |
| 315 | + reconfigurationHandler.register(OZONE_SCM_NODES_KEY + "." + scmServiceId, |
| 316 | + this::reconfigScmNodes); |
| 317 | + } |
| 318 | + |
297 | 319 | reconfigurationHandler.setReconfigurationCompleteCallback(reconfigurationHandler.defaultLoggingCallback()); |
298 | 320 |
|
299 | 321 | datanodeStateMachine = new DatanodeStateMachine(this, datanodeDetails, conf, |
@@ -680,6 +702,112 @@ private String reconfigBlockDeletingServiceTimeout(String value) { |
680 | 702 | return value; |
681 | 703 | } |
682 | 704 |
|
| 705 | + /** |
| 706 | + * Reconfigure the SCM nodes configuration which will trigger the creation and removal of |
| 707 | + * SCM connections based on the difference between the old and the new SCM nodes configuration. |
| 708 | + * <p> |
| 709 | + * The assumption is that the SCM node address configurations exists for all the involved node IDs |
| 710 | + * This is because reconfiguration can only support one configuration field at a time |
| 711 | + * @param value The new configuration value for "ozone.scm.nodes.SERVICEID" |
| 712 | + * @return new configuration for "ozone.scm.nodes.SERVICEID" which reflects the SCMs that the datanode has |
| 713 | + * is not connected to. |
| 714 | + */ |
| 715 | + private String reconfigScmNodes(String value) { |
| 716 | + if (StringUtils.isBlank(value)) { |
| 717 | + throw new IllegalArgumentException("Reconfiguration failed since setting the empty SCM nodes " + |
| 718 | + "configuration is not allowed"); |
| 719 | + } |
| 720 | + Set<String> previousNodeIds = new HashSet<>(HddsUtils.getSCMNodeIds(getConf(), scmServiceId)); |
| 721 | + Set<String> newScmNodeIds = Stream.of(ConfigurationSource.getTrimmedStringsFromValue(value)) |
| 722 | + .collect(Collectors.toSet()); |
| 723 | + |
| 724 | + if (newScmNodeIds.isEmpty()) { |
| 725 | + throw new IllegalArgumentException("Reconfiguration failed since setting the empty SCM nodes " + |
| 726 | + "configuration is not allowed"); |
| 727 | + } |
| 728 | + |
| 729 | + Set<String> scmNodesIdsToAdd = Sets.difference(newScmNodeIds, previousNodeIds); |
| 730 | + Set<String> scmNodesIdsToRemove = Sets.difference(previousNodeIds, newScmNodeIds); |
| 731 | + |
| 732 | + // We should only update configuration with the SCMs that are actually added / removed |
| 733 | + // If there is partial reconfiguration (e.g. one successful add and one failed add), |
| 734 | + // we want to be able to retry on the failed node reconfiguration. |
| 735 | + // If we don't handle this, the subsequent reconfiguration will not work since the node |
| 736 | + // configuration is already exists / removed. |
| 737 | + Set<String> effectiveScmNodeIds = new HashSet<>(previousNodeIds); |
| 738 | + |
| 739 | + LOG.info("Reconfiguring SCM nodes for service ID {} with new SCM nodes {} and remove SCM nodes {}", |
| 740 | + scmServiceId, scmNodesIdsToAdd, scmNodesIdsToRemove); |
| 741 | + |
| 742 | + Collection<Pair<String, InetSocketAddress>> scmToAdd = HddsUtils.getSCMAddressForDatanodes( |
| 743 | + getConf(), scmServiceId, scmNodesIdsToAdd); |
| 744 | + if (scmToAdd == null) { |
| 745 | + throw new IllegalStateException("Reconfiguration failed to get SCM address to add due to wrong configuration"); |
| 746 | + } |
| 747 | + Collection<Pair<String, InetSocketAddress>> scmToRemove = HddsUtils.getSCMAddressForDatanodes( |
| 748 | + getConf(), scmServiceId, scmNodesIdsToRemove); |
| 749 | + if (scmToRemove == null) { |
| 750 | + throw new IllegalArgumentException( |
| 751 | + "Reconfiguration failed to get SCM address to remove due to wrong configuration"); |
| 752 | + } |
| 753 | + |
| 754 | + StateContext context = datanodeStateMachine.getContext(); |
| 755 | + SCMConnectionManager connectionManager = datanodeStateMachine.getConnectionManager(); |
| 756 | + |
| 757 | + // Assert that the datanode is in RUNNING state since |
| 758 | + // 1. If the datanode state is INIT, there might be concurrent connection manager operations |
| 759 | + // that might cause unpredictable behaviors |
| 760 | + // 2. If the datanode state is SHUTDOWN, it means that datanode is shutting down and there is no need |
| 761 | + // to reconfigure the connections. |
| 762 | + if (!DatanodeStates.RUNNING.equals(context.getState())) { |
| 763 | + throw new IllegalStateException("Reconfiguration failed since the datanode the current state" + |
| 764 | + context.getState().toString() + " is not in RUNNING state"); |
| 765 | + } |
| 766 | + |
| 767 | + // Add the new SCM servers |
| 768 | + for (Pair<String, InetSocketAddress> pair : scmToAdd) { |
| 769 | + String scmNodeId = pair.getLeft(); |
| 770 | + InetSocketAddress scmAddress = pair.getRight(); |
| 771 | + if (scmAddress.isUnresolved()) { |
| 772 | + LOG.warn("Reconfiguration failed to add SCM address {} for SCM service {} since it can't " + |
| 773 | + "be resolved, skipping", scmAddress, scmServiceId); |
| 774 | + continue; |
| 775 | + } |
| 776 | + try { |
| 777 | + connectionManager.addSCMServer(scmAddress, context.getThreadNamePrefix()); |
| 778 | + context.addEndpoint(scmAddress); |
| 779 | + effectiveScmNodeIds.add(scmNodeId); |
| 780 | + LOG.info("Reconfiguration successfully add SCM address {} for SCM service {}", scmAddress, scmServiceId); |
| 781 | + } catch (IOException e) { |
| 782 | + LOG.error("Reconfiguration failed to add SCM address {} for SCM service {}", scmAddress, scmServiceId, e); |
| 783 | + } |
| 784 | + } |
| 785 | + |
| 786 | + // Remove the old SCM server |
| 787 | + for (Pair<String, InetSocketAddress> pair : scmToRemove) { |
| 788 | + String scmNodeId = pair.getLeft(); |
| 789 | + InetSocketAddress scmAddress = pair.getRight(); |
| 790 | + try { |
| 791 | + connectionManager.removeSCMServer(scmAddress); |
| 792 | + context.removeEndpoint(scmAddress); |
| 793 | + effectiveScmNodeIds.remove(scmNodeId); |
| 794 | + LOG.info("Reconfiguration successfully remove SCM address {} for SCM service {}", |
| 795 | + scmAddress, scmServiceId); |
| 796 | + } catch (IOException e) { |
| 797 | + LOG.error("Reconfiguration failed to remove SCM address {} for SCM service {}", scmAddress, scmServiceId, e); |
| 798 | + } |
| 799 | + } |
| 800 | + |
| 801 | + // Resize the executor pool size to (number of SCMs + 1 Recon) |
| 802 | + // Refer to DatanodeStateMachine#getEndPointTaskThreadPoolSize |
| 803 | + datanodeStateMachine.resizeExecutor(connectionManager.getNumOfConnections()); |
| 804 | + |
| 805 | + // TODO: In the future, we might also do some assertions on the SCM |
| 806 | + // - The SCM cannot be a leader since this causes the datanode to disappear |
| 807 | + // - The SCM should be decommissioned |
| 808 | + return String.join(",", effectiveScmNodeIds); |
| 809 | + } |
| 810 | + |
683 | 811 | /** |
684 | 812 | * Returns the initial version of the datanode. |
685 | 813 | */ |
|
0 commit comments