From e49dd752dcd7c83fdd707d2ed7dfbdded5001132 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 5 Mar 2026 14:45:49 -0800 Subject: [PATCH 01/43] ETP Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 147 ++-- transformer_engine/pytorch/module/base.py | 6 + .../module/extended_tensor_parallelism.py | 775 ++++++++++++++++++ .../pytorch/module/grouped_linear.py | 156 ++-- .../pytorch/module/layernorm_linear.py | 76 +- .../pytorch/module/layernorm_mlp.py | 10 + transformer_engine/pytorch/module/linear.py | 46 +- .../tensor/storage/nvfp4_tensor_storage.py | 12 + transformer_engine/pytorch/utils.py | 40 + 9 files changed, 1158 insertions(+), 110 deletions(-) create mode 100644 transformer_engine/pytorch/module/extended_tensor_parallelism.py diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index f269e21b8c..dc5d8b3063 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -6,11 +6,11 @@ from __future__ import annotations from collections.abc import Iterable -from contextlib import contextmanager, AbstractContextManager, ContextDecorator +from contextlib import contextmanager, AbstractContextManager, ContextDecorator, nullcontext from functools import lru_cache from dataclasses import dataclass import math -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, ContextManager, Dict, List, Optional, Tuple, Union import warnings import torch @@ -60,6 +60,14 @@ "partition_stride": 1, } + +_EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS = { + 'etp_model_parallel': False, + 'etp_partition_dim': -1, + 'etp_partition_stride': 1, +} + + _USE_REENTRANT_ACTIVATION_RECOMPUTE = True _FP8_ACTIVATION_RECOMPUTE_ENABLED = False @@ -159,6 +167,19 @@ def set_tensor_model_parallel_attributes( setattr(tensor, "partition_stride", stride) +def set_extended_tensor_parallel_attributes( + tensor: torch.Tensor, 
is_parallel: bool, dim: int, stride: int +) -> None: + """Set ps attributes to tensor.""" + # Make sure the attributes are not set. + for attribute in _EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS: + assert not hasattr(tensor, attribute) + # Set the attributes. + setattr(tensor, 'etp_model_parallel', is_parallel) + setattr(tensor, 'etp_partition_dim', dim) + setattr(tensor, 'etp_partition_stride', stride) + + @lru_cache def get_distributed_world_size(group: Optional[dist_group_type] = None) -> int: """Return world size for the distributed group.""" @@ -908,7 +929,7 @@ def fork(self, name: str = "model-parallel-rng"): def reduce_scatter_along_first_dim( - inp: torch.Tensor, tp_group: dist_group_type, async_op: bool = False + inp: torch.Tensor, tp_group: dist_group_type, async_op: bool = False, output: torch.Tensor = None ) -> Tuple[torch.Tensor, Optional[torch.distributed.Work]]: """Reduce-scatter the input tensor across model parallel group.""" world_size = get_distributed_world_size(tp_group) @@ -916,14 +937,15 @@ def reduce_scatter_along_first_dim( if world_size == 1: return inp, None - dim_size = list(inp.size()) - assert ( - dim_size[0] % world_size == 0 - ), "First dimension of the tensor should be divisible by tensor parallel size" + if output is None: + dim_size = list(inp.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" - dim_size[0] = dim_size[0] // world_size + dim_size[0] = dim_size[0] // world_size - output = torch.empty(dim_size, dtype=inp.dtype, device=torch.cuda.current_device()) + output = torch.empty(dim_size, dtype=inp.dtype, device=torch.cuda.current_device()) handle = torch.distributed.reduce_scatter_tensor( output, inp.contiguous(), group=tp_group, async_op=async_op ) @@ -1271,17 +1293,20 @@ class _NVFP4AllGatherAsyncHandle: async_handle: torch.distributed.Work _synchronized: bool = False - def wait(self) -> None: - """Wait for the async operation to 
complete and post-process the tensor.""" - if self._synchronized: - return - self.async_handle.wait() + def post_process_nvfp4_gather(self) -> None: _post_process_nvfp4_gather( self.output, self.columnwise_data_interleaved, self.columnwise_scale_inv_interleaved, self.world_size, ) + + def wait(self) -> None: + """Wait for the async operation to complete and post-process the tensor.""" + if self._synchronized: + return + self.async_handle.wait() + self.post_process_nvfp4_gather() self._synchronized = True @@ -1292,6 +1317,8 @@ def _all_gather_nvfp4( async_op: bool = False, quantizer: NVFP4Quantizer, out_shape: Optional[list[int]] = None, + output_tensor = None, + grouped = False, ) -> tuple[NVFP4TensorStorage, Optional[torch.distributed.Work]]: """All-gather NVFP4 tensor along first dimension.""" @@ -1348,6 +1375,12 @@ def _all_gather_nvfp4( out = quantizer(out) return out, None + # Construct NVFP4 output tensor + if output_tensor is not None: + out = output_tensor + else: + out = quantizer.make_empty(out_shape, dtype=dtype, device=device) + # Cast input tensor to NVFP4 with required data if not isinstance(inp, NVFP4TensorStorage): inp = quantizer(inp) @@ -1360,17 +1393,19 @@ def _all_gather_nvfp4( ) inp = quantizer(inp.dequantize()) - # Construct NVFP4 output tensor - out = quantizer.make_empty(out_shape, dtype=dtype, device=device) - - # Coalesce NCCL collectives for gathering data and scale inverses. - with torch.distributed._coalescing_manager( - group=process_group, - device=device, - async_ops=async_op, - ) as gather_coalescing_manager: + if not grouped: + # Coalesce NCCL collectives for gathering data and scale inverses. 
+ gather_coalescing_manager = torch.distributed._coalescing_manager( + group=process_group, + device=device, + async_ops=async_op, + ) + else: + gather_coalescing_manager = nullcontext() + with gather_coalescing_manager as coalesced_handle: # Gather NVFP4 data for row-wise usage + out_columnwise_data = None if quantizer.rowwise_usage: # Remove padding from NVFP4 scale-inverses @@ -1446,15 +1481,19 @@ def _all_gather_nvfp4( # Transfer amax to output. out._amax_columnwise = inp._amax_columnwise - handle = gather_coalescing_manager if async_op else None + handle = coalesced_handle if async_op else None # Fixes interleaved data for transposed tensor/scale inv and pads scale inv if needed. - if async_op and quantizer.columnwise_usage: - handle = _NVFP4AllGatherAsyncHandle( - out, out_columnwise_data, out_scale_inv, world_size, handle - ) - elif quantizer.columnwise_usage: - _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) + if quantizer.columnwise_usage: + if async_op or grouped: + # Defer post-processing: either the async op hasn't completed yet, or an + # external coalescing manager owns the NCCL ops and hasn't flushed them. 
+ inner_handle = handle if async_op else None + handle = _NVFP4AllGatherAsyncHandle( + out, out_columnwise_data, out_scale_inv, world_size, inner_handle + ) + else: + _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) return out, handle @@ -1466,6 +1505,8 @@ def _all_gather_mxfp8( async_op: bool = False, quantizer: MXFP8Quantizer, out_shape: Optional[list[int]] = None, + output_tensor: torch.Tensor = None, + grouped: bool = False, ) -> tuple[MXFP8TensorStorage, Optional[torch.distributed.Work]]: """All-gather MXFP8 tensor along first dimension.""" @@ -1528,15 +1569,22 @@ def _all_gather_mxfp8( inp = quantizer(inp.dequantize()) # Construct MXFP8 output tensor - out = quantizer.make_empty(out_shape, dtype=dtype, device=device) + if output_tensor is not None: + out = output_tensor + else: + out = quantizer.make_empty(out_shape, dtype=dtype, device=device) - # Coalesce NCCL collectives - with torch.distributed._coalescing_manager( - group=process_group, - device=device, - async_ops=async_op, - ) as coalescing_manager: + if not grouped: + # Coalesce NCCL collectives for gathering data and scale inverses. + gather_coalescing_manager = torch.distributed._coalescing_manager( + group=process_group, + device=device, + async_ops=async_op, + ) + else: + gather_coalescing_manager = nullcontext() + with gather_coalescing_manager as coalesced_handle: # Gather MXFP8 data for row-wise usage if quantizer.rowwise_usage: @@ -1583,7 +1631,7 @@ def _all_gather_mxfp8( group=process_group, ) - handle = coalescing_manager if async_op else None + handle = coalesced_handle if async_op else None return out, handle @@ -1592,6 +1640,8 @@ def gather_along_first_dim( process_group: dist_group_type, async_op: bool = False, quantizer: Optional[Quantizer] = None, + output_tensor: torch.Tensor = None, + grouped: bool = False, ) -> tuple[torch.Tensor, Optional[torch.distributed.Work]]: """ All-gather tensors and concatenate along first dimension. 
@@ -1679,6 +1729,8 @@ def gather_along_first_dim( async_op=async_op, quantizer=quantizer, out_shape=out_shape, + output_tensor=output_tensor, + grouped=grouped, ) # NVFP4 case @@ -1690,6 +1742,8 @@ def gather_along_first_dim( async_op=async_op, quantizer=quantizer, out_shape=out_shape, + output_tensor=output_tensor, + grouped=grouped, ) # High-precision communication for quantized tensors @@ -1719,19 +1773,20 @@ def gather_along_first_dim( inp = inp.dequantize() # Communication for plain PyTorch tensors - out = torch.empty( - out_shape, - dtype=inp.dtype, - device=inp.device, - memory_format=torch.contiguous_format, - ) + if output_tensor is None: + output_tensor = torch.empty( + out_shape, + dtype=inp.dtype, + device=inp.device, + memory_format=torch.contiguous_format, + ) handle = torch.distributed.all_gather_into_tensor( - out, + output_tensor, inp.contiguous(), group=process_group, async_op=async_op, ) - return out, handle + return output_tensor, handle # Global cache to store symmetric memory tensors diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 9c21141a39..b565a40f87 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -631,6 +631,7 @@ def __init__(self, name: Optional[str] = None) -> None: self.activation_dtype: Optional[torch.dtype] = None self.wgrad_accumulation_and_reduce_hooks = [] self.wgrad_store = None + self.etp_size = 1 if not TEDebugState.debug_enabled: TEDebugState.initialize() @@ -956,6 +957,8 @@ def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> N self.fast_setattr("tp_group", tp_group) self.fast_setattr("tp_group_initialized", True) + + def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: """returns the FP8 weights.""" fp8_params = [] @@ -1242,6 +1245,9 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None: for name, param in self.named_parameters(recurse=False): # Check if 
parameter is a DTensor (FSDP2) or regular tensor is_dtensor = isinstance(param, DTensor) + from .extended_tensor_parallelism import ETPShardedParam + is_etp = isinstance(param, ETPShardedParam) + dtensor_param = param if is_dtensor else None # Need to update/quantize local tensor in case of DTensor param = param._local_tensor if is_dtensor else param diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py new file mode 100644 index 0000000000..cb4e058418 --- /dev/null +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -0,0 +1,775 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from collections import defaultdict +from typing import Dict, List +from enum import Enum +from dataclasses import dataclass +import torch + +from ..distributed import ( + gather_along_first_dim, + reduce_scatter_along_first_dim, + _NVFP4AllGatherAsyncHandle +) +from ..quantized_tensor import QuantizedTensor +from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage +from ..utils import nvtx_range_pop, nvtx_range_push +from .base import get_dummy_wgrad + +import transformer_engine_torch as tex + + +class ETPWeightState(Enum): + NONE = "NONE" # Sharded, no pending operation + ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress + ASYNC_DONE = "ASYNC_DONE" # Async all-gather complete, result in cache + +_STATE_TRANSITIONS = { + ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT}, + ETPWeightState.ASYNC_WAIT: {ETPWeightState.ASYNC_DONE}, + ETPWeightState.ASYNC_DONE: {ETPWeightState.NONE}, +} + + +# Global AG Prefetching Buffer for ETP. 
+_ALL_GATHER_BUFFER = None + + +@dataclass +class ETPConfig: + """Global configuration for Extended Tensor Parallelism.""" + pad_for_alignment: int = 16 + weight_prefetch: bool = True + +ETP_CONFIG = ETPConfig() + +def update_config(**kwargs): + """Update the global ETP configuration.""" + for key, value in kwargs.items(): + if not hasattr(ETP_CONFIG, key): + raise ValueError(f"Unknown ETP config option: {key}") + setattr(ETP_CONFIG, key, value) + + +def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): + """Shard and re-register all parameters of a module using ETP weight sharding.""" + if etp_group.size() == 1: + return + + etp_size = etp_group.size() + etp_rank = etp_group.rank() + + for idx, name in enumerate(weight_names): + param = getattr(module, name, None) + if param is None: + continue + + # delete the original parameter, which will be replaced by an ETP sharded one + delattr(module, name) + + if ETP_CONFIG.pad_for_alignment > 0: + # Ensure each shard's dim0 is a multiple of 16 for quantization (NVFP4/FP8) by padding + # the last rank such that the total padded length of dim0 is a multiple of ETP size * 16 + alignment = ETP_CONFIG.pad_for_alignment * etp_size + tensor = param.data + dim0 = tensor.shape[0] + pad_length = (alignment - dim0 % alignment) % alignment if alignment > 0 else 0 + padded_dim0 = dim0 + pad_length + is_padded_last_rank = pad_length > 0 and etp_rank == etp_size - 1 + # Create the ETP sharded param, pass a clone of the shard so that the original unsharded + # buffer may be deallocated + shard_size = padded_dim0 // etp_size + start_idx = etp_rank * shard_size + end_idx = min((etp_rank + 1) * shard_size, tensor.shape[0]) + shard = tensor[start_idx: end_idx] + etp_shard = ETPShardedParam(shard.clone()) + # finally, set attributes + etp_shard.pad_length = pad_length + etp_shard.is_padded_last_rank = is_padded_last_rank + else: + shard_size = tensor.shape[0] // etp_group.size() + shard = tensor[etp_rank * shard_size: 
(etp_rank + 1) * shard_size] + etp_shard = ETPShardedParam(shard.clone()) + + if is_grouped: + etp_shard.expert_idx = idx + etp_shard.is_routed_expert = True + etp_shard.group = etp_group + etp_shard.ps_size = etp_size + # register the newly sharded param back to the module + module._parameters[name] = etp_shard + + if is_grouped: + allweights = [getattr(module, name) for name in weight_names] + allweights[0].weight_list = allweights + + +class ETPShardHandle: + + def __init__(self, handle, etp_shards: list): + self.handle = handle + self.etp_shards = etp_shards + + def wait(self): + if self.handle is not None: + self.handle.wait() + for w in self.etp_shards: + w._set_state(ETPWeightState.ASYNC_DONE) + + +class ETPShardedParam(torch.nn.Parameter): + + _pending_rs_weight = None + _first_weight_flag = True + _last_weight = None + + @staticmethod + def __new__(cls, tensor, *args, **kwargs): + requires_grad = kwargs.get('requires_grad', True) + return super(ETPShardedParam, cls).__new__(cls, tensor, requires_grad=requires_grad) + + def __init__(self, x, *args, **kwargs): + super().__init__() + + self.state = ETPWeightState.NONE + self._cache_ticket = None + self._prefetch_handle = None + self._grad_accum_node = None + self._grad_accum_hook = None + # Quantization + self._quantizer = None + self.did_cast_to_low_precision = False + self.quantized = None + # Prefetching linked list + self.is_first_weight = False + self.next_w = None + self.prev_w = None + # Grouped gemm + self.is_routed_expert = False + self.expert_idx = None + self.group = None + self.weight_list = None + # Reduce-scatter state (set during wgrad_reduce_scatter) + self.wgrad_rs = None + self.wgrad_rs_handle = None + self.fuse_wgrad_accumulation = False + # Padding + self.is_padded_last_rank = False + self.pad_length = 0 + + def setup(self, weight_quantizer=None): + """Set quantizer and create quantized shard.""" + + if self._quantizer is None: + def _configure_quantizer(q, group): + q = q.copy() + 
q.with_amax_reduction = True + q.amax_reduction_group = group + q.internal = False + q.optimize_for_gemm = True + return q + + weights = self.weight_list if self.is_routed_expert and self.weight_list is not None else [self] + for quantizer, weight in zip(weight_quantizer, weights): + if quantizer is None: + continue + + weight._quantizer = _configure_quantizer(quantizer, weight.group) + weight.quantized = weight._quantizer.quantize(weight.get_padded_shard()) + weight.quantized.is_routed_expert = getattr(weight, 'is_routed_expert', False) + + @property + def _weights(self): + """Return the list of individual weight shards (self for non-routed, weight_list for routed).""" + weights = self.weight_list if self.is_routed_expert else [self] + # Safety: all weights must be in the same state. + assert all(w.state == weights[0].state for w in weights) + return list(weights) + + @property + def _unsharded_shape_padded(self): + out_shape = list(self.size()) + if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: + out_shape[0] = (out_shape[0]+ self.pad_length) * self.group.size() + else: + out_shape[0] = out_shape[0] * self.group.size() + return tuple(out_shape) + + @property + def _unsharded_shape(self): + out_shape = list(self._unsharded_shape_padded) + out_shape[0] -= self.pad_length + return tuple(out_shape) + + def get_padded_shard(self): + if self.pad_length > 0 and self.is_padded_last_rank: + return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) + return self + + def _set_state(self, new_state: ETPWeightState): + """Validate and update state machine transition.""" + assert new_state in _STATE_TRANSITIONS[self.state], \ + f"Invalid state transition: {self.state} -> {new_state}" + self.state = new_state + + def _get_cache_key(self, dtype, fwd: bool) -> tuple: + """Build cache key using output shape + dtype. + + Weights with matching gathered shape and dtype share a buffer. 
+ For expert weights gathered in parallel, self.expert_idx distinguishes them so + each gets a distinct buffer, while same-indexed experts across layers share. + """ + + if not isinstance(dtype, torch.dtype): + return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx) + return (self._unsharded_shape_padded, dtype, self.expert_idx) + + def _quantize_if_needed(self, skip_weight_cast=False, cast_noop_flag=None): + """Re-quantize sharded weight into existing buffer. Returns quantized weight or self.""" + if self._quantizer is None: + self.did_cast_to_low_precision = False + return self + + self._quantizer.set_usage(rowwise=True, columnwise=True) + if skip_weight_cast is False or cast_noop_flag is not None: + tex.quantize( + tensor=self.get_padded_shard(), + quantizer=self._quantizer, + output=self.quantized, + noop=cast_noop_flag, + ) + self.did_cast_to_low_precision = True + + return self.quantized + + def _strip_padding(self, tensor): + if self.pad_length == 0: + return tensor + + if isinstance(tensor, QuantizedTensor): + assert isinstance(tensor, (NVFP4TensorStorage, MXFP8TensorStorage)), \ + f"Unsupported quantized tensor type for ETP padding: {type(tensor)}" + + metadata = tensor.get_metadata() + if metadata.get("rowwise_data") is not None: + metadata["rowwise_data"] = metadata["rowwise_data"][:-self.pad_length] + if metadata.get("columnwise_data") is not None: + if isinstance(tensor, NVFP4TensorStorage): + # NVFP4 transposes columnwise and packs 2 values per byte + metadata["columnwise_data"] = metadata["columnwise_data"][ + ..., :-self.pad_length // 2 + ].contiguous() + else: + # MXFP8 columnwise is not transposed, strip first dim + metadata["columnwise_data"] = metadata["columnwise_data"][ + :-self.pad_length + ] + return type(tensor)(**metadata, shape=self._unsharded_shape, dtype=torch.bfloat16) + else: + return tensor[:-self.pad_length] + + def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nvtx_label=None): + 
"""Quantize (if needed) and all-gather weight. Returns (weight_total, handle).""" + weights = self._weights + + # 1. Transition state for async gathers. + if async_op: + for w in weights: + w._set_state(ETPWeightState.ASYNC_WAIT) + + # 2. Prepare: quantize, set usage direction. + for w in weights: + w._quantize_if_needed(skip_weight_cast, cast_noop_flag) + if w.did_cast_to_low_precision: + w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) + + # 3. Build gather inputs. + quantizers = [w._quantizer for w in weights] + if weights[0].did_cast_to_low_precision: + gather_weights = [w.quantized for w in weights] + else: + gather_weights = list(w.get_padded_shard() for w in weights) + + # 4. Cache checkout (async only — sync gathers don't need pooled buffers). + if async_op: + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + out_buffers = [] + for p, dt in zip(weights, dtypes): + assert p._cache_ticket is None, \ + f"Cache ticket leak: weight {id(p)} still has unreturned ticket {p._cache_ticket}" + buf, p._cache_ticket = get_global_ETP_cache().checkout(p, dt, fwd) + out_buffers.append(buf) + else: + out_buffers = None + + # 5. Communicate. 
+ etp_group = weights[0].group + if out_buffers is not None and len(gather_weights) > 1: + assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ + "Duplicate output buffers in batched all-gather — experts need distinct cache keys" + + if len(gather_weights) > 1: + nvtx_range_push(f"{nvtx_label}.batched_etp_ag") + results, handle = grouped_gather_along_first_dim( + gather_weights, etp_group, + async_op=async_op, + quantizers=quantizers, + output_tensors=out_buffers, + ) + nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + else: + nvtx_range_push(f"{nvtx_label}.etp_ag") + weight_total, handle = gather_along_first_dim( + gather_weights[0], etp_group, + quantizer=quantizers[0], + async_op=async_op, + output_tensor=out_buffers[0] if out_buffers is not None else None, + ) + nvtx_range_pop(f"{nvtx_label}.etp_ag") + results = [weight_total] + + result = results if self.is_routed_expert else results[0] + + # 6. Wrap handle. + if async_op: + handle = ETPShardHandle(handle, weights) + else: + handle = None + + return result, handle + + def _get_unsharded(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + """Get unsharded (all-gathered) weight tensor(s). + + Handles both routed experts (returns list) and single weights (returns tensor). + Supports sync gather, async prefetch wait, and cache retrieval. 
+ """ + weights = self._weights + + # Wait for async prefetch if in progress + if weights[0].state == ETPWeightState.ASYNC_WAIT: + self._prefetch_handle.wait() + self._prefetch_handle = None + + if weights[0].state == ETPWeightState.NONE: + # Synchronous all-gather (no cache — buffers allocated inline) + result, _ = self._all_gather_weight( + async_op=False, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, + ) + result = result if self.is_routed_expert else [result] + + elif weights[0].state == ETPWeightState.ASYNC_DONE: + # Retrieve prefetched results from cache + cache = get_global_ETP_cache() + result = [] + for w in weights: + buf = cache.get(w._cache_ticket) + w._cache_ticket = None + # Post-gather quantization safety net: weight was prefetched + # before weight_quantizer was set + if not w.did_cast_to_low_precision: + if w._quantizer is not None and not isinstance(buf, QuantizedTensor): + w._quantize_if_needed() + buf = w._quantizer.quantize(buf) + w._set_state(ETPWeightState.NONE) + result.append(buf) + else: + assert False, f"Unexpected state: {weights[0].state}" + + result = [self._strip_padding(r) for r in result] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, weights)] + return result if self.is_routed_expert else result[0] + + def all_gather_and_prefetch_bwd(self, nvtx_label=None): + """ + Backward variant: get current weight (from cache if prefetched, else + sync gather) and async-prefetch prev_w. + + Safe thanks to the coat-check cache: get() returns the current buffer + to the pool, and the prefetch's checkout() will allocate a separate + buffer if the pool is empty (i.e. the current buffer is still live + via the caller's tensor reference). 
+ + Returns: + weight_total + """ + result = self._get_unsharded(fwd=False, skip_weight_cast=True) + + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) + self.prev_w._prefetch_handle = handle + return result + + def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): + """Batched backward all-gather + prefetch. Wrapper around all_gather_and_prefetch_bwd.""" + return self.all_gather_and_prefetch_bwd(nvtx_label=nvtx_label) + + def all_gather_and_prefetch( + self, + fwd: bool = True, + skip_weight_cast: bool = False, + cast_noop_flag: torch.Tensor = None, + nvtx_label: str = None, + ): + """ + All-gather current weight and async-prefetch the next weight. + + Returns: + weight_total + """ + # Lazy population of linked list: link previous weight to current weight + cls = type(self) + if cls._first_weight_flag: + self.is_first_weight = True + cls._first_weight_flag = False + + if self.is_first_weight: + cls._last_weight = None + + if cls._last_weight is not None and cls._last_weight.next_w is None: + print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + cls._last_weight = self + + result = self._get_unsharded(fwd, skip_weight_cast=skip_weight_cast, cast_noop_flag=cast_noop_flag) + + if ETP_CONFIG.weight_prefetch and self.next_w is not None: + target = self.next_w + _, handle = target._all_gather_weight( + async_op=True, skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, fwd=fwd, nvtx_label=nvtx_label, + ) + target._prefetch_handle = handle + return result + + def batched_all_gather_and_prefetch(self, **kwargs): + """Batched all-gather + prefetch for expert weights. 
Wrapper around all_gather_and_prefetch.""" + return self.all_gather_and_prefetch(**kwargs) + + def get_wgrad_tensor(self): + return torch.empty( + self._unsharded_shape, + dtype=self.main_grad.dtype, + device=self.device, + requires_grad=False, + ) + + def register_grad_accum_hook(self, grad_accum_node, hook): + self._grad_accum_node = grad_accum_node + self._grad_accum_hook = hook + + @classmethod + def _resolve_pending_rs(cls, expected_next): + """Finish any pending reduce-scatter from a previous weight.""" + if cls._pending_rs_weight is not None: + assert cls._pending_rs_weight is expected_next + cls._pending_rs_weight.finish_wgrad_reduce_scatter() + cls._pending_rs_weight = None + + @staticmethod + def _apply_fused_wgrad(param, wgrad_rs): + """Apply fuse_wgrad_accumulation logic to a single param and return a dummy grad.""" + + # the last rank in the etp group pads the param, so need to remove the padding here + if param.group.rank() == param.group.size() - 1: + wgrad_rs = param._strip_padding(wgrad_rs) + + param.main_grad.add_(wgrad_rs) + # Handle mcore grad accum fusion + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + param.grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + if param._grad_accum_hook is not None: + param._grad_accum_hook(param) + + def finish_wgrad_reduce_scatter(self): + if self.wgrad_rs_handle is not None: + self.wgrad_rs_handle.wait() + self.wgrad_rs_handle = None + + for param, wgrad_rs in zip(self._weights, self.wgrad_rs): + if self.fuse_wgrad_accumulation: + self._apply_fused_wgrad(param, wgrad_rs) + else: + if param.is_padded_last_rank: + wgrad_rs = param._strip_padding(wgrad_rs) + param.grad = wgrad_rs + # Finally call the grad accum node + param._grad_accum_node(param.grad) + + def _reduce_scatter(self, wgrads, async_op): + """Reduce-scatter one or more wgrads. Returns (outputs, handle). + + Single tensor: plain reduce-scatter (no coalescing). 
+ Multiple tensors: coalesced reduce-scatter. + """ + + if self.pad_length > 0: + wgrads = [torch.nn.functional.pad(w, (0, 0, 0, self.pad_length)) for w in wgrads] + + if len(wgrads) == 1: + out, handle = reduce_scatter_along_first_dim( + wgrads[0], self.group, async_op=async_op + ) + return [out], handle + else: + outputs = [] + with torch.distributed._coalescing_manager( + group=self.group, + device=wgrads[0].device, + async_ops=async_op, + ) as cm: + for tensor in wgrads: + out, _ = reduce_scatter_along_first_dim(tensor, self.group) + outputs.append(out) + return outputs, cm if async_op else None + + def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): + """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. + + Accepts a single tensor (non-routed) or list of tensors (routed experts). + + Returns: + Single tensor or list for sync (last weight) — backward should return this. + None or tuple of Nones for async — backward should return this. + """ + batched = isinstance(wgrad, (list, tuple)) + wgrads = list(wgrad) if batched else [wgrad] + weights = self._weights + + self._resolve_pending_rs(self.next_w) + + if self.prev_w is None: + # Sync reduce-scatter (last weight in chain) + sharded, _ = self._reduce_scatter(wgrads, async_op=False) + if fuse_wgrad_accumulation: + [self._apply_fused_wgrad(p, g) for p, g in zip(weights, sharded)] + result = [None] * len(weights) + else: + result = [ + p._strip_padding(g) if p.is_padded_last_rank else g + for p, g in zip(weights, sharded) + ] + return result if batched else result[0] + else: + # Async reduce-scatter (not last weight — deferred finish) + self.fuse_wgrad_accumulation = fuse_wgrad_accumulation + self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) + type(self)._pending_rs_weight = self + return tuple([None] * len(wgrads)) if batched else None + + def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): + """Batched version of 
wgrad_reduce_scatter.""" + return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) + + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.detach: + with torch._C.DisableTorchFunctionSubclass(): + # Perform the raw detach + result = func(*args, **kwargs) + # Re-wrap it in your subclass so PyTorch is happy + return result.as_subclass(type(self)) + + # 2. For everything else (add, mul, etc.), be transparent/decay. + with torch._C.DisableTorchFunctionSubclass(): + return func(*args, **kwargs) + + +def print_rank_0(message, rank=None): + """If distributed is initialized or rank is specified, print only on rank 0.""" + if rank is not None: + if rank == 0: + print(message, flush=True) + elif torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + +class ETPWeightCache: + """ + Buffers are pooled by cache key (shape + dtype). Two operations: + + - ``checkout(param, dtype, fwd)`` → ``(buffer, ticket)`` + Takes a buffer from the pool (or allocates). Ticket is ``id(buf)``. + - ``get(ticket, param, dtype, fwd)`` → ``buffer`` + Retrieves the buffer, asserts key matches, returns it to the pool, + and invalidates the ticket. + + Every checkout is paired with exactly one get (1:1). + Two weights sharing the same cache key get distinct buffers if one + is still checked out, preventing aliasing. + """ + + # Bytes per element for known dtypes (used for logging). 
+ _BYTES_PER_ELEMENT = { + torch.bfloat16: 2, torch.float16: 2, torch.float32: 4, + tex.DType.kFloat4E2M1: 0.5, + tex.DType.kFloat8E4M3: 1, + } + + def __init__(self): + self._pool: Dict[tuple, List[torch.Tensor]] = defaultdict(list) + self._tickets: Dict[int, tuple] = {} # ticket → (key, buf) + self._free_tickets: list[int] = [] # recycled ticket IDs + self._max_ticket: int = 0 # high-water mark for ticket allocation + self._total_bytes: int = 0 # running total of allocated bytes + + @staticmethod + def _buf_bytes(shape, dtype) -> int: + """Estimate buffer size in bytes.""" + numel = 1 + for d in shape: + numel *= d + bpe = ETPWeightCache._BYTES_PER_ELEMENT.get(dtype, None) + return numel * bpe + + def _allocate_buffer(self, param: 'ETPShardedParam', dtype) -> torch.Tensor: + out_shape = param._unsharded_shape_padded + if not isinstance(dtype, torch.dtype): + quantizer = param._quantizer + assert quantizer is not None + assert quantizer.rowwise_usage ^ quantizer.columnwise_usage + + device = torch.cuda.current_device() + buf = param._quantizer.make_empty(out_shape, dtype=torch.bfloat16, device=device) + else: + buf = torch.empty( + out_shape, dtype=dtype, device=param.device, memory_format=torch.contiguous_format + ) + buf_bytes = self._buf_bytes(out_shape, dtype) + self._total_bytes += buf_bytes + print_rank_0( + f"[ETP Cache] +{buf_bytes / 1024**2:.1f} MB (shape={out_shape}, dtype={dtype}) " + f"total={self._total_bytes / 1024**2:.1f} MB" + ) + return buf + + def checkout(self, param: 'ETPShardedParam', dtype, fwd: bool): + """Get a buffer for all-gather output. Returns (buffer, ticket). + + Ticket IDs are recycled so they stay bounded. + If all buffers for this key are checked out, allocates a new one. 
+ """ + key = param._get_cache_key(dtype, fwd) + pool = self._pool[key] + buf = pool.pop() if pool else self._allocate_buffer(param, dtype) + + if self._free_tickets: + ticket = self._free_tickets.pop() + else: + ticket = self._max_ticket + self._max_ticket += 1 + self._tickets[ticket] = (key, buf) + return buf, ticket + + def get(self, ticket: int) -> torch.Tensor: + """Retrieve buffer by ticket and return it to the pool. + + This combines the old get + ticket_return into a single call. + After this call the ticket is invalidated and the buffer is + available for future checkouts. + """ + assert ticket in self._tickets, f"Invalid ticket: {ticket}" + key, buf = self._tickets.pop(ticket) + self._free_tickets.append(ticket) + self._pool[key].append(buf) + return buf + + +def get_global_ETP_cache() -> ETPWeightCache: + """Get or lazily create the global cache instance.""" + global _ALL_GATHER_BUFFER + if _ALL_GATHER_BUFFER is None: + _ALL_GATHER_BUFFER = ETPWeightCache() + return _ALL_GATHER_BUFFER + + +@dataclass +class BatchedNVFP4AllGatherAsyncHandle: + """Handle for batched asynchronous NVFP4 all-gathers.""" + output_handles: List[_NVFP4AllGatherAsyncHandle] + outer_async_handle: torch.distributed.Work + _synchronized: bool = False + + def wait(self) -> None: + """Wait for the async operation to complete and post-process the tensor.""" + if self._synchronized: + return + self.outer_async_handle.wait() + # Fixes interleaved data for transposed tensor/scale inv and pads scale inv if needed. 
+ for output_handle in self.output_handles: + if output_handle is not None: + assert output_handle.async_handle is None + output_handle.post_process_nvfp4_gather() + # release any tensor references just in case + output_handle.output = None + output_handle.columnwise_data_interleaved = None + output_handle.columnwise_scale_inv_interleaved = None + + self._synchronized = True + + +def grouped_gather_along_first_dim( + weights: list, + process_group, + async_op: bool = False, + quantizers: list = None, + output_tensors: list = None, +): + """ + All-gather multiple weights in a single coalesced operation. + + Handles NVFP4 post-processing for both sync and async paths. + """ + # Determine device from first weight. + inp = weights[0] + if isinstance(inp, NVFP4TensorStorage): + device = ( + inp._rowwise_data.device if inp._rowwise_data is not None + else inp._columnwise_data.device + ) + else: + device = inp.device + + weights_all = [] + weight_handles = [] + with torch.distributed._coalescing_manager( + group=process_group, device=device, async_ops=async_op, + ) as gather_coalescing_manager: + for i, weight in enumerate(weights): + weight_all, weight_handle = gather_along_first_dim( + weight, process_group, + quantizer=quantizers[i], + output_tensor=output_tensors[i] if output_tensors is not None else None, + grouped=True, + ) + weights_all.append(weight_all) + weight_handles.append(weight_handle) + + if async_op: + handle = gather_coalescing_manager + if ( + quantizers is not None + and getattr(quantizers[0], "columnwise_usage", False) + ): + handle = BatchedNVFP4AllGatherAsyncHandle(weight_handles, handle) + else: + for wh in weight_handles: + if isinstance(wh, _NVFP4AllGatherAsyncHandle): + wh.post_process_nvfp4_gather() + handle = None + + return weights_all, handle diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index f3e7b57cf1..58c4e3b130 100644 --- 
a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -5,6 +5,7 @@ """GroupedLinear API""" from typing import Union, Optional, Callable, Tuple, List from itertools import chain +import traceback import warnings import functools @@ -22,6 +23,7 @@ _2X_ACC_WGRAD, ) from ._common import WeightGradStore +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( divide, @@ -32,6 +34,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, is_fp8_activation_recompute_enabled, @@ -43,9 +46,9 @@ from ..constants import GemmParallelModes, dist_group_type from ..jit import no_torch_dynamo from ..cpu_offload import is_cpu_offload_enabled, mark_not_offload, start_offload - from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer from ..quantized_tensor import ( + QuantizedTensor, QuantizedTensorStorage, Quantizer, prepare_for_saving, @@ -96,6 +99,8 @@ def forward( skip_fp8_weight_update, save_original_input, debug, + etp_size, + recompute, ) = non_tensor_args num_gemms = len(m_splits) @@ -104,6 +109,14 @@ def forward( device = inp.device weight_requires_grad = weights[0].requires_grad + if etp_size > 1: + weights_etp_sharded = weights + weights = weights[0].batched_all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + # Configure quantizers if save_original_input and isinstance(input_quantizers[0], Float8Quantizer): raise ValueError("DelayedScaling recipe is not supported with save_original_input") @@ -165,7 +178,7 @@ def forward( start_offload(*inputmats) # Initialize weights - weights_fp8: list + weights_fp8: list if fp8 or debug: # FP8 cast to workspace buffer weights_fp8 = [] @@ -180,7 +193,6 @@ def forward( 
workspace_dtype=activation_dtype, ) weights_fp8.append(weight_fp8) - else: weights_fp8 = [cast_if_needed(weight, activation_dtype) for weight in weights] @@ -257,12 +269,20 @@ def forward( for weight in weights: ctx.weight_objects.append(weight) - tensors_to_save, tensor_objects = prepare_for_saving( - *inputmats, - *weights_fp8, - *weights, - *biases, - ) + if etp_size == 1: + tensors_to_save, tensor_objects = prepare_for_saving( + *inputmats, + *weights_fp8, + *weights, + *biases, + ) + else: + tensors_to_save, tensor_objects = prepare_for_saving( + *inputmats, + *weights_etp_sharded, + *biases, + ) + ctx.save_for_backward(*tensors_to_save) ctx.tensor_objects = tensor_objects @@ -278,6 +298,8 @@ def forward( if hasattr(weights[0], "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_funcs = [weights[i].get_main_grad for i in range(num_gemms)] + elif etp_size > 1: + ctx.main_grad_funcs = [weights_etp_sharded[i].get_wgrad_tensor for i in range(num_gemms)] else: ctx.main_grad_funcs = [ lambda j=i: weights[j].main_grad for i in range(num_gemms) @@ -308,6 +330,8 @@ def forward( ctx.debug = debug ctx.save_original_input = save_original_input ctx.input_quantizers = input_quantizers + ctx.etp_size = etp_size + ctx.recompute = recompute # [*, in_features] -> [*, out_features] except first dimension changes for SP return out.view(-1, *inp.shape[1:-1], out.shape[-1]) @@ -318,11 +342,19 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], with get_nvtx_range_context("_GroupedLinear_backward"): saved_tensors = restore_from_saved(ctx.tensor_objects, ctx.saved_tensors) N = ctx.num_gemms - inputmats = saved_tensors[:N] - weights = saved_tensors[N : 2 * N] - origin_weights = saved_tensors[2 * N : 3 * N] - biases = saved_tensors[3 * N : 4 * N] - main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] + + if ctx.etp_size == 1: + inputmats = saved_tensors[:N] + weights = saved_tensors[N : 2 * N] + 
origin_weights = saved_tensors[2 * N : 3 * N] + biases = saved_tensors[3 * N : 4 * N] + else: + inputmats = saved_tensors[:N] + origin_weights = saved_tensors[N : 2 * N] + biases = saved_tensors[2 * N : 3 * N] + + if ctx.fuse_wgrad_accumulation: + main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: @@ -330,10 +362,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None - if ctx.fuse_wgrad_accumulation: - for i in range(N): - origin_weights[i].main_grad = main_grads[i] - # Preprocess grad output grad_output_view = grad_output.contiguous().view(-1, grad_output.shape[-1]) grad_output = [None] * ctx.num_gemms @@ -383,13 +411,20 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], ctx.m_splits, ) - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) else: accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation + if ctx.etp_size > 1: + weights = origin_weights[0].batched_all_gather_and_prefetch_bwd( + nvtx_label="te._GroupedLinear.bwd", + ) + if ctx.requires_dgrad: dgrad_gemm_use_split_accumulator = _2X_ACC_DGRAD if ctx.fp8 or ctx.debug: @@ -421,7 +456,32 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) + def handle_custom_ddp_from_mcore(weight, wgrad): + if ctx.weights_requires_grad: + # Handle custom DDP from mcore. 
+ if ctx.fuse_wgrad_accumulation and hasattr( + weight, "grad_added_to_main_grad" + ): + weight.grad_added_to_main_grad = True + if getattr(weight, "zero_out_wgrad", False): + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + zero=True, + ) + else: + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + ) + elif ctx.fuse_wgrad_accumulation: + wgrad = None + else: + wgrad = None + return wgrad + if ctx.weights_requires_grad: + """Wgrad computation.""" wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: recipe = ctx.fp8_recipe @@ -429,6 +489,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], wgrad_gemm_use_split_accumulator = ( recipe.fp8_gemm_wgrad.use_split_accumulator ) + if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: @@ -476,7 +537,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=wgrad_gemm_use_split_accumulator, accumulate=( accumulate_wgrad_into_param_main_grad - if not getattr(weights[0], "overwrite_main_grad", False) + if ctx.etp_size == 1 and not getattr(weights[0], "overwrite_main_grad", False) else False ), ) @@ -494,36 +555,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Deallocate input tensor clear_tensor_data(*inputmats) - def handle_custom_ddp_from_mcore(weight, wgrad): - if ctx.weights_requires_grad: - # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr( - weight, "grad_added_to_main_grad" - ): - weight.grad_added_to_main_grad = True - if getattr(weight, "zero_out_wgrad", False): - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - zero=True, - ) - else: - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - ) - elif ctx.fuse_wgrad_accumulation: - wgrad = None - else: - wgrad = None - return wgrad - + if ctx.etp_size > 1: + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation: wgrad_list = [ - handle_custom_ddp_from_mcore(weight, wgrad) - for weight, wgrad in zip(origin_weights, wgrad_list) - ] - else: - wgrad_list = [None] * ctx.num_gemms + handle_custom_ddp_from_mcore(weight, wgrad) + for weight, wgrad in zip(origin_weights, wgrad_list) + ] if not ctx.use_bias or ( ctx.wgrad_store is not None @@ -630,6 +668,8 @@ def __init__( save_original_input: bool = False, single_grouped_parameter: bool = False, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, + recompute: bool = False, ) -> None: super().__init__(name) @@ -682,6 +722,13 @@ def __init__( "Because the TP communication is handled outside of this module." ) + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + assert tp_size == 1, f"TODO(shiqingf): ETP+TP is not well supported yet." 
+ self.recompute = recompute + self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -734,6 +781,10 @@ def __init__( is_meta = torch.device(device).type == "meta" self.reset_parameters(defer_init=is_meta) + if etp_group is not None: + weight_names = [f"weight{idx}" for idx in range(self.num_gemms)] + wrap_module_params_etp(self, weight_names, etp_group, is_grouped=True) + if self.wgrad_store.delay_wgrad_compute(): for name, param in self.named_parameters(): for i in range(self.num_gemms): @@ -887,6 +938,11 @@ def forward( weight_tensors = self._get_weight_tensors() bias_tensors = [getattr(self, f"bias{i}") for i in range(self.num_gemms)] + if self.etp_size > 1: + weight_tensors[0].setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = self._get_quantizers() if not debug else self._get_debug_quantizers() if debug: @@ -932,6 +988,8 @@ def forward( None, # skip_fp8_weight_update self.save_original_input, debug, + self.etp_size, + self.recompute, ) out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index ce0581024a..2dda7d8812 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -26,6 +26,7 @@ _2X_ACC_DGRAD, _2X_ACC_WGRAD, ) +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( assert_dim_for_fp8_exec, @@ -42,6 +43,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -71,8 +73,10 @@ mark_not_offload, mark_activation_offload, ) +from ..tensor.nvfp4_tensor import NVFP4Tensor from ..tensor.storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage from 
..tensor.storage.mxfp8_tensor_storage import MXFP8TensorStorage + from ..export import is_in_onnx_export_mode, assert_warmed_up from ..cpp_extensions import ( @@ -140,6 +144,7 @@ def forward( skip_fp8_weight_update, symmetric_ar_type, debug, + etp_size, ) = non_tensor_args # NVTX label for profiling @@ -286,6 +291,15 @@ def forward( # ------------------------------------------------------ # Prepare weight tensor # ------------------------------------------------------ + + if etp_size > 1: + weight_etp_sharded = weight + weight = weight.all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + weightmat = weight is_weight_param_quantized = False if fp8 or debug: @@ -368,6 +382,7 @@ def forward( extra_output=reduce_scatter_out, ) nvtx_range_pop(f"{nvtx_label}.gemm") + # ------------------------------------------------------ # Finished forward GEMM... # ------------------------------------------------------ @@ -400,7 +415,7 @@ def forward( nvtx_range_pop(f"{nvtx_label}.row_parallel_comm") else: out = gemm_out - out = out.view(-1, *inp_shape[1:-1], out_features) + out = out.view(-1, *inp_shape[1:-1], out.shape[-1]) # ------------------------------------------------------ # Output tensor is ready to return... # ------------------------------------------------------ @@ -463,8 +478,9 @@ def forward( tensors_to_save, tensor_objects = prepare_for_saving( inputmat, - weightmat, - weight, + # For ETP, avoid keeping the gathered weightmat in memory for memory saving. 
+ weightmat if etp_size == 1 else None, + weight if etp_size == 1 else weight_etp_sharded, bias, ln_weight, ln_out, @@ -483,6 +499,8 @@ def forward( if hasattr(weight, "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_func = weight.get_main_grad + elif etp_size > 1: + ctx.main_grad_func = weight_etp_sharded.get_wgrad_tensor else: ctx.main_grad_func = lambda: weight.main_grad ctx.grad_input_quantizer = grad_input_quantizer @@ -523,6 +541,7 @@ def forward( FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module ctx.wgrad_store = wgrad_store ctx.debug = debug + ctx.etp_size = etp_size # ------------------------------------------------------ # Cached state for backward pass is ready... @@ -567,7 +586,7 @@ def backward( # Since main_grad can be modified inplace, it should not be a part of saved_tensors main_grad = ( ctx.main_grad_func() - if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad + if origin_weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad else None ) @@ -590,7 +609,7 @@ def backward( if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: origin_weight = ctx.weight_object - if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation: + if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: origin_weight.main_grad = main_grad # Configure Userbuffers communication (comm+GEMM overlap) @@ -640,13 +659,14 @@ def backward( # Prepare grad output tensor # Note: Cast to expected dtype and perform tensor-parallel communication + grad_output = grad_outputs[0] nvtx_range_push(f"{nvtx_label}.grad_output_preprocess") ( grad_output, grad_bias, ) = TransformerEngineBaseModule.grad_output_preprocess( ctx, - grad_outputs[0], + grad_output, ctx.parallel_mode == "row", ctx.grad_output_quantizer, ) @@ -702,6 +722,10 @@ def backward( # -------------------------------------------------- # Make sure required data is available + if ctx.etp_size > 1: + weight = 
origin_weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) if ctx.weight_quantizer is not None and isinstance(weight, QuantizedTensorStorage): @@ -843,7 +867,11 @@ def backward( use_split_accumulator = recipe.fp8_gemm_wgrad.use_split_accumulator # Figure out whether to output wgrad GEMM directly into main grad - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + # When ETP is enabled, GA is always disabled. ETP Wgrad workflow: + # allocte wgrad_out tmp buffer -> RS(wgrad_gemm) -> GradientAccumulation + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) @@ -910,6 +938,7 @@ def wgrad_gemm( "with Userbuffers (tensor-parallel communication overlapping)" ) ctx.wgrad_store.put([ln_out_total, grad_output], wgrad_gemm) + assert False, f"TODO(shiqingf): not supported for ETP..." else: # Call wgrad GEMM now @@ -941,9 +970,8 @@ def wgrad_gemm( else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - # -------------------------------------------------- - # Grad weight has been computed... - # -------------------------------------------------- + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) # Don't return grad bias if not needed if not ctx.use_bias: @@ -992,7 +1020,9 @@ def wgrad_gemm( clear_tensor_data(mu) clear_tensor_data(rsigma) - if ctx.requires_wgrad: + if ctx.etp_size > 1: + wgrad = None + elif ctx.requires_wgrad: # Handle custom DDP from mcore. 
if ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True @@ -1160,6 +1190,7 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1190,6 +1221,10 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -1199,6 +1234,7 @@ def __init__( self.out_features = divide(self.out_features, self.tp_size) elif self.parallel_mode == "row": self.in_features = divide(self.in_features, self.tp_size) + self.tp_out_features = self.out_features if init_method is None: init_method = get_default_init_method() @@ -1382,6 +1418,10 @@ def __init__( self.reset_parameters(defer_init=device == "meta") + if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.parallel_mode == "row" and self.apply_bias: @@ -1402,6 +1442,7 @@ def __init__( if name in self.weight_names or name in self.bias_names: param.skip_backward_post_hook = True + def set_meta_tensor(self, fwd: bool, recipe: Recipe) -> None: """Init scales and amaxes for fwd | bwd.""" super().set_meta_tensor(fwd, recipe) @@ -1445,6 +1486,13 @@ def reset_parameters(self, defer_init=False): dim=1 if self.parallel_mode == "row" else 0, stride=1, ) + if self.etp_size > 1: + set_extended_tensor_parallel_attributes( + tensor=getattr(self, weight), + is_parallel=True, + dim=0, # ETP always shard along the first dim. 
+ stride=1, + ) # Set parallelism attributes for linear biases if self.use_bias: @@ -1516,6 +1564,11 @@ def forward( # Get concatenated weight and bias tensors weight_tensor, bias_tensor = self._get_weight_and_bias_tensors() + if self.etp_size > 1: + weight_tensor.setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = ( self._get_quantizers(fp8_output, fp8_grad, is_grad_enabled) if not debug @@ -1580,6 +1633,7 @@ def forward( skip_fp8_weight_update, self.symmetric_ar_type, debug, + self.etp_size, ) out = fwd_fn( *autograd_ctx, diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 16e620fd94..f204383166 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -47,6 +47,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -1799,6 +1800,7 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, checkpoint: bool = False, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1843,6 +1845,11 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + if init_method is None: init_method = get_default_init_method() if output_layer_init_method is None: @@ -2007,6 +2014,9 @@ def reset_parameters(self, defer_init=False): # Set parallel attributes for linear parameters set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) + if self.etp_size > 1: + set_extended_tensor_parallel_attributes(self.fc1_weight, True, 0, 1) + set_extended_tensor_parallel_attributes(self.fc2_weight, True, 0, 1) if self.use_bias: 
set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) if self.set_parallel_mode: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 31dac4d329..20f4799167 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -25,6 +25,7 @@ _2X_ACC_WGRAD, ) from ._common import noop_cat, WeightGradStore +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( cast_if_needed, @@ -128,6 +129,7 @@ def forward( symmetric_ar_type, save_original_input, debug, + etp_size, ) = non_tensor_args # NVTX label for profiling @@ -249,6 +251,15 @@ def forward( # ------------------------------------------------------ # Prepare weight tensor # ------------------------------------------------------ + + if etp_size > 1: + weight_etp_sharded = weight + weight = weight.all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + weightmat = weight if fp8 or debug: # Configure quantizer @@ -434,8 +445,8 @@ def forward( # TODO(ksivamani): Check memory usage tensors_to_save, tensor_objects = prepare_for_saving( saved_inputmat, - weightmat, - weight, + weightmat if etp_size == 1 else None, + weight if etp_size == 1 else weight_etp_sharded, bias, ) ctx.save_for_backward(*tensors_to_save) @@ -456,6 +467,8 @@ def forward( if hasattr(weight, "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_func = weight.get_main_grad + elif etp_size > 1: + ctx.main_grad_func = weight_etp_sharded.get_wgrad_tensor else: ctx.main_grad_func = lambda: weight.main_grad @@ -486,6 +499,7 @@ def forward( if in_fp8_activation_recompute_phase(): FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module ctx.wgrad_store = wgrad_store + ctx.etp_size = etp_size # ------------------------------------------------------ # Cached state 
for backward pass is ready... @@ -522,7 +536,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: weight = ctx.weight_object - if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation: + if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: weight.main_grad = main_grad # Gather intermediate/activation tensors if needed @@ -684,6 +698,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Compute grad input tensor # -------------------------------------------------- + if ctx.etp_size > 1: + weight_fp8 = weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + dgrad = None dgrad_work = None if ctx.requires_dgrad: @@ -832,7 +850,9 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator = recipe.fp8_gemm_wgrad.use_split_accumulator # Figure out whether to output wgrad GEMM directly into main grad - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) @@ -943,6 +963,8 @@ def wgrad_gemm( dgrad_work.wait() dgrad_work = None + if ctx.etp_size > 1: + wgrad = weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) if ctx.requires_wgrad: # Handle custom DDP from mcore. 
if ( @@ -1098,6 +1120,7 @@ def __init__( symmetric_ar_type: Optional[str] = None, save_original_input: bool = False, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1126,6 +1149,11 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -1297,6 +1325,10 @@ def __init__( self.reset_parameters(defer_init=device == "meta") + if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.parallel_mode == "row" and self.apply_bias: @@ -1399,6 +1431,11 @@ def forward( try: weight_tensor, bias_tensor = self._get_weight_and_bias_tensors() + if self.etp_size > 1: + weight_tensor.setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = ( self._get_quantizers(fp8_output, fp8_grad, is_grad_enabled) if not debug @@ -1459,6 +1496,7 @@ def forward( self.symmetric_ar_type, self.save_original_input, debug, + self.etp_size, ) out = linear_fn( *autograd_ctx, diff --git a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py index e7509f3994..0b6830c1d0 100644 --- a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py +++ b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py @@ -297,6 +297,18 @@ def view(self, shape: torch.Size): with_gemm_swizzled_scales=self._with_gemm_swizzled_scales, ) + def copy_(self, tensor: NVFP4TensorStorage): + assert tensor._fp4_dtype == self._fp4_dtype + + self._rowwise_data.copy_(tensor._rowwise_data) + self._columnwise_data.copy_(tensor._columnwise_data) + 
self._rowwise_scale_inv.copy_(tensor._rowwise_scale_inv) + self._columnwise_scale_inv.copy_(tensor._columnwise_scale_inv) + self._amax_rowwise.copy_(tensor._amax_rowwise) + self._amax_columnwise.copy_(tensor._amax_columnwise) + self._quantizer = tensor._quantizer.copy() if tensor._quantizer is not None else None + + def __repr__(self): data_rowwise = self.dequantize() diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 47af9fabe1..c225cb3009 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -430,6 +430,46 @@ def cast_if_needed(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: with torch.enable_grad(): return tensor.to(dtype=dtype) +def mask_distributed_columns_graph_compatible(tensor, num_mask_cols, etp_size, valid_indices=None): + """ + CUDA graph compatible version using index_select. + + Args: + tensor: [M, N] input tensor + num_mask_cols: total columns to mask + etp_size: number of chunks + valid_indices: pre-computed valid column indices (optional, will be computed if None) + + Returns: + result: [M, N - num_mask_cols] tensor with masked columns removed + valid_indices: indices of valid columns (for reuse) + """ + # Sanity check + assert num_mask_cols > 0 and etp_size > 1 + _, N = tensor.shape + + chunk_size = N // etp_size + mask_per_chunk = num_mask_cols // etp_size + assert num_mask_cols % etp_size == 0 and mask_per_chunk >= 1 + + # Pre-compute valid indices if not provided + if valid_indices is None: + # Build list of valid column indices + indices_list = [] + for chunk_idx in range(etp_size): + chunk_start = chunk_idx * chunk_size + chunk_end = chunk_start + chunk_size + valid_end = chunk_end - mask_per_chunk + indices_list.extend(range(chunk_start, valid_end)) + + # Allocated during warmup of CG. 
+ valid_indices = torch.tensor(indices_list, dtype=torch.long, device=tensor.device) + + # Use index_select instead of boolean indexing (CUDA graph compatible) + result = torch.index_select(tensor, dim=1, index=valid_indices) + + return result, valid_indices + def check_dim_for_fp8_exec(tensor: torch.Tensor) -> bool: """Check if tensor dimensions are supported for FP8 TN GEMM""" From 4e0e39db354f443db098475770fd89141b36fe14 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 5 Mar 2026 15:42:53 -0800 Subject: [PATCH 02/43] cleanup Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 23 +---- transformer_engine/pytorch/module/base.py | 6 -- .../pytorch/module/grouped_linear.py | 86 +++++++++---------- .../pytorch/module/layernorm_linear.py | 32 ++----- .../pytorch/module/layernorm_mlp.py | 10 --- .../tensor/storage/nvfp4_tensor_storage.py | 12 --- transformer_engine/pytorch/utils.py | 40 --------- 7 files changed, 51 insertions(+), 158 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index dc5d8b3063..20c617ddb7 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -10,7 +10,7 @@ from functools import lru_cache from dataclasses import dataclass import math -from typing import Any, Callable, ContextManager, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import warnings import torch @@ -60,14 +60,6 @@ "partition_stride": 1, } - -_EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS = { - 'etp_model_parallel': False, - 'etp_partition_dim': -1, - 'etp_partition_stride': 1, -} - - _USE_REENTRANT_ACTIVATION_RECOMPUTE = True _FP8_ACTIVATION_RECOMPUTE_ENABLED = False @@ -167,19 +159,6 @@ def set_tensor_model_parallel_attributes( setattr(tensor, "partition_stride", stride) -def set_extended_tensor_parallel_attributes( - tensor: torch.Tensor, is_parallel: bool, dim: int, stride: 
int -) -> None: - """Set ps attributes to tensor.""" - # Make sure the attributes are not set. - for attribute in _EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS: - assert not hasattr(tensor, attribute) - # Set the attributes. - setattr(tensor, 'etp_model_parallel', is_parallel) - setattr(tensor, 'etp_partition_dim', dim) - setattr(tensor, 'etp_partition_stride', stride) - - @lru_cache def get_distributed_world_size(group: Optional[dist_group_type] = None) -> int: """Return world size for the distributed group.""" diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index b565a40f87..9c21141a39 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -631,7 +631,6 @@ def __init__(self, name: Optional[str] = None) -> None: self.activation_dtype: Optional[torch.dtype] = None self.wgrad_accumulation_and_reduce_hooks = [] self.wgrad_store = None - self.etp_size = 1 if not TEDebugState.debug_enabled: TEDebugState.initialize() @@ -957,8 +956,6 @@ def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> N self.fast_setattr("tp_group", tp_group) self.fast_setattr("tp_group_initialized", True) - - def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: """returns the FP8 weights.""" fp8_params = [] @@ -1245,9 +1242,6 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None: for name, param in self.named_parameters(recurse=False): # Check if parameter is a DTensor (FSDP2) or regular tensor is_dtensor = isinstance(param, DTensor) - from .extended_tensor_parallelism import ETPShardedParam - is_etp = isinstance(param, ETPShardedParam) - dtensor_param = param if is_dtensor else None # Need to update/quantize local tensor in case of DTensor param = param._local_tensor if is_dtensor else param diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 58c4e3b130..fd55f84b3c 
100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -34,7 +34,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, is_fp8_activation_recompute_enabled, @@ -46,9 +45,9 @@ from ..constants import GemmParallelModes, dist_group_type from ..jit import no_torch_dynamo from ..cpu_offload import is_cpu_offload_enabled, mark_not_offload, start_offload + from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer from ..quantized_tensor import ( - QuantizedTensor, QuantizedTensorStorage, Quantizer, prepare_for_saving, @@ -100,7 +99,6 @@ def forward( save_original_input, debug, etp_size, - recompute, ) = non_tensor_args num_gemms = len(m_splits) @@ -178,7 +176,7 @@ def forward( start_offload(*inputmats) # Initialize weights - weights_fp8: list + weights_fp8: list if fp8 or debug: # FP8 cast to workspace buffer weights_fp8 = [] @@ -193,6 +191,7 @@ def forward( workspace_dtype=activation_dtype, ) weights_fp8.append(weight_fp8) + else: weights_fp8 = [cast_if_needed(weight, activation_dtype) for weight in weights] @@ -331,7 +330,6 @@ def forward( ctx.save_original_input = save_original_input ctx.input_quantizers = input_quantizers ctx.etp_size = etp_size - ctx.recompute = recompute # [*, in_features] -> [*, out_features] except first dimension changes for SP return out.view(-1, *inp.shape[1:-1], out.shape[-1]) @@ -342,7 +340,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], with get_nvtx_range_context("_GroupedLinear_backward"): saved_tensors = restore_from_saved(ctx.tensor_objects, ctx.saved_tensors) N = ctx.num_gemms - if ctx.etp_size == 1: inputmats = saved_tensors[:N] weights = saved_tensors[N : 2 * N] @@ -352,9 +349,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], inputmats = saved_tensors[:N] 
origin_weights = saved_tensors[N : 2 * N] biases = saved_tensors[2 * N : 3 * N] - - if ctx.fuse_wgrad_accumulation: - main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] + main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: @@ -362,6 +357,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None + if ctx.fuse_wgrad_accumulation: + for i in range(N): + origin_weights[i].main_grad = main_grads[i] + # Preprocess grad output grad_output_view = grad_output.contiguous().view(-1, grad_output.shape[-1]) grad_output = [None] * ctx.num_gemms @@ -456,32 +455,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) - def handle_custom_ddp_from_mcore(weight, wgrad): - if ctx.weights_requires_grad: - # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr( - weight, "grad_added_to_main_grad" - ): - weight.grad_added_to_main_grad = True - if getattr(weight, "zero_out_wgrad", False): - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - zero=True, - ) - else: - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - ) - elif ctx.fuse_wgrad_accumulation: - wgrad = None - else: - wgrad = None - return wgrad - if ctx.weights_requires_grad: - """Wgrad computation.""" wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: recipe = ctx.fp8_recipe @@ -489,7 +463,6 @@ def handle_custom_ddp_from_mcore(weight, wgrad): wgrad_gemm_use_split_accumulator = ( recipe.fp8_gemm_wgrad.use_split_accumulator ) - if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: @@ -555,13 +528,39 @@ def handle_custom_ddp_from_mcore(weight, wgrad): # Deallocate input tensor clear_tensor_data(*inputmats) - if ctx.etp_size > 1: - wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) - elif ctx.fuse_wgrad_accumulation: - wgrad_list = [ - handle_custom_ddp_from_mcore(weight, wgrad) - for weight, wgrad in zip(origin_weights, wgrad_list) - ] + def handle_custom_ddp_from_mcore(weight, wgrad): + if ctx.weights_requires_grad: + # Handle custom DDP from mcore. 
+ if ctx.fuse_wgrad_accumulation and hasattr( + weight, "grad_added_to_main_grad" + ): + weight.grad_added_to_main_grad = True + if getattr(weight, "zero_out_wgrad", False): + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + zero=True, + ) + else: + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + ) + elif ctx.fuse_wgrad_accumulation: + wgrad = None + else: + wgrad = None + return wgrad + + if ctx.etp_size > 1: + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation: + wgrad_list = [ + handle_custom_ddp_from_mcore(weight, wgrad) + for weight, wgrad in zip(origin_weights, wgrad_list) + ] + else: + wgrad_list = [None] * ctx.num_gemms if not ctx.use_bias or ( ctx.wgrad_store is not None @@ -669,7 +668,6 @@ def __init__( single_grouped_parameter: bool = False, name: Optional[str] = None, etp_group: Optional[dist_group_type] = None, - recompute: bool = False, ) -> None: super().__init__(name) @@ -727,7 +725,6 @@ def __init__( else: self.etp_size = get_distributed_world_size(etp_group) assert tp_size == 1, f"TODO(shiqingf): ETP+TP is not well supported yet." 
- self.recompute = recompute self.parallel_mode = parallel_mode assert ( @@ -989,7 +986,6 @@ def forward( self.save_original_input, debug, self.etp_size, - self.recompute, ) out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 2dda7d8812..f6f24f6a5f 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -43,7 +43,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -73,10 +72,8 @@ mark_not_offload, mark_activation_offload, ) -from ..tensor.nvfp4_tensor import NVFP4Tensor from ..tensor.storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage from ..tensor.storage.mxfp8_tensor_storage import MXFP8TensorStorage - from ..export import is_in_onnx_export_mode, assert_warmed_up from ..cpp_extensions import ( @@ -382,7 +379,6 @@ def forward( extra_output=reduce_scatter_out, ) nvtx_range_pop(f"{nvtx_label}.gemm") - # ------------------------------------------------------ # Finished forward GEMM... 
# ------------------------------------------------------ @@ -586,7 +582,7 @@ def backward( # Since main_grad can be modified inplace, it should not be a part of saved_tensors main_grad = ( ctx.main_grad_func() - if origin_weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad + if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad else None ) @@ -659,14 +655,13 @@ def backward( # Prepare grad output tensor # Note: Cast to expected dtype and perform tensor-parallel communication - grad_output = grad_outputs[0] nvtx_range_push(f"{nvtx_label}.grad_output_preprocess") ( grad_output, grad_bias, ) = TransformerEngineBaseModule.grad_output_preprocess( ctx, - grad_output, + grad_outputs[0], ctx.parallel_mode == "row", ctx.grad_output_quantizer, ) @@ -938,7 +933,6 @@ def wgrad_gemm( "with Userbuffers (tensor-parallel communication overlapping)" ) ctx.wgrad_store.put([ln_out_total, grad_output], wgrad_gemm) - assert False, f"TODO(shiqingf): not supported for ETP..." else: # Call wgrad GEMM now @@ -970,8 +964,9 @@ def wgrad_gemm( else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + # -------------------------------------------------- + # Grad weight has been computed... + # -------------------------------------------------- # Don't return grad bias if not needed if not ctx.use_bias: @@ -1020,11 +1015,11 @@ def wgrad_gemm( clear_tensor_data(mu) clear_tensor_data(rsigma) - if ctx.etp_size > 1: - wgrad = None - elif ctx.requires_wgrad: + if ctx.requires_wgrad: # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True if getattr(origin_weight, "zero_out_wgrad", False): wgrad = get_dummy_wgrad( @@ -1234,7 +1229,6 @@ def __init__( self.out_features = divide(self.out_features, self.tp_size) elif self.parallel_mode == "row": self.in_features = divide(self.in_features, self.tp_size) - self.tp_out_features = self.out_features if init_method is None: init_method = get_default_init_method() @@ -1442,7 +1436,6 @@ def __init__( if name in self.weight_names or name in self.bias_names: param.skip_backward_post_hook = True - def set_meta_tensor(self, fwd: bool, recipe: Recipe) -> None: """Init scales and amaxes for fwd | bwd.""" super().set_meta_tensor(fwd, recipe) @@ -1486,13 +1479,6 @@ def reset_parameters(self, defer_init=False): dim=1 if self.parallel_mode == "row" else 0, stride=1, ) - if self.etp_size > 1: - set_extended_tensor_parallel_attributes( - tensor=getattr(self, weight), - is_parallel=True, - dim=0, # ETP always shard along the first dim. 
- stride=1, - ) # Set parallelism attributes for linear biases if self.use_bias: diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index f204383166..16e620fd94 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -47,7 +47,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -1800,7 +1799,6 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, checkpoint: bool = False, - etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1845,11 +1843,6 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() - if etp_group is None: - self.etp_size = 1 - else: - self.etp_size = get_distributed_world_size(etp_group) - if init_method is None: init_method = get_default_init_method() if output_layer_init_method is None: @@ -2014,9 +2007,6 @@ def reset_parameters(self, defer_init=False): # Set parallel attributes for linear parameters set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) - if self.etp_size > 1: - set_extended_tensor_parallel_attributes(self.fc1_weight, True, 0, 1) - set_extended_tensor_parallel_attributes(self.fc2_weight, True, 0, 1) if self.use_bias: set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) if self.set_parallel_mode: diff --git a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py index 0b6830c1d0..e7509f3994 100644 --- a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py +++ b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py @@ -297,18 +297,6 @@ def view(self, shape: torch.Size): 
with_gemm_swizzled_scales=self._with_gemm_swizzled_scales, ) - def copy_(self, tensor: NVFP4TensorStorage): - assert tensor._fp4_dtype == self._fp4_dtype - - self._rowwise_data.copy_(tensor._rowwise_data) - self._columnwise_data.copy_(tensor._columnwise_data) - self._rowwise_scale_inv.copy_(tensor._rowwise_scale_inv) - self._columnwise_scale_inv.copy_(tensor._columnwise_scale_inv) - self._amax_rowwise.copy_(tensor._amax_rowwise) - self._amax_columnwise.copy_(tensor._amax_columnwise) - self._quantizer = tensor._quantizer.copy() if tensor._quantizer is not None else None - - def __repr__(self): data_rowwise = self.dequantize() diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index c225cb3009..47af9fabe1 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -430,46 +430,6 @@ def cast_if_needed(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: with torch.enable_grad(): return tensor.to(dtype=dtype) -def mask_distributed_columns_graph_compatible(tensor, num_mask_cols, etp_size, valid_indices=None): - """ - CUDA graph compatible version using index_select. 
- - Args: - tensor: [M, N] input tensor - num_mask_cols: total columns to mask - etp_size: number of chunks - valid_indices: pre-computed valid column indices (optional, will be computed if None) - - Returns: - result: [M, N - num_mask_cols] tensor with masked columns removed - valid_indices: indices of valid columns (for reuse) - """ - # Sanity check - assert num_mask_cols > 0 and etp_size > 1 - _, N = tensor.shape - - chunk_size = N // etp_size - mask_per_chunk = num_mask_cols // etp_size - assert num_mask_cols % etp_size == 0 and mask_per_chunk >= 1 - - # Pre-compute valid indices if not provided - if valid_indices is None: - # Build list of valid column indices - indices_list = [] - for chunk_idx in range(etp_size): - chunk_start = chunk_idx * chunk_size - chunk_end = chunk_start + chunk_size - valid_end = chunk_end - mask_per_chunk - indices_list.extend(range(chunk_start, valid_end)) - - # Allocated during warmup of CG. - valid_indices = torch.tensor(indices_list, dtype=torch.long, device=tensor.device) - - # Use index_select instead of boolean indexing (CUDA graph compatible) - result = torch.index_select(tensor, dim=1, index=valid_indices) - - return result, valid_indices - def check_dim_for_fp8_exec(tensor: torch.Tensor) -> bool: """Check if tensor dimensions are supported for FP8 TN GEMM""" From de29fac4f7e466c9770144749fa0394f94d4fa58 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Tue, 10 Mar 2026 10:22:19 -0700 Subject: [PATCH 03/43] fix post_hook not being called in certain cases Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 76 +++++++++---------- .../pytorch/module/grouped_linear.py | 2 +- .../pytorch/module/layernorm_linear.py | 8 +- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index cb4e058418..47311d3080 100644 --- 
a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -410,6 +410,7 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): """Batched backward all-gather + prefetch. Wrapper around all_gather_and_prefetch_bwd.""" + assert self.is_routed_expert and self.weight_list is not None return self.all_gather_and_prefetch_bwd(nvtx_label=nvtx_label) def all_gather_and_prefetch( @@ -453,6 +454,7 @@ def all_gather_and_prefetch( def batched_all_gather_and_prefetch(self, **kwargs): """Batched all-gather + prefetch for expert weights. Wrapper around all_gather_and_prefetch.""" + assert self.is_routed_expert and self.weight_list is not None return self.all_gather_and_prefetch(**kwargs) def get_wgrad_tensor(self): @@ -467,44 +469,30 @@ def register_grad_accum_hook(self, grad_accum_node, hook): self._grad_accum_node = grad_accum_node self._grad_accum_hook = hook - @classmethod - def _resolve_pending_rs(cls, expected_next): - """Finish any pending reduce-scatter from a previous weight.""" - if cls._pending_rs_weight is not None: - assert cls._pending_rs_weight is expected_next - cls._pending_rs_weight.finish_wgrad_reduce_scatter() - cls._pending_rs_weight = None - @staticmethod - def _apply_fused_wgrad(param, wgrad_rs): - """Apply fuse_wgrad_accumulation logic to a single param and return a dummy grad.""" - - # the last rank in the etp group pads the param, so need to remove the padding here - if param.group.rank() == param.group.size() - 1: + def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): + """Post-RS per-param processing: strip padding, accumulate, call hook. + + Returns None for fused (grad already accumulated into main_grad), + or the stripped wgrad for unfused (to be returned to autograd). + """ + # 1. 
Strip padding + if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - param.main_grad.add_(wgrad_rs) - # Handle mcore grad accum fusion - if hasattr(param, "grad_added_to_main_grad"): - param.grad_added_to_main_grad = True - param.grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + # 2. Accumulate + if fuse_wgrad_accumulation: + param.main_grad.add_(wgrad_rs) + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + + # 3. Post hook if param._grad_accum_hook is not None: + param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs param._grad_accum_hook(param) - def finish_wgrad_reduce_scatter(self): - if self.wgrad_rs_handle is not None: - self.wgrad_rs_handle.wait() - self.wgrad_rs_handle = None - - for param, wgrad_rs in zip(self._weights, self.wgrad_rs): - if self.fuse_wgrad_accumulation: - self._apply_fused_wgrad(param, wgrad_rs) - else: - if param.is_padded_last_rank: - wgrad_rs = param._strip_padding(wgrad_rs) - param.grad = wgrad_rs - # Finally call the grad accum node - param._grad_accum_node(param.grad) + return dummy_grad if fuse_wgrad_accumulation else wgrad_rs def _reduce_scatter(self, wgrads, async_op): """Reduce-scatter one or more wgrads. Returns (outputs, handle). 
@@ -546,29 +534,33 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - self._resolve_pending_rs(self.next_w) + # Wait for last reduce scatter if it was async + if ETPShardedParam._pending_rs_weight is not None: + param = ETPShardedParam._pending_rs_weight + assert param is self.next_w + param.wgrad_rs_handle.wait() + param.wgrad_rs_handle = None + + for p, g in zip(param._weights, param.wgrad_rs): + self._finalize_wgrad(p, g, param.fuse_wgrad_accumulation) + ETPShardedParam._pending_rs_weight = None if self.prev_w is None: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False) - if fuse_wgrad_accumulation: - [self._apply_fused_wgrad(p, g) for p, g in zip(weights, sharded)] - result = [None] * len(weights) - else: - result = [ - p._strip_padding(g) if p.is_padded_last_rank else g - for p, g in zip(weights, sharded) - ] + result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) + for p, g in zip(weights, sharded)] return result if batched else result[0] else: # Async reduce-scatter (not last weight — deferred finish) self.fuse_wgrad_accumulation = fuse_wgrad_accumulation self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) - type(self)._pending_rs_weight = self + ETPShardedParam._pending_rs_weight = self return tuple([None] * len(wgrads)) if batched else None def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): """Batched version of wgrad_reduce_scatter.""" + assert self.is_routed_expert and self.weight_list is not None return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) def __torch_function__(self, func, types, args=(), kwargs=None): diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index fd55f84b3c..2f1fac23bf 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ 
b/transformer_engine/pytorch/module/grouped_linear.py @@ -357,7 +357,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None - if ctx.fuse_wgrad_accumulation: + if ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: for i in range(N): origin_weights[i].main_grad = main_grads[i] diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index f6f24f6a5f..c3c6cf73d1 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -575,6 +575,10 @@ def backward( rsigma, ) = restore_from_saved(ctx.tensor_objects, saved_tensors) + if ctx.etp_size > 1: + weight = origin_weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + # Delete the references to tensor objects once they've been consumed # by the `restore_from_saved` method to construct back the actual tensors. ctx.tensor_objects = None @@ -716,10 +720,6 @@ def backward( # Note: Gradient w.r.t. GEMM input (i.e. norm output). 
# -------------------------------------------------- - # Make sure required data is available - if ctx.etp_size > 1: - weight = origin_weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) From 9b846dc9172b9a216716c104172d94d05d914fc1 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 19 Mar 2026 15:13:42 -0700 Subject: [PATCH 04/43] Cudagraph Support Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 24 +- .../module/extended_tensor_parallelism.py | 507 ++++++++++++------ .../pytorch/module/layernorm_linear.py | 6 +- 3 files changed, 369 insertions(+), 168 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index 20c617ddb7..cea07eb6d5 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -1253,12 +1253,16 @@ def _post_process_nvfp4_gather( handle.wait() handle = None - # Fix the interleaved transposed data from gathering along first dim. - out._columnwise_scale_inv = _swap_first_dims(columnwise_scale_inv_interleaved, world_size) - out._columnwise_data = _swap_first_dims(columnwise_data_interleaved, world_size) + # TODO + # # Fix the interleaved transposed data from gathering along first dim. + # out._columnwise_scale_inv = _swap_first_dims(columnwise_scale_inv_interleaved, world_size) + # out._columnwise_data = _swap_first_dims(columnwise_data_interleaved, world_size) + out._columnwise_scale_inv.copy_(_swap_first_dims(columnwise_scale_inv_interleaved, world_size)) + out._columnwise_data.copy_(_swap_first_dims(columnwise_data_interleaved, world_size)) - # Optionally pad the scaling inverse if needed. - out._columnwise_scale_inv = pad_columnwise_scale_inv(out._columnwise_scale_inv) + # # Optionally pad the scaling inverse if needed. 
+ # out._columnwise_scale_inv = pad_columnwise_scale_inv(out._columnwise_scale_inv) + out._columnwise_scale_inv.copy_(pad_columnwise_scale_inv(out._columnwise_scale_inv)) @dataclass @@ -1409,7 +1413,9 @@ def _all_gather_nvfp4( ) # Transfer amax to output. - out._amax_rowwise = inp._amax_rowwise + #TODO: jiemingz + # out._amax_rowwise = inp._amax_rowwise + out._amax_rowwise.copy_(inp._amax_rowwise) # Gather the transposed NVFP4 data along first dimension. Fix format later. if quantizer.columnwise_usage: @@ -1458,7 +1464,8 @@ def _all_gather_nvfp4( ) # Transfer amax to output. - out._amax_columnwise = inp._amax_columnwise + out._amax_columnwise.copy_(inp._amax_columnwise) + handle = coalesced_handle if async_op else None @@ -1473,6 +1480,9 @@ def _all_gather_nvfp4( ) else: _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) + else: + if handle is not None: + handle.output = out return out, handle diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 47311d3080..d6a33e9826 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -3,10 +3,11 @@ # See LICENSE for license information. 
from collections import defaultdict -from typing import Dict, List +from typing import Dict, List, Optional from enum import Enum -from dataclasses import dataclass +from dataclasses import dataclass, field import torch +from contextlib import nullcontext from ..distributed import ( gather_along_first_dim, @@ -20,27 +21,49 @@ import transformer_engine_torch as tex +DEBUG_TENSOR = None + class ETPWeightState(Enum): NONE = "NONE" # Sharded, no pending operation ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress - ASYNC_DONE = "ASYNC_DONE" # Async all-gather complete, result in cache + DATA_READY = "DATA_READY" # Async all-gather complete, result in cache + DATA_READY_SYNC = "DATA_READY_SYNC" # Sync all-gather complete, result in cache + _STATE_TRANSITIONS = { - ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT}, - ETPWeightState.ASYNC_WAIT: {ETPWeightState.ASYNC_DONE}, - ETPWeightState.ASYNC_DONE: {ETPWeightState.NONE}, + ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT, ETPWeightState.DATA_READY_SYNC}, + ETPWeightState.ASYNC_WAIT: {ETPWeightState.DATA_READY}, + ETPWeightState.DATA_READY: {ETPWeightState.NONE}, + ETPWeightState.DATA_READY_SYNC: {ETPWeightState.NONE}, } -# Global AG Prefetching Buffer for ETP. -_ALL_GATHER_BUFFER = None +# Global ETP buffer cache (persists across clear(); never set to None after creation). +_ETP_CACHE = None + +# Global set of ETPShardedParam with in-flight async comms (AG or RS). 
+_inflight_comm_params: set = set() +AG_STREAM = None +RS_STREAM = None + +def get_ag_stream(): + global AG_STREAM + if AG_STREAM is None: + AG_STREAM = torch.cuda.Stream() + return AG_STREAM +def get_rs_stream(): + global RS_STREAM + if RS_STREAM is None: + RS_STREAM = torch.cuda.Stream() + return RS_STREAM @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 + check_param_states: bool = True weight_prefetch: bool = True ETP_CONFIG = ETPConfig() @@ -108,15 +131,22 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): class ETPShardHandle: - def __init__(self, handle, etp_shards: list): + def __init__(self, handle, etp_shards, reduce_scatter=False): self.handle = handle self.etp_shards = etp_shards + self.reduce_scatter = reduce_scatter + _inflight_comm_params.add(etp_shards[0]) def wait(self): if self.handle is not None: self.handle.wait() for w in self.etp_shards: - w._set_state(ETPWeightState.ASYNC_DONE) + if self.reduce_scatter: + w._set_rs_state(ETPWeightState.DATA_READY) + else: + w._set_state(ETPWeightState.DATA_READY) + + _inflight_comm_params.discard(self.etp_shards[0]) class ETPShardedParam(torch.nn.Parameter): @@ -133,17 +163,19 @@ def __new__(cls, tensor, *args, **kwargs): def __init__(self, x, *args, **kwargs): super().__init__() + # all gather self.state = ETPWeightState.NONE - self._cache_ticket = None + self._ag_ticket_fwd = None + self._ag_ticket_bwd = None self._prefetch_handle = None - self._grad_accum_node = None - self._grad_accum_hook = None + self._need_weight_prefetch = True + self.ag_event = torch.cuda.Event(external=True) # Quantization self._quantizer = None self.did_cast_to_low_precision = False self.quantized = None # Prefetching linked list - self.is_first_weight = False + self.prefetch_initialized = False self.next_w = None self.prev_w = None # Grouped gemm @@ -152,9 +184,14 @@ def __init__(self, x, *args, **kwargs): self.group = None 
self.weight_list = None # Reduce-scatter state (set during wgrad_reduce_scatter) + self.rs_state = ETPWeightState.NONE self.wgrad_rs = None - self.wgrad_rs_handle = None + self._wgrad_rs_handle = None self.fuse_wgrad_accumulation = False + self._grad_accum_node = None + self._grad_accum_hook = None + self.rs_event = torch.cuda.Event(external=True) + self._rs_ticket = None # Padding self.is_padded_last_rank = False self.pad_length = 0 @@ -203,18 +240,31 @@ def _unsharded_shape(self): out_shape[0] -= self.pad_length return tuple(out_shape) + @property + def _sharded_padded_shape(self): + out_shape = list(self.size()) + if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: + out_shape[0] += self.pad_length + return tuple(out_shape) + def get_padded_shard(self): if self.pad_length > 0 and self.is_padded_last_rank: return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) return self def _set_state(self, new_state: ETPWeightState): - """Validate and update state machine transition.""" - assert new_state in _STATE_TRANSITIONS[self.state], \ - f"Invalid state transition: {self.state} -> {new_state}" + # if ETP_CONFIG.check_param_states: + # assert new_state in _STATE_TRANSITIONS[self.state], \ + # f"Invalid state transition: {self.state} -> {new_state}" self.state = new_state - def _get_cache_key(self, dtype, fwd: bool) -> tuple: + def _set_rs_state(self, new_state: ETPWeightState): + # if ETP_CONFIG.check_param_states: + # assert new_state in _STATE_TRANSITIONS[self.rs_state], \ + # f"Invalid state transition: {self.rs_state} -> {new_state}" + self.rs_state = new_state + + def _get_cache_key(self, dtype, fwd: bool, reduce_scatter: bool) -> tuple: """Build cache key using output shape + dtype. Weights with matching gathered shape and dtype share a buffer. 
@@ -223,8 +273,8 @@ def _get_cache_key(self, dtype, fwd: bool) -> tuple: """ if not isinstance(dtype, torch.dtype): - return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx) - return (self._unsharded_shape_padded, dtype, self.expert_idx) + return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx, reduce_scatter) + return (self._unsharded_shape_padded, dtype, self.expert_idx, reduce_scatter) def _quantize_if_needed(self, skip_weight_cast=False, cast_noop_flag=None): """Re-quantize sharded weight into existing buffer. Returns quantized weight or self.""" @@ -278,6 +328,9 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if async_op: for w in weights: w._set_state(ETPWeightState.ASYNC_WAIT) + else: + for w in weights: + w._set_state(ETPWeightState.DATA_READY_SYNC) # 2. Prepare: quantize, set usage direction. for w in weights: @@ -296,11 +349,16 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if async_op: dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] out_buffers = [] + cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): - assert p._cache_ticket is None, \ - f"Cache ticket leak: weight {id(p)} still has unreturned ticket {p._cache_ticket}" - buf, p._cache_ticket = get_global_ETP_cache().checkout(p, dt, fwd) - out_buffers.append(buf) + if fwd: + if p._ag_ticket_fwd is None: + p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + out_buffers.append(cache.get(p._ag_ticket_fwd)) + else: + if p._ag_ticket_bwd is None: + p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) + out_buffers.append(cache.get(p._ag_ticket_bwd)) else: out_buffers = None @@ -340,49 +398,42 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle - def _get_unsharded(self, fwd, skip_weight_cast=False, cast_noop_flag=None): - """Get unsharded (all-gathered) weight tensor(s). 
- - Handles both routed experts (returns list) and single weights (returns tensor). - Supports sync gather, async prefetch wait, and cache retrieval. - """ - weights = self._weights + def _wait_param_gather(self): + # Since wait() may sychronize against a different stream than the current stream, + # an event is recorded and waited on when the data is retrieved, which ensures the + # AG always finishes before returning the unsharded param + with torch.cuda.stream(get_ag_stream()): + if self._prefetch_handle is not None: + self._prefetch_handle.wait() + self._prefetch_handle = None + self.ag_event.record() + + def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + result, _ = self._all_gather_weight( + async_op=False, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, + ) + result = result if self.is_routed_expert else [result] + result = [self._strip_padding(r) for r in result] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result,self._weights)] + return result if self.is_routed_expert else result[0] + def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=None): # Wait for async prefetch if in progress - if weights[0].state == ETPWeightState.ASYNC_WAIT: - self._prefetch_handle.wait() - self._prefetch_handle = None - - if weights[0].state == ETPWeightState.NONE: - # Synchronous all-gather (no cache — buffers allocated inline) - result, _ = self._all_gather_weight( - async_op=False, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, - ) - result = result if self.is_routed_expert else [result] + self._wait_param_gather() + self.ag_event.wait() - elif weights[0].state == ETPWeightState.ASYNC_DONE: - # Retrieve prefetched results from cache - cache = get_global_ETP_cache() - result = [] - for w in weights: - buf = cache.get(w._cache_ticket) - w._cache_ticket = None - # Post-gather quantization safety net: weight was prefetched 
- # before weight_quantizer was set - if not w.did_cast_to_low_precision: - if w._quantizer is not None and not isinstance(buf, QuantizedTensor): - w._quantize_if_needed() - buf = w._quantizer.quantize(buf) - w._set_state(ETPWeightState.NONE) - result.append(buf) - else: - assert False, f"Unexpected state: {weights[0].state}" + # Retrieve prefetched results from cache + result = [] + cache = get_global_ETP_cache() + for w in self._weights: + ticket = w._ag_ticket_fwd if fwd else w._ag_ticket_bwd + result.append(cache.get(ticket)) result = [self._strip_padding(r) for r in result] - result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, weights)] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, self._weights)] return result if self.is_routed_expert else result[0] def all_gather_and_prefetch_bwd(self, nvtx_label=None): @@ -398,14 +449,32 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): Returns: weight_total """ - result = self._get_unsharded(fwd=False, skip_weight_cast=True) - if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + if self.next_w is not None: + result = self._get_prefetched_weight(False, skip_weight_cast=True) + else: + result = self._all_gather_weight_on_demand(False, skip_weight_cast=True) + + if ( + ETP_CONFIG.weight_prefetch + and self.prev_w is not None + and self.prev_w._need_weight_prefetch + ): _, handle = self.prev_w._all_gather_weight( async_op=True, skip_weight_cast=True, cast_noop_flag=None, fwd=False, nvtx_label=nvtx_label, ) self.prev_w._prefetch_handle = handle + + # The unsharded tensor has been returned, no pending work so reset state to NONE + for w in self._weights: + w._set_state(ETPWeightState.NONE) + + if self.next_w is not None: + cache = get_global_ETP_cache() + for w in self._weights: + cache.release(w._ag_ticket_bwd) + return result def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): @@ -426,30 +495,44 @@ def all_gather_and_prefetch( Returns: 
weight_total """ - # Lazy population of linked list: link previous weight to current weight - cls = type(self) - if cls._first_weight_flag: - self.is_first_weight = True - cls._first_weight_flag = False + if self.prev_w is not None: + result = self._get_prefetched_weight(True, skip_weight_cast, cast_noop_flag) + else: + result = self._all_gather_weight_on_demand(True, skip_weight_cast, cast_noop_flag) - if self.is_first_weight: - cls._last_weight = None + # Prefetch next weight + if ( + ETP_CONFIG.weight_prefetch + and self.next_w is not None + and self.next_w._need_weight_prefetch + ): + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) + self.next_w._prefetch_handle = handle - if cls._last_weight is not None and cls._last_weight.next_w is None: - print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") - cls._last_weight.next_w = self - self.prev_w = cls._last_weight - cls._last_weight = self + # The unsharded tensor has been returned, no pending work so reset state to NONE + for w in self._weights: + w._set_state(ETPWeightState.NONE) - result = self._get_unsharded(fwd, skip_weight_cast=skip_weight_cast, cast_noop_flag=cast_noop_flag) + if self.prev_w is not None: + cache = get_global_ETP_cache() + for w in self._weights: + cache.release(w._ag_ticket_fwd) + + # Lazy population of linked list: link previous weight to current weight + cls = type(self) + if not self.prefetch_initialized: + if cls._last_weight is not None and cls._last_weight.next_w is None: + print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + self.prefetch_initialized = True + cls._last_weight = self - if ETP_CONFIG.weight_prefetch and self.next_w is not None: - target = self.next_w - _, handle = target._all_gather_weight( - 
async_op=True, skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, fwd=fwd, nvtx_label=nvtx_label, - ) - target._prefetch_handle = handle return result def batched_all_gather_and_prefetch(self, **kwargs): @@ -476,6 +559,9 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): Returns None for fused (grad already accumulated into main_grad), or the stripped wgrad for unfused (to be returned to autograd). """ + + param._set_rs_state(ETPWeightState.NONE) + # 1. Strip padding if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) @@ -492,7 +578,15 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs param._grad_accum_hook(param) - return dummy_grad if fuse_wgrad_accumulation else wgrad_rs + return None if fuse_wgrad_accumulation else wgrad_rs + + def _wait_reduce_scatter(self): + # assert self._wgrad_rs_handle is not None or is_graph_capturing() + with torch.cuda.stream(get_rs_stream()): + if self._wgrad_rs_handle is not None: + self._wgrad_rs_handle.wait() + self._wgrad_rs_handle = None + self.rs_event.record() def _reduce_scatter(self, wgrads, async_op): """Reduce-scatter one or more wgrads. Returns (outputs, handle). @@ -501,12 +595,29 @@ def _reduce_scatter(self, wgrads, async_op): Multiple tensors: coalesced reduce-scatter. 
""" + for w in self._weights: + if async_op: + w._set_rs_state(ETPWeightState.ASYNC_WAIT) + else: + w._set_rs_state(ETPWeightState.DATA_READY_SYNC) + if self.pad_length > 0: wgrads = [torch.nn.functional.pad(w, (0, 0, 0, self.pad_length)) for w in wgrads] + if async_op: + dtypes = [w.dtype for w in wgrads] + out_buffers = [] + cache = get_global_ETP_cache() + for p, dt in zip(self._weights, dtypes): + if p._rs_ticket is None: + p._rs_ticket = cache.reserve(p, dt, fwd=False, reduce_scatter=True) + out_buffers.append(cache.get(p._rs_ticket)) + else: + out_buffers = [None] * len(wgrads) + if len(wgrads) == 1: out, handle = reduce_scatter_along_first_dim( - wgrads[0], self.group, async_op=async_op + wgrads[0], self.group, async_op=async_op, output=out_buffers[0] ) return [out], handle else: @@ -516,9 +627,10 @@ def _reduce_scatter(self, wgrads, async_op): device=wgrads[0].device, async_ops=async_op, ) as cm: - for tensor in wgrads: - out, _ = reduce_scatter_along_first_dim(tensor, self.group) + for out_buffer, tensor in zip(out_buffers, wgrads): + out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) outputs.append(out) + return outputs, cm if async_op else None def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): @@ -534,29 +646,32 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - # Wait for last reduce scatter if it was async - if ETPShardedParam._pending_rs_weight is not None: - param = ETPShardedParam._pending_rs_weight - assert param is self.next_w - param.wgrad_rs_handle.wait() - param.wgrad_rs_handle = None - - for p, g in zip(param._weights, param.wgrad_rs): - self._finalize_wgrad(p, g, param.fuse_wgrad_accumulation) - ETPShardedParam._pending_rs_weight = None - - if self.prev_w is None: + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + # Async reduce-scatter (not last weight — deferred finish) + self.fuse_wgrad_accumulation = 
fuse_wgrad_accumulation + _, rs_handle = self._reduce_scatter(wgrads, async_op=True) + self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) + ret = tuple([None] * len(wgrads)) if batched else None + else: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False) result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) for p, g in zip(weights, sharded)] - return result if batched else result[0] - else: - # Async reduce-scatter (not last weight — deferred finish) - self.fuse_wgrad_accumulation = fuse_wgrad_accumulation - self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) - ETPShardedParam._pending_rs_weight = self - return tuple([None] * len(wgrads)) if batched else None + ret = result if batched else result[0] + + # Wait for last reduce scatter if it was async + # Currently only support reduce scattering in reverse order + if self.next_w is not None: + self.next_w._wait_reduce_scatter() + self.next_w.rs_event.wait() + + cache = get_global_ETP_cache() + fuse_wgrad_accumulation = self.next_w._weights[0].fuse_wgrad_accumulation + for w in self.next_w._weights: + self._finalize_wgrad(w, cache.get(w._rs_ticket), fuse_wgrad_accumulation) + cache.release(w._rs_ticket) + + return ret def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): """Batched version of wgrad_reduce_scatter.""" @@ -590,34 +705,48 @@ def print_rank_0(message, rank=None): else: print(message, flush=True) -class ETPWeightCache: - """ - Buffers are pooled by cache key (shape + dtype). Two operations: +@dataclass +class _TicketSlot: + """Internal slot backing a persistent ticket in the ETP buffer cache.""" + key: tuple # cache key (shape, dtype, ...) 
+ param: 'ETPShardedParam' # for lazy allocation metadata + dtype: object # torch.dtype or tex.DType + reduce_scatter: bool + fwd: bool + buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() - - ``checkout(param, dtype, fwd)`` → ``(buffer, ticket)`` - Takes a buffer from the pool (or allocates). Ticket is ``id(buf)``. - - ``get(ticket, param, dtype, fwd)`` → ``buffer`` - Retrieves the buffer, asserts key matches, returns it to the pool, - and invalidates the ticket. - Every checkout is paired with exactly one get (1:1). - Two weights sharing the same cache key get distinct buffers if one - is still checked out, preventing aliasing. +class ETPWeightCache: + """ + Ticket-based buffer pool for ETP all-gather / reduce-scatter buffers. + + - ``reserve(param, dtype, fwd)`` → ``ticket`` + Assigns a persistent ticket (no buffer allocated yet). + - ``get(ticket)`` → ``buffer`` + Returns the buffer, lazily allocating from pool or fresh if needed. + - ``release(ticket)`` + Returns the buffer to the pool. Ticket remains valid; next ``get()`` + will re-allocate from the pool. + - ``clear()`` + Drops all buffers and pools. Tickets remain valid; next ``get()`` + lazily allocates fresh buffers. """ # Bytes per element for known dtypes (used for logging). 
_BYTES_PER_ELEMENT = { - torch.bfloat16: 2, torch.float16: 2, torch.float32: 4, + torch.bfloat16: 2, + torch.float16: 2, + torch.float32: 4, tex.DType.kFloat4E2M1: 0.5, tex.DType.kFloat8E4M3: 1, } def __init__(self): self._pool: Dict[tuple, List[torch.Tensor]] = defaultdict(list) - self._tickets: Dict[int, tuple] = {} # ticket → (key, buf) - self._free_tickets: list[int] = [] # recycled ticket IDs - self._max_ticket: int = 0 # high-water mark for ticket allocation + self._slots: Dict[int, _TicketSlot] = {} + self._next_ticket: int = 0 self._total_bytes: int = 0 # running total of allocated bytes + self.key_to_allocate_func = {} @staticmethod def _buf_bytes(shape, dtype) -> int: @@ -628,65 +757,125 @@ def _buf_bytes(shape, dtype) -> int: bpe = ETPWeightCache._BYTES_PER_ELEMENT.get(dtype, None) return numel * bpe - def _allocate_buffer(self, param: 'ETPShardedParam', dtype) -> torch.Tensor: - out_shape = param._unsharded_shape_padded + def _allocate_buffer(self, param: 'ETPShardedParam', dtype, reduce_scatter, fwd) -> torch.Tensor: + if reduce_scatter: + out_shape = param._sharded_padded_shape + else: + out_shape = param._unsharded_shape_padded + if not isinstance(dtype, torch.dtype): quantizer = param._quantizer assert quantizer is not None - assert quantizer.rowwise_usage ^ quantizer.columnwise_usage + param._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) - device = torch.cuda.current_device() - buf = param._quantizer.make_empty(out_shape, dtype=torch.bfloat16, device=device) + buf = param._quantizer.make_empty( + out_shape, + dtype=torch.bfloat16, + device=torch.cuda.current_device(), + ) else: buf = torch.empty( out_shape, dtype=dtype, device=param.device, memory_format=torch.contiguous_format ) + buf_bytes = self._buf_bytes(out_shape, dtype) self._total_bytes += buf_bytes print_rank_0( f"[ETP Cache] +{buf_bytes / 1024**2:.1f} MB (shape={out_shape}, dtype={dtype}) " - f"total={self._total_bytes / 1024**2:.1f} MB" + f"total={self._total_bytes / 1024**2:.1f} 
MB id: {id(buf)} fwd: {fwd}" ) return buf - def checkout(self, param: 'ETPShardedParam', dtype, fwd: bool): - """Get a buffer for all-gather output. Returns (buffer, ticket). + def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=False) -> int: + """Assign a persistent ticket. No buffer is allocated until ``get()``.""" + key = param._get_cache_key(dtype, fwd, reduce_scatter) + ticket = self._next_ticket + self._next_ticket += 1 - Ticket IDs are recycled so they stay bounded. - If all buffers for this key are checked out, allocates a new one. - """ - key = param._get_cache_key(dtype, fwd) - pool = self._pool[key] - buf = pool.pop() if pool else self._allocate_buffer(param, dtype) - - if self._free_tickets: - ticket = self._free_tickets.pop() - else: - ticket = self._max_ticket - self._max_ticket += 1 - self._tickets[ticket] = (key, buf) - return buf, ticket + self._slots[ticket] = _TicketSlot( + key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd + ) + return ticket def get(self, ticket: int) -> torch.Tensor: - """Retrieve buffer by ticket and return it to the pool. - - This combines the old get + ticket_return into a single call. - After this call the ticket is invalidated and the buffer is - available for future checkouts. + """Return the buffer for *ticket*, lazily allocating if needed.""" + slot = self._slots[ticket] + if slot.buf is None: + pool = self._pool[slot.key] + slot.buf = pool.pop() if pool else self._allocate_buffer( + slot.param, slot.dtype, slot.reduce_scatter, fwd=slot.fwd + ) + self.key_to_allocate_func[slot.key] = (slot.param, slot.dtype, slot.reduce_scatter, slot.fwd) + + return slot.buf + + def release(self, ticket: int): + """Return the buffer to the pool. 
Ticket remains valid.""" + slot = self._slots[ticket] + assert slot.buf is not None + if slot.buf not in self._pool[slot.key]: + self._pool[slot.key].append(slot.buf) + + def clear(self): + """Drop all buffers; tickets remain valid and lazily re-allocate on next get().""" + for slot in self._slots.values(): + slot.buf = None + self._pool.clear() + self._total_bytes = 0 + + def reallocate_to_mempool(self, device, mempool): + """Re-allocate all ticket buffers into a CUDA graph memory pool. + + Call BEFORE graph capture so every buffer lives in the capture pool + and no allocations are recorded inside the graph. """ - assert ticket in self._tickets, f"Invalid ticket: {ticket}" - key, buf = self._tickets.pop(ticket) - self._free_tickets.append(ticket) - self._pool[key].append(buf) - return buf + # Clone the current memory pool buffers but into the passed in mempool + self._total_bytes = 0 + new_pool = defaultdict(list) + torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) + for key, buffers in self._pool.items(): + new_buffers = [] + for _ in range(len(buffers)): + buf = self._allocate_buffer(*self.key_to_allocate_func[key]) + new_buffers.append(buf) + new_pool[key] = new_buffers + torch._C._cuda_endAllocateToPool(device, mempool) + + # Map each buffer in the old pool to its corresponding new one + old_to_new_buff = {} + for key, old_pool in self._pool.items(): + new = new_pool[key] + for old_buf, new_buf in zip(old_pool, new): + old_to_new_buff[old_buf] = new_buf + # Replace each slot's reference to its corresponding new one + for slot in self._slots.values(): + if slot.buf is not None: + slot.buf = old_to_new_buff[slot.buf] + + self._pool = new_pool + return def get_global_ETP_cache() -> ETPWeightCache: """Get or lazily create the global cache instance.""" - global _ALL_GATHER_BUFFER - if _ALL_GATHER_BUFFER is None: - _ALL_GATHER_BUFFER = ETPWeightCache() - return _ALL_GATHER_BUFFER + global _ETP_CACHE + if _ETP_CACHE is None: + _ETP_CACHE = 
ETPWeightCache() + return _ETP_CACHE + + +def reallocate_etp_cache_to_mempool(device, mempool): + """Re-allocate all ETP cache buffers into a CUDA graph memory pool.""" + if _ETP_CACHE is not None: + _ETP_CACHE.reallocate_to_mempool(device, mempool) + + +def wait_async_comms(): + """Wait on all in-flight ETP async communications (all-gathers + reduce-scatters). + """ + for param in list(_inflight_comm_params): + param._wait_param_gather() + param._wait_reduce_scatter() @dataclass diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index c3c6cf73d1..13b7a66b75 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -938,6 +938,9 @@ def wgrad_gemm( # Call wgrad GEMM now wgrad, grad_bias_ = wgrad_gemm(ln_out_total, grad_output) + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + # Update grad bias if needed if grad_bias is None: grad_bias = grad_bias_ @@ -963,7 +966,6 @@ def wgrad_gemm( dgrad = reduce_scatter_out else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - # -------------------------------------------------- # Grad weight has been computed... # -------------------------------------------------- @@ -1018,7 +1020,7 @@ def wgrad_gemm( if ctx.requires_wgrad: # Handle custom DDP from mcore. if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + pass elif ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True if getattr(origin_weight, "zero_out_wgrad", False): From 9cb1379aca2719854bf308d61b945beead458827 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 01:53:25 -0700 Subject: [PATCH 05/43] debug: make etp link table log human readable. 
--- .../module/extended_tensor_parallelism.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index d6a33e9826..bc921705a9 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field +import re import torch from contextlib import nullcontext @@ -76,6 +77,17 @@ def update_config(**kwargs): setattr(ETP_CONFIG, key, value) +def tag_etp_params_with_names(model): + """Populate _debug_name on every ETPShardedParam with its full dotted parameter name. + + Call once after model construction so the linking log prints human-readable names + instead of raw tensor ids. + """ + for name, param in model.named_parameters(): + if isinstance(param, ETPShardedParam): + param._debug_name = name + + def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): """Shard and re-register all parameters of a module using ETP weight sharding.""" if etp_group.size() == 1: @@ -154,6 +166,33 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True _last_weight = None + _link_node_count = 0 + _link_table_buffer: List[str] = [] + _link_table_flushed: bool = False + + @classmethod + def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam") -> None: + """Buffer one row of the prefetch-link table (flushed atomically on the second forward pass).""" + _W = 70 + + def _layer_id(name: str) -> str: + m = re.search(r"\d+", name) + return m.group() if m else "-" + + cls._link_node_count += 1 + if cls._link_node_count == 1: + cls._link_table_buffer.append( + f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" + 
f"\n{'-'*7}-+-{'-'*8}-+-{'-'*_W}-+-{'-'*_W}" + ) + # Seed weight (first ETP param) as row 0 + cls._link_table_buffer.append( + f"{'0':>7} | {_layer_id(prev._debug_name):>8} | {prev._debug_name:<{_W}} | -" + ) + cls._link_table_buffer.append( + f"{cls._link_node_count:>7} | {_layer_id(curr._debug_name):>8} | " + f"{curr._debug_name:<{_W}} | {prev._debug_name}" + ) @staticmethod def __new__(cls, tensor, *args, **kwargs): @@ -195,6 +234,8 @@ def __init__(self, x, *args, **kwargs): # Padding self.is_padded_last_rank = False self.pad_length = 0 + # Debug + self._debug_name = "" def setup(self, weight_quantizer=None): """Set quantizer and create quantized shard.""" @@ -527,10 +568,14 @@ def all_gather_and_prefetch( cls = type(self) if not self.prefetch_initialized: if cls._last_weight is not None and cls._last_weight.next_w is None: - print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._buffer_link_table_row(cls._last_weight, self) cls._last_weight.next_w = self self.prev_w = cls._last_weight self.prefetch_initialized = True + elif not cls._link_table_flushed and cls._link_table_buffer: + # Second forward pass: flush the complete table atomically to avoid interleaving + cls._link_table_flushed = True + print_rank_0("\n".join(cls._link_table_buffer) + "\n") cls._last_weight = self return result From a5ef6753c7e6bdac16e936331f95c696bcf3f5f8 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 02:12:46 -0700 Subject: [PATCH 06/43] doc: add README_ETP.md --- docs/README_ETP.md | 685 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 docs/README_ETP.md diff --git a/docs/README_ETP.md b/docs/README_ETP.md new file mode 100644 index 0000000000..2dfb3dd227 --- /dev/null +++ b/docs/README_ETP.md @@ -0,0 +1,685 @@ +# Extended Tensor Parallelism (ETP) + +## Overview + +Extended Tensor Parallelism (ETP) is a **light-weight**, **high-performance** and **memory-efficient** 
distributed training strategy implemented in TransformerEngine. It shards weight tensors across an ETP process group and reconstructs them on-demand via async all-gather, enabling training of larger models without sacrificing throughput by overlapping communication with computation. + +ETP applies to any TE module that wraps a `Linear` layer: `Linear`, `LayerNormLinear`, `LayerNormMLP` (for dense models), and `GroupedLinear` (for MoE models). When used with `GroupedLinear`, ETP provides additional batched coalesced all-gather support for gathering multiple expert weights in a single NCCL operation. + +ETP supports all TE low-precision formats (FP8, MXFP8, NVFP4) with a **quantize-then-gather** strategy: each rank quantizes only its local shard before the all-gather, so wire bandwidth scales with the quantized size (0.5× for FP8, 0.25× for NVFP4) rather than the full BF16 weight. + +--- + +## Performance + +TODO(shiqingf): add performance for Ultra model in nvfp4. + +---- + +## Features + +### User-Visible Features + +| Feature | Description | +|---|---| +| **Weight sharding** | Weights sharded 1/N across ETP group along `out_features`, reducing per-GPU VRAM | +| **Async prefetch** | Next layer's weight all-gather overlaps with current layer's GEMM in both forward (prefetches `next_w`) and backward (prefetches `prev_w`); controlled by `ETPConfig.weight_prefetch` | +| **NVFP4 support** | Full 4-bit quantized all-gather with interleaved-format post-processing | +| **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | +| **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | +| **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | +| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. 
| +| **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | + +### Implementation Mechanisms + +| Mechanism | Description | +|---|---| +| **Alignment padding** | Shards padded to `ETPConfig.pad_for_alignment × etp_size` rows at construction via `get_padded_shard()`; only last rank carries padding (`is_padded_last_rank`); padding stripped in `_strip_padding()` both post-gather (before GEMM) and post-reduce-scatter (before wgrad accumulation) | +| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity | +| **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | +| **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | +| **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | +| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; padding stripped and grad accumulated in `_finalize_wgrad()` | + +--- + +## Design + +### Core Idea + +In standard Tensor Parallelism (TP), each GPU holds a shard of each weight and communicates activations. 
ETP goes one level deeper: **each weight is sharded along the `out_features` dimension (dim 0) across an ETP group of N GPUs**, so each GPU stores only 1/N of the weight. Before each GEMM, an all-gather reconstructs the full weight; after the backward GEMM, a reduce-scatter propagates the weight gradient back to the shards. + +``` +Standard column-parallel TP (TP=2, 2 GPUs, weight W of shape [K, M]): + GPU0 owns W[:K/2, :] (first half of out_features) + GPU1 owns W[K/2:, :] (second half of out_features) + +ETP (on top of column-parallel TP, ETP=2 per TP rank, 4 GPUs): + GPU0 (TP0, ETP0) owns W[:K/4, :] (first quarter of out_features) + GPU1 (TP0, ETP1) owns W[K/4:K/2, :] (second quarter of out_features) + GPU2 (TP1, ETP0) owns W[K/2:3K/4, :] + GPU3 (TP1, ETP1) owns W[3K/4:K, :] +``` + +ETP always shards along `out_features` regardless of the TP parallel mode (`column` or `row`). For `row` parallel mode, TP shards `in_features` while ETP shards `out_features`, making the two dimensions orthogonal. + +ETP is composable with TP and Sequence Parallelism for `Linear`, `LayerNormLinear`, and `LayerNormMLP`. The `etp_group` process group is orthogonal to the `tp_group`, giving a 2D parallelism grid. + + +### Weight Sharding + +#### Initialization + +Every rank independently allocates and initializes the **full** weight tensor, then slices out its local portion — there is no broadcast or communication during construction. + +``` +te.Linear.__init__(out_features=F, in_features=K, etp_group=group) +│ +├─ 1. Every rank: weight_tensor = torch.empty(F, K) ← full weight, same shape on all ranks +│ +├─ 2. reset_parameters() ← Kaiming-uniform init on every rank +│ identical seed ⇒ identical values on all ranks; slice is consistent without any comm +│ +└─ 3. 
wrap_module_params_etp(self, weight_names, etp_group) + │ + ├─ alignment = pad_for_alignment(16) × etp_size + │ pad_length = (alignment − F % alignment) % alignment + │ shard_size = (F + pad_length) // etp_size + │ + ├─ start = rank × shard_size + │ end = min((rank+1) × shard_size, F) ← clips real rows for last rank + │ shard = weight_tensor[start : end].clone() + │ + ├─ ETPShardedParam(shard) + │ .pad_length = pad_length + │ .is_padded_last_rank = (rank == etp_size−1 and pad_length > 0) + │ .group = etp_group + │ + ├─ module._parameters["weight"] = etp_shard ← replace nn.Parameter + │ + └─ del weight_tensor ← full buffer freed +``` + +Example: `F=63, K=32, etp_size=4, pad_for_alignment=16` + +``` +alignment=64, pad_length=1, shard_size=16 + +rank 0: rows [ 0:16] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 1: rows [16:32] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 2: rows [32:48] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 3: rows [48:63] → ETPShardedParam [15, 32] pad_length=1 is_padded=True +``` + +#### Padding and strip flow + +Padding is added **entering** each collective so all ranks contribute equal-sized chunks; it is stripped **exiting** each collective so downstream consumers see the real shape. + +``` +FORWARD + local shard [real_rows, K] (e.g. [15, 32] on last rank) + └─ get_padded_shard() → [shard_size, K] (e.g. [16, 32] zero row appended) + └─ all-gather → [padded_F, K] (e.g. [64, 32] across etp_size ranks) + └─ _strip_padding → [F, K] (e.g. 
[63, 32] ← weight seen by GEMM) + └─ GEMM → output [B, F] + +BACKWARD (wgrad path) + wgrad [B, F] (computed against stripped weight, so first dim is F not padded_F) + └─ _reduce_scatter pads: [F, K] → [padded_F, K] (re-pads before RS so chunks are equal) + └─ reduce-scatter → [shard_size, K] per rank + └─ _finalize_wgrad → _strip_padding → [real_rows, K] + └─ stored as param.grad (matches local shard shape) +``` + +#### Wrapping call + +```python +# Called in Linear/LayerNormLinear/LayerNormMLP/GroupedLinear __init__ +if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor # free the temporary full-weight buffer +``` + +For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. + +### State Machine + +Each `ETPShardedParam` tracks two independent state machines: one for the all-gather (`state`) and one for the reduce-scatter (`rs_state`). Each uses the same four-state enum: + +``` +NONE ──────────► ASYNC_WAIT ──────────► DATA_READY ──────────► NONE +(shard only) (AG/RS launched) (AG/RS complete, (consumed, + result in cache) back to shard) + +NONE ─────────────────────────────────► DATA_READY_SYNC ──────► NONE + (sync gather, (consumed) + result available) +``` + +The `DATA_READY_SYNC` state is used for on-demand synchronous gathers (cold start or when prefetch is disabled). `DATA_READY` is used after an async gather completes via `handle.wait()`. + +Invalid transitions are guarded by `_set_state()` / `_set_rs_state()`. + +### Class Diagram + +
+Click to expand + +```mermaid +classDiagram + + %% ── Enums ──────────────────────────────────────────────────────────────── + class ETPWeightState { + <> + NONE + ASYNC_WAIT + DATA_READY + DATA_READY_SYNC + } + + %% ── Config ─────────────────────────────────────────────────────────────── + class ETPConfig { + <> + +int pad_for_alignment + +bool check_param_states + +bool weight_prefetch + } + + %% ── Core parameter class ───────────────────────────────────────────────── + class ETPShardedParam { + <> + $ _pending_rs_weight : ETPShardedParam + $ _first_weight_flag : bool + $ _last_weight : ETPShardedParam + $ _link_node_count : int + $ _link_table_buffer : List[str] + $ _link_table_flushed : bool + +ETPWeightState state + +ETPWeightState rs_state + +int _ag_ticket_fwd + +int _ag_ticket_bwd + +int _rs_ticket + +Event ag_event + +Event rs_event + +ETPShardHandle _prefetch_handle + +ETPShardHandle _wgrad_rs_handle + +callable _grad_accum_node + +callable _grad_accum_hook + +Quantizer _quantizer + +bool did_cast_to_low_precision + +QuantizedTensor quantized + +int pad_length + +bool is_padded_last_rank + +bool prefetch_initialized + +ETPShardedParam next_w + +ETPShardedParam prev_w + +bool is_routed_expert + +int expert_idx + +ProcessGroup group + +List weight_list + +Tensor wgrad_rs + +bool fuse_wgrad_accumulation + +str _debug_name + +setup(weight_quantizer) + +_weights() List + +_set_state(new_state) + +_set_rs_state(new_state) + +_get_cache_key(dtype, fwd, reduce_scatter) tuple + +_unsharded_shape_padded() tuple + +_unsharded_shape() tuple + +_sharded_padded_shape() tuple + +get_padded_shard() Tensor + +_strip_padding(tensor) Tensor + +_quantize_if_needed(skip, flag) + +_all_gather_weight(async_op, ...) tuple + +_all_gather_weight_on_demand(fwd, ...) Tensor + +_get_prefetched_weight(fwd, ...) Tensor + +_wait_param_gather() + +_wait_reduce_scatter() + +all_gather_and_prefetch(fwd, ...) 
Tensor + +all_gather_and_prefetch_bwd() Tensor + +get_wgrad_tensor() Tensor + +register_grad_accum_hook(node, hook) + +_finalize_wgrad(param, wgrad_rs, fuse) [staticmethod] + +_reduce_scatter(wgrads, async_op) tuple + +wgrad_reduce_scatter(wgrad, fuse) + } + + %% ── Async all-gather handles ───────────────────────────────────────────── + class ETPShardHandle { + +Work handle + +List etp_shards + +bool reduce_scatter + +wait() + } + + class BatchedNVFP4AllGatherAsyncHandle { + <> + +List output_handles + +Work outer_async_handle + +bool _synchronized + +wait() + } + + class _NVFP4AllGatherAsyncHandle { + +NVFP4TensorStorage output + +Tensor columnwise_data_interleaved + +Tensor columnwise_scale_inv_interleaved + +int world_size + +Work async_handle + +bool _synchronized + +post_process_nvfp4_gather() + +wait() + } + + %% ── Buffer pool / ticket cache ─────────────────────────────────────────── + class _TicketSlot { + <> + +tuple key + +ETPShardedParam param + +dtype + +bool reduce_scatter + +bool fwd + +Tensor buf + } + + class ETPWeightCache { + -Dict _pool + -Dict _slots + -int _next_ticket + -int _total_bytes + +reserve(param, dtype, fwd, reduce_scatter) int + +get(ticket) Tensor + +release(ticket) + +clear() + +reallocate_to_mempool(device, mempool) + -_allocate_buffer(param, dtype, reduce_scatter, fwd) Tensor + -_buf_bytes(shape, dtype) int + } + + %% ── External bases (simplified) ────────────────────────────────────────── + class torch_nn_Parameter { + <> + } + class QuantizedTensor { + <> + } + class NVFP4TensorStorage { + <> + } + + %% ── Relationships ──────────────────────────────────────────────────────── + + %% inheritance + torch_nn_Parameter <|-- ETPShardedParam + + %% state machine ownership + ETPShardedParam --> ETPWeightState : state / rs_state + + %% doubly-linked prefetch list (self-referential) + ETPShardedParam --> ETPShardedParam : next_w / prev_w + + %% grouped expert list (self-referential) + ETPShardedParam --> ETPShardedParam : weight_list 
+ + %% in-flight prefetch / RS handles + ETPShardedParam --> ETPShardHandle : _prefetch_handle / _wgrad_rs_handle + + %% handle back-reference to shards for state transitions + ETPShardHandle --> ETPShardedParam : etp_shards + + %% handle polymorphism: plain Work or NVFP4-batched + ETPShardHandle --> BatchedNVFP4AllGatherAsyncHandle : handle (NVFP4 path) + + %% batched handle contains one entry per expert + BatchedNVFP4AllGatherAsyncHandle --> _NVFP4AllGatherAsyncHandle : output_handles + + %% config singleton controls all params + ETPShardedParam ..> ETPConfig : ETP_CONFIG + + %% buffer pool used via global singleton + ETPShardedParam ..> ETPWeightCache : reserve / get / release + + %% ticket slots + ETPWeightCache --> _TicketSlot : _slots + + %% quantized tensor stored per param + ETPShardedParam --> QuantizedTensor : quantized + + %% NVFP4 storage is a QuantizedTensor + NVFP4TensorStorage --|> QuantizedTensor + + %% NVFP4 handle output type + _NVFP4AllGatherAsyncHandle --> NVFP4TensorStorage : output +``` + +
+ +--- + +## Difference with FSDP + +FSDP (Fully Sharded Data Parallelism) and ETP both shard weight parameters, but they target different axes and serve different purposes: + +| Dimension | FSDP | ETP | +|---|---|---| +| **Sharding axis** | Data-parallel replicas | ETP process group (model-parallel dimension) | +| **Target layer** | All parameters uniformly | Any TE Linear, LayerNormLinear, LayerNormMLP, or GroupedLinear weight | +| **Communication** | All-gather before fwd, reduce-scatter after bwd | Same pattern, but orthogonal group | +| **State tracked** | PyTorch handles lifecycle | `ETPWeightState` state machine per param (separate for AG and RS) | +| **Quantization** | Framework-level, post-gather | **Quantize-then-gather** (lower bandwidth) | +| **Buffer management** | PyTorch flat-param storage | Ticket-based buffer pool per shape/dtype | +| **Prefetching** | PyTorch forward-hook prefetch | Lazy linked-list async prefetch across layers | +| **Gradient flow** | Reduce over data-parallel dim | Reduce-scatter over ETP dim | +| **Composability** | Wraps module hierarchy | Opt-in per-module via `etp_group` arg | + +**Key distinction**: FSDP shards across the *data-parallel dimension* (replicas processing different samples), while ETP shards across the *model-parallel dimension* (GPUs processing the same sample). They can coexist: a model can use FSDP for data parallelism and ETP for weight memory reduction simultaneously. + +A further practical difference is that ETP is **quantization-aware**: shards are quantized *before* the all-gather, so the wire bandwidth is proportional to the quantized size (e.g., FP4 = 1/4 of BF16), not the original weight size. FSDP gathers in full precision by default. + +--- + +## Scalability + +ETP scales along two independent dimensions: + +1. **ETP group size (N)**: Divides per-GPU weight memory by N. With N=8 and BF16 weights, a weight of 8 GB is reduced to 1 GB per GPU. With NVFP4, the same weight becomes 250 MB per GPU. + +2. 
**Number of experts (E)** (MoE only): Expert weights are gathered in parallel via a batched coalesced all-gather (`grouped_gather_along_first_dim`), so adding more experts within a MoE layer does not serialize the communication. + +**Combined scaling**: In a model with TP×ETP parallelism, the effective per-GPU weight size is `W / (TP × ETP)`. For example, TP=4 + ETP=8 gives 32× weight compression before training data parallelism is even considered. + +**Prefetch chain amortizes communication**: The linked-list prefetch means that for an L-layer model, L-1 all-gathers are completely hidden behind compute. Only the very first layer's all-gather (or the first backward layer) may stall, and only if the GPU compute is faster than the network. + + +TODO: add scalability perf of Ultra in nvfp4. + +--- + +## Schedule Details + +### Forward Pass + +``` +Layer i-1 fwd Layer i fwd Layer i+1 fwd +┌─────────────────────────┐ ┌─────────────────────────┐ ┌────────────── +│ all_gather_and_prefetch │ │ all_gather_and_prefetch │ │ ... +│ ├─ get W_i-1 (cached) │ │ ├─ get W_i (cached) │ │ +│ └─ async AG W_i ───── │─────▶ ready at use time │ │ +│ │ │ │ │ +│ GEMM(input, W_i-1) │ │ GEMM(input, W_i) │ │ +└─────────────────────────┘ └─────────────────────────┘ └────────────── + ↑ Overlap ↑ Overlap + AG(W_i) ∥ GEMM(W_i-1) AG(W_i+1) ∥ GEMM(W_i) +``` + +Step by step for layer `i`: + +1. **Lazy linked-list construction** (first pass only): Each `ETPShardedParam` has a `prefetch_initialized` flag. On the first call to `all_gather_and_prefetch`, this flag is `False`. The weight links itself to the previous weight (`cls._last_weight`) by setting `prev_w` / `next_w`, then sets `prefetch_initialized = True`. On subsequent passes the linking block is skipped. The complete link table is buffered during the first pass and flushed atomically as a single log print at the start of the second pass. +2. 
**Retrieve current weight**: + - `prev_w is not None` and `_ag_ticket_fwd is not None` → pull from buffer cache (ticket already reserved by async prefetch) + - otherwise → synchronous on-demand all-gather (only on very first use or when prefetch is disabled) +3. **Quantize if needed** (FP8/NVFP4/MXFP8): re-quantize the local shard into its pre-allocated quantized buffer before communication. +4. **Run GEMM** using the gathered full weight. +5. **Async prefetch next weight**: kick off `_all_gather_weight(async_op=True)` for `next_w` and store the handle in `next_w._prefetch_handle`. +6. **Release buffer**: after returning the gathered weight to the caller, the buffer for the current weight is returned to the pool via `cache.release(ticket)`. +7. **Save sharded weight** (not gathered) for the backward pass: `weight_etp_sharded` is stored in `ctx`; the gathered buffer is transient. + +#### Prefetch implementation sketch + +```python +# all_gather_and_prefetch (simplified) +if self.prev_w is not None and self._ag_ticket_fwd is not None: + result = self._get_prefetched_weight(fwd=True, ...) # cached +else: + result = self._all_gather_weight_on_demand(fwd=True, ...) # sync fallback + +if ETP_CONFIG.weight_prefetch and self.next_w is not None: + _, handle = self.next_w._all_gather_weight(async_op=True, ...) 
+ self.next_w._prefetch_handle = handle + +if self.prev_w is not None: + cache.release(self._ag_ticket_fwd) # return consumed buffer to pool + +# First-pass only: link into prefetch chain +if not self.prefetch_initialized: + if cls._last_weight is not None and cls._last_weight.next_w is None: + cls._buffer_link_table_row(cls._last_weight, self) + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + self.prefetch_initialized = True +elif not cls._link_table_flushed and cls._link_table_buffer: + cls._link_table_flushed = True + print_rank_0("\n".join(cls._link_table_buffer) + "\n") # atomic flush +cls._last_weight = self +``` + +The all-gather for layer `i+1` runs on the dedicated `AG_STREAM` while the GEMM for layer `i` runs on the compute stream, giving near-perfect overlap for GPU-compute-bound models. Similarly, the wgrad reduce-scatter runs on `RS_STREAM`. Both streams signal completion via CUDA events (`ag_event`, `rs_event`) that are waited on the compute stream before the result is consumed, ensuring correct ordering without blocking either communication stream. + +### Backward Pass + +The backward schedule mirrors forward, but traverses the layer chain in reverse: + +``` +Layer i+1 bwd Layer i bwd Layer i-1 bwd +┌─────────────────────────┐ ┌─────────────────────────┐ ┌────────────── +│ all_gather_and_prefetch │ │ all_gather_and_prefetch │ │ ... +│ ├─ get W_i+1 (cached) │ │ ├─ get W_i (cached) │ │ +│ └─ async AG W_i ────── │─────▶ ready at use time │ │ +│ │ │ │ │ +│ dgrad GEMM(grad, W_i+1) │ │ dgrad GEMM(grad, W_i) │ │ +│ wgrad GEMM(act, grad) │ │ wgrad GEMM(act, grad) │ │ +│ async RS(wgrad_i+1) ─── │─────▶ finish RS before use │ │ +└─────────────────────────┘ └─────────────────────────┘ └────────────── +``` + +Step by step for layer `i` backward: + +1. **`all_gather_and_prefetch_bwd()`**: Gather `W_i` for the dgrad GEMM; simultaneously async-prefetch `W_i-1` (the `prev_w`) for the next backward step. 
Uses `skip_weight_cast=True` — no re-quantization needed since scales are already valid from the forward pass. +2. **dgrad GEMM**: Compute `dX = dY × W_i` using the gathered weight. +3. **wgrad GEMM**: Compute `dW = X^T × dY` using the saved input activation. +4. **`wgrad_reduce_scatter(wgrad, fuse_wgrad_accumulation)`**: + - **Non-last layer** (`prev_w is not None`): Launch async reduce-scatter; store `ETPShardHandle` in `self._wgrad_rs_handle`. Return `None` to backward (gradient deferred). + - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. +5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + +### Coalesced Expert Communication + +For MoE layers with multiple routed experts, all experts' all-gathers are coalesced into a single NCCL operation via `torch.distributed._coalescing_manager`. This reduces NCCL kernel launch overhead and improves bus utilization compared to E sequential all-gathers. The wgrad reduce-scatter for all experts is similarly coalesced. + +--- + +## Low-Precision Details + +### FP8 (per-tensor scaling) + +- Each `ETPShardedParam` is assigned a quantizer via `setup(weight_quantizer)`. +- The quantizer is configured with `amax_reduction_group=etp_group` (the group is already stored in the param from construction), so the amax is all-reduced across the ETP group before scaling—ensuring all GPUs in the group use the same scale factor for the full weight. +- On the first microbatch (`is_first_microbatch=True`), `_quantize_if_needed()` re-quantizes the shard. 
On subsequent microbatches, `skip_weight_cast=True` reuses the existing quantized buffer, saving re-quantization cost. +- A `cast_noop_flag` tensor (from the FP8 recipe) can signal that no scale update is needed, enabling a no-op cast path. + +### NVFP4 (4-bit, block-scaled) + +NVFP4 requires special communication handling because: +- Each 4-bit value shares a scale with its 16-element block. +- The layout has both rowwise and columnwise views, each with separate data and `scale_inv` tensors. +- After all-gather, the interleaved format must be re-assembled into a GEMM-ready layout. + +The `_all_gather_nvfp4()` function in `distributed.py` handles this: +1. **Pre-communication**: Strips padding from `scale_inv` tensors (padding ensures alignment to communication boundaries). +2. **All-gather**: Gathers both `data` and `scale_inv` for the rowwise view; similarly for the columnwise view (with transposed tensor handling). +3. **Post-processing** (`_post_process_nvfp4_gather` / `post_process_nvfp4_gather`): + - Fixes interleaved data layout back to packed format. + - Re-pads `scale_inv` to the GEMM-required alignment. + - Transitions the tensor to `GEMM_READY` state. + +For async all-gathers, post-processing is deferred into `_NVFP4AllGatherAsyncHandle.wait()`, keeping it off the critical path. + +For routed experts, `BatchedNVFP4AllGatherAsyncHandle` wraps one handle per expert; the single outer coalescing-manager handle is waited first, then each expert's NVFP4 post-processing is applied sequentially. + +`_strip_padding` handles NVFP4 scale_inv correctly: +- `rowwise_scale_inv`: strip to `round_up(M, 128)` rows (dim 0) +- `columnwise_scale_inv`: strip to `round_up(ceil(M / 16), 4)` columns (dim 1, transposed) + +### MXFP8 (microscaling FP8) + +MXFP8 follows the same quantize-then-gather pattern as FP8. The amax reduction for microscaling is handled within the quantizer; ETP configures the reduction group to be the ETP group. 
+ +`_strip_padding` handles MXFP8 scale_inv correctly: +- `rowwise_scale_inv`: strip to `round_up(M, 128)` rows (dim 0) +- `columnwise_scale_inv`: strip to `round_up(M // 32, 4)` rows (dim 0; columnwise is not transposed for MXFP8) + +### Bandwidth Savings from Quantization + +| Dtype | Size vs BF16 | Example: 8B param weight | +|---|---|---| +| BF16 | 1× | 16 GB per ETP group | +| FP8 | 0.5× | 8 GB | +| NVFP4 | 0.25× | 4 GB | + +With ETP size N=8 and NVFP4, each GPU holds and gathers 0.5 GB instead of the full 16 GB. + +--- + +## Memory Savings + +### Per-GPU Weight Memory + +With ETP group size N, each GPU stores only `1/N` of each weight at rest. The gathered weight is transient (lives only during the GEMM) and reused from the pool. + +### Ticket-Based Buffer Pool + +`ETPWeightCache` pools gathered weight buffers by `(shape, dtype, fwd, expert_idx, reduce_scatter)` key so that same-shaped weights across layers reuse a single GPU allocation instead of allocating per-layer. + +#### Data structures + +``` +_pool : { cache_key → [buf, buf, ...] } available (released) buffers +_slots : { ticket_id → _TicketSlot } persistent per-param ticket slots + (key, param, dtype, fwd, reduce_scatter, buf) +_next_ticket : int monotonically increasing ticket ID counter +``` + +Each `ETPShardedParam` holds up to three tickets: +- `_ag_ticket_fwd` — forward all-gather buffer +- `_ag_ticket_bwd` — backward all-gather buffer +- `_rs_ticket` — reduce-scatter buffer + +A buffer lives in **exactly one** place at a time: + +``` +reserve() → slot created, buf=None (no allocation yet) +get(ticket) → buf allocated lazily from pool or fresh; stored in slot +release(ticket) → buf returned to pool; slot.buf set to None +clear() → all slot.buf = None, pool cleared (tickets stay valid; next get() re-allocates) +``` + +#### CUDA Graph support + +Before graph capture, call `reallocate_etp_cache_to_mempool(device, mempool)` to migrate all pool buffers into the CUDA graph memory pool. 
This ensures no allocations occur inside the captured graph. + +### No Activation Duplication + +The sharded weight (`weight_etp_sharded`) is saved for the backward pass instead of the gathered weight. This avoids keeping a full-size weight copy in the gradient tape, which would negate the memory savings. + +### Quantized Shard Storage + +When using FP8/NVFP4/MXFP8, only the quantized shard (not BF16) is stored persistently in `ETPShardedParam.quantized`. The full-precision master weight can reside in the optimizer state on CPU or be managed separately, keeping GPU footprint at quantized shard size. + +--- + +## API Usage + +
+Click to expand + +```python +import torch.distributed as dist +from transformer_engine.pytorch import Linear, LayerNormLinear, LayerNormMLP +from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + tag_etp_params_with_names, + update_config, +) + +# Set up process groups +tp_group = ... # Tensor-parallel group +etp_group = ... # ETP group (orthogonal to TP) + +# Drop-in replacement for standard TE Linear (dense model) +# Weights are sharded at construction time by wrap_module_params_etp +layer = Linear( + in_features=4096, + out_features=4096, + parallel_mode="column", # or "row" + tp_group=tp_group, + etp_group=etp_group, # Enable ETP +) + +# Also works with LayerNormLinear and LayerNormMLP (dense or MoE feed-forward) +ffn = LayerNormMLP( + hidden_size=4096, + ffn_hidden_size=16384, + tp_group=tp_group, + etp_group=etp_group, # Enable ETP +) + +# Weight is automatically an ETPShardedParam holding only the local shard +assert isinstance(layer.weight, ETPShardedParam) + +# Call setup() once after constructing quantizers (FP8/NVFP4). +# Note: etp_group is already stored in the param; setup() only takes quantizers. +layer.weight.setup(weight_quantizer=quantizers) + +# Optionally tag all ETP params with human-readable names for the link table log. +# Call once after full model construction. +tag_etp_params_with_names(model) + +# Forward/backward are transparent — ETP handles all-gather/reduce-scatter internally +output = layer(input) +``` + +
+ +For MoE layers with routed experts, `GroupedLinear` uses the same `etp_group` argument and handles batched expert weight gathers automatically. + +--- + +## Implementation Files + +| File | Role | +|---|---| +| `transformer_engine/pytorch/module/extended_tensor_parallelism.py` | Core ETP: `ETPShardedParam`, `ETPWeightCache`, `_TicketSlot`, `ETPWeightState`, `ETPConfig`, `wrap_module_params_etp`, `tag_etp_params_with_names`, `update_config`, `reallocate_etp_cache_to_mempool`, `wait_async_comms` | +| `transformer_engine/pytorch/module/linear.py` | ETP integration in `Linear` forward/backward | +| `transformer_engine/pytorch/module/layernorm_linear.py` | ETP integration in `LayerNormLinear` forward/backward | +| `transformer_engine/pytorch/module/layernorm_mlp.py` | ETP integration in `LayerNormMLP` forward/backward | +| `transformer_engine/pytorch/module/grouped_linear.py` | ETP integration for MoE routed-expert grouped GEMMs | +| `transformer_engine/pytorch/distributed.py` | `gather_along_first_dim`, `_all_gather_nvfp4`, `_NVFP4AllGatherAsyncHandle` | +| `tests/pytorch/distributed/test_etp.py` | ETP unit tests: state machine, buffer cache, weight sharding, module param replacement, `Linear`/`LayerNormLinear`/`GroupedLinear` fwd/bwd correctness, prefetch chain, wgrad reduce-scatter, microbatches, NVFP4 fwd/bwd (aligned + unaligned), MXFP8 fwd/bwd (aligned + unaligned) | +| `tests/pytorch/distributed/test_tp_etp.py` | TP+ETP integration tests: process group layout, `Linear` (column/row parallel) weight shape and fwd/bwd correctness, `LayerNormLinear` and `LayerNormMLP` fwd/bwd smoke tests; runs on 4 GPUs with TP=2, ETP=2 | + +---- + +## Best Practice + +TODO + +---- + +## Caveats + +- First forward pass always stalls (cold start) + + On the very first forward pass, `state == NONE` for all weights (no prefetch has run yet), so every weight does a synchronous all-gather. Only from the second pass onward does the async prefetch chain kick in. 
For frameworks that benchmark the first iteration (e.g., profilers, compilation warmup), this cold-start stall looks like a regression. + +- Link table logged on second forward pass + + The prefetch-link table (printed via `tag_etp_params_with_names` + the built-in logging) is buffered during the first forward pass and flushed atomically at the start of the second forward pass. This ensures it is not interleaved with other logs, but means it will not appear until the second iteration. + +---- + +## Future Work + +TODO + +---- From 92fc0f0910b39a614b269c3e87f812bfab88916e Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 08:24:44 -0700 Subject: [PATCH 07/43] debug: add default and meaningful nvtx_label for ETP (batched) AG/RS kernels. --- .../module/extended_tensor_parallelism.py | 29 +++++++++++++++---- .../pytorch/module/grouped_linear.py | 4 +-- .../pytorch/module/layernorm_linear.py | 3 +- transformer_engine/pytorch/module/linear.py | 3 +- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index bc921705a9..0cc1979524 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -363,6 +363,13 @@ def _strip_padding(self, tensor): def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nvtx_label=None): """Quantize (if needed) and all-gather weight. Returns (weight_total, handle).""" + if nvtx_label is None: + nvtx_label = ( + self._debug_name + + (".fwd" if fwd else ".bwd") + + (".async" if async_op else ".sync") + ) + weights = self._weights # 1. Transition state for async gathers. 
@@ -633,12 +640,18 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle = None self.rs_event.record() - def _reduce_scatter(self, wgrads, async_op): + def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): """Reduce-scatter one or more wgrads. Returns (outputs, handle). Single tensor: plain reduce-scatter (no coalescing). Multiple tensors: coalesced reduce-scatter. """ + if nvtx_label is None: + nvtx_label = ( + self._debug_name + + ".bwd" + + (".async" if async_op else ".sync") + ) for w in self._weights: if async_op: @@ -661,12 +674,15 @@ def _reduce_scatter(self, wgrads, async_op): out_buffers = [None] * len(wgrads) if len(wgrads) == 1: + nvtx_range_push(f"{nvtx_label}.etp_rs") out, handle = reduce_scatter_along_first_dim( wgrads[0], self.group, async_op=async_op, output=out_buffers[0] ) + nvtx_range_pop(f"{nvtx_label}.etp_rs") return [out], handle else: outputs = [] + nvtx_range_push(f"{nvtx_label}.batched_etp_rs") with torch.distributed._coalescing_manager( group=self.group, device=wgrads[0].device, @@ -675,10 +691,11 @@ def _reduce_scatter(self, wgrads, async_op): for out_buffer, tensor in zip(out_buffers, wgrads): out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) outputs.append(out) + nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") return outputs, cm if async_op else None - def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): + def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. Accepts a single tensor (non-routed) or list of tensors (routed experts). 
@@ -694,12 +711,12 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) self.fuse_wgrad_accumulation = fuse_wgrad_accumulation - _, rs_handle = self._reduce_scatter(wgrads, async_op=True) + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) - sharded, _ = self._reduce_scatter(wgrads, async_op=False) + sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) for p, g in zip(weights, sharded)] ret = result if batched else result[0] @@ -718,10 +735,10 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): return ret - def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): + def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation, nvtx_label=None): """Batched version of wgrad_reduce_scatter.""" assert self.is_routed_expert and self.weight_list is not None - return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) + return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation, nvtx_label=nvtx_label) def __torch_function__(self, func, types, args=(), kwargs=None): if kwargs is None: diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 2f1fac23bf..fe81196f4a 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,9 +420,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: - weights = 
origin_weights[0].batched_all_gather_and_prefetch_bwd( - nvtx_label="te._GroupedLinear.bwd", - ) + weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: dgrad_gemm_use_split_accumulator = _2X_ACC_DGRAD diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 13b7a66b75..824030c3d0 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -576,8 +576,7 @@ def backward( ) = restore_from_saved(ctx.tensor_objects, saved_tensors) if ctx.etp_size > 1: - weight = origin_weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) + weight = origin_weight.all_gather_and_prefetch_bwd() # Delete the references to tensor objects once they've been consumed # by the `restore_from_saved` method to construct back the actual tensors. diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 20f4799167..4c4789461c 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -699,8 +699,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # -------------------------------------------------- if ctx.etp_size > 1: - weight_fp8 = weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) + weight_fp8 = weight.all_gather_and_prefetch_bwd() dgrad = None dgrad_work = None From b63927171b8cef3622884db4b3cd93511ba1ec66 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 31 Mar 2026 00:51:44 -0700 Subject: [PATCH 08/43] doc: udpate README --- docs/README_ETP.md | 6 ++++++ docs/etp/etp_ep_nt6_schedule_bf16.png | Bin 0 -> 188377 bytes 2 files changed, 6 insertions(+) create mode 100644 docs/etp/etp_ep_nt6_schedule_bf16.png diff --git a/docs/README_ETP.md b/docs/README_ETP.md index 2dfb3dd227..d32321d6df 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -484,6 +484,12 @@ Step 
by step for layer `i` backward: - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. 5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + +Here is an example of ETP schedule diagram for Hybried Nemotron6 in bf16 as an example (ETP+EP with partial CGs): + +![alt text](etp/etp_ep_nt6_schedule_bf16.png) + + ### Coalesced Expert Communication For MoE layers with multiple routed experts, all experts' all-gathers are coalesced into a single NCCL operation via `torch.distributed._coalescing_manager`. This reduces NCCL kernel launch overhead and improves bus utilization compared to E sequential all-gathers. The wgrad reduce-scatter for all experts is similarly coalesced. diff --git a/docs/etp/etp_ep_nt6_schedule_bf16.png b/docs/etp/etp_ep_nt6_schedule_bf16.png new file mode 100644 index 0000000000000000000000000000000000000000..828f7b14a61d5903dd21dba22b72a978324b5ae5 GIT binary patch literal 188377 zcmZ_01yqz>_dX1WGD8S6bVv+C38Hj&&Cn$wDAFmZgoL1UcY_krAq~Z%F^xKy|p7#IX-l&mHO#tmW&jBAV# zEbt0Fsp}f}g6X2EAdOKrM6(Y5A#1LOwop;QxDCFCU|7G|pWs*DU}9VY zUoo#0!@!WlK+8&LyJP;yxSme3Lfx~oW*?nspDQA( zt|==i{eb8?ET;05+Lt#Ni!6)vjW3Foo@q8C7E5z<*6E_Pr?-B4ibu|nk@vrGT?rRlMp{J|a5GZX7oTWK`8?_q_sfkjk%Bp!SF*@j_* z%Z-61`?-b}8#7z+A){SUN%tNzKU*3a|0tklshlRLqwe@vdE?t`bQnt+cM^VI6?S)c zxIT@b58J~;-ioCgV>x+Dt#_59={1hS`>@tgTuIA)apY{8$TsG;>wh=tG(+;9o4${L z^bPRRX_#b=d_+SN9W`XUn-^H2bV@uyQ=X%8E8k5$3w10p3>YA)E*V+fAr@|iH za_xQyYOcb0PbbaNf;N9FYt4bMP5)-f&p}(&!~4^=+B7%|I$gqa{pVY%Y1bP=fkySt&-UA? 
z2RGwU2^@}Ik*=3D<~{3gZoCy18U9kcvoj0TZ!ob8uvE%LQ43~rBn>^(7qT@S8Yr|8 z*YB-W=nh!0&Fk5ouG{h4Y5u;*Gi~1=&ldOONz%gKZAoM%Qja3Kqw=6Tl8fPwpjmrT4@>RS9%#K#zX!@jwA+Ll6K=$ zR%hc*i$VFL<_}@yJF|^$zb?)aZr`_=Yw;e=6zhDwTVp$s>9Lx!@-ZoBjw4-7@SL_YGw&g=4MqS1Bhz30g`37clq&yC^` zIMeO>MI3O}=(&}JZP$x`t?EC8% z;!caQRo;^cgJ}XhPve+XIw+ZbJeorMYd2ZIZZ4t5v;OX;FHzfOkK--pR6=Og9xZ}s z(7N#~kz-1bY>)rRRJHYJE?4b~i@hE`o3Y+lCMB?jj#{`zLjB2Hj50{2M0{BP4;}Wu z2OTl*3G%4dup#5$TsZj(#HMnFvhzAVR>KHj;8Am2nN*&Ul|mg5NCY1=O)A!FOd z)V5)N?g*p_Uyu{&B)<^-y>2a1JzF-U)S!;4q3jmRyQk}#-^gxj_d4ZN`23ixct;&^ z=aeXg?v~hdUU0*!1};qLvxEE({u>#XMsSd#m8Pha{zOKbD)v;3ZBWB2!~#Zm!YuWVO_w zU8+3<{(JLeVX}>Lli_5?g_c>>YDYi0T8OiIqJPGuq$`q?-8j2TmD>Av?SmQ{0&0F5 zE|adkrJm_(>o^9PK$)PM1XR3MAV9S8l+l@@Ld=w3E7{(B$(F<8kEQC{y%S0&A^y6| zuwlQCZVODS4-1AxV(LOwzDxFOe?E2Sgkwph$f8lNu!`XpT(G9?L_He z`e)baX0aCwULlHpIgEB+n}$}_CyF!(!f2?v-n75M`F(LP{A}~f9sNo(P;pdWi$|Eo zEI9hdJlhzPdMy3$_QUswbT2MOvSe^%a1|t;ufI)d0bzSG_hK#?520Q9peB_3?yt@A zR!x_!ua>U~EZxMVL&15`{aR@rF?)3TzX9dL@VM1-$)`P6N=_2HgF(Id`4Nam5&kwh z7AKS6B6xzS%Q4Sp?uYx34L;PapKPt$E>3DdxopP3yuYB&SY8)S+#9=WHxvwp5>K7d zbN=@Ja8~tQ>n+-RAUOAfg!k__A(8VkUOSDu4%=P=_J%LcU(pI~5}aDS^ZveQ0uB(EK+tIyYZs?lk2lB-sc?wcrzxmAR&P(UdJVwuiXH-805saZ}|8^ z2B=w%&CkJ|+0VY0!?a`k#Ap{p_BHmNMhAMy$9-u_z@+Ewd-qOVN^8Yl_t7Uw{xnV| zCEb#_N!m;!7)z7MUjOrm-+T9_&U?Pav8XyHBU&2y3~p}zUZ37^>*0Ud^xn;TB2`|# zy{Kq>GrpW8^{}TBuT{5!mSdX(d1G)zVSV&c0ZS?Q-c|nYtbJPN8A`&|!T*KqUx}Cr zQ?B?dwzeDYmn`>qp(6k!UzQV`;ps3^cKQCO5MmYtg+@fMG95^Tk{aB>c3&&sY8PrE z_vztCkSQ0IlWMTqyGyAU;7{iE)$*pe`@>J4%P8V=`%3y)4=1=TWKvXjMyBp5Ek-D}|S>rI5*(G0PIs9wx_pdsz*q{!5 z`q3N1NP1hFzU0*!wPsK>uD{jIkI!inI1KiatXtvlbV{_j!i^eut0OhXQ^&hsoZVvQ zf9PsIclhy;?pDNb43AnC;x~S0Rt`l#li&1hX*YK`~aVnCQ{ z#au>)K97X>Qqb2AVezcTvU|Dc*almR#;xMSJHno;@i|}L_UjBqbHy!@8I&)T$-YYy z=|LE%sdR_^K}Fc201#ll2=`=93#CSSc>WV(2FqO{pE9bGW-15 z+t{fYN|%pf`kYVR^M0wghnvWe`5W&yjhk1FrGo*xLP934*o(xw4@TbUziMtgekACb zWiwdx`9=D+pn^yl@p_1z|BAWd4ec+>Tsi5){1LycrRdodE?Ue#CJsjCr`wVJHx0@a 
zuhRfr7tHAwCH^xQLe+Eqe(#TXmaT5#5YxVGV^L?8&p6b5HfCPhEnXfn{EMX;a%+T2y=3%$jgA$6}dzao}|}rotA6Woq)rC-5?K@fK_Pq zMJ+3JHQS?=xJajdtrb0Z$I203V11hH;Fg-imphrqtbufQRgC_9lAK9$Zq_lgV?AP! z3{LsHs0U@0dWpSS5ct@vg-;ii`=rdM(KX`P$G{>=*VVh}66UG#c&i+G4M<+?Qs;5S zyi5Lv(3`HpffY1_*(xme8-2D^Z22#}f9$Z$saSt2OGz1-$@>hot9oTB^!g=P`c8(3 z8z?6)744;_&0_ysHQl2?jo+!be+6%0OrWBV0)R$cI#F&kj2lwT_`XVFDyx$BMd8^Y zoyEn)(r`lYeZ>^v>YH_B#jt`?Z4YIt-F8Xo@yxQz>hXa%v0BD^AhnzI#cq9To_0ieMi z_{Og1?q2@5jqWWzv$6N2n>W0M=1Z}V z!xvGSE?n`2?3?dDdmVj=!uEOndo!jJ=N19|%Nn1h#7Ki`&i~d7#5_4bNf$34z6$ld z56MLnmo^Q_OMZ4BB+F+LI~vncT~Lj5xHa)LM)WI4pLcXhC^ngt64UP)ztnO_e6FNT zaJ)Q}B;wtD$fA~BqW2cdA*2TCO>on@X2N8$$+=JTmyrIWvPCUd?MRdBJ^U=1RgVbb zbC~a>MNgzhRG3_z$9$D%|HVW^L-MmU<(_M993_;PsWQn~mSlP;LnID>7eG2!1kWL$=ppELCKFZ4K7;VqshMj@^`ZCs`>@&k> zgFh?pCkYrgt9n$)Dic->-OAm5x*E$@!{WUVuLBu!)bEef0;$3-`T~j3!$=Ek>*5$2fQ2S=vY} zxf_a8?Hgl@Xz;v-v1+{c!btZs@;Kr7^3s;O5_&JTyl4R%C-JTTb2%4-pd}|Lokn0#?7v zvcf6V18muO32bhQ0W0&bYZA2^5m)D06zO=nyAXLxq~&7&V+oD$(*kg)?=~O8h21ZI z9$9l+4$fxVgZu^D}%6ZVR#?H59fM)hUMY>;Rm~v*={5;_q2_l}}^m!bM z+7S4RLCE3b48$-hELzGIOa;Rjt?s#P8|F}!lMq-43!1DdAZ*AS!{Pr^_{|61uE2#nwzuIx=?51lY6l1!* zGQE%Wbv}iU=Bo{d!dhJqR#m?2-R7R6*vu8+{sxeG+yC#<@36q7bp&xMrT#8a8FqIB zbf7BEYgUg-^uw-cKzIQs3Fe)=hI4cIj`h2% zrB@6iyR-47w9$5=3~s~HOUG41Od=C&uw=}d?>yYv6O$;Gf}XR{hcgy&z5 zT7se%6r`y7dAzl0MZ+m)1dv;`_2=+5UgyVZuf=cG6aa2{vNM;@R34Hw&=KZqQus#9CZC9PUPc=pKKocz8p!eo@u>xWX#B)#Nt+aicYA*2y&ChE*kGA$$w_yQ;En27vi{gq8dx#WWxSS__9ONPZJ!U*!9HiOFxu7d0nP&%GyQ zXJy^{@NweQ(p-emd^)3%p_(walf+qo_tt(Xb)D)hA!0U?FAhz%AN!S~m{il3=Xk}w z)Cqav?F7x__&NCegy`YTQF#B0FOXdk@I@TdYKX5cWTw&0VSl-g$70}3_#uFGP27RQ zA_P#V@uQ~hP-3t7vDNyZyJs`l4g~dUJsBQhEme+NEjS4oO?;tI=Mic1PhXC;7aw0x zvA7z>V)ujw2wNPiJSypG|1dE$*TMVJ^m#$Hh9Otb*nz#QV#IaX7z4QG+Ep3Q@n#zQ6vMbXUS15U4u*37dzoU-t3 z0=q6}kAMHRkp3cZk?+=2O;}qoV7?U{8cR4x@1KW!wqMNT&knxd)h;07sq`;}@ndDB zbd3Y4>EgWIL|dj)B6yBC&jaws=h-?`L4~p*@LYDqL=H*`4oX^1<++y<+{w2jd@dk7 z>N0=I9YA|mDlx1hR~|uxfnBh}1MUeZNdVEjK6fcnGT5zUsZ389Ib8pIRffj0fg*WT 
zonp4y`?p4Jy?5J*T~b%RJRjK?L~J5WjAr?y~9`ukp3}1(#N5a%G=A?iv~FWh_ilj12^ZwC4Qa0 z3i$z)F;Ex5H5;C;!xK6{VlsFmiMjeRx~nPrVaHBa|LaHfj_o7~>WNnx+r{bL=X^CF z`24&$I|y8f5~{pa&&K77OxNTUnB&R1VT zPAD546?5_uIP~F`O!T>m3AmO_%g8UVWXDj08NCSZV0}Z(Q*;K* z;VxKzqw3SWbnY?FJ-iYXK)TnskD^>EZub_uiGCsMwjFLYT52F*6!oNHZ+oUeMll0K z;ewHTcFOVAbPfd>n`W1Z#q*Ces-8$>HL@|NfeR*s!p-GHYY1+19cD3Qu`=9&oy%j> zRQQ-NBgV<6BW?m+N*|D-^aKEy!mHrUv6R?73K|sJjemvxV9CR)JTESDM31GzASwT7 zne=6v`}`arz04}9yLNN8yXD4jGTw3kut+NxwFhMX*wM6}BaN+r8|Oz`(A5E$P33f_+BBIu6na*Rj;ppD zrd^(YXC2#{M1m1AiSYSGADfz~h*Dba?9dM}RoPNPVD^Q|=_0!k;T7(Cix)Q#w<$T5 zP)Zzv>>6+GEp`^Z{aXtmvZ225Ij99qQ`kFJOq2ThoNnE>K;D+e*NsJ@Ic#kG2c_AP zxlFrH_m*PX9Mcy0IvS@c83clMnoqt>7`;#lUjX8``9~r5Ijia$r(VA=Pfp*>r^xx3 zz!|Q$eWktSEGtd9UKbKN1<3G-u{@su+<~!za(pHneA6dP$ zre1l}0zn6snIUzhwWZxU^`UN}G)duw3#r<9%9P7UoyX?SZ^LlNJ6q{%#Sgh3{a-ES zkOo?i+)e)hBEJG6dKUJt;AHM7%s2l;({9d}vOg$G= zs>JM#t{uZIr4zl>Zi%Sx7p?n0D+3e7335m1X3Q=a7F4W*2 z=!4Qjr!=NN7=GaZgvKw(^|xC9Zsd@^Tj0uUpCc$ zWs}>W^7uVB<16=N^7XtbF(C^bltDixFo`-pY(*>x$w^g!!?hsvVemy8ZRp6sk**(6{9x?wx{qt^D%-siK2Ke4fjO%G=N~PvI z&J>hi%dgGzQ_K;;%cDOBjZK^esw;5Z5QgB>ic++%r8%^C0EL;>3BI<}OZ;Yy!q3i( zhGkj_6k}*_kwIPAhMF9$8IiHQ+59(9&@{2iFKAN6XJ|TSP$~Xtnj}9@PC4aMk3*eG-hAYI5BIPJWvSU-q(iEr!*%?@1xOFBUaN`x;e^3Z{A0#?` zZyAYjjHp&@WBH&7nkd<#)vkY=UAyGq;QT_AJIT$%u-(hTByN8AfW;OV_M=q(*w!ou zAPpjzR+e;tf-mG#6kASU6)C1p|K^V^#wH*pds(7z1BJhW{NEJ4o3MNQ#&1^Qxzy?S z%qK_*@v22GS{jwTJX7|tv9~3PoRmQqJOX@_{I)j4_S$*$nT2ZsDsugO2*YfZCEJOw8PN$4De-oL}$)&Ai?BsX+Gz+~$cO=wv8k z+-%R(=KB4kYIrx2A%~2X6?LOFJ^Uz~zjNkzdsZotrpLw}K-VsEs2W;EsZyFxU=+!~ zz+m$(+6v$A)nu{dYi}|mB7 z^m3sj-z0Oz39uk&3kxRrvnvVkwPGaRu*R+__tY-a9j!ckXE^)vcUR?!3?v#J^WPJ6 zm3kh1C?1ldm&SHq9Q5XJtLk!W-s`3|YrW}9!O0!2oGD7_jZIE1n%{e0`9m>e0Wu@+ zuM$&O1f*I<`EYtt)V%5#EsqQbz2iC{tW0i%b0P?L z@TG8dm0LwKQ`Dn?I}k5?6zOq<51kg;w#%!!f^tN39xKEBKaV!i#un9T>4IF}3C)Ey zemd$P(%>7hFwCG)l3KMVf6fvtm;>B<3{&1QVjhhJxb@l&Y#EcunrX}7fQ+fP8Q zM4qk@vAW<^d(5=HcrA-^Z;x(B_f#4(&K-s`DPm8FZr!7^Ki@;)=BHAZF`23C>;6st 
z$>7LurW=u%wdwZ}B{*s!O5vk$MdB4jCw876vZ@^k1E}Jz;)TMl2|9E2y^t?L@s+aw zWc)T7b~(+Wb8WlPm-<$b%x#l5fh&g>FqcFCnWn>fxJVa;yd*a8=;6{bOSE;;&dTMkURRv4Ug*? zOxYp^HMhj6sQK+A$ruEPGqsRBl|^rVRfOfwIJBHG$EfE*S?Y>kgnuf~VDjbn^XCO^ z`bDEho?A75u`n?75i=ACiZ#8aTn$Wr3QCgA+pNRcX3wM|ENz!1``02bI`2Xy=#2s! z7KTlhrARMCOZI z-dduk2B`Kez$HL%Tuj#7^L;v7LNTP+pRn2>la=`&O@!mhY7}rYP#KEC>T-IMNqGlYUOGkV%P&rn$nV1pkWR>tM6zl4pc%wpc1=Ze}(2voa2UTDK&7c%ysDZeWM zd?t(wZ-GnYy9Hdgy+46HPH8&nhE)j!eh*~UA75W3ngJwN7LgOUl)Te9^NGqaHrtW9 zp^Xb#lYx5(O2l$5pqp1!B7prRu{O_d_`I>N>3jS1VEs=Cf_szplk&3yHiq*#@j@;Z z=?VC9(9-Ax(1tsk@QL^%v1>2`cGkaEopV*AqcS3XOq`NETOWPvaj*&)tY?>a;_qa9 zn06`i1@0x@aG}LuYK-I9zRr2OWGrV~Bd!ET(Z{pVxpqRW2(-+icrEBEfR3`($8~@< z%D{$U#7p7-xMmgD;N%%aOli+(k(}0A)R1MP;hd?P*x>RV!O4P(exs7cUtd9Q$AAR+ z6=S%tO`OwDK;&XpStA4@s48m4+%X4qX<2L?X7Xg*oQiqsTNI?>IIGug;9CJH zzB7=(X)H=pjmv~Z@guB7;pf50H_yBB%7TF^@VjuDZGiK5Ou8UNwob!Qp`iyT{LaS{ z>#0}ZL%y+{Iyjk|aiA19feQF%hL>{u>7pGvW}>T4tv}$yg#a#u?2z_;Dbxz8=l3GT z1NZ|S8dJUb_4t!L*6opmfv2;%_dBkl>(&7P{POHwj4xB2bULJs}zG7wuo`UVQv?D>}Mv4-kGWE$Z ztD?KkA}$a&qEAN{S0OAgMqBv%Sl#cNoGP&~RD?oMhL95BM>SFz=dl=?dFv&z7EG$5 zo#xg+uFvh~=4kJOkqdRqv0{dVc73(-Z{4$fHM*>1%a`~Mi<1n$YHxJYK3AEE!wz2P z7+j816NlzfaBgSxZi+rXv|4lm>V(?k2OuKE(07Q=H$LYRqCf2oYgX<`iZ*X}C4M5`QO($ukha@vNSCA;~PTVNb+C$3^DBz754b<7Z@y z1GOnfl-3RT0F3+JP!oeM6a5QF9$;>tIxQp96HW`Wq-iqXOMwnlg*Vu)1x3qV)Kg;i z7+m7Y1vDQ2r}8<q1VTdd?`5CD8KM%vs`6qCFi2mM4)tBNHtOY17Vf2lT9ai(GZYm z(Tu^?mh8IB7T7fJN3R*ZU2?ze^nSlz=B{saQby|$cB4}$9J41hr%Kb&(NXzT-r0Y& z6@N8Qu1~!V3n7BUZCQ_jT>@Y{pI_}SEl4or9DBbH5)e{m7*t#NeJ7D?PtPws*@oIp zz5@!;qk;!Dw;Lj!2CZglDPQ=r{6?YN^OQ4$6CB|!Mi+LDugQq@yOM>{jP0$2{Ye%A zEdm>|7~lq7mkn7=9*hGK$^#HR2s&KVA2h@dFw_LoVwgj`rXgs>I1*D;`3YhsmYghh z0E(jEeDi_^lc*Ibn$2+U=%%yg#eHU-5EapkerZ zx{9~_=n9Ud~oM!>MeQ6CeL?^v9Tk2jPE|de8&>N{&-=1&YnTsleZJ{0mX|`b4 z>&Q`oO4R+{$rffb&0r)>#|}T(m6_-q);---P|oyLwruQk(H(3dx;?z;hmEcu%Kgt@ zyhZPyhZyT$>z%a-RZOqGcp0M~GFb=soeOu|I^oAXjgGjREQYEq-?G_Lvf)@-fZlS5 zh*18Ai|(R8?xYPt5twiS`3P8QBa1cKx*Yc|C|MZ2=-l@Qa)Eg-=df8rytOZA76|Gf 
zw=DURt*BO;hBNv4$RcvVV<$osG)E>1G;$0ZoVgMMnQvl(V^e|#+;2|Xr85INA8EJ{ z&x~dd^p~Fcvfn?9ko~vA_2&_xwE1^olYUTiJwmE4%gAuBpD+$ z<^NH-2(RK6(~Q8F2g`8#4Os+@Q0zx~#=v4!41zyRhQg&m;0^BZ`A+Nb5#{~Pz4U$g z(1?4TH+}>H;yLfg3lvyjG{gCUyj$Xmz$aQdb=@x+%%c?-PA-+DZ9$#=zk~N*}Hq6kv0@_D&CGH1j0o3~8)3tKzMx^9z zttO26fE--SpqsbCJg=ds6<*IxaWfb=59q!JV+qz|T+AcDxChZN3 zsTQm%n!K1cC%aI3?|qi>r#jp=KpUbQQUta-GdHAC5w5zi?*V1`l?jOw=GcU|t--)~ z!2t5iKT2u-)zXuok4#Om3^PD=^~HeN5`{xX2mvYE$aN&|e+De3oxu6vXQy4H%|=a! zv`q#kER6+|9BotRU~o+2U^|vW8^FuoNb(zttAery)I6MxZ|HA#c5z}1R7%76kzeQ%0}2NI1S`A3*D)Qhw6ACnM$ zS93nrP|)GB0g7#3(%m`R@@DzRsr?!5%gLO2@7@51Z#oj#JN8gP8?wNT_s*;ju35^! za2*FHM_2sv@H1$-L-U)(;h1q?;a5RaTBIUpz({FSUnQ|t#{N>fk zOX6QO2LfrVR6eI*s2zn= z{7cOVFjO;UxdU1Bo9B+$<^#5mSdkcfV^G0|fMwd-FEiWDwTRWNU`Bh~nZFrNzzm>) zQ)@(2sVSd;jVw2&eSQu&OQ}f4xmwdy(^%ge^%6o@}l8=b7PLzRE&st#zs|B9T?fo0qS zTFb(u(gMH98ZZ-t+z5p$Ry%;hasUPlBwqOVBQI-kJq@crcw)0EJ`Pp{?So9kg7h`T z`i~VKCB?+%j7U!undWo$cK@EJYwh6<%|~B=YhLJy=(h%862D&f zJTn~>$|9I9@`;znyuT|Y1=NV%zx?gVCApK(+bc1&%8okf? zZ*)aaCrt`?jic$h4W2RqZa|!@*8C**=RfWYi7Pz}I8vs_u?uF}wVv$Az42L6n7 zIXL^kAM@JhCtC$YaGIG^5qM!?;zukh)x!O1+Kh%3)xMNoQXyp!tkkP(26YQ$;yYQ)%8j_!)o!U5}u0 zL@5O;LeK?DUE~gv@BGvSGmT70=Ym16dqFKIyUQT^TR22emTBDj=%dExHvnE}(1Xy& z{Zt(`4vjvITQmV}YS2m*g_@6qW6=kO-ByAa*Hi6@CSp&T9o7CceplrUI@l~fqj>cI zT++u#7j+dUsT9k`c9iQPgqeGQmRc@?`HcPhJqy@(6ucds1g*%Ivj~dS)nw+7MZ#P(!USaOr)Ibf%0|;-9?&1Tk-9^U z_}Tr`k35+a%B5r0@%EeVm%}&u#A}6omEfGy{8~a&7tC3ti!pb2UdJB64>_jtwBbpKcdWIMDAh7AEc)%+HsBE zrYvXE%zbxwXqFBVaU1Z`kT?eM%BuzgT6T;kQ(_1Qy|ZSZ1AxXzz%*W_)|J5kKwVJH z0n>pS(xuOuMKILCG;b~D5C(oGLd=U{E$6iB#u!e`M5rsmnVNy4w!*aM_*?LXK6VGo zV*t4^HC(~!XMdZ#9)$$aj-}q!IRm-gK^`*oWX6+SMe}s|sIVC_|L8nBSU9FSmn z0>O&})au~V=(uny&}7K!q~ZhzO&&YlXJhMGmw72<@+Alb0~%(^^+XIz!x<`Ar<|2M zkYm6h!rL)V53etx<+UPuIf8e$Ns>#j?qC9kmdW=u=qG#25{T1$5uK_KCdPaLr}s^g z-O+|3p9b?Zfi?`VMI7f#Q5j&*_>iHqMtReWHmOBsa2Ve-cPI3&X$YW7R+)cYHY?4zu6n$;gb%w`jlu}i{Z?{<{6;YQVZCJPm`cs z5nugM6~ZuD=0s<@h0?ig`dMl2#7}*_iCte#Uu_92=BT%3pI9GMvxYhhkpij2PofD6Y}oQMSs~5TPg_U0xdc^v+&AZqnNONNWZZI 
z!v@X~1r+_B2$&xrAY7y!c|aE?i2-Y4d^PtP&cEW8o=*$ z%-{<_xgc}dy+8mgh$wFZ76F1gq!UO<_2zgWC#4d_s>grIwL>e&he;x#hV_nqcd;7Y zM_ixqX`v5P|EM5uSz^jG4zgZ1=_YKUQ=eV6oi2DGORd7bdkw{8{8DZd_5SJa zU*36rj5u&vi@w-^bVVpJ^K$B!-tk+hx@_J*+veZsgOYePIr{h^&|_iB?FWvEWki}V zpp%0>RKa7j(G;MX`*4CT3e)P)dmQ>tcg{D8%Y|KkpqTgTzCCCE^u6dTMA9EJ|FRnN z11n3NMuzKhKtBVd{oqfTHb|^QBzPoX3D~{Yp#oKcx^%jv2-~M`GzAh|JrDy@I|FV_bous%%M)V1WF_B8Y{aWQY3X%NV2Hl{9CbU1Q6Vx} zNtTC@9xKw=eoYa(#Hnbm!6k@>^FnBzj2y?(Klk1EOb0rytL|DpveF)?vh=>{*0&Kv zcd&qo3ix``uN0{2E;(_Z6G1M@hoF+21ndfMvsciwP~v4OhLT2*d2exGJpxs|7f*fM zTpJqg6!>zjm%ZS2dGj$<%L=CYiXU&aJ4FQOVbCDToAz;2VP#DkOu3h$k{H7_n(w2> zgFTlpdJ|VL5gXE=tkNIgiN~VONa_~rih;;#xR{TM=}>WbLWbFRm6HvCjZ%M*09pvl zau6Bd+V}?Fu}~fbt)rR4f3a5BcGy?o(m_f}ZwIv~X7wd}a`fm>e8V~? z8_RyxIB18(qSV?yqhX~zBQlHi1JU!;Jcw)%#+?XG-8&gbyH>Ol44?nx_(2U*L{*6Pk=#>(8gNwIQ5Y=!0QW+Z#LC9N zEN3XWCIHH`J_WzSBYV$w)8hK&}MUpFm_U3G`_R#XBZSmPPd-8x%)|ZcK!3_)Ud6B z{H-1ckaA4AjBDqSh3*|&I9!InTEb&e?{4*23b;;3fsr>n%oET;cl^<^ha#b_ayD*! z$G0)lTUlddR!e?eo_ju{f5wT0spi|1zdZV=%3=^Wydj*c-?zc+8dINAo0=gp2__5H zOy%~AcaC&Q2;@G`BBvB}Q^cvJV}Ab~_VF2arJwh>pD+q-0V6*^ou{m@_rGRbh8Y+@ z=o8;>CiP<_s(0I=U61>eaLQd7tq!$hYRLky(OQ1}YM8jIh2dy_b{P+~Ta%*GHwh1r z{GBxR=e+=&YQi90G*{0!R8iZddrZm`nrPt!s0LZ=GAVImI-JzYEd zr(ij`ZoZNpkwRdfKV^bNhE~BV%VEC)3*v#hv3OYhIK=nl${8na8nNr^3+v`EL1Iv~{!6)J;v)7lAmJWZQ4cdH5>r_=b zsmMmU=U*N0u)JiPbfi_KA0U*|N+ZoO&PW0+xO$+Wn@pyih*b7kJ}1 zJqzRjj;CB>TanMp6>-~kR14dIS8~)8)a$Qsk?v*52HsZD^hXzGL1e>xVVIV|iX{hw zZ_Bb5N@VBxAL8KP#9d_4qy{UVvc`za5hb|G!BoMRUUlHa5aKZtI5j0td9gl{WmBk! 
z^}cK`zjHzej?G)`(%F1eAQo>9KslQAe&09TRW-QAz{+c%?^Xa`yG17jr zd9y=$e|BEr4w^u|I;_(ex%GM!ANgWsK}gpweudlSy+WPV){Z$AfcJ8h%gC;kh+><& zQF+@SS<|ZcHX6N@*Kl_GF}e41vmBJ)R9RHN&`&Ez8wRZ3OyKx~DeIQG7r!&oG*cam z(e%_p`n{vlFo9ZDmC+Y{>lqNe0E{_(8C~fE^XnQRCL;B0V0Hsng1J-i0s1{y0pw7#J*Je4J-=q_xR#t*z{K-}dCn50lh+5uYzoMDdfyM`wlXlxT2 z%x!75xo(r?M_k1XF^>cEG^7=rT;@r$5~s>%Uaa=TDOksv%;;xBr~NS^{R0cS*gYgT zA2op_`iPrdqX71-FB#j-z)S{UCh?;+#e#0Hau0I`<{@U{gp^t&z#EA{gTnH|DLlZ% z2jHK-0||LmiAh5&pZ?sapxtc~s zpdN!+rg|JO+nbzU=+LwWvJ@Za$dk8wk1K=s1);$L);J43fl7Oqpb)FDIKi6@qNJTF zA#c^Qq?eU6Q*#}oHyB~NBABAM5l?6RTPr}?$6-1&(dqT>PNKpKabi5aY7~PiFx(UY z5$rSyITdUjmcB80^_C_EbmFa)!`C=X>cPAUIavB<8(mr)(u5Zl5kWVY(=bL{QG^$# zv6rc6C!v+Bmhp(Y5U;}#1<)GfM@krl=W_uSB3or1=>HdN46j*V-t{56*rjr-8rjyn z1%UBI&{ggscK~g`4~>3gGd7DcU5a4x#bvqIFSHBSNdE24C82mT4AYjlO#~{yB%_Uh zL_!1&6Cs2>)G-j9PFK;`DiU2B(t@6 z0>j?X_jgV8`XuFXKB!&;{D=y?(}2C!O!Yms!hKFDeOJZ+4cK}8Z)BKIQd44$Rv<9M z*E9MGh=)X@rlc#Ibd9_do=$;2HP*yK&nG~*3ueCR@V>7A?;#=8_LjNuD+`Q^go;J~ z3Ur_h(DP|$F;@c@cCz!p=OM@rWXy61x@pl93|Grzn;WJia@MY z28(==u&d3Dm@imB9SQhgyK^eB^vXke)_Mce^9+)Rv{g^Zywse+jn_j(_j%dB1Hd@cp-WNur3zB4xm zK|T%)dfnGF2~Wzl!9rdr_;R2G{cqk9j{m5$pX8LUDE#DiN=}R%eIUGz-EWQm*OX@e zAnLQN6?%pXJ|#bi_h1xs7|?l0A!zp}E!odE)v-iP1V*0$M?@Lmd`ypkt3e`Wt2-Cn zJUplk5L3i}VqiYJT#%h@k#Bbh57=?tl*RZNIlrPQ4}c?@n#<5i9<<<2^mcqUf~}Y} zpD)>rfj3iKRM9NHrAk@L3;JE%MqpP*ZprH+nxUbw3+4W0vVk?d z_yx6pB0*E0{@#+AY#tPs1*FjbkEyeas;c|KJtZk1DBay4-KBIYA>Ae29QugTjdUyB zDc#*5-5k2RIdI@^-~YIG+%J4%4EAR2wdS19^ZZuR=Uf@2cHQA_M#7JDiuAmIo5X*& z2yqoC^H)8k;B)gm9)N$;;P^j6iT(zwMM@-KJUB;&Z{3fJ#b?!)`0EB#*!P%w#lbF> zo``kOcL8!Zl)|{yKY@ApW^j>|jG@PXW~>y7hASXa5W3*TIe60jqWjIl)Jmu3IjfCi zC%X^6C9!EjC)2BmgD>(`fnD#gH+1tOtHFb6fsbHvN>>{XH1>N1z~GRDL>wC z#Q;YV3~(0mIj)K1JQigFQ$>k4ly~S8eE*yq4(eX5T@14rM4dq2wCC_L)9rdl&pf1yyno~Sqh z0jYf8?QRtVkN8SLH*w$W+6G%Kx;1?MFq1;uKO<5cSiDl#yrzwziVe+^F*PxYZvjiB zV34VDzDJGG5p?VGsNIzy19M%{xf`8@@Q{GLS;K39IC4%8Z&M9?6Oq||?d&s1W?-m&iswee6LkIXZl3_bVpA;R>{7Gz*v%bi+(JbQd96D~y3n7fdcc 
zyH|3s41B`BjiH@~4n}_4@__pswyd2%Kjoe+>NA%@Do)TywUsON_qC)Gpfok(>fL)) z3&JU;aQt^h+{plMzXX+srlJZ>30PVjDQ9En$Du$LrQh`sP!#Ne;?hAK_}FgjUJl1P zkmPbD=E{NY-Vb_EC}MtI$@wqN+i8RJH^Et&mgiq;lR?rEhz^B)+Rd7_@faotWGh*O zz*0(DApG;gcNuV1qwcDoUPtbIERwV1amkIv%?D=EE2PY$V>rL&fDP>}<%hY*Z#_n6 zRx*I(`b|eaDN;8U4VSh^yPCSop^{?c(=|77|W@S!%L^>SAjb@*Rz!*AtUsTN_fvJ=&; z`Vi90)(R_9+j{J4)P;VtM|H{%r9r%y*XLEk{tc8BB9w6}cIjITAIR(RW#yp2 z%lVwpXl{YAVb_1d+%5-vPhnGlX1xv^9>Z1uZ3@T4Ar2b9*VdnMRC6T<(}~eAlW|*I z%O@hYKIV~4&YhNtK)sKJ>v9OL0Bj6m3V>Z+EnWwW13iNj^CN23&Mr=a#Wt^jSJ)1Lz{F4 zc97P|zHCiUlq{J1+C`Qivp0Ol)GXan5~Ac*y_&ysN*n{@+D}wr7jhPq1x3;D$|gru zckYm`!iUX}8t_^6hPbx+@tbenZ87EZ7z(I}pYeN%FUbYJ_+hw9D@1xgB((`2~sge66?zijk<#(4<> zY@2EOgwN-3&Tm6rDQ2#{Jcv$`0IN$2=}j0g77WVxmv5S)bsmweI|Hh4*-j}}Zfw$G3wSm>U-4(yIwfdMh-CZb?(Q@Qh^oJWO&-xaxPKD{ zecTkvboZ|tsQp1AS(epi_(7U06X?>AlJCDR7Ma1WdeSpdSH$lE?(UmL`|hLJ%g${>o<~=;gEKlzV%^c$m70s z1Xn-)3#O2-NTRyxDRJeJsQp34iAt2$GRtwnuU|BU^-7+`Xqwh!ew2DyOFwy)d}^zw z88ki4O4t7!bm=9#y3Gryxw&i93y|-NG3;-bYqr?&*hNosI##n%z5^CnT)&ahVk<}o zAbJeYoejY%1>3=m&kuvu!kcT@=vd)5Mn;Pq{QhlO&m?L|_E8`$0ch#_dti zZU9XP_k6Qsrs!kfRTDXMt#lsFNM)^+0fzTnCd__X0l#9<+?ZncDYm8Gk=)5S)9hIX z#Ph>Fc^=LVJU08P#}(zSsZ4mi#<@LW590Y=d$l(V->8M?`tV4SOh`EDG`T@PJh^=N z_qKWzjH}rZdFR7-YhWR@6D{%QKGj)mRpvr27iWtvV$t%76jA4*U0?4!b#BDjDd_7G z2GNj_AdL!jv%1J=`@06@PR5B=2HEm(?d4_#O92r7`@3=&>qb% zUHI$KsDep@)o?P4`a-z_F#{I5%i_aa>Z_jHE_~t_XH$b?ZRg7Z4ii3s7n`oD%Cxg* z1LwsnGFGpe=Z+}y3h9yynmy0$V+Wz}G+Pvo-ZIs`Ko;=v4RPTy*vM z3Zh2(B{i902Y+6EG{~6cz@3i(w4M4=djAEPuO~@6ew)ZqKu=Ox7TA^-An5CXJhQObWL_o3(s5Z zcrThhR@vx_xb;`aCj75noe0&~ob7sojYM$yJyv++xl@$&`NaG#))llYwOT#jXEyEi z;QhBtf}(&=?S}-p$|mFm;9?ZNE%dpBz%u6fI2Qap<}nM)t)tI4PXGHlqaN~_OZ!pg zfTq_;b3s*sTr8b=bnYr-!Xe|&;b!s3-Dz;%__d>$R&vy$PZuqL=-#Tw^~HNpV7n?7 zY_>gih|WU5_2Bbzq<`HK$5vY~zWB$Lj3p*t0n+N99@b-};Yb8ccAF4}IlN=p`lFcl zTqav`@f`15nlze9Yts4>y$>bsT+LU^t8m57@TW4`>Q9}fyFUy4+=T3O7|;N&$81w@ z)73XJ_N}cO~7E7T9`44;q&`R^#_+uhVCVwTGVPiXts9J^DgY>H_H%Num>b1i5*|-gvR2Q zh9rBz73V=j>-p2q7`tQh-$%CRL|Z3?`Wi)b#C-~_k*StM|iAH;9*MJb8UBj 
z72Ct8g;{-z9Vv@PS(WJYz3C}+mx;a$`z|Z#(!Kxt;kYeBc9IZXqJGQz%%KzpofNd@ zf9pqxuN`!yGT0W|7`8QC+al=wTCV8kMQ3HK8hm7EE(txl-{_Hj8BG&xp~ zO!0@ijca+YFS&FwIC71)t|h0H+&f1RAF^3&faa{~RlrE{xX#$PEzwlggXy1m&7h1m z5=U;7Os^981y$-*JjG=$(ni1>&XJ2)Yv2 zj23&}f&{?`bIhA#CsA2`(H4iu^9}M)j9MDLfg2Qpl*cSKqZ4ST<7h*FxUHGOcB|Ti zj#*CIzlEcvAOKP{tEjya#I$T>o4ip-Ig#1I#ArZSMIYAcrecI>n%#O#*V@+7%r4xJ zl6|w&ZM`}SvQ_1K?Y`~IFRKcV*2Ba&*o8Zw8YPJDkGfmg#y~1Q+x(v$Qo;@OiQh}l z&3meydi^5;XsNYzKVOSL>~m*`nGK`(8@aol83ZEqs|jqgff=glYyo`r!V0hd7qFE| z?hH%tt|BoAUNS;rmHW1bqzgUs_&(^p3qhJQM*j;SUK%I6i|VKP5Dj{o|4Qm zpDv)Ay@4OWX#~bd09ptLddggMB{01(m5eL{AP`{mwDLZ=0T^)aP6IBdS{Fbp1T0|Z zNhh@1(Qg^Xfyuq;Ky)UZBEI1w-S-yR?4X3)eQpix|GBh*pB~h2a9M8ac;t`GtL>u1 z0+NNZv|Ep6l5*j5NnucBf2?_pvs4kndj@u= zFTJ3ez!`DwI2a3Pe{r4xYb0RGBJ31~G z8Psj6V*7&E!*YM~N{(jWhBh_J>rt9n~FaL15=>BSeQ#4x|x)|~26jB+$?rD+dfnDe< zCHB+h9gczx4S@1;y1XAEu;pj@QnT=D0)<@E#qN5LQ|`k_db`Vc=kKm;Y#W$OI(g-- zcD3iL;mJ#gR_o2uGG^MG<3j6(R!@g+gN;E&Yg5v=QL~BiSB1f&`HC1)0WP0tyY}}> zduA()+P*Ik(EUL81j_uke~SSyK-3mIt99riW-X@hFh+Q)Hp&*i zRQvt}+dD%P6wKDap~tgXswxY9)GXCXySQBlGh2n6T$-0(8bA(w*Vp0ezXE$CV3|25 z4*(W}0U*)?u8UAvz-rc+%?L#10kSf-RWCqg>W?6ZdA$e>BDm8wtO8wnJu+gnO+ej{ zm76ynAP^f(qrs#N4pP459Q7N}(?X$@%0w*l@4C_OEM8YDSQ4+#+&tv)9VuxBr)$4; zy#HFW^>VwxBfhTh} z=jBZWO_p+lYCe;kzO-7;_>?;hB)DMK+qoZ^ar%E)yCZ$tu-g}&o$ai&(#)_H9?Iu?mn>cVZ4$^ zD0G^SOm93*WN*8;BP5DcqtQ6-w%n-A`{ZzXLR6>CW=ygzgk7(#Om|g*MC({CojJ

C$H;grA)kTfh4v04S~PX!LR$po#(HZH{B}3tZHe$%StAU0EQ3 zXSo{b$K4Q8+p&6*F^?5Z8K0TRxF~_Cyb)T{*ws6wPMnbG8NA8&6w?&d!;C z;wD~z-spBcJz->aLrCmJaast$XJI)l3du~i1g;i^sG*%75u_L!>>xQRuq8V}0h`Xa z9>5VpeSiGhd8GfTT&K-WITN{#U|dJL)L)r4FNAFQ{slNC`n&7w0e7bmVil4wiT>b_QxZZ;u~?vRJ9!j z)kl+^l6<`3FG7C1Jj?NP%1OK zHnXMHJkzEBJDim$s~qpzMP5C7{nRP(Ee;+d&Jch$OxK3Q52z9`W@L$;=l#}MCfI{< zot>@0V;@lZgH(rI#Mw3irh>AK7BbA)ntu#+M ziEU0t5^-O7pXz`8@FizCHcDIYdj2;Ff5jdf08;->RD(DC4b^@3c+tAnzht^d=tb;t z^X<|Cs42DK91v3;tRPH-E;w%jI%#4BUH2<501{4QF%|8}O4GEHARQ|-`-L#Ueq|#4%pCGK6T#_nrycm9)ai&5r0`Aemx8@82%&jvf%sxka*}m!ekx<_F=5>Du~G~PTyXm5G7{| zInb8GC&dYYNkMn(&}Y%dAJ0c{DYw;snAC#iEM+&4C1n5BpWh?ndJLOwiMPY^dGpDLATpI}$w2F3bHH|~ z0PO`%d~}ZONPHEscBgTv)-M7B(vL1=2;ZHPF2l# z%~sRpa=l>GiqHD#e%%PgY`{Afmx+;L-!; zFTJY))yc|6j%%&e&L<_n=pK*#3$x_+z<$E7Cm$OrY3JR=K z2nmPecZ63FDOzOFQ9VW=!5lC+e$_zEsmUK;%T-NpeCX!LI{xxl7t?LDP-~@f1~VII zsx9QQ9M}CLr6UWDugWN|>f0d&%8%#yjUOtzqFFQqDiLq@!q2zlHGhBj+8*b; z)qk@;iC7+jVAEQzRY_h6P;J)TdqZ&m+KN%DCuiS`5{wow=%dP`V3A7f{~7~eUH_2Z z&2H9dzz{r+XE|;uvwu>D{G#DE;6(-$ATAWox^4kANA#=O+Njr5}l{n5a z6LiCg``l%qv_ICtF^_q+Ee_eYj|ov@OT0cg3*sAgyULZhMDB<454RB)9QD?_jgyK; zpDCLxdXh~1#20J)6`_#_!Z*h=>(YB>I5_I6q~fC-2y>uHQA=pO;gzTk10*j#=={BV^P%et$1@33Kg? 
zU&Jq1Nu@{hl1-Bk8JqlgffdOA0>r#bQJ?w?*};QcAPXwsaM2|sPW%r*1*E+m$Q*!> z>OYZ{28kk)Wc-ajqc6XQ`O95PzS5O8Kx$RV641?)TC4;PCn4pc`paYW-&AbTBi~d9 zclXCv0U6xOc_8}F~k5Jpp<*fF$wG7T3 zVU?S=16Ytoe^?(M*~|Rh+bxZlZc;FGBfU}~Gu0 z41s_*$Hxd~3mjMDP&ShqQpviEj!Sl8J%7L?lKDB>`^LY;HH2TCR<+u9c1p76G3s&C z=XK?S+X<8S%elsmjhE|Q`o+;i;;<$?+?ewR3&lj3=b)@&(#x}3;$i7rh284!#pB|u zsXP@DVSl>;OZckig@cd$91Q5gw<(;-&PM)RMR$4!+?E@aaC&R|50gi$F9QlM*gotT z!mG~h6Z{Ump1u4lxObjyH-{D&4@4r3VE&HMHZk%w=nj~JkNiQab2aqyz4BVC$=G`P znS)^m?qkdKQ`JUvyGII| z@qUMzz^d~Taift&3`^{}T`-zF^X9h^wjWf|!8-S?hEVRf)pO_FWTe0m zv~9Wa<5l#C2dlJKUhn8I9ZRv!m1QM=F1Lf!?a2&`x3?H*71IUjOXiE)FV=%QVqP-& z40NY4ZRUqEEgfmvA4c^jj08HiWOqF{%B8~12Ulb9-Syg`yQOE?Pms+A#kDi2f0Kg6 z*gF-R2B+b>-08vzo}2j{0OGvcg_`WvgW~u8X+-|Ivu>ul?xpeD<($)D>h($Tpp5ua zO_})t`XI`3z1>&+7T>r{i={u{t(C>;2!hdP|CPPpxh+G|1MG-mhl{tf>>R+T>1CCX(-QV%7LR}8NwSDD5J?##TG9I4XuCd}pXZLz!~sr0z(`@y`3=6`NFWCD%|mn& zYg&XIkLijk?@{1k;(gg{xvt%(Y)~WTd!fhcBZ|^l@A2al*;8N~V)Nx}hd{@^)O zu3ul|S1TkPo@nzoPxIYwqw8|N;i@6gZC|gQpi30YMLYRcIoVF2vofDzZy(r8?w?_K zg&*D`N%=N;ZS79>{05~*k8BB&4?yDC0twGn>+V(-uaLguv0X^ng-9VspR_eRYeX@3 zj6qKx3O`IZ;o{-cl8Q?{g^K&Eqg?gzDL=N&cX)`-PlStDc-<4mkP5jNp9KJ3CaZ}= zHlwb;N7e&$%@g5Tl?q%|9?h4t6~Z3G0XOLl4yOZUA5o_A<&%Z2$V*a)gfoR)FUK=m zQYiv|RPPuLw!u(t8RXtj9AL^O4%Aw5Khl3cuFM4k@RwOseC`HWD>}2_NcE2N$(<9G zcrX?WV{qyYjQA-++w11#I1|P9ois^5M;n06{RWC7Dq_Ddm<-VZ(R|Iqm%`}$pB{D2CLOd#9i2$sWNZK+g;ii#Sg zHwR;S{5#FYaEVL?D)g#29OnM8-J2ZgaApg3D-(<++ZTg;iSR_48^noG!l&1}RBtV< zOCk}G5%_eg9kgn9%Txcn)H~eFTzoXzU#L^4OEdR%?X0>`-y5bASW?Vx38O(?!+tH` zv>6O8<_Zf7>v+0{{XsjMv;F(WqfMqi5+T@@$9hFxLBTgy@rsIuUOD~hVJ%aSA)0jI zkC$y%_|IiKHA-&j??j|K8GuQ6doZp{uFG~;u#Yl+a`%;|XQq z>+nE?&kac=+>bMq_NCv83DqCl_1aS1dqq1Om#Qs9Q7v86a#ReO+|w*5?qIIv+Va)| zL$pyAM>8n)T}Ju<2zHyp zHvtf}Uv9vJj^Jj}RaaH3RI`-+7ZS1@ZTBqT3mL+Agp>c940z=*oaN1>>iTl9(US(H z1-11kS^4?-ELg?HS*=Vmwn<&LA@ynuhXXCqo&X%7Is7*9qFh2>0)S#F6 zm()m^{;BJsl}BF1$bt3rx7(f>8#USNoDqOK9Ov5&VRU&bd1lMI?mfi3Oukz#N>R`< zccnD3m+p?j9bEm4=`4O~uLVn0B2kZFePbFGFZ-i8N0Gf+i#VK(jBXY2M#tokkUTsK 
z4-sF&QLf3VNs4eU;~x5nF}>FAfrhG#wLD_%QwXoilkE3DTwfO!E_xm({hJuC5PZLD zrvauv!8nrjDr!_jUoUkUhGvTG!o~#y%0e967z$;C%uFo74o#_ z*cCi^4yXA4kVRVv&7myzYqIOf%1kE;?!AKyr5=ErL##%i**b<W zBg=b}Ka9H{L5|q1J|y^Rd?c2|$Ap_~u)}E%Eju{A93`O}-y;Hs3H^6?iI!j!H44?I zD^N&s3&?8+Q-Orj9~v=MVju!)w#-CMvZ>l|vAb$&c0TDZJ!wAr<2%wd=9}GM|Kzt4 z>l_r0-~B{XI1>v>dVVrMd(p5DxyZkAH4uBSG=$0AxYy{NCj@MbP_(0Z5q6&+o~A<0 z)BfE4tp%$-vo7Slju=*ota2cjFnEFyjKms#m1BHod|oLSVfCZ?oX$wVc4OD5;mu-! zS^7||7~^R&^YZJ`D!Xkke?=Sg3%>|_K)HU4Ki-{0e7_Re!X259aLZ0<(pt0ME( zjizeyekga|Ubn?z){#LqJ4F=SpQz5iQ#m}Bsn@UM{nFdziZ2yi^hds@Amom;gMu{S z4MmJN4UyNy?WNAm>e8?E&ilqjz;`8(_l;7trDG_YHKV-^+Oc#~_v=yeFIB4uX4DTC zd~O8pU6s{pSzQJK+5Ey1-|Mc(y)0|z$MJH4mO5|TOz6vHCD<Zq2VlvOsWxZ`#L&*2|o!D7o>D;}?MTTEXee3Lt z!Q_nHGw~RJw&x~(xyrPA*q`mLT9Kv|X2S<^aE$ha1-Jsrm=+!ui__=;&^q&IM?b&-bP)AAvkFx(=j_JqKN8(uC2(@{#CUpY0~K$g z^(TN}{IH)J6Uy-oCB+ahoyDB@@4D}6Rv~+U+GGdV76-Y4lg* zDYWwZ*bPRYN<_X73ZIHn6uM%$6$p{P z(&#t5a<-r0V&ocP2$QPvq1oejCF(*bE_=bZ+8vfJUy>|moyJA9cHK?p#f2l1S=sm8 z-CRU`E?KK4w-@8r_8*aJ%hn&eM1`(1_&k>8OFNJ+7p@-H4D>t@GJ9GQTy;1)9nyt@ zmUowav=eIiH=r+ko!!;EA(=OW9YEAyZub{zJzzp@5P#bP0tg}r8&n11mz6T~K7FF& z(dpJa)4|e}FpO4BA$Z(1|wPvBgP<~WR$&$7ys@#!u+zV&?kSJ#cKPVj#Vou1xK)HUfp>t5I zFidD0h|n$bz{tHpzy_dr@esddKJK!@5IE5>a{fyc% zVSX&B*ddb@49kRnF@?CSWtTUwe% zF_|r9>l32ir~#WP&>JXy=weCnM9_ybGIki+)<_BFOaN|HL3R7yzF#={dTyw%C0+frTLG-mAnt%eO>?rl_cqYbmk6*U{{yp;G zE|7YnCSa1pgo|`b%K@WTxE;ytUVj}FAYhZrr%l!*v5de&aDSk*cP4(~?QgHvb zH}JG^WU)s;hQjj)4V?2X0Bv@IN748t&V$WIzkvT9-<)w*#8|U4*2$_fav!0<>3p~- z!Hakh)vy@Aa0VxmR^jA-<_J>C7Rz*ZC(8<58GpiQ1lnITPgAsizTqT~ex&o#T-GNK9W!++g^JK|0Ede^Z1s7z)k1iF$++bpol z=sHic5I^y+Y4E#Qnw57PXAlFz>pBO6j6(@%gQem z5Vka%LkpX= zes}&^y9liW=Q@XEzH(UFeg$wxXfyVzJ=$p>5dU}XS8viiN3$MgOkAhuh94}XB9 zhPtazrTz5;2pdSi1^%$9qNPMIh7VxdFZ^S%jfzAMf=6+Ur7>~nCqRMBj=nvwq@rg7 z00W4;a9Ew>*wezNZ?CES|LM>!GYyVL-Bsnl{g7xL9|cAb@8wrS&dI)g%)Bl$>k)dq zU3R<(Z*$1x`O`z*{o|b~92XwGIh+Jn2L?5+hX?HsiCzv${uwF+W72!GZZCC!```5q zE)i)m>{2qCT+~~N*xC1P74!!wMx0Xim&W}WC3z9<*GLQSprb18ZqJK}nKPp>asfS! 
z?B8CAqsH(ibk_IAyzmX0MW5h@LiJZ9^}nO}sCe28CM8g()+~^sPQYALNP@CyY|1_z z6emI|GMbzUjCnQROREM%!&V-Kh=c&%8xf7oVxNQ`NPcaNe zA{NP=Q~WmMLIQRld@Cd_z3E7V9abzWRdM&{i~?m6!G5Q2mFz~{z)pv*XR9F*mqnO_ zP8oQa7J(_tt!Ulo8==pZ#zhWT5lNGXUOB9IX{2Yo^E|_1FLiRFE4jZO(L51qEi0?| zkYB>mZXn>Z0WytBRuu6+nqzgGYsKpaZbGhq!=Cg(sL}Vr=&LiGBJ@oBmZE&A#dTzK zh-P~?bxC$q5arH~?%x0Jow&W@T1lq93KkNVkFMQD6ZCO{CP=M*OA0H0IH^QCf3uoq z0HsY_jtRePD$s=ZsV+#-WSDk_DU;3PTg~|s&KypzlltF#q65stn2O56_Mck?QEYmK zQw)ORSwa{pRW-~T7*~G)yJ79!-vDvC>sGH?w*(0ql347YbPNpe6_at_zO@|jSM=;x zYH#HHruylDsqzl*EoR^G`0BDatQolxllUFi*+PXPm63Hj14YfK`)N>y1kw3w)zr=? zr>VnDMp2|y3^!}=nwKLf*pt#!8MRP4ky-0#@s8~04H1v&)U|5jfnsubC-jZePnW8- z2A}kXh8QJ}F9q}1Bo&ju=wAVqcrXdmwo2-B7F0!~kWdper%qc$cNunvZxKZlZ2w@( z!N1UoQ*RR`^zsB_Xt5vM5kbXTs`Yc7KWz)dYbABW`1ggM-hHG~fU@Y>HpVm+N&l+i z%ncEsu0m}Z8--5lUc>{?9Lo)4hS~%d{B(3w;VgZ-pXP$5hXGfVHSdW!JKn*c!Y3J# zmIavY(b2qess4r7NR6=Y;~8;a8xU_U^X-sR^1ta-GomMi`jIXm-CCsD)6za3ih)i^ zZ@xc=T4ED%1*#8zQo`<}k!%kgaKKXgS>xrq^erlsipYdsdb8vc=z39qxb=bZ_4z=Q zIynORJA?QrwpUo_W%Qc?Cn=s0@88ECBfq9llL|+X*B@Jj49_wn+UU2qC_D5QZxt3s z!3G;*Q=1F)RU|P8W6j;nua_NPqhnw$gDoNHh~9z%LMc?F$g6iP^>{=&$br|wzQ-XM z!6HA!6rdrM9FJJZ0#$dQ9XkEBmG(sRugqKAuXAbUO1C3QRVuiq8f;(GvIBYo_rAZL zCaG|~yH)mU={Udmo-gjOkUJyA%{nYWOQH7VvjC8bL%>1g%dSG?jqM{&?kvBr;%2_u z8K`Wn5-y-N_P$SfBW^;1w!m1MF!wdlKQ-2(^Zt3E#X$d(Lo>}jkwSFJ;CgN-w1;x{ z^?ZeX?3OCoi_h>vmspTndB8SO6XBaI;b<~JIg;}sX_SW+zoXy{dZK+Al43XC_AeuI zWX}TrY#}DOqgnqBJ+bVohhyx$(2kE_(xeQS=}bOi*I{+s1QEQjmXQi@bJ~iAVRi_ z1_%8e-Ow>HAC5-JSmSEXen9Zwp^>~Sjohua;^2eOk3CABSncjDW(#g??cVU^6yl$m zvD@HyuJ>`F1)`>5jH{*8N9cuNsH`JsbJ#6Y8|RLy4L2BK?#p$OZlfCERk_D{>Svhu zp>LoPkZc;4ytf=B%oM8nVl2}&AD|biNi`QNgx1T{k44mPMHuGJuEc#USAnq8i3=dM z5*D!Ft{7pgB0QJPoL=(hc|%h#JS4$5F$DX_y*a=o-{j11@wJCt+^#?5Z&_K?JP`*A zjc*2DQp%i`Me|Tb3?*;FZ6@|?;->(uT(ViQM2m_&pf&JW_vVcdu*=FyZyCZD;=zJ% z6YsccHKZTfNfCdF_;24AgH2Qiqh{cj3!9ks)cpCdO3gwlP9u(cLL)&XoX?yMjt_NS zrEXONmBjOq+6M1|)PiD$_~bScTwxPwt75?ZlJ7gk8H z_%jh{lf2*^E?eZXTjTv5!p~7ki1%`6D2|6JiMLdFO#?h 
zrclSfq`dE1O0?C9{UlRjl8@xpyT5lfFbh_d$MCsyzJNqDk`olB2sV@~?;hrt>S5bk zKiQcHMp*|~PRMv1lIpX&JCCHaS|}u9qPCXCb5X8iaUPpYa0n-#tW<#(lUYHgv5=)lmcQDk!&q~`Q6D1P(t@N-U~#^ z$G~6=!YlJ7>BU>%^Sg$SsB}^L>+DC54Zkqn!s;Gqb+vbH0b69*>I^ZaZ@doSue!6S zEK(dC_V%GjOBueN6!WGWx$V*UwUkp56|*%|jJhb(!e}tTA?CzHbUDfHM25hATXpc! zaCzzfzNBsw&I}sMG&}yt$9JL9=Xg#^kpb`Qgny@PYB2O92zp?A#T%&*{E{inJZv27 z*r?B^+bN(4^(-*6Qe3@`pY z=Tt9pl8DhLTQYAYb#-}?Q-$McIYPvrw`Z7^nVFL{tlWFms#l|G%Y z5Ym?l<2d#iTMfsP@Ves)t9KV<+8*lUEXl&dKdLlrcWLZR>e#dl~wvD%83nrUz*V_{c6kyV;=*N+V`#N4_CePT>ny9#+6l zmWgsq=J2XNa8*q5TRo~-yS1Yokw_E ztjBP}P0fy1WJEy$nR?FWK6askP)n=b;&y}X8}Aq>ZGQ5WE#D_&iW{{!*IlY7O0$2; zD;MT7>An>Z$ysxq1;Oe&Y<3EJ$i;U{M(GEue5pD79;Oef0@dgu5@x>sMd??~^%X!@Qq}DE&6Ae(FoI{(ux7sJqZgm=< zKA-y%VhF?`&21{~Wur&E+jGSMV&M#uFJABe4(QKNQ+{8EH3JzyIoxh^F|~ysDJM+= z(iVw*7`EmUS)x!T=jov%N`_iPILGlfMiO#Pxb|(jta>hUgE;ioc1kkkN{^`->-)q8 zU;uI4`^RP$UHH`oHp$r_KmrLeM0(DdmTLX7g@Z=IDHiJFmGW6aXS2fisgHnSmk=KA zl~*a3g@>eN7lDunC8;F~N1XHZ{-o*DcV{A!BKXg7a3PCN;njg`A=hNI8&yOgIl|&e zjIvQcnhH8X>ZZnc>3b3pP4A~(E<@&gRl%mNx5)D1Wt%ESaokK9i<1uGddZ` zs#%0Pcv^4k@Zpy$5&n@;Klotj%(%qubD|`n^6HXvCj;4&#Jii!}zq8 z=>vE=uIe#6l1sH6_KT?QQKBNE^lG09$2uH5+}!rpN`3Acnt@=DYgc3tS&O?q3})jo zOC(?ir*PYz`N*PKEMR?0%NJHFBg{0dA>YpLPY7OgR0>loh6FG{DzcULpYTfe~VSVQ!xae>0vo1+bVgGH##&8j8B$*E8dJOpJ5^aFqfz6E*fMXf=e;q z_4Az6h<5!OX%z)liWy$ljYtrLxIYVz%V z?hvnk*~#3*CwPY@Gc^;#5Jhqk@WTL087tQ;#}U$GFx-*A`TQ;6Wa7rD`dx^dr<8Y- zYl9$$I60s9c_BA}Mu|4wW0_-;pV!rG)EPd<-9=)7tMJ`vr^tCj`IJbN-GNK6ou%TqUw+pwm|?9(U)a7INc{J2ghzRb1apCbx;MWTRgg85w`cO_~zs zu~g!YySy7LlTq&&qUl%NQZ4kTRjt*a1&9w5D`Ur(Q4=Xz?YFy8RmTvQ^?UH$4}`b4 zn@@^~v(-b!h8c@y#;tH{DN_;@rSoO!I_lgr3#cLv`=#wO`p;xX*9@xF49#e6vdcHv3+*BdAcWY7Q2%uph zJkZeX8h85KS0l%E7r*IuApONhDKILOKvbOISz>lVyn@cZtT1WzhX&T$yZjZ`4`u89 zuV4E-+JzOquHHO5e-E&>&=bTn<#vkI1Xge|z*C|69&_yZQe^(>umx(*KX!+1_m1+HBi4ce6HccAeSY%uSnZx4E<0TpKr= z_gug4{eRf22XoHMNAt#UxSez{kb!bUPNBczB>r%DdbW1Z!n@+zeNH=ltu<^{ak>FE zpFGfJm;8*i48)<7An!S;;MYaPy7I=SiHus8kN&rdz(>#B%yESo1nd%N_g?Sw@DZ}_ 
zLe}pN<7%lSotyr(4_>0o$F~-?TvfbmlXbms*E>_uR1xt6Pfmt?yOr&bl_N_L?s{99Wwyk_>)7MIm(P(R;~7xfEtfF0 zL!e!$mA0U2!#koBO(D6NTo*Q4)~>1W~<4wj%y>69rUrlcA}UOzR{K;V)i4-8Va*dR6J>VU6i2j_kxQj9Q5s#nI0*_? z8K$ZEXXU#0FX!JZ5#Z4!TJLKRUq4Hg+&|F$nu<`&D2bq{lDh(pFnZY-dd;|g z;yPw5lTV3)PX`5tyDqf-Xtsi{Sc89f376}EyS{RLm@&rtd`KxB+r{Okwt%bbJ@549 z#l$Iubkw5R8n3EbcVam7@ZS9*2N4!E9k;i;t6xGQ`kh$+r}5LpQGcK5tAcin8896I%pu(3G@_G*YfgtlC) zA!%`&mq_(xByg5a{1G6dFTlzaa$*dez^{(@;rCu|!$0E66llQVKZang>UBQQzF?~= zKc;Rmzp@qhq%rEmA5HxIM!zaAUS+gFY9UE`&=<6yxjU>-GV!`;yl=`kjA591Y0$dD z0p0cOC8F2#80yr+NH+7N^1qw%F@^%#w}#b08RG>xPAIqG8d9>i885HII#1C>aCNz_H zBucJ3fe(YGdh@mri`viY^1zy9hem)6V-OFWABSzB*=E^gpg`9rJ@FXgRb>hPE)dY% zHj-d}@v8>UJm-zOLA$qhQJG*ahzRp)qA41dxYau_8g+0Q|$uJxE_}ds-j3 zAtRIk6nxEvvXPftE*&|3D=$X}_eGB?7^D)IY>^eK&6S%+XbQs1a|ldNH>g+FO<$ME zbIZq8e}IWQHl3Bv|7y0Y?yuAPH)OPgu)f^fZ?ynEk$g$TPz;KLloXd< znnePbI>E#l5ce~!&I(5mvOuJ;Z)MX)^hLENDV@@$)FC_`*2CKy26q;e%{u@^IO?O0 zsSXz0qeb^Ste<5+%Y%jvsrTyfc;~#IVW2tKDag4I?ZJ3t>-A|vFXR5c(1Pk{*rV2~ z5*l$w=)BgFRP$(Kx74K!D!1=hY_Pw`{~mq(v}VYMh}7ogwONz^9ST*hoJ&f8!(*Ne zI)AWIxLBBv=$?PMZ})pGm9C`FhN^!|7ndF2X)8S8Jvqt4gz}0!xjiP(dtB>^)sf$pJ-DMoY^ht(+EjtPfSC#+w6#l!UnKq)?{(snJhs~}hz-0xT*u21AIkTt^F~LWUWbc8Z!dK zF+d|tOhP-8$>%iI_SLeC1M-N!&~_u@8;1tb^WVn1>9;m6PO(yqYw!ylp$$fB9N?+d^GMuaIKF9yCg_eyqp5V? 
zZqde*;M1(?-y$!}^vQA~v^I|kc;-S+Vmesnu^&}qNyE}rxD^V0{)n$_K53j>PW6V( z_T!AM?MMekMl^C2c6U{s@2fs?;N8Szfqv;W!m-@9l{qsK8#S7i%x&WU=o1?L^Y4Ze5*W_jz~E zAbRd99dqFY{HAEz2f5I4gfbBlA~v?);b5dtt0T*1jk3pk+tsn()#*rqU~6Fehar~K z*wv~oGT3$+!fpRdi@Xd%n64*`n^ifpv@%)XlOkrqbMgOvjA2qA&~^;AL(!E55h)I~ z)+?kC!LNVm4}s0$b)+l(LHkOu3ZB^Fk=pxk;x_p}iG>#Qdhuv=3q|zsoR;}%HC&3g|Jw9&0T-fTosXkB#EGk+mCE?kHxG9rZz|DU- z>>+XP-s}|Lh}f-@eK7GuJM(=!g7n1bN58O)v$@z7W0D-C7D)E}f0Hc;Bqw%;f+3>hbE^2`tx93rte!=wh{nZGR@9XT9auv6u`i+k{BuhD-G0)(p{&!!; z9fs9*+sK?8zd^Wpw_kkOpZ)Ut&cAmOE<5|bE{Zy%o%N?%v3dEH9tN$%=e^4MY^Ckc zwJ&}Aw=(@VtNJ>y5_`M(Mn>cu0q7&p-&_DN3ISa@qjnWwjbi52EQSSuUzj;%&CXjU z++)MT)_{64Y}MRxyDxY)DFTIna$c`>xGK`Yeo~iZO93m&@cOL`jm@|{u%$DNvEHbG z(T_&3k-6tD-ovMIhMoCL-A2v$Y~BBj=O+^=f%}=g@&qzI|Lwim(L%X z71X88RX8#f!B1c7y8UhsR97*17uvpws0|qj1Q@4i>>l3q z0YDg;kT&6GbaND()tx=nJWl$K|rH z@-&P6w?MclL7m&T$bG9EQ;(PXvvYP#_(8nH?U+r~T$%QNOQ5+-Yx`R;eYQ08gx z(;t@pT+VEzfe;#^c!ux>dOHvYz%}LrEtaG400r^%v3mro9q0nn`PY<@aqj(tQdS42 z&7keS;~j2_n7>Bq@WezTc`ffGwH?&|*0c(&dTT^%!v1q0mbdZ+3G@(bSx0qI^e_4~Z0Q-_#s})miAH&;IZ<4uos>^*&X&scr!5`9*w*T? 
zy9xs4MhotayB$?sIWNaucnk=n&C9tm$nPoW)=>V`nXoB>bu(O+&y;u2Y1DORx5Q5l zX)F6RaK6iJKWR9&t6>OBv&y3=@4JMA#{ET4aW|rZ%1q$Aqf}DjfrBQQzd_oKmlUpa zJS^5Huz<#Oc)?3#cZ&6TVk-0TYsQC6d_F}^qNkq~kXZpYzYi|8 zOOWJIi7zN2=GXa}jTHXYO;10}SQ--gjwz=PD9#5wmAQAW%=Pnx|WT$A&U zTgFjdMb2zBfo?PRajR0Wir+~^Zk_jp@8P+A8YBp3XT)=gcBmIiOhR%`+&*9$za%I9 z^6_YYk)Fn{UmV=w@9+OyMhsm3lEyK~;K?Y-#RL5IfCKs{NbCN6SnqIGjU3N%cRHPM zyVvx|{q(y1@sY3epQES3WPnL7#?JCH06&`uP}fff%naeo3;Rk?IQ)$rxhbD0q2J$B zs$?uww^>)i?qYt8$WN*ofZAU!Qhz0UeHTrD5yS(z&;89)O zqGu)IJdON?8!;7^*qW0f+Z4lQqod-vOD6RvRyB@EyF$Vfslje@6f;`U3C6>Bv!}is zW{#S!la|(ppFd)CHA_ZZz?-oKNOol}DSvx$uH=3BqgOmfT=)x3wHJQ$=u5JWVa3{f zQo*OZZlx?#_9`tlU(i zDk-$3FNFs`fzIw4W4b5hH`mfKJ055T_cM*{!HUbv(cipv1(rO2{iSa?SUD>F<;Q#8 ztorUR!SDpl6xP^g7)kF_eDjUE^jdXBI|eq`NqNj{S~*EIOX-a1Cqo>9()!dF`N%{e zlA-Tizmnz$EJhD!FS+1Uy+3i%v6 zd=LXRX;921pAb2(NM6HD(!Y61StAc4f1@=ycz9H}3i9CWHB3?*&vE(m^q~ciI5~oo z$ybqxIA&g6&gIQ<#i%Q3+mXp;ayK9f_z{cw$#dyZ0Udnwmtfl(c;d|*jlAi zK%Fs$JUp^f<~7X7-$r$Hb%jcJ;4VGT-`}G)1Pl6Z7%8D_L~Mb?FNzY)Mp|HnwlP1- zKeVYrM3^z7oS*bCB61!dIF>Z0%)Qcek%Ta4_WqMqF?UG_8p|7tGfP!wcxk_$Q zvaIA-UBR_=beyDqWh%hWvpJ~N^eZ2Gk&O{vLM)~MrSLl!%H$NvdzZLgF7svbAkav+ zwGUIv{9!vVp6aewx6Ee_)T*cltMP4;s>ERG_S7Jp^>$qaf?oVOaWsbXKYMfitZ^?% zDNr4``3Ng{oCT|yu1L(F>3biiXvm1tJCeAJDhwN_H9C3DdO~GuIcbbd(opKWuxl7j ztjw&=?Ze+a_S*XY@mIiObnb7Fmu|&gq%lj`eeVI%$Zd>aPU_0b9Lg1Z&9_dmdEbPiGlZGYnsTKiC}8N zpF4IuP`95eTVS3|#Hg^%LH{Jp!l4db_cb>KLoi!=LM%GjO{i`wcimctKZpB-diqzp zwyv$4GMfgwXT1XMUkF*L3|i9V0l`nRxN^L?bl?B|cY)e4*b4DBZv1G_2VPY>5Q$$+$jY-}_MOgsKS!ift zqc_>?Pj)KLA=Xy!9XkU>bCPXH5sS9#Qqj+!tNFONNcWikFdv>xzF3$$uh`m^7U8=_ zuLaHSW0~BOjvS*DRY=-G=jIWoZrMQ_^KxbdV<~l~-tZ7QnFBm%6gss!10$$i0)*F< z@tMtbt0%W?FcD$v;ReZ+!FW_>?VRsdBc@EQO2rt&PM`aRV$1faB?qpT^F!lmyRTd~ z26To1ZoNjc{$IM?jGRMfcf(G)4|dhIPFFiWs#5&YkkLZhqdl=^`D__ZGDe9!!a|$& z@*e`}kweBDF~t7Tsl592C8O?`4%>b$B#>-g^{9FT$`PQ9Yx5qe{d(cl)n$%H;95oX|tr z*6xy>lAl6<@6hrLYv^b3=QP+V;3I~Y9x~7Qo7khVAE>*al{ME&Y0?s_$T@t&Y2SEl zL3*BCTt03*?P7tR=Jg>^kVyB7cBz$P`^rwkj(AeAwf|%Xf3_IDew&}s64p2SR6FvF 
zlyd;d8&Q=!0&|{`S6hYJ(CRMKvZ)=QB^6(SZi|ol_ zPQ#{Bj8s&kCqWibKN;0vB~jY-X$S=GCDDmuhvUXZOlxZ1^q2pN0}c4Lm3p5<(`@R#` z$ud-7fgW!J{5d}LNx8Kco+fJgtR=PLf zE*PzS`f{hFCVl$KQZp*SNIuQSdj#n|nS7Z(8Gf4n+dKarG0n4{GKJOKhWV@@&;%ho z(XlOCCWpG~IHGHaW2#94mkP1?PV$OCHSw%Tx_nijO=M-gy3Ikw)mN;?dqsvCtoV8A z31&-TYZ|NW;1uKk@+*l@#pM9 zoFhBh1fPxwxJE{4{Y9Tv&Uoj{0+{+`;7+F^@nWCay60x--HfzydFhSjKO5!E9}}%W z5B&VoP+u)YaTJxQ#i-9~PIX$7a4;AtexmX5xpZr%Sj$)$x32>BUbW61?;_arK?3XE9c08?D|NEv6lsH*i9x2HEy8jv&w>V$v8o^2b6OxD@ z{5AI}V=oekNZ({1LWa@C%BLaQve;4Aj8m1cw0`byoqAn)!^M`Nwi*uquo^**3qc2*cye z+;$iHKd*p+cM<EZ&84m0unfKgkjgJM*!Dj)yCoFG`6YaSMXi6h394bYh6>C zxyrKmyJA4*nN!{JItxY%Odsj|1$Y$x(uXdgjwKyoZBnsKAh#NO+|L@54e5vNF>VOF z6LPHb@`POeMB_B!QeQd7KiiDr@W&FW(Inbq)w^rxkPs3+U-W(eh3a^RngFT6c3tpb z=~w;!I=!Ea9iu}k?X*A9MEH|QPg*IgVA+6788Vk(y}CmP&S$UB-AwI?&|c6si;Q3y zA2ePI5P-Lvi+iuq*Fprm7+T=5n0b@u<_OW(1BQ6uVLS}b$j-suxZJok)Ffp$4~>!F zwZ6;4dZPWU=!f$s5@yhQGu4G^M(4>-6pv z2L()jQL>)OA6iyF0nhfW#|oa8vzwPEk~&xjiHL^!>!78!E$j)G!zq>;FuXQ@h+o(} z3I=HXq!FN4L&*TUd74pm39hf>sO2fF%r!9MK?hHLCn6Y2-B9$F+wUP0NzHQuZW-nH zfYPC)p>-wyZRO=&JH=BY<%QmiDv%65&4qeW3Uwa`5$I2dn(hUJEBT3uE@$9|qoX6h z=hkAsf`+8g!73CCiW2&EvIuk*%p3FgCu67J`}oX+9H0*_;^+@1PtfTiSEO5ZdZ~tk zlzKm}hJlK9shVMzhWb3r%m0o^gXx8qf8{Nveuv>jLRjkH|Eh(f#(>M?pb319ds%({ z<#1k7)-l9;oNvdoHk*l|>7|iUjZvTEBe*plj$I@6vri@+qX8Iz6vNnHdi^b$76R$4 zjA0ShWt!I{0v~PQ27grQJIF#6_+QHdADMKj`DaJ;KIN|N?o?$8X^cQ?=g#dQy6B%@ zUsGsdc@4VXF98A*IWgc7#os>)MHk_7-5t8z8SH(3J_T$O&0pp$fVLnjegHQPP=%!7 zEiEhryiQymp8EqNZ9j2&9L)lUqG-hvXOZ-v7~<9W>FK6=b#EhYr*V#)`nk8L!jgTf z&X5i@hfU*2X>e4ne+pIc8&bn*Q+Ir+eEN6HdUbQvjW?slsR~+P^bBJ4(GLjWLyaBl zfE|HBuor-HLx2AF$VNs6_)Gtb!E~q~!9|IUn>v4vcsOV(@!=DA%*2+Chbx9t6uBz& z7+4vySh;v29pxd+jsR+Zt0r^0fS{n)>GD5dK>qmgy4mAu@W|T{@Fviv3dl4BfExgG z;GZ|W_jU#&3+2q@GuzG2{;y9%F@a1poj^X?HEBIy3o}LU%Taw&>OD}%4(!EiEzfl= zBS=_k!d8aj6@PKEY|V@7!S#um7&iir0Y7(&a|cvuX@om)JgaM*`O$6BxniJrV^mdc zJ@4XUkh!S*`lpf?>M@-d!lcJjzH1;!i?B;xb2{w*tLk86mkh8=Cr%5hPU$4Ls|1!b ziBJ0M-0q$(yi}dQ-7){d29K>ykC$hbkNddugl5$z@h3|Eyk96vTZeHdnUiYEeys=g 
z0Z+299^od#kZe&uEQq5Y5V^$1*n9i?*rfT5n+4dlkXqkr$?50;9Sz``k&uu8ATA{M z_;Rm+R6-x%H~^FePax_D7~3<-4^wm*7-T+qfnW^a3uHhOm=(Hp`E_-5MyZ5jby#Zr zvMsK*H$VpzIVX5)QndU)ueE2nPAe`}lGnVLQ-o2>X`puUD>$kQk_a(@nqgXtIX83G z0eGF_dSc%$+m*^aved$}7lDUqPQ`@46HBQLrbmW6Hl6VSN1L~)4JS8S1F#Ww<@^|?+W;>;xjQ_@LAIr>Mz51%e!IWX`bkETS@kY}i zFVLEi;v~-6CqvuJmNEpav(aXYN3T?nIJ^^2ew70Q!%1Q`(WmXWwbfT{m;+tkPIVZQ?xJ0~ySt5%(p>8ousv5e z_lj;)a$LK~=C2 zMKBPC1ytRtXXO!GK#hlJF2N3{!hxW7Q@F#tL%wYZd>=S3Tr8Cz865 zQnuZv!_Vf234ckTCTdB@0%6~w_a=u{Vq)sN5b zx#J1Wj@nRX7Lvn7gY$)lPzL_eF)WGDfy%5nZaqV@0ggKeMUZk(W_*17K64*?+Y)e& z0xB>DQAWm-{YtLUkGDFB zEG(7+`x;ke+wZj}fsH6jK_JB-yng_XW#KC|64?<;QY)0x`I}z>t%hN##uA|WjCs@a zz^X1MkR#{Ur{mA#5Q^Lhq<)|^nu;Z>e$Kwfmz7RfX!m zIGm-|=^L97XaNv;3O)RBCs&{j&7%e^<_J9R!+55|f}YGAeUdDiO$N1 z(eX;>;i)iuq94GuqC}37tuyIfYjKQCSMkKGeE(~R*eAyI*NhMjRs2IvNy$p4oQ%z4 zcW+O=u!|5A)4WYUVX*TBfQsTL&t;<0&3w{L!Lnp~eK+tp`nIu5JN$*}V~JMidF$i7 z{KK2#^TzG*Z!%aDY5>=>eq78YY}jZCo}s!KJ{T$)!V~N*)MS9B(-qG)^%n0K>Ko4p zfjA0kR7;-D<+_;&ootA1aPyb5y1ZoxC6$dF3>1up4c26GW}d=)Rs!dwmCU7;8cY!S-h3i^!>ePIQM#fPEP6>FwYN* z@H@fEZXtu{dVqfBIMyJZu!bm`YL!NITGt8W^}p6%8!UimI>3>BJZGxS+bAa-ujP4& zHH|+`c5TVKUyPnQe+S&&v87;}&ra7U$3Dsbp;PA=&jXqbP@}NDwvv^n$*l7PD8_lE zyPk?AeSYHkuPzrnnmF;IH{H&b|EI~43&E+@!I<{IgCd7C`3<0A#t4943&6J7pRZbe ztc3iY9AAXM%542UG-51D#hNEMmNl%m>-s>xZ**|X*q93pcRx?Qzl>CW0e?%k*$UX* zD*=x?>0Hm}a=UEGFRuo*2L}r`H~c&!Yj%KC%Adz~I0J#N7}(#FB(#vMa*JJsH+M7le$+1m^PT;^pS-4#~vVbU~ z_~PN6U?3aN@I-mk_n(@P&#fD<<>TSa5gcCuU82!DHf$K#U#1*_-o>)#w_7=KBqlFvTZRzK8TkHu$6 z@O0hpf6ZuwJpALf;b<=O&DsUM;a@^+`On-q zP&|&Cjgr++SWq8^euITH-^ciP)vwad>9!!YC9=TRtBb)v(_RL?@P5xt+i@4qfjZjWH`bs_=83%-xfSHjg zXg=FNkC34ka5P_vOhwB?|Ht>2ew;Xf?UM~Xz!gD@ms1E^VGhQ7dN=EM+)4{nC0-4r zJEC~UP<&eJ6j-HDf+-T(pG&{}f(;}VttVahu^7Uo46pC_W5V}8(TuJRH!HU0|F_XE z`4HO&zV26k?+o~@HogO`s16+&f6ZnH%Y$F27J+v2Hmxm~DM}3ir&S%35ZT!TpsPDZ ztUk>osgU^$@JmtOKxMYD#*eoRjkG3>sJnd|3UHh{_~zE%$-JFTEC4lLS5B)}eFX-V zI#pQ=7mrwM290tR63ne`Kd@vR&7dE^exp^y_|cI6M|~_ zws2&r&Xi2pdms$)i-S<(Z=oBMOFC$-7=$=Jb#KylWptuBOb+#xW6V$^z>)Ynm*4$z 
zyB|P+TA0QNW1cL&3_jofo7K|vrP<8C7oqdO*? zBZ%x2ED{L0F^GtYqhN`Zjf>2L%m1AuHv7B?Sti#U09`3||@xa0Gq zLSe@`4_(RxHos4`x<`!bvf!(o1O^!y!}VU6{L#FbU@Ag->-^X|;i5BjX&~c^LyD$>afV7 zs`8*oKZOzwTE)>w(OSke2|c;_@|cytcWj% zZP^7k)9J?|mtv!AZzjUwK>0%k zSq8_b-bSDjS?R}-iN22|1>V25^y+;jY)%dlN0%$`{R(ML4Oy@5?@-j~WoKglS zTXEBAxn_jN`Rg?>3Ak0xpKUG%DC2_h)eN`Jg!zQ@J*E)@-u(oRG_IQ&H;yb zlgTH8KMMngX`>LeQTV(^yQL~bcnql+XxMN05jgx_E%j^Z!{ z0JMbpvN~Du%^iKLCQUvmJXT}t=0$bi3F;K?O}zX*BRy4oz7p}s4DM1zXgB$1(RY2p$ zh<)vQ$ZTudx@*6ufB#$DD%VR^Z>iFs5Jx%X?pniF#r&aE7}1m(q%reYC!QB12t6rw zjAfY4RfQHSP^7eMjDXFi%{R$-?FH38D$eEO3mTs6^)zVqT%X;T{i4I` z^ha;T+Osfq`CQbm(=-Cbd^~Ht)x*=y&x+;{VCrD!M|oB(QZLNtcNK*^$1Yg2pQOn=tz4Pgrs8FeEo#zsUa zq(??A~}P}!dk}HSkr(~C{a02AZ<)U*fb7Q@!9oy^t00@Ru7KYR{anK%@N|y z3sg#3&$;+bP6sL6|33L*d{K*`8xm9GS$XaE9i#&foV&EcM|2ZZ6mX8+G zW?9;@>d?>?YSuIRyh-}{-uByT@v=wIci)9vtASAH%h6bty9mqQG@I!O>X>i5q+I6P zw9ms;g+N&3b-4P&7OMz6yYH4tU8p424M+%?b^xyhh=-j6)zf-}3>Bgo0qY+o6z8 z)jH!dl3^VYTOv$L=@|u%LsvXSl(zkBB-fIBf%Bdo_ z-anAJZuEp25vMi}c40s+5!90@@0WocP*NUThqQ@VKc@GCcmCYT%aRjXH$E_D3u7zi z(Ca2l&dgmB*7}yENeAVe8+lxaU6ZvFENNlCbhq+_mfHtT*!Egiuozl}_4N^2f1Ch- z5CZ{7iK1T|h~w`fXr7*gg9>O5-`Bz(?j5$Gv|&smjP#_rG%?yU9ES-j&w=>-nELrf zXlnL!2uNeSrV-@!^moc&n z_l9Lk1ZxpKc+o$G8{sC|hTOK(wC?j{1_v+;p>IFoFS7q&e+;RZvckvWG%+)yvIXPa zP;mu==rOA9ftsuuQ#auH($rjG5#N};Vhxn!Et;Xx(nNdvY=qJfPBq;r%7KE5zm&Tw zsbDxzVKTs}Sf@|HsmOYb#xxf+GHl6A*bp5>)%B|SyIByB$zvv&qj3j`hr*d3&~XQe z);VdW>xd1I`Cmes9RMi7vN>?OAI{0=!aZlAWk{5{w@jGv;tDKYN(~5^$E@@uDrQw5 zRSxd@mLaP|vG)lP$@0vReR8ciFt>z+&fK73pO z3a`NXR7`kJM~ypC`z^j^e`JNAJM_RnAT{F@zQOtrNa+oyTw(UWF(J9JU@#Ia;!ons zAIMolfazH|R0xVAzEP9!*TmJBG(9It%F3V-Fvv0*-*bt16O^%Y5xbNlUpP{u^+|4MhT)Hik9zaD`<*BFjaBzY4Xo7{ zm@P;s%rAJ~eA;|Q!fAmRBNfjk)zQfiSIBl<IY3SE^QiX|u5Ph`OaT?uVvatEX-}%9EvCCZ$)tuXK+( zIySq>kIqdUNKBzAy*_2oDa4OVc3vV+ZwdssbgH>YWhnrZHfRO3MJ`(G26?nkBJ1~? 
zepnNgq(Iel1PrihJ|6YwZZFLwI(qC5A*L(c<`G+yqdHc-eTB2LU zREWLK0j>oX+_Er_B{YzK=$X(fiG62kISj z94sZH!vogLr4cb&Vh@v)D&km_;4}hX4k0JTCXoumm2vwbP9V|LgK4Vv@V*822i3yM z_c55^qUFHo%gc`>E!Z*@f)@V&JhHM`Q+d_~4V_(bF^^LkkLR%yL5p9Jc72U?rhIFj ztFR~E|6>94$#CfI>4@u^PgZ9f9p^K<;yqwa_spc@r<+veW zV3|mpaptp7Sm~Sa!q`2COiv4snkfQt)aku{DeZy1oX@anWJkBI< zXhe$tQ?5#u>B?g8tG=ZTqJ2n$8^^~NFW?T;;{YmO{D{ghr*hU6bf651%*+}?wd6(6 ziIU}YeHbK*oO!g-74Lazv|1D+O%B51Em~!+t_S_nOUCR5O6Oyh<;0I zOx9GEtJ*zIp7hDpsxL(J*1Xe5@>_1Ag`pnx!Qfa^RS#T$d=u&Nm>=$!^o(YCMnMd3 z7RHp({uJg^bCHH6-fAV+;m$?AZLr;;+^HhrWD7|A zUHttw!xh$@eS{Epg=X@L1#Sp(54fl8t62`e(N;{-S6BzDtDj>EXnr$dOG56u6~I`O zZb1ABtR|LXoGQh22TQ#n2 z#1MUpKg0o_j-dmRIpU7v-c243@CoiFw1Fe~;Tc7edT=r;dr(hHfnew$Xe;`auKAYy z94G?3#D}GbEoD#dnr|GV08+tE6PR#N*M0Iq>(jG*4v+OQ+Dz|tdmj!HG%dlbQHs9; z!DHr{bSkyR%`LyVHa)C>z$?_>J`fFI-_GECx*>5%ATi-jdH7Lleg6Qtpk`XO?5^s! zzw_4Y9}Y{$-{1v7)y2jUN)GtPU3i>gi2h9gK84>p`4=d9x{;s{nYvz^8mc}3MQu@QVnXh}jMD&1&)MlHQ; zHMN67laEy*!|DN*1@i4hmNIdra`kwVpX`39=QG-_{9^nLef@_(3^6c4BIJ9FyaUyf z*h_LAarBcA*QZ&3??Matgj8mULR=@K2g9rwELlCB%Y6yvPmlh=a8!gw?WL{bpal88 z)}$%kiF(#_Az+mqrP<``a@n1LUzfo)_%;3h{rTl3ja0y53|+LQublK z_5GlLg38>$u$tQtqn1$-uf9E7++r#Y5IL@w=N^Z92*}!!{!~Kj9V1qmC~@*bp^ulw z4A((QH&d&l6I|LSQ#!0-r+=UDK(bK4{Tt^hm@v)(X?u!MQT`wn1~?Dg>gLmAuX)jC zHAbFIUB{5DhCxd>Oc(ZEN5d-)*a=APBOv{vS11(1h$p{^%>+__l6*mh7&;C3BT>Po z@PtJ*!l1bu7pG+MqTE<`a1Xc3AN zo~_iGt|>CnC7V-~C%femk6k*Qpyb}=EN*pJSLdYUb*jcGp0C(% zU--2=8edwPS$qy9{-moR;g(*H3Hp(j_=Q}x0V&*6PJpu7k;LTe@g;sMc-kepj~WRz z{T4-xg{XDB!DL@SI!MIZZ(ByQGd6a8@}CKuk>%XyrjfCsdx?NF*mb$f{#vVh*3{hf z{LSjkj~dCP-Ra0KnL(gZu5lOdJ2*-)i-G?vcPu#aOjLC%2i&P{)2+FujR=l`Nv0f9 zv1{YlNr4?f9h*Eq^argj?Eqa!_^8{1!On!eQ;if$*Z;5tJ?aeFYYe0>GE9$2ZrK51 zmbls{axBjBh*@%oH=sbD-Q4!9m|n}CKJxOJ!oQCt&A~euRIyPRHlkvXr=9Rh2f2_) zReete3x$fWDyar7CH;l8PdJ3sq0r7h!=)KeO%I$AkFIx7yP)bJNbo=+^{-7oQ;d^hoM*R5L!dbQ*Hn&}^^L_2q1soc+ zFJXe^kZQf9eiVL9R}Pddc<1h)Wg`6AyLFoDV=?)_cRd${R zWMp3@&Cy7DdvtFyL^ zU)jLmmoSi>aLNE56iQFCbT0NRZE_Z5%2NlIfhC%J`fAzYv9mcl^yWnalYI&Z%oa9r zk19n6sUxT2BSs?$9@$Qhw}=kZ1eMkSRzU2 
zwTh{Sr^cib^z3+Pz5tjAmgpE+p-(121nn}d#%()s_fcKp9-l8v74+MbA--GV{!9*^ zhe8r8=LAwGjq8>8)5=Q5j{hVadW{SDc?c2^hxIWPm`IW zt^=*=wHD1_GnibtdpL-;KIZUg`N9Ay`mCn72tY{XaD}(l1^!{3VF5TdbQwmZV6l*z zF!f88(Y~bodah4OO6yo_*Zrwhs+T1pz}j#XYh{J#BTRtIH0!CkD7pAxBbblD;1j@f^Mu=odl z6wato{DR4o%aa&l_|45sryG~1q;8a zaRGOP#hf@Yi%77t;gImSUzv#eOOUB!K)51RJBEH5g$zOZr+tY&J}~`Wgf%)ZQY@+; z27v=mzp_EfN4+y|qiiLhWeCqZp~q)e^Am>Jx}rXkpUjtl_gb!jIL+VK;1zk^XVB_AU07jMLBaE&pbh_b!yF&2W041d7GIlbMFN4m>u7)og*4%*CxS*@$aa95g!a zq{4t(t6!$`2?3G18ZFuVL*48S`w{{F3xKce(s2=L5E(4OrQ-Rs1kKo>ixT-~)>|17 z)Lk-b`F*6^5Dc+H9O2xYTL#c$DN*dB8GP)7SR1s#a_Q`N8 zfux7-x5=DLZ3R-2I=30r7?$vP8VcEyBtM`yr#Ls`J5bTHZr=J2`NW{Qsq-i2culC} zpP?x}sX5FB1@IW7SJy`lLB~+~DHMP5;0U}HSdM{-Hl5~mLO=jKUY~SbMVxMI@yYU= z$m4vS=1Q){Wvmrm2F&N*XFv;(ljk`Dla-s-BG9}mL;>lY=k2W58z5XmxswOd=QTn# z-$cubGPdR(=h*4Dy@0dN6OM^h@$vH;#EXuJ8Q4t0|2xN7O%=K##II5fQeTzv+4UI< zpqi#2wCdG?reTK?cY|}D2UZMR2$hD1tN=uzi8g^MppXfLD4IL7J(y{Hrv9pkCGrlr zAsQd3<$2=;$`66G=b~OHv!yTFwOYJU7zm3bHwY9{^Rz4|FoY5vE@Qu!^6T#u&Qyic zbdf@z2O>4~$dx)!^d%^{B{Py*-Uy0IN><5EJi-tJov4 z1B^cXP!oQA(h6PE|Hsrfhv(HTZ#Qb}G`4NKv2ELEY&EuRyRjNJwi`FL?KI!c`+nz~ z-*r8Iq}S7D?X}m;S~F|ro_nILte-`C#Nz9;MtWpbLFc5!_!_97MaV>egV^O_cw`6r zFSgX*G_FP})_wp+sz4lk^?VP41;6p;dF<(OK&6Mb-ImO)Uwtu z3}_%)TE-eWnL?!+-M0dI-JjE(bv3M+S`i_=LNEvjw5rVoego_W^g5>6M98tL=y1^w zXRD#nIIEP#sbFAW_+p>`nxphFDV@yTMFZ`vw&4fSUuv31W>KceFtvYrH}NWy#sR>* zo-pUXAu>l!u5)In>_725$Nr44u=7*5pHVVCIia#kDZcR<695?A>xZUp0KE&b?=Vq_ z=i=u0;LK#M?wD+Aa*DTjv!%M!SzG7v<;djYMn+YXAj-ICK(bQow$Wf={X7N$#PsmR zviG=cihdc`zG2x8IG;1r+x!IHn~Iq2YfjuJbS}*7@RIEg{iE1on{uO}cO4gwCHj#` z)C&61COOuY+j?csq7&{XhqUM z1t5_}OFq%1@=f)Mxe;u*G)`qhQqeQf^K;cj$y>76fqh_^jB*ApxPPX_cN&hl3T_B>| z-jB6Q=5(qSM}~!j_2~i+P5w^9zH*lnmnAu-A z(zV^6Qsj=Ep4yxF-eh;K{xL5#!Q=h8XbnHMNr)~}UdK~x2{touF|KM)GAh>90GnxZ zyU{+cL)zt4*-~UvRJx_S^iHqKZz^5LOd_~kHa?wUF0(wG6}hM}DyDbUzw&idtZt#; z?*io5s$rjwBY*eqO$+!6q(mO;vV>WrWK$>to|O`~DG4g21Dfa_&anXn*DBs1Oatx!d!Q}Zcpd6*Umrh&ONHYhg316{ 
zh8x^eJSNTVSVL+`;zl0DM22LxJUmaaLCiNI)>T~c1zH3IWK?W#O(H28^bC40Hfr{fLg#S~DPZ--Y(P#R2rF8wx1(ZnTGhZ35= z2xy$kWRZ=HzWmxsA1@zQKNFDE@d@TGO{BTqUY4N9rqTT5SAP+BY)=1SfPqP2Vsm}4 zk!K=FByD4)Zm7)>l}2Wd(s8xivggQ}Bq18O4aFZ{uCu3@*XR@&mta1{Rsy=F>BM%t zdySw}xXGP+#@0wqwz4i-FH(~jj?DjTUpd|s0 zWifI2{+-IrxLQ+kWFh~0MFseo=FSvc5-RWWM4@HMPn*9+#dr{5BT4)n`N*-jgreSs zw6hx;nc9Ahi5;mbS3x7+vJa3XPUAs|v2!aPE~Too7rroVO$KD>+w(o%>7N8vSvNcf zz-htNE09xw_(9{YaYOk1ErF@_vHHy-lu6_ii3F+=Gg?mMs1`##gx-O1b)lQmXSnf=^C0ku0d2)ra)DG@@Q*OeDTErDO?1*6Imd5q*TJy?1HK^t zsvfmC3)x0b|CDsKG2z*z*vYDd>)mQRA*Cr))3?FtqrFB5?W=z>OLFZ5N4I9{nv(8+pvvCc2aC7}ka|ilpAn7JXfK@d8V6ML8 zXz!983)YYVIt0^{BRLWkiXfMIyG~>ShFR00{tvQ@FAD}6E8x;X#a49V2qC0DtR)<; z_!5%}DYWcq8lCAc9DN=bwP>x))gsWW$lt7icH88m7=)PDCRQZWF=1NfoHL zcB-CDUm8NJ17aup1D8j7Q9Cnqu_`ji2P(5t`8>jNh)Nzu1RROk3bz;TcFo`O62J#=_*M@Pr& zYaP^_-{r~_bab+wovvqESQ7H`fEV2imudyq0HAE8b^uu2>DrMeP1%%YZaA}8LXXsZ z`+<76r?C&Y=Yx)&G3!g#vNeI9*aD5upNy3a-}36=myLE_tOK1@j+*e3WaByY5KHJG z+4)pp1bdx>1A-`m^D}5+mC#vSUKNE@NczWz+!N=mZ9!K%h$U}~qMMyXNc5ht z2P8TN*EC=+=$M(GJfICh`>|>`7W{akTyCwZrbK*_y{QzjNW8kGR0bu=n6H{dSbrdS z$ZGGJ{B}^2PP*=N!Y)VgVM4%^c;filLq<&)z#6cNZ&7%X*|R9-mQu#CU|wKf;~#Ly za&?kLn~0L#P1JORxX ziK{%lOGNqMm0HpMbWDAuQN(B9?)+%y4YXj2v=%6GXwsIzyGu`4HsM(M|D4wgf9fcx}@Dcq9 z>DtkG5XWjS3WB5TG zxJuugOk4N$gXXjv6{JbZzpLaeX-J!|-I^VzS(3(7JSxfCn1-u-=wt<_e0OQqW zEue8NS2ZFM>0swG39oOZqaqOAR5OWn>+r(-$8r4CmPM9$cjdQ+o&U-zy=+`n&_2Aj zoN4tE#V0TpmB2Yk{`}nQb+_)^sUHV~P@rUXioQ|L>d-)W7 zsMZ^x4}d;<6o7h40|G?8Sh@UbJi6<=Rtt}N`dp8J_sh>GhgwUt!vEK7qey4nFTK4} zs`WUvl@k(a;Ek#nCToBP_$L4JA`ktEp(R(;N)T|%$I|nwNlxL<1$Ty>IG?9}6MZ}o zSf|g=wd+eCKY5l>*{9BH5m}nYyn2?+ZJu{Gtpb%OK+FVY$hed$93C?J@c<++!+WlX z05Ov<5m3Wk{RMoBe)5%NY1kyu4fc^lhym?LPn@=#23Aq#_HEx{f4L)zsW-9pTl&Pv zMDx)-8?$8!%_i1a;_-o8RRSl#$Btt9l)j*tuh!2?#zB1V<@b_9qR3ro`6uJYk1*dw zO)^Taj*2pWupxOge9m3mRitm|$cc=_MoLXl_1WWOM#tEaF~bB{^4(eSyHq~V5&j)e z6_q)L9A#DMoZq3#w9(eMf{w z<6bQh2#^@H$OyB7M8G)gN#)tyNeMqUUMcO)cn@`v+*eUA?Dh*jQ=IYJ$k&D)ADQtD zcO)J21`WhIAuKn{R+o 
zWYD%l0-miNKH)R!g$T(344)0_T8^n0DqCx_B)wbhckNTJ<{7)EOL*JP+aEApo!|V@ zP%ONb@jxHoic$XZ=bS%F6@Lx9CKB8$^X6Oxc)8WwgNJ2O2sU&UT|uE5@q#R;hdtaw_joTQ+x+afYDdQ40UL>7^zdD55-q{~n@_}j?pJp}hj^b-OQ^%8y=2Kl(#^YN~)y0G7@)F9& zjntSb4$XPxu*K_+`T`={gNoEWDfscfZ7oJW`D0e7b+un}H?b_{#9K|s(CET`dq6lr z-Os6I`;m;9X4803%f~gMRq045+RlS&%#?FZ3@q{*!Dn)u{7!0>S~Cb|KTPB5zGD$` zkl0wU_#^sGVi1%tH_%or_zfD7o=~_tjw!{`3NnLb%CBMmriFNI_bXV?NzvZ#=-?ZA ztvd)B?7i?7=&NH73oz})`aVjR+Pg=~OP$1~=27*pj_s#WV?Sbi_f8p*mR^$mma$$= zJXqOHcju9FlEy+RKdlQx(BhnN+f?FM*pmz+9omX457)RNbaftsudG6y1*-;8;XM7; zW2H9-Mav(<;*2B#W@iNYYYZtVdbHsw(4GOf;VkRXMuJP^0o|m2QocISh<|)NLyaSP zFoN#wrN~Y_RD%%-0|t0ohW?YplK$WY@q(AkB51>6?%6S^dXK_{qb-}Ok@bp6xuDLu z;1WHOpJK1lh;U%r#DG@RXibL!Br%U?wn2&TevWd_l$I=`Vv4g0&MaBroLY2Qx0>wG zae|FVPM4j7VzQ`gl4!9NL1(98RZ@QN=7a&Pr)e(oKqh zO;e~*F<)ZDQQpV<0vj|HFh(X|c1zHCmB@oI4qza8J?Lb8x4o1&*_6&`gT-jb#r4Q5 zCx?>aQV_l)l6gANn+WMR&~cxjP$)ds*tX zw+#*W%G}DdqUtSMi2oq@g)Ko*UdA6>pw7PHF&&?P=63ewH-)wh=M=wq-Aqgs1avb>S7}O2$DQE2 zIw^6|6HRHM0ktn9y`~c~`H6vE^VcgClVhfdf4=}?`}{9iN6!hm9kL`Lib-LkznN)- zL>50+6C%xq=+>GTW>LtxSX}gWb*HzL3U)Fd$UPUe*C1^GxCG$}6eq5ceNLgAs$w&PyXQuto1dHZ`>};j+t=^;6}omocM?I;<=-l7LJx0)*NxrP){NjE4aF z*qfUh?ItVLI;}v`$|kW8SMhqI0T@UX%0FsAS;qLdj0i5DC(wkt7^v~7Qms@2LLL<= z<==;QAY%UA(#Z)fmZL0`?aulAVcloSDz6FA2zcD!r(9fIrMbSIH~SNb2?>NZ`Q@#b%$h+~#Cd;5X$QN5t5I{JA_?@*+7* zV41q~^R?c$t65bi^lF>5a-T{ue#2z1a|d94Y+t>1 z)M@~5AynhZ5>^WobKA?*GexfVci~8Mx`dVx-Ji|@vQ}2dw?}ia*^M=;%20UV1t|3y zd|u75{1#Y0mfvCh8~KG}^2 zfZ75|vd<$CN6qweY!5@1Z`Ah7Fh5K9gTE3IGJsz@omvQ>31b{e~qSb@K-K)dGJ{y(Cw zuFY=d(|WDHnHm=ll+}H%DqWeskpwuBY$lE${q-?``#6{S89_GmMP+k%pZ>mFyaUze zL`>tlU6$*!O_gf2UZ-_KXjwozQWT%~dDk(zmb}%mWuSEXY5-$r-EK0As}=8Pt`yOA z(T*ocv+mnOOy;4uEn7QVtG`8c0NZ z-lcNwSZn&Dbj^|AbD*kFbt0$-sCxzH2P!VbL+gPO-tk-k0(cIfG0v_(qbH8;VyS%r zWluJr9nyyTPBc)iyX=^P zc^9N;H_mT|&%^=#$F{dCGz@a-%%;orrmc)yzMDWnIu@VjEszS&&-x%O_`5O>)VX=w z?Em@J2j92h?lyIvH4uq$vDU@{@EZfg{1TueN0l{FZIP6#>tdr9OEA1QYp}|`>CApJ zv&l3QnImsz^6YbEiiOh?Pm4#esa4B4ervyFfKqpibNU>wbFMYHEYIU|Km|oF%;bBP 
zR)@@1T8IJ||JhvUtre1h#zNp=&UYM*^9f|qd2o{%Y|{7S zfxiGAIcOrw#dWYR&3p7&Jg^VX<~bf_02yY9Ty_M`+xzQ*K-XqMJ+hhP|>TTHmiK7zKj6l_2__#*wxZ9KveSfk1^n6mY(+rY|(uhbekD=>4 zGaOHn4_?0S*d6Ka@OWbWzSSsYodE=7k`{$!-j{O5T>`y!L`|a=!R^{_~{BhV+2l z%FS%Y^`g6`r-15eTOBxDkOTirX7f}^kosMx8}c}vsV2w?D)!1qat39(tU(B=D4FSj zCPjDjY!RVcCii@ABTqq!t@YOEm#;P2_JArRk3uyTK)G4lr-;#}jaPRd!%>x~AFh-E z%9d|SKXLaQyD?4v^t?M}-cM4N8>w2Q^|~J~&Gi(rpZncLR02SQ^v;PXagOqzFvYwk zwCk}4JEu;H<&7-JA2K_!Sj>2@`cakG6P$3$&3e5`aO|C3z=F(zY?yqK`JJ#z{#oiR zGGM8HZb#4nA+G&%T;FcOJWsA?z+QDfZbzO&BZHMU`My27f14!;MuXtGe>%ubt=YH> zolZq`q&0(!k7w-3n+AF;wg*YeSPraLYE{B()HdrmhN!TG`Syi7Br7Z+XMNHwn9 z?sD>Z2Pn(ma2UeF9CigNPbwR804eas8}dR2g$SeBwfb2>M;=Zn0Om;3V& zA=23Azmm(tZfc`S&5Pl}Twz!1G9j6KYebtN(+ODt>tfGTb*7WYO{jwd8!n5wH67P5 z>$DX@rU0202$UO1fWP=2)TE(M&7z*y#3tE7VLVe#jL~x5LbbLuPRHiJsXjyJZNl2X z^ViiDd+|r0tV{?KWX-lPQQ+rI{!M&7EgTFVP#Mouh}jhZ7^wlmb)e<|R|ut~WGJgX zjR2e>h(a#QeNL8RWc&$mu8{xSdV}M_IcWQKEGm0EsWyeyF*1M$bL6oz0j5ccsLx#q1?jeXP1igQSzg zcs2t}{l@S8!LI@}ogdOCtv){Ia~~-Z8c16$cD?&PG?Pv1;|XG&U33c3_Kqg+^EL~U zcUtRcsbRy{EAwrCC`=QQ-GBhIM-h3G4hhtg?)PIf9hgr>36th!3 zkNkIq%|IWrUaql}NGim7*y&2J;dPqC%o|%rW4=w1#uI+rS#bUy{E78zCr9GTgx(m- zWCn}VpiiKkI@xh1v=EUULK9V-@0%)pfBrZUkVh*0^=%jFvILjSZtZbF+x~91vb69K zISF-`gvhs>u_nm(7_uDq>+>%I_;eQE{Y&BSr#`sR8K7y0;U(Z|jN;tR$??GJrVtJm zQlJeY3}d{c0liz#7wcf_&`)S~s_SvdsdG0Fa5<+7L}AH(kD*+V);jJ(5=M_~-RrvTwGIKdv~@nO^{M@RoanMrvR;EE6j9uwZkeTO~%YJF@xdGy%? 
zLt<_~;kKVK8^kx5al+{LGo7_{r+R;t<}%o4xSpE0^>MG(Dw9Hq?8#XbV5V1V0cT#~I~+ju5he_97Vx;ne_89X z=OC99wI_!XGtYAP>`P1EaVg|Z>9k4l;m}EdI@%R76|N7Qt$1B@LS|#5u%|3jyUfr2 zR@eX2toy=a*KNF!@}=OC(vv#>VwWVASIMgMi)dGs35n*X{=%@M-^CTU-kbaH2w%yk z_}MYP&%~36Un;31=-ni+{6N+qM@$WQ&*XJEYhHVO%e?q`!`^{#G*bu}g3Et1&Z2<{ zp?tT6js%>TIrV_84Tsy2Bt!yr^t8ECSShVZvu8h3B-9kQ&G<6bpY0aTv4O&7EozHr zwCtd~4E1F&aSWi2dfiQw{|k7c?uh}?(4Va~$wC8y^#EbD`O@q^_qaeJe%1K%rH>ik zN99D~q$~Ix(eNl9%d1{&EZZMlWhsWBwVP>@@#HEq1|N&I0j~* ze{f2!@3Wgw{sKdG_9)Y{S5>Ob$d{+Do27I1@pE&X~Un%cEk?2}YDr_Z9lxAV2GciuzPPvGv_Sw5&WD zs>7SW;9gJ9>`?sZI5mO>+d(4Y`lTdi8Lo~DgJSlnpV!|?9Rg5iG`KiH3n&9m8!{ZC zA%yIHFya0Nh7j@_%85O{+(>nOv{YTJ;`5>A)f_-~tUCe(KiQY>(dz~5CW@K$Wz3yT>Y4h~?n|4<% z=&H9vZANb8(D{Ne8OYj`_i3oY!|GKsnM#B?bR#V~pw&FoMdo~kGOi~Lx{9Pyegh2A zQZ}RE_chTCvR(gDh|>Z{)Eq1mUNzf3mcT0HT_Vx|8&X7Fk7C!O0RxrO?E&WALu6{i zt3)7Du@612QSb8@o##)j6LUj1_A)MX#;jpn5o)+u!{e zxbiv|E~wTzFBr987V-2G4sGxAI}to-58Gnv52|V3vS!SO0UyXezULPhLgYZ*^lEG~ zlVsGH-(^mr3+zyU50hH;w2Jwa5ai!cA209mDzcONJPn<52V^c^6lFAz)o&EFR==rv zXE39_U9Fh9&V*w9FiKNLPGwmOz*HLf^9uSB(%IaxQ;b$U+r{d;ern1yZ>Ip_O?JiM z>a38SB)g3Y(fo_#?@E@j2%U7|S+I1s0PeMNa#LQ} z_2`l;H3`Hrz(DMMV)805gYb(>g=DUL(LU?3J2&*ZGgd_caD@*>VeQyO#RxGg(Z_&EA4)NQ7M9peGrZLj8j^EX{yt5!fmw)7O(RoG z+6RPa2He5RHAZGi0mnqa4DREG%L4LyRRR$taOpQ<82>sFdsR`%M3Zvk{EuI=&Z(Wz zGIj2;B3@gfwz3^>PIm5)+AVgv7n1Xef26FA*jO^l-3h=>ZFfm1GvWmmz$;X~1VV<} zj7L|~b)vZ844%XPRDZp0D9gZdb9pH4O=H5#>`KgFC>Ju>>1>JbLgIxSxof_S5BX~K zEDW{d7^0{WBGELqE(Q~P0Wt@8Z@VQEMQ{;bIOS!t!1C3%#0&irBt->G|1bMjYLBb+ z7exl>8EEB+xU&(NAHyN4O9uXRW^rDOnDAOC+^Ff);%QkDW@M`P1Y%P5G-XxjZ4%CGN#8@ULF?5{=7Uqq%s;+ zA1}1!)kBE$l!KJQ4Fr>&>5WIl0u+p~QJ|s(Gue&qqk0m=d@5?l!{bT%gMXBh7Ibkl zsRW=x7%duV|M|!nkd^FQ)}HFg_j!9mL}_LcQH|FOUID%g<5xyA zqOYVSGFo*;!IlV7q(FdD2Q)7F5(e2%Nq|~h900D6oFbu#E>nUdr9(iAKm`2wGtfFU z5Ob!uWkZ>b%Om#6CA;P-PI+UZM?@UwH6H@;hYBYc%6!>Oc)85xiTOc;XtR0>S zd{M=x&MZA^dhA8fl;GQ^0z@f+gDfXTEqJ5pzi%(yuh-JFe1tE=Blu#NGNIv(EX>Te z#e~r0a=pxHF7GIAY 
zRXHRu!JHvCmtq#dj)-ed3pfN~37DMUHi09M!I)8crn06hf~%n}GN2?bYC30klXJYL5eS8=YE?_j(OtTbkN5!7lS_>O`>LcH zdN6-U0f|EH>(~g9kZ;%6fukugq%Pp|y8r0xz|zpa>jzK^dhI2v6Uu$1X5I~dss`+_ z{%r)e<1W-syb;M51t}mFP|FX5IIQdI>pvp8m{2m9;E-TG2*a9f;Nen!5(6xPzdaYE*2UVlJZnRLG$YA4pg64x# z#%m)GB_Mv=h5+J&PA!K2MCT+xl|srC8<|t}H8{zd^;?CEWd5_B1OaloqU#P<^39y> z;&nia;1bRYRE)v%g->ZY%j z3JBoDx?I$d;I!XAVj42g;8en=!%d5m5@owucQcLPvGF>R6C~JEkl$@;vfJ*f1Xm+z z%02AR@reQ)iToj4A^#BVPlWsImDLLJJ``c{%Q#=C(wh`^#8IYyqd2Ws%%+A`T%Y~h zmV2S|grJ-T$%~P!8&>77NAtNY0oxPjD|>}5S-{VDKUujG1*EXGrr$z}Jo{fikTW1C zP)Z>H9FFioFcD%|4}ju9OgWPxF@byL(D*)68Ff&xh>LYsn!Zi%n5qfdm_|XnU1BWH z-G62O4yOpD?GR0YbxSvsti%jUAT)&<7E2LS62B_1e-CTE)&+FSVL^a4VyY3-DcdK; zsOqaG9!i)CXzP35?!?mQwb-;7Mr&Xt*7PbVf)dTUGNY7f;bW$9!K*xc3Z^{xPiIt) zHQI%mFQshJ%T`mb(rfRvX?((@*Pr>tur)`;3{N{zLhF&`vayeVjGxgiO}|wV35cHi zC^CB_kd%Ug4yd1xiok7DV5b&BK6)=yVeef_S}engIYNZf)xm)$ykW7Klq}7+l7Yi9nxPl}1rPVQAb}7l2*kaPcBi3*SeJ~{* z<1FY(@ehIdqCyqgs2Dim@}d`<(;!6^h_DF3iZA(@&m=W+pMdT;nHxti2xLt}8Trv$~PKq8s#yN8^v}bTzrd)}ieYYSL+@r+S zj^_VJhHwQ$^Emo2>;%c4PBfUS)t~bpc@-Fqt$9Oo!>iVqEV~zR&JHQ$>lzR>H}1Bc zbTj*%eWhejRi+ca_7+jK3AtZMHLpRp@WLB)Lqm%=woHi@kG#2M3ZiPf$FD1;(6)Wz zFUi9KJgX^iZfBIzdDQoDbPSzSrwj|maUR+ncb$%#5J8BxBWPMpmFHE2r8K*e8c885 zn~r#m7By~1#_EzkMib+!O~@~@yfR~|COW+jtS0AOJ^~X<-+FCYP6#}_hA$yugiujEU3L}S9CtxudsNLBik(kjQb-)6r6#8 zT>%ryOUYmuM|P^+FCC^${6v|o6L;_1IjKb9A7v8*q3AD84O^*9qnC?aOVtH&rQFXD zMU)N0Pwxxrkt9Ru{(M|Dwr~D(%tX-Og8rk5pQh7-0N;5HoY8%K?kwX)Tcm@@dUd_u zcx|-}2XlS_I40)ANV&buVzIN#?!eEgJ#wnV{-frJV~nbaIdrzJyqASW?8oX+>U%5PdrP+ ziv~_PZ_??)RQG#J+a9fPn{GLDM1dnM6;?CqFBK7+#4$7&=a6~u>7&Bu@%hv%*8B|c zhgO_+@u`-=RY)nPnlTHue+nip=tnQr`aL$3{O!i;x>O|^2gm7x`hIDLX$a_(RQamJ z5LbOYL;ymRt_Mak$v7;8HsjA@F+Z|z0L?Y{h2G7*o7hjQFQx*ye+){~XBc(_y~Y#TLwTkNEIZR*%4l*sf?Y#>{^FOuVP25oRovSr45Df&X)MY*9-?(briDi6aQ~z@Eud4)u&XBIh8iWFmT9YA>ulJ|6Y~fEY*+vVsF3 z?+_9vqW{hqSaB%OIbYP42UP6|^wB7&_YUqlULy zo7C`^&s<)ZcPQ}K8R}jm#52u&UO4G}hO{3cr=ahjk_#0SOz{8jype@AhmqssiR#V6 
z<#b8_;kn9>GvgB7JUP&4y{)Ha>;?Gdxmudqjg*#O&uzZcIg!{&e1me&v6(%o!!Gj3E0=tmZ?0|M~DDTX<)*9&y`N6j~_8@V~2cT$O12 ztto{O_2Es42sCIk$a#EZaag@8@DUHNtk%x{a(8D0@$qv){tvD7Ze+0rY{q{!mxJhL z^wHdReU?WbU*-BH2Wt=acr8%)05FXP)*g0DWGSwLXY36pVb6a6gm zze9h5223UrZsNbw|5xB5tO2F?Uy@n+3+2UsBv!tEDa|G7dePExvEAi9`qQu$XtV%# zHHCZi44}#@Fx^WJPtl@gXFE!F6#pHQlDG$}zObzBkox~<5a2s7Rw9riMl8X17KnAy zgdj#rEn6(gF@d3CnnogDgtKHu;Mk#czmxyGM+%2V9?OjRcTV6k{ys6FVnadyl@IU- zvJ9blXqmQtl@7gDW0V2|MGQtK12UVnSee&3z@V7-uMY94-L`2tpOQ|V({sBzaG3ma zd59CFMFtKO>FSZ6|I_bJ(1%cPhZNi{$ZwFx1(>(FJ>4{{zbpV7f9~Y43SS^BKGRvs zc|-foVg&Q=OO}!L%vdQd+c;0HF{|DB=rzu0RC3lDx#1I ztauxf&b|W?ebm-*ofRn(IcQ=~8BR%ori}(P+)wSIqM+Y_Q1q{xAWYUjA1Soq{DkMr zsm@Os#Qe&j#geH-Yv-J!|5=~B4rpMbd7zqmkSe5r_ZC20Dq0Xgpa4@DeNl40b?{w&e3r%(6{0X&+JKfL{Lhv;Od+`T5=;@Je3K z>mqbzrhy!l+e8J8Lrz~_BC>&fq?FekdKSzRTR5)NqjMm%jDmuakeE2unB+e8`si?S z^$Pl+WnQEW3ri;+gGNwCzEq(m**ii~Xv6;UD{#OZg4obsB|JQ3n{`rQl71}Y&gfvW zm)iwJo@XK_S=A;km!SjAa9I4jcgy7jrrEZ^%rtyX4mFtu;~=CeF8@W1X&_#0~K z?9Z&J@y5}JP130empz5z2(ZbE)q_z&vfp}Czg2(rh1yccTg57kj4a$btCP=8Zq@$K zP*kYVGx}YD@K6vKyX@D~MKd&8AX@O-3YL>2?yGF23YFQHZ)#goVJUR9XqJ}iqXOTi zQ?hIntM@08h9n4cplx(&=;A}LYKFXNpY^I`4Ols}oe$BbBsARqU2uJNJ-HBuX^^0NSJlZ)2f-e`vCH!V6Y0Cf+ zHo$%z3uISEcAfUeHJfcT4KziyDu{$(m@AH(=Dp(z{2HU;N0U#`2<>1DXl1)AGrZT&ec_49fpW9@e}HqR&$ zwx&E6Ylmgi`}?yq)rzGJ_dWz!5v{Jrf*G=-S8aQ%&!_?nKgI@SgPBKekAzgeZ3)rI zo$3(+3TwdrQ_ztnQ1qnD3@aIB!6WLxaew8opR1{2>GM(Cy^`go}e3O zi+XeK1D?J=5$Mq0-PqN-s~;o8`;Y}DptG)gcbIe8zeCP$x)tjt@16usoZ4fIXfams zm5X~vrp~!8?Y7!|Q*l%*epT-_>SH?*hXY$)!V3Ob2$UOi4uB8Gc;sL`D?EoxncGs5ZljufRPj?OD1 zBl26c9oQM$#qw^uKanNRYxh{u@wZ?wxLTIn?4wgSa7F~H^!vVdo{Wo!W}lvJHx59L z^50G@VmFFL2BrG4iUYkoJR`AKq&W5Oe|XU3)9M`0it@fCwFq_M(!lz4pbZI=BF%tj zuw$GoggKxN=stwAXp1*iI>-ET5)t-7wc5pyjMMxS2ECH&v1UOxncsT zER&pt0L%%qu~=Lg=+QBR?{zvOPT;~h?vkY}SGVSA{r&HSz@kIPY znU?lk2{b_6P9omq&G^}T6m6!flZ|EWPGqI9^lp@f?y=oZqIbh@wWeK85KW_V<<;X$ z;AwqB0|u?ew;~J1S%67_@l@{O3E;p-z~`kjhJk?r0*XyQp5%5@fqi%v=nS)@+vZrD zzYrF*2hiVg*evByBN$OaTn)t%h|d7Ac~#9LFiKdnu&+k|xKf#)GPBd`VKP@hfPq1N 
zkkIdl>-CrGW)BEwpy%0g;Bs~&{GQ!%J%poPmt7TFKe-B$_>3|Y1qI~&%frPGeJ?Bq zeG-C2INLnJW0e&V$?t>F?(b>3T`ZvV zer|`w;6A+%VyhfzwA`l{DNZdwmwfJKw~aVx8y=JX#)&fR>GNX6?|mJv`z89%DOcHd zvs&v^RV(3(uF~|kL|Ys#B~+dB_Nkw??R%N_RoX2pknlo5?pI`WCXx=qB!gbajDlZG z>>f7kmz(K8+H6;n{BKZ02}KPhFhCu7E_frmh@lXD27`YLi)2Eo+0FfBh5TOF?zZZ- zZy`soy+1RU9Jws;Nj#D>)LUx?ysW*xS6wD1Tv(h)6C7bHvfjPU_Qln-*ay{ZUt^`& zh1Jex;6V|8sa<{SZ1d~l1HzS;tM!gF;(o-MPHHY@A&kyg%611ey$ZRw!_C3!Zn=_< zk|Gq4>)9v#{nLd#OM*f!sjRg<^cz?ymK8ONf8Wc)PLlLDH%aWgg$fJK@Wx{iAvH?f z3?kuI;((Cz7gG9;X~(I}?c= z>o-A8fp#({=_E6Wo_arerU2pRx=5tv&5 zDuUaSMZ{cApv5u_xLx;~tA_<});yB}O_iXx4Z+5MWW!k1W%%UR>2MR2J1)C5PTZdt zT`x(?k5@Z;ij*StoF0b&vvgoYqvagADYPGBWzt$Om_!r6A?Jg_8aHkQu#3xPvL_dN zzg`U~Jk-LztEzPy6{R@60JuBkaP!Y?+P5CG0vjvce$a;J>G{~U%d8UhcG(NK?Aj3N z2l{@N?Q!q5Jm2zY4CP{^9pab{A9rkOwq27-L6S&>d0l6^W%g-J@%p{rnrJ>3ti@zs z8`L?12ew}(ESpou?E8W+mIja^ z&}lb%^4+JYaZh?AOA$OC8cz^-RVd&e8 zcjO651Kl~q{AtuIuAb3FzR00lh;CX3n|;sw4CW4DYkiBEiWnAv+icse_QOW1-+;$) z{=2~Ms<5wpcswbM4$4=wW^d1y{DQ+^Wah+lm)-Ae!-P=_saKEvsd$4S0zCno?hBgq zP!{65h&b~21q8(Py)}?Fa#=_cVM_Z93c*lE3Dgj}c#1r2uuw}~ud@TyhF~=Qo74Lf zYRe=_``-tX?=5sp==*jCzw9JX0L`yfh#q4XMAVq+Ud8R~!kRIB1|9|Qp~nGuWe)x^ zYWFSxcwoH6?*6vP9juPQ{Lxcu=4nyd@u|XKyFa{yUEtM!Kr)i!G0?Z)Q@UfsVmdFN z-v1ZSxMK!P-M$aV0{?Rz15&GNEe;=L9-N?wGw;A37<4&buKWvr1-@5FWtJsr z6xkW~7DXsrUEhaoe0m3(JxvvcW3T(|9*3LQCp=V(NGui~{!9`9(HnrfDo}AoHGTkS z&56-q9;S)YY+)vO0rZK)_Yh&VdgoyofUEW%fKBG(ZXp$weV{P_aE}mg1xaTHzYljs ztelc31zwD?yO|h@&0%Tb|6}j1qoVA(K44x^oyn z6v?3^q)S1%q)R{=Boq}85fSw}<8y~T-~0aiegC{`v2I!Su;x0~Is5GR?cd&eDxul) zVjGu=Fe?@me+|%ob^G8f{K}8*!gF`D>?_^-_U^Ov*5~>eZaS1)zDn>gLO)SCp<#_L z)SPvci$3=@9Bt-axG8f|v=gU5zec*I=US??O{Gqt9@E|&yYqVb!Uhw)#v5p+Lg(1e zH=B+s;N3qNOE%nj(c|~Ta;z|})>MM=q^?MMg$h=`;?Y}@N36>LA%@()!*ZZ*^hitZl-mcp}lMt*k{U)EeKTeJ7Y4pk#%5y($ zJ!15BcLJ>Y7cYG!=--2ops#Y5wQz~P%#L{B9eaiU`F0ipReAkase_b}y8{bg6Q#90 zZl3HeH64_E`t%?$z7H!eOTLQtg#3I1D?@*uGv;04C}s9mp>zp`y{;T{Gx^aE9Uhf_ ztrvL$FlUShRwi6fTa)?vV9KifsOb%(It;O3z_T{-@H;O|uI!uNaHKp5*Jnwt;aI!> zy}mn!{3c~7hCNIp*}%wXVh{d$)&N{$Ci;3 
z?bmYH)ewaRMB}+NG8ltIYj<%77+&S2@R1Sw9r{A9TlJ}^&(>ju}g`7<}pzvpV z!1DV0t&dlQ-19k}%Chu|1~j}Rn)ELh6oB_&_zRBCHRM+FiSYL-S;=_iiu zlh8!jb9trg>H;&?ik?pL8h`i~U3EB!fM#@IuX^u4Y`{`TaMk}$#-yxB8Oo9A^=%0T9c>s&8<3`}BWfU3l4?R0Bvd`oR zObgAnBg7Jm(se`A`Nq%BPNkzQRhldmNSEG^K6Lnq9S(8%csKEgE2x7gU9Z6Tc7r*( zAe_|yb#;2Px(zh9Sc)TbXQ5Paq)JXK|HI<5;^JbBZ(tq<8;i;k`FW}1%4Z)sF|uIr zm118Oj!Tw|2_TCrauIm=fv9m4jD|oPm5xf@eOu@9npPkoL%=LiB8==B%fM|%`r8#Z zdyF4sCJ zfhrOF@B>T|;P%mDBk+apPqm5OUL8!E$I_cTa}2L$`}SqIc?1IH7G@^&jXrX%xmj^5 z-@*0ycqnS?Dylt3)T_k%Mt_Bk2KJ&*UqXBz)fG(cm5+Q4ic(X_<$QhPR%>Ur7=tH& zHasIjrCk7$z?u)PM6`Dnd5?USW)76~bQw}v1UawK$?`X6dXWRO+7MRq-a4mh|W@Z7= zGoQu*i@XV}!Wnp#rGB>2C7t}44AZyyEj~#%`W|1YxOj_ttKKHg`*74D|4Rgat&PU` zrRdV@s~TBx0?NOov?-_+plE_MG!fK2@=Wf{JUU zs7o>(#qBy(-*xiWT@(tJ>e~*~g{*IuqJHL4uJ+2zc^h>*J)QIaq8!y7F7vZafxXOl zuI2_)9$>i{ zB3B*<_k!MguGJC znkSZC+>bdUs&J5bsoub@zQZnN>;8hptBTt<8P+QqSL^C72b$iUzOI`nh~blvF#51| zZRwjSsK|l7Bww{1emEd_xhQ{o?PrtKhuLcOCB(Wn!($)8If!kkZj^g%WoJ|24aW3X zOb7-;&Cq^{FdKhwFL)8Odt%Tb=qHyp7xtLj8A#TB5l7cEtJKP3JK&2fH*T&W>!NBE z+!&y`7x}?LYCyH~!B6_fieL;&f&E7HoQtn{gSq$F=CYsVCbwS{M>x%RU3Flh5dXp? 
zQuFzJ*_}RwUw(j|KV$ZdU8Rb=Mrwh!mF)$kxQ>lM1_mOi`S}n0CQ5O%56hBM1)XXh z+MVlyzu{X{*H8K3D?lJxaZ4hX*+O=5+{&BpvUdm3wg;P_TtFOc@0%mxOYe0xts9DR zDsDS*35tOlG12RR6%Tta3)_{VJe~Kg`F{^fR0$cjBb?vYP&iw2R?YF7r^5cd)0dpUh zNbcL-4evjVR@_M(+shWutM8e<88d&^_;N6+K=>@TxGd}8b!{F$($w@DWX3mz!;re6 zcr8R03e+<~$8R#s#+0C~X6`h>Y=kb*OOqzen{Qa3I_P;45|K2I0vYkz$f&TZz9%VHDk zOex*dJn(V`u%<}JNz8)Tr_5QH(geY7W_M}lE=34URuDwM(_ea74O4VMkrE0 z`sX7S5qU9A$aY`Lmsu;dFIeregyR72)?_pQJK{ReH;er z!D=Y6B>+8Z)ya|GyD=@FqkBR6^VGHK*H|^G9jT>@>qjP)XNtv0S+bIoql08sqm3N) zV9i*^>j4Rj4a-5bd7e>2ZmVn+14TNoxX&EOmdIS1pqiLWbKVg0na#~Pd-l!(_MWMC zsW+`f*HpEK(#7%|5w~RCnlD5Lx$kPH*j$^Ht}wCcEL@po^9*x7TnXd28|5F$?{p&5 zoSez?)bTo5mS%8pMC#8_(%5MR8y3QIl8z|thggYSW=2c#50f7zZ1^wnp(0~mj$D0p z{azZqB}rCD$<)&ez57OcXJ{Ojqyik(W1fiE=RPJIdxRyIrlJy($T+u#9V7|i&=Sd^ zlh7f91f7msRIl`P7?wz@m0El}AM`nmJbGiV@%4!VJ6o%r@3<1};l_zq=$rgFhyl-# z#ODQ1n0>TofJ1hU_mj=H4qCf456x@b!>)d;@>X@ZcuPF!C;7bvWAZ4QGt;cZs_iNS z$=~uKtjbKD9i8)82x(K&U+&P6#ISd-9E3Yj6){ivY_@`uC?SPqB0A&Xn?$}s$tfuo zaQm^l(0hr3y$p|Cm_!_>3REbL*KNLvN^rD*-l5qG%5)~o=vV3xK~luY#1rysA6XHe zq-!WD|Ehdlub!TH(xN-YQZyaD|ADs=dyi$gzxcJ;DQbYk8XVEC%DcgFhq=`*E@H_d zxG{HWcp=GqciaD@QH`Jd+&EX@4gqzR9{c6~AAbLxVr7MfcZtyI<|3Hi{qy_w4CE@*=V$Z_z$w(a#2{#nr`#g95qb(C5VQ1? 
zudk22-p(+KFTOxlQ~Un&yVr;J=`aSR<^>S;ErVOP8a_OFOGC7)-{juNEwUN%E$_Pn z>Cm!!R3w2F@nWjf3D-<vyluY@nYaU%EGt2?Y-#mLBs8y<@;8V&V2<*X+b{%_EkJXZ=X`O`|^7UpqYY(gg(_V z8zzWd7z-QwO1&7E`{FbqRkP8=b6BM1YHBW{8u>7Ez5I={4~Mss-oYYa5id`y+xNb@ zdk;8ZAH%J0T26nXf9z5?>d_$Wkf5#Hq`LZSF}lN5-9gy;rO97%3r&}`FVe8& z2Us2}DmdulvFkg|1I(P2rS|i4({vF%&E_%Jd$Uq{Jik~H8x6^YohSF;F`CrTBnhkY z(DQU)2{_%(=+?gm?Lsh}|FlQ;^V^-1%%nKG{zwW#S*=@=2~J>u@WG9_^V5)o4iuMMwj74|%F6ehpHuYmN=-KNvc zi0VyhENRYcrOT^H!{3O6Et*IJ_Ri4y2jc>Yb7OTe>2Kaz4ayR^q1Q_uvNBn0wfN1u zm&-+#mKGKUMLPTWzx-gb{UIF5u57Oq0RF&Js!RE2qh*Axz$+UCJeTk{qgZ^7ySrhM<>XKL|cQxULxMJZAC>2B|lj(hdmWU7}&CLp8NPp=e5$sspgQK>NlFNS;AUM z5GrkDUnBF&YV*Mv_f_K@bae z{WFB^h(hsqZg<<0q)csBzT@*N2)gC0F*2s!x=l_;$w!WV>A8oj2eLy6?nysT?VKF4 zkor=-*!;Aj_GkIhnQ);6vi&AwHY&)}8rdM301sS-08!)F)jbPRPYnDz(cPPE$TIc* zzxf{!1-vhb3%DFs3SP@w)zZ#qo?*n~oi!hnL{aC>rau`noKfde*@K7;J_v#;Gzw-Y zrdTcgSWn~L&FkFGc(9+|z8YK;d0uvc*k^Xz7nR0-=pgS34dA)Oqf*szBJQXqcAa+9 zME({T0wZMDrxhV|4xvBb3(Cqq1(Ox9XkNk((d;Q%2!-adRB%_5Orv||{T&^9xEPYo z+~tc1I<7i8KDT;5*;dq5z0?0wNyIHVKJ%QsTJsMcB{Wb-mYM_-gvLXeG0CcM{^{F% zs!>FpQn0{5i_4Nj-kVLA6=_}sh_(1PYZF5~$Fk*vT#vW&N;P;UGJ>7gl;R*aZhJN^ z*K#^6`d>7yJlzgFcfLgw{VXn1rd%|{u z<13%~TSfjIUK>Evy<%Q1-EhO9p=1QxjiD(<+ox<-CLT#f=-r}ULYdut$QX`7ocjSf zf4m78fADl(AYh~1E;PW&SvS-=rhho2I$%2MQHL6t=4A2pYi3=V74eZ(L4{018nLB` zWM9J_#n{>N>}Np^{0<*&z@p@~g#L-{V_iRloAiUOV~q0AHOA%^cA~5#&1eUoSml-o zjxLt(+#(XnbRfCgQug3psmZg6;j!(z^>eccgkz2~ilf}G29lG_uht7W%Ln4Nk|5tl zP3Z(NCQ=>q^|@ceOa9t|45N=rM-{g(>PTk>YqMt4;*S+m%wD+1LDnaVA20k!$cotX zyg3OLz{zkbsC<3MA-hWC+jo_{d^vubnLMTm*xq&_FPrP^rJu``xvU_`uoEf^xN)QF zqTt(RP=QS^7FGRP967lrAGV~keKOVTX}2v$zerpSyLl=Q*P^xdHZ1eApf2P0N4b7^ zy4W81J16?|9RJ}dgy>R^`B!-2He8kfeob)o&1(-oPKqCSH>FG^*3!qj+?4@)H(7QU z-n4;+kQHgEJsxA$mF1fUvlneUAY$&;A)O4)?ETWni~p-i%fIU^ptAE@D5 ze`bDTp=M~g__@#o`Mu9V3wW~6k$6I6`0Yb8+2_wc$6oAH;0%9oEwQJ%1WqOHzGkgG z@}vCLG*6Luan)J(qy6l~5W*-S<_6Q&w+T$I*jK`9?56LDuNSfD z)7LDEq9XG^Um-eF3Po-NK{dpX92|o|%1wasgS0?~2#~)%@c=_SZ!P z+qa#?Cjn!`tLpl`Fka5M4vE5&E|(-fL!w8-&GcXPGPvckr#R(z-(`d`RaT9l 
z?|^jk`?E%C`o#e9MU-JQ`4&5J+cN&WQVs6WoF_<#S%Ism1grX1ae^9fmmq13lNv3F7W z|0*r}S4`n5IMbkKRr}=M|M#2S0i4uJ5J~dnZJNfswKRgC# zc8Xb(z2^UXMcfsES!5OwEmQxWKMw+Xzd^^(_`kd%Gwx_q5{94rabo{GKG+Pr+t-M= z-v9a4z^fzHODb_omlcQ3l8`2CUGNc*=y23htJpYyK}je=)Sxog(xZp#U~cjHvtfQF zE)kJw%j#|N#B*0S;fb*W1F)FqhT9#2f4&`fse6fQCDlgP5m3^Qp1941s})baEmyoU zp{KnsZ@7JEbpNW%X(rs3KD}JD{Po!Wl=kuMYU_xl5OzPmbK38tj~!n8C{40^@Rw`y z7sr>S61peiVJsrXt3hDF?)L7n!mVt-^foZ8-07`P0q^sGkCjXxw_hJl**b}p?}%-u zYY4QQwv?SH-lV}a(yqR>JM6>Aj5vE1#pYbPdOL#QzxvcA8DuiW1@LCnS)v`O0PjG$DrwnFE-|f-T`=&o~ zUKTX)cxC<90<9slTU^#$nUp}&(Gk#=^*9z9ypPa5rl=RMRhoW#{r;oV*B`R3?iKYl zFMc|^6|sEL|M%++euv)x2j3Z+6Y(oL>4pc_`AohSyC((ON;`=jia!mQ9(n@Gxr*Gk zp?~Or@C4v*XNH+RU)R>wZfhC6ctRi9rM>c8W8~h)&tEo2vcA3D zzS;GG?A(6~9Rxg^EstiK5Y{b>A6J#@pL6f9G7rsNT56C;rrFsPHy+N_s`iP)=xz;Y zTz1#{sq1VHy~6v+{CgfC(E9z`qIGccTf1oFgy7;>fhVp#7KYZpo4xzKvQxPKfl*{+ zyTh=~?PjNlAyT&SI;Hz1f$zl1pPnsM#q~D){bwx}kFz|XY}3}}uVDJmk5$_#z=KYM zm9G8q_9}oR!0pRIr1x=2wH-0ZE`E3QKEV6I< zDGdK1_x<^hkS1IlQTgyu+TY~AKR?Hp0rGr6NUQZfzZy_6K-~YodG$_JmT^;|_`+KI zKbIJCj7+A_nYOI}OGJjis5qt^X9!m#e!9j9an#bDJ~neJcYNb0bnWS4z|jX|F;A0% zyLTC_QGnW+;O}DCuwI-s;G(m96Hu%`E+2_P^(eO*lK%0CXa;aXmPOn~l!(2g_Kks5 znUihyz#ze$FNc^XptcDOcb~i}o5>$8t$#N)K1zWsS8`~zAGv$Op}j_`mX~Sj12Bsb;D?l45a~GDUV~yy$hp| zegNX;YdTj&eD_w$%gdp6FWWbQeq3VrP7aAkX#+n`-=yI&X!`cy+3h;}@mjl4kkuu6 z>@zD2sNwCPi6CgST5I!?7V_it(`Xj6mbU}Q2qvAsn%kV^J?{V&1Kl`9L`0lTPmHRD zZC?Y1g!fhhVB{|>EPzLRoUbPxzNxDlJOs)uWg{>O#;VRwgcA;t%DJ z9DE;Vd;8)*f%P;bUBFDbx0#$lcmse~v5xpXPb0;>w#PuPuC37=3Yf#^x6{ctU7zpw zdcU8Si;u?N=}1ivKZ;2A8ssjPRSHdqC~?QuA*b7(`FxBlfk?n=CIEFB9-My&I_O=v z>Q5~SNFBS=Wf{AVA$LnbZ`I`Vjdrm#Lc;&D`yFuLE|6U!#c&H{1uTqzhNPcR}w5B63w#Knozht7q}@&d(p;w-dv) zjx-*Mx-d@B`dW>lbGZsZZOrh?6%yqa$Z0KDy#gnny&5=x$cgayiW|I#9f{|zh&oO| z374%epd|LlTSr5AGfV$CbCx#T-Ftdu>$0AVcfLv`>3l+n131{`Pa{_VIp?o1%PWOq*SOU%y&4zQ65Vl)TQnWjY8(8!dq*2!n4ota1x=}X8c)jmfZ(N_ zdZZ?_-hP~IPM{R<>q|t!X(W2dbLc5)%^6ZKL#~Zih)^z~g%!6O<@HP&oEciUzsm&J z;h$HkQCeL=LwtOq@H!Sj91I%EZogk-=85y&kHaVnZqr&5hu@2t)z_-zQj 
zoGcs;`R5-Ec9x-(LduMN2>wf!D(ATxA*)`N5E<{?Ss)UIwqyN5B#9DLFksk@9lxHO zh`KJ&lfX>gul2;KB*KUzf8bDpPG%y4L}mnp4M~7I z&F8*CN#9dBwBCXa#(*EqzHzxTYJ4Nl>UF&n9f9P`%nVPKR-qv21>%RVNbd2+1Ph#^ zsWUK~I@!WDgSo89)_GkYt2Ddc5Gx!#X`6IVpYO>1^Q~|oh#alp=g~O=&Ws~kJZsgP!+D7w| zP@@Y9*56K-snI7f>w}B6Cm$+XU0+|1p|OxTE6$RNJw+c&0kIKzVAGDGT#qc%o_*wW zo^(%fk{+dsqDS$=*nX^}?HyPR8I6n7~hnhJPl;2XZ6z9gwK=+WcHj~%u&-oaUd zi6s(MAvLRaYVLjW&U((w%zUslN^vC_h#B0Axmss>3q7VWaf)`Enn-f*65s}cM;Y)$ zt|7Zg=1q}euh^d6D#%_}X&mC8jZHOS(*3&^fZke7hX9Nl6^M!Uw~C`=S57{Y68Y!Z zg$5hJ;hX(@RqDl|!9;LbcU!=`hZKR%&@~zv{Md+^_FQ2_S!nWuT|lV^-SrIH0>1*e zAfe`x6@G?5skf(fgMG`b%~$>YX=}8ezHzcxXFUUw+PP35oPkYWdj0xQ8Ihz~s=j2v z5uago8RLqKsxB6-AF4Wm#cbrqvMry>Ec#oral*8U4}__+izk{qG}r<9{N6X(5qrSW z$63RH2%X81j2T?N@F!)-;%2Yb36tmx>e?Kmy(z*T3&6TF{bD>^sPqM z$81Wmz<9Gie|*Vq1iX>bxfjUqJ>T|L!#D~RpF9HuJ_K1m&70lbAsQtOmQW0Gh+Gn8 zgzDki=hjyTR}pN@ze+%k#Q^LSmw`D(8>j)+;A~tkURcaqKrt`7QNbrx6>lv`@NxM? zALR~UYDfLWBmnkcrs{@qQ{&=7X+50MQR8m4%m|vQ zW~TdXbvT6;kJD4`#@HKhXcQDw%bX>5&MpBBzGe$llHBa`9#BIGAgI-7ul=xUYI>RGOjQM{68{9Q${KoxBu;S#ZfWdfc_g3#T$+A|~efU$qA zNONbdwz$6jZn%&MnMfqwDdpD&&|tF)be^>z&tJU)jH=LOw4RFajPctxw%`Yt0}Zxd zjb(vS(0i9SNUbx{o^JTvtEVE$Kto-)aDnDCapV(nY6KaF6h{^!W~r`Sta;R)Uj6qw z7{ZkpEU2P)N|3x0 zk^^zf2tEY9O3@*;((6tX_Y0&rC0hf~-*bb(DR^Sv>BUyjWc(>W_B*+_^qpV3mKp@phH4@n(`ol)?JeZ0Vg}qg%k)aGVGE8Dw`BEPvodMZb zKQGpUnl8~}H){)YQ-j{&eeWVC!eY>&E2O^XKBnfZ%Dvd+z78C;EQEmRbI{BmwCul? 
ziaRPSHp2U#mP3o_7dR3o_8xK1A<5 z74m2^_#TB9qy~ISoahi-$38pmb{H8RPT&9&-Br{}9jBj&bFnTK#&|05C=EmWes*yD zB>CO=tx|$Z*eQ^w$s)|!PIxhkGQ zWnz!DzH=vu9WnPVY&QRH-HCyQH>hxIOoOgdEFadhwX8gH4X7gR6SAU}J49 z^w9(`rq?ONQ4ad*gGOr2+;j8u#O_cBjz4X2B<>g^S6+^Vwu?c4ZWQ+1)D&kIc99Sl zZ)d)Dg##tmP7wm6JFkrVew}#-!JRuFwI55`jASv8#E@sY;BzCP$&s370&1T$9A8yF z&M}^7q!-)QAdwu&k-n!Nk7i4QGKCuuIOYojZ$(GO-ox|hwVfq%c@geH>j5;;J!%K; zKNe)}3hte-Sm(qGmOU#7yxp|Li<_bn6n=$?p6tUW*aq3%HisAtknR!7D87TFU}E(`7fk63ozp>qV9e(W3}{Zu)4M`#%<5U=a2SF%wK3 zt{{xiT~dR3P#qVkqOh27BsuZCN9^yKoK;cwK>?hAtC*C4Ro)z2seabf$M-<74scmx zfWC`vNlh&{PY`ZIITFb>aq{_{-yTP!bGmi!iN(kzVVFe0$~D`L}nup6M-NIlu+utz3kVYG)Sjl zKm0iGZIvny?}dp=z>(#US8%;Npy@bG|RH($cj7Pe=^6?&kHM~!~AuupM|L~VU^}GoYoygEXI6Op#ojso-(6*o>mHiou!ze(8I z!a3qC&5_Hn4ix1|?=9Wn{#g)t0Xwp5%TEtHg1Sz{HfuiP6mV{?_G-XK( zTDcWya>@W{%X`*xD_{%*Ykrj56Dm&a+4*^Sd&CHF@qm@5b*5!_W>hcEz8sFwWEj-i zYSE(n4mOY@*MwXtZPh8qbejxlL&=q6lEp+(g^;I*Uq9J#V1#YdkzM+Q>O^tf#fUAA zQ?0-+Y%7j&1ONJV=8RxfnUNWFv=B{sJfQ-hUDxHtK*SDOnm_WYyi5z>3k!bqh>7nn zuRLZu;*9SE`atEo1z^fCS~h{8 zGQ0@&9#j2KI0sG-%8ENQxU7T;Tu{**#HJuo4`--vwuFioK z<%q3n)$0B53HH;pu?>!m_h4WPUuMl^g-9|mv}41mhrb*&ljGM&R&NrBs4b0_`1kZ! zpxeOLRIMbGW0``tcSY~j0e{VNdn^w$Z`=@KnQBQ-goIQn1T${|T_pn;+Wy|LPNX1K3c9ay%F1>_EE zlptg<=_!qc>*<#~dK5{rx*8f;otr6)`CXgi44rj!^e!nuhivdx9k_GD*;ko5OMu4& z;^im!QN6UtO0>j7az_ba^tq~wvbz|+q@htEnXmWI{TLYLGDyy7z z#V7I9?3|=mRc$SMR>GNiK@lxt3Nn0S6S7luvR|Y|rvS{|Ya$BV##J-{;dp4@^XJda zF^7d1Xdt^frA0tF3r9*C&#k%=wEkLIu_~(OMNi_n*P*t9y1y-J0Jki(K$YfbAoMw? 
z*as6Yx+2M)KoA2QFVG*~wXGU!C&wbi9xOwes7fbbiZqyMglDU($9!au zM=-^sRX;|Q-KDnrTi`>YLvcQxO=0)QUPL=U_@XB}x=O&!-LN`q)*B-dDm%-HcVE%F z0W@{rU21)VeAACd$9n_RW_Xl1MLpINu5o}Sb7(r;aq?GlQ?n+yw5KtMM#VXNS9kiv z9^BAsi8&6ZGGJI9)3cNV!q|T7rl!hGxk+3fUdG~uAezFMXAIqW9RC}NK}iFGJhr!k zF^d)v?Xfhx>HH>xkG{IE_flrl-mvX&3Y8UC!K(~$6FpFK0c{B+VRlgf&O=-yj0?we z6Bjcbx+E*>uD0+cTl!wC=|uL`fTh-eGZ|A)&1C4$knr?sH6aNXI;Z)61UIGS6?AWY z_@E78E{kc*ZEO_tX3L){yD89w9OUDgTgHfs<*Yi=TE_n?>ftb)YJlwu+lMO#FKooo z2`n|_@#yZNJJxu#LC>qZgG5`7` zNUg1{%?+#+M3$J)0}TZKPLyh7DsdAzkFwg1Cogv)2%Xz`xk+JuPQZmjBh_YYVy>?+ zEkyv_os9Du{L!=RzlM-=r-}jz^B8{g(q77K1Xw9oFB~IG;udv zIr!BRtoAq1n1=GwThJ>5tcm^5tJ@d1N3!xZ-|8S zj47?@QJKj|T^r00$hl;v{gRQbz=#092}!eF^cA(**x9m%~p@ds6nob})Q!Di!sR75Mii=f{53qXFa>1PGnV0Mm>4ow^ zz;*WQSpcucu~B6>kPxiG@x8A1j}WKSS#%LUH;AU``6du>vxqi4(laJsQF83)VgZB5RP9bUem)%t(9fVygMTG{&=F`dp+f2$}wZG-auAkpfrKwy@1QG+B%s-#J_3ir04V+v0s?eVm7E5z_wn zijV4rAkeX-rnc4ZQxWhU0L~a8&n9=<@BzEkE$R>uIKTadVy&rcLKbF(XD8xBc_6>c za4jg1st+cC>DA}Z?4lA55pD)-g_ht{1cW}R z%LzXRQ}p%q?gZwSQ*ZXh;s3Ka-AZ6{qMvzN`f;&T-T(Omy!ZYo%q~1Db^~#iNVAYs zu3aCxSKd!m0Pt0yWZcx*LQDVlztQHj8W-h#8KRCkfd74n#+V_f(Q@Z;5vBks?Y#H# z@v;5d0w`yC6Pe2ov*!?@=0oIvGpey(mGOU`YOunTQ#G3B5_ib?cbuXqkHdAs<9M!- zkPvJTaK=s6*u3mt!`0g`FNS%f*HJSt1Q!!`Hh{SI?(S>~X&6+T-Y!8;Jxknu_7;Bu>EBVpDhUojoc4$oN|(heHOanc z2Xb5%-Rm|WJm(X763+_WC0%|6Xmk+rg>dJZJ2}NwdM`cq8yHdk!J(1KvL3Di4iQA; z`o`!GQ^W}O>bQS3cO50m5P1@E8!0`m7pk{L5)!7YmXNonv{n=nODm8L-`Y$~@VYY< z-oQGM^S2g)m8>_9^$nH!~2;fJb*VhpalC= z4Go*dV%!X`0e8-CBPi5m8|Q#l=oKhqzjj?4NYb4UQg<%li)h}r{3nNC&Vf^pR8=11`8w9o5z?nbn#A7)B0t$Cc|3Qj zg)>m=E1sKVdu91^tfAyY8Cc7Oev=flE7Gqjr;rL)Pfc_oQAI5Nnyys5-T z2+O#qW-n|nM`KB7c(Ceev2MWMgA{DgeMz4oYf#PjM;k=r1WIvlU|^Lt1n5M5a8&_L zT*3$xIO$@no6{3|96$v?bS~RK$Vm3pXCRY2nj=$?oAeMidUs51wL_|!Rf;`#6lK!t zPjlD%mv3pT5$$djMh6|Og}>&D#5pC&I-M#pVK4=)$C3!Scw;I{1U?n3QSIqw5C6@DQKPYh_yZPil%;UlQ=6oH50p`>)2c6J(u z%LLHIkc*+DKc2W4Yn+z#b`+3vqw<}oI5bNRK#Wibu#E60tP`1kUq16_aE_7D{XyMI zD*yoiZqfsp`#~Vq1EvDFj7QN|=&FsuS#}Z0t8p-gNsV2esn;f;Gf0iWN}D1d0?QS$BQsQ|dw 
z3)K!augsnOk>*Ore*pPZ5M$G{WK>H8l>E8gMaWgWS9u%R~S=!M0A0I3p-X^H{CX+Eh#$aD(p zFAbP6zTAj6UUmDh^T zmd=5Mvy%92kQcq^98X3!02f?AgPv;8Yo%XNU`sV7fuq2UzS#P;X#L&0G9i$Y6;waw zTKyJ@L1fUbccR?StW!zG=p^W|QI7=PP#gX0Dz09@S(%@VF%imeLjaD0`ZOp#!xeA7 zL3ZU^s%gK3FkNW{!1^FhuVfGa(6TXQYr8H5Y~a@e5HVMmvkKqBDh1mGoIfBxMYB9` zc5wlmjF1xY5Y<6}n7$vXaiN++QqXM;?v603ACX}Q))oLMTFF)l0bTl$>14s!A=Usb zh1@hE|4-&K#4QR4)~c&ocWyi7EC4gWd`e#_&Z{EiEmD&)EJ4B!8;* zDM^4`Q34`;?_>jLi)DeB!`r~Ucm*~s&38V@W4k|Gfdb#v{-O96fa+l|2m&`qN2wf8 zJCJ(=Per<$>)XXjhCuoydx|}M@Lnv0J#zRv;x8)bC5(|rE2u~XzCWCAv^|TT#Y|NR}pU&X;Raul4OpN zl9J+DsSm%o38FN5A~P&*2691oqCSWd!G4Js>DK)N14GY)h?s~%OycD(bmgm5z>zb- zWWVx4p#~k>l_6;?*?$829?B+=t)$TIXC2Ow$>}QwZB+mfpbn&6v0^n%7lqBIl<@dr z?_Guk8~{JYnhbI_dlfi)7Zdw@r9C(LS(EnlzVHaW9-`cs!D4`On3oDb=5C^xZiH2S z1_mHzAejsQ(MvQ?*dLjM5Z)`{%N*v~G5pW2ty}@-=j80!IT!_WZx_rrDPQf10(~uB zb#$zPx+fBf8!;fC49@-mQwY#y9c(F&$sOMN?ys#nW{TZ0HN=7x<)Z-m9W%I~Y9znp6vKD>8d2LPHo&%UgjCzlPjC9`OUrs)^Yk1Y>!13Ee3{V_A)6?$ry$^r+Ak zXJ=>mSBM?0|Ex&%0edrXCmlg|Q38mvbo1oTUGeU=lMZCyocYfgQbJU3K_WJ`x0`H- z8MDy7(s$v@`EFmEP+gl?JJrb={+r|N({gVb2x2~U}<7GLnZ^{L0 zP%6hHl!%GagZw^#Ue@7Oh$GT1pgjBSVFi%K;g1k zZKt4I;Wp`MlH=?^GSgqe=gimI5H+_d|I>33Lxp(mjfWrm#E(orj z0MIiK`nIe$FEgUvbl*Me%?%LHSUtAAiw4RWAeRozVstl7=ws-_IyGIsd|1R`E&!^= zs-=R&C~D^x`_N!a9$2Kg5kxyn96#<5^kCR`0B$?6G4SclQ-ncw-$=pwvRu13L5Q0g z7z{Is3y+1*=1e^#$3I1zr(hldzeQg?Fk8wGVT9_9KYvv+J`hY42ijsujc1QsnHzku z?Im?d+Fy5&4QMPq#ak8qcD!>D0A9gO0b`r;KicK~h-6@t-@0c5486 zrsZco6cShvFMgU;5C@Qrgfd2&C}f2dF9S=kaQOwYb77k#iNaD42>eGvWf*@9Y9E#JM5n(1P!m@Qxyz#DR-! 
zmA9;`GO-tt2>Iy}5WIC4?UxA&52ejyZBZ-+c(6qE-CBSW6>1O*R0`KaR+C>TA#)3G>1za6^!&w_W=p0Vv>sl@ zQ7=HBru@sy!hXNpt)WOf#Q7>!Kq52)h%|*LAVHP`o+Ze3LG7q6O4R1@pNnE7Xu7+S zp%~1>m1l;L@FTh*bU&oiRx!9X9WRxfR#0v=x$HQk^G{;bvLNdDfGlg^>ODbiW z<|V+f>_rcBR8h>BDdGZZz}s^#qv%_oIypaW8ZaD6EErz}Z-g^`ZEf$u5!*ARP|z z2&pyzK1c4ZtpP8Gu3Y|2EucezG)aVBetC>1r#gnK+&SI-7go53OWpktN_nWOYz{nI zkmp*x--jb@ZjZ5I0x~}Qx_WR(Xe8d~(y?o@_BKdRAnO&A!UAd@KR!1%=lk({eNx<3 z1qq$d#B!&^jkAmKQz}WWSmE5>bGMAXH}_WEdq#Uh<0iw6nmctupL1lG9G?4TU*9^f z{5C=KtbO9vs1XgHGnOiK)4r}+&6BL;KmQq*Acm;QzO(mA#1|25NyHx>w_nGj{XTg* zC)D!g=!Gfw=_%S{qd&w9A*Wz}=N;CL;9-HD?5PhA0IKbQO|a`3cm# zHpPJ^#8agK`gZLx{hy$3L3d64 z;n5(d`lY9#X6(|q5Gq^324}Y=EBKyrU!kYTWua*wfH`#`5{$+TQ?dmWcG{P2Rg~>6 zE<;nHgf@rN1qnM4{-K|?`s@0I38@XqVVpYjM=ttS+W!wxe;L)*_qE}|El_BS6)456 zMT!+#+@%zEcXtTxE(MANTHFK0-Ca`LU4jR9chA|s|9Q^)@`b@5j6sr}z1E!by0674 zy^zW-5l77SS1KUcK3d!x3tu>hUt%5R^3o!sSxZ&a-Ib3cbb_o?2z(BGT z7h*cO87*)=Q}S)1^tk6&Lj9aZj8bl#{sO+ vH-GbBoJj5d~#|Go4RHZ#*d5cYoO z6~HPeW{J|h^I4R|;Fcp34gz>lN|*w0Sb78j3ps#@uvXQ1T-8$u5qtBI(at7w7hc3A z83_6hWPTC7XAp9_@O*eSxvs^TynPsJ74o@{XEJ&egRl~=D+40Q4L${ZzvuHG z2xIg@N8!ZH`Gyt5uR!)|pS#d&DcoMz=-b??S#hZ0B&qvmMMsI%)0;^(VQ2OQwj*T7 zM#dtnLZ93DqIatq&AqVTpa#sBC!HY+H$OSNYXz5?r23LvrWI97ftRPup(HGzAnMur z@Up9xZwqXgnX2}G;j#%Jz4FDC_I>DzDj`S~BF_BPDmr^| zE|({?^$a*ghI}sfV#J`I^Q=KekIEKHc;&YB!pT={_dYwh>WbVKp`(-|gBQ=!j<|hZ z%R#%LSgvv7g{%z-z#iP&^dJ*9=_qgT@?=xk%~JP?RC+Z*n^?kWkvavQTf>i#HwFl~ z%}w3HHOk<@0Adw`uL{6XCVIW@&n=?ri;dxNnshKB>)NxS-h{E3qaxs6M~_a;^tgbd zUcKayhg!MUPefu8*LZKM!}=NPqhDi+yb_fJujYRbGGle zST+O33#5Qok~_y%8b2ILHGq99{*RP z=%{88vFl(k&$j^%H6l6)up+$czyvKK4sNTip!D=(8AIQ^(-HDt5cSVSK3y!T_h?+zvZOfOz3e3d zj$$AD%aBt)fuHI)K+!MO6RMPB#b)pEa>oL;UsFCyJg$-EESfVZMnijhYIf>Uxl+hS z^D2JxIy-fh>n>r~XjUKMKWXe+2QA%R6pWSMEagZ7}X>!(dFaDd_bUgQt({+Hus5a&@UcuiH;x>*?6waXFrqaztt4G6ck%V|hV8j5U)+VSNgnZVH zpAS1I1m9m|QP%o!;xD?t$idt`Xr;3O;`iFvBz3~0YXzNi*3l+m&%;!J?j$%Z8A9kMfnvxtGS>Pln%TKp9bM%rMz z)$7*?5V8KG_J2iM;OWnR$8>as5~_M&D<*u3^A3`;YC>o6f4rg3_kWhW7xrO!Kt=xl 
z%@+Q*_kE3ogZuJ~C*t=It@!_M>hOQR`Tu`2dNP|^)G`8eRC1q#%h9`douDw6AOY7{%q%Y1W)4j31kny0I$ zzo?nXjA$O@x6AqU0}b2mU?+iTr3O_NOGQV+fT0^zlOCZ9{-s`|uN=96=jO&n4n`ya zmt8MLN|0r4OFnSsdlcIN>djO#J9QOG+PGpxV{cEC3+hCe1)Ga9l73 zvEzeBlW@roCs*CVMG)&gfNT3H>}IN9%#du1z3zk#8`64x_oXf!Tqd{gQbcq%)u+&4 zzv9;9n}p-MHIhK(_mlBM*djrH479?<@zGlo=)&2>=tVmhL-EB%TRM@GIX^7SCpB(*@ajL_MrxceXjhYhxyrSg4QCPLOKlmjQNPRFl#oMk zKFZhK8K#!FUJTw-n7j7SRcdVtuQUD6<$swV1#-TCkS`4xEA5_a#+{e-SZ(Z|P;ZWZ zGb%h|={;C+B(gl#`A#*i!DTyN2)A#ZM+}+tRzR6j1rpGEr{XML!65rw8N3%`npMX1 zpSa4j>g{i4f0VdN`G16eh(bSqoL5ol!{+nfhNxOi-Vm|p8(p}GgwYU|oGxAMB~!Iw z9WD2Iao8`Gos3sS&|(c{bu{V~+S&8!m8n#%Zh5Z&D9IGX=g-s5l~CGzeX9+BvYMM! zgH+G?>PrU1q%T|vwv@7z3Dml6` z+N{nU7K7rMl17`ZmB;M90aSMN_+rH6)v(-8GR%$qfy&GMn3v014?Z=N zk>}#pgo)rbUL_23)5}UvEYtNtbOd|)Gv(r0=1_e?vt4TU+ZY7xu<*)1`#zST->i2( zr}Eju+B1+PE*<&()J&q+S6XAUja)S!-Te`GCR+%?R$}BzC4nyF53M~%XqQ&e*UKslmS)RU6 zQ^+hQ%T?~3mErfG^`02_(XB%Opf`WwwHY|f({eIBJHI@4x+&Iz_Qq?)0Z1sj@A@TK znwzp#lls;lys!evrv$C36Ny0)F-k(2EU)$>mlpk=`<)Y>-f)}bs;{XDXRl+t@G&aD zZ~SEo0kOdpeu4Z5UrpAlLv&rY)?p4-|Et!IG?9gYjdRvXMHkRlD(D2vq}@_d|7{uG zp@_Z$x)GwU#P_JBDIIR3oX=I8xxm(uWQ^B&7ZzDyafQ^GYF~}aV~+(})tc#Dk!6on z1NH|l%bfYhS-;y`$H!mo7Z%U=Lz^jx4B}jWPk&1@d|pA7sGGi^x&)iq8PsT^Px%cu zLmz~aD)d_>=ICqIdm4HxES63Mv6yWjjgPncdCs5i-JgAri)FQ{nKN&^A@U>9>8`z6 z;(W`UA?jq_SVR{koR-78yfUFzt`5HUw}l(a>{e3%am+pJ-uW~`I{DAJLOem+$YGqN9lH0$2YqGC?I&|sq}6ozZYNDr-{6gAps zxICU-n&s@6MYcF;;l4SVTq<{{N_8bhb!3_u&Rq-%d${O&a4*yRCgEOw97&NrGF=fq z!3d5A*&jyeH|`o=Q45dNMVIl#kn?`PJNvxHQTcFNWO4&~I-Xc4yt=aHD%EewpZFfA zkuRUHeMu#;aNB;P*Xpy@Xdp3_uswr2-ZcIe_!uZ8U6*i43y4#qyPQ}1K{8=DwNEw>M{To^r3Il9&~Lqz-rSkJ-#$u{Du1cfQRc`6%(+m1T_O7FZJul*?)EOc z29d~E=8e-??+e=|R z#<0ZYXg!_?XLtt`>j>U-G9PCF5kmEdob{^FU0e4oNttr#x;nO}i3m&V(i6u;VFCv@ zeps4R@An82_7FAfORQr)&^SICk3#ZeOtq>q?%RHqe~tF4()73%xON)$$K@to{tS?V zKzYA!mp%sY`I-7dsZ7q`^lJ6WJDI_=Old=rB4vyh_~p9%usTpMC%9xon$^z~igu$n zt%=%uabUW$?RvWxmClN}@C%^Q>Vw7Q?56+bge^Mvu@zW9FEz>so8O^-*yA%@s-eTq zDA#9)jA)P9jX}VK<9U8p>umYadc6MkRb3C$*}y)=x>DsA4qN_IolGCr6UTFd=eT$+ 
zm1w1v4O((Y5{rJN_vXJ$w438q=D%0AfkW!f!S-MDzWlOOnr zZfnG#aCQuQzy)eCxzH}72u3n8Mx?Q}>aGp@?2TsL>+o5#cKov^aB+G(^|P^A4t7-J z%yDMFvcVlsxaPE9lh||YxjpNh@-Ep1o$@+fxIk(wXL~^;;?We>eY(pRUD43`89U`1 z#gxbI;@B=X4msP8Th5D!h5HS8WV?>HoS?a&4T5rRK@Sh83?v#YhRpE&{f_>`cdoO(*RB!j>p6CaxT0NOw6x7A)XH5*NWJSvsg!SIqml=Jr4mMyK^LDx zdy{PNE*KF@++dBNbGhHcSWqHAeMvg~$s#jd&&z##0CD=7N*elj!l)>%vckK5HcW(S=ozConu4c0XOZ)v_lPfP?#wD(vb@d@g z1?@m(5AyFMnIYrERqixl#|Xv9a#QH9eX z1b-?9GZYN{eq)vDuRw7!Yw_#3|GG!|Ok!l*Am4oWQZ4O84DM@MpbroA-A#n>`^{B~ zhkt1c-Db9g8g9qiB#|C)*_kCMarM*mu|1xvb440Q3xY=p3Sx;gxmh4A) zoQ#&`ph4S$nf2NS^>%co2^e&BYtX)wLLd|<99@1XgBC6M9gYs;yzF(XzUR9Fe4NG? zBQ{B4Hz%r1;sW@eZ_Y!)+RyF|4}IZnc^B(&v|Dz|1#z%spwL_*u&kt6f9JM^+QSm{ z7H1V6LC*q7i^$%!0~Q*DD3w#W=-=AwIF5w|503UW!h9GnHm3wc?%>Bw+#9Hk#uqPh zh4UZ3eO60#0g<}ke5Nf!68+H_Y&MV>hi4Q>HkNKXyU>BqEH>{fd^ws@C&B-1wIv zS~t3&e1*vJ{SB#Zxl71A`m-gh?CRRJg@V7)|15UY^)q1_@o`UN z(g9b++U3Vrk^HW{;^Rw1eQbJsE_96*MkSDq9x#{lzAd&w3^{W1oaSO8qsm$ZGGB}P zK2FryW_-XqkK0|rFzKXePZBGg&`Y8>nk~LN^~H|9&ry^d7D+l5G5WHx+*QT0hG|(! z%e1HALI_nzs(`n+}oSurH6-AF6BnmkXZMOlZjxxH6(2z z{?i-sg-E@I9lJCvThpX-phC)i;k1K|i(TS+b>1WMw08bs_~2nSDW^yJ!_0zWA?(tk zj_)=}gy@@@A~2-*LcpSv{*Lo0QJE9}rApEZvcG$g1qS@?Rc0W)#=laFG<`A)G2d-; zNFLTh?0J_NG@435K`7cn1F>`@f<}!CW|YYyEiSzXv}xeHoa5l*q)^-cmDxW({zqzm zOrBIT6%JtiOa3H5kB<*`Z@>S@F;x&1eMA>N!hgmly7}6ZSZDF4cmbH;%@u$`By)PQ zV@dE25`#WSnqe#}C`_%Z8-98Bv?t`le?mQF(Vtr}QtPx=+Sbxg2t2dTubnryjuW}} zG(Hah!^js0e?a0i+%zRgvllYPAP$@T^0<(p1A`s$hf2Hey;U=euFZ60X+Ai=s`@!t z_MiauSqs^ax~xRcnzB`SMH-zr0;#usYM)l|D7QdQw&$rZfC$VgM zL9&@2@%>%A&u`cKmbJUOJ}@YYawS5KfXjZm?dD1GuWSg7jr+#i?9}Aiw~iY<8pno` z$=G{S6N?WsV_@xl?7WJ-huyX1#LnD5;#=ZWER~UQIJZ-;<(_B?r_xumT;Sg(xE4a$7L2yKcHacSQg(zwb4*%(Kh> zXN(>S=qT;_o9$nzJvB-51WfB5XQX}#blB-KX|&7<7Nm&fT3_VhKfFH#zvI08S-vN? 
z5nP+O7I2$dxU7Yq&j>2Qw@Jc26wtI$sgMs=&q$@32+m~Bw!T1UmXf{AUERNDw?DS` z`E3o%gk{!7bTTB?I&vPApdWJ`M>|vShtwy(sj3z!CgdP81@ziwUy05vG#nDzmbolN z9yjfNW~&1&L7!)al5T@Pd+kE^D-g5_f6|5AuL zvg^JYtUc@PAcaJTkbD&@Kk)}4I4#U(QVk507?8hN4BhQ??!^_bD3_C}o}0}#nrjtR z?`M4K^E z){wH|E&~O84BY{MpQ`6lM?3%U_=Yr+DW14ov#7cN!IMAu`H+-n(2OnR2)))QtLb!_m^g`kgA^&yR>5C_q*@U8rkT>E>cQa*mZ^{ey@PHmd{rXF7?!SyC0Ph z1BXsF%8z)pT!Y5q=qR$SpS2H z1|Z)}P@k*4hmMW#!hkjlQ&w7n+h38Fe=-CLo+m4S^zbqxLAm+B^AQ*8$H6b7?*7FU zEwB#dE89iJ7`;qAI=B6_TzGhVaI`RMBIRxdN$sQSX#AiTuLS8xI`dFO2qUd5Bugm{ zBILW$YR)$CYmaTO^TO@Cydy{|) z;y8@?iHqCo@^cq2AD{Cs5K29~w+BS@pCkb4E1t!m9bo>p0PGc@V-IG_RG5M6v_Fuq z3d44Vs+!giBBhc?^oXk>(@4c${(*cKY#~3k3yjfkv@9w+NZLrt;aIFT0$QgwSzxTg z`IU0VI;p7-6$?&^kmYgh)STa}M!4s#C7A~8(`}?!rb8)w6q}J3We3`6=sWUY@@WpE zb!zIcVl_ek5NVE>4Ci_b&`(T{_0_Jc?JFBIY^G%MWQ1s>hiHq6Q0Q z{hNxo1AAwUY$7OVV?V$gr~lR6LSKPz4WzY|AJKIAd$&e7y*N|AcQOxVDB3$~DTvzx1A);4Fjv

y99^fS)l+jIQj>)xmGqeC>3hq~pH=m%bqJ z_E`Fk?+`{+NVVGR#Fc9)9-hdFhp8t38K95YKp|F(6ZYOhjtaEzqL3+|1XlA=(Qjk@ zs77(AP`fqlV5RIYK{65SL{wQ{)zO48VK%b^PJ;!LCB1hk!O+H4kZ4^Q6}gS#(8Z0!ThOPORxfCo%#7RrqPV_u-F{RchwXCbs!DGnP=F4nVmA+X3#Nmvd0bGS;>Wpb71Z*XZO)0xiTVb8NCKtV~5eRA;J z$5Xqb*w>4-kq_2s-x~WYN&@tL_7?JN?(K7~{@fW6EoBOMKX;kmV?*f0B43?IE5w_Vv5hYIbn(@rs^~AB45E2EKS}6(_-AGsUAJ$d|;|Kxur7 z-cxWEKx=H!dI{|5p9NbfDiTwmVC=$_;tK5&@@&xe2j0_Cbc0}YS%h5~Su^$~kLN)% zzSX2zF)VjxLjpVg>|VMn(4FKkr|Dwe@wixMr^6GZ-;my4C)_NZCTLQS8BB-x7BCRR zpFWh8c{Jc)P%G9XJo*iuEhZKF?S3%NSr;N8;^DD&Xn;&%bAB1(U2m`36fEHVM7HlRN;Ia7qtP;fgK7ONKBUIX~+?%75k6@}x#65L#tjDSx)JwnaeY&7<1 zgai_BkFgoV?|si5JJW_5Mg37+@3;QT+sbqv&svGF-jxdd1p-FBQZX%DwpZ+=S1-$u zBEMbuo7`g%3bO!(4yhKS}bh8T_8wxVJ@WMXUZ?>MuV)fndrr#4SuwZp{s-luF z=doX$qa>d$SU#Oom-FHC?**&F$~PB@!+{-CCB)VNH3Xx(3$Q>J@03Z4Nv?p3-sv-_ z5}WbiJ7yijqkY$y3~tfFRAs(NdkV!S{fy94K~PI=v&$9GiOkeLt|L~X;jV)DxBplO zCj(%Eh@Z_ZscYASu+o6*# zj5>)Wmn`Uaa@}T=Sf0&=?Hb+A=Jb@&{$==nO z`=Va1m=N`|~pcAUkJiMFmI#odC{l=r`5(!FYN#%rSF)ef?YZGZF%YLSXM{$SKcw`+b5= z$w_8BpAwW0rwi96J!g_LaTRj}fjhREgCkX~`^hUxq3zVZY^TgYLfs~SBQi@xZ zRqyXPo2%I_WUt!gTgZOju2jvSM)Av=muJ~3M_N^iHa>UlF@BTA|CKmEk0O~HwAP5@ z3k;qx?SG7Mi}@TG2=#VVL>z#w*D*h#x|Qkd9SqHFA2qw42&D(pMN@R&rkY*&3cv=X zw_;#!BTAHPWY&ro7V=M^$MMddLE-N1h^rZooRw#RhZxErwH_KH>;fjurjg1#s8(8+ z)BWZ`^36`-59eo%dPCI5@Yi*bWORweu-dB2uyWO_0!Nh{F!>Vb@pNU{`K>eB%O}o6 z7J&7`_wofo+^s`Lz~B&&1OU7qnJC)=_vQZ5{tkYm`M5$#1P3wu6HDzBd=kY;MBn8x zY3gbwXFwKK^zZ^vuU4*P(Y2qKho1Y18|(T4M#RfQuHKiccDu7HGDeEh}Bm{iSZacK@jfmlt1cpg$pA&r>xD$DQP{A#^O ztXEICws%59vK+yoH{ad&rbQ%L{|fru*Sc-&|JeweBVaJzk+6E@Jx86DnbuslUm%}y&j6d>LI>}2V7VbW_^PCWw;z4>g<5aA;l0J=PFwvouCPS$ ztJKxo?_jsBPk!vq`%Cg|3!s&DrTK7U9Hoqk*0x9=iZ|rHXnx@xEqfV`|9W;s40#A+ zm)n)gB(jilI#35&?W_m=L{(3*?&*~$etIxn#tdUDP>bV?OdLZLd4bAO)IE~epna8a zfRI?e8^$UT^!?jlV?OimHqNtl7cM>_e=-YQ^oHN)X}hR#ghb${wm4xKfMue2E57^Y znL7P-JzNpt`+6D#A7Q9RaN-xM6SBEp&UI|iV!`w_8!59b;y;?bJxN;m(x7|0*l3*N zHXHJ=BZX3OB;7%c0@QcGqjd-HEWs>2#1RfRq{^drH_(Jw)w*!hdZFA3_H0nWUIww8 
z$gwe_|9_cpB9y;chcn%PGVDL8Qu2E+k8vmvJvf}qL$5Lf0yf)<02)ugq?u*DQlwHq z#AYshv-&pIdN7eWa$^jLlAvJ_3Y#$e7do{VCB^Sa1O#z_Si}>6e2c%X$N~i4U;jA> z;O6=_BnuJDaGp`3qbmSX{6*g&DEVtfV!^m|76p-zy6o6&{F&Mj1k-Sk^Rr`|TuYN5@W%gC$E>nf;FIqWo#_^WYuFj}+w z!fK`Sj=b+L)wI_)Z*m(Gni?BX?09wqkyU6#E5qTaEohyTWFM4)@d~PLXE>LV)lre__j$gtb8of?{+nf z7KG|6{_V8J&%*^VkoMVBgYoJ}^l6R$R;o$R*HojCFo}YAGVi>g$zzAcvj|D(4_7iU ztp_r&5`^5Ri6TN?m!%g+VaFN7@vrac#i;d~9Z(zSub*Og>#?Rq#5;f*yCTXlxF^suZ5-(c3;YNWsK)3n9 zB8EcLIt#Le*fCk%L3Nql=s9RdM$H3nv;&TefCWWtX8}O1$uFF}dR9dG&Ga>a4#Tnzb18{-626&<~r#!}4o=+M{Hj2ZX6WGV(_q2OSj%-w& zyKNQrK&DAebz&Vqax`6_$zc|Tz`G)OX!UWny?}1*6V%9&4zKOX?KoUsem(Dg^H}|1 zH8Z&Ux*XFxLA&PJAznb&AbrI*TX{`q7Gd4IgBv(U*#x*j3sM3}7960jK5+HZ|F>c?5t z_?M_170@t#xNE;TS}M2E6S`^mH-a@dp^+)r*O0*sFKm=U>Ov4FZ^Y5Xob}5mab~#s zq+#m7@aa{lZMJPi_BgXRSM&w_?FDq&jU3Z{hITo=O_?Sa@Uyf@qmmiU!nEid$i zlY~`P5UH5!L!AT^p820bZ)osH+hv^n(o~KLtQkL)UtR$qq>cAhnrAtR?^re620Qyp@(;hv2PE=!lxB}Jr6WffCYOtR>+X)i7>J#wa?#x#s zVM%YZG^lo{0gkxCAGUDO4IZ62mZ?a}b4nuTw>vE#;e)qbftH#bhpw4#i9MS=T}j3h zrcHc#uS@Qnlv#>+x$` zqZvrdfL0T@b4NP~|1Msiz{kcJ@=bvD0#KJm-NBAKnYP!RFR8M5h}{w+>a^YbI!Q=gtKN%CsUEu{NRZ`eoBWfu;zwF59G70gIO_*jb$aCvq`5X5t3@ zRv#)^OuBljiRiHpa@qup+SSJMgPzV?!PMuTN0=EXAofa_p|uz0etw9}#GEXNvn+(u zx16-ewiKj!kdK)f@|*KMJENYnzR7u8Xy?U#tvJ2eaTiE4b1`AiVvIl3LY=-qfI?W)1ywCOU%x91oGHzF!*Oq|ueAHi> zi25>kG-)01Q72#VZhUTZnQOK?a7md&a3cI~&$0J83H^CEyOO}3 zT&FAiLGUHc#2*b&KShL>wlbVsIeGv*`U zlA=5v#f*+M0+%on*8UL@s{WtMPS}#laLRL$E9u>gyt#oY~xhKnbp3+OPm9}43h+)L=H7H5NgQdHCU$LYhKxgW|J<(`} zOl~jWAehMpEI;43-aUtYo&IO!*c{;wU~$`k@uE;ngQSjDe-u`t5u>O2j|$4ITB6R} zPoMm03;PEhAS(RlF~bG{gjG41uU4)vhEZ*L4=3~G_?$K+MQbff>!OxIfIPS;MiqOb z``0Nz3l|L9o8+y)6TNfbUhlL74mE9g)+u&<ADFuDLX!pwR(lv&jG^@Z!} zz#o@O&jNg;W9c7%5Yrb9bEy~JU_e=K#ztBi>@-`9sm9hw+$pfdQn;rjusy?;@3EGv zJV)v=OO*4ZBKe&IMR~RPm+Zv0arrQQ$Z$&eP&LJL5aAF-ba>o3ehTUq^3lAxSg@4Y zxX|X7&@ia1oQc-6_iY6?guk!rzY&Y@)U*Sq1{AzX-~-37e@UHKT1liwJ)5kl1mY|>u3vm z_XVY0F108!fKfAdHB^lxn&O$xB9rv+*?whu+Dc{PZLQ~3BH9i4^kY+RkkE&$-FQKh 
zdrU^o-yPp5sVM|}-qYzH<_^TV?ZKMFn$~Z(E3~VdT?^FjH~#W+_@31=aFo1r(*M`~ zue!;<-Rn-;)sodtzoqphCcEL~i_-O7ft?m__rnZUP^+|u`@k8lZziwX+sXama5l`; z8UN$^$LbBU1dnnj@Zf{nF?OLs(f`SIU!w%Fnn#~@b#t%B#xLTKb9ul&J1gOQq{dW# zT|o*$0)*%Ex{a(=dhZ0HqoWzj<@ktWkz%Ewcc9N7vVY4ZleFUhAy(vnZkf>^>Nin4EKkuyF$hI4BJ%8W9`+;~-@UTXW5R_Rw#uZqqPg|keP1P%)y zU!MUMKU5d;-ydrxf%OAOmVQoT%}UJ1!Xba@O2WRwGVNV28}>scabxV8Nyz=@&!2Bd zSwBv<)X9dO>Xz|cu8w`{3UPRUT;kKyxK&J28tajxV7)(E7Rk2*LYsSRpysDsLRr;#Nvh)v2@peJ14n=!c2mfH|&5t3y59)~aft^%-OpsF$e zfmq@n@l@(?C<;mg4U1J>>h!xQFB;O`gOHu}E^nTjxdD@(D-N$mJ|%xD10zHSRhtPJ zV^h!Wo`a9y1057(l%-?n@XGZ;%-G+&h;xbFa_JdY)%=Cmn!%<@!%D!zS(|TX{WPW@ zZdkb`1LELinnlLg$AaTR2hbYmGkP#vCqeysHhL$7E?*#~>O8on?bPFX#)Ksz@0$mwrH?o5lrnfFM0L1dGz@FpxUc8_N z_lNdyHd^jXK?)YXR)F}L$h37gV@!FMY0g@B`2VId6L4OeY+C{yl`IHwjzL0q*gMvx zU(;Sd4mm1F|sawR7Ko5U;h zY6+kLERl;b@)aKwM^Ps5KdZ@mB&}};L2!i(o+7bQ`kel$D(|$bAx=E%*IzHUk4S#( zE;V|#0jR(})74W1#aQYx5p=XjNo^$i6mQ_u)8PoQUh5bQA)jX(=kX>(YFYp*FeYI(tF}8wzz|5_Yu0o=VD*smea=? zUt>8_MWZdcQsLugeTv5pwl+U~Gx5g0<=|ExR#4s-LrTHr;qwRg$Km0TE~KeA(OP#m zjYnmtbgunxmT%yLJH<5p!wAU)q7n7GWedv7-1NDE(9S{IJ)0= zNyKW}cU;RYr_Lb|((E*<#n0$EFN$)l5ZiZVC<$lcA~y@m%X)k$Cs{|h1m>}PxT7Ls z7OS^qSD~6rj-ev8r)2w{i`vt2v*QTgxX(%lVaD)T zt}S+NcpwDHa_(_Gq`Tx=pcc0KPLqI$-XklT&Cb4$SA4`A)Dp4J(F32XwECT|axVPa*X1|{ z(hf6UHjG1K#$bxSvTjbGc^x5_t1D!npzJIBDO3)^gXKk7lvyB;-}!~Z#4zRYUT+!6 zehky0wr}Bketv8r(~vyQHZN2?ru&-=}lF7E|9$+s>C`yY63zMG*A z@-6dT6AE1=nIK57JdeI3e8Vn>6KT+t#a=x*l`!kni509oZJw1s@eL*3-`mM%pi8Od ziPp(2&TjZ@9zE?Rg@3XbK?IY`RI5eu)w)%(Xo@}6*)F7BBhm%Q1wy~jV#QDk-X)wb zE#uSE%p_#q z?uihyrgV`bCAF4#ZTQJmtnHlj{=aiX)PKzOI_BmnbqH6sd=*wzJv^SRRh92le~S~h!?#vF+I|A_%I&o#K;C$s!N#`pIEL^wa55Uw;z{a9+UBrZ z;!gL^jt%W&-<7!xk@;Pq=!4Aqm+v;}yTuXEup3;BmsS#5qn{(yPbWmp`uU%F3^J1| zg|`{?xzeFq&C!q~P53ZASy);tJ0ui^t4K@zk)vEG-zU3td4WSQ)G$_S+5e*81YXCJ z64E_7y5hPhj+HjcgrB6FDUgqj0JK57+?uRhE=)AToZ-ovkM;J-N8z~NV$=Vso*yr@ zisSl$S~`15FRyLkS+TSO1t-hP+Osw24_;E~o0IWuALmeKq>Za3-D zp(2&5E^n?p?}&Kmov*39j_lFH(*q^)#8&M0cV44UZAZ%_U)x_{oxnxf`mpHRnMfNd 
z4Aw8Z9x4*wt}7e+tD)k4xXi;+r_WfN;G}qAW;_(d(k89fzCu2ofa;5J`ta76M8M9Y z?E^=TLCC(@p-qBC8<{~G9*O_;deIyIzSbhFbU2Tf=3}ug4_`9V z=FT@{p_cPO0uLs=;^}W<-`&|DaL9CA@AK@#a1-9DmDtA&)un?taebD zXXef6(o}r2pr=`!6USl|u5?ro!u3^X6HGGa$ z2jC0^C#x#9`opEmvp9+FhgbKtyOXE0%V{4+IzBxV-SnRYuU~GL9i;o6-c}>U=*6dg zJrQixnQk}AEb2;y=;LY)h-_?M^@~7bx!BorJx4yQS7?E^-9i{{tYJDt<4QG;x3SD= z(8GV_yT_)Jnw(B!1PTTb73pFLft`0>aZy$$ofTuAGf?BJ8I}va!;8=hf@&m#Rf>Ofo7VJitHR%O7bqu0 zOwuI0F#Rj^eA8!->{r8RtH4a=?S>LjtW_aCq5LD^plq?R-c|76?H%KJSW_rzlanDe zU%QuD@v3Sr%H2o4#QFn{A{(0mkLtGs%(54S(X84|kF%vZaRx(&i8aq@b-=8zCFt58 zoT=L6W%e=8TPdA8ZY}k;Q^1l^azmUPX6X*t?V@DY5Jxg_cu*eC`#>&WL&J2X5O!L*5hC|LTQWG z43kofj#(T-Osi9F)Mo0L5#=z}`l|no78>52g#m%EEN3NPELLxG8io4$m0K5$y4Yg` z)8MJfa~K z6mls&%?+K;Q7_XD_wWi_*6Q?V5>JuOGSPjtgY@uk*tsv7EI~%`S(za$Ve0Xp(VF=vkzFBVt4|YQ$;>&7FsNY?ynQf@Uxt?kZMw8&-_m&1tQu zVWPr@kyx-3x~Q)DdiHKjk9s(nr`n(@wb}1ZcjfWb8(m)iSe=~5x6(qsFTY68U6M1-%^eRiU4_GX2;&IWh0wdmNpIbhK&PSXl$xFuAA0{oLAqqq33`|D9P5 zqqT#>qOQ%(0lk{pjMqy|t^zfIFKKJ1=^3i5P-D_rJvO;FuSb_b$Cqa>;w|jkI<@qT zhwjY1hIBo5e-K|zv`7w+-I5$3!lrYxk;Sn{xRE{%w_qRF?8wjN5q-Hh-~Ih>Mwx+? 
zFrU_3dGSR+dPPgCtG|tv5!GY6y+|zkV$>f&QgcM&_mq=Pp&7 z7rY*ZRPCOeO<`byC8Y*Rv)K*~7z;G8Wa_Ae0tSwrbnq1tV=8WrpISnn1?T3v*?UGb zv{HA5OCdx<>u&gmMF*EnagUk2?!`aiW>M^xTSATsd_3R2lmFA@02lm@30`edaUEO` z|MAnRvFv@%7kG1NwaM16Q-`Di7IEzRGM%AgZ}}s2gGlL&`ZK-H$H_OnJ%mw*_jw1K zysZ!Y25iqrKAw-lFE8iRBRYs?=oK^A3ClCve6%&QP{r(9JHwf}5Tq;4h6>|f8EqH^ zd_(9_Wr+nmZbJA=NcAi4he+p^XY`Ckr0dv1x^Ue_csVNLGU)yvs{S%6&Zg_aMF|iR zf+V<0u;A|Q?(XjHF2UX1-K}x=0Kp|lSb=YHOApRvdON5g2kX{u^Tt$9s7e1*LH z{H+2OxGwIlBG2#*7sFr2jv6%e$mc9YX7c(!XpfUlh~SUzipQkN?J4wajA#@E>wYRK zX7AwZxV4cbY(2}gG3(b`HdT0fxX;LtP@|DTLWv?^vKvC!U4K6(*q6)wzTkAc0P77d zCy~0gct>8?b@AP`)|yFiaB==%kge70j{oI{51BftX9+G4c+R3EDHt0ooak2tCDP!N z37R`9YCvMiKw5<2_I*c2&3_aQ%4L!N`BfN*`|emm!7@cbaYWT-pkF?KLhGxTa6$+) zIUDR5eNCXs736SHSB=&ccaM%Ln z4DwH3;WH)G2TfzY{u56~WwXBRWfj~BjQOe#{Tmh{0D6>EzJ3llquN^{;-e0#pR@I_ z(2?D>Ho2BE)TmV27aj@_dnNcuXDwCB05f+}?8;8LJ0IBXgr&DYwK+#xpMCI`H=xz zPTcoIS*lEX-a_j`c6{sB`A#;Mea`O?Jm~b{d+OceK2q;ytIIN#vc@UrdwsgrW-->j zr_rD}XA6c$9E*CQ0tAHOD{138>|Tz0C2s(O1KjRA2uJ`ZRce;EQoD;sRS6f*GSxmZ z`36?T8XRt1E6_?OInGz=n7Z7CO9q`v190oow7c!^!wbEnQY)1!8mn`2ri4-xg2#@%MCVv z%X;>wvgmErzcu>PW^*l7xtQ5M&Ebfn#%K-HePni7jb)tSs4);Kb<(<%-V)h7v$IfI z`p%~;v!Pn49rfODj$>TL+juowuXkG<{@x<5oyg$Z@xSr;HLN)#3P71rhpf~$^njl) zn+~7T;H|tH>Xl#D&wt=?IQ*q=5NM)gd49kyD=rTvLaxe1F^fT?kSDQ9p(EIaY(qQG zniiqdsyN-N#K1ePaPWLuHi-lJN(9KX1ZP!0_zZ8}vu`NK%RF{cVzVwD4reo^{H8MI zN=#nbB28;`J89WTjPIV&hGp?6G+WgZVJ5H_?Ia<~%MsD=AWB%SDSoe^@*SFHC==RXNT+2P_|dg8BgoO#L6N|CCA;_%m&f3jqAyI_2AgP-|fBS zLRs(}D}FKAU4{{JgfV96QitwV04kvMsv?8)<838uvI| zpext<@_hM&?=>eV)qJ*Y8pq*e(ovyUeD&EK7K2jtumxD9%||YG+bS=X_YZa(dfaBP z*ox7b%$~2)ZBll@=v(7GrcI~QhBH+nk&ASQ2#7j8hHvG>r6XYxDVMLokaGb?a3Fx! 
zF*x~B7|ZIO&Gc*X9&Z`rz(g)97nMrm^n9YDm3u5(jDhn0f*a(r zwg6!J=PcDbv4_M_~x3LYd2g^md~z}ZB{;J@$Jla3xZ5; zQoX!9;9^Zjeg24aT~in+IpK1rRS5gu+`g9I5Evf)o3~S}<==wrRSb@#$T8m|cI|4b zatep;EOQxmPC|*acUO0Z7dbhxT6=L}npP#$LpB9GV_K&`K)LqNbf_MpcdxzV_pxEu z(Q>-fZE^5>>rF^VP>ovKe1?{8AGQI^>z7}^wFoAJzZ*kxI7oC zv0FEE?|AurPN<41x|>t&Rq&*d%-#eVwbV~|TCaW~6r22x&lJ?UwmPher&TZ+(y#wg zb0$HCfHu55ui?-OSGlijBOa@l*<>M^xcoi~bF*8SQpM|V$c_W6VyO&BQGbzR3Yi}T zn3TrmP%0P6TmKY$puY37-IzOMl2<^WRW;GWDyOq+gVBK8kb=zEsN8-$9-L!Ni-g7* zOoz`%_#NH)ZJCG^24wc3jl~*~om~k`ZSCCrctKWuv-b#*AwmO-7a0?ic57g}|0s!} z|4Y*+N(9%`YmE#-VyK!>M*d?Ph?`hLB41(cv3&D-MBLJ1r#at3r`+m3Z2QJqrBQGD z1j$EWXO23IcrI*ItkBRb8Y1cke{nPUHZlIW@|fA2v(YZJrKWa-E6>YHn(Pkmb!9jf z{}*B@C?bfY(s5>`VG5rQ9`V`v`mZkbrgAOCZwy%4T)wqD3969ox&bdS#KYNon^a3# zm*MEs4zClLuuG<4dmSmszXO#8iv0>a0#v;Sns(To(85Z2r86o zecnc&W&%4{ES7wiSNCEG*0dP?IALjXxFn24Ix1XnSaZ2-v?-g=`-s*nG0?c_pku&R zi+25G5Uj|SJ|097<$x@1FOVW)rRTA5`__KEt7Xc3N$5`WMFc zj?aBPMS4?$%T*r>J%&E6+2aWdRsyussUKWa$?b6|gUg~$8`^qZ;#V{3Amc%%Ux)=E zubVN3-+Qcb1zveO-6BoS2APH_)hY)qEWeYWpfoV2$l;%#tSYZQWrS!aLL zS@;VvQU5AJ#EPx@e745w{q^&Q7G9T_OjU>|Ys2Y2**^6PY%aS`V~tW5C(B6=w^No9 zCV5(9b94O=pulQI_=Z9GEEQUF!dk1_Wbk?4pFUWqxNr#3PO&FF-+!izFSMh_EtkpW z)8y%wn8QeypOS|>t;0K|lTe-r*Vn3#)Wo8^2^9VrF-?R44DhCt+07O&&p+QnG=Q;p z9$Rn|PAbLIn%Bb;T*(DbV~0uutD2oI*8OmtX>{gP4&&6*$bg6xS`pA?xh!M5HTth z!yNat+Pq>?l*uht@}*Z*PsfY|QYkb7P@ulk_k}KXgt=QXvlDoTDUu0CIu0yHa`$)`<0Exrxc=ZgY`GH`8FVmzMkdSV) zdIyRhyiEEbzWp29DGW5)!%?%1!Qw747!&oI$_5$-N{vVo_D`!OL}M$dwf{hfrukt8 z6AE>z=_eSqDEsJ6h1ti8#=8(WTkbL9uYVkNJR8qpo7v6h^4RFs$FtaP?xY5p6O7@1 zo^-asp;Cfu2cMz1zQey5f%D>_QYW-r0R;>)wI0auUxmzBhwEQ8@{bBXH56ikdg%WO zU2#&_BD@3gWKKd1Dk;KGu)@MCZ!&}e{i)OS{`dh7|Af3+@}*e3$3N0WG*)(P6b>R2 zKDJd>Y^W%m_d9Me`5``2p(q7cK(ZrtnClaS(s7UL<=JOH%Y46BNwIgKR3j!LYHHm6 z&90K*U}=*5ON{oUgRS=S<4Zk?N~3T(ka>MS+xX-ZTD<>?7U9WOrO9po*s6-xj$~TT zXb}~_L>hC)CHot}0LlCPy4nSsd0<`X=-QPZ&9|F(u~w(3_vl6yB6fdDugqbhK9{NZ z`2%?dMgS5DN|olxY(9lprD9PqBo==atvRcFJ>N6?X^aiU{cihd8E;r!4azr}+Qe(S 
z4APauY1=;1RcD5T2tE%&U6hao@r9BlvJ29K0Lg4f1mv?7ylCVipV$5xhVnR2mcH4LFJ0S_N4mBq{Eee`p*=IMI( z`@_{g%|;b#D=xmyowGes6KQsSBW6_2FBV>)J@ZP!asvjE?ue}`2UvBQK{2CsK@)PU zI6yl44dPY9vfbq0k*5r=DeXh{EH@mwWIH(kNK0(Cy&&7bM`}kwm5Oc<9qnZXxt} z<*?-b4GBw#B2n_jQLWc;wO5(i3eA6caM)8&K=hHpim1|5$wtd&1&)G#`FCuD6&gp^e|}MZg*=f)xt&&o&<2o>K42?_I5riDAVYE{8g}@=>jI4Sbuq2u-uT z?kSSYGrm$Z-h1n`o=5<$ly;a9K^HSSUlb%aw3$yg4GfmOZ${6O&nh0DUQ(zPp1~u9 z#+{CD-#pD4N+tvC27Ss`I!or_7>iLUHRW>o+5nB~&hrGJI!o2R-Pw6Kuscz=_R=@l zs4mN7Jd7zFNyfvKx?z0U<7H;6&|^~A{(iwJX>6Iz%+UBchV{e1fPn88Jc^ytb@@gi zqH<7GT8j!%1`gEfEPCqmM{|Y5pMbf_a@f=yUf+u1LYIf3wKt6WJ*v1`)i+JCdy00t z80r)Y(z(7#MU+F+bz$?--bMdkNIH7*bADTqI z&N2AK4WFM~v%&bRo<<6e!ajAb(Boa?Vx?CZW~$LEvpdqB40a?+Mh(~w_JDTiB#j1ZkKyV1?r7R1f`}lgHcM% zb}miMwqT`B;xTD8LVJr9VU2E&H$GSdQ{DeTcda}>iE0qa1*a*<)9GmS$TM1N;J9{o z)BR~RTx*e5QQ^s4A8x!@K7PAkov~Rjj_$>>>34M4Fdwsdst{Htqw6^d_n~E_)ejKpVBH&!n}a|E2v6 z-;ac}Rn5slp}tV!bm6)^JCE}%P7rfgK&Qr3L!I zO%MKiq1{(&VVWP2%@^w#rNWbn_aZH)^7?PMW^`G$%B2h-q%FWALu`2Pb$2|6$}C2E zGwP6IA<7hPp-T7U@0F0-`NpJG;`ihc=pkK}Q0#&vuu<$Io>$9?S_KiZc%Xb1*=1y= zN12dF1R6m9K%-2*yt&_qX6LCCeVuOg&kvi4ALWcp@W|U4Ql95UKkbOol|~ zTe;{&(zoZNcFbBcK$35`7oj$(BXAK_2^rB(quJNbFq&LE>XYsTR%t*9iNIU|Zh_sX zmGHD?oXDoo27#3n59uGx9$ZF5w5NARdU+qMg|#w^f5G#Y=p66U>$C*v@6#|)e%;nN zO^4JjxrJWQ_2fi{@b3EA3nk|t?+&Szq17fx(i;af_rryGtv#yL16MZNwMT&eqRv&Z zsC@PA?Dv9*CZbQ=->JMsiu6M){adjJ#LJ%2kj!9~$G~Vbe|^{=Rn$Wj3a!B!_%A}@ z(7SpXkpIQ-q7Z1~U2>Kp%G9SoaR?*+LoQmT_VTSR{j7}0_&q(JYxUa0{!pB|X!|Shh*;K;3dHt*O_IiG*g<9|M@C9=mEK>Mj40HG5KhQEmhNdOb_Xlo69OEFmz@zj8Qb{ZwuuV98;Nr@>nbxH)CAh-5@o~4lCt8?D-_mj1oRNmMcz8?2i z&PYq-ljCCr>&;S#_^X9393sLGG7ZQrBqRVJSF2e5?bsi6`~JL|{5a5TJbO=BN~vz9 zzs`EFXVFfN%kA+x^2OFK>QPBJ;Ivl+*&tTEdlq}qs@>&)tn+b6MOpvk9_`d1`fRnM zO8{?>TyhyqTo_(gH>V7Isv*$6biywcUpN?9SUjFyi^XAU#W*tC4J?!)BMK#}>%!qNdz7UR@|2d#a`+ll_KIZ|-3Tnf> zq29OCG5Ql3Y(Y_byTuD6#|2oC`_*Y~i+A@0O>iQRls<2a`-MkXW2o`UVYp#RkeE1} zod@<;jDx|lSshyARRcR~Unshm-_-9eDC4>Z)DYL4I=H;60tCkW!-?G3f(wX3r`d_) z250BUauL{4wcwGWFvWfi3w3MJ^Q`3R%xt@2vRh^!rOVL{GmTky7vslO3PX(**kH^Qt2EucM5K7 
zw@WP%5|x3*Cbv`j{B3F}QA{9gf9hH9wA<%aUYa5=Yx9@wVoDQ)DA_sy5C%N zV#`~uRc*bLrlzrsi3HyBwQgQUO5~Rf`I?1l_1DQanxV}5vxuciwUNwsIX%$N_frNo z!~I=`G4#n^bTk{;=$Wsw!#WV@qc(}DlpwH38s;aG;#vOd$ITkdM!D>%EbduwIX8Sa zb`wsKqK-2q!ZT*8(}1|HNo1*FHamZ0PLKQj_}LjzzcUd{-H4_!NJuC|Jif#CNg0YzTxB&#a~mf3#}1DJ(b#vLoHp*W`L3R} zbPhMOi4c_wo^^**47ykW+9WvOCfPqGU&Yqy`_u!_FhEYlJ*{*!i_PHuRtCA*$*|I%1#0lV#PYWb0C2+iw{sH+|^vc4TpZnBJ;c!XyNjR+a?4PyfjD>0( z_kh2sfIYN0BAc2UhUej{^FfIY6z6wbc27dv=Fyep&>zn4yB>b`hLNy?|`ecRCnX)eMzm* z+^S+rHT69Qz^T`K;ZGJ>WBtG*_?C?h)m@%odsKOeuZVn)`AljIkTQIt<}RNxDM2KBAxS;E3UTdy=e$4WH3e18kn&40Rn;FJ0Vz@ilu z#{P!c8#+5PkQ9QI(7XiH%PjttnRqQ{>}NZ1IRo}**~LZ^F$vCo9AI{QZCn{!9FUW$z%N^pez3WkpWN#l(5!{| z4JcPc`W6ci+(pZfe2VA|skX|-R;!#ZS`MNxT4}Owai8>aPwWjx8?vI+=8XI{3yduO z{vL*UWF*>DvQSi~1c(TuGiJjWdJxaK;8(zEwK^GWJ~O~!aAvu@V7H|_A<&1Y#CyC@ zF=Y|#$Pj^#R-htK7eg|e*THNw%eV7^uhj0yU~my7Cei6HQrt>!wNar2OrqSRj{5As zmXjf5Jm6l!jTeeEd<^iZg-ZqxNxvc}v5&Hh8nBQ;u>YmstTmcy-em7!Js)O;BZxzb zRNWygP>gm2DlP#FhUc*|e*Pju1_z(5YmX%KO1829G^bGm5_>8DInJlOT~*#|Zc>OeII}pm%LCCI2gl*(zPugv-ACb__)oVj|c8b+UNp(tE&2|iCM||v^ zLgsL4bo9)OWf}_>y>w6t1OtW-soXR9dY*0qOY-@=`_o^kj(T~Df>nt&t!ta7cKf%GSin)h1jN68f6pfkD6+Si_?I4PNK0YHVZ1pkc zih1%HFPixf;;=oP=DV7NNeiAi$AT-}==5wbZPTQ&Y<9MH4l9>_{{N* zrxCm%uLE%5k)HXG6O#>h!>K_-eRszc8WQaL4Zhcl(<1L| z^B#>^>}~1b$Z9I5`|5O!aVSRaG-58c&GE|FUvMy5Mn)rXK&!7}=C?V!TpP5tV2{ts z^5`hcPodb^x$z(z_k|Stzv8i;NsilgA+>9MXQ$WsQc8m)bWdN-J~;8~{=)J_EDuAT|(qsQm06`yKPD=uObs6F>R`Y>t#fMA(u>E8+^ zZ%j+m)9&^gddVn1LZ0sKt1FXLY=fr$&dmKK;hyh*gGGQdbN`w*vca`^T4Ob@wS<*E zq=i>ZYIbU1{a&3(C68fEA)@duP}-t%e&bua5JID{wk&6Bp=7pk@3dFhb6jQ z>L7i7G=e(GGUqa@M>Pve6R0B!qx#MZ?GI5hlo6YI*y6v%ovm-*;29Z|HhX^P_6rR- zFwya@`sQ}V@Tlvx7@O^kM@+SQ6Z#LhE$%W>$uE~!de64HUSWTY2@i)=>hL4`Fw!>$ z+m%6zD)Z8Np4~6dq^i)`(vr|AV;J}|y!|FHHaGt25dP;EzBS`4TMJbl|IiNCU%2rv zP0`S+wAu1G7K!XJ_dfl})lUHQ)IVA>5SvQYinqWZR9XF$yP`AY0K z1J$(dPcG_57bL)A$K@*6AV4Vz6p()Oxy;)#+O~$ZT^8nc(pxhJhbh5D29-2Bq$CE! 
zm&V{_@ZI>WJ-MwT}X_Fsh7pMs|O#gx~JqS9d~-LVD!vq>aKDN$@Ztz^d~}+ z@3M}if~gMBDr|^Ezd@m2mS#yKmWBZ)TO6!rGgUb{FWp`rh>SDqG@*tVaQZ<*>0ubA z4S<=VY9u(k2P-kFt~A_j&K#lDZULF=)4$)c(Oj_^?38+aoj9u2f0k6O!C~4O4L(a{ zas8q*VyBEfSUbM_cC zkfpKNy7W#4E!{c_>VcaLPtzq}k55rmH{^4a96?KtkkWir)~GN1#{>|5^vnD)S;vc= z-skl#sct|w*2v6kP6BFbZdI9Nvso&o)5t8|=>G=%;uMkNn&uTH6C64C>Y?=mtidj` zYiDQoGL?80^REXRELb%yvM$)s+01ae+dv=Bh-~t3)rVj7Nbn;csJCYw9~SOtrwHs; zo!Xrn@5-WP-Irl2?k_8byN76+IjZ^UmG zUPanUd#hNQ7wz#&j(u zeqtq9Et73$1GQF=mc9-dbJfl~usz!Cbm{nd`>5HaV=PHoJy~*orz48T6MD>OqGtYL zZT$?~`ma0VEy{0)zCCptuU4US=pxa#$MO97sIXK_4tK52YaEVskSYN#N(u5UVk@~4n#_ht=EYKIVW5>mMaNSQp2+0IUWWLd@lpFjmphEq1n~HkE7WV7 z&XPbFf~%FjzbB;dShRJIfeXvQ$U)=uRH~(*O{Fna&ft)$vcem_5 zbA?2W=zkA$MIjj-Z~v`YwiW*0?OELa3WeYQ_v`*Y&lHyAUr?1Q>fPN9kMI5c>8L+# z@@h;D`QPzv!omt2d1jjcj7_H=KfPnoRnEHq)>b?PkrES%k53ooN-l+>zIU+ui!X9? zES*3|*hjaV-gN12VvKAaUYL@ws!P>G9;?p%=z)Hs#@oh!*An4TQP-DGjlLVvsE0aa z2<>E91QxBW7Q#bUAk3g3%i}{_zdPE`6%8Z#`I*yh8|TTPyY3@LsX(bM*Da{d_< z{qjl!WY4hgyEv%W$dnYXumrVia_e&GmIYEXa|_HPmbiNUNCpwKIy~N7-Ov;$wdo_{ z+bUe3RlmKBR*iTLTRopl#7U{e@-I3kr{ql za7hZSF=~KEaQ0i0(+Duk8#hTBHx192&8B0k%bCg#C_sDc;Ix`72R*6XnxjS+5x&md z!hNMSFSf_e+KRaZNh^qQE=4u*`Hb%3py!w3>_n))#cZ0bf$5#(&E3CJNh3tN>E?dZ z#5y@C9;BjT4jLuCT*&6=V`+Z&k=RN4?38K7L_hm<`pyw;Rmz##(%n|&Xv#M4^Eyi` zmJ)<(EoJj>@!$O7{5{;)ew$`giqD|y0rdn~704(Cvf-sD1eI;23ub!xfB)G2fcaA? zqu}qJ5WXt&7l-rMD_k6J=@HeY^T&7#1vcHHnN;9zFOJ||Gm|*#WAMYw|j(tu>Tn6r4D}AycqY_T|*KiyGKz1?6$Mn(MHe?v1Z${mr{5~8CY<&x}YM>#xLOl4+igl9?*wuJut z{6!Pn#Pz|$w17^aKB6cdTj7~AL^ItzNF~>*#?tD9VO&TMiimhV;c2+nZLr>NfYpQl za@ZbE=uDb(IRn-wrm4~S)+H*SACd?~7?(P_+dbi}{RHcXnq-`hnVX&vywd+yQg4c! 
zn~0g3TEgxc`a(_q@t2a%&E@50)_(X+bBebDOvt0vC*2{xY1Zzx5KdU0w5z?Cu!z=& zdqs&#Nd{Qk4P|JMJNf_mjjyf9kCwELJFZIS8k$mhKDY4;O`~_ke6YA(*T+XVx&GBV zmqNMxMgznJ>(9dJSjLPKRrCm_X9SGCPsK3FH>ONuW$gNp*)msBHd&^{?&%WbKZr(_ zxa?LIDYXe(H2Rl(0t>wPL^RS(W@jS?kZ%`IVg(YH+0i6h-lG-_!XIjCd@#LJovYC{NH06A)lNMre@rdAck$2Q z#x%FJT|N#;=}mFriLxom+*aQZe(2M&@xkFZt&PL_!+CvhrC0to30@{eA-_oUyUgfu zoxf0>(B`=$XC@!tiEi4}h%GL?ek86sgc=UlUNwj51 zVPRpH?q^2k8Bg=Xix}UvPf{H{Z{Jglw#Z1UMl0jB27ZB8n5LOxHFWW+gJ9Z}lOjWJ zN-A%TQ!VKR|6s9u+c~KVt0&gaCgO=vo~KrBWOAE&FrNI0d?e^@A?0R#&72=p@NAEY z2!qdcw9m6y&-hFeX~yt3BZ&1G`o$f_9h`PBGams3-TAgZo6UyN!5JkkI5IL-|FU4| zjl@d5#xPm1{?rjFCbG+_(3yzh6%-WOdFl9xBVy#MwXv1}l5V?rq1wlv6tdD(AVv`u ziW5@YtVPYTwPpVoxr~Kj%X&qz>fjtR=TrInLGjOfMLwf)Gm;PzAA748gZoOl7n@5nvLy^U| z8(Hw~g>4i5i=%wMp^d@hh3mr2xL(exudt)-o@;+wLGg!rj{3ycrVCFIMrgP;c&C{O zi3Wvs5#%N)uj3%31m@}JRNsW(S)|P~Q}8&Xgt_c=K5pQ@WCG0!SvKzsL^8KBiPP~x zOQI6w^b_?Y1g!PUqqYSP2&@hnE@Cy}{|okszoT%A)64cE{)^;~z_l9q5C>B7AXUw7 zZd2}zxNtyJf@vrq0o$5bpIE4?R-c$3kiH4N`43G9{3z=;SZUl1lJiNwMKCC= z$XOsHI>G**A`Yvd3Qe~vY%>xlUf@;I@@ntRVrM=y8ZyA=UviQnG%KSF@3l|K5hU2kc*v!sr+?UTWHmL19t&Jy*Xd5|V?vO=KYkGN5}BD)kkJ z@7qd8wptOV{HT^AIS)>7HI@g9&n&T5b^Vfo!J=4{gU5L?ToETPC%q3TX0=Y@_X0lR zPg{T7Bz(Z+-N@Z)o$xq5bR%PTqsWg8B!t9bd_z7-UO%M>jimTPq8absq>NAe=)s5o zO635Ss#?52zF_d}9a{GQU=8bcvCiY`q`m}3El;D0TvxL$Z1o-~Mo`Dp!m$;OtDye%pSG^3UqXI4#z+2*nW4%$+@|D z>f)`7TzpJ$MqSjbl&W=r#cf+@O-;?y^&XXOXKQ(RMa3B4N~PD@85QviG#9b(SZ7%m z*nw9BKo5W!ZJxJTF1LrFt?eWsQsDw%Bg&lR`}!E)C)c~f0*dDa%>X_(?qI1T^8E7h z^EEj#>s_Qvfod}ep@GGfT!+dmc4o#o`c!{?9DwU{3fPhU{P`2`QK!|qr;@YbzLo%? 
z)@aYfqE5DgW}3b%dcS!$;FvrtT%x=E`SXWFE{8ja;bao*{`$y}OLN{Grrd3n_GhRW zNz#&WMO=g0Q9Y%1itLjMpEX~m_U`-We<_VAa-2aSGm496+2NPK=pPgfXR~!N0A=-KD(E%@6;^6H}#7w%P|N0mM)U>@ev~aWd=Mt)>=# zhkn|57x@@4&u_LF|kS7$HyXHm^^ZWdeaPz@Mtr)&Tz-ARmNvoNjIPvD*PsD`Vr`+Mf4i zU~$5-67|{jye$Bx?aYDEx-XqGGpb{sacsEUZWBfav?lVgLGi9h_ZK}Mh~SC81dPVw z0Fw)FOvXennYMLub1S`Y9-f}IpDR=7>yvQMq}#nWdi6zN-Cm2K-{|y6Tz`gJG4D@< zb-_I*kM#$f>}O>`lYrm3Ayn4 zY2B@Du^xIh^1lS%?k+mNuPdz z;(^j_481%P8i090elE0}ub^(r$J6O_M~W*3=&@P69y@R=hv!X9ey>-uN5oRmOs6RYcZxFYAY1 zEam?wk;x2zVf5VTXdr=Nt+IqJ%Rjc*+E?%uU?8)-b$5H$x#Kx@7Kqav&F@tzmpl0ws_<3W88`-HGFg!% z?g2CNPr<959!>xWxA$9bN!{41hhaA?&d+bG$Un%mbDT-h^NWVM@VVVV*DGGY(;6Fq z1%Il0iDHHk&Jag#K+D0!HI>F;bXY4iAVh?0h?BqOebc^O8=%vE+`5YSzv@yI z_#Y?LUnfZ={uhCf{@bp?U#WI}$zWS2tv~x5%@zZG#qfvfPLsO^(oMztN5{v<6nK|1 z>Hug)rBdwT{k;2IBKY%-dL4dX8Wg6(?cvPF+l629DW3JWR00-9)WZbuR;pwoBn2c6 zR{F0&9#whyX~4ezgoC3hcgAb*+N77fK!uWLes;D%PoKQ19RVH{4GRY+Fu3Vtc5Lc} z$GQ;e3u4|xX=UcuL&|iJg1dMt2|o)|Rhh}{J ztsql1)>+)o+|Mn@i`qzSZE8N0X$ClE>o5H%7axlpo18>^h08%A#BTTE9s?=#1^_s( zZU-YGZ^Azn90S6=9b4H$bdp+~=~tLtcO5;qW`h$C zKo3Wq{zdhhM0AKM^6`n-O=%*At7+ zquJqVQ;QEcxVxX=9107EIMqr(Lf&lH2D;S8NC*)t0sP9%c_~v-%+z~D%_EM?2d_4V(?vKGLD2~iI3Av!=c^3^#M6PD zw99nv1qONYjP7NQPdXj0d#LP~M2f(BfxU_1Z-r9&3J*6uFPU;&=_qnU|9Za0rBBQ` zmWVjb=W%TuRmW8d7wTXCvz%G}Y0_8TBq~Tk0>+8A^(6WX`flAgm9IWLF?LsGe zktrX$&P)>Qj{GUTa~(}|{V6$&+B~C1R##~9r-t`KCtEKU%=s|0 z#{&>Mqhj8V4_^93nS;?j7eS>di;OMi@D^07)L^?eS zRq>nSlK2EBmlV>HTWgbP(baiEkvW-faJzY~o^Z-AKQhnH9QetC#SS3~(~^T3g)4DI z1aTaiM*udM!yyg1-oF5G%J+K zW~=)K{^eH+s31-kQlyZ4y7j7yHit5D2VZWrXSDYtqa3HxLx*GZj7F2d2h0D6r_Vu{ zLFOspoUr1%>!6j8CVh)_h#1Ak`$RA-jQ1D&fk&9w`6K=f7y8AdF%fw+>3lt9l`#j!cOR-{XTcqp= zn{+AGa2g2-#Q(M_p#=ZhQgV{O|2>f6mw}~E@3!AJ4@=Py7M93|>}^bbQWZ9y^Xeft zdXx5j{w73N*Cj82WcEe+Fi_lu;Zs7tBVo4GZv+o%qt?})L#1D_cB)Dnatf-Z>DoGa z4ZW(iPi%M{!WQPxLFVSAI9-;?$^fNl5lB5n9cLI_0byoJdmuValJ5~SJ9UM5IHOx_ zHVf{mPJ}TV&g{1k|FrS}MF1XE?rBT7#G<`k|9QE`8hvKr2f)N z(GCaEGkQJqRWCOcK@W_J{^UjK6%~|-_JVGRa@UsWBk#-N$e^eyXmgMsnO+SqXL%;* 
zxB{e$_jlJ*rZ&$zeWJX&-J_~mhYUusEKHyD3nSfVAKRbTV0$uW@Ek2n76$*sUfYn4 zGX+byBf&Dng<`lA@%lU`(WvIzzH#<9#}^h77eX4T2RMa#xF-uL%lb#t>h`4lBTsF} z6daSl?c_vum4D>O7fo0aP5Oa^mQJjOXm%i$?$TXW`>mMd2Xr3|GFraZ=Y|rWm;0;$ zRH(?j#wOR^F zSD)GExiZ(CAZr|n^1c7HmHA?!QB!{I2>}HvP#jZGep0RY(d-ewKdBX`N48)+ssLp! zc-lhcNu(V>FiC1^WVjHlW`v$tpm2jh%RsMk`LT3u;*(QAlME;20h_lr;ImtDVxn5L z1<0+A4+z9rsx_Mw+g`{$e|*m&O3_-?bBu@C5d9-2D$?c|TFY_UIz0W651)LEAmbfg zRi9?S)LBZqu2gDarU?fp5oOt&QG6XBNSx~>e0#o<+pRAu9FfavqX{?%un*?M6l8wX z;4;6EJS!3j1S5YFVLjSJVh3{T)Ia#GsC1Clc#+!bOFyhI@o|54UG;qIogC5CMCjF# zym#Gf2>ADnUO5D%zD0i64C*PPlD<4Xt~&WH<*j^BQ`Z}fhKgI3UQuH8sfP799mG_+ z1(o&`aHWa;gmI9dxWT9eJ8jx+&pkayHYK`6()Z)?`ErLF(4f&N20-9_q1d6k{8376 zC?bg}#6Dctdz z+?7N`N&@lU=iaw3U@Kri{y{2=zdg39HAs4Ta|fB?(;7dk+q$;H<$1|1Sx{Www4azY z{K<66rweWEtq9#!4fNSn85n$>%>m(tBjR=`=zhTg%lR^)9?T#`x69(-XcF7@YtYtl z*G+88IIs9-u2JDdhwiUUyxqGwrF`5XM}GSqN;~AABS3zT_*-)_i$kc>IggFmVO01? z89K+$o!R3fb*sWrg>%iz$|eW~i-NV`KcL(UM{oQW2J+v=W5USQ|0bqeq1CVI>rJ}a4R5X}#^m&w zkVA#JL0j<=$p_hDQ^{RkYBJ=u{9`0T#!7bz1At}63xr;c~aENr=9Y_zelXPRxkSEvvf z!Yste$c62YEs#V}GTr@XxLN8VAYlF$t~ox_PT6%w+AroQTgtq~KpdwY&62u`9%yyQ zklc%5tq)2iUB6%MU!k}@5D>F%b2XhG->hE@P|*?C5oi-rWoit%Wu^tlquzdr)97~a zUbeQesmq2&3H>rWwdnU=C6ZVo18%o!kgQCwk6<8cbaZxhzDWiFBGf$43yK*>FLa)- zPWzqvxwjgh(hpe=2tD)8`n^JSCggA(8`V;PzS!<^&a1PD@H^s1JN|Yd-QG5rEE@IX zmbZZkj2D$TjD%V&9{Whw;c`O*YT?yV&O>UNyWoER@)cEv*8R}*$8zNd`{;w@lU}nX#_iQxQ^iutiK7Z3)62*Y4uwF ze_XwFSd>w>J}lh|2nY;4bmx%L-JQ~nLw5_3!XO>e-KlhkbVzrngmfc(kMH@N?>guF z2iL${&wln^vDbaCdr?UU?o740a@`K-NPir>N|LQ|z$2FkX z62AC-?sjd^X2Ve|G{2QbgNXtY9RGwUsIU>EAa7Towk}4QEk1QgO2Q7^xKUcZ=A>Ee zkIiZb$jIPZ2>yX0__xcHfUn@|`Z5oL3ViEeQlQ;Snm z7!A7;mhLh3?dD5h&Niw-wI4&v-f7n^qHT|T)h0ft*rPUTabxK8BPQeuHvQXw3(G31 zs_n29NI}LsT@QG~lKc%+V*Ea2Wb-v1tyIaLQ(@Wh$nLYGr2iujb$P~qN{uYqWg{cm zI;LCdX=~VOlcY|7CV3ag4u#;i33)t5T71-L=Jtp!@E2EHKd<_lsSp$Q?~*A+{mP>Y zKArB*;BE>=<0_Ef@@Sj1wfVovP_IEwTqMd9u-a!1ODf-^Bn1S>{jg*=#ta(Bt|ov1 zH7d+CL*viIF7o^tRn!y*EDPOm^BYy~15QFWO=rAqEHXe@pO98G=Cak<2qCYSKprF) 
zS6?kSv}^TOq5Ku*j$ZP=r}?@t(~*x3Eg{^HV(%H&mRBosx-~$Zb5nugH9bO)DV83e zZu+EPE*X_sfw8cYQ_K?22_m$>#6U`_mT!|Wgt?nPJ{HW17EGp#yu&<|Lm+SAr5m#B zsuEZ&Dx3oS%=G2XFCsl&cN_=9w)7mREWN8&C6(?LdH1yYvfIkalY#dIv}IzbblLGC z=1Z{a%*S7+RMJ}{XrfLJiNCAHPbr@x%Ecu@3M=~faiQQ}RKLNj%o1_&%;c+n;h2QO z7~zz3@~TcTF>CmncL7NIHe+?y!%%P>;ex^jyt>hvax6w4gU|bT-2>=n0W74yh-b>2 zT$sMQcjK2LnEMgW@_7^+Zp!>w18`%saJ8%LAaEdw3x|rr*Nl7vUc~nC z+XIG43^%T%Q)XEj|3p5SR9+X)Ae+J*74+7-t@HiFpnj$%zEfN?Zj>Kqz{ZzNFE}<>1{}0cach%YqL8JWfl9VVG zXiq!1zmknYZAnbvE1|icmKM!JDmv_!A(Oa!)nJWXgKuxZjT z@)21u*(?_9-k}AyUq^Drtg*tTG4L0vToCdyRT&aJy^=vZ_VpJb;*Oz(%$*BT>dFk^ z(@BWB?loIxTn&LQ&#Y^DLZsi#Ii>9=%Z%3Dj`KjcE3JZYUg#)+dtCmBvgW@-N=^^H3P}f*475~yhB%woWPD!H_bq$jgdXw z+7q-Msce=YssCYi*J-+mllnU{BHF}}ybnu`N`(T19)8g)CXF5f?kdJ&Qp+LCsnV4V z8mC)?m?@BqaVB#V6-BQ9`7>(_&_wbT(>U2bDra_f?5;c_<%517+2OG0v*OBjEBmk12~=;{B!yYv&kSl8KXos32suCHeH zO$>M@wBXJA)nat)p>g|`(C|<|ku_1pFT)Q#i8x7RQnO66#Rb|h`EY%dlz_9BMDvr~ zd*jb1yZ3=lj476Xgrr=)_~goqVZ!}3*NR+CShmjaUoXPQ%b2;KMro>?G6#_mB<8Ee zRJ7-WWHiyo?q3@)2HU$(jUA?GF03vq4JDsj$LTNY3#s(^7(_NP-^SkjxhS7r=H-Fn%a->~Nlm@mDM!4B{0~|cg4h9a>1z>h^V8}ZWK`>8S3}^B zSB}K1lBnGr;58@+q8t^%y(15ct0PiIeQhGAn`m;RRj`BK>3M3@5Gfkd*4Ac{&V^*H z1LvFPhk3>zYMzt%StatG)gVHVf_=19ydivW_=YX@o$gy1ACdrt?TkzI)ps7|JQJ)6L&*I4{KrY^&$S6wSR}tD6pU-dW2P~*{K`L& z-RqEv#o;*WRDOW?^i>PZJ9h{KAeUN22w#{<>csi$D-M|b}~?J!uxj(HzjcJo~pgIcgar=MvQ0aI0=MlK84dhW3t?1U)hPQ5?I(|23h zA~*h?2GG(l9b+zG%76z2z2zV`(Wa=(=-!}!us%BK?a!@dH49AZnd;Jc;G+$s>Jy?R zETGXv`D+Vj z^kHJcutF)rl)-W=HWWI53XacYQ@7l#V{aI*Fn}uE0jd zBt#HdzLiqxdv_`HRai_-C97SZH~%!cG(pWK^iS?T3LqGb*|?(C&-t#NsV%lP4lZEE zb19dG31W7WB-LR^`FPs=9vhx60|SGH!da2h3~bkc366)_!`9GYlS#K&r; zod&~x-7muYMhH_rlAWxN6-o151HJBI3=vnDLd6w(_SJJzN*x`~Ieqsgk{^z-L}@JE z8~R@9NEV?%5nW@6&GNy42?c((Yuv$mrjTJ}Qz< zUNVHDk9TLJV%g)NEWi?rQu7d1K~%LaGM|3)w5}-fnzFun6Zgpyb9iWBarZmR)Mq$T zWhoOmLM_wwFPx;Vo^HEmOqj)y78!YdL57O(uM}&(seMc6IKTVZHG%P+J`qEAQJoZ2 zFf+m|0y#C*WPAYt-5{D*ucNe3C&gSx+NO(?=|;c5HBRw{ct?0u6(W9|t1xvS{l;*4 z;yVuLb+M`T{F(FLcN7px2Kvr_B)_t_7Dl*l_1hFuL=?lBot&*UY=qOUMG~VOr$tlg 
zLVqWoy(qne6R@D6Lx$;Wr6>}IzA#9Ko3IccuOGi`Wd|MJkCrfdFDuo(3x{c@gV4^h zLUw+?9p9e$-k8Yq>%84vp{bnGrM&ZB46>ta)G`UH(`3@ zU)g_?`fQd=R=%3pt<2^m5vGC+@na5E;Y|B(Ke%IG=c0xsa7<@?Jr8`|%h@3W6#DrSprTHz$ z{#`@Em63I_sbs`ruG70ZTz@+v1K@59$s2^-FE=bfUy3maUghNBVr|Fd$|K!!6 z#ehuxiinQDYk+n6``jF((T5q@5ghv2>xpoYZQS(K#(xW9q4W66-qtgx{78^Vq_5|a zcG3UoOyL`lr6@3H@SM5!NXMzg0ibFuyTt2|loTNkgq%oc<5av_QhE}X$`8%Bb`1;J zml(HEw0K2_nR53%31}cBACyqUFi~Fh@0O!#i#|JgqBWS$2|CJHzqurpP(tmrbFDZ{ z@=;-^mdnzNL=xbcr30;@q;ZvP4Rww0ivXfil(S>;9}bNTnjh~DDoB*EU;dNt3=Yk! zgdeSgqp3*9zBRR6d5A#)h$#5At61mZrCbhxYHXq+RK7W<8v*3Z$UU+!k~GJkDrG@{ zA&$5flC7LvPa&2lQIMGVO2N#Tapsu@A3@Q|j8Kce5SVQEFSHA#Tz_`5R|s@g=tC!{ z%4T=2G$RT@JP_^@=uYJPR2@4p%8zkUHalWj1Wb%R9~gf_F6g;hvcbae`YJW#9?hBY zrHBbl*J2{z?#vXr3lI0J20sl80Jbs1Yy672{rn9eSbBn-aQo>mc|m$GKh5^R#BkVvHfASEFC%3mdq&Vhsh0 zI7E-52V;+U#zkv8>gzD&3P+5Ml9Log6}Ywr$MN&6Lp66Iv&Ba{Tl8gVXiLu8;HOPr zmNIcq{8z#DgvW|+!R{nckGi%xIi9RPHwz)rw(oe@K`v- zeDlJsZi#|NT4;!aphL6wshs_@x>y(%9F@(XjXV@wN=|5o!qU@=XC5s}FUz;nQ^VaP zQamLDb4ohYk{*>4R8O7h=|fH&5RKfT32p;pDqX|A6H=Fva3Fu#VcR>s9vWO>7v7-1 zLG2bIq7Wp&C49+~{-mbKqhFd{i`39MO4aUuxS#@{A!T{M=#WxZ#ySAUF%S{FS?6#%L~oGk2C+)cg0j)@p8P*qo8TpnbisPk=o z!1DD+mgM0MIC@{D+U!@`ikv39TUuLgaeA|&u>j$mVi&bV?O_%qKX#(8)_r9uYOY2B zkA-%FwaWRZdfbzMouIoA!RU0_?mF8htt{B7c}qlPPJ=1i7oG(7*`J%w2HKS)KE4m4 zikBZ4h$a-Vy5n1r&Lep!kKa(Zk3_2_2&Ns<$$FP4+GzXf438H7QfVO~5z~Z1Nb{7E zL4kmbc4-jMcql>w91MZn8+UBhMyE2+Lq*<5=%aeA`v zI|gmfM~7XG8jH!P4g}4`^L1zgAwo4-%PynM{rRd6a$7-Lyk7w!j>(z%$m#f{U{D8~ zWmK|8nr?hpIiuc;NW1H9_fD=Wdt-<#|1T_b6e&jIjy4xG;H1g={oTs!ZH;;3z@yD? 
zF8$^a$$NPTPXh&#SIk!@vS*SuRK4}mUdR4zIwtX3i zl?V(@{%n`K){U|>zUJ^}nE&SmzDk8Xn43#i+WlmHR^$rhoHXsGQKCvMS+qZ?%Tppo zUCRPHSht?@ZKYOuvL?>jFH{AJIJ{~SY+qDA)zCwt6aeW$5m&(+m*hi=4dzl2T}V|O zXA1v!h*jfWy-@&~%d$Gu7IfA2Unxm{M^0FvFGX9B&HUi<&5a&Td%VqM?0e^=e5@kl zB4pSOG1deRG^Y}7nD7mc@Eh|2J^hDmE-xzlQf<+n3Eu{L=TMgiYcIc_P!^!O;62*m zngN(`sRqUelbf!>#3?%6FO;9`!MO!RBxz6VQSTKQ;Pk|!;XTbI0)R1z73UkT6%f=( z7wGR<yYiJzPo)m28Jb$_c!MhYUMpNc;#NvxR32 zV2?CKqAaD2NVI*5TGcWEuP{VZ-U4!*NsTNbfrr7ckG!*&IbP`?|I@1q3Lc5H2Al?NVZ;<r+1 z${_|*_@mkFD+?6yxJWQ@cAib|3wOJu$*TS90KAsh`M5at zl`W=Iel8e}jaCR5(9S&>*4oRWBl^coQo>TP1H#ux5}R*|m^~7h%r{ffjcC~oyD!L%zf}!@C9T}RBO1HpMeR##7i~~ z-x^PQ)|L-&@_$h%<`n%k2&%^n{{$27k@)f54w%=sd6)ZkV=q?AdpjuU<$B)uIOcLq z`Zc01xoJ=I=eez$@VJTjN`M;SxVmojy}PzxOlPLKuHnV{RA?i=q(MZuH9b}wy*-4{ zP_%%7f}RomrV91qX_*WnTAQyTIb$t$Y*wvOVarMR1C1uFM zz#}E(nATeO9{Oj6|8~56J#Ez@z<0cvSfmaOj#70I4KY!_1WH*#Vc+NYyvpr(=sL%j zV$IxGhDO~FdwJ2FepiX|;`)5Ky*@{msi_8>y9{$z&&)sR~0yB2W^ur}AoL3AD`3!D#=D6_YRe*Q|z#vK=%jx;L7 z%*yiWe$OA`*S~1ajqZlNGx9pBY}8Fi?CkM6=abLOa{PRe0OQ*S7-87gs0Uo*D%>b< zCJMCjtj>FAJYh{>E;a_qW#JLTCp#>fiX?aVdwX@d>RLCl0&0#-YV7ZDNz zE%mOY_#e)VIxNpBNtfL*ciNv$=tROH-R#g%1zOxdf){T-66WEcE0r$hw6Z(d{poaU z6XDjgG6~CVu8iX^2h2PCKQXiOZ=Y}fMC9ILwp-E%!Vi8tA7Sf^WlP%dA!yy{%?^M4 zKxvEo-C?zqzuJ%Nf?b^wI~8w)mR)%~23p_(GITyqk~I`6h#gDTtz$I7AZO?FEe;I04Z_S`S@Z6!nghqAre{563Cu z6!dpb6QV*nWeMK9SbmdGMd+7;UPjd+S9w;rH#i{thfc`-tT;r3+b=XqKg2Bs4O28Nn#U5?wLGnlya&!Clu)Uk=?LG2 z-~RZM_*Qgd{=&;5e9=&(l;WX(9TdVom<_qq%RtZ*ufSM%ZZAsh!^Vdj+T! zOY`a=b-WhYj^?6w^;PX+QlcTD-W6FQGVZT7?(Fc`Udnv*cy_+{iqC_?qv?$gFP;OB zrfD@scuL(NMO|E<4lQuHtUa*iA$UeUOX-b;MKek2$V2kNIuWyw!wT_2K4GI~0FET+ z`xmznc?ys-T7#qBe3LhmJBzMRcN2Y65f!L*gej}?seL2rKrLTd&6RH%7~%XQXx^~c zPwn1k-A2o96WIKUUVi>GO4fpQ>)Wc-rh*n=9OJY4=T$7bKFgCOV+wqE{?w(sH!hi-}xc}`{hJ*4M~ZOI&MvE7f}RFex%%#0fx zOvMzexx`tthXnTDr+X8S|A>kUf33A7NLKruIO!6{&UUPAXNhG(-Nrj z_I1m^as*$#H~H}X<+iIh+s|jIv=|J+DX>~WkX}~D7yyIfocBzWe{Y)KZB!e!7{M$? 
zQ#qt=??l?{-|Ulk*t7_mP@G!MJ|+GU^K(iPxJBocwM|q{bqu`RMT_WswfSg*-e*sV zyD2J9!FcKkNh*vGg-m?`P*R6}HmsJigIAd6gall{z|81i_T7lA% zAx2BT=ivL{<%q8~4xjj4_t?p{AHB?JK0KA3OX+A{gSaBU@=SFvKIVk1o&Gr1Wr> z+C)PnCwNlTyqcR7x6a)0nzE)hzkXjc`AcQ<&N6?rx4E8}`hh<*TLKL&QpT0bj622n zGrAVnvLMN8_^$iIEOyxOF|?y_pPu+_wL#;l7{pGqx6X1J*)`$S2%RXzjG8e7u^4ms z!2j?u*HR&6+g_6M4I@mZ1E9ul7mVWXY|4k0IVV=<$P!2d!FZOu>fbHBIp!mIt5aF_ z(^awrY9S=#xY`^!V!eV8CWxFA@F(cFx4BJb%5_@P0qp+f7j@SOaJCGI1dp^ly+`l) z!Nh-h(nc;XuXjHW?RP1S)8WgbM52Odcp$_N-fY{E)uSHvuK3Z~IEJ#esdctL$H&qe zAY&)X_!d=Jow|MN<16un2K(hO zlRh;~YSd$eWf=!>@Qm!N4(0dQe~umdHKHUxdU!d7IX1jj3Setm`%5}N&{yzSBKjMU z{eU&(po-OI)wucM@_e#KgkYV#Z|oOJ>;9ITv*6+{i#YB}N`eJ-tb#FZomzVlk9`C7 z%Zlf53ESe$__8BTp40^fN$E=1*k|4eLZNT*B3=O1)K6jJ5`Wz#d#d!Zyt>_VXz{!A zs$CsLb3LQ`t(OJ37>w#jz5*xfr!Xf~MO({gBf7(%78N%gz?2V-N0gENJFe?tV+QKY{ox!+;w)-m-R4Tta zY(w|^*_KO_v8Hc(4IOJEZDOcS;1>kDC321ZZ|RagY3x6erg=rq24z3cIx}ps2zeaq zuHD^z;pEZ&DP{(~MapgRo0%|zL=Ase7Ksvi*!jEpy1NuX}}rl;}qz^7+HN_~qkt2#d<`z$sUZ3}(NbH1X$5thap_{qx& z5s{PI#fkQt^E##Jb2KJVpjGJi$WGSN5ol*mT}ESmuNUf>6JwQt_u| z9U2=-=d0z*OqhIQ{4-%6)prGJFu2bZ9GM~eQT&?|wYX1-vC8dpL^V;0s>VxI$00P1 z9%RduEOmY8{+sUWQme<;Z+E-=(*eBe2nBTD#`b-D?sWv64?epOSVtoFhkw<+#&G@# z>PjF?jD+Hmlm5!=-=DxB7pXOy)PGXCI}eZImOEI!wtGRn{4{o! 
zbZuxGUOigNZI|r5ZhV=ouiuf4?s)!MPT`sh6u3UkNl$Y9{BG^9^U9;fIo%eM9w*Hu zP38PsR^wL7ku~Pm=M=*!?ZaWa@9vfegbh53KbM>>_--8O1n6;mY@bvy4f3Bz{+wpJ zD6OQ5L_C$DtXVGM`&y+bP~E}9NHx=>Enp;T5~2|C}~pYT4*wC)f7F_ihBGnca4I3k?da_DL-!&DS- zuU`DKXCls)3kK)(rox~x=5em%oVmo8;;(3b))>m`Vp7OD0`&mLa&=bg=E%85skiNe z#77Sn4)Ub{FB!W%VOOP%-O;vD_s5s0Wi?>;uAI;3*dA%=iN5?X+ISJMdn+bE!O}ptgoM29mWcJK{2@he0pzH{UY0V_y3a)FH+uAEY zxl<~MjVFDHA4~a_9l^Mmvx;(cVLRrQ-*rCD-BG)MN~_n~U*i^Iwkga|CZT86b&$wx z7iF~gj5SFnG^)W`c2gIYkIshM>b_Mb%9cc2K`vmCv+j$2R`=B=s()iePEF%6KW8{(FZnj3Y(d_h3Hjfvb%4+Ht> za@`l{7g+o(z|sy;Ut0VQxGF@i#syEYN2Q1OOx0-WVyXhav5Yt$ zR@>ZBi+b2XzbISS_qr^X5U3caHlsyMsO@%i0ISCKK>SdrR>w0F-EPVYCW=*l)oo)F z=TirbBpS?RmMJHmTSCkA!lINYuo5UAuyLO4VO?Z5Ud#6WQ~3Eo#3rjkjWr(j1Gq4h z63tf3d(EtVHY-oc3)Zq+j7VIFUmk3FVREK)vCPklQa7Wi?_x`!1TmZ@9db~z&Jf-a z_4Z9^K-3YAG=AKZQ`h0BWz?I0JXLua=(KDi#b2afsfvh}K@E8Wcg~NV{gD!k1D}*F zS_{=2may5CQIoPFwEs@>6`B|iMPoQCq54gh029BzJaiJcIpxbBEW?a*=cBuHs_KYe zLMsMxCaE=7x~CONFOXK$7cb)^Y~YGlA+)`vnbR0YEaB(oZq6R`Zp9+M3nblA$hJry z>&*b9DK67?lQO!d2&XOq~RM-u6hNWMLj8pK4tVuWX%Y4oyLZgwxZ5e%}PD$&=SUngo~pJ;v%}cwzB}^!Haf>KHZ4qN9=!^pN;;7$xYZ4u({j z>(PnipD$21D%+9f5K(H}Sets;Haku$N^CVm=B03vY8HAof~EQeyMXwKhb+xqJ@|!N z5~kmAdQ6^|Z)bLnJy@!{h;+ux-rvkhkA*n7sg1DIxSFRICmVM$x;!X3RAJ^FyIB=j zj-{(pscC*1!_eEQ>@s2z7AWA=9ZK*Ml3RZ{<8nPI_vV<%}IW(@q7~K%C8?= zY`qMN!P@P_ArKpxV2y~0s+00wp$yt30D=|iuP4l@dCEEIp$Uc>spUL*jIGOAiGmsH zogyWphb)ezmEaG<&Xdm4N;?F{jvXClhw}UJ_B@^Up?c6iuBFw@HPX3%OWO6XZAP9{ z*H%|p1g9ZngA1GGfg;zTG2*ES`(Bl^+qAdSqszH|zFGI`K3oiH)}9Nl=&ok=wUZ6F zr*R>=*=ghQKfY#UN-YQM4-m4U<;r4^g7D@0f8-q{y?tl+_pUmHBWs&shrj~GYw{}r zndMoDp~BaFzbS20)F62!io};$Mjq_63?TSh%5~-GeVb+boPmm<{82x-H@;&y`xR4q zdbNJ#*his>w7If;YZ@Mh{sswcd#Nyo#pd#NkMaD?QCP#%e5|xzvbS9ElUBkiV;-{2 zTfcp?+~w5X?`e|i?ii&#S$)uKjWl?<(a&D&iE3xtCjQ$-*lNjrE2N+dsCWwa5F_ep zu<^(2TU#E_WZ1Vr-qe^zSQWnCcq=_K%M^<}v0s=rtGz_g#*JQGR?x&bjaresjW0Y* zsI*^&WG{dRF=yVua)(#Zs^j*?&0$55orbILGJEnOSlyssa^PH} zOpgND)RO^$=*Jt7aN@&{mcbJfhVr9ky_5z6x?EFR+Ff@NqLL4Pv*dGHOOMxucYjJf 
z02K%dO?ZNBk!DrjSdU%V4bhh8U86qx$n#nGu|gj++Ile0^yTkAeHjmY6Y7o9i%;R6 z8NGa2dpJI~(%kpx7Do2h7dsvPO{OuU-_ir+-!%#j@EvMrUmlh(zO;;Cxc06Nu6znq zR1oNCdX1}JxMBg%K})P)c>w`wc<`+-UKtwqTfp5W7Gl@5c-&4!>X;w5ji0ab)-V6L znvzp>y|Pm*78{!^uT6=QOTvzDf>Lr2#OShgrCD&Qxz5m!1nS;_+nv$yGD z6Li>b@3K>-N{o59KK?S(b7Q0IAQfQtW4TeTQsHGWvHmD!r&WzJre$`*;K#PpW9Izz$8Bx4 zR(zFLrsaya=F5K(9g{FU8`6QX|K9?zJAfM}kCLN}%~^HEr!Vpx=8YQbS#x z7~;lX2XyjraB*qOdLGRw;Y)v*4On+bxl!v3`0R6iRQh0u;^OLBcNueR+9)-WJX_v+ zaaL(mFHc0Rslr!kiJ`TY-6F!{p3OKdr+(s7Dsy~|YyarsFg84{eqHS5B8R`YxM?C7qSlWg?l*&QA@#w|`c@%aDT%fd2lsxVV~&M%8hUwF=MF(;?B4Wb{5i z(vef#PnN1LF>^i9!N(GNzEhWw-l?~;R#jZ!iL~qvCMGUJ97xr>*1`|)3Mql(S)ozu zSj9(hmSG&bdCESPhHDITOib?UwQur}v9{jg=4l^g;~ITpHi%D)tMzPT5>q!*SMQas zy3I@h`S$rQMLa^P3}kemR_2%PZieteJ}tL7)(<*2_R7L~`0yN7gjdxhF)M5WMzSNx zslE_UihxXt^Q7u%KkyBw{K@DctO(iW!}Hm6ej-|Mo3s8g{cw(lO@% z9_!uCD;9FeCO5yNF^O3cbfm7*k8JlQ^i78>yZ+WHd%IoPSFE=eHNj~9z1=&eka+sn z5s3m4?hMIT|0u`#pTP2FuS|0a(co|Siwl0F3Yq4r{d@e$CsMo!Xys9V?)kQN*`Sk2 zH`kN<*S&L3%52VPZSO^qT%@-RyL5vE@=X;b@)L5#fwdUR7`t-+`@wU|BONIK4ft@8 zZ-*;+NQ(DUh##7tq1&P5rIctQ*w|vYD2q4gFygkkS*H4Wp((ADwo=&+>J}fH6GidJ zNm3urtpKgkRuGw~31G48Dlg_pQqv#n^O~=ZiBwS@`D!;yQ6%u5bT^P)DdrXP=gHwU5CX-!nBu)?SFxA~<9uAbE0=cnUeHe+m1o;Jb-InS|EVdQd1-ang;s zd2WXot+u}Gh*L6g&oKs=l1njW@I%N>DcZGwZoayhRZNZUr{-T{_$r+->dKqiqn~Zj z%6M9hTy-Bue_}HuEOX{;G2IG^G7FsHoX3v58xiPlD!)1 z=jM&R_Ng~D?%b>;6&5wb@+YEeu*)X}d6=>HC&%})l6BRZrtTk%G|d2KUQ)!swioh= zPwjxUMHef!JhrFTZc^Lq%xl`qVb!-Fl`Y70qc`=n!d2<<#6jixcCD%Hhk-3mv0Jil zjpJkeG|HP}3sd#JU!0v`f+28TT+*)0)%+f(-Kwxjf{9w@Pl!ANIng7aL9~ton<56s zj^aNZTMAvR#cJU18FnjKkvea7IxOa)8C`Vga=tv#8Uh}}LXRH+ea?#EZlcqLDsQ(e zub)y3jc>S=E%0y?;@=DIC##-7 zSDTl*V)or3esJT$7UdCgps;kGTt${{g!T(A`6brXek?j1St#KJr3 z4A>}*=lD?B6u0u`C^)d;bcE2yyZHU@nOd1`V`1E~Iy*x#6FAsKp^H?RN9aSw*h zGRi~DAX?h|W%P@vdoPdf5{f_QWdha}TvY!hA{2=KDK-83er~wHaWMXS2!E$w&{+y> zMTP|VQ3j(PH6GUUN>}Q2F1#hUDARRjQ=hAiiYVj2RVN3U=sy$rZYMIUCPU?ae;H~J z1)6MDK1GmLMl*1AsinK4r3#)APdBu*+hqXRuXnuP)+ILQnJy?O0MK-`hUF7L(}i~0 
z4{%6;Zb~V2$uCrOSAooeCUkV=LmYp#n~fzz_uv}tn?xAA_1X`;@3Eg}J_SJ&zx}Q- zsw|^?Gx8{kO!NOQ+F6R!ZD3xmbO)Z!(Z%iT8ay+MD-IryWWPvZlgL8(mLNPnILelomNmN@<*hn(^`w zb%xQe1lss_Mwa=n(u&FXFOKhcvkBdSQA!FnbwD#*ijj{kARz)|QH4|eU9m8;{GXfj z*d}J+|NWqhsh=dvwa$>}udkS`=h$iebkv#LpKz>IYwISn++@s{v($0IS+3xi77qq< zXXvuYzx&<0WJJa%Bmh*qz;2{>zic%%H5C;jLtp7s=$-==<%vweM%(Ek!X5l2*5*GA zGycAHoYQFb>41U?0PHLC@8Su?2mHOBmrI~EbRFXfK(5yCMPRT@4Go=Ae=6s_t%Nz87D7a8UXw%<#3axE{=+}`oAs(|80vDb*K?8cz@>0 zwXapF(9wLB{^JnKjd~2hzlOFB5*_E&<0bgrbM{e1TAgBH;;lxT1DrWHL}GSFI;A#REOlqjLm zrwc-Lq+w~K8`-fF<~lX+l`jDg+yL97W4YPY`tQSa9IU2VyK-qa#n^$Hgx`f4ql5?) z1f|P#!|jk_nuD@ytM?K*z6+rP#*fV#eOztfNvj{~oX!zL$Y*lR{W2mAF`ysaXI(T= zj|MvLhBi;%slQv{2fh=w~Zcw^=)ndTjJ{fkv z%)k)2I~Dyj?1g#HF9BQ%_yydTn0uE|*YgzE+7r*X57Ba|pmsy?G>&cOVexrFt+2;+GxbsvC&5*aXEAE98#+`f z{$V?pV?f73E9M^Zxj&i3U`6030~i8Hmd^raEf|e`{BAX-?;*5m8_9`M%AcvMVDkIh zIYb=Le0qQ3BA9Lo&I$*b>zjcNDNgPLRyqLzeP?qbwmP6v z?xzcFz*C^SA!|GBcoc9uaGKMzCgpQ70rZu~1MW=4#o-;~-Fl>)Td9CUFhat?C(wKz z5%)$!_^{)({c74{v-)A|pmZS>F2euWioJhZafyLT}|bkiuf|EVy7f~T4X<>F&()-i z$m1~=USWB4$HT0SvHeU*oWYwtLkWod7a05QAT#Li^W$lXp$E|=6&d<@Z~;FcQ}qWJ z`oaUa_m`cagfWed-$m)VxBg>$?Y&{o>lJC!fIEE#J_S#8+x11@gzb*;u8#^B%Yvj~ zYS@c|qTMIAz*7p{uDHGtc{)q8&Wgmes3?{e2KPq8|1*5+Y+_RD5uK=NHMs=xz$YAq zKc*f-P8Qt1=d0HNa!j;g91If=4lsK4W>}WL&yQK}nS|!a-1*>tlq%Umpv3b(tUiJn zo+fLU>f-^Y{^w7t4xL-W<>qhg9M;U1dAn)#{z>k z4}zk`uJ{#FD%X=z(!B>rJxyMEFxV(S{@1fQLhcF7dyCfDJmb51B_wq?r5-@xxXt%Z zGu`|5{$t;G3QPe-J2nUA8Z8YnVZWwG-LB|~j8&#~Ut0Q@ha`-0XpST;P-v$-zjLWx^&x4*(pkl- z=Zg_GYbz@k*P_Bg%CmZ#NqFGGuuWsVCJ7OUKN5=HMkH>AVaDUZn>q{v|3O*cW9e`e z*)sgz@mL+$0Y!=7|M7-^*G1v-ueiKp)VdWH_}Gj06b*r{O>!5q)=M4;UHy`nP6@Oc6g`yV z+;%mk(9epL!FD)Pnr|1;FL1w;a6Z$;AU!0_*f>cf1%hn%!#>WIC2{uipaqq>j7X2V zJwM&g8~Fl}ZKe+=dM)75UTe=C_$!TKWv$k5*NpuP*R}4|2fK4Xr;!_K6LVAGk@S7^ z6C6Txei^{u0%3b>-kdb9=jnG`jte5*IbE$+635D(B+9FkHDdl+3ZLGfM=%1DmtqyA%bxDG zzm_L19B)}m#)Dl57tInbPQLNeG;scz%tt}mYIwfPe!(7>eod_97*NT*CT*uq-QV%2 z3NM0#m58CO$*?x}!n2RW!! 
z%SZva*3ZDgY6>1#CgXo^Un9Qmf14sUeEdFr@F^S(h)1!m-fj}pP-1tQGW0}53dA2h z>iT$8-6`u^0HOM<%Td|Okhwpa1%#HHljZPf6x%FY*-o4fn{Ps)+&A;5ilDxT3&|j>{xtgGzn#k=TxoMC{pBj z;F#AS0D`aVe+A#6Ff(aL|IzFID+vFCiGg3};pxQr1$?gPo1yP{>oz%2 zIkA1o^1`fLjKlflac!1bc(~(4 zl@|>T+DI(;p+diYc)|@;{m8?ox|-HsQ5)QQoS-6{#^+4sg2t4e!1`@=>>P5TNTYFY z0Y=@h`U!|tXHi$Y2@AQT#~Is@Z~(8dU&ntqC6|rbO`lTcOWM{d|0zJ@W=q zHPBn@_5bnq)=^ciUH7meD&5`PAl)6C?rxOsMnFPJIt8S=yBk3|MOr|RPU-IN;*IBd z&N=Vz-*1dP)D4ck@B6y1SZmI?=Hg2Vbn;sWpA}Q98R0>`wv!xu>ou*eS^Xnn%QWlc zPwg1hd*lN)hAe|xGkJ}4u6U}GlB-#5e!jAwc(pb?QcCm#H2i>G27a3XK|3f$L9H>B zA|<~dijh9hFUV#;evXaxvpX3ekB&i3mGPg_GKmy)EK;_U`LrjWA0L^-$|MoiSE1jk z2-1R@Ef&@oSx}gUW?9}`M zH~Z&b9y)lVL<+5-mqCu!zl>y@VFGN&A{IX!wl~UtVd`TF0#=j9C=WoyTq?6(2n1j_ zt;~$*PE%dYjc4&rz31*Rd;uBl(G&u%CFjTJoP}d4UA2>!V5AzHj~9&m5uT} z^kkK?n44}7S{3>B*;(gtX%+b@?sOw~F@GPNCv64WTA{~Wo_c03`IS20b9Oyb+ibqrW^=td;=K0|Hg zHptG+=A`pvZqzRS)EJ9YYJo4{$<9TD>TmU>6@GA#@l-jxyM}skoMiCRdoA^acR6+F z4EVHNlJIH2&NF69>5eKhEvo1IXY7D5#h)${LEFPb4t=hO?$RrFL%yf-JQ|?f{=kbwhx_5?J*Wt~~ z@UK+MbvSm_OUU(t5u&r-%@q~lF0sJa1}0|}kfvt|E04k;ewc3dEbpm_O(7}m5;Hzr zY7Y@%C^Lf*hA#F#>!@rWLit4mjy%@AV&E8>n)=$1lZ|EQx8imE5SWHSJWLjqkfjnu z{_Cj~@;RWpcdAI^V>S(ZQMEIE7sMeZ*cM3xmA%mXQT8&B*=$aA`vJvhZ-GUO6r7hn3R$xpIPBoroY&jxTu$EFe;Ro zkmpX&Gz~&Zh%hX4`O{iO*mR0T0lVrJ?Dml)B_%PbXo8R>OC*fOLP-hz{jlx&Cy9Nh za}OH;OEejP#OY*c$d_`L&c4o!ap5=H-S^8S4|qi}7p)W&8IO$wO(*@BNW1oP4Kg^7 zR#Mzr&#W*7$}>z!(O(BDDfQ_`^G!)HPz8V7`0Y!6HF)<&O)B!0i`^n!B6V1I1;U2$ z$ncKXx9)SooPt&(JlWs2H!qhSkD>%#o8O&yyBtLUvt8Pvu#wVV@>J+pe+e!fps2rZ zZG+HR<$>>2HTavCst>c1&HKg$E0haKxswp$v6nLdru_`93lWLi zO!iK)h{7ctYDEepi%$VHTzHLkgAMLrbV$Mn5U{HwBIg(J%gA=62X6GithHRNhGV}m z81cWXx1Ob%Ky0KUQk5?BpGEV>Ky|he_bH}nHuSj;Lg3EyzQ9#{*W=G}3^qzE)37S4 z)g9pXXhdZ2aYMzF{PEsBA@YukB??kwvp&#h{d@@0=M^amB9ga%G{8ytbS`Gf-;@|{ z*}aLpeX~RlY}U5t>r2PO8n^B*e~%qhz3oPEJC0CQAyh7E`|VLlN`%>#j^o zXDGRBoO?Gk?{3yKNn@l&e+2pbw@T|j&SDs6C~5^BGV@w|_&OXh9?ysCt;Hy`46G2R zo>Pj{X2$0joBBgn(I^gn{@q<~Lj-qem=MzHK=MONOH0G&RcR+kNF)uNQmuyNBP}JG 
z7h8adTy#ZzoX)?CjI3})a2(VYCMdR*18X={tHqaSVzDaDAG+vsyPx(RSjD78-GDs4 z7w01$KZe!0 z?5NCivu;gjAck@PI-?Bgt=8)35!$29&lR@ebY)`D(rWQh8w~Vv#K%(>o&`&sRr?@ak^t*9fy&#|&5d+b~p7Dp7tt**K2NpV=DAwEv1= zvS2!k5;_tbQ3DrDN|Q?y9{b>LR%f@!YbAKK{9*(p@8Ik00mw-Xm%jx-N^2PZjA6_b z*@Pxe==Kgl0Bnuts4mOJYC`T0_XY$XOWJwJ-Weo}*lhz9(TWyPH- z+*!Y#AO3#CnNslyw^4&5;bHjeMDL333_ax$Z544~K^JZXpg?BH^>IHjECCxa<;#gk zt|f~K*k-vh+`_LwaE9|jHhV?7M~sR|G);ellWd2h1# zj+xJWa5{^-!NcE}sxtZpm6icq!V3~DjXgdjYz_PxbWGf%(h@uUgkqnIGN9RH6R*5xX@d^pT?#Gs$mvNG^VrA`4f2E}75CBO4`6yRdF)+#k@RZqqW1e{;QfZO&+p@K z{6K0iaA?suhHGde&Z%P3#!6i=Lw~y6v_2)b6~1Oztjo94vRBdV;r#e;zdn5taeW@9 zZ$e(Fm8bI8yd;pv4?UZp_)W!O`=tRF))i9LVo`2hx*OlnTBbISikTU{I=WPZZaS~4 z)~k^98Hzd$fvHNv%(7|Yz6fPxeh{OHknYaaQp49JD~cqaEr9;x+#*Gu8j7m8O*^}( z_FtO;&|YLeS%8S^v6BbfNjLi$k(v?tI=ARuna=UBRkBhCAZDMQCw7Z)s)&^vsw&p1 z?{IO=gD|Kvb558ShJA(43O7DjebaJD&SMTK!m-0W0bFKQjDHqzSs|HX~Liqw}2&(R3< z^V}@Xbyplr6-bX}_30_}b==k40vYPQrVCyD;}O6n^je%}{r-?**U`Ibzd`j=ABRDBjZPb89FfV{y-3-zaz_e^X}%=j7Iob>tbZ# zpT8|-8OXBu6+sXzYa6Ikwl(M%TMwW0R1f{xvuSChk>%cVejdL!S9NDar@=WivUbgW z(g5y&ELvZR{DKmH9CvOzGVtu{4HNR*z-%*}WIOR34a(*Q1-?2`11tQ0Zk-_g*#VNd zNbme$v}W`&hq(}T0aQsvlU#JK!8^bJMC9q2=H^{l`9+Y{*xU_6^1tV|ndS!mAajx> zY{dCTDg}WiJLKtzpfZ4Mg?&9YfMv<0<~{_`27W6?uNYHQxQ8JLaUMxee>Tg>{B@eE zu|lcS=rD>_rgazZ&u6OdE>AlDdFdv*D85&5pFiJ%A7gIfo5;qaF!%pdDjE{|!% zGL9TMNAXTL0jX-@3(M^FA?aZ!i)@3+JJGENImvycN=*qggp*%xlA($SW`xF*ackwR zPX2$SCV>b$*s0DFHZdSrti@o?O#yfQmVUch=5LrS(5Xg#q>QqF_9Z$^&ancX>)_Ja6FIB@lP7# zzYK1g6qLG!Z9RHuecAVf4|P>#ExfV!Kg(vU#^0hgx@}7tc-gEE<_5V*zFVjM724A} zyW*e`yEsSg_$wUoZ|t^&2w-(3OB`Qt{|bFQ z|JNsyFazH9WuJ7}U*e$uc%yU<=pME6islimzYg5LN6`-zWrK!U;oaXT&VPM~Ux_5R z&XV_|rhWhSXn+#D2=HY69MS0ibzy%SHZ=ioo$Bq@Ol*Hi9{zo+fx<>lkbcBxEl`g9 z?;-m4Rf=Zqg6k~W)pt&={(p@Z)YFg~rS<%61^xR&)T6<5svo8dM}S(i|M>EMzd^wP zLk_h@q4EE^)gn)Doh6>LdS-t+lm9(lolguK4t4e4X4StwQMeI%R_uSh0RI1)Cdz)v z|25GtWsO1$rny(ht1NVe(S1~Svj2~P2t);=(6#+;Kq>H}J(Mc@zU6Pp;hg~r<*1?1 zG8cO2f8Fcv#ba}n+)X*eKT3Stj#|`dY3Ge&r zed~6(w7+LzSHZzndsSHV!%|CSQ_1J+Vo~)1;neyDE@sMPabtepN>%c0_QE^z=(t5Y 
zA@?IA5fQ(B)I-dn49&w}`Falz4{dGjg0R(hIyyQE3JS+Q9*JoRKTKb-E}swtoiYC7 zvS8joJrH4ZTj#Ojkt&74cmWdVa`AoUbVM{WRk{~#xv>1NFL;d4rg%*N(u z>KOMLf?>{Jc_S7TB_JEFR(lA;8D%qDU3iz}Y0Q%fu z6u=RWfb;MaP}@uxfJjSA8v^zt`I0~?Snm>-&CG$#C@nKn(E|t?{Ma%6#zuqnBixKe zj*Ss0h}C{q8I)ejkfJ}}$-WL~3LwEJ20ZMFxo7EmG|?SQRD;6t%Y3P}s@tFvzA zcx>bZ=pL3opz8+iD-7m=(b4?ieAzE4d+N#7)$YgdPW65yBoPWsK9R_LZhww!BoHO1 zJ(4-7O=m}K_GZfUJju_HKf%G2aToKFJF^sYd#GzlMM5+QMh6Cruwm-n`CkQ9!>bf@ zb0f;8l!lv5N&Sr+0y`xN$yPISEfsL}7TF!SV|*nblg@nS2W!L=Dx3|#|4Z8K*_!s`tYO-faWY9er0xC74I&3f4GFL z=ioQ*ebBtMJ6$R%4?I4ea4HJ4T~gsufy-Z$b(WJ)sE@I-pEI>ncEFv&o=Kev=aGwzL3t0thz1q@9A<^LD=x9UUFl4WP{m z0E*-be)3km+-?^ON0rMowB5jCrYYZXe*Vh=_?5 zg{E>na3>PRyE{PhAkc8gfdf1mft|e1Q$c;tP$Q9a8*B!v9NAw$ZQTGq*}iGzRTNr| zl*`f}b*5Z$2GHF|KOco5W}jBq)B6S3R542Yl}PqNSd=F744)gzAp|~3%Ag(Q@0M42 zn6?1c4+#rXVf|bZtND-fQuyQ3S&9;$1nXBrnw2IKqp$Y<36egfA|(~se>j^|!~i%| zCw)Q@H4Oc3%!0L9!Np3jo6l03QOBvm?ZAinVB&_E3UF`j-$OQC$I04z|1`JoPc>F@ zTEsY?G`4JwS1c;j<;gP5M0+l-t`gk8=04IP``l3Y4L}9=z;ECI;ABnW_h5a)V=)Nl z8Kr%40?&y`&*-3O4QV>@*!OBB_-eE5s0*Ih>*yOmgiCfogNG29^C9|5Jy2NMryaxj zWB7-oLsyERcT1(X5NI1Fn4Aq{FOMrOwV-WAn`_0_8>_qbfU9GQr}FYp3wH7U>I^0c z>nAXD=;zRj!Otfd@z9;(Sj85)`cSmhHULpSDAD_|*?=37<-t7Ddj^O4yyM{pjDiA; zz*RVIa0C>}56`0!l@wH#(4kWp9t41*;qG<=hJN0NDOd28WWVnjWDY1ruCc8mpMn9x#1a zjh#>UKb{y0w7XZFJYY4^Ax1#p(h{_DFG1{&I{V3c<;I8(}Lo<9V`h7`HeIoE%aZ5aY~YKi~KIQM9ohb9pD zBn#S}{giN6`2n74KGJw$O2I@z$h_*g^VauXghAjS?PzAAu&03;LKFg$bE+$ zO}8of7>|n<45N}e@(SYCEN7)jIk6gHrD^cb=q?-eM<%{0I;edLL{)NBZ<;mrsn%lr*W2X%$U@tWRyKLOH^;AM zk{aW1Zo`IMw^Ek8&$w>enxi^4_c--iTgYZ#ZbM=kWj`OTk6utD9@a#a1HdwDNVqT8 zagvrrjHiS{Ss%Ei=m62qh$P~LhgJgF==6ILs{^BDyvPm-Dn(s*lD&~mP!tn6K&iYu zrSGP)K#Hzn(r?p8;#{cwGh&XH5%$NHk)U`b zn#BN1^7cpiwMRxLO;_xZ?ls`-Lg-jC5vE@-W=)Zh2Ns9~*yIv9Dx_^Q3?=5Ke_+xH zxcK(Eh2Q03uElAp%CFYhe4)_RdoE^hz>Cb8M_@5@>8$g`X3Hw;8toE~hSfp? 
zV}*)%W^7b&ouvB>0bkA=bhG%_WFQBeX1XWI#R*mu$)Drg96nn__^HmBcguoAi%dwF z^~-%NvS$+XUkOvOMCWg;R4cn*?B7Gb~^L`2pp*ku7R|9ltaLQ zaZ)lV$}Qv-=srva<;>ju(?n1GDLfNJ5M(27_2eniX3#qMQs$qI^Vc%Lr3*Ky-e$g*b zV;pPP#LIW6w_DUf%s7tWlXlWiMU~e6TRQ6JL`fnjxe=zxiw$)E{e!${zRiQ}fCobj z2LrZN{e;joazZ2q!<3bBO+1%w(ZErk3=t`T!ZU-d-;bh))4fU-t;F#6M0SAfzRoQo z7k5sMUWI^pX7{cBr6}*7+~UulSWHeO7-Kt& zLb2S_anfcP0tr8q?hNQNFSeIGBE1CYtSlHC081+#A#L<^$({E8xo z!#Sv&Ax0R4gz^xF3H@BF*ce^h+%p#Pu5W6~3jlL&ck|aorKY6jq77~;pa~e7MNkgi zJd?H|L>z?8j}um=8bGp?YZi&chFy#~U(@o7x?T+n;)S?5++{0ewPlQJQYH^ro~%U+ zrL^aJ;mWrEBgM;t_WLnGF8!3Ri&A}DHCj*RtYk22VO&Ou#_$l{Oo;m9^2EQF9uj1%qCI124+DyuJnKR&hi z5u^KuUtRWe+M%ok(Lc%K)Ruf~hwAqN*ArR-n8~ZQPlzw)C>(=^*3g&6aR^7ynHDPA( znt*d8YXmliJ=ln5weZONc#_95V89|EQ&w3el_lk(j(F{ahh2KI0={hv&}J{^LKtQ# z&R81X+1BhYGy>QV8HFtcWnF^k7{sh41Y?kPT|e=0_q`bNl+pc|S$!!ooad?@)E@OL znkRDu%-eb!P}Hete)g?y`()b@+iW%OKNsL}k&W5NyNFutFSfp46YlA8ByqwgC4YN_ z5YPZ!7F(K%15u|UM-!ARQOx5#!J@TDa^&q1{}mNMA-Op&^Y_&U;7$b)GtyC4O_ZY? zNq^k(ReLd0d)ZR*#p;ua!)|dQT8rz5gsB`iy=2o}jBpERmIDNw*-nYaJl8Nn!Xe2j zQEFA$m2)^lv2R`!|8F58$V2}pi4%wOL#ucm%W8pJh!SD}mG_)%XRJs$_l`m;gVPp| z$MF*qAs#B5_p;yXn8=WocPup_I8_LL!~6dW#(|;$yTo&UG`9b$i0~m(tgTcC{Yj=X zG;oz`+5DAZxHiLO-e4y6lAoMKTfB(Fa-!!{iL#W`7KPL)aQr&DjT|SEUkUn`dd<$T z%l}^8{{los{!iGBKem1WK?8J!IqG#4p&ljN$N`~utJIP^+_&qM}11*ElqJ3LpOz(0ppGFBo|rLWL% z@aa1{Sf@N_XmvleoF$s$ot^%8NCH=tGT3R^v_)>*7q?r)!<(ZzDsiWISfCCv?2EUF z>Gg=*n{pdY+0hh*(}=jw<>TW>(_7hM(04oKQ_#@_j#wDmt;)*Z@i_F_H#T)@Yh6Et zPb)8wyEzPNNXl_oul1F5u=*vtglb;h{-TU;X7Pv(Eo!gN zE{29Cnuhw+ORq$RVg5R1!s_5q6FM(}7TKhOEgKFZmYIVnW!cI+A9os4%0`*kI_M-TvNQQl`5rTWmjgLa%SK(0cPZ{zmwl8N`{& zMm&zs)C=lk+v2Xf+-mo__B*PrrwWDhwzCyi>u;jy)e>B-a)q0HiI9j31n%oDpJTsA zRRIYGkf9~IQ7?R9Je_?sJS8mEXf|6v=Vvq9HUa7KZ0PK#Z1v5p1#`K^!(Ti0Qr|B4 z_qohfRK5u<+}D4&u`IkUNtTzDFyT!6his1&#lcqvE*svG3}7pH=t=x@b2&i`Z@qg* zj87V)oX4h}oCuDOXNH*?T@K2qKX@Jr?C+K`N@34D9pP*U6fYp`OE78tY zqlvFFT!+iFMZTM^Tyqet}?{6e0%Y+DP&2~8Z5-8KH!;|RyXjcmh6d^0;HRW$#LNAUG#{#ejoP6b^g@j1@YZ_dw$ 
z#srO7#PNl=H>++(CR@%o$tSVvm(=*)C0#+-I(8k4y}!I*j0=jH!gig13$}OT zvYkx|Tr#}DE6g?fo<_6guad;2$feqnWEaTGIqZ+cxxzS$QM9wP*y8-f&g@K4%3laVoONa!J}Xwmgc>Hg7*PHll_vMzkmO?6HrpNPIZN6@J5*)fA& zQFzkO@+W-uq007(M$^0a>GGl||CF1!kO#Qv_tg=ie~nSqIH7ux(RP13{+CT*4vdNU z!tLd3K!f|hmPe}X8sv5$_BrC#@S9^r$0HI#?5!}Lld|JU9CKZT`7) z_Mc|kUsok9jzgWMqPNsGRU4DeNVwa<+ySSjttI}X*VS9Z(5+U`bUj~^$nkmJB6+Th z_SU$ht-DUThuPahK?P`Xj85n8jxk-ZzG?l$vxrqa{b3gMhCZ}ydkU{tpQHmcrhMk7+spv!6WMMY~JfK{`GZ59gXX~%>Plh_KC5S=%PXAV31zes+$Bf5=# z@r4Emu7e(<2)}nc&cv)VDfUH--=1-F)sD?9eigitx+L%JwY+oh&&{o)uLvw(+5q5d zUnIRoY|+m9jHT3}Id=OBJ2sD+hoOsU%^7^VA1nN)@fY9u6s913(RxzstE$HOYj0Sm z98!6=2xMFoAReg4QRJEc(%rB?`}#H7A!|T-=K1df2w|RlKjgNO(OC9iD0r_sHgJkcM!kiM}YxUCHRM zcl(vb8iKxSo|D-x%zAl4%p)uNBJr|0wiS0gniDk*C(ES-PP^beKTZc>Ysbf!*zq}% z3370T_lh#lH#sJ{?K5m;$daUPTyMG3pM@~$hH^|wvRa}gYScx{l%GS3Z7Qf;XYPBN zrgB!mqWOx~Sal=Zul8bxZ$(*{66D0hev_>Zbo4~NU(N~o9*@yJ=Cj4U_2!55h^|*! zket9|A?mS7I-73hxn<#(k7k?~s*prH9u;D(H3)C1<60do`nEhV5tG8@V&UcW$mEUq zl9#@9`Plc1U26(XCx!H|H_`b90{2Rj1vk$!ypb*`hr`zx$CcG|3_N;yEKtU6FFq?a z^!C?Iq~$sqFKjX@VDKtsIwQ%~^-J=-|MNqvB_~V}wCrJO7AGFLzuYPLoyho-P7{M; z({_Twyxp_mv*OG_)8g^ER?aR1vG2BZAl}4?qR^aO+c++M?uX?2Ek|pY_*_f2_Kp@; zj>hR%GIIf^2aREadGxV3Q{2o@&Yp1hC=$;M%ib>ZyFUdHR!& zUd>2Vpv6z8Vi0D_=Q@EOch=&F&g)1*;Xx;fljfaR5wt=QaEX{Al|J7~h;^y1Q+$cr zi*+Sjy+0QYm7|#L>swG3ZBkDJ*=o{#PfVXU;&zy$SLPN=X?2k1rB^Y<>zG9Jvl4xw zwm+=L*k<8zJUm^|HMW?@ULwv|ENr#j7T@4pb+)3s!_0h7pvg8Z2TT(nFz6+^g4cl#Lf-488kUZZo??c&auGqxtn9g{m&?%wD_A{?troHES;9ibV;SSA zoR~^iYwyb+pJ9io)1Oc2Xlq}mY~t-Fj(T3&F6>s?2zZwKuF&kOG$FsRo}K1L>eV|IaENZ@9 zFM7tU^`Udj%}5A&y`rYXay+MpDItSnXyP=XvxwMt{g;Ax>CL%55{U$E)Or8Ii{gW= zRa`-)?;JB*8Ls?-I)`yr#e&{24wH+JpB$fb;!T^2GG83rZ;!HtN+95$dE8m;d2Ce6=wm{`Pr#>yKI=tH%Aa$#c}Nml;8y z4IKQiV|z!J_UuGmc^!AKiX}qg(5|_sC5n=)+IAN=H+NL=UA&KRXd10`S7mqzcpXIR z&E zH2u!~Q8#RNITD^%IK%d_hkd`8>sa?}QJ>0hyhqT3=rmEQL32h=3pB#7Ct*Rk_*~pV z4^3KENBra1hP!==k<%F;zE9MJ%5vruTubvm?kzO_=5Z{2+srlQB|Vh&2u!KgNyQj? 
zOq84LGoAl@7U%IuUON!D*){GJYSo|kv>P_-bAPrvYq(dTON)43?O~U$Fuf=EsAs=W zn`azeSY4U&5nUwEeYY9DQ-JZyHZJ$BpQ+H>fWUQjv(99d{2U`=>-m?1#OVHFdpi+>yMLl#H>TR?7>0}?$m7P5sTxzONj z^LvHKEosc<;reF$(c29FzUj1PSWglTzV34h-;|6N`QfxqnwtBBG!swfFO7ubqMIt3 z1*Bw0okk~ii)EXiF;hZI>Msyd*sHN0+1>I8nZ{3d1oU#-|FC9uQ_UjNCQi={4y^Iy z30p}Go$gwjkk>?bPe}h+AV>aydldOtI863_af9UZ_{@U3);Oub5R?^`{W1Xx|!3n z!_tvy zb80`z_EnBC=85h<_9rT(RpLw51Kkz?Es@|rRXfi7_L=^|3dH{=`7eB%mbIgvVjm@0g@Sg>sxsj|by4M%0&4jan!nRqSq)*rn(0Lr+4OXIEiX-d+kjqU=) z_vKX;dXZXv;qj24y5%U-w-Ft3{n0#Vs?d7(EOuDXO~&r!E%_|Dv&&npNC7{|Z;IEBpZA{s?l@DBt^GR?j*g zM}6Z4h+{>^!^DDtV^Ba<|1D%!va}z2@tI&>w4Yx{jms28(_ zvjQSZJ`06iqRcv@Dc{#kzCetKpuW=v1$kD*#fgpt{~F5ECBtFdC!vp>i{O{BCic7h}8&G1!dJ@nbz9G&TlOt=vo@-u}0NtPHtz_p1Y6 zPpR_y_l>odZ16;i(`&ZDNIW4kX?(Lla!UJI;FAT>cyJ#{{$Dm9@J3i0^~VdcC&ZM* z3x4)G=#=+-fHDSLWjt{ z<%>s-iwy)lle9QfY%&~}4asVN(4MUa;d|82*SQLE%-dFCCaL$R=0f zlNqP|Gt85F(#}!gxNO+wyLg5Y- zBJ-x>9W~3$nuM~A3$yX3dbe^FlicMoa>l=>7BEr3_2Ol}Y*~=72b#O#WZY2gBgu_m zissQ}i+*+W_-N{Mcz7k2_%ICT=w#iqnMP{oNuk$qacfpH47hSQ-_VhBv$r-j-xz*n z6cjAbZ-uZ_OYGD`5$eOs?-3Q1|NQ$IfWrKw>3~pv`umLi7x4KGnnI^MOy42d&K-SV zWP}}=nU0Q*iRrtZ*xZW4Q&~9B8HI+1g1#+fpi@ig^I#iJU74^9g@TdBva&MZ&5VJC z<+lBU^qa5McIWr>)YR1Ubf99{pPEu@U0f=P&Db7rUTAT(28bpev23adrx7#IViW;g z5b$sqJ>PUvP(YG<_wL=v$;mSuVq$wiJe%a_wfxCj`LGR{Aq_$-C=-*D%`AYL)=*Mf z#?)l+akSbHstgvOYu0VrXnNj6%#W0)f_kFAe({x4p_^~uUG-7MyJEO8QWgcs-nm%k zcD4%E6bEZTA3h>XpPdH=D=X`j*3lgU$9GpC*a4pEj_nG9*Cvg(`&V!5e0&~&P^y`9 zQSb{6+97LSXE6Td5~#)m!V+@?>Hnk#xMQKcm=aycUX%Wph7?tbuKAc`9Jr~!U-IZ* zt_7m;+A+aaplku9&7FCxvLh&D+ZCe+9t+06q&&MGaZIDg4^UO+#m1>Zz7N0!C4Mf{&tX>oy*E(2 zjq#t#4ZepQF9CUkEMddka^r8GT$V#}XVZ@xKMu%xhE`V+8A!Dptd?ZdI*$AB?YeQ8 zjh>yi?flxMAFa4{V^!P*nhy?V(PEyxvRV+@nKaNYoB+{;*C@xV`@S(pu_sfh-HB$)BKF3J9+u}@@d@1QG5r>+3yBoNV%zw+)sh`LI*Gx z0O~fX-)gGv7jx!&Q^n3euvAP1>K2_qXD0U_bh;#7x8A*&nsTX}`+l#9aJU^v(#KM{ zkjHo9RTbuHOl8Bd@{)L#$`vyO&)9tKuAVF#j=sk5yQhe%wSe-F`^MGr;SP~yN&BzY zW^G0fneC^~zT^zdA4#x;!!o}f2@89xIbu2b)>Q)R?X^kD;HQ``Hq&01>8Yu8euPyP 
zN60F=+fJD*3FkTLIQmw-eUQwDZ3p$$=K{C1R3W{@r_?O*=Ad|YXdl001Bi@##wD<6 zJG15Nkp<3_*1N1L+p$tTKmO02F2%SfY#LAGb%PJ&7Dc$fH7ZiLQa2L*zm+Ypg=y-HVfa1IJ#> zL1GT8sm1rd5!+X~gP(*R9>Xoe*Sn>aF*~uf=bvu@wZ{*NFkKLO?|{2XxKz;~fgTjj zIZRO_V7*NFzrR4{N3mSZzIwZ^c>pSTQUV>CkJbWVlRo~>OOEXMOwL!iPG~&X=U@AR ziGAGXaTzzYIbsLl-~#LxTY{dCLu|%HB~-oxxy~n%N`v;UkGqv^Cvf2e8f`;&fQHvX z3VN=L%n+QzYfX{kUWE2a#sU#2{l4T~KiSVTgEgC29Ca!Z@M>H}C-w|X)JSNEuG9?D#g5j3?wK&YXdyO|yWA^i_x)(d*!l5;>mv{9djl_Wx%_@Nh+0>JbKB*fy^{% zTlpC}^)wsO$(RoCLLBnUmuD}!j##$*l07?X%?K6Cz51Z2cHTVefSnqWE1U}Td3~iK zPNhoNi-L!sVY2%AUEM(bimEQ&>Qvkt)Ld!HDavmHXr2&M*r{t!w#3b-egkUa^OV2B zSfUP*CxgR*yjJ6t=5gG}^Lt1!V@~qGN)DrmIP0|s>F67SdE7$zPG~THo?cl#Fly6^ zY<9iipJ-4F2(0iDGx;V~65q&z&)gPlzf_pjA0#-FNiv6UFb_L{cE=RswdRT=TJWTt z66Oj5=W1b163Sf>-P4pEq@jGb1U}{6xD5S>d z2f+#JWVpT?0Dduq59G>41lh4c#QsXmUpEqU$%40@Ol3AmJ2t6_U8Rvr>BQ}0J^FCX z9q5~1uY6)BHFE0m;Q~z^oclhTl!1AV^J0p|WBC*a^S&GdvmlHk)CVa;w>eWf8KFse zw{JxZa+zHV17+} z+e$^&DT#F_?nVsiPo5ZiEP{w29{NdG;%L@#7muv!S*|3JN1$@2xdD&p4^m*aa7lf! z0_=spAqQ)%5Ss?>!T{xEJh2UiXYMA>J1-|J1kMr#{vau|b1~PMmi1h|tmq%~znrQO znDPlgiJFIdE`E=xMU~d5e-Lg6ec-EKIkhCXRhqHnKbsL@c(Yz3hD#5l{J@g9igglq zjmI`2A^Kd`NfvJexx`8TtiT)Ub=_mD$4`D;Z5HKqgFA$ptU`)u_=)DDuDsf=6U&^t z6-zu`9G&C{yA2Lsiyw(-in5Vl%<>0GDoaS9?jYi|b|9WCtuthC0Tgm?6SY9j5(9f) z&Wq;BzS(iaS@xS*d10WQXUU4Hbd;zWH8kRE<+zb~1LQiCnT6yElCNJm3aZTyvX3gF z7&fv~%fn;v!m|XNp)4DHdnPXDPDNd}BNWHZ$Qptn&G=+@ldQ~-dLb7*;v$nuqZV}( zEw&>f;CV(eiXCRi$=H+$fTdPhfLi{m5Axi?@cS5hIQ5euR4AJ3 zfspaWt6uW#5glATTFH<+_F$|xV|P9$0u#h9R&_~&SbP|BjTQcN0wxLTqgOep%?9rO zhNdFxq)TTeV?sm6jA(tk5z{fz@mR~f6=6<`T4RDa+CbGzRzPloJyV3!lakmGQ@d%- zsAj-Jr^hluQ!(K!R@Oh^Q&S|iMnYZw-KiSr&v$Y5)MY}kje2`0VlkEZ`QSOj-Z6?l zvV=@XE%S8fMos{cPwcJ^Q5n73TpDU zjOeJ0){+wL0P?svYhYsk`f1yg>DxD3VKO9!@bhg>6fBfELNPX4IJJie)HEV$Q~6s^ zP6a%$#Blj<)zqANoynN(eZ{D%e))$|1-)>7On=73k;i3=Zv zC4^}|D+=(Tm4v`5A{%oT<^Q0dVD#76_H|eCe@5`fV^>ZPQ*{G|1DY)ZmCc2B zLg&M0sSo*tkUGqvScTZF1&YTAsH5aM8a(LXo>|o{Xv60tC$&&$|b zOg}P5@_9btUWecblYDa%a-!%PrzGO6BuGf!a;NdKl9)JlDH^+ajW@~QnIyd|bWdh3 
zli$JIG=xq3N&2}b!9|&ucQ%=kB#X?myx@V58tFpSd6{gdz~{e}ToD-VQY0{&H#7{{ zzssniwy&N;(de~E7#6tNoMEns#3P%ZoDRjd1@#mdK4Wk#rUo06O4JqE3-O-y#7bcZ zPYaOTTk0DUxGrT<8A1KeOGya2FLRiC<+tcVC2W%LBHFUWl1HC@zJXBmO~AA4aMKri zr=Vap`OBXoU7&NPmQ{VQiMp67R3uKsy$j0J6tol>n4mFOD05W+5UhG^Zdqvbvb=3_ z|5`BBsO4;hzGo@>VGh-|E~+p9cz9x|=`m!IeHPbAPrzAGSpPv|PZft7oLbeXFxoI@ zmOAxq2PJs;p8sGY94OP^BOy@H0=P+!e?C5GM(NGc? z&laqhh2vtxdu2CHDqt{`&47wu`iE~Znn-IER^kA3pk)<;#yKET$XijoyBRB!RjD#d zD_ZC>zh#ntQ18)u=PA{@Ho9R_gRUg8eyTy0&+!^Y_lIqLCWB$|aOj`9VZOuR4rfH~ zur1SxPr4k1MYyc411zxZCwiqW>l9YtEC^zr3{YL_4nUK0%E+1T{^+rW`7AkX>IMkP zN2v{8dknBmNwd(YBYS8qu0mghIKneEk<|K2(QLk;)}lX<qTteIfFm*)#cxa{6|ZDMtyCR-vJ>DX~mKWAK@H$_VxZPGggfL#$oUmEufSozl@Nsi&AR}kx)PV5G?SmeIH_q%O&#l?=C^-$d0+kRp&8DQgOH#VKyQLfH5)hH@?(Pl+r33_| zJ2oLAAYB4dBBwiv2boZwDi)C7EQVILL?7rPzs0b6_~GT?{^`o`0shEw`GYFFkhek?HCX zNZ{>qum3!?JqfCC(tM3$-!|rOExI*@Kw(8;4UDiuKLwWxVS(4jZ`=BIP&= z4)IXJ2@ZawkHjE1p)`-E6Zs-RBi>T|Xj#;*4t=mYBADsbVM~WH-Zjy|wvwrVDS|5iM-k;jKO5I5D3_K@(i_Z{m}^=M_uNNNiN%g_d!oYU&~{!3rr6SMbV9fwj> zkRySQLcY}MX&rgyd-m`5jBKE;a~@J{rDXiZoagK%G>1P3N}n{BEZpj~at4q=x>Gy~HhDF9Ih!5D?d&2=IH5N=^21SP zEw&WO<0S2qmTWis50A zSU|M-!~F04GAxt!PCASY)Qt43x829i+q?aIVb$Zbq}0JX-ap*AUMX|w3dzX zwrB(2GFzxZI0k5G1gPB`Cx+rL%mwXbInIZlE$$sf)5L}h-gpdCC`cmnCZPH>B%}7MgvDY?w zoFrw)0?VVuiQI(baM1L{`?)cd{P>IH^G$Xp{jlT`9AQjmb5kDNK*KS-leb{1(LOSX z?`3e7b(IlPn$l9rJhO2ASDoxv3;HIbWS%H=OQ5q$npgd?e6eL6#_`MbDx7M<0n!=T zoazV8>Ki_$)UX@I2z6!K&De{_n(}u-H6jZO?}X!WArm0W&i7WU5TH$ucv}hJb~Zb)-D)x|ov5EB=)-eWT-UorHIhukXF6 zk-nLkqq?^-v6WUFO{0H=xiYg{U2z=S6+%ftb6AOXDX3YNoT{X%Y@A*>dwy;iiAxlDK0bZXSG9|PW-!3SCposVh%v%b$h7BMmhfQnUSy6W@x2b3Gjq%$ z^^8rEUUu4alzHYs=BMG*+-MhbEXQ|vsd%m8vK80mh-5wF!-nL0Xpa21hjCrlF39lI z`Mjv7yS0Rixj2I`bBrHeqUS`qQfJ6#sD_rcZ;YPEsTaaDINNTp;#&mUnO(mTvd-7U za8!^vN?KL+ck}`K@S}pDTW>%?*{KMN&qXi*kHtF-dBnP9S{8{5-)!HPufcw8lu7p)C*-^#S5Cp3z!Qxx&= zW)PCIew zaarX}Ij0j8jdsiMj#pirzudNBHVXo>atbBS4JO;0F`GkM6VKNco~me}Eb)Z7n>9Gq+5IT#V>Jb$;wbU$FmFqA6bbVfxAC>1WM08l~WJJBpt0 
z3C^ctHHwV9jq|5xWO(Kb@i%s7GE%Cz8;Z?_Y+-d=0twNrIKy9aTOHC>#YA4|_K1e` zq5W31t2snI%ev-4z|Yo(G!EGfjvI&05K|yhh}mOJ`*J19gZ{lY>7yuE-dI1M7YgMf z`@|T=kh%n?$`H`@MOTlY*Dj0M)-%nrFAbIUyFymhsc9B*4kq&Z1Fv(>w(XmZxSg=Y*9SXd1i39rZ2`Zrpo+B8K3K*LH?FB$fE#*WaSU>a{ zf@EoU^XqQrM_pZ6Oe^%ELlo0K>W>U|1g^Y?a&a-*!QxCmp=#^QrqzQp>Ee@4RNQrl)y>dw%fV{yybJ*EEPdTKIk!X9VAT)NWRJr$6- zaMuMbeBZxhofQy6Ay&De8%8cUJZkB1i7yulUxbZG7AXjbeAr%e@ifsu88sifEVS6` zS4CqF2teg!O+byngng^Xh6e38$8?2orYo^CV}Wl*ahqG^J}-j`17P3u<65qzp=JtvbLZptHG1zgSsa;L z8`yC?n1z@&?cGb@=|ds2#93wd_Yw=K$?h#L+7Tj@IV;QQO;-t?Q-zStej2T+ud;~3 zW)h%q??uqjZ^L+^qS+5R>-)4bvne=j=n!cS6&|nx>!{pO1o@-raGRW;7nNLy+=8h! zQsGF_htYAR>*xJ2uUZuN|$&iYQ1an z?~6geuckvx6so0b1UD2*Uh+j5t5=_E*q3_ee&I-Y`>}NXd@FQ++5)Npr8i$#*cx9& z-aaPR79UJzUDay+-eeSiuQZEiwsZSEyL`KvQ-36*lUiTU#Jp0AqgA3G8FkA2gUWHI zM+*f~x1v6)UwY&V{S6DiK7Ocp!UXGtkf9RBYU z_7Z3mkAg`dmDf?HNZGyVGV8#M)coJ`FoQ8xcFeqq*^EH5674OI7bKa-}J3oHAIwZ;=tY`&$Z(KkcuKljs3~VHmkq}13b2$1Na)`JT@a{ zPmX=&wG^J->|_D6vt@^}gw|!JIxspVco~(#d&c-tgPf0ZE1*b#_cUuza(nsYgAbTz z)$Nvh|7C|tszLnPjMW_8FCsJM1^d?7uCRCPm&%esW#$31s`lXhHn6=H{kC7$Bt_W) zZUBTDX4jlTr@#sP;S3&n3<4TbT@yA$<`y_D_7%7uixvf*kAgcg+8|J4+xzV=Rn)(# zYzrPVK`u)kgpLk0RdfXD*yUE2x?BwQ#$I54APPwINV?F-boh*tcP4Ic1p6V5F+ac( zdORhxC%O@i-vKN=!CVb2ffaYa@VUKvUI0e0?Ei&T{a0+0bb$h7*@XUh7}-hkju^B0 z$do~z#}G+|QuC}d1_uEOd5M~FluZ$Miq1B`e)1z2EfX*RRltqr15PDz3U9~$l4JcV znui$Eta~Lo-7PD~mMp+Ew9U{f^xKR9drE8VeX4wDTKpP##$bevAAkwByy?@+*4{ix zBIXW#>h1$W6ZtdM|6{{Mg20G_HtWVTqLg*shYd_@|u&<0aW%4w{9@4%6sk1M7ehS-+Bm)CQ z&hG2YF3Scxbr+ZaSoRi(gy3M%MWE07?Kt}5jOyi6D)t+n;EUl;_D2ua~NRo+SAPxU(-ObwSn z$x#&pD(J);&DW^H({wX+KL~19VmSWDKH$$AzNK~@{;{Rz#X}6t{3~|aAHBEDk_)K5 zMXBUW0yIvr{%|4%*#~Qm#$H5VPObs0EzaRrSL8Xy;x)mITI9c%u?==E6MQL57mBfQ z7b53@FPCj^?oblJSmMf|e*?C9@c_ToK=iq5lJFt3aNur^eWpO(HcWx)X}W7Y2txel zjq}5mM#Wo~V>_DLL*#prC?n%R8++6GcG(ktM))FcJjtQ7=xIhmXRRI0#WUI{uns8+}>Y)51g4aeqcAG zA6Oo~ZfuivdjlMuvKt!v(D}ZB&x9YC)UmObX{`=G5&h2Y)dH{tLfhs7)2gn?+`8Tt zwr3ca8LeIe?*@;NNcO_S;a?zFJXrJFI?ZI<|20!0gVy`@<$P;dpZK^^i*-0V4~S*r 
z?_ak9E6+-`w8}5w3+1iBE&w{(WEsnJ3%tGPcmukD56xa%mu2t8rWeqA&=M;QV-iQK5;>-;WIHY#O)DAc z$0rHx63220{y6Q|GbjRw=iQrqhYm5Izfl6`JocevVC{tig4W}HEP-+|V%;Tsc9jHZ zzDFHpAqYNBn{QF6;ukYCWo8kO?-4u)oUoXuVy}=6+X5%|5*T47l#ND4Q`&LFe_~(Y zL(~S5gsYI+7XriX$SKkvV7^7sXD%nmc>5f61w7RVs2cie6bq?Z^8VTz5p{Urhlv^+ z2Y_5niy-Vdg5D|+J{7#ct2k%@BhqwYYnjQ!{SDKloX?R=os=!$gGHiH84CS(NzSPP z_Gw%XXdFFt2To_xfPRCoL7Hl&kZTK19@G1i#_!Jw7t#6`-luT@w4dQhVp@@mD(CsU zTucK>LA|eqf#*t|hKjFrPk`IFvKEoxEBOO(f?E3n9)CREFlMI2@TKp`icy@qvQKc6 zD+`#^k|tUVwb8`gETNq+oTx*Ob1DzlOysiQr=0qJdC(tV_~?ra5Q~9WLhE{$j6N^^ zf@ihA#~XbG=4EUkKmbYmrlgn6^9BowAOXZ}fr9kC$Zih8=RM##4umOYy#^vTy@6K_ z-tD$$RH32DT3Il&6l{rj#;ggrS~8aPh!>T9Tq!7CS}4^u^n%Q_on{UJ8Nwgi>I0Xm zJ>7`7hets~2O?9}1-0X#MKs<(V{PXA5YL#FqAs&7&D@N?C@+Z|O$d!2~ug`F2r~lma5Ufobhem%uRo`DLgrGAklQn;_>g=(Ee`y8XvKISsx{#+W<-se9%LiZ8jc5Mng( zd4wLjW%w>l9gP({@CNEgFCD@vFjhxUKgqQf{53t{3JIXrJg-N+Eqf0q!&?-C=L`+C zh6lMNN%oo+BH9{oKxG0{3j^zwd9Is$BzV8Rz3Z{*0s2Ny(J=&6@?PZ(8K;q>nG&QT zfscW_q;EkLrJRc;#k0t?ho3T2n}Iz31iL^h(JZ)SkjSLb2`*3Nse&Fj7gIc5(YjG2 zgh2$U6bf%10>SFdwkEMlrA%Y5{F8RWH*POS8OdE_wh8PxEaxQcp zBRNE0PB0lA6B3l(7N76Oj0~lqNq4OqAWYVp@uRs#A#=uU8MDqJAx*f{acz5VN9L`q zUb30aU=~3Xls?m+%6O=7k^jPEss7cJaJi58J+$<0W@Y?948e`)e4lrw1~(8%OI&!g z<0{KOhDef+4!mRCt~$NL?83P7Xq?1m+iO2HAkK5X)y1R`Tg%WKRoSO>5#)MQl)opO zA>Wv|d@a}EF)s+x=xHZn*vzm)>#C%9Nz(*YPx;LVC$Rss7K<^H*}i*5%KjOO2yt>) z>jK5C+w*L?#flNtz?Vcj*L$auGB>=BjV7WH(CC@0Tnw*7RQDv?UUp`4|~jH|$2<->cR9FTYQ zaA9lSi4v>Orm_@C8t+DtI5kL@!E}NS-=-sFO+H^GoX1ueJdBKZ=;b~o4X=qyiykxG zp(HRTX9#KHMn{Nye8qCkaBL>tAb-;z5$irBSL8#^p9AdqtOOoz<5rfJf@;7RO?+Bt zkDOvzjt@T;6P+4+^nQ}*f>w7Qj>yksXUK+Bc&Jd5pO!45mPVVmyz(PqJ)Vy(`|gJ! z-rgpZBeVM>>?j@c>5>@orJDy!FQO*G`H1qaOKB3`f#>wD$y%YDcK5-Cg+up1w+EOh zJ3vwbfy;HveIz@}D^AycFAnE<5r)YZc-;jA)bI@f5&ixdfk9WBil(8jI2cG&E@T>! 
zjAd$kcBr_eqb?M|Au32y9kq@zXw}#j!K#Q7(BIKo+hks^N)VqrmzxiP3}!zB<3~Ve zfbL8J=H*RW&us$oEPiaVbicOvXJriD2ec{jE-746ei$@D;VeyY3H^_kGWvx{koqFD z236&-UBHW8#v&Q+jRy{A!8a=vuf( zTL;lnSM4HUJ&zb_IZJD9ug)+hkX0ke&3gTHTA7Adi#t4F<=#l;G3#1Tw^0#GtY{1c}TZ?fIoka%~t;uYG zV-|}?@~ve|-GX=pkP_0q;b%8yN;z4a4JF~lJNhd1CRIS(S_SGD-ViNMBOU(y(3zSP zM%L2zEazmyJPK=zYnfq^Rk)TdrI19ovJLtj30bdPLU3NC9b!#W@Y;Jy3&M$<>OLBI znrt|Y56L{xKBRg=s%=sZ6u~g`?+@+by)lymi1FzXk+o5!e@cFx#hpPQz)}YFh^-jJ zONmJd1UW%1==UBgh+hkdoW~VzKdDCfj+QoKz3XIG+zLrGMb48fnmbW`=B3ljY zy`gPm9fwuTJhv>7sAkrwFuD9E2TtwE?FYLwSTshvsHS%N4W490!h4Dpvr69NnEyCy z-1k#O-C6a?Urn}QGI>ZpIH*oUCvvPL;x=;14HvFdNX(*Po%@TZ3Y5>9)5w9+zG6s^ zy%;r9gEyZp;bj;PTyb8#YK*#DLA^LpuO)PxeGPf9Q>({TH#sujp^M=+AuVJTGtMz( zouX>0-pnF;kb@L3cgu6Bd9KE+5H48bC~mE$zOBB`zCDNAxd2n3GwpsNYad?DfU{3Q zg0pf6p)_l)DAj(Gfc2PUCt8z{wRHCMI(T!_qxrkWu(Zw?r}X(2$!i83DMK5eX@q69 z(40lPM?9^}(6%o3s`vjy{y;RhBP&N#G@UYBy|kN!GNFW?J$f?cZKfe*ez`&Ipr*kl zDauGz$!61MZIk={M6vF~9e8@UG1uh=1Rm}h5MlC>3Zu93DzWfvE7CKZ`vq~FRk9nnj)vtDupW|kW2*`xEQ&TJ05!Lg&3E|V5Bw1 zdGRxOFBQAcysI>q8H%{~d{njIV74sE^=~N_mUJ;%L*jRX=Mtb=uf#-T|85lRd4l<& za<}lwN%I_B>m}0XN8#7yCC0&_`sf36+yGlkc{*a0m`)Bl)t^X{*)EI*K|;*(=Hi9Q z&)a^-`yS?9@2NW+Y0LGYVBe*Z*?Ek~fuH~b2MD(;$g6d`l`Y-IbvL{zFBvhH&_;|N ztUPaqDLeDiiYq?3!$NDUtV|*@vZ+|WBR)qfwF{NLR#eI0oZCYmX8x=rbgHlz@@(yn zEiywKFlv;HVwhGD#QBB#c3?#GeiXv22_@6#6Lv zK*eQ)S7|TTG#GFCBwD9huFE%pE|zPA>L|lQZJnXOM%K6)zA__{hX(U10-kaY@_n0&Ca@2NKQvS zvG1mS!(NJ80qR}ds<@nl#z&8kh`dRAT6LBe1Mag=c;dqMvQeZBpdURsBMQUx?z2E1q_V`cx7GfW+(3 zlAz_^k-<40a6+q%mwmQ(^G4pyzLxG;|6=y;p_iI>I(`knefujijh*1i4O0t%UVe>X zDvTb(M*+it%m8Na5!Co2qf9`BB~s58qF>PITT~IBL)|50m^J{3zFi)qfr(u(6%48yklYOm z>cpxJeLny?(QjQvJVIc6Z9nJ(S^+-p4Xw+E+i4mr1NQDNUy2Sg5IZwHcBZz`Y9m!0W=5s z)63b~P`{=Lp7DC{=?mw=z-1BiCg$Y=(8hRW4J6QDl;HgX&~le&5dY?UjLpFOsSiMP z?Eob8?DmuS%e@9C(2NG9#s1u=AvQ1btrrPL_WlQeSj|j1Ew2z7JbbHD-lCbid6jA6 zs>M79?>cgB2IYSll?!;hmYR(43FFo`pj?IbPyvHXihbf}KuZWx-Rcg-#QnK4|GCPF zx$!JG4&ixk3huhV=4n5UhKLM3M$SIe1|UhX-XKxw3wEGubuFP~K^;X31MQFz2zb+= 
zjqlg@kH1p;3+n-AQZN9XanyT2qLACANY>HLIA-YSzZK*m6o1u+&cQ&p%zxv#Jg!`i>fyDPKAr+GZSmj+g`l6BK6SLRVr{-@p&+rA}Rg>7^3Pa|A3z9 zlUTR(d94CGqz1o&&}87#hYPvw>h$4w?dT2kGiFa1@|ivun{0(-YJ)cvaSIv-tH{6g z9>W(A$_=a5_V1Y`fU)`P0M$G6Z7K_m&kPa5>Fo--Ubb~QVer|#K|1r+eJ0UfQYtVP zfW9i?wv?O9V}&|ong{uZdP9sh(})=eGrk>x$-xTL1wM;LPFi+H`gG^ff}}pd9L~t( z&@Vb;Xg4D=WX_FJb@~lNCeBU3<=ZC>a5TVabc6~tCWeRtyKscJf}r`7Au5>hDy2lP z>26YRF<50es+`{B(xuf?EGpL|!rs;AH!Fau+Mfn9S1E;B3TM)WYBV;e5F7yRhyr<9 z_CfR=SUz$rw;Nu7CdjI=>K06l{h+O9bMQN&0|Jf1HgcCqLkdoEr@$msl9iTEJ}Kd{ zca0nN!KIC&2*MnG&jSKQD@_K|uM2u^kKI5K_mGPgG5Xe4ST`@4_{s}-qvmoLSCfIc z<=VTtM?Y`BE6Dg60w3&Iku@-Z7RL6x-RfF5G_pN?&82Ns5%*Jhw=KJmO7sj^>fhl2 zsM{KO&H+HD!3^Hx{+D1x=KvOs9W>n=Y-N1AL06G&u6bro-ysiqeza}ktH1`JHOiOq z2A2o*qQ?QypI{8=l#Jxf?z3lz0T>`tX@yM>P(M!a&8aHme;be!kWVC!DU_>J*H;!R z0=ct;p@!%ga+J~;93g^798tvSKH@?Bb^urD#4&S-tLQ1M3jYUdk&p@a26+*_lcjm3 zU{m8*VkcO+(guxSepCTv4Ax!WC?z4f&vl-W|k#HEAdV=rTv)7D_3_^_V1?{2n&BGB>HP&@0bi-iX!*yW0p;9m8f|GJTC{r%q;7?PJ2gztT>q|k>arj z?lgcA2iT@HQ&4ay;|;@}%MxHZ#T_1gaq6N{YkTzkwOu6*_Ra=}MM_P9kuw$@>R!$g zIr4qPEXWlNDRXVQvO`uT0h{wa@8>X;1Z#=aP>>b@h7^kP1@M%o1W>}k_wq{$$xSFTcPPVs z+v5pt21vN00o}=avr}gydi@_fCn*_R72+bhQdE;8yGgdQe0_oLvb}VdTa2N`5 zMe{ajCHiHNa4Q^%TT76l6Aa(F)A&-`6ytlS<30zh5|hxjvooIXizr#vl#x58GT)cI z>of*Ap;KvSy+Qym-WSp~kk?ZFB%k}LPHEUuwR}z|4Q6sx=3#;R*~9u`74)2r$#dCo zV+tzfOSNWfV@qtIVZ%OE6{mR@HL{vdw~B% z^PZ~fJOlsR2Yjw)lGA918#5K~UX_NQa5X~RMb+F7r#biWX!Cv7nYz|eOd>-IgxXh4 z*kb!2K#PO90V8`h%$IbD_OxnCpm%+h!CL%)hE=sNimw5ov26h?o&##$^ZXX_CvxLSA`y7VHmaqy(=gy^ge+S8svYAwQGIPI{IDr`8 zECsg*?RmzZT^+sWUEs&{Si;KU{FX?awFMU&S6?Kb%YE3E0 z!3S^%Z4R@c)WskPX*2H$f|~IOQApNoyfyRAt*!hWdfQi zH*)lxr5DMpm^0BQF#Fy9fx^2-DxGLiws^<_9;`a#)bI%-w5AzNFS_;te@4%YrdE5- z+WKEaJ~n{GWh5zJ%ax>KI0ZQF>?|9Sa;{K?e<&31KqLM@T>Tlw~< zY(SC&>R_xAbOea}Vj{?o&(D6Tyg41>b`e(L^=~7+U%oeJKbgzFg=Kz|U_qb;Oc%QR z6Rabbcrq6mR>9a}(E2aN6Q6bnm}Ncq;QWfCX#kJBu;4MzjZlD7VG2`0pqoFAbJmueTkrI}1!mOU-l%O% z-U-dJ*qww&!o@pp^vr69b3lfX^CaUPYqL!pgByx7n0G;uU}0G{^|zbNEZ57eP%$F- z5t%f&mmJ?bU!xg}vor(jIAj(Vvj>2O`3Kd%Z3e2X( 
zoex{&bzW)&ND6wx9xys!c>8D^vL*y%OL{vXw3*rE*e;qsd%9XGW|`I+#pa6UY7FE% z;y+GhjzC$cBP~h^lyN}6mEl)`fP{j(e+d+8E+87Q)i)wytpaux&Y-wCiGXqs2b=@j ztj+WU74t4Pmq1$G3YdOa4ye7{z%&a=A}O)B=>YDW6m}pxLX0}2vWWDs!m}2 z;XYW{qKf~Q9*lp)Y6o0IJuFx)8PIDn)Axo7m6111Fs2bx9aZpCo0QjLb@m!okCV;v z5x>%Z=R&dr0prop-cTQ-blj=M$2NMiwFd~LSQtE^CSSbSE_O(k--*#?0A0U$T{C$+ zSKD*>-n0Z!OY%`(h&47VEsqSOt1YB=fuOB7tcpe#Vo3G$QmehBr4bXiUt z+S@3NL#icRnwwSzj^u-l^A zpgoq7UI|Js0c)x@fD?#Kp-F9SyU{)_x7tt1*ku9oM7+LG_;{NDY<$_G5G2&pq((g9 zTv>)o{(wZ16a}YH1^Mh^w9vxn=omik`^*Z4h_Mb?pn%)LMPL@z&z2M_MIXcP&=|3Y zXv`zd-o5uwQXENvBR|>h9gX7#+~DGXD+k#NYA}c*`K{l@OsM0>`S@rLNFm+j@B`#K zSa+gnbAvWxm|)#{JjEm2&DTIjxvns_%sosVhR>qVEX|I`t#Byx06y8qWfp;dpt(F? z0JWlljbjHwHqXtNWL4~rNb&>D)nv~iJ91{bV^9qaI1bYu(odEcj17%xQNH{=Toy}# z#sVkYBUkTbhT;SjS-c995X7j(iJa*-U)Mx1HSsl>Y1i+n5FoTtj|TImMk}Ik@eR=j z;`K_WqqdRr5@g!q!KHiSPmUONELb9#k27ksv=dKyeg)M^0L@s1v1J$;3PhDh5>0x9 zIv~P?MS=Wr1MNIRtQAsLD?iP*>hvJbz;>87PSL(>#}1#XY(k*%nATC!RymW?8TCu{ zAF@KFII?rqx|XuhQX`!63}`u z_O%Wze9@Q;iW;mt5GD0B0uxy;^!3zDx-Y1avsU6XI}lRVP#T;*%5`h|tA7SHskqra(za1n#rNb8u)88x3F3!3G=%kh9$xDgGdKN|zy4W97o z&*!U8)-3SsGMsdMjyI%lW@^PQvBH%Aw09meX|yK6nt5gi18wBwql3Zmi%$x6<<3Xiz-V8)6-vrhFFu(n2)o_QuqO#+WtwV^v_5s!OIl)J!@ zK>$!gC!TfFqx{p!pDm4hE*|A6#JCR8Ov~^yhw3r>Jl(Ei3jfsw%1==OlQ->tA*6Dt za(x<>i0`d{K1&apii_s5$i}D7Xzw-+2P|H^-p!?7;)~J^XOn&YHVic_j4NDGHXT3Y z!H5_p`3o&;5tHVYwEUmAuDez+I`i^Hq-WVvt|KRfZU^nGRF4Eh9;|)V&l1 zT$TwAP(9E&>^cs9rY32b=|A+4n?kdq!sq6x|KXn|X3`7E9Q~?Kt1{%Z;R4{Z5e1Q6 zOD9RLDR8Kc)2JQtnkHz66A$pO(hgJ0B+K`$<1+wM%@<7tPlfX}eGwQ~sLNunk_LyR zZo0tSH5hOF$f!9;E643Kr)kXCF$t2&)PV^cf(P;rIM)h(O`dScKkRJ6E#EAuevaF4 zT87*SQ!JOM410blFB*G}BxTov1dkc^hjeZ7Uuw!E4C6_~7Phd*Ri;(q#EnkQmXXzp)!mlE=$#i2fG zXa!)Dy{?_dk(qDhO7Td6z8c#ul@&7C4m72>bg)>U^FwZf!x-o2PS~G*G&k$!PCne1 z75*&$V4p|WQsGI&KIKbjiby_s`tw6>#_pI7sHt{s%!u1;>0R;nHUgg8BO|g_?En(d z;)Dcg^<9Wo9j@a*p5(>$t$#v6a^NpqC zXrxok0L4l?YqYmkN;TDbnS{K$cu7ugjEd|VACSJfB~Qml$ue$*dt%WRFr6Hgr>XZ* zLL8iA$qGO+k4fuU651%3E!qVi%9-N(`(dS8{1)EMa!TWl3XWHVhz|~^BqGg*$6)GW 
z>j)i$c0>R$5mKpM-K3oXlk0)J}|nzNWjt*a1r_1@$}*K zVSXWNAp*V=gB8)li91863w16Gm~fv77+XU#-rhQV%A^Cjvrg$RKl!h-C-jN4Qq`O8 zwggo1KDZiwnwx->kny&_q3}U7+LUu77g+_)Z0h%j~ZE@FcF7J1c2b|EE!aRY1~ zmpJ_hB!%hNML@a8tclJ%{y{2V+HkmxJoW^T^rX=8dMv#amLo(*lXSyfYl#Eo4l5N_2 z+6IofrltXEhCeWyaMYt@@N4mN91#&PaKh`7M`wj<1YpZH;l*jA?>P2|DlC;R$zLnB*?1oRnp* zP3iAqSs5Z?sU=h{VBwKB6HwAq)-YibJs|I+s90{TjKyTOmFzJz3)AsBG9>mfXyULw@zOS zk`m3%E487qY*M2Rwr-GA1NR~&sDif6#!+Fv;l3^S?3x)k(RX2O+gK*x`# zR!2`jqu4BsE(qJ#g!mpfszvDW^mybp%VNx%m2i&QXg ziO^>5on%}=r#8K9yft)%J6fc@+MQ~A?Ep>uUEy!M{~p6SV9Wa-*dh$x)B#hNx&1rz^G_rQ z^m%h#N$S8$yZ`=QGHAzRo*oeT@Bh~V(Hgw}`Tu9ux1QV|`%mEHuT*<58UfslX=3p8 z|N3P8L{Zeg)#rcw7z?h7b$9dO-*3mikykc_LZtuozJuVo6MA_({~zd1a0cD<-%zwa zegqv;`iZfx1>gOC{QXIDkEQJW{w4qX@{-(ORSZ7N|4-oTUyo*I#Ou#D`uzinvq4Q` z1zhj{xRT#rGAhU9ne|8A5`$pO3)EvQt~umVt)wObb>+AUjW^(SY4X%H#iFH^TTLhf^z-{ z9$Ntf)(cv_wIgM+I8BL+Y*GL!S00Zg2bk%%pcTr0*|AFO%e)TBxE#AD>0(V*?;10<<#Q$dS25#cJ;xSyJB)1AmZ|uEi-#>h$ty&%cTr+%{ zvIQtq_SAx2z;;2J-wBKyL-N4|!l$#U$BmW4sax;{_re~yEI$M|rY-w8!7AU$81V$? 
z!2~9}Yk)odb(Zvu&k-hD0hbZXD82wV3#bJqbaP7Go`J>$z)f!gZh%VE&;6C%B)oNV z`6>AdCFsw*h7ZX649cPT8-SKQ1rEijuK-#GPf$5nGj_jMQ)+vFi{r4>&z^RGPt_)% zcyiQ4_`qO)4-mke;Br}n0o(}pz3Q3_MnHnM9PIE4<&;|D%K<+BMv19_;AYO)Y9gDv zKN1VxGnGw=0X%v@qYifRK_%PICm%r5+G~J=Vax(e#f3yk2CVbuTkX%M!=#UeYio~l z86MsKR9H9`(EH+Fez@>fZ*sRf;LBtTd(qQMr~NuT?=QXmwUtFdrKKN(>H@qR7K*cK z#lJN=&FbqtdSTMir;JbX*VZ|7v{N@a__5|AwGKM5SbxBPz7=acwVfkFvbf|4-d*MQ zjmq`2bhv!Y0B*b&@EwRez#s^!iZxtscng!XHIG)h_Ev7Ck;gHBG%NDU^z=^Lf$$`) z2y{WcJ~z%Vgm+TR_A4LRUVtxIy9i2T1OJBj9TNL|sM;Pr=-K)L;Lc)JAlXPopv(5) zsrYC|uw)G8H&H%O-Jj9movL8gicL>rVfBf__x}z!ES<4R-5^nxZ(1yQ%I8`Y3sJy+ z;pNQFk|@elgrdOC8`4N4fQ#!6+~egT<9hoJwfvzuP;xx=-2&(&maKAd!|*$fIsAc1 zuHAS1I>uoic-1NA%14)eiCwwBHWn}ySW_LUIk?{wZ}jQ&mddm592_JUb4k`$E4HPv zx;^XpG0uV3e`W!4$g#fyh<@Gi1c`F-31mWWE52oioFv-TX=lxlfQ4R4Z-&$`n?-GD z3Q&3Y#%N^258m6P0cg3-JAzQ3njrK#?z&8vTN&8K)^#1k?*J-HIX{BO@hk9@T>(nz zej%&`av>g_!8WP&6a`8lARqD(8E1dhoA3xwR%l^0bZ_Zr!|Gp-v7(RW2p6*bA^?qn zQ79?16IqK|*dJXPwJC&(0^5!(ZUa{$y7+I_STBP&7lBcAJU29JAjpnqOCM?2etX_> zv*t`WiPFcf{CrPogD>zkEHLS2WgO(_oero#og3%_DwJpXzZ zgusFBshDqx*rImFC<@EyrVca*fwEOSm+Bmg%!{_jd$|m#19Vxrp1tn|`=qow^^}36 z3~NKy(Iyx8mgATA|IBrWo_qv~_Z&u4gz2-0Jv+DM}bL2 zJvOm0lXAWZcrnGt3j=)89DMD4`}88q6Y*JVo(JJok?LYoPDQ;ZPtL4WE-uxvqw>Xg zIp8P#NOz>itz~I#j)1z~j=j1aOi%vlwnC`bPo`rF${x@<=K(z07X+kGM9ATwK8U3zVN0*Q_EewM7V!4k4AU;~60*IPdsgpz zy-(ji`~np#-V3Y$ptVjAGRs_{lWk;Q)c z$2Q?ogRWDV!CkWtLnV*&ty9cPerYBO(y{0;_|`>u(gR)nWm?AjwQ~?$44xq|F*0wl z%A(Had}LgkatD0TDA|U@ymHnhd3Neqzt#O94oCHh{Iitz%ktUUAG@FO`fO$1>U+X~ zfA$D{h`c299_os$IFndJDS?I>lP!=m3-ZUvm$?C`+q`2KnS;eE6tltRL0@r}oDY{L zweCu7!)cOWMZUKbGigNUf?qhXZD6Y1oRz4BIJ9#C^oe2By_bBuNY7*4MjC5&4Z0NU z`&1{J?#6JVBkgm`Y#vBLQH8ikxG)Y9{HVIlVw!vvUMkw@%bMaj?*U~oD)YAa&^9ju zb1FkpeCroLe!Rp`5(<1tZU36`O)PR=t@gwCyKST%Lwvl6*RS_WmYk z8GrK$))tIG3cLFF>={p4zqj}<+EZ)hGPmUUS2|ht288slml}pUBF;9XIS0^`&U zvP%|JD|Z@GSnt;-Uf;S^kMZo=uRkBzzb`?JzwuphbM6@)uJ7nGzfs}fxdJ8*Lo4qt zUZ}cQZt>pw50{)%WjEgSy9BonCF%C#Y1_}kV4Xip4NX~#qmL(nprv5`Zir?g?o;tm 
zQ<55KifCKaoiQzTwYdt$HPFUO)}SZ2EOg;b!!XmY3T>jFPU_T*V3xLuzKavWL9+FO zV}p>rw^TV762YLuhHdi}j^=MqpEq?(8G7z&9df~F5+XbaF{`J-CzSp~yx2%`Ao{utG(z0+!$YwB7tmXbgdl<}PI)m(;rO6|I zS<0myiEoN9&lCosSZFKOU$81F+meFBSh2!0aYGLx8tIJ%r;iUBpj>MAnI@)q?^+c2 zwg6E?4FztzH(A;mz{xU+b`Rtv!bjlWs-qyKQ!^0bL?!(C$72TtNch`!(>kBXd*rhq zx6-G9n_MUoRbE}Wi!z2sQ+T-(ubicWdBVLDzFclilU~4a^e0#qLt2T^YhU=MO@e9~ zL_erGmhRYvZoG^)Q#>`Gd}F?*W_>Lp&w7&ivd->730>58^4L4`4E@xqNtc}p%5T>HpVUR7e=Wo#R9Iq-=5g zRl?*G1hYFAswHyvOJvc{60`-5nt?ppgZ1PB`a=TIGfBtl7UxO3eR$SG*E66Hd8q9K zQ29ZfJUHc5S>8d}CwpvA-aHxD6x@dkSJ0Q$b_bkW;ZFhXq_*p57_e8kMOk^C8QgI<*i3kZ-v#ujSz@ahU2sv;AZERdTJUVXF;4t>lP*=^0gUm~rmwnJ7e>ZbpMzz=w zb5xdz;6}g7fY3{%VIL$3s|3T48V}?qy6)@Cs%iA{+giem0__EuJw00{Rn^@OA;FII z0D99RHQ8_TPJFOCy4F0##cC>nu$1%$tF?8^H<41!=8|klHXmVLz@Wa}^~p|DjaYA> z$KI28dW4@$^J@X|E8bPl-D+OJx}_&wre>$3@y4$iYt5e_t0{|W%zgW`IKOuJ@9h(b zCj7LHzh{3v-?eGYFg5dob@0P}^P^cjT4Vo(O4q#oYIFXF{`(o1AYpO8rGMt}1o$vM zwu>$8$SYBbzEU>KY=~w3HUFW}aAamBpCy985Q4CWLo#IS0aY*7D z@TfAvjjoC{y=hmG%(4w#X(Gtb-h0TLrWQZeNVkd0wTR)KS>WQlG-B|Le5~v*(C5KY z&+~nB$Er}Y$k8s*AUxXqBAeBZTF)D8Fc7we@MN7%?q>y@ z*Vkx2Un44SVLm5hum&k*N9^rm0wcPM{8rCxAb669pPh8M@B}v-!N{++G)s3(wuF#Uaarn4XSC6HG_ncMkm-7JL2_SQx8Od^mg&#niA&DcBv6s9ZKaWAknIlZ9wCT>24buXB&Pq$iMNg z1mH9qASM`)$1nJ0u-G#kW3MLipw`E&e6~>bDVa>qqHw(S%U=X`psICta{B_q)4j=B zvu0=Xk|-RosPLq2#>AOBl3ibZkvEP@hHgy($_F6VeOnM%FRYdNH8(S$Fsnp!Ku2r> zi_nUc6NR;i5EfN#;$rrSSaGQb*k3A60eN6v<_&Rh87w~_6=7EGw5)CK$1C7X^sW#E z<~Mn90db8vpP&J%gu4r9lzGyoXGd{eW2`sT?ip+VSh0wyWbJ+dSZa9NL)H&3u4NKf z-D7OxfMPSr$_rW{cl-DW_iUN~2!-W46K_u_Ac7uB_Zw+ zl4G7htB7s6ZOMhTxxfHYv2+a6|9sDGJA(lm$=-0EkXnC7H(#wAfQ6ha6^*3;(ulIZ z&W~r8>$Y&jafYbBKdrK*>(~u>9#yW1R4swnD$0vl4!{m28qvA-NYj_qbs%5Sezr_I zDJzaYr>P;h*pL|TLCUS3PFujCo=l%i3yvUdM5NZdP{xRarN*JI_ zv#y-LfO{7M^S^53&{eRHv;Wa8u31$g(pvp^<%MfS4weQcX*EksUNP6RhY|R~Yt2tB}i=temgE0kfAHV&4lz;AKYnsrl_>k6;ly!L&ofRdwv*DsDWUAO9% zib25wFGkvuml20+>az);uTY}G7m9^F4)vaV^cdxQ+wyKX2qXU32fhmUzG5;ifjO>I z6o-Epy%<6b&B)*8tY6?y|6oUL8>o{ZAR>%_q?{ma(9=A88cgNevW{i;NHy~S!;xU= 
z=J#W?7_I|LZWc40K7s6ogC=0fb2E=MX1=@?aFc3O&tWFOkmUw@Tyob^9ETf}T9M*{ zh`3uu9k9bWZ&9V{4t%_?9BE?Ar00NTrqmxaT)An7^ zjBSAu|45oJ%SM;wI0brcw}8NDRv|AWL+HM-R7$yY0MVc+-Pv}??1>v0o+BAhxK$#U z`}_IVq_GTh?3uJ5HkqbBjH*RFttz;9<5?hSHUKR2YXHKOD6#G%{dC>3>COsbL(54I zaj<-)3OiKKCL{y}QObd;yFgSTjpUcXp zMtybw_rd#onZ=q2iLjEWl7}+9m2%@>7YvXLT*{pKHghJ41zDqEZIElZigkAiw;yJ^hoiTaB6|Rt=-i zG0;+J&N~3z+_DH5UT0gqNCaTSi~LLP^rm0#L@0zU!RaqcCnGr=J&rie{lJ-m7F+|I z_t^o(gg*}foy1rR!nZ=(Py8fnfv9Q@sJ->Df_8R{nm9jh2L|G3^t9i#OE<~;*jYOm z8(7YBiLY)8?Lj7^5843kP_?5k5>FbFRaq25LoMAx+ZH;ki)U1Fj4xl-Qb9a@k(u&L zc^7nj*>%8=su4>Ej}XXYI#tN4=kYCHDda|YQ-p!u4$j6y%WPl?2PKTbClpj1!W->PFar(h`{YHUV^szM!#j$u}fmG zZczZ2&vIllpgDQP6E*tg_0lqR`$)vSlr1!Bj8b(_J$B|eJAlko|8&92B39DfbFaS=SKKS@!ayS0Q8bA0>N}I-k(r05u7njv?sf_6PIPS!-ZNb-ec&?I+@hgX6AM_Ba(c`9 zCWR5S+Z_OIFrG|<80ME;*0?65Q>7-5pOJoahr<*D2 zq+yUDOA-X}Kq_QRo#kchs&77#N(=s1_u+508V#Pl>UU~1>mlk8d+K)ge*N73KBA`{AWhRgcpQCJ<-wy_L?DBcZ+)c6U=Np-*t7JQaLQkKCpVX4{--xtFtNVE)q zg!YGRWh0)(!N^9%x^ecZ)*+t|8S%cs1|tUBu3w0(GXXJWksvDrAWoD&v5ff4Gjnx> z^eK}Z^mkKI1!xUVo|s||5d`O^OuZmj!jWaBPuUiyB*t*+bVs^70kHV8p$j~y zQ}LsFJR6mp$z6aOLrnr7X+ZW3Pt4i&c_~HCAG)P^`o{CL7H$1BHO@lSw5?R^Am?kw z_T}D({y1zn%S->-ic4cO?FP@AkAPD}+j% zR01N>I&!XSPw@~3(_4--cM8f!J7*`Jh_n)TX}z}4#^Mg$-GO}HehR3V42r-4Dc80M zqOqr*ts-BmraDUv+BKW!-q9)Lw<&RgIULNWNW%{S7Lw_CfG)Kz<(&b0pD-{W5GsG% zd%*K#RvM^UIW6Zf)x>>Vwy?YJ1rHD+sWOieEF3H>9Qk;?EEBpa9mhp1Sd#dOn3+Z4 z=#Q7)Dk_T4lLPR1&hKJ%71Ev*tE;HlUzkN$9#!I1b~5!+G9DeT z2Wm(DcDZes8>}<_V8+d2lmAZ+EcMkEIAC9O^8&;yza-Nk}R?SH(YEN=Wgfw$u{kAG^$@>$xUp- zkZzuWF0sRcOEEHOV7lbbBe<4h<<|p{T5ee|d^Yl8JP-$^+8c2na=tAGp|O1UaD7Io zP3aX=m_X5%GZ#wq)|Dlwd&Jyn+n60y+&JNzz%7JpXE?_fQ|_TwixT23*D97tl<){DbI-aC&aIi~ z>pFC9IYsm*TkUH))9jwTi5x0doc98+9GF%rl#0iyb&qoWGL&eXP*_MiRw==Z-p)}b zT!f<>em@EOIbK|bjH+Gq5se3nhZv&mi(eni{0dR|(Sx7+iEdpd#s)r~%)vNNHai*6 zuptkqWJdTnqdEq>QP+26aLWZfY7`jUnq-Wn#syxNCz}L57Utl+7v!n4$U730u_T$U zzcgE}Oh8|sL2p8mLz0!_LBJNq`oZwwfomV_Ia3zeHAC*zXrv)mUgrnh_m?$$10>V) z*6(MFo{9eaHVsQfaen= 
zszg65VcsV0;KxxFS1eWGm)Etm=uSY9ubs{Rg)7A)@@2+ZP3^u1IzgEr4Rb4Yf&54r zS>uGpjY?UL8lU{3+WR5K@hGq1JeWj~7&%DLL@zJo09u{jwmZH}eS`e;4hfT81vcDf z%wa3}t7(X94u-R^sK_ujSR#7cAGHC-lEmhu<-HOOn9stf9MQck(P4QMvf%SI2(wf4 zQl5pEn>E(uo%U)FVy{YZoSZ!JasV+HK$IJ&SrUjdiFBtp=_VJ;=AV0 z`}Y0!>x%p0^9P-}em_aueom6|&ZKP`_Mbt*6t2us?Bh42`=M1! z0q3@M*ytRET(s_OTZnoLbO9_qD0$!V;8nKvW$?sM1!(hbY+9LoMpUkI6FoTe_So|2&vTHs=|cm2f8=DUx{g=}x4+Zv01ZFeM5ag*PdP{h zJ=5(hzP}$en>PgaJ#>a~EY9i$E*sF+3K>LJ_H{>xrcbFJvp(@+aXJ#GjGweOU5hR; z9jVkG@z+s*jr5*wvWYLw)%C>ALzy@tYZ}=i&df3N!_D<3AU;e-kzP`IRZ}Ki&QP6>wCs z&nJdoZ<~MZ`rplL&SD?2@d5MSzvurx1i0M)dscGKec(0*3e0K#t9$=!R?)q>{of7! zZvHc Date: Thu, 2 Apr 2026 23:56:29 -0700 Subject: [PATCH 09/43] ETP+CG: fix '--overlap-grad-reduce --overlap-param-gather' --- .../module/extended_tensor_parallelism.py | 46 ++++++------------- .../pytorch/module/grouped_linear.py | 2 +- .../pytorch/module/layernorm_linear.py | 2 +- transformer_engine/pytorch/module/linear.py | 2 +- 4 files changed, 18 insertions(+), 34 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 0cc1979524..8969ccfa6d 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -226,9 +226,6 @@ def __init__(self, x, *args, **kwargs): self.rs_state = ETPWeightState.NONE self.wgrad_rs = None self._wgrad_rs_handle = None - self.fuse_wgrad_accumulation = False - self._grad_accum_node = None - self._grad_accum_hook = None self.rs_event = torch.cuda.Event(external=True) self._rs_ticket = None # Padding @@ -600,16 +597,12 @@ def get_wgrad_tensor(self): requires_grad=False, ) - def register_grad_accum_hook(self, grad_accum_node, hook): - self._grad_accum_node = grad_accum_node - self._grad_accum_hook = hook - @staticmethod - def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): - """Post-RS per-param processing: strip padding, 
accumulate, call hook. + def _finalize_wgrad(param, wgrad_rs): + """Post-RS per-param processing: strip padding, accumulate into main_grad. - Returns None for fused (grad already accumulated into main_grad), - or the stripped wgrad for unfused (to be returned to autograd). + Accumulates the reduce-scattered wgrad into main_grad and returns + a dummy zero grad to autograd (DDP backward post hook is not used for ETP params). """ param._set_rs_state(ETPWeightState.NONE) @@ -618,19 +611,13 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - # 2. Accumulate - if fuse_wgrad_accumulation: - param.main_grad.add_(wgrad_rs) - if hasattr(param, "grad_added_to_main_grad"): - param.grad_added_to_main_grad = True - dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - - # 3. Post hook - if param._grad_accum_hook is not None: - param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs - param._grad_accum_hook(param) + # 2. Accumulation: accumulate wgrad into main_grad + param.main_grad.add_(wgrad_rs) + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + return dummy_grad - return None if fuse_wgrad_accumulation else wgrad_rs def _wait_reduce_scatter(self): # assert self._wgrad_rs_handle is not None or is_graph_capturing() @@ -695,7 +682,7 @@ def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): return outputs, cm if async_op else None - def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): + def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. Accepts a single tensor (non-routed) or list of tensors (routed experts). 
@@ -710,15 +697,13 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) - self.fuse_wgrad_accumulation = fuse_wgrad_accumulation _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) - for p, g in zip(weights, sharded)] + result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] ret = result if batched else result[0] # Wait for last reduce scatter if it was async @@ -728,17 +713,16 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): self.next_w.rs_event.wait() cache = get_global_ETP_cache() - fuse_wgrad_accumulation = self.next_w._weights[0].fuse_wgrad_accumulation for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket), fuse_wgrad_accumulation) + self._finalize_wgrad(w, cache.get(w._rs_ticket)) cache.release(w._rs_ticket) return ret - def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation, nvtx_label=None): + def batched_wgrad_reduce_scatter(self, wgrad_list, nvtx_label=None): """Batched version of wgrad_reduce_scatter.""" assert self.is_routed_expert and self.weight_list is not None - return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation, nvtx_label=nvtx_label) + return self.wgrad_reduce_scatter(wgrad_list, nvtx_label=nvtx_label) def __torch_function__(self, func, types, args=(), kwargs=None): if kwargs is None: diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index fe81196f4a..d11947be0e 
100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -551,7 +551,7 @@ def handle_custom_ddp_from_mcore(weight, wgrad): return wgrad if ctx.etp_size > 1: - wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list) elif ctx.fuse_wgrad_accumulation: wgrad_list = [ handle_custom_ddp_from_mcore(weight, wgrad) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 824030c3d0..6f9ac58460 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -938,7 +938,7 @@ def wgrad_gemm( wgrad, grad_bias_ = wgrad_gemm(ln_out_total, grad_output) if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + wgrad = origin_weight.wgrad_reduce_scatter(wgrad) # Update grad bias if needed if grad_bias is None: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 4c4789461c..145c253cdc 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -963,7 +963,7 @@ def wgrad_gemm( dgrad_work = None if ctx.etp_size > 1: - wgrad = weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + wgrad = weight.wgrad_reduce_scatter(wgrad) if ctx.requires_wgrad: # Handle custom DDP from mcore. if ( From f4b5a5e57c2a99bc5aa7ed0d93e8c374ba14436a Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 3 Apr 2026 08:16:00 -0700 Subject: [PATCH 10/43] ETP padding: fix stripping for rowwise_scale_inv and columnwise_scale_inv. 
--- .../module/extended_tensor_parallelism.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 8969ccfa6d..83b6dc98c0 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -17,7 +17,8 @@ ) from ..quantized_tensor import QuantizedTensor from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage -from ..utils import nvtx_range_pop, nvtx_range_push +from ..utils import nvtx_range_pop, nvtx_range_push, round_up_to_nearest_multiple +from ..constants import NVFP4_BLOCK_SCALING_SIZE, MXFP8_BLOCK_SCALING_SIZE from .base import get_dummy_wgrad import transformer_engine_torch as tex @@ -354,6 +355,38 @@ def _strip_padding(self, tensor): metadata["columnwise_data"] = metadata["columnwise_data"][ :-self.pad_length ] + M = self._unsharded_shape[0] + if isinstance(tensor, NVFP4TensorStorage): + # NVFP4 scale_inv shapes (see NVFP4Quantizer.get_scale_shape): + # rowwise_scale_inv: [round_up(M, 128), round_up(ceil(K/16), 4)] + # columnwise_scale_inv: [round_up(K, 128), round_up(ceil(M/16), 4)] + # ETP shards M (dim 0 of the weight), so strip to the unpadded sizes. 
+ if metadata.get("rowwise_scale_inv") is not None: + m_rows = round_up_to_nearest_multiple(M, 128) + metadata["rowwise_scale_inv"] = metadata["rowwise_scale_inv"][:m_rows] + if metadata.get("columnwise_scale_inv") is not None: + m_tiles = round_up_to_nearest_multiple( + math.ceil(M / NVFP4_BLOCK_SCALING_SIZE), 4 + ) + metadata["columnwise_scale_inv"] = ( + metadata["columnwise_scale_inv"][:, :m_tiles].contiguous() + ) + else: + # MXFP8 scale_inv shapes (see MXFP8Quantizer.get_scale_shape): + # rowwise_scale_inv: [round_up(M, 128), round_up(K//32, 4)] + # columnwise_scale_inv: [round_up(M//32, 4), round_up(K, 128)] + # ETP shards M (dim 0 of the weight), so strip to the unpadded sizes. + if metadata.get("rowwise_scale_inv") is not None: + m_rows = round_up_to_nearest_multiple(M, 128) + metadata["rowwise_scale_inv"] = metadata["rowwise_scale_inv"][:m_rows] + if metadata.get("columnwise_scale_inv") is not None: + m_tiles = round_up_to_nearest_multiple( + M // MXFP8_BLOCK_SCALING_SIZE, 4 + ) + metadata["columnwise_scale_inv"] = ( + metadata["columnwise_scale_inv"][:m_tiles] + ) + return type(tensor)(**metadata, shape=self._unsharded_shape, dtype=torch.bfloat16) else: return tensor[:-self.pad_length] From 89d8ae77d041e3ee14b912e0a5cb93a0c1295011 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 3 Apr 2026 08:17:53 -0700 Subject: [PATCH 11/43] ETP: add UTs and doc update. --- docs/README_ETP.md | 30 +- tests/pytorch/distributed/test_etp.py | 1411 +++++++++++++++++++++++++ 2 files changed, 1426 insertions(+), 15 deletions(-) create mode 100644 tests/pytorch/distributed/test_etp.py diff --git a/docs/README_ETP.md b/docs/README_ETP.md index d32321d6df..a0ef835614 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -28,7 +28,8 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. 
| **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | | **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | | **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | -| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. | +| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. And kernels on sidestreams are no longer required to synchronize at graph breaks + | | **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | ### Implementation Mechanisms @@ -40,7 +41,7 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | | **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | | **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | -| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; padding stripped and grad accumulated in `_finalize_wgrad()` | +| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; `_finalize_wgrad()` resets `rs_state`, strips padding, and accumulates the result into 
`param.main_grad`, returning a dummy-zero grad to autograd | --- @@ -129,7 +130,8 @@ BACKWARD (wgrad path) └─ _reduce_scatter pads: [F, K] → [padded_F, K] (re-pads before RS so chunks are equal) └─ reduce-scatter → [shard_size, K] per rank └─ _finalize_wgrad → _strip_padding → [real_rows, K] - └─ stored as param.grad (matches local shard shape) + └─ accumulated into param.main_grad (matches local shard shape) + └─ dummy zero grad returned to autograd ``` #### Wrapping call @@ -159,7 +161,7 @@ NONE ───────────────────────── The `DATA_READY_SYNC` state is used for on-demand synchronous gathers (cold start or when prefetch is disabled). `DATA_READY` is used after an async gather completes via `handle.wait()`. -Invalid transitions are guarded by `_set_state()` / `_set_rs_state()`. +Transition validation is implemented but currently commented out in `_set_state()` / `_set_rs_state()` (guarded by `ETP_CONFIG.check_param_states`); both methods unconditionally set the new state in the current implementation. ### Class Diagram @@ -204,8 +206,6 @@ classDiagram +Event rs_event +ETPShardHandle _prefetch_handle +ETPShardHandle _wgrad_rs_handle - +callable _grad_accum_node - +callable _grad_accum_hook +Quantizer _quantizer +bool did_cast_to_low_precision +QuantizedTensor quantized @@ -219,7 +219,6 @@ classDiagram +ProcessGroup group +List weight_list +Tensor wgrad_rs - +bool fuse_wgrad_accumulation +str _debug_name +setup(weight_quantizer) +_weights() List @@ -240,10 +239,9 @@ classDiagram +all_gather_and_prefetch(fwd, ...) 
Tensor +all_gather_and_prefetch_bwd() Tensor +get_wgrad_tensor() Tensor - +register_grad_accum_hook(node, hook) - +_finalize_wgrad(param, wgrad_rs, fuse) [staticmethod] + +_finalize_wgrad(param, wgrad_rs) [staticmethod] +_reduce_scatter(wgrads, async_op) tuple - +wgrad_reduce_scatter(wgrad, fuse) + +wgrad_reduce_scatter(wgrad) } %% ── Async all-gather handles ───────────────────────────────────────────── @@ -479,10 +477,10 @@ Step by step for layer `i` backward: 1. **`all_gather_and_prefetch_bwd()`**: Gather `W_i` for the dgrad GEMM; simultaneously async-prefetch `W_i-1` (the `prev_w`) for the next backward step. Uses `skip_weight_cast=True` — no re-quantization needed since scales are already valid from the forward pass. 2. **dgrad GEMM**: Compute `dX = dY × W_i` using the gathered weight. 3. **wgrad GEMM**: Compute `dW = X^T × dY` using the saved input activation. -4. **`wgrad_reduce_scatter(wgrad, fuse_wgrad_accumulation)`**: +4. **`wgrad_reduce_scatter(wgrad)`**: - **Non-last layer** (`prev_w is not None`): Launch async reduce-scatter; store `ETPShardHandle` in `self._wgrad_rs_handle`. Return `None` to backward (gradient deferred). - - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. -5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — resets `rs_state` to `NONE`, strips padding (if last rank is padded), accumulates into `param.main_grad`, returns a dummy-zero grad tensor to autograd. +5. 
**Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to reset `rs_state`, strip padding, and accumulate into `main_grad`. The RS buffer is returned to the pool via `cache.release()`. Here is an example of ETP schedule diagram for Hybried Nemotron6 in bf16 as an example (ETP+EP with partial CGs): @@ -576,8 +574,10 @@ A buffer lives in **exactly one** place at a time: ``` reserve() → slot created, buf=None (no allocation yet) -get(ticket) → buf allocated lazily from pool or fresh; stored in slot -release(ticket) → buf returned to pool; slot.buf set to None +get(ticket) → buf allocated lazily from pool or fresh; stored in slot.buf (idempotent) +release(ticket) → buf appended to pool (slot.buf stays set; production code calls release + only after get() has emptied the pool for that key, so the duplicate-check + in release() is never triggered) clear() → all slot.buf = None, pool cleared (tickets stay valid; next get() re-allocates) ``` diff --git a/tests/pytorch/distributed/test_etp.py b/tests/pytorch/distributed/test_etp.py new file mode 100644 index 0000000000..39afe69b00 --- /dev/null +++ b/tests/pytorch/distributed/test_etp.py @@ -0,0 +1,1411 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""Unit tests for Extended Tensor Parallelism (ETP). + +Test groups +----------- +1. TestETPWeightState – state-machine transitions (single-process) +2. TestETPWeightCache – coat-check buffer pool (single-process) +3. TestETPSharding – wrap_module_params_etp: shard content + padding (multi-GPU) +4. TestWrapModuleParams – wrap_module_params_etp: param replacement + weight_list (multi-GPU) +5. TestLinearETP – Linear forward/backward numerical correctness (multi-GPU) +6. 
TestLayerNormLinearETP – LayerNormLinear forward/backward smoke test (multi-GPU) +7. TestGroupedLinearETP – GroupedLinear forward/backward smoke test (multi-GPU) +8. TestETPPrefetchChain – linked-list next_w/prev_w wiring (multi-GPU) +9. TestETPWgradRS – wgrad reduce-scatter shape + multi-layer deferred path (multi-GPU) +10. TestETPMicrobatches – output consistency across microbatches (multi-GPU) +11. TestNVFP4LinearETP – Linear + NVFP4 recipe: quantized shard setup, fwd/bwd (multi-GPU) +12. TestNVFP4GroupedLinearETP – GroupedLinear + NVFP4 recipe: coalesced AG + fwd/bwd (multi-GPU) +13. TestMXFP8LinearETP – Linear + MXFP8 recipe: quantized shard setup, fwd/bwd, padding (multi-GPU) +14. TestETPConfig – update_config: valid/invalid keys (single-process) +15. TestETPShardedParamProperties – shape computations, get_padded_shard, _strip_padding (single-process) +16. TestETPCacheKey – _get_cache_key: expert vs non-expert, fwd vs bwd (single-process) +17. TestETPCacheRelease – reserve/get/release pool semantics (single-process) +18. TestTagETPParamsWithNames – _debug_name population on ETPShardedParam (single-process) +19. TestFinalizeWgrad – _finalize_wgrad: accumulate, strip padding, rs_state reset (single-process) +20. TestETPGroupSizeOne – wrap_module_params_etp no-op when etp_group.size()==1 (single-process) +21. TestETPPrefetchDisabled – weight_prefetch=False: single-pass forward still works (multi-GPU) +22. TestFuseWgradAccumulation – fuse_wgrad_accumulation=True: wgrad→main_grad (multi-GPU) +23. TestETPGradAccumHook – main_grad updated after reduce-scatter backward (multi-GPU) + +Multi-GPU tests use torch.multiprocessing.spawn and are skipped when fewer +than the required CUDA devices are available. 
+""" + +import os +import socket + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn + +import transformer_engine.pytorch as te +import transformer_engine.pytorch.module.extended_tensor_parallelism as etp_module +from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + ETPShardedParam, + ETPWeightCache, + ETPWeightState, + wrap_module_params_etp, +) +from transformer_engine.pytorch import fp8_autocast, is_nvfp4_available, is_mxfp8_available +from transformer_engine.pytorch.quantization import FP8GlobalStateManager +from transformer_engine.pytorch.quantized_tensor import QuantizedTensor +from transformer_engine.common.recipe import NVFP4BlockScaling + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(autouse=True) +def reset_fp8_state(): + yield + FP8GlobalStateManager.reset() + + +@pytest.fixture(autouse=True) +def reset_etp_globals(): + """Reset all ETP mutable class/module-level state between tests.""" + yield + ETPShardedParam._first_weight_flag = True + ETPShardedParam._last_weight = None + ETPShardedParam._pending_rs_weight = None + ETPShardedParam._link_node_count = 0 + ETPShardedParam._link_table_buffer = [] + ETPShardedParam._link_table_flushed = False + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def _dist_init(rank: int, world_size: int, port: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + +def 
_run_distributed(fn, world_size: int, *args) -> None: + """Spawn `world_size` processes each running fn(rank, world_size, port, *args).""" + port = _free_port() + mp.spawn(fn, args=(world_size, port) + args, nprocs=world_size, join=True) + + +def _requires_multi_gpu(n: int = 4): + if torch.cuda.device_count() < n: + pytest.skip(f"Requires at least {n} CUDA devices") + + +def _requires_nvfp4(): + if not is_nvfp4_available(): + pytest.skip("NVFP4 not available (requires compute capability >= 10.0)") + + +# --------------------------------------------------------------------------- +# 1. ETPWeightState – state-machine transition tests +# --------------------------------------------------------------------------- + +class TestETPWeightState: + + @staticmethod + def _param(): + return ETPShardedParam(torch.zeros(4, 4)) + + def test_full_cycle(self): + p = self._param() + assert p.state == ETPWeightState.NONE + p._set_state(ETPWeightState.ASYNC_WAIT) + p._set_state(ETPWeightState.DATA_READY) + p._set_state(ETPWeightState.NONE) + assert p.state == ETPWeightState.NONE + + def test_sync_path_cycle(self): + """NONE → DATA_READY_SYNC → NONE (sync all-gather path).""" + p = self._param() + p._set_state(ETPWeightState.DATA_READY_SYNC) + p._set_state(ETPWeightState.NONE) + assert p.state == ETPWeightState.NONE + + def test_rs_state_full_cycle(self): + """RS state machine: NONE → ASYNC_WAIT → DATA_READY → NONE.""" + p = self._param() + assert p.rs_state == ETPWeightState.NONE + p._set_rs_state(ETPWeightState.ASYNC_WAIT) + p._set_rs_state(ETPWeightState.DATA_READY) + p._set_rs_state(ETPWeightState.NONE) + assert p.rs_state == ETPWeightState.NONE + + +# --------------------------------------------------------------------------- +# 2. 
# ---------------------------------------------------------------------------

class TestETPWeightCache:
    """Coat-check buffer pool: reserve() hands out tickets, get() lazily
    allocates, release() returns buffers to a shape-keyed pool."""

    class _FakeGroup:
        # Stand-in for a torch.distributed group; only size()/rank() are read.
        def __init__(self, size=2):
            self._size = size
        def size(self):
            return self._size
        def rank(self):
            return 0

    def _param(self, shape=(8, 4), etp_size=2):
        # Build an ETPShardedParam with just the attributes the cache reads.
        p = ETPShardedParam(torch.zeros(*shape))
        p.group = self._FakeGroup(etp_size)
        p.expert_idx = None
        p.pad_length = 0
        p.is_padded_last_rank = False
        p._quantizer = None
        return p

    def test_reserve_returns_ticket(self):
        cache = ETPWeightCache()
        p = self._param()
        ticket = cache.reserve(p, torch.bfloat16, fwd=True)
        assert isinstance(ticket, int)

    def test_reserve_get_roundtrip(self):
        cache = ETPWeightCache()
        p = self._param()
        ticket = cache.reserve(p, torch.bfloat16, fwd=True)
        buf = cache.get(ticket)
        assert buf is not None
        # get() returns same buf on second call (buf cached in slot)
        buf2 = cache.get(ticket)
        assert buf2 is buf

    def test_buffer_reused_after_release(self):
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        cache.release(t1)
        # Reserve a new ticket, buf should come from pool
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1 is buf2, "Buffer should be reused from pool after release"
        cache.release(t2)

    def test_two_simultaneous_reserves_are_distinct(self):
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1 is not buf2, "Concurrent reserves must get distinct buffers"

    def test_tickets_are_unique(self):
        """Each reserve() call returns a new unique ticket."""
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        assert t1 != t2, "Each reserve() must return a unique ticket"

    def test_invalid_ticket_raises(self):
        cache = ETPWeightCache()
        with pytest.raises(KeyError):
            cache.get(9999)

    def test_different_shapes_use_distinct_pool_slots(self):
        cache = ETPWeightCache()
        p1 = self._param(shape=(8, 4))
        p2 = self._param(shape=(16, 4))
        t1 = cache.reserve(p1, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        t2 = cache.reserve(p2, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1.shape != buf2.shape
        cache.release(t1); cache.release(t2)

    def test_fwd_bwd_tickets_are_distinct(self):
        """fwd=True and fwd=False reserves always receive distinct ticket IDs."""
        cache = ETPWeightCache()
        p = self._param()
        t_fwd = cache.reserve(p, torch.bfloat16, fwd=True)
        t_bwd = cache.reserve(p, torch.bfloat16, fwd=False)
        assert t_fwd != t_bwd


# ---------------------------------------------------------------------------
# 3. ETP weight sharding: shard content and alignment padding
# ---------------------------------------------------------------------------

def _worker_sharding_aligned(rank, world_size, port):
    """Shard a K-aligned weight and check each rank owns its contiguous rows."""
    _dist_init(rank, world_size, port)
    K, M = world_size * 32, 16  # K divisible by 16*world_size → no padding
    full_weight = torch.arange(K * M, dtype=torch.float32).reshape(K, M).cuda()
    dist.broadcast(full_weight, src=0)

    etp_group = dist.new_group(list(range(world_size)))
    mod = nn.Module()
    mod.weight = nn.Parameter(full_weight.clone(), requires_grad=False)
    wrap_module_params_etp(mod, ['weight'], etp_group)
    shard = mod.weight

    rows_per_rank = K // world_size
    assert shard.shape == (rows_per_rank, M), f"rank {rank}: unexpected shape {shard.shape}"
    assert shard.pad_length == 0
    expected = full_weight[rank * rows_per_rank : (rank + 1) * rows_per_rank]
    assert torch.allclose(shard.data, expected), f"rank {rank}: shard content mismatch"
    dist.destroy_process_group()


def _worker_sharding_padding(rank, world_size, port):
    """Shard an unaligned weight; the last rank's shard carries the padding metadata."""
    _dist_init(rank, world_size, port)
    alignment = 16 * world_size
    K = alignment - 1  # deliberately unaligned
    M = 16
    full_weight = torch.ones(K, M, dtype=torch.float32).cuda()
    dist.broadcast(full_weight, src=0)

    etp_group = dist.new_group(list(range(world_size)))
    mod = nn.Module()
    mod.weight = nn.Parameter(full_weight.clone(), requires_grad=False)
    wrap_module_params_etp(mod, ['weight'], etp_group)
    shard = mod.weight

    padded_K = alignment
    rows_per_rank = padded_K // world_size

    if rank == world_size - 1:
        assert shard.is_padded_last_rank
        assert shard.pad_length > 0
        # The shard tensor holds only the real rows; get_padded_shard() appends zero rows.
        padded = shard.get_padded_shard()
        assert padded.shape[0] == rows_per_rank, \
            f"rank {rank}: expected padded shard {rows_per_rank} rows, got {padded.shape[0]}"
        n_real = K - rank * rows_per_rank
        assert torch.all(padded[n_real:] == 0), "Padding rows must be zero"
    else:
        assert not shard.is_padded_last_rank
        assert shard.shape[0] == rows_per_rank, \
            f"rank {rank}: expected {rows_per_rank} rows, got {shard.shape[0]}"

    dist.destroy_process_group()


class TestETPSharding:
    def test_aligned_shard_content(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_sharding_aligned, 4)

    def test_unaligned_shard_padding(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_sharding_padding, 4)


# ---------------------------------------------------------------------------
# 4. wrap_module_params_etp: param replacement and GroupedLinear weight_list
# ---------------------------------------------------------------------------

def _worker_linear_param_replaced(rank, world_size, port):
    """Verify te.Linear built with an etp_group exposes its weight as an ETP shard."""
    _dist_init(rank, world_size, port)
    in_features, out_features = 64, 128
    etp_group = dist.new_group(list(range(world_size)))
    layer = te.Linear(
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=torch.bfloat16,
        device="cuda",
        etp_group=etp_group,
    )
    shard = layer.weight
    expected_shape = (out_features // world_size, in_features)
    assert isinstance(shard, ETPShardedParam), "weight must be ETPShardedParam"
    assert shard.shape == expected_shape, f"unexpected shard shape {shard.shape}"
    assert shard.group is etp_group
    dist.destroy_process_group()


def _worker_grouped_weight_list(rank, world_size, port):
    """Verify te.GroupedLinear's first weight carries the per-expert weight_list."""
    _dist_init(rank, world_size, port)
    num_gemms = 3
    in_features = 32
    out_features = 64
    etp_group = dist.new_group(list(range(world_size)))
    layer = te.GroupedLinear(
        num_gemms=num_gemms,
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=torch.bfloat16,
        device="cuda",
        etp_group=etp_group,
    )
    first_weight = layer.weight0
    assert isinstance(first_weight, ETPShardedParam)
    assert first_weight.weight_list is not None
    assert len(first_weight.weight_list) == num_gemms
    expert_indices = [entry.expert_idx for entry in first_weight.weight_list]
    assert expert_indices == list(range(num_gemms))
    dist.destroy_process_group()


class TestWrapModuleParams:
    """wrap_module_params_etp: param replacement and GroupedLinear weight_list."""

    def test_linear_weight_replaced(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_linear_param_replaced, 4)

    def test_grouped_linear_weight_list(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_grouped_weight_list, 4)


# ---------------------------------------------------------------------------
# 5. Linear forward/backward numerical correctness
# ---------------------------------------------------------------------------

def _worker_linear_correctness(rank, world_size, port):
    """ETP output == (all-gathered weight) @ input, and dX matches."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    batch, in_f, out_f = 16, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )

    # Reconstruct full weight from shards (all-gather)
    shard = layer.weight.data.clone()
    all_shards = [torch.zeros_like(shard) for _ in range(world_size)]
    dist.all_gather(all_shards, shard, group=etp_group)
    full_weight = torch.cat(all_shards, dim=0).float()[:out_f]  # strip any padding

    # Shared input across ranks
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    inp_etp = inp.clone().requires_grad_(True)
    inp_ref = inp.clone().requires_grad_(True)

    # ETP forward
    out_etp = layer(inp_etp, is_first_microbatch=True)

    # Reference forward (fp32 matmul, then cast back to bf16 for comparison)
    out_ref = inp_ref.float() @ full_weight.T
    out_ref = out_ref.to(dtype)

    assert out_etp.shape == out_ref.shape, f"Shape mismatch {out_etp.shape} vs {out_ref.shape}"
    # Loose tolerances: bf16 accumulation order differs between the two paths.
    assert torch.allclose(out_etp.float(), out_ref.float(), atol=0.1, rtol=0.1), (
        f"Output mismatch max_diff={(out_etp.float()-out_ref.float()).abs().max():.4f}"
    )

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")

    # Backward: compare input gradient
    grad_out = torch.randn_like(out_etp)
    dist.broadcast(grad_out, src=0)
    out_etp.backward(grad_out)
    out_ref.backward(grad_out.float())

    assert inp_etp.grad is not None
    assert torch.allclose(inp_etp.grad.float(), inp_ref.grad.float(), atol=0.1, rtol=0.1), (
        f"dX mismatch max_diff={(inp_etp.grad.float()-inp_ref.grad.float()).abs().max():.4f}"
    )
    dist.destroy_process_group()


class TestLinearETP:
    def test_forward_backward_correctness(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_linear_correctness, 4)


# ---------------------------------------------------------------------------
# 6. LayerNormLinear forward/backward smoke test
# ---------------------------------------------------------------------------

def _worker_layernorm_linear(rank, world_size, port):
    """Smoke test: LayerNormLinear with an ETP-sharded weight runs fwd+bwd."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    seq, batch, in_f, out_f = 4, 2, 64, 128
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.LayerNormLinear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    assert isinstance(layer.weight, ETPShardedParam)

    inp = torch.randn(seq, batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    out = layer(inp, is_first_microbatch=True)
    assert out.shape == (seq, batch, out_f), f"unexpected output shape {out.shape}"

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestLayerNormLinearETP:
    def test_forward_backward(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_layernorm_linear, 4)


# ---------------------------------------------------------------------------
# 7. GroupedLinear forward/backward smoke test
# ---------------------------------------------------------------------------

def _worker_grouped_linear(rank, world_size, port, num_gemms):
    """Smoke-test GroupedLinear forward/backward under ETP for `num_gemms` experts."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_features = 32
    out_features = 64
    total_tokens = num_gemms * 4
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.GroupedLinear(
        num_gemms=num_gemms,
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=dtype,
        device="cuda",
        etp_group=etp_group,
    )
    assert isinstance(layer.weight0, ETPShardedParam)

    # Equal token split per expert; fold any rounding remainder into the last split.
    m_splits = [total_tokens // num_gemms for _ in range(num_gemms)]
    m_splits[-1] += total_tokens - sum(m_splits)

    inp = torch.randn(total_tokens, in_features, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    out = layer(inp, m_splits=m_splits, is_first_microbatch=True)
    assert out.shape == (total_tokens, out_features), f"unexpected output shape {out.shape}"

    # _finalize_wgrad accumulates into main_grad; allocate one per expert weight.
    for idx in range(num_gemms):
        weight = getattr(layer, f"weight{idx}")
        weight.main_grad = torch.zeros(weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestGroupedLinearETP:
    """GroupedLinear forward/backward smoke test under ETP."""

    @pytest.mark.parametrize("num_gemms", [2, 4])
    def test_forward_backward(self, num_gemms):
        _requires_multi_gpu(4)
        _run_distributed(_worker_grouped_linear, 4, num_gemms)


# ---------------------------------------------------------------------------
# 8. Prefetch chain: next_w / prev_w wiring after first forward pass
# ---------------------------------------------------------------------------

def _worker_chain_wired(rank, world_size, port):
    """Two ETP layers: after the first forward pass the weights form a doubly
    linked list (next_w/prev_w) used for async prefetch scheduling."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(4, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First forward pass builds the linked list
    l0(inp, is_first_microbatch=True)
    l1(inp, is_first_microbatch=True)

    w0, w1 = l0.weight, l1.weight
    assert w0.next_w is w1, "w0.next_w should point to w1"
    assert w1.prev_w is w0, "w1.prev_w should point back to w0"
    assert w1.next_w is None
    assert w0.prev_w is None
    dist.destroy_process_group()


def _worker_chain_async_prefetch(rank, world_size, port):
    """On the second forward pass, w1 should be in DATA_READY before its forward runs."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(4, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First pass builds chain, second pass uses async prefetch
    for _ in range(2):
        out = l0(inp, is_first_microbatch=True)
        l1(inp, is_first_microbatch=True)
        assert torch.isfinite(out).all(), "Non-finite output on second pass"
    dist.destroy_process_group()


class TestETPPrefetchChain:
    def test_chain_wired_after_first_pass(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_chain_wired, 4)

    def test_async_prefetch_second_pass(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_chain_async_prefetch, 4)


# ---------------------------------------------------------------------------
# 9. Wgrad reduce-scatter: shape and deferred async path
# ---------------------------------------------------------------------------

def _worker_wgrad_shape(rank, world_size, port):
    """After backward, weight.grad shape must match the local shard shape."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
        fuse_wgrad_accumulation=False,
    )
    inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # main_grad is still required: _finalize_wgrad accumulates into it even
    # when fuse_wgrad_accumulation=False.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    layer(inp, is_first_microbatch=True).sum().backward()

    w = layer.weight
    if w.grad is not None:
        assert w.grad.shape == w.shape, \
            f"wgrad shape {w.grad.shape} != shard shape {w.shape}"
    dist.destroy_process_group()


def _worker_multilayer_deferred_rs(rank, world_size, port):
    """Two-layer ETP: async RS deferred for layer0 (non-last), sync for layer1 (last in bwd)."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    l0.weight.main_grad = torch.zeros(l0.weight.shape, dtype=dtype, device="cuda")
    l1.weight.main_grad = torch.zeros(l1.weight.shape, dtype=dtype, device="cuda")

    out = l0(inp, is_first_microbatch=True)
    l1(inp, is_first_microbatch=True)
    out.sum().backward()

    # Both weights' main_grad should have been updated
    for lyr in [l0, l1]:
        w = lyr.weight
        assert w.main_grad is not None, f"No main_grad on {lyr.__class__.__name__}.weight"
    dist.destroy_process_group()


class TestETPWgradRS:
    def test_wgrad_shape_matches_shard(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_wgrad_shape, 4)

    def test_multilayer_deferred_rs(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_multilayer_deferred_rs, 4)


# ---------------------------------------------------------------------------
# 10. Multiple microbatches: output must be consistent when weight unchanged
# ---------------------------------------------------------------------------

def _worker_microbatches(rank, world_size, port):
    """Same input, same weight → identical outputs across microbatches."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    batch, in_f, out_f = 8, 64, 128
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First microbatch
    out1 = layer(inp, is_first_microbatch=True).detach().clone()

    # Second microbatch with same weight (skip_weight_cast=True path)
    out2 = layer(inp, is_first_microbatch=False).detach()

    assert torch.allclose(out1, out2), \
        f"Microbatch outputs differ; max_diff={(out1-out2).abs().max():.6f}"
    dist.destroy_process_group()


class TestETPMicrobatches:
    def test_consistent_across_microbatches(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_microbatches, 4)


# ---------------------------------------------------------------------------
# 11. NVFP4 + ETP: Linear forward/backward, quantized shard setup
# ---------------------------------------------------------------------------

def _worker_nvfp4_linear(rank, world_size, port):
    """Verify that ETP Linear correctly quantizes, all-gathers, and computes with NVFP4."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # batch=32: NVFP4 wgrad GEMM (K=batch) requires K divisible by 32
    batch, in_f, out_f = 32, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # Forward under NVFP4 recipe – triggers setup() and NVFP4 quantization
    recipe = NVFP4BlockScaling()
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out = layer(inp, is_first_microbatch=True)

    # After the first forward pass setup() must have created a quantized shard
    w = layer.weight
    assert w.quantized is not None, "NVFP4 quantized shard must be set after setup()"
    assert isinstance(w.quantized, QuantizedTensor), \
        f"weight.quantized should be QuantizedTensor, got {type(w.quantized)}"

    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 ETP output has non-finite values"

    # Second microbatch reuses cached quantized weight (skip_weight_cast path)
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out2 = layer(inp.detach(), is_first_microbatch=False)
    assert torch.isfinite(out2).all(), "NVFP4 ETP second-microbatch output has non-finite values"

    dist.destroy_process_group()


def _worker_nvfp4_linear_unaligned(rank, world_size, port):
    """Verify NVFP4 ETP when out_features is not aligned to 16*world_size (padding path).

    out_f is chosen to be divisible by 8 (satisfies NVFP4 GEMM alignment) but not by
    16*world_size (so padding is needed). The last ETP rank receives a shard that is
    zero-padded to reach the shard_size boundary. After all-gather, _strip_padding
    removes the padded rows from the gathered weight before the GEMM, so the output
    has the original out_f columns.
    """
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    alignment = 16 * world_size  # 64 for world_size=4
    # Choose out_f divisible by 8 (NVFP4 GEMM constraint) but not by 64 (ETP alignment).
    # With out_f=56: pad_length=8, shard_size=16, last rank gets 8 rows padded to 16.
    out_f = alignment - 8  # 56 for world_size=4
    in_f = 64
    batch = 32
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=NVFP4BlockScaling()):
        out = layer(inp, is_first_microbatch=True)

    # After _strip_padding removes the padded rows, output has out_f (not padded) cols.
    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 ETP (unaligned) output has non-finite values"
    dist.destroy_process_group()


class TestNVFP4LinearETP:
    def test_forward_backward(self):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_linear, 4)

    def test_forward_unaligned_padding(self):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_linear_unaligned, 4)


# ---------------------------------------------------------------------------
# 12. NVFP4 + ETP: GroupedLinear forward/backward (coalesced batched all-gather)
# ---------------------------------------------------------------------------

def _worker_nvfp4_grouped_linear(rank, world_size, port, num_gemms):
    """Verify NVFP4 ETP with GroupedLinear (uses grouped_gather_along_first_dim)."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # NVFP4 split_quantize constraints: in_f % 128 == 0, tokens_per_expert % 64 == 0
    # (Hadamard transform requirement), and K=tokens_per_expert % 32 == 0 for wgrad.
    in_f, out_f, total_tokens = 128, 256, num_gemms * 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.GroupedLinear(
        num_gemms=num_gemms, in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    assert isinstance(layer.weight0, ETPShardedParam)

    m_splits = [total_tokens // num_gemms] * num_gemms
    m_splits[-1] += total_tokens - sum(m_splits)

    inp = torch.randn(total_tokens, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=NVFP4BlockScaling()):
        out = layer(inp, m_splits=m_splits, is_first_microbatch=True)

    assert out.shape == (total_tokens, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 GroupedLinear ETP output has non-finite values"

    # All expert weight shards should be quantized after setup()
    for i in range(num_gemms):
        name = f"weight{i}"
        w = getattr(layer, name)
        assert isinstance(w, ETPShardedParam)
        assert w.quantized is not None, f"{name}.quantized not set after NVFP4 setup()"
        assert isinstance(w.quantized, QuantizedTensor), \
            f"{name}.quantized should be QuantizedTensor, got {type(w.quantized)}"

    # _finalize_wgrad accumulates into main_grad; allocate one per expert weight.
    for i in range(num_gemms):
        w = getattr(layer, f"weight{i}")
        w.main_grad = torch.zeros(w.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestNVFP4GroupedLinearETP:
    @pytest.mark.parametrize("num_gemms", [2, 4])
    def test_forward_backward(self, num_gemms):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_grouped_linear, 4, num_gemms)


# ---------------------------------------------------------------------------
# 13. MXFP8 + ETP: Linear forward/backward, quantized shard setup
# ---------------------------------------------------------------------------

def _worker_mxfp8_linear(rank, world_size, port):
    """Verify that ETP Linear correctly quantizes, all-gathers, and computes with MXFP8."""
    # Imported locally so the module stays importable when MXFP8 is unavailable.
    from transformer_engine.common.recipe import MXFP8BlockScaling
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # batch=32: MXFP8 wgrad GEMM (K=batch) requires K divisible by MXFP8_BLOCK_SCALING_SIZE=32
    batch, in_f, out_f = 32, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # Forward under MXFP8 recipe – triggers setup() and MXFP8 quantization
    recipe = MXFP8BlockScaling()
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out = layer(inp, is_first_microbatch=True)

    # After the first forward pass setup() must have created a quantized shard
    w = layer.weight
    assert w.quantized is not None, "MXFP8 quantized shard must be set after setup()"
    assert isinstance(w.quantized, QuantizedTensor), \
        f"weight.quantized should be QuantizedTensor, got {type(w.quantized)}"

    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "MXFP8 ETP output has non-finite values"

    # Backward should complete without error
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None
    assert inp.grad.shape == inp.shape

    # Second microbatch reuses cached quantized weight (skip_weight_cast path)
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out2 = layer(inp.detach(), is_first_microbatch=False)
    assert torch.isfinite(out2).all(), "MXFP8 ETP second-microbatch output has non-finite values"

    dist.destroy_process_group()


def _worker_mxfp8_linear_unaligned(rank, world_size, port):
    """Verify MXFP8 ETP when out_features is not aligned to 16*world_size (padding path).

    MXFP8 requires tensor dims divisible by 32, so shard_size (= M_padded / world_size)
    must be a multiple of 32. With world_size=4 this requires M_padded % 128 == 0.
    out_f=120 gives M_padded=128, shard_size=32 (32 % 32 == 0). The last rank has
    24 real rows zero-padded to 32. After all-gather, _strip_padding removes the padded
    rows before the GEMM, so the output has the original out_f columns.
    """
    from transformer_engine.common.recipe import MXFP8BlockScaling
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # out_f=120: M_padded=128, shard_size=32, last rank has 24 rows padded to 32.
    # 120 is divisible by 8 (GEMM constraint), not by 64 (ETP alignment → padding needed).
    out_f = 120
    in_f = 64
    batch = 32
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=MXFP8BlockScaling()):
        out = layer(inp, is_first_microbatch=True)

    # After _strip_padding removes the padded rows, output has out_f (not padded) cols.
    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "MXFP8 ETP (unaligned) output has non-finite values"
    dist.destroy_process_group()


def _requires_mxfp8():
    available, reason = is_mxfp8_available(return_reason=True)
    if not available:
        pytest.skip(f"MXFP8 not available: {reason}")


class TestMXFP8LinearETP:
    def test_forward_backward(self):
        _requires_mxfp8()
        _requires_multi_gpu(4)
        _run_distributed(_worker_mxfp8_linear, 4)

    def test_forward_unaligned_padding(self):
        _requires_mxfp8()
        _requires_multi_gpu(4)
        _run_distributed(_worker_mxfp8_linear_unaligned, 4)


# ---------------------------------------------------------------------------
# 14. ETPConfig / update_config
# ---------------------------------------------------------------------------

class TestETPConfig:
    """update_config: valid keys round-trip; unknown keys raise ValueError."""

    def test_update_pad_for_alignment(self):
        original = etp_module.ETP_CONFIG.pad_for_alignment
        try:
            etp_module.update_config(pad_for_alignment=8)
            assert etp_module.ETP_CONFIG.pad_for_alignment == 8
        finally:
            # Restore the module-level config so other tests are unaffected.
            etp_module.update_config(pad_for_alignment=original)

    def test_update_weight_prefetch(self):
        original = etp_module.ETP_CONFIG.weight_prefetch
        try:
            etp_module.update_config(weight_prefetch=False)
            assert etp_module.ETP_CONFIG.weight_prefetch is False
        finally:
            etp_module.update_config(weight_prefetch=original)

    def test_invalid_key_raises(self):
        with pytest.raises(ValueError, match="Unknown ETP config option"):
            etp_module.update_config(nonexistent_key=123)


# ---------------------------------------------------------------------------
# 15. ETPShardedParam properties – shape computations and padding
ETPShardedParam properties – shape computations and padding +# --------------------------------------------------------------------------- + +class TestETPShardedParamProperties: + + class _FakeGroup: + def __init__(self, size=4, rank=0): + self._size = size + self._rank = rank + def size(self): return self._size + def rank(self): return self._rank + + def _make_param(self, shape, pad_length=0, group_size=4, group_rank=0, + is_padded_last_rank=False): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup(size=group_size, rank=group_rank) + p.pad_length = pad_length + p.is_padded_last_rank = is_padded_last_rank + p.expert_idx = None + return p + + # --- _unsharded_shape_padded --- + + def test_unsharded_shape_padded_no_padding(self): + # shape=(8, 4), group_size=4 → 8*4=32 rows, no padding + p = self._make_param((8, 4), pad_length=0, group_size=4, group_rank=2) + assert p._unsharded_shape_padded == (32, 4) + + def test_unsharded_shape_padded_last_rank_with_padding(self): + # shard has 15 real rows, pad_length=1, last rank → (15+1)*4=64 + p = self._make_param((15, 32), pad_length=1, group_size=4, group_rank=3, + is_padded_last_rank=True) + assert p._unsharded_shape_padded == (64, 32) + + def test_unsharded_shape_padded_non_last_rank_with_padding(self): + # Non-last rank: pad_length metadata set but shape just multiplied + p = self._make_param((16, 32), pad_length=1, group_size=4, group_rank=0, + is_padded_last_rank=False) + assert p._unsharded_shape_padded == (64, 32) + + # --- _unsharded_shape --- + + def test_unsharded_shape_no_padding(self): + p = self._make_param((8, 4), pad_length=0, group_size=4, group_rank=0) + assert p._unsharded_shape == (32, 4) + + def test_unsharded_shape_strips_padding(self): + # padded = 64, strip 1 → 63 + p = self._make_param((15, 32), pad_length=1, group_size=4, group_rank=3, + is_padded_last_rank=True) + assert p._unsharded_shape == (63, 32) + + # --- get_padded_shard --- + + def 
test_get_padded_shard_identity_when_no_padding(self): + p = self._make_param((6, 4), pad_length=0) + result = p.get_padded_shard() + assert result is p # identity – no copy needed + + def test_get_padded_shard_identity_non_last_rank(self): + # pad_length > 0 but not the padded last rank → no padding added + p = self._make_param((16, 4), pad_length=1, group_size=4, group_rank=0, + is_padded_last_rank=False) + result = p.get_padded_shard() + assert result is p + + def test_get_padded_shard_appends_zero_rows(self): + p = self._make_param((6, 4), pad_length=2, group_size=4, group_rank=3, + is_padded_last_rank=True) + padded = p.get_padded_shard() + assert padded.shape == (8, 4), f"Expected (8,4), got {padded.shape}" + assert torch.all(padded[6:] == 0), "Padding rows must be zero" + + # --- _strip_padding --- + + def test_strip_padding_identity_no_padding(self): + p = self._make_param((8, 4), pad_length=0) + t = torch.randn(32, 4) + assert p._strip_padding(t) is t + + def test_strip_padding_plain_tensor(self): + # Gathered weight [32, 4] with pad_length=1 → strip 1 row → [31, 4] + p = self._make_param((7, 4), pad_length=1, group_size=4, group_rank=0) + t = torch.randn(32, 4) + result = p._strip_padding(t) + assert result.shape == (31, 4) + assert torch.equal(result, t[:-1]) + + def test_strip_padding_multi_row(self): + # pad_length=4 strips 4 rows + p = self._make_param((12, 8), pad_length=4, group_size=4, group_rank=0) + t = torch.ones(64, 8) + result = p._strip_padding(t) + assert result.shape == (60, 8) + + +# --------------------------------------------------------------------------- +# 16. 
_get_cache_key – expert vs non-expert, fwd vs bwd +# --------------------------------------------------------------------------- + +class TestETPCacheKey: + + class _FakeGroup: + def size(self): return 4 + def rank(self): return 0 + + def _param(self, shape=(16, 32), expert_idx=None): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup() + p.expert_idx = expert_idx + p.pad_length = 0 + p.is_padded_last_rank = False + return p + + def test_non_expert_key_same_for_fwd_bwd(self): + """Non-routed params produce the same cache key for fwd and bwd.""" + p = self._param(expert_idx=None) + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) == \ + p._get_cache_key(torch.bfloat16, fwd=False, reduce_scatter=False) + + def test_expert_key_differs_fwd_bwd(self): + """For quantized (non-torch.dtype) recipes, expert fwd vs bwd keys differ.""" + p = self._param(expert_idx=0) + # _get_cache_key differentiates fwd/bwd only for non-torch.dtype objects + # (e.g. quantized recipe dtype descriptors). Use a mock to trigger that path. 
+ mock_dtype = "fp8" + assert p._get_cache_key(mock_dtype, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(mock_dtype, fwd=False, reduce_scatter=False) + + def test_different_expert_idx_different_keys(self): + """Two experts with same shape but different indices get distinct keys.""" + p0 = self._param(expert_idx=0) + p1 = self._param(expert_idx=1) + assert p0._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p1._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) + + def test_same_expert_idx_same_key(self): + """Same-shaped experts with the same idx share a cache key (cross-layer buffer reuse).""" + p_l0 = self._param(expert_idx=0) + p_l1 = self._param(expert_idx=0) + assert p_l0._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) == \ + p_l1._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) + + def test_different_dtypes_different_keys(self): + p = self._param() + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(torch.float32, fwd=True, reduce_scatter=False) + + def test_rs_key_differs_from_ag_key(self): + """reduce_scatter=True key must differ from reduce_scatter=False key.""" + p = self._param() + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=True) + + +# --------------------------------------------------------------------------- +# 17. 
ETPWeightCache.take() deferred vs get() immediate pool return +# --------------------------------------------------------------------------- + +class TestETPCacheRelease: + """Tests for ETPWeightCache reserve/get/release semantics.""" + + class _FakeGroup: + def size(self): return 2 + def rank(self): return 0 + + def _param(self, shape=(8, 4)): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup() + p.expert_idx = None + p.pad_length = 0 + p.is_padded_last_rank = False + p._quantizer = None + return p + + def test_release_returns_buffer_to_pool(self): + """release() puts the buffer back so the next reserve+get reuses it.""" + cache = ETPWeightCache() + p = self._param() + t1 = cache.reserve(p, torch.bfloat16, fwd=True) + buf1 = cache.get(t1) + cache.release(t1) + # New ticket should pop buf1 from pool + t2 = cache.reserve(p, torch.bfloat16, fwd=True) + buf2 = cache.get(t2) + assert buf2 is buf1, "Buffer should be reused after release()" + cache.release(t2) + + def test_without_release_pool_stays_empty(self): + """Without release(), subsequent reserves allocate fresh buffers.""" + cache = ETPWeightCache() + p = self._param() + t1 = cache.reserve(p, torch.bfloat16, fwd=True) + buf1 = cache.get(t1) + # Do NOT release t1 — pool stays empty + t2 = cache.reserve(p, torch.bfloat16, fwd=True) + buf2 = cache.get(t2) + assert buf2 is not buf1, "Without release, a fresh buffer must be allocated" + + def test_get_same_ticket_returns_same_buf(self): + """get() is idempotent — calling it twice returns the same buffer.""" + cache = ETPWeightCache() + p = self._param() + t = cache.reserve(p, torch.bfloat16, fwd=True) + buf_a = cache.get(t) + buf_b = cache.get(t) + assert buf_a is buf_b + cache.release(t) + + def test_release_invalid_ticket_raises(self): + cache = ETPWeightCache() + with pytest.raises(KeyError): + cache.release(9999) + + +# --------------------------------------------------------------------------- +# 18. 
tag_etp_params_with_names – _debug_name population +# --------------------------------------------------------------------------- + +class TestTagETPParamsWithNames: + + def test_debug_name_populated_for_etp_param(self): + """ETPShardedParam._debug_name is set to the dotted parameter path.""" + class _FakeGroup: + def size(self): return 1 + def rank(self): return 0 + + model = nn.Linear(4, 8, bias=False) + w = ETPShardedParam(torch.randn(8, 4)) + w.group = _FakeGroup() + model._parameters['weight'] = w + + etp_module.tag_etp_params_with_names(model) + assert w._debug_name == 'weight', \ + f"Expected 'weight', got '{w._debug_name}'" + + def test_nested_module_debug_name(self): + """Nested module produces a dotted debug name.""" + class _FakeGroup: + def size(self): return 1 + def rank(self): return 0 + + outer = nn.Sequential(nn.Linear(4, 8, bias=False)) + w = ETPShardedParam(torch.randn(8, 4)) + w.group = _FakeGroup() + outer._modules['0']._parameters['weight'] = w + + etp_module.tag_etp_params_with_names(outer) + assert w._debug_name == '0.weight', \ + f"Expected '0.weight', got '{w._debug_name}'" + + def test_non_etp_params_are_skipped(self): + """Plain nn.Parameter instances are silently ignored.""" + model = nn.Linear(4, 8) + etp_module.tag_etp_params_with_names(model) # must not raise + + +# --------------------------------------------------------------------------- +# 19. _finalize_wgrad – strip padding, fuse accumulation, hook invocation +# --------------------------------------------------------------------------- + +class TestFinalizeWgrad: + """Tests for ETPShardedParam._finalize_wgrad(param, wgrad_rs). + + Current behaviour: always accumulates wgrad_rs into param.main_grad, + strips padding when is_padded_last_rank=True, resets rs_state to NONE, + and returns a dummy-zero grad tensor with the same shape as main_grad. 
+ """ + + class _FakeGroup: + def size(self): return 2 + def rank(self): return 0 + + def _param(self, shape=(8, 4), pad_length=0, is_padded_last_rank=False, device="cuda"): + p = ETPShardedParam(torch.zeros(*shape, device=device)) + p.group = self._FakeGroup() + p.pad_length = pad_length + p.is_padded_last_rank = is_padded_last_rank + p.main_grad = torch.zeros(*shape, device=device) + return p + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_accumulates_into_main_grad(self): + p = self._param() + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert torch.all(p.main_grad == 1), "main_grad should equal wgrad after accumulation" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_returns_dummy_zero_grad(self): + p = self._param() + wgrad = torch.ones(8, 4, device="cuda") + result = ETPShardedParam._finalize_wgrad(p, wgrad) + assert result.shape == p.shape, "dummy grad shape must match shard shape" + assert torch.all(result == 0), "dummy grad must be zeroes" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_strips_padding_for_padded_rank(self): + # Shard has 7 real rows, pad_length=1, is_padded_last_rank=True. + # RS output has 8 rows (7 real + 1 pad); strip to 7. 
+ p = self._param(shape=(7, 4), pad_length=1, is_padded_last_rank=True) + # main_grad must match the real shard shape (7 rows) + p.main_grad = torch.zeros(7, 4, device="cuda") + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert torch.all(p.main_grad == 1), "main_grad (7 rows) should be fully updated" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_rs_state_reset_to_none(self): + p = self._param() + p._set_rs_state(ETPWeightState.DATA_READY_SYNC) + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert p.rs_state == ETPWeightState.NONE, "rs_state should be reset to NONE" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_grad_added_to_main_grad_flag(self): + p = self._param() + p.grad_added_to_main_grad = False + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert p.grad_added_to_main_grad is True + + +# --------------------------------------------------------------------------- +# 20. wrap_module_params_etp is a no-op when etp_group.size() == 1 +# --------------------------------------------------------------------------- + +class TestETPGroupSizeOne: + + class _SingletonGroup: + def size(self): return 1 + def rank(self): return 0 + + def test_no_sharding_when_etp_size_one(self): + """wrap_module_params_etp must be a no-op for a singleton ETP group.""" + mod = nn.Linear(32, 64, bias=False) + original_weight = mod.weight + wrap_module_params_etp(mod, ['weight'], self._SingletonGroup()) + assert mod.weight is original_weight, \ + "etp_group.size()==1 should leave parameters unchanged" + assert not isinstance(mod.weight, ETPShardedParam) + + +# --------------------------------------------------------------------------- +# 21. 
weight_prefetch=False: forward still produces correct output +# --------------------------------------------------------------------------- + +def _worker_prefetch_disabled(rank, world_size, port): + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 64 + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + etp_module.update_config(weight_prefetch=False) + try: + l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False, + params_dtype=dtype, device="cuda", etp_group=etp_group) + l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False, + params_dtype=dtype, device="cuda", etp_group=etp_group) + + inp = torch.randn(4, in_f, dtype=dtype, device="cuda") + dist.broadcast(inp, src=0) + + # Single forward pass: builds chain and verifies output is correct + out = l0(inp, is_first_microbatch=True) + l1(inp, is_first_microbatch=True) + + # Chain should still be wired even with prefetch disabled + assert l0.weight.next_w is l1.weight + assert torch.isfinite(out).all(), "Non-finite output with prefetch disabled" + finally: + etp_module.update_config(weight_prefetch=True) + dist.destroy_process_group() + + +class TestETPPrefetchDisabled: + def test_forward_works_without_prefetch(self): + _requires_multi_gpu(4) + _run_distributed(_worker_prefetch_disabled, 4) + + +# --------------------------------------------------------------------------- +# 22. 
fuse_wgrad_accumulation=True: wgrad is accumulated into main_grad +# --------------------------------------------------------------------------- + +def _worker_fuse_wgrad(rank, world_size, port): + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 128 # out_f % (16*world_size)==0, no padding + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + layer = te.Linear( + in_features=in_f, out_features=out_f, + bias=False, params_dtype=dtype, + device="cuda", etp_group=etp_group, + fuse_wgrad_accumulation=True, + ) + + # Allocate main_grad on the local shard shape + w = layer.weight + w.main_grad = torch.zeros(w.shape, dtype=dtype, device="cuda") + + inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True) + dist.broadcast(inp, src=0) + + layer(inp, is_first_microbatch=True).sum().backward() + + # With fused accumulation, wgrad was added into main_grad + assert torch.any(w.main_grad != 0), \ + "main_grad should have been updated by fused wgrad accumulation" + dist.destroy_process_group() + + +class TestFuseWgradAccumulation: + def test_wgrad_accumulated_into_main_grad(self): + _requires_multi_gpu(4) + _run_distributed(_worker_fuse_wgrad, 4) + + +# --------------------------------------------------------------------------- +# 23. _grad_accum_hook is called after reduce-scatter +# --------------------------------------------------------------------------- + +def _worker_main_grad_updated_after_bwd(rank, world_size, port): + """After backward, _finalize_wgrad must have accumulated wgrad into main_grad.""" + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 64 + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + layer = te.Linear( + in_features=in_f, out_features=out_f, + bias=False, params_dtype=dtype, + device="cuda", etp_group=etp_group, + ) + + # _finalize_wgrad always accumulates into main_grad; allocate before backward. 
+ layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda") + + inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True) + dist.broadcast(inp, src=0) + layer(inp, is_first_microbatch=True).sum().backward() + + assert torch.any(layer.weight.main_grad != 0), \ + "main_grad should have been updated by _finalize_wgrad after reduce-scatter" + dist.destroy_process_group() + + +class TestETPGradAccumHook: + def test_main_grad_updated_after_backward(self): + _requires_multi_gpu(4) + _run_distributed(_worker_main_grad_updated_after_bwd, 4) + + From 1d771ff3003068ebfb5d6f5da387fc6853065597 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 7 Apr 2026 00:38:55 -0700 Subject: [PATCH 12/43] import fix --- transformer_engine/pytorch/module/extended_tensor_parallelism.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 83b6dc98c0..07660fa540 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field +import math import re import torch from contextlib import nullcontext From fd55ede25e6d0b305cae8bca32677a0377487136 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Wed, 8 Apr 2026 20:11:47 -0700 Subject: [PATCH 13/43] move ag init to first pass Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 07660fa540..dda92b1bac 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ 
b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -431,8 +431,7 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): if fwd: - if p._ag_ticket_fwd is None: - p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + # The fwd ag buffer is always initialized in 'all_gather_and_prefetch' out_buffers.append(cache.get(p._ag_ticket_fwd)) else: if p._ag_ticket_bwd is None: @@ -597,11 +596,6 @@ def all_gather_and_prefetch( for w in self._weights: w._set_state(ETPWeightState.NONE) - if self.prev_w is not None: - cache = get_global_ETP_cache() - for w in self._weights: - cache.release(w._ag_ticket_fwd) - # Lazy population of linked list: link previous weight to current weight cls = type(self) if not self.prefetch_initialized: @@ -609,6 +603,17 @@ def all_gather_and_prefetch( cls._buffer_link_table_row(cls._last_weight, self) cls._last_weight.next_w = self self.prev_w = cls._last_weight + + cache = get_global_ETP_cache() + + # Set the fwd ag buffer + quantizers = [w._quantizer for w in self._weights] + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, self._weights)] + for w, dt in zip(self._weights, dtypes): + w._ag_ticket_fwd = cache.reserve(w, dt, fwd=True) + cache.get(w._ag_ticket_fwd) + cache.release(w._ag_ticket_fwd) + self.prefetch_initialized = True elif not cls._link_table_flushed and cls._link_table_buffer: # Second forward pass: flush the complete table atomically to avoid interleaving From 62379f5083e964e7df0a1c96daeb1e250ac94b21 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 9 Apr 2026 16:31:28 -0700 Subject: [PATCH 14/43] ETP+CG: 2-chain(dense+expert, no cross link prefetching) + shared ag/rs streams --- docs/README_ETP.md | 72 ++++++++- .../module/extended_tensor_parallelism.py | 143 ++++++++++++------ 2 files changed, 163 insertions(+), 52 deletions(-) diff --git a/docs/README_ETP.md b/docs/README_ETP.md index 
a0ef835614..d6c0a367da 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -37,9 +37,9 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | Mechanism | Description | |---|---| | **Alignment padding** | Shards padded to `ETPConfig.pad_for_alignment × etp_size` rows at construction via `get_padded_shard()`; only last rank carries padding (`is_padded_last_rank`); padding stripped in `_strip_padding()` both post-gather (before GEMM) and post-reduce-scatter (before wgrad accumulation) | -| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity | +| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity. Two independent chains are maintained: one for dense params (mamba/attn/shared_expert) and one for expert params (grouped_fc1/grouped_fc2) | | **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | -| **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | +| **Shared CUDA streams** | AG and RS run on shared CUDA streams (`get_ag_stream()`, `get_rs_stream()`) across all chains; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result. 
Streams must be shared because `ag_event` is recorded on the AG stream during CUDA graph capture; using a different stream at replay would cause `ag_event.wait()` to see a stale recording | | **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | | **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; `_finalize_wgrad()` resets `rs_state`, strips padding, and accumulates the result into `param.main_grad`, returning a dummy-zero grad to autograd | @@ -143,7 +143,7 @@ if etp_group is not None: del weight_tensor # free the temporary full-weight buffer ``` -For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. +For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. It also sets `chain_id='expert'` so expert params join the expert prefetch chain (separate from the dense chain). 
### State Machine @@ -193,10 +193,7 @@ classDiagram <> $ _pending_rs_weight : ETPShardedParam $ _first_weight_flag : bool - $ _last_weight : ETPShardedParam - $ _link_node_count : int - $ _link_table_buffer : List[str] - $ _link_table_flushed : bool + $ _chain_state : Dict[str, dict] +ETPWeightState state +ETPWeightState rs_state +int _ag_ticket_fwd @@ -214,6 +211,7 @@ classDiagram +bool prefetch_initialized +ETPShardedParam next_w +ETPShardedParam prev_w + +str chain_id +bool is_routed_expert +int expert_idx +ProcessGroup group @@ -277,6 +275,7 @@ classDiagram +tuple key +ETPShardedParam param +dtype + +str chain_id +bool reduce_scatter +bool fwd +Tensor buf @@ -378,6 +377,65 @@ A further practical difference is that ETP is **quantization-aware**: shards are --- +## Two-Chain Architecture (Dense + Expert) + +ETP maintains **two independent prefetch chains** to cleanly separate dense and expert weight management: + +| Chain | Params | NCCL Group | CUDA Graph | +|-------|--------|-----------|------------| +| **Dense** (`chain_id='dense'`) | mamba, attention, shared expert | `PARAMETER_SHARDING_GROUP` | Captured in graphs | +| **Expert** (`chain_id='expert'`) | grouped_fc1, grouped_fc2 | `EXPERT_PARAMETER_SHARDING_GROUP` | Runs eagerly | + +Both chains share the same `ag_stream` / `rs_stream` (see "Shared Streams" below). + +### Why Two Chains Instead of One? + +The original design used a **single global chain** linking all ETP params: + +``` +Single chain (old): +CG(mamba.fc1 -> mamba.fc2) -> CG(shared_expert.fc1 -> shared_expert.fc2) -> EAGER(grouped_fc1 -> grouped_fc2) -> CG(next_mamba.fc1 -> ...) -> ... + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + crosses CG/eager boundary +``` + +This caused two problems: + +1. **Cross-chain prefetch crossing CG/eager boundary**: The single linked list linked dense params (captured in CUDA graphs) to expert params (running eagerly). 
The prefetch chain crossed the CG/eager boundary, causing the captured AG event sequence to include expert weight prefetches. At 64+ GPU IB scale, this interaction corrupted NCCL communicator progress tracking across graph replays and caused deadlocks. + +2. **Complex fencing**: Numerous `_drain_etp_side_streams()` fences were needed at every CG/eager boundary (forward expert compute entry, backward dispatch/combine, finalize_model_grads). These fences were fragile, hard to reason about, and didn't fully solve the 64-GPU hang. + +The two-chain design eliminates both problems: + +``` +Dense chain: CG(mamba.fc1 -> mamba.fc2) -> CG(shared_expert.fc1 -> shared_expert.fc2) -> CG(next_mamba.fc1 -> ...) -> ... +Expert chain: EAGER(grouped_fc1_L1 -> grouped_fc2_L1) -> EAGER(grouped_fc1_L2 -> grouped_fc2_L2) -> ... + (never crosses into CG, never uses PARAMETER_SHARDING_GROUP) +``` + +Each chain uses its own NCCL communicator and stays entirely within one execution mode (CG or eager). + +### Chain Construction + +Each chain builds its own doubly-linked list independently via per-chain state in `_chain_state`: + +``` +Dense chain: mamba.fc1 -> mamba.fc2 -> shared_expert.fc1 -> shared_expert.fc2 -> next_mamba.fc1 -> ... +Expert chain: grouped_fc1_layer1 -> grouped_fc2_layer1 -> grouped_fc1_layer2 -> ... +``` + +The `chain_id` is set automatically: `wrap_module_params_etp(..., is_grouped=True)` sets `chain_id='expert'`; all other params default to `chain_id='dense'`. + +### Shared Streams + +Both chains share the same `ag_stream` and `rs_stream`. Per-chain streams were considered but cause correctness issues: the `ag_event` CUDA event object is recorded on `ag_stream` during CUDA graph capture. If expert params used a different stream at replay time, `ag_event.wait()` would see a stale recording, producing Inf gradients. 
Shared streams avoid this while the chain-level isolation (no cross-chain `next_w`/`prev_w` links) provides the key benefit of preventing prefetch chains from crossing the CG/eager boundary. + +### Buffer Cache + +The single global `ETPWeightCache` serves both chains. Cache keys already include `expert_idx`, so dense and expert buffers never collide. `reallocate_to_mempool()` only migrates **dense-chain** buffers into the CUDA graph memory pool; expert-chain buffers remain in regular allocator memory. + +--- + ## Scalability ETP scales along two independent dimensions: diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index dda92b1bac..00ba434379 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -47,20 +47,32 @@ class ETPWeightState(Enum): # Global set of ETPShardedParam with in-flight async comms (AG or RS). _inflight_comm_params: set = set() -AG_STREAM = None -RS_STREAM = None - -def get_ag_stream(): - global AG_STREAM - if AG_STREAM is None: - AG_STREAM = torch.cuda.Stream() - return AG_STREAM - -def get_rs_stream(): - global RS_STREAM - if RS_STREAM is None: - RS_STREAM = torch.cuda.Stream() - return RS_STREAM +_AG_STREAMS: Dict[str, torch.cuda.Stream] = {} +_RS_STREAMS: Dict[str, torch.cuda.Stream] = {} + +def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: + # All chains share one AG stream. The ag_event CUDA event object is recorded on + # this stream during graph capture; using a different stream at replay would cause + # ag_event.wait() to see a stale recording, producing Inf gradients. + key = 'shared' + if key not in _AG_STREAMS: + _AG_STREAMS[key] = torch.cuda.Stream() + return _AG_STREAMS[key] + +def get_rs_stream(chain_id: str = 'dense') -> torch.cuda.Stream: + # All chains share one RS stream (same reason as AG stream). 
+ key = 'shared' + if key not in _RS_STREAMS: + _RS_STREAMS[key] = torch.cuda.Stream() + return _RS_STREAMS[key] + +def get_all_ag_streams() -> list: + """Return all AG streams that have been created.""" + return list(_AG_STREAMS.values()) + +def get_all_rs_streams() -> list: + """Return all RS streams that have been created.""" + return list(_RS_STREAMS.values()) @dataclass class ETPConfig: @@ -133,6 +145,7 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): if is_grouped: etp_shard.expert_idx = idx etp_shard.is_routed_expert = True + etp_shard.chain_id = 'expert' etp_shard.group = etp_group etp_shard.ps_size = etp_size # register the newly sharded param back to the module @@ -167,13 +180,22 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True - _last_weight = None - _link_node_count = 0 - _link_table_buffer: List[str] = [] - _link_table_flushed: bool = False + # Per-chain state: each chain_id ('dense', 'expert') has its own linked list. 
+ _chain_state: Dict[str, dict] = {} @classmethod - def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam") -> None: + def _get_chain_state(cls, chain_id: str) -> dict: + if chain_id not in cls._chain_state: + cls._chain_state[chain_id] = { + 'last_weight': None, + 'link_node_count': 0, + 'link_table_buffer': [], + 'link_table_flushed': False, + } + return cls._chain_state[chain_id] + + @classmethod + def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam", chain: dict) -> None: """Buffer one row of the prefetch-link table (flushed atomically on the second forward pass).""" _W = 70 @@ -181,18 +203,20 @@ def _layer_id(name: str) -> str: m = re.search(r"\d+", name) return m.group() if m else "-" - cls._link_node_count += 1 - if cls._link_node_count == 1: - cls._link_table_buffer.append( + chain['link_node_count'] += 1 + if chain['link_node_count'] == 1: + chain_id = getattr(curr, 'chain_id', 'dense') + chain['link_table_buffer'].append( + f"\n[{chain_id} chain]" f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" f"\n{'-'*7}-+-{'-'*8}-+-{'-'*_W}-+-{'-'*_W}" ) # Seed weight (first ETP param) as row 0 - cls._link_table_buffer.append( + chain['link_table_buffer'].append( f"{'0':>7} | {_layer_id(prev._debug_name):>8} | {prev._debug_name:<{_W}} | -" ) - cls._link_table_buffer.append( - f"{cls._link_node_count:>7} | {_layer_id(curr._debug_name):>8} | " + chain['link_table_buffer'].append( + f"{chain['link_node_count']:>7} | {_layer_id(curr._debug_name):>8} | " f"{curr._debug_name:<{_W}} | {prev._debug_name}" ) @@ -219,6 +243,8 @@ def __init__(self, x, *args, **kwargs): self.prefetch_initialized = False self.next_w = None self.prev_w = None + # Chain identity: 'dense' for mamba/attn/shared_expert, 'expert' for grouped experts + self.chain_id = 'dense' # Grouped gemm self.is_routed_expert = False self.expert_idx = None @@ -480,7 +506,7 @@ def _wait_param_gather(self): # Since wait() may 
sychronize against a different stream than the current stream, # an event is recorded and waited on when the data is retrieved, which ensures the # AG always finishes before returning the unsharded param - with torch.cuda.stream(get_ag_stream()): + with torch.cuda.stream(get_ag_stream(self.chain_id)): if self._prefetch_handle is not None: self._prefetch_handle.wait() self._prefetch_handle = None @@ -597,12 +623,15 @@ def all_gather_and_prefetch( w._set_state(ETPWeightState.NONE) # Lazy population of linked list: link previous weight to current weight + # Uses per-chain state so dense and expert chains never cross-link. cls = type(self) + chain = cls._get_chain_state(self.chain_id) if not self.prefetch_initialized: - if cls._last_weight is not None and cls._last_weight.next_w is None: - cls._buffer_link_table_row(cls._last_weight, self) - cls._last_weight.next_w = self - self.prev_w = cls._last_weight + last_w = chain['last_weight'] + if last_w is not None and last_w.next_w is None: + cls._buffer_link_table_row(last_w, self, chain) + last_w.next_w = self + self.prev_w = last_w cache = get_global_ETP_cache() @@ -615,11 +644,11 @@ def all_gather_and_prefetch( cache.release(w._ag_ticket_fwd) self.prefetch_initialized = True - elif not cls._link_table_flushed and cls._link_table_buffer: + elif not chain['link_table_flushed'] and chain['link_table_buffer']: # Second forward pass: flush the complete table atomically to avoid interleaving - cls._link_table_flushed = True - print_rank_0("\n".join(cls._link_table_buffer) + "\n") - cls._last_weight = self + chain['link_table_flushed'] = True + print_rank_0("\n".join(chain['link_table_buffer']) + "\n") + chain['last_weight'] = self return result @@ -660,7 +689,7 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): # assert self._wgrad_rs_handle is not None or is_graph_capturing() - with torch.cuda.stream(get_rs_stream()): + with torch.cuda.stream(get_rs_stream(self.chain_id)): if self._wgrad_rs_handle is 
not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None @@ -798,6 +827,7 @@ class _TicketSlot: dtype: object # torch.dtype or tex.DType reduce_scatter: bool fwd: bool + chain_id: str = 'dense' # chain this slot belongs to buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() @@ -878,7 +908,8 @@ def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=Fal self._next_ticket += 1 self._slots[ticket] = _TicketSlot( - key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd + key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd, + chain_id=getattr(param, 'chain_id', 'dense'), ) return ticket @@ -909,17 +940,26 @@ def clear(self): self._total_bytes = 0 def reallocate_to_mempool(self, device, mempool): - """Re-allocate all ticket buffers into a CUDA graph memory pool. + """Re-allocate dense-chain ticket buffers into a CUDA graph memory pool. - Call BEFORE graph capture so every buffer lives in the capture pool - and no allocations are recorded inside the graph. + Call BEFORE graph capture so every dense-chain buffer lives in the capture + pool and no allocations are recorded inside the graph. Expert-chain buffers + are left in regular memory (expert compute runs eagerly, not in graphs). 
""" - # Clone the current memory pool buffers but into the passed in mempool + # Identify keys that belong to the dense chain + dense_keys = set() + for slot in self._slots.values(): + if slot.chain_id == 'dense': + dense_keys.add(slot.key) + + # Clone only dense-chain pool buffers into the passed in mempool self._total_bytes = 0 new_pool = defaultdict(list) torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) for key, buffers in self._pool.items(): + if key not in dense_keys: + continue new_buffers = [] for _ in range(len(buffers)): buf = self._allocate_buffer(*self.key_to_allocate_func[key]) @@ -927,17 +967,24 @@ def reallocate_to_mempool(self, device, mempool): new_pool[key] = new_buffers torch._C._cuda_endAllocateToPool(device, mempool) - # Map each buffer in the old pool to its corresponding new one + # Map each buffer in the old pool to its corresponding new one (dense only) old_to_new_buff = {} for key, old_pool in self._pool.items(): + if key not in dense_keys: + continue new = new_pool[key] for old_buf, new_buf in zip(old_pool, new): old_to_new_buff[old_buf] = new_buf - # Replace each slot's reference to its corresponding new one + + # Replace each dense slot's reference; keep expert slots unchanged for slot in self._slots.values(): - if slot.buf is not None: + if slot.chain_id == 'dense' and slot.buf is not None and slot.buf in old_to_new_buff: slot.buf = old_to_new_buff[slot.buf] + # Merge: dense keys get new buffers, expert keys keep old ones + for key, buffers in self._pool.items(): + if key not in dense_keys: + new_pool[key] = buffers self._pool = new_pool return @@ -955,10 +1002,16 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(): - """Wait on all in-flight ETP async communications (all-gathers + reduce-scatters). +def wait_async_comms(chain_id: str = None): + """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). 
+ + Args: + chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). + If None, drain all chains (backward compat). """ for param in list(_inflight_comm_params): + if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: + continue param._wait_param_gather() param._wait_reduce_scatter() From ba909cc543b4b3106307e1623d668b680b07fc86 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 9 Apr 2026 20:37:28 -0700 Subject: [PATCH 15/43] fix the case when ETP_Config.weight_prefetch is False. --- .../module/extended_tensor_parallelism.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 00ba434379..843d184043 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -554,14 +554,14 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): weight_total """ - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: result = self._get_prefetched_weight(False, skip_weight_cast=True) else: result = self._all_gather_weight_on_demand(False, skip_weight_cast=True) if ( - ETP_CONFIG.weight_prefetch - and self.prev_w is not None + ETP_CONFIG.weight_prefetch + and self.prev_w is not None and self.prev_w._need_weight_prefetch ): _, handle = self.prev_w._all_gather_weight( @@ -574,7 +574,7 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): for w in self._weights: w._set_state(ETPWeightState.NONE) - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: cache = get_global_ETP_cache() for w in self._weights: cache.release(w._ag_ticket_bwd) @@ -599,15 +599,15 @@ def all_gather_and_prefetch( Returns: weight_total """ - if self.prev_w is not None: + if ETP_CONFIG.weight_prefetch and self.prev_w is not 
None: result = self._get_prefetched_weight(True, skip_weight_cast, cast_noop_flag) else: result = self._all_gather_weight_on_demand(True, skip_weight_cast, cast_noop_flag) # Prefetch next weight if ( - ETP_CONFIG.weight_prefetch - and self.next_w is not None + ETP_CONFIG.weight_prefetch + and self.next_w is not None and self.next_w._need_weight_prefetch ): _, handle = self.next_w._all_gather_weight( @@ -776,7 +776,7 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Wait for last reduce scatter if it was async # Currently only support reduce scattering in reverse order - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: self.next_w._wait_reduce_scatter() self.next_w.rs_event.wait() From fd65c96c88cdc3fdfdbb6ac20d703f1ce7825114 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 12 Apr 2026 22:17:42 -0700 Subject: [PATCH 16/43] ETP+emb/output layers: remove these two layers from the prefetch chain. --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 843d184043..a9689db253 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -644,11 +644,11 @@ def all_gather_and_prefetch( cache.release(w._ag_ticket_fwd) self.prefetch_initialized = True + chain['last_weight'] = self elif not chain['link_table_flushed'] and chain['link_table_buffer']: # Second forward pass: flush the complete table atomically to avoid interleaving chain['link_table_flushed'] = True print_rank_0("\n".join(chain['link_table_buffer']) + "\n") - chain['last_weight'] = self return result From 650461137c60367e82438a719966bd326e7c6faf Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 13 Apr 2026 23:38:08 -0700 Subject: [PATCH 17/43] ET+CG mem fix1: use 
pooled buffers for both async and sync gathers to avoid allocating fresh memory each iteration. --- .../module/extended_tensor_parallelism.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a9689db253..e18bef56fa 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -450,21 +450,22 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv else: gather_weights = list(w.get_padded_shard() for w in weights) - # 4. Cache checkout (async only — sync gathers don't need pooled buffers). - if async_op: - dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] - out_buffers = [] - cache = get_global_ETP_cache() - for p, dt in zip(weights, dtypes): - if fwd: - # The fwd ag buffer is always initialized in 'all_gather_and_prefetch' - out_buffers.append(cache.get(p._ag_ticket_fwd)) - else: - if p._ag_ticket_bwd is None: - p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) - out_buffers.append(cache.get(p._ag_ticket_bwd)) - else: - out_buffers = None + # 4. Cache checkout — use pooled buffers for both async and sync gathers + # to avoid allocating fresh memory each iteration. + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + out_buffers = [] + cache = get_global_ETP_cache() + for p, dt in zip(weights, dtypes): + if fwd: + if p._ag_ticket_fwd is None: + p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + cache.get(p._ag_ticket_fwd) + cache.release(p._ag_ticket_fwd) + out_buffers.append(cache.get(p._ag_ticket_fwd)) + else: + if p._ag_ticket_bwd is None: + p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) + out_buffers.append(cache.get(p._ag_ticket_bwd)) # 5. Communicate. 
etp_group = weights[0].group From 14db8142dfa667aa184ec72cc6c63c86fb67b127 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 00:46:49 -0700 Subject: [PATCH 18/43] ETP+CG mem fix2: fix wgrad tensor retention and eliminate redundant AG allocations 1. Release NCCL Work C++ tensor refs promptly in ETPShardHandle.wait() (self.handle = None) so wgrad buffers are freed when RS is waited, not held until optimizer.step(). 2. Use cache buffers for sync all-gather path (not just async). The old code passed out_buffers=None for sync gathers, allocating ~22 GB/iter of fresh tensors. Sync gathers now reuse the same ETPWeightCache buffers as the async prefetch path. 3. Add standalone wgrad input buffer pool (_wgrad_buf_pool) for expert chain. get_wgrad_tensor() draws from the pool; buffers are returned after RS is waited via _wgrad_input_bufs stash in _wait_reduce_scatter. Reduces expert wgrad peak from ~4 GB (held until optimizer) to ~640 MB (16 buffers reused across all MoE layers). 4. Stash _wgrad_input_bufs for all chains (not just expert) so ungraphed dense weights (output layer) also drop Python refs at _wait_reduce_scatter instead of surviving until calc_params_l2_norm. 5. Fix tensor comparison crash in cache.release(): use identity check (any(b is slot.buf ...)) instead of tensor == which returns a multi-element bool tensor. 
--- .../module/extended_tensor_parallelism.py | 68 ++++++++++++++++--- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index e18bef56fa..1700d14129 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -50,6 +50,31 @@ class ETPWeightState(Enum): _AG_STREAMS: Dict[str, torch.cuda.Stream] = {} _RS_STREAMS: Dict[str, torch.cuda.Stream] = {} +# Standalone wgrad input buffer pool, keyed by (shape, dtype). +# Separate from ETPWeightCache because: +# 1. Wgrad buffers are expert-chain only (never graphed) +# 2. They need true release-then-reuse (the pool shrinks/grows), whereas +# ETPWeightCache keeps slot.buf set for CUDA graph address stability +_wgrad_buf_pool: Dict[tuple, list] = {} + + +def _wgrad_pool_get(shape: tuple, dtype: torch.dtype, device) -> torch.Tensor: + """Get a wgrad buffer from the pool, or allocate a fresh one.""" + key = (shape, dtype) + pool = _wgrad_buf_pool.get(key) + if pool: + return pool.pop() + return torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + + +def _wgrad_pool_put(buf: torch.Tensor): + """Return a wgrad buffer to the pool for reuse.""" + key = (tuple(buf.shape), buf.dtype) + if key not in _wgrad_buf_pool: + _wgrad_buf_pool[key] = [] + _wgrad_buf_pool[key].append(buf) + + def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: # All chains share one AG stream. 
The ag_event CUDA event object is recorded on # this stream during graph capture; using a different stream at replay would cause @@ -167,6 +192,7 @@ def __init__(self, handle, etp_shards, reduce_scatter=False): def wait(self): if self.handle is not None: self.handle.wait() + self.handle = None # Release NCCL Work and its C++ tensor references promptly for w in self.etp_shards: if self.reduce_scatter: w._set_rs_state(ETPWeightState.DATA_READY) @@ -659,12 +685,7 @@ def batched_all_gather_and_prefetch(self, **kwargs): return self.all_gather_and_prefetch(**kwargs) def get_wgrad_tensor(self): - return torch.empty( - self._unsharded_shape, - dtype=self.main_grad.dtype, - device=self.device, - requires_grad=False, - ) + return _wgrad_pool_get(self._unsharded_shape, self.main_grad.dtype, self.device) @staticmethod def _finalize_wgrad(param, wgrad_rs): @@ -695,6 +716,14 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() + # RS is done — drop stashed wgrad input buffer refs. + # Safe because handle.wait() above guarantees the RS kernel finished reading them. + # Expert-chain buffers go back to pool for reuse; dense-chain buffers just drop refs. + if getattr(self, '_wgrad_input_bufs', None) is not None: + if self.chain_id == 'expert': + for buf in self._wgrad_input_bufs: + _wgrad_pool_put(buf) + self._wgrad_input_bufs = None def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): """Reduce-scatter one or more wgrads. Returns (outputs, handle). @@ -764,15 +793,26 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights + # Expert-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). + # All ungraphed weights (expert + output layer) benefit from the stash + # (_wgrad_input_bufs) which drops Python refs once the RS is waited. 
+ poolable = self.chain_id == 'expert' + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) + # Stash wgrad input buffers — cannot recycle yet because the async RS + # kernel is still reading them on rs_stream. + self._wgrad_input_bufs = wgrads ret = tuple([None] * len(wgrads)) if batched else None else: - # Sync reduce-scatter (last weight in chain) + # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] + if poolable: + for buf in wgrads: + _wgrad_pool_put(buf) ret = result if batched else result[0] # Wait for last reduce scatter if it was async @@ -927,10 +967,18 @@ def get(self, ticket: int) -> torch.Tensor: return slot.buf def release(self, ticket: int): - """Return the buffer to the pool. Ticket remains valid.""" + """Return the buffer to the pool. Ticket remains valid. + + slot.buf is intentionally NOT cleared: get() must stay idempotent so that + CUDA-graph-captured buffers keep their fixed address across replays, and + reallocate_to_mempool() can find every dense-chain buffer. + """ slot = self._slots[ticket] - assert slot.buf is not None - if slot.buf not in self._pool[slot.key]: + if slot.buf is None: + return + # Use identity check — tensor == tensor returns a multi-element bool tensor + # which crashes in a boolean context ("Boolean value of Tensor is ambiguous"). 
+ if not any(b is slot.buf for b in self._pool.get(slot.key, [])): self._pool[slot.key].append(slot.buf) def clear(self): From e9acc1b10923ea634ba8bda92716cc7e893fde78 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 02:54:58 -0700 Subject: [PATCH 19/43] ETP+CG mem fix3: release gathered expert weights after dgrad GEMM in backward For nvfp4, batched_all_gather_and_prefetch_bwd() returns NVFP4TensorStorage objects with internal sub-tensors (columnwise data/scale_inv). The local `weights` variable kept them alive until function return, wasting memory through the wgrad phase. Delete `weights` immediately after the dgrad GEMM (the last consumer), saving weight_sizes for the fuse_wgrad_accumulation=False fallback path. --- transformer_engine/pytorch/module/grouped_linear.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index d11947be0e..3a3e8f86fc 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -453,6 +453,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) + # Gathered weights are no longer needed after dgrad GEMM. + # For nvfp4, the NVFP4TensorStorage and its sub-tensors (scale_inv etc.) + # would otherwise survive until function return via this local ref. 
+ if ctx.etp_size > 1: + weight_sizes = [w.size() for w in weights] + del weights + if ctx.weights_requires_grad: wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: @@ -464,9 +471,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: + sizes = weight_sizes if ctx.etp_size > 1 else [w.size() for w in weights] wgrad_list = [ - torch.empty(w.size(), dtype=ctx.activation_dtype, device=ctx.device) - for w in weights + torch.empty(sz, dtype=ctx.activation_dtype, device=ctx.device) + for sz in sizes ] if ctx.save_original_input: From 96515f1d4c2d839fb0520e93392e0055fae5ccae Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 08:46:34 -0700 Subject: [PATCH 20/43] [Conservative] fix ETP+CG+DDPOverlaping hang: serialize DDP RS and EETP AG on IB in expert backward ag_stream.wait_stream(main_stream) before batched_all_gather_and_prefetch_bwd in grouped_linear.py backward. With --overlap-grad-reduce + CG, DDP backward hooks fire a reduce-scatter (IB, main_stream) that races with the EETP all-gather (IB, ag_stream), causing NCCL deadlock at 64+ GPU IB scale --- transformer_engine/pytorch/module/grouped_linear.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 3a3e8f86fc..0aae97541c 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,6 +420,16 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: + # Drain main_stream before launching EETP all-gather on ag_stream. + # With --overlap-grad-reduce + CG, DDP backward hooks may have fired a + # reduce-scatter (IB, on main_stream) that races with the EETP AG (IB, + # on ag_stream). 
Making ag_stream wait for main_stream serializes IB ops. + from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + get_ag_stream, + ) + get_ag_stream(origin_weights[0].chain_id).wait_stream( + torch.cuda.current_stream() + ) weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: From d78bd538941a8a42b10fa94361cc427dfcd6fbd9 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 19:40:46 -0700 Subject: [PATCH 21/43] ETP+CG+DDP final fix: ETP: restore register_grad_accum_hook + _finalize_wgrad DDP hook trigger Re-add register_grad_accum_hook() to store DDP backward hook on ETP params. _finalize_wgrad now calls the hook after RS wait + main_grad.add_(), firing DDP register_grad_ready at the correct serialization point. This replaces the previous approach of skipping DDP hooks entirely for ETP params. param.grad = dummy_grad is a Python attr set (does NOT trigger autograd's grad accumulator); the explicit _grad_accum_hook() call is required. 
--- .../module/extended_tensor_parallelism.py | 31 +++++++++++++++++-- .../pytorch/module/grouped_linear.py | 10 ------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 1700d14129..830ae7f6e0 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -261,6 +261,9 @@ def __init__(self, x, *args, **kwargs): self._prefetch_handle = None self._need_weight_prefetch = True self.ag_event = torch.cuda.Event(external=True) + # DDP backward hook (set by register_grad_accum_hook) + self._grad_accum_node = None + self._grad_accum_hook = None # Quantization self._quantizer = None self.did_cast_to_low_precision = False @@ -687,12 +690,24 @@ def batched_all_gather_and_prefetch(self, **kwargs): def get_wgrad_tensor(self): return _wgrad_pool_get(self._unsharded_shape, self.main_grad.dtype, self.device) + def register_grad_accum_hook(self, grad_accum_node, hook): + """Register a DDP backward hook to be called from _finalize_wgrad. + + For ETP params, autograd may receive None (async RS) so the normal grad + accumulator hook never fires. Instead, _finalize_wgrad calls the hook + explicitly after RS wait + gradient accumulation, ensuring DDP's + register_grad_ready fires at exactly the right time. + """ + self._grad_accum_node = grad_accum_node + self._grad_accum_hook = hook + @staticmethod def _finalize_wgrad(param, wgrad_rs): - """Post-RS per-param processing: strip padding, accumulate into main_grad. + """Post-RS per-param processing: strip padding, accumulate, call DDP hook. - Accumulates the reduce-scattered wgrad into main_grad and returns - a dummy zero grad to autograd (DDP backward post hook is not used for ETP params). 
+ Accumulates the reduce-scattered wgrad into main_grad and triggers + the DDP backward hook (register_grad_ready) so the DP reduce-scatter + fires at the correct time during backward. """ param._set_rs_state(ETPWeightState.NONE) @@ -706,6 +721,16 @@ def _finalize_wgrad(param, wgrad_rs): if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + + # 3. Trigger DDP backward hook (register_grad_ready). + # ETP bypasses autograd's normal gradient flow (returns None for async RS, + # accumulates directly into main_grad), so we must trigger the DDP hook + # manually. param.grad = dummy_grad is a Python attribute set that does NOT + # fire autograd's grad accumulator hook — only the explicit call below does. + if getattr(param, '_grad_accum_hook', None) is not None: + param.grad = dummy_grad + param._grad_accum_hook() + return dummy_grad diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 0aae97541c..3a3e8f86fc 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,16 +420,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: - # Drain main_stream before launching EETP all-gather on ag_stream. - # With --overlap-grad-reduce + CG, DDP backward hooks may have fired a - # reduce-scatter (IB, on main_stream) that races with the EETP AG (IB, - # on ag_stream). Making ag_stream wait for main_stream serializes IB ops. 
- from transformer_engine.pytorch.module.extended_tensor_parallelism import ( - get_ag_stream, - ) - get_ag_stream(origin_weights[0].chain_id).wait_stream( - torch.cuda.current_stream() - ) weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: From 1fd09f513ff6560b97d9c191c66faa205ee6105f Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Wed, 15 Apr 2026 03:12:09 -0700 Subject: [PATCH 22/43] ETP+CG mem fix4: del main_grads after batched_wgrad_reduce_scatter to drop Python refs to wgrad input buffers immediately. The async RS still holds C++ refs via NCCL Work until _wait_reduce_scatter. Reduces peak memory during graph capture warmup (~320 MB per MoE layer). --- .../pytorch/module/extended_tensor_parallelism.py | 6 +++--- transformer_engine/pytorch/module/grouped_linear.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 830ae7f6e0..535f178988 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -725,10 +725,10 @@ def _finalize_wgrad(param, wgrad_rs): # 3. Trigger DDP backward hook (register_grad_ready). # ETP bypasses autograd's normal gradient flow (returns None for async RS, # accumulates directly into main_grad), so we must trigger the DDP hook - # manually. param.grad = dummy_grad is a Python attribute set that does NOT - # fire autograd's grad accumulator hook — only the explicit call below does. + # manually. Do NOT set param.grad before calling — the hook checks + # param.grad and would accumulate it into main_grad if zero_out_wgrad + # is True, corrupting the gradient with a non-zero dummy. 
if getattr(param, '_grad_accum_hook', None) is not None: - param.grad = dummy_grad param._grad_accum_hook() return dummy_grad diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 3a3e8f86fc..fca5b4ee61 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -560,6 +560,12 @@ def handle_custom_ddp_from_mcore(weight, wgrad): if ctx.etp_size > 1: wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list) + # Drop Python refs to wgrad input buffers. The async RS on rs_stream + # still holds C++ refs (via NCCL Work); those are released when + # _wait_reduce_scatter calls handle.wait() + self.handle = None. + # Without this del, main_grads keeps the tensors alive until function + # return, wasting memory during graph capture warmup. + del main_grads elif ctx.fuse_wgrad_accumulation: wgrad_list = [ handle_custom_ddp_from_mcore(weight, wgrad) From 66fb81c3b8d2dd045d0179fc38da5ca554988ee8 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Wed, 15 Apr 2026 19:54:51 -0700 Subject: [PATCH 23/43] update doc --- docs/README_ETP.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/README_ETP.md b/docs/README_ETP.md index d6c0a367da..52368579fb 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -28,8 +28,7 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | | **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | | **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | -| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. 
And kernels on sidestreams are no longer required to synchronize at graph breaks - | +| **CUDA Graphs compatible** | Dense-chain prefetches captured in graphs; expert-chain runs eagerly. DDP RS serialized via `register_grad_accum_hook` (called from `_finalize_wgrad` for eager params, from `_CudagraphReplayNode.backward` for graphed params). Forward drains at CG/eager boundary prevent IB races. | | **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | ### Implementation Mechanisms @@ -434,6 +433,42 @@ Both chains share the same `ag_stream` and `rs_stream`. Per-chain streams were c The single global `ETPWeightCache` serves both chains. Cache keys already include `expert_idx`, so dense and expert buffers never collide. `reallocate_to_mempool()` only migrates **dense-chain** buffers into the CUDA graph memory pool; expert-chain buffers remain in regular allocator memory. +### Excluding Params from the Chain + +Setting `weight.prefetch_initialized = True` at construction skips chain registration entirely. Megatron uses this for the embedding and output-layer weights, which perform synchronous all-gathers and must not join the dense chain (they execute outside the CUDA graph boundary, and linking them into the dense chain would cause the chain to cross the CG/eager boundary, reproducing the same NCCL deadlock as the old single-chain design). Setting `_need_weight_prefetch = False` in addition disables the async path so these weights always do synchronous AG. + +### ETP + DDP Serialization (`register_grad_accum_hook`) + +ETP bypasses autograd's normal gradient flow: `wgrad_reduce_scatter` returns `None` for async RS (chain interior params), and `_finalize_wgrad` accumulates directly into `main_grad`. 
As a result, autograd's grad accumulator never fires for these params, and standard DDP backward hooks (`grad_acc.register_hook`) would never trigger. + +Without proper serialization, DDP reduce-scatter (IB) and ETP reduce-scatter (IB) can run concurrently on different CUDA streams at 64+ GPU IB scale, causing NCCL deadlock. + +The solution: `register_grad_accum_hook(grad_acc, hook)` stores the DDP hook on the `ETPShardedParam`. `_finalize_wgrad` calls the hook **manually** after RS wait + gradient accumulation: + +```python +# _finalize_wgrad (called after RS is waited and gradient accumulated) +param.main_grad.add_(wgrad_rs) # gradient accumulated +param.grad = dummy_grad # Python attr set (does NOT fire autograd) +param._grad_accum_hook() # manually triggers DDP register_grad_ready +``` + +This fires `register_grad_ready` at exactly the right serialization point, ensuring DDP RS launches only after ETP RS completes. The hook trigger differs by execution mode: + +| Weight type | Hook trigger location | When | +|---|---|---| +| **Graphed dense** (mamba/attn/shared_expert) | `_CudagraphReplayNode.backward` in `cuda_graphs.py` | After graph replay (Python, not captured) | +| **Eager expert** (grouped_fc1/fc2) | `_finalize_wgrad` in `extended_tensor_parallelism.py` | After RS wait + `main_grad.add_` (Python, every iteration) | +| **Eager chain head** (sync RS) | `_finalize_wgrad` called directly in `wgrad_reduce_scatter` | Immediately after sync RS completes | + +For graphed params: `_finalize_wgrad` runs during capture but the hook returns early (`is_graph_capturing()`). At replay, `_finalize_wgrad` doesn't re-run from Python (captured GPU ops only). `_CudagraphReplayNode.backward` explicitly triggers the hook after setting `grad_added_to_main_grad = True`. + +### Forward-Path Drains at CG/Eager Boundary + +Before eager expert compute starts (`_forward_mlp_expert_compute`), two drains ensure no in-flight IB ops race with expert backward: + +1. 
`_drain_etp_side_streams('dense')` — drains the dense ETP AG prefetch (e.g., `AG(next_mamba_fc1)` launched by the preceding shared_expert GEMM on `ag_stream`) +2. `_drain_param_gather()` — drains async DDP param all-gather from `--overlap-param-gather` + CG (the forward pre-hook `finish_param_sync` is skipped during graph capture/replay) + --- ## Scalability From e09983cad09d93ad008b6d3ef327f58990fa8281 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 16 Apr 2026 07:15:47 -0700 Subject: [PATCH 24/43] ETP+CG: re-enable bwd ETP RS overlapping across Graphs. --- .../pytorch/module/extended_tensor_parallelism.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 535f178988..007a15ab2f 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1076,18 +1076,22 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None): +def wait_async_comms(chain_id: str = None, skip_rs: bool = False): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). If None, drain all chains (backward compat). + skip_rs: If True, only process AG handles (record ag_event on ag_stream) + and skip RS handles. Used to record a CUDA graph completion event + after AG but before RS, enabling cross-graph RS overlap. 
""" for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: continue param._wait_param_gather() - param._wait_reduce_scatter() + if not skip_rs: + param._wait_reduce_scatter() @dataclass From d72bcced27e3d20c4f85e9d13aea7dd6d4f6ff45 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 16 Apr 2026 19:37:49 -0700 Subject: [PATCH 25/43] Revert "ETP+CG: re-enable bwd ETP RS overlapping across Graphs." This reverts commit e09983cad09d93ad008b6d3ef327f58990fa8281. --- .../pytorch/module/extended_tensor_parallelism.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 007a15ab2f..535f178988 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1076,22 +1076,18 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None, skip_rs: bool = False): +def wait_async_comms(chain_id: str = None): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). If None, drain all chains (backward compat). - skip_rs: If True, only process AG handles (record ag_event on ag_stream) - and skip RS handles. Used to record a CUDA graph completion event - after AG but before RS, enabling cross-graph RS overlap. 
""" for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: continue param._wait_param_gather() - if not skip_rs: - param._wait_reduce_scatter() + param._wait_reduce_scatter() @dataclass From 7cc86fd7de159aeb9f537bc52b3c39abc4293248 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 19 Apr 2026 08:35:14 -0700 Subject: [PATCH 26/43] ETP: fix iter-2 NaN + unbounded wgrad pool growth; partition streams MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes exposed by adding embedding / output_layer to the UNGRAPHED prefetch chain: 1. iter-2 NaN (consumer-side race on AG output buffer). The async AG prefetch was issued from main_stream; NCCL's caller-stream preEvent queued behind pending CG/compute and NCCL started late, leaving the consumer GEMM reading a partially-written buffer. Fix: wrap the async issue in in both all_gather_and_prefetch (fwd) and all_gather_and_prefetch_bwd (bwd). Added a state guard in _get_prefetched_weight that asserts an AG was issued for this consume cycle — catches silent stale-cache reads from misconfigured _need_weight_prefetch flags. 2. Unbounded _wgrad_buf_pool growth. _wait_reduce_scatter pushed the wgrad input buffer into the pool unconditionally, but callers that don't acquire via _wgrad_pool_get (Megatron layers.py wgrad GEMM, aten F.embedding backward) never popped — every iter leaked N fresh buffers into the pool. Fix: tag pool-owned buffers at _wgrad_pool_get; _wgrad_pool_put no-ops on foreign buffers, letting the caching allocator recycle. Side effect: throughput 80 → 580 TFLOPs/GPU (pool thrash eliminated). 3. ag/rs streams partitioned by (chain_id, NCCL group). UNGRAPHED chain can span multiple communicators (ETP vs EETP); sharing a single user-level stream forced cross-group NCCL ops to serialize. Stream dicts are now keyed on (chain_id, id(group)); adds get_{ag,rs}_streams_for_chain() helpers. 
--- .../module/extended_tensor_parallelism.py | 326 ++++++++++++++---- 1 file changed, 253 insertions(+), 73 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 535f178988..87c39b99b5 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -27,6 +27,111 @@ DEBUG_TENSOR = None +class ETPChain(str, Enum): + """Prefetch chain identifier for an ETPShardedParam. + + GRAPHED — fwd/bwd captured by a CUDA graph (MLM _CudaGraphRunner). + UNGRAPHED — fwd/bwd runs eagerly; includes embedding/output_layer and + routed grouped experts always, plus router/shared_experts + when their scope tag is not in cuda_graph_scope. + + Chains never cross-link (prev_w/next_w stay within one chain). CG + disabled → single UNGRAPHED chain; full-iteration graph → single GRAPHED. + """ + GRAPHED = "ETP_graphed" + UNGRAPHED = "ETP_ungraphed" + + +# Module-level cuda_graph_scope, set by MLM at init via set_cuda_graph_scope(). +# None or empty → CG is disabled; every ETP param classifies as UNGRAPHED. +# Value is a set of scope tags; e.g. {"mamba","attn","moe_router"}. +_CUDA_GRAPH_SCOPE: Optional[set] = None +# Whether shared_experts are run with overlap (cannot be captured). When True, +# shared_experts stay UNGRAPHED regardless of moe_router scope inclusion, matching +# the transformer_layer.py guard that excludes them from the captured submodules. +_MOE_SHARED_EXPERT_OVERLAP: bool = False + + +def set_cuda_graph_scope(scope, moe_shared_expert_overlap: bool = False): + """Record the active cuda_graph_scope for ETP chain classification. + + Called by MLM at init, BEFORE classify_etp_chains(). ``scope`` may be + None, an empty iterable (CG disabled), or an iterable of scope tags. 
+ """ + global _CUDA_GRAPH_SCOPE, _MOE_SHARED_EXPERT_OVERLAP + _CUDA_GRAPH_SCOPE = set(scope) if scope else None + _MOE_SHARED_EXPERT_OVERLAP = bool(moe_shared_expert_overlap) + + +def _classify_param_chain(param_name: str) -> 'ETPChain': + """Classify an ETPShardedParam by name + active cuda_graph_scope. + + embedding / output_layer are always UNGRAPHED. Other kinds (mamba mixer, + self/cross_attention, shared_experts, routed experts) are GRAPHED iff + their scope tag is present in cuda_graph_scope; otherwise UNGRAPHED. + """ + n = param_name + + # Always ungraphed — embedding and output_layer live outside any CG runner. + if "embedding" in n or "output_layer" in n: + return ETPChain.UNGRAPHED + + scope = _CUDA_GRAPH_SCOPE + if not scope: + # CG disabled: every ETP param goes to the single UNGRAPHED chain. + return ETPChain.UNGRAPHED + + if ".mlp.shared_experts." in n: + if _MOE_SHARED_EXPERT_OVERLAP: + return ETPChain.UNGRAPHED + return ETPChain.GRAPHED if ("moe" in scope or "moe_router" in scope) else ETPChain.UNGRAPHED + + if ".mlp.experts." in n: + return ETPChain.GRAPHED if "moe" in scope else ETPChain.UNGRAPHED + + if ".self_attention." in n or ".cross_attention." in n: + return ETPChain.GRAPHED if "attn" in scope else ETPChain.UNGRAPHED + + if ".mixer." in n: + return ETPChain.GRAPHED if "mamba" in scope else ETPChain.UNGRAPHED + + return ETPChain.UNGRAPHED + + +def classify_etp_chains(model) -> None: + """Walk model.named_parameters() and set chain_id on every ETPShardedParam. + + Call once at init, AFTER set_cuda_graph_scope() and BEFORE the first fwd + of any graphed param. Raises if an already chain-initialized param would + be reclassified into a different chain (its prev/next links are already + wired into the wrong list). 
+ """ + conflicts = [] + for name, param in model.named_parameters(): + if not isinstance(param, ETPShardedParam): + continue + target = _classify_param_chain(name).value + if param.prefetch_initialized and param.chain_id != target: + conflicts.append((name, param.chain_id, target)) + continue + param.chain_id = target + + # Bwd-prefetch opt-out: embedding.word_embeddings.weight does not need + # an AG in the bwd pass (its wgrad is a scatter-add on sharded rows + # and its input has no dgrad). Skipping its bwd AG saves one collective. + if "embedding" in name: + param._need_weight_prefetch_bwd = False + if conflicts: + raise RuntimeError( + "classify_etp_chains: the following params were already chain-initialized " + "with a different chain_id than the classifier would assign — this means " + "their chain links are already wired into the wrong list. Move classification " + "earlier in init. Conflicts: " + + ", ".join(f"{n}: {old!r}->{new!r}" for n, old, new in conflicts[:3]) + + ("..." if len(conflicts) > 3 else "") + ) + + class ETPWeightState(Enum): NONE = "NONE" # Sharded, no pending operation ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress @@ -50,55 +155,84 @@ class ETPWeightState(Enum): _AG_STREAMS: Dict[str, torch.cuda.Stream] = {} _RS_STREAMS: Dict[str, torch.cuda.Stream] = {} -# Standalone wgrad input buffer pool, keyed by (shape, dtype). -# Separate from ETPWeightCache because: -# 1. Wgrad buffers are expert-chain only (never graphed) -# 2. They need true release-then-reuse (the pool shrinks/grows), whereas -# ETPWeightCache keeps slot.buf set for CUDA graph address stability +# Wgrad input buffer pool, keyed by (shape, dtype). UNGRAPHED-only: GRAPHED +# wgrad bufs need address stability for CG replay and are not pool-recycled. 
_wgrad_buf_pool: Dict[tuple, list] = {} def _wgrad_pool_get(shape: tuple, dtype: torch.dtype, device) -> torch.Tensor: - """Get a wgrad buffer from the pool, or allocate a fresh one.""" + """Get a pool buffer or allocate fresh. Tagged so _wgrad_pool_put accepts + only pool-owned buffers — callers that don't use _wgrad_pool_get (e.g. + Megatron layers.py wgrad GEMM, aten F.embedding bwd) fall through to the + caching allocator on release.""" key = (shape, dtype) pool = _wgrad_buf_pool.get(key) if pool: - return pool.pop() - return torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + buf = pool.pop() + else: + buf = torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + buf._from_etp_wgrad_pool = True + return buf def _wgrad_pool_put(buf: torch.Tensor): - """Return a wgrad buffer to the pool for reuse.""" + """Return a pool-owned buffer for reuse (no-op for untagged buffers; see + _wgrad_pool_get).""" + if not getattr(buf, '_from_etp_wgrad_pool', False): + return key = (tuple(buf.shape), buf.dtype) if key not in _wgrad_buf_pool: _wgrad_buf_pool[key] = [] _wgrad_buf_pool[key].append(buf) -def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: - # All chains share one AG stream. The ag_event CUDA event object is recorded on - # this stream during graph capture; using a different stream at replay would cause - # ag_event.wait() to see a stale recording, producing Inf gradients. - key = 'shared' +def _stream_key(chain_id: str, group) -> tuple: + """Key for the per-(chain, group) AG/RS stream dicts. + + Two partitioning axes: + - chain_id: captured (GRAPHED) vs eager (UNGRAPHED) ops must not share + a stream (eager ops would contaminate capture/replay state). + - group: independent NCCL communicators (e.g. ETP vs EETP) get their + own user-level stream to avoid cross-group serialization. 
+ """ + return (chain_id, id(group) if group is not None else 0) + + +def get_ag_stream(chain_id: str = ETPChain.GRAPHED.value, group=None) -> torch.cuda.Stream: + """Return the ETP all-gather stream for (chain_id, group). See _stream_key.""" + key = _stream_key(chain_id, group) if key not in _AG_STREAMS: _AG_STREAMS[key] = torch.cuda.Stream() return _AG_STREAMS[key] -def get_rs_stream(chain_id: str = 'dense') -> torch.cuda.Stream: - # All chains share one RS stream (same reason as AG stream). - key = 'shared' + +def get_rs_stream(chain_id: str = ETPChain.GRAPHED.value, group=None) -> torch.cuda.Stream: + """Return the ETP reduce-scatter stream for (chain_id, group). See _stream_key.""" + key = _stream_key(chain_id, group) if key not in _RS_STREAMS: _RS_STREAMS[key] = torch.cuda.Stream() return _RS_STREAMS[key] + def get_all_ag_streams() -> list: - """Return all AG streams that have been created.""" + """All AG streams created so far, across chains and groups.""" return list(_AG_STREAMS.values()) + def get_all_rs_streams() -> list: - """Return all RS streams that have been created.""" + """All RS streams created so far, across chains and groups.""" return list(_RS_STREAMS.values()) + +def get_ag_streams_for_chain(chain_id: str) -> list: + """AG streams for one chain (all groups that chain has touched).""" + return [s for k, s in _AG_STREAMS.items() if k[0] == chain_id] + + +def get_rs_streams_for_chain(chain_id: str) -> list: + """RS streams for one chain (all groups that chain has touched).""" + return [s for k, s in _RS_STREAMS.items() if k[0] == chain_id] + @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" @@ -170,7 +304,10 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): if is_grouped: etp_shard.expert_idx = idx etp_shard.is_routed_expert = True - etp_shard.chain_id = 'expert' + # Grouped routed experts are UNGRAPHED unless the "moe" scope captures + # them; classify_etp_chains() will fix 
this up at init time based on + # the actual cuda_graph_scope. We set UNGRAPHED here as a safe default. + etp_shard.chain_id = ETPChain.UNGRAPHED.value etp_shard.group = etp_group etp_shard.ps_size = etp_size # register the newly sharded param back to the module @@ -206,7 +343,9 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True - # Per-chain state: each chain_id ('dense', 'expert') has its own linked list. + # Per-chain state: each chain_id (ETPChain.GRAPHED / ETPChain.UNGRAPHED) has + # its own linked list. Chains never cross-link: prev_w/next_w only connect + # params with the same chain_id. _chain_state: Dict[str, dict] = {} @classmethod @@ -231,7 +370,7 @@ def _layer_id(name: str) -> str: chain['link_node_count'] += 1 if chain['link_node_count'] == 1: - chain_id = getattr(curr, 'chain_id', 'dense') + chain_id = getattr(curr, 'chain_id', ETPChain.UNGRAPHED.value) chain['link_table_buffer'].append( f"\n[{chain_id} chain]" f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" @@ -260,6 +399,11 @@ def __init__(self, x, *args, **kwargs): self._ag_ticket_bwd = None self._prefetch_handle = None self._need_weight_prefetch = True + # Per-direction prefetch opt-outs. Default True. The embedding weight + # never needs an AG during bwd (its wgrad is a scatter-add indexed by + # token ids, and its input is non-differentiable, so no dgrad either). + # classify_etp_chains() sets this to False for embedding.word_embeddings.weight. + self._need_weight_prefetch_bwd = True self.ag_event = torch.cuda.Event(external=True) # DDP backward hook (set by register_grad_accum_hook) self._grad_accum_node = None @@ -272,8 +416,11 @@ def __init__(self, x, *args, **kwargs): self.prefetch_initialized = False self.next_w = None self.prev_w = None - # Chain identity: 'dense' for mamba/attn/shared_expert, 'expert' for grouped experts - self.chain_id = 'dense' + # Chain identity (ETPChain.GRAPHED / ETPChain.UNGRAPHED). 
Defaults to + # UNGRAPHED as a safe fallback; classify_etp_chains(model) walks the + # model at init time (after set_cuda_graph_scope) and reclassifies + # based on param name + active cuda_graph_scope. + self.chain_id = ETPChain.UNGRAPHED.value # Grouped gemm self.is_routed_expert = False self.expert_idx = None @@ -533,14 +680,22 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Since wait() may sychronize against a different stream than the current stream, - # an event is recorded and waited on when the data is retrieved, which ensures the - # AG always finishes before returning the unsharded param - with torch.cuda.stream(get_ag_stream(self.chain_id)): - if self._prefetch_handle is not None: - self._prefetch_handle.wait() - self._prefetch_handle = None - self.ag_event.record() + # Wait-site for the async AG. Issuer (all_gather_and_prefetch{,_bwd}) + # and wait-site both use the TARGET's ag_stream so the caller-stream + # "preEvent" PyTorch records at issue time lives on an idle stream. + # A busy issue-stream would queue the preEvent behind pending work, + # delay NCCL start, and — even with the sync chain main ← ag_event ← + # ag_stream handle.wait() ← NCCL endEvent — leave the consumer GEMM + # reading a partial AG buffer. (NCCL kernel itself runs on PyTorch's + # per-PG ncclStream, not ag_stream.) handle.wait() here inserts the + # wait on NCCL's completion event into ag_stream; ag_event.record() + # then marks ag_stream for consumers (main_stream via ag_event.wait + # or MLM drains via main.wait_stream). 
+ with torch.cuda.stream(get_ag_stream(self.chain_id, self.group)): + if self._prefetch_handle is not None: + self._prefetch_handle.wait() + self._prefetch_handle = None + self.ag_event.record() def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_flag=None): result, _ = self._all_gather_weight( @@ -555,6 +710,20 @@ def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_fl return result if self.is_routed_expert else result[0] def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + # Stale-read guard: state must reflect an AG issued for this cycle; + # otherwise cache.get() would return the prior iter's AG buffer. + if ETP_CONFIG.check_param_states: + for w in self._weights: + assert w.state in ( + ETPWeightState.ASYNC_WAIT, + ETPWeightState.DATA_READY, + ETPWeightState.DATA_READY_SYNC, + ), ( + f"[ETP] _get_prefetched_weight({'fwd' if fwd else 'bwd'}) on " + f"{self._debug_name} with state={w.state!r} — no AG issued; " + f"cache.get() would return stale data. Check the chain's " + f"_need_weight_prefetch flag and issuer's prefetch logic." + ) # Wait for async prefetch if in progress self._wait_param_gather() self.ag_event.wait() @@ -593,11 +762,15 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): ETP_CONFIG.weight_prefetch and self.prev_w is not None and self.prev_w._need_weight_prefetch + and self.prev_w._need_weight_prefetch_bwd ): - _, handle = self.prev_w._all_gather_weight( - async_op=True, skip_weight_cast=True, cast_noop_flag=None, - fwd=False, nvtx_label=nvtx_label, - ) + # Issue on the target's ag_stream (see _wait_param_gather). 
+ target_stream = get_ag_stream(self.prev_w.chain_id, self.prev_w.group) + with torch.cuda.stream(target_stream): + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) self.prev_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -640,12 +813,15 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - _, handle = self.next_w._all_gather_weight( - async_op=True, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, nvtx_label=nvtx_label, - ) + # Issue on the target's ag_stream (see _wait_param_gather). + target_stream = get_ag_stream(self.next_w.chain_id, self.next_w.group) + with torch.cuda.stream(target_stream): + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) self.next_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -735,17 +911,19 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # assert self._wgrad_rs_handle is not None or is_graph_capturing() - with torch.cuda.stream(get_rs_stream(self.chain_id)): + # Asymmetric wrt _wait_param_gather: RS is issued from main_stream + # (not rs_stream) because main produced the RS input (wgrad) and + # naturally holds the write→read ordering. Wait-site enters rs_stream + # so it observes NCCL completion and rs_event marks it for consumers. + with torch.cuda.stream(get_rs_stream(self.chain_id, self.group)): if self._wgrad_rs_handle is not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() - # RS is done — drop stashed wgrad input buffer refs. - # Safe because handle.wait() above guarantees the RS kernel finished reading them. 
- # Expert-chain buffers go back to pool for reuse; dense-chain buffers just drop refs. + # Release stashed wgrad inputs: UNGRAPHED buffers go back to the pool; + # GRAPHED just drops Python refs (addresses must stay stable for CG). if getattr(self, '_wgrad_input_bufs', None) is not None: - if self.chain_id == 'expert': + if self.chain_id == ETPChain.UNGRAPHED.value: for buf in self._wgrad_input_bufs: _wgrad_pool_put(buf) self._wgrad_input_bufs = None @@ -818,10 +996,10 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - # Expert-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). - # All ungraphed weights (expert + output layer) benefit from the stash - # (_wgrad_input_bufs) which drops Python refs once the RS is waited. - poolable = self.chain_id == 'expert' + # UNGRAPHED-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). + # GRAPHED-chain wgrads cannot pool-recycle because CUDA graphs require + # stable buffer addresses across replay. 
+ poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) @@ -893,7 +1071,7 @@ class _TicketSlot: dtype: object # torch.dtype or tex.DType reduce_scatter: bool fwd: bool - chain_id: str = 'dense' # chain this slot belongs to + chain_id: str = ETPChain.GRAPHED.value # chain this slot belongs to buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() @@ -975,7 +1153,7 @@ def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=Fal self._slots[ticket] = _TicketSlot( key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd, - chain_id=getattr(param, 'chain_id', 'dense'), + chain_id=getattr(param, 'chain_id', ETPChain.UNGRAPHED.value), ) return ticket @@ -1014,25 +1192,26 @@ def clear(self): self._total_bytes = 0 def reallocate_to_mempool(self, device, mempool): - """Re-allocate dense-chain ticket buffers into a CUDA graph memory pool. + """Re-allocate GRAPHED-chain ticket buffers into a CUDA graph memory pool. - Call BEFORE graph capture so every dense-chain buffer lives in the capture - pool and no allocations are recorded inside the graph. Expert-chain buffers - are left in regular memory (expert compute runs eagerly, not in graphs). + Call BEFORE graph capture so every GRAPHED-chain buffer lives in the capture + pool and no allocations are recorded inside the graph. UNGRAPHED-chain + buffers are left in regular memory (they are never referenced by any + captured graph). 
""" - # Identify keys that belong to the dense chain - dense_keys = set() + # Identify keys that belong to the GRAPHED chain + graphed_keys = set() for slot in self._slots.values(): - if slot.chain_id == 'dense': - dense_keys.add(slot.key) + if slot.chain_id == ETPChain.GRAPHED.value: + graphed_keys.add(slot.key) - # Clone only dense-chain pool buffers into the passed in mempool + # Clone only GRAPHED-chain pool buffers into the passed in mempool self._total_bytes = 0 new_pool = defaultdict(list) torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) for key, buffers in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: continue new_buffers = [] for _ in range(len(buffers)): @@ -1041,23 +1220,23 @@ def reallocate_to_mempool(self, device, mempool): new_pool[key] = new_buffers torch._C._cuda_endAllocateToPool(device, mempool) - # Map each buffer in the old pool to its corresponding new one (dense only) + # Map each buffer in the old pool to its corresponding new one (GRAPHED only) old_to_new_buff = {} for key, old_pool in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: continue new = new_pool[key] for old_buf, new_buf in zip(old_pool, new): old_to_new_buff[old_buf] = new_buf - # Replace each dense slot's reference; keep expert slots unchanged + # Replace each GRAPHED slot's reference; keep UNGRAPHED slots unchanged for slot in self._slots.values(): - if slot.chain_id == 'dense' and slot.buf is not None and slot.buf in old_to_new_buff: + if slot.chain_id == ETPChain.GRAPHED.value and slot.buf is not None and slot.buf in old_to_new_buff: slot.buf = old_to_new_buff[slot.buf] - # Merge: dense keys get new buffers, expert keys keep old ones + # Merge: GRAPHED keys get new buffers, UNGRAPHED keys keep old ones for key, buffers in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: new_pool[key] = buffers self._pool = new_pool return @@ -1080,11 +1259,12 @@ def 
wait_async_comms(chain_id: str = None): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: - chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). - If None, drain all chains (backward compat). + chain_id: If specified, only drain params belonging to this chain + (ETPChain.GRAPHED.value or ETPChain.UNGRAPHED.value). + If None, drain all chains. """ for param in list(_inflight_comm_params): - if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: + if chain_id is not None and getattr(param, 'chain_id', ETPChain.UNGRAPHED.value) != chain_id: continue param._wait_param_gather() param._wait_reduce_scatter() From a652a63e952045c3faafccdf5cd3dc59e63f5c29 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 21 Apr 2026 04:51:47 -0700 Subject: [PATCH 27/43] ETP+nvfp4: coalescing amax reduction for fprop groupedgemm --- transformer_engine/pytorch/csrc/common.h | 21 +++- transformer_engine/pytorch/csrc/extensions.h | 7 ++ .../pytorch/csrc/extensions/cast.cpp | 66 ++++++++++ .../pytorch/csrc/extensions/pybind.cpp | 7 ++ transformer_engine/pytorch/csrc/quantizer.cpp | 52 +++++++- .../module/extended_tensor_parallelism.py | 113 +++++++++++++++++- 6 files changed, 262 insertions(+), 4 deletions(-) diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 6aab9938b3..e5ead50d09 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -365,11 +365,30 @@ class NVFP4Quantizer : public Quantizer { */ void quantize_with_amax(TensorWrapper& input, TensorWrapper& out); + /*! @brief Compute (and D2D fill) local amax only — no cast, no allreduce. + * + * Writes the local amax into out's rowwise and/or columnwise amax + * buffers. Callers are expected to perform a coalesced allreduce + * across the amax reduction group afterwards, then invoke + * quantize_cast_only to finish the cast with the reduced amax. 
+ */ + void compute_amax_only(const TensorWrapper& input, TensorWrapper& out); + + /*! @brief Cast to NVFP4 assuming amax already reduced externally. + * + * Skips both local amax compute and the internal amax allreduce. + * Callers must guarantee out's amax buffers already hold the reduced + * amax (e.g. via compute_amax_only + allreduce_coalesced). + */ + void quantize_cast_only(const TensorWrapper& input, TensorWrapper& out, + const std::optional& noop_flag = std::nullopt); + std::vector get_scale_shape(const std::vector& shape, bool columnwise) const; private: void quantize_impl(const TensorWrapper& input, TensorWrapper& out, - const std::optional& noop_flag, bool compute_amax); + const std::optional& noop_flag, bool compute_amax, + bool skip_amax_reduction = false); }; std::unique_ptr convert_quantizer(py::handle quantizer); diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index e4d4e5094c..fc26d025c4 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -285,6 +285,13 @@ std::vector rmsnorm_fwd(const py::handle &input, const py::handle &w py::object quantize(const at::Tensor &tensor, py::handle quantizer, const py::object &output, std::optional noop_flag); +// NVFP4-only split-phase quantize: compute amax, coalesce allreduce externally, then cast. 
+py::object compute_amax_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output); +py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output, + std::optional noop_flag); + py::object dequantize(const py::handle &input, DType otype); py::object group_quantize(const at::Tensor &tensor, py::handle quantizer, const size_t num_tensors, diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp index f8f793f036..2c261c3c6d 100644 --- a/transformer_engine/pytorch/csrc/extensions/cast.cpp +++ b/transformer_engine/pytorch/csrc/extensions/cast.cpp @@ -80,6 +80,72 @@ py::object quantize(const at::Tensor &tensor, py::handle quantizer, const py::ob return output_py; } +/*! @brief NVFP4-only: compute local amax into `output`'s amax buffers, no cast, no allreduce. + * + * Pair with an external coalesced allreduce of the returned amax tensors, + * then call `quantize_cast_only_nvfp4` to finish the cast. 
+ */ +py::object compute_amax_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output) { + NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer.ptr()), + "compute_amax_nvfp4 requires an NVFP4Quantizer"); + auto quantizer_cpp = convert_quantizer(quantizer); + auto *nvfp4_quantizer = dynamic_cast(quantizer_cpp.get()); + NVTE_CHECK(nvfp4_quantizer != nullptr, "Failed to cast quantizer to NVFP4Quantizer"); + + auto input_contiguous = tensor.contiguous(); + auto input_cpp = makeTransformerEngineTensor(input_contiguous); + + TensorWrapper output_cpp; + py::object output_py; + if (output.is_none()) { + const auto shape = get_tensor_shape(input_cpp); + const auto fake_dtype = input_cpp.dtype(); + std::tie(output_cpp, output_py) = quantizer_cpp->create_tensor(shape, fake_dtype); + } else { + std::tie(output_cpp, output_py) = quantizer_cpp->convert_and_update_tensor(output); + } + + nvfp4_quantizer->compute_amax_only(input_cpp, output_cpp); + return output_py; +} + +/*! @brief NVFP4-only: cast to FP4 using pre-reduced amax in `output`'s amax buffers. + * + * Skips both local amax compute and the internal allreduce. Caller must have + * already populated `output`'s amax via compute_amax_nvfp4 + coalesced allreduce. 
+ */ +py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output, + std::optional noop_flag) { + NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer.ptr()), + "quantize_cast_only_nvfp4 requires an NVFP4Quantizer"); + auto quantizer_cpp = convert_quantizer(quantizer); + auto *nvfp4_quantizer = dynamic_cast(quantizer_cpp.get()); + NVTE_CHECK(nvfp4_quantizer != nullptr, "Failed to cast quantizer to NVFP4Quantizer"); + + auto input_contiguous = tensor.contiguous(); + auto input_cpp = makeTransformerEngineTensor(input_contiguous); + + TensorWrapper output_cpp; + py::object output_py; + if (output.is_none()) { + const auto shape = get_tensor_shape(input_cpp); + const auto fake_dtype = input_cpp.dtype(); + std::tie(output_cpp, output_py) = quantizer_cpp->create_tensor(shape, fake_dtype); + } else { + std::tie(output_cpp, output_py) = quantizer_cpp->convert_and_update_tensor(output); + } + + std::optional noop_flag_cpp; + if (noop_flag.has_value()) { + noop_flag_cpp = makeTransformerEngineTensor(*noop_flag); + } + + nvfp4_quantizer->quantize_cast_only(input_cpp, output_cpp, noop_flag_cpp); + return output_py; +} + namespace { // helper functions for NVFP4 grouped quantization (cuda graph safe with shapes stored in device without D2H copy) diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 8302a13010..daee89a038 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -137,6 +137,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { NVTE_DECLARE_COMMON_PYBIND11_HANDLES(m) m.def("quantize", transformer_engine::pytorch::quantize, py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), py::arg("noop") = py::none()); + m.def("compute_amax_nvfp4", transformer_engine::pytorch::compute_amax_nvfp4, + "NVFP4: compute local amax into output's amax buffers; no cast, no 
allreduce", + py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none()); + m.def("quantize_cast_only_nvfp4", transformer_engine::pytorch::quantize_cast_only_nvfp4, + "NVFP4: cast using pre-reduced amax in output's amax buffers; skips amax compute and allreduce", + py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), + py::arg("noop") = py::none()); m.def("dequantize", &transformer_engine::pytorch::dequantize, "Dequantize", py::arg("input"), py::arg("otype")); m.def("group_quantize", transformer_engine::pytorch::group_quantize, py::arg("tensor"), diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index 0214f7ff71..ac8fa26bc3 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -2121,7 +2121,7 @@ std::pair NVFP4Quantizer::convert_and_update_tensor( void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& out, const std::optional& noop_flag, - bool compute_amax) { + bool compute_amax, bool skip_amax_reduction) { // Nothing to be done if input is empty if (input.numel() == 0) { return; @@ -2225,7 +2225,7 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou } // amax reduction - if (this->with_amax_reduction) { + if (this->with_amax_reduction && !skip_amax_reduction) { std::vector amax_tensors; // push amax tensors inside if they need to be reduced auto make_amax_tensor = [](void* data_ptr) { @@ -2378,6 +2378,54 @@ void NVFP4Quantizer::quantize_with_amax(TensorWrapper& input, TensorWrapper& out this->quantize_impl(input, out, std::nullopt, false); } +void NVFP4Quantizer::compute_amax_only(const TensorWrapper& input, TensorWrapper& out) { + // Nothing to be done if input is empty + if (input.numel() == 0) { + return; + } + + // Only the non-RHT path is supported for the split-phase API today. 
+ // RHT path's amax depends on the RHT-rotated view, which is produced + // alongside the cast; decoupling amax from cast is not meaningful there. + NVTE_CHECK(!this->with_rht, + "NVFP4Quantizer::compute_amax_only does not support with_rht=true"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + QuantizationConfigWrapper quant_config; + quant_config.set_nvfp4_2d_quantization(this->with_2d_quantization); + + // Mirror the compute-amax block of quantize_impl exactly. + auto rowwise_amax_ptr = out.get_amax().data_ptr; + auto columnwise_amax_ptr = out.get_columnwise_amax().data_ptr; + void* amax_ptr = rowwise_amax_ptr != nullptr ? rowwise_amax_ptr : columnwise_amax_ptr; + NVTE_CHECK(amax_ptr != nullptr, "Could not find amax pointer"); + + out.set_amax(amax_ptr, DType::kFloat32, std::vector{1}); + NVTE_SCOPED_GIL_RELEASE( + { nvte_compute_amax_with_config(input.data(), out.data(), quant_config, stream); }); + out.set_amax(rowwise_amax_ptr, DType::kFloat32, std::vector{1}); + + // Replicate amax into whichever of rowwise/columnwise slots were requested. + if (rowwise_amax_ptr != amax_ptr && rowwise_amax_ptr != nullptr) { + NVTE_CHECK_CUDA(cudaMemcpyAsync(rowwise_amax_ptr, amax_ptr, sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } + if (columnwise_amax_ptr != amax_ptr && columnwise_amax_ptr != nullptr) { + NVTE_CHECK_CUDA(cudaMemcpyAsync(columnwise_amax_ptr, amax_ptr, sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } +} + +void NVFP4Quantizer::quantize_cast_only(const TensorWrapper& input, TensorWrapper& out, + const std::optional& noop_flag) { + // Amax is expected to already live in out's amax buffers (e.g. from + // compute_amax_only + an external coalesced allreduce). Skip both local + // amax compute and the internal allreduce. 
+ this->quantize_impl(input, out, noop_flag, /*compute_amax=*/false, + /*skip_amax_reduction=*/true); +} + std::vector NVFP4Quantizer::get_scale_shape(const std::vector& shape, bool columnwise) const { size_t numel = 1; diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 87c39b99b5..9012e8ab55 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -233,12 +233,102 @@ def get_rs_streams_for_chain(chain_id: str) -> list: """RS streams for one chain (all groups that chain has touched).""" return [s for k, s in _RS_STREAMS.items() if k[0] == chain_id] +# Cached once per process: whether the TE build exposes the split-phase APIs. +_COALESCED_AMAX_TE_APIS_AVAILABLE = ( + hasattr(tex, "compute_amax_nvfp4") and hasattr(tex, "quantize_cast_only_nvfp4") +) + + +def _coalesced_amax_static_eligible(weights): + """Walk the weight list once and decide whether the coalesced-amax path + is applicable. Depends only on fields that are fixed after model + construction (quantizer class, flags, amax_reduction_group, group size).""" + if not _COALESCED_AMAX_TE_APIS_AVAILABLE: + return False + if len(weights) <= 1: + return False + + group = None + for w in weights: + q = w._quantizer + if q is None or not isinstance(w.quantized, NVFP4TensorStorage): + return False + if not getattr(q, "with_amax_reduction", False): + return False + if getattr(q, "with_rht", False): + # RHT path does amax on RHT-rotated view, can't split compute + # from cast the way compute_amax_only assumes. 
+ return False + g = getattr(q, "amax_reduction_group", None) + if g is None: + return False + if group is None: + group = g + elif g is not group: + return False + return group.size() > 1 + + +def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): + """Replace the per-weight (compute_amax + allreduce + cast) loop with: + compute_amax loop → one coalesced allreduce → cast loop.""" + group = weights[0]._quantizer.amax_reduction_group + + # Materialize padded shards once; on padded last-rank get_padded_shard() + # launches an F.pad kernel, and we'd otherwise pay it twice per expert. + padded_shards = [w.get_padded_shard() for w in weights] + + # Phase 1: per-weight local amax into each w.quantized's amax buffers. + # Keep rowwise/columnwise both populated so the group allreduce sees + # whichever the consumer GEMM will read. + for w, shard in zip(weights, padded_shards): + w._quantizer.set_usage(rowwise=True, columnwise=True) + tex.compute_amax_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + ) + + # Phase 2: one coalesced allreduce across every weight's amax tensors. + amax_tensors = [] + for w in weights: + rw = w.quantized._amax_rowwise + cw = w.quantized._amax_columnwise + if rw is not None: + amax_tensors.append(rw) + if cw is not None and (rw is None or cw.data_ptr() != rw.data_ptr()): + amax_tensors.append(cw) + torch.distributed.all_reduce_coalesced( + amax_tensors, + op=torch.distributed.ReduceOp.MAX, + group=group, + ) + + # Phase 3: per-weight cast using the pre-reduced amax; skips the internal + # allreduce inside the quantizer. 
+ for w, shard in zip(weights, padded_shards): + tex.quantize_cast_only_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + noop=cast_noop_flag, + ) + w.did_cast_to_low_precision = True + + @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 check_param_states: bool = True weight_prefetch: bool = True + # When True and the weight list in _all_gather_weight contains >1 NVFP4 + # shards that share an amax reduction group, coalesce their per-expert + # amax allreduces into a single NCCL call. Requires TE with + # tex.compute_amax_nvfp4 / tex.quantize_cast_only_nvfp4; the eligibility + # guard in _coalesced_amax_static_eligible falls back to the per-weight + # path when either binding is missing. + coalesce_amax_allreduce: bool = True ETP_CONFIG = ETPConfig() @@ -614,8 +704,29 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv w._set_state(ETPWeightState.DATA_READY_SYNC) # 2. Prepare: quantize, set usage direction. + # Static eligibility (quantizer class, flags, amax group) is fixed + # after model construction — compute once and cache on self so the + # hot path only pays the cheap per-call skip_weight_cast check. + if ETP_CONFIG.coalesce_amax_allreduce: + static_ok = getattr(self, "_coalesced_amax_static", None) + if static_ok is None: + static_ok = _coalesced_amax_static_eligible(weights) + self._coalesced_amax_static = static_ok + # Per-call: match the skip_weight_cast gate in _quantize_if_needed + # (fire when either skip_weight_cast is False or cast_noop_flag + # was provided by the FP8/NVFP4 recipe). 
+ use_coalesced = static_ok and not ( + skip_weight_cast is True and cast_noop_flag is None + ) + else: + use_coalesced = False + + if use_coalesced: + _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag) + else: + for w in weights: + w._quantize_if_needed(skip_weight_cast, cast_noop_flag) for w in weights: - w._quantize_if_needed(skip_weight_cast, cast_noop_flag) if w.did_cast_to_low_precision: w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) From c0538ec56465a19113d465c394e5bc315a1f1c1b Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Tue, 21 Apr 2026 07:40:52 -0700 Subject: [PATCH 28/43] remap quantized params Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 9012e8ab55..ea00d602b2 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -9,7 +9,6 @@ import math import re import torch -from contextlib import nullcontext from ..distributed import ( gather_along_first_dim, @@ -149,6 +148,7 @@ class ETPWeightState(Enum): # Global ETP buffer cache (persists across clear(); never set to None after creation). _ETP_CACHE = None +_ETP_PARAMS = [] # Global set of ETPShardedParam with in-flight async comms (AG or RS). 
_inflight_comm_params: set = set() @@ -403,6 +403,9 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): # register the newly sharded param back to the module module._parameters[name] = etp_shard + global _ETP_PARAMS + _ETP_PARAMS.append(etp_shard) + if is_grouped: allweights = [getattr(module, name) for name in weight_names] allweights[0].weight_list = allweights @@ -1350,6 +1353,29 @@ def reallocate_to_mempool(self, device, mempool): if key not in graphed_keys: new_pool[key] = buffers self._pool = new_pool + + # Now remap the quantized params: + torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) + for param in _ETP_PARAMS: + weights = param.weight_list if param.is_routed_expert and param.weight_list is not None else [param] + for w in weights: + if w.quantized is not None: + if isinstance(w.quantized, NVFP4TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) + w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) + elif isinstance(w.quantized, MXFP8TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + else: + assert False + torch._C._cuda_endAllocateToPool(device, mempool) + return def get_global_ETP_cache() -> ETPWeightCache: From 7949e509e694947b448cdd86dd36f15a8e684749 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 21 Apr 2026 19:48:32 -0700 Subject: [PATCH 29/43] 
=?UTF-8?q?remap=20quantized=20param=20patch:=20(1)?=
 =?UTF-8?q?=20Remap=20quantized=20params=20into=20the=20CG=20mempool=20?=
 =?UTF-8?q?=E2=80=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

but only for params on
the GRAPHED chain. UNGRAPHED-chain params (embedding, output_layer,
and MoE paths whose scope is not captured) run eagerly and don't
need their quantized storage in the CG mempool.

(2) _ETP_PARAMS already contains every individual expert (appended per
weight_name in wrap_module_params_etp), so iterate it directly — no
weight_list unroll needed.
---
 .../module/extended_tensor_parallelism.py     | 42 ++++++++++---------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
index ea00d602b2..69ffc1b7b9 100644
--- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py
+++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
@@ -1354,26 +1354,30 @@ def reallocate_to_mempool(self, device, mempool):
                 new_pool[key] = buffers
         self._pool = new_pool
 
-        # Now remap the quantized params:
+        # Remap quantized params into the CG mempool — but only for params on
+        # the GRAPHED chain. UNGRAPHED-chain params (embedding, output_layer,
+        # and MoE paths whose scope is not captured) run eagerly and don't
+        # need their quantized storage in the CG mempool.
torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) - for param in _ETP_PARAMS: - weights = param.weight_list if param.is_routed_expert and param.weight_list is not None else [param] - for w in weights: - if w.quantized is not None: - if isinstance(w.quantized, NVFP4TensorStorage): - w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) - w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) - w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) - w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) - w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) - w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) - elif isinstance(w.quantized, MXFP8TensorStorage): - w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) - w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) - w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) - w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) - else: - assert False + for w in _ETP_PARAMS: + if getattr(w, "chain_id", ETPChain.GRAPHED.value) != ETPChain.GRAPHED.value: + continue + if w.quantized is None: + continue + if isinstance(w.quantized, NVFP4TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) + w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) + elif isinstance(w.quantized, MXFP8TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + 
w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv)
+            w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv)
+        else:
+            assert False
         torch._C._cuda_endAllocateToPool(device, mempool)
         return

From fcef5903d215a138485d93a70e9a50a88b9dde4a Mon Sep 17 00:00:00 2001
From: Shiqing Fan
Date: Wed, 22 Apr 2026 02:46:11 -0700
Subject: [PATCH 30/43] ETP+CG: launch etp async rs on rs_stream to match
 issue-site invariant.

---
 .../pytorch/module/extended_tensor_parallelism.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
index 69ffc1b7b9..cb823f8680 100644
--- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py
+++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
@@ -1116,8 +1116,16 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None):
         poolable = self.chain_id == ETPChain.UNGRAPHED.value
 
         if ETP_CONFIG.weight_prefetch and self.prev_w is not None:
-            # Async reduce-scatter (not last weight — deferred finish)
-            _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label)
+            # Async reduce-scatter (not last weight — deferred finish). Issue on rs_stream to
+            # match wait-site (issue-site invariant; see _wait_param_gather). wgrad is produced
+            # on outer stream by bwd GEMM, so sync outer → rs_stream first.
+ outer_stream = torch.cuda.current_stream() + rs_stream = get_rs_stream(self.chain_id, self.group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + rs_stream.wait_event(outer_sync_event) + with torch.cuda.stream(rs_stream): + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) # Stash wgrad input buffers — cannot recycle yet because the async RS # kernel is still reading them on rs_stream. From cd88d3b76ed0be0795e5c910a1f8aaddf04fce35 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 03:47:18 -0700 Subject: [PATCH 31/43] ETP+NVFP4: fused multi-tensor amax kernel Replaces the per-expert (zero_amax + amax + D2D amax replicate) chain in the ETP coalesced-amax path with a pair of multi-tensor kernel launches. The compute kernel writes rowwise and columnwise amax directly (atomicMaxFloat), eliminating the per-expert D2D copy. --- transformer_engine/common/CMakeLists.txt | 1 + .../include/transformer_engine/recipe.h | 20 ++ .../common/recipe/multi_amax.cu | 274 ++++++++++++++++++ transformer_engine/pytorch/csrc/extensions.h | 8 + .../pytorch/csrc/extensions/cast.cpp | 76 +++++ .../pytorch/csrc/extensions/pybind.cpp | 3 + .../module/extended_tensor_parallelism.py | 33 ++- 7 files changed, 410 insertions(+), 5 deletions(-) create mode 100644 transformer_engine/common/recipe/multi_amax.cu diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a105a0343f..032d635e61 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -163,6 +163,7 @@ list(APPEND transformer_engine_cuda_sources recipe/current_scaling.cu recipe/delayed_scaling.cu recipe/fp8_block_scaling.cu + recipe/multi_amax.cu comm_gemm_overlap/userbuffers/userbuffers.cu) list(APPEND transformer_engine_cuda_arch_specific_sources diff --git 
a/transformer_engine/common/include/transformer_engine/recipe.h b/transformer_engine/common/include/transformer_engine/recipe.h index cad27a2992..2244056823 100644 --- a/transformer_engine/common/include/transformer_engine/recipe.h +++ b/transformer_engine/common/include/transformer_engine/recipe.h @@ -99,6 +99,26 @@ void nvte_compute_amax(const NVTETensor input, NVTETensor output, cudaStream_t s void nvte_compute_amax_with_config(const NVTETensor input, NVTETensor output, const NVTEQuantizationConfig config, cudaStream_t stream); +/*! \brief Compute amax for a list of independent tensors in a single kernel launch. + * + * Unlike nvte_group_amax (which requires a single contiguous input split along dim 0), + * this API accepts arrays of independent input tensors, each with its own allocation. + * Designed for the ETP grouped-experts case where per-expert weights live in separate + * buffers. For each i in [0, num_tensors), computes amax(inputs[i]) and writes it to + * outputs[i]'s amax buffer. outputs[i] must be an FP8 per-tensor scaling or NVFP4 1D + * scaling tensor. All inputs must share the same dtype. If the list exceeds the + * per-launch batch capacity, it is internally chunked. + * + * \param[in] inputs Array of input tensors (unquantized). Size num_tensors. + * \param[in,out] outputs Array of output tensors. Only the amax is updated. + * Size num_tensors. + * \param[in] num_tensors Number of tensors. + * \param[in] config Quantization configuration (for noop_tensor). May be NULL. + * \param[in] stream CUDA stream used for the operation. + */ +void nvte_multi_compute_amax(const NVTETensor *inputs, NVTETensor *outputs, size_t num_tensors, + const NVTEQuantizationConfig config, cudaStream_t stream); + /*! \brief Update an FP8 tensor's scale based on its amax. * * This is only supported for FP8 tensors with per-tensor scaling. 
diff --git a/transformer_engine/common/recipe/multi_amax.cu b/transformer_engine/common/recipe/multi_amax.cu new file mode 100644 index 0000000000..5420dde587 --- /dev/null +++ b/transformer_engine/common/recipe/multi_amax.cu @@ -0,0 +1,274 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include + +#include +#include + +#include "../common.h" +#include "../util/logging.h" +#include "../util/vectorized_pointwise.h" +#include "recipe_common.cuh" + +namespace transformer_engine { +namespace { + +constexpr int multi_amax_kernel_threads = 512; +// Per-launch capacity. kMaxTensorsPerBatch * ~40 bytes per slot keeps the args +// struct within the 4KB kernel parameter limit with comfortable headroom. +constexpr int kMaxTensorsPerBatch = 64; + +struct MultiAmaxArgs { + const void *input_list[kMaxTensorsPerBatch]; + void *output_rowwise_amax_list[kMaxTensorsPerBatch]; + void *output_columnwise_amax_list[kMaxTensorsPerBatch]; + size_t input_numel[kMaxTensorsPerBatch]; + size_t num_aligned_elements[kMaxTensorsPerBatch]; + int num_tensors; +}; + +// Zero out every output amax slot (rowwise + columnwise, deduped) in a single launch. +// Respects the noop_ptr contract shared with the single-tensor amax path. 
+__launch_bounds__(multi_amax_kernel_threads) __global__ + void MultiZeroAmaxKernel(MultiAmaxArgs args, const float *noop_ptr) { + if (noop_ptr != nullptr && noop_ptr[0] == 1.0f) { + return; + } + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < args.num_tensors; tid += stride) { + float *rw = static_cast(args.output_rowwise_amax_list[tid]); + float *cw = static_cast(args.output_columnwise_amax_list[tid]); + if (rw != nullptr) { + *rw = 0.0f; + } + if (cw != nullptr && cw != rw) { + *cw = 0.0f; + } + } +} + +// Per-tensor amax with one block-strip per tensor. blockIdx.y selects the +// tensor; blockIdx.x is the work chunk within that tensor. Each block +// vector-loads the tensor, reduces across threads, and atomicMaxFloats the +// result into BOTH output amax slots (rowwise + columnwise, deduped). This +// subsumes the per-expert D2D copy that the single-tensor path does after the +// amax kernel. +template +__launch_bounds__(multi_amax_kernel_threads) __global__ + void MultiAmaxKernel(MultiAmaxArgs args, const float *noop_ptr) { + if (noop_ptr != nullptr && noop_ptr[0] == 1.0f) { + return; + } + + const int t_idx = blockIdx.y; + if (t_idx >= args.num_tensors) { + return; + } + + const InputType *input = static_cast(args.input_list[t_idx]); + const size_t N = args.input_numel[t_idx]; + if (N == 0) { + return; + } + const size_t M = args.num_aligned_elements[t_idx]; + + VectorizedLoader loader(input, N); + InputType max = InputType{0.f}; + const int warp_id = threadIdx.x / THREADS_PER_WARP; + + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const InputType val = static_cast(loader.separate()[i]); + __builtin_assume(max >= InputType{0.f}); + if constexpr (std::is_same_v) { +#if __CUDA_ARCH__ >= 800 + max = __hmax(__habs(val), max); +#else + max = static_cast<__nv_bfloat16>( + 
fmaxf(fabsf(static_cast(val)), static_cast(max))); +#endif + } else if constexpr (std::is_same_v) { + max = __hmax(__habs(val), max); + } else { + max = fmaxf(fabsf(val), max); + } + } + } + + // Reduce amax over block. + max = reduce_max(max, warp_id); + if (threadIdx.x == 0) { + float *rw = static_cast(args.output_rowwise_amax_list[t_idx]); + float *cw = static_cast(args.output_columnwise_amax_list[t_idx]); + if (rw != nullptr) { + atomicMaxFloat(rw, static_cast(max)); + } + if (cw != nullptr && cw != rw) { + atomicMaxFloat(cw, static_cast(max)); + } + } +} + +template +void launch_multi_amax_batch(const MultiAmaxArgs &args, size_t max_numel, Alignment align, + const float *noop_ptr, cudaStream_t stream) { + // Zero all amax outputs in one launch. + { + constexpr int threads = multi_amax_kernel_threads; + const int num_blocks = std::max(1, DIVUP(args.num_tensors, threads)); + MultiZeroAmaxKernel<<>>(args, noop_ptr); + NVTE_CHECK_CUDA(cudaGetLastError()); + } + + if (max_numel == 0) { + return; + } + + // Grid: y = tensor index, x = work chunks within the largest tensor. Blocks + // that exceed a shorter tensor's aligned element count bail out via the + // bounds check inside the kernel. + constexpr int nvec = 32 / sizeof(InputType); + constexpr size_t threads = multi_amax_kernel_threads; + const size_t max_aligned = (max_numel + nvec - 1) / nvec; + size_t num_blocks_x = DIVUP(max_aligned, threads); + constexpr size_t max_blocks = 65535; + num_blocks_x = std::min(num_blocks_x, max_blocks); + num_blocks_x = std::max(num_blocks_x, 1); + dim3 grid(num_blocks_x, static_cast(args.num_tensors), 1); + + switch (align) { + case Alignment::SAME_ALIGNED: + MultiAmaxKernel + <<>>(args, noop_ptr); + break; + case Alignment::SAME_UNALIGNED: + MultiAmaxKernel + <<>>(args, noop_ptr); + break; + case Alignment::DIFFERENT: + // Heterogeneous alignment across tensors — fall back to nvec=1, aligned=true path + // which is safe for any pointer alignment. 
+ MultiAmaxKernel<1, true, InputType> + <<>>(args, noop_ptr); + break; + } + NVTE_CHECK_CUDA(cudaGetLastError()); +} + +// Fill one MultiAmaxArgs batch from a slice of the full input/output list. +// Returns (max_numel in this batch, worst-case alignment across the batch). +template +std::pair build_batch_args(const std::vector &inputs, + const std::vector &outputs, size_t start, + size_t count, MultiAmaxArgs &args) { + constexpr int nvec = 32 / sizeof(InputType); + size_t max_numel = 0; + // SAME_ALIGNED is the most optimistic; degrade to SAME_UNALIGNED if any + // tensor is merely same-layout but unaligned, to DIFFERENT if alignment + // varies across tensors. + Alignment batch_align = Alignment::SAME_ALIGNED; + for (size_t i = 0; i < count; ++i) { + const Tensor &inp = *inputs[start + i]; + Tensor &out = *outputs[start + i]; + const size_t N = inp.data.numel(); + void *rw_ptr = out.amax.dptr; + void *cw_ptr = out.columnwise_amax.dptr; + + args.input_list[i] = inp.data.dptr; + args.output_rowwise_amax_list[i] = rw_ptr; + args.output_columnwise_amax_list[i] = cw_ptr; + args.input_numel[i] = N; + args.num_aligned_elements[i] = get_num_aligned_elements(inp.data.dptr, N, nvec, + sizeof(InputType)); + max_numel = std::max(max_numel, N); + + // Fold this tensor's alignment into the batch decision. CheckAlignment on a + // single pointer yields SAME_ALIGNED or SAME_UNALIGNED; mixing the two across + // tensors means heterogeneous — switch to the DIFFERENT fall-back. 
+ if (N > 0) { + Alignment a = CheckAlignment(N, nvec, static_cast(inp.data.dptr)); + if (batch_align == Alignment::SAME_ALIGNED && a == Alignment::SAME_UNALIGNED) { + batch_align = Alignment::SAME_UNALIGNED; + } else if (batch_align == Alignment::SAME_UNALIGNED && a == Alignment::SAME_ALIGNED) { + batch_align = Alignment::SAME_UNALIGNED; + } else if (a == Alignment::DIFFERENT) { + batch_align = Alignment::DIFFERENT; + } + } + } + args.num_tensors = static_cast(count); + return {max_numel, batch_align}; +} + +void multi_compute_amax_impl(const NVTETensor *inputs_, NVTETensor *outputs_, size_t num_tensors, + const NVTEQuantizationConfig config_, cudaStream_t stream) { + if (num_tensors == 0) { + return; + } + NVTE_CHECK(inputs_ != nullptr, "nvte_multi_compute_amax: inputs is NULL"); + NVTE_CHECK(outputs_ != nullptr, "nvte_multi_compute_amax: outputs is NULL"); + + // Convert, validate, collect into plain vectors. + std::vector inputs(num_tensors); + std::vector outputs(num_tensors); + DType input_dtype; + for (size_t i = 0; i < num_tensors; ++i) { + inputs[i] = convertNVTETensorCheck(inputs_[i]); + outputs[i] = convertNVTETensorCheck(outputs_[i]); + const auto &inp = *inputs[i]; + auto &out = *outputs[i]; + NVTE_CHECK(inp.scaling_mode == NVTE_DELAYED_TENSOR_SCALING, + "nvte_multi_compute_amax: input[", i, + "] must be unquantized, got scaling_mode=", to_string(inp.scaling_mode)); + NVTE_CHECK(!is_fp8_dtype(inp.data.dtype), + "nvte_multi_compute_amax: input[", i, + "] must be unquantized, got dtype=", to_string(inp.data.dtype)); + if (i == 0) { + input_dtype = inp.data.dtype; + } else { + NVTE_CHECK(inp.data.dtype == input_dtype, + "nvte_multi_compute_amax: all inputs must share dtype; input[0]=", + to_string(input_dtype), ", input[", i, "]=", to_string(inp.data.dtype)); + } + NVTE_CHECK(out.scaling_mode == NVTE_DELAYED_TENSOR_SCALING || + out.scaling_mode == NVTE_NVFP4_1D_SCALING, + "nvte_multi_compute_amax: output[", i, "] must be FP8 per-tensor or NVFP4 1D"); + 
NVTE_CHECK(out.amax.dptr != nullptr || out.columnwise_amax.dptr != nullptr, + "nvte_multi_compute_amax: output[", i, "] has no amax buffer"); + } + + const float *noop_ptr = nullptr; + if (config_ != nullptr) { + const QuantizationConfig *config_cpp = reinterpret_cast(config_); + const NVTETensor noop = config_cpp->noop_tensor; + noop_ptr = reinterpret_cast( + (noop != nullptr ? convertNVTETensorCheck(noop)->data.dptr : nullptr)); + } + + // Chunk across kMaxTensorsPerBatch launches (single launch in the common 8-expert case). + TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(input_dtype, IType, { + for (size_t start = 0; start < num_tensors; start += kMaxTensorsPerBatch) { + const size_t count = std::min(kMaxTensorsPerBatch, num_tensors - start); + MultiAmaxArgs args = {}; + auto [max_numel, batch_align] = build_batch_args(inputs, outputs, start, count, args); + launch_multi_amax_batch(args, max_numel, batch_align, noop_ptr, stream); + } + }); // NOLINT(*) +} + +} // anonymous namespace +} // namespace transformer_engine + +void nvte_multi_compute_amax(const NVTETensor *inputs, NVTETensor *outputs, size_t num_tensors, + const NVTEQuantizationConfig config, cudaStream_t stream) { + NVTE_API_CALL(nvte_multi_compute_amax); + transformer_engine::multi_compute_amax_impl(inputs, outputs, num_tensors, config, stream); +} diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index fc26d025c4..c9b5674426 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -292,6 +292,14 @@ py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantiz const py::object &output, std::optional noop_flag); +// NVFP4-only multi-tensor amax: fuses N per-expert (zero_amax + amax + D2D replicate) +// chains into a single pair of kernel launches (one multi-zero + one multi-amax) that +// writes amax into every output's rowwise AND columnwise buffers. 
Outputs must be
+// pre-allocated; amax is written in place, no return.
+void compute_multi_amax_nvfp4(const std::vector<at::Tensor> &tensor_list,
+                              std::vector<py::handle> quantizer_list,
+                              const std::vector<py::object> &output_list);
+
 py::object dequantize(const py::handle &input, DType otype);

 py::object group_quantize(const at::Tensor &tensor, py::handle quantizer, const size_t num_tensors,
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
index 2c261c3c6d..e2602ed133 100644
--- a/transformer_engine/pytorch/csrc/extensions/cast.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -146,6 +146,82 @@ py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantiz
   return output_py;
 }

+/*! @brief NVFP4-only: compute amax for N input tensors in a single launch.
+ *
+ * Each output's rowwise AND columnwise amax buffers are populated directly by the
+ * kernel (atomicMaxFloat), fusing the per-expert zero_amax + amax_kernel + D2D
+ * replicate chain into two multi-tensor launches. Caller pairs this with an
+ * external coalesced allreduce and then N calls to quantize_cast_only_nvfp4.
+ *
+ * Amax is written into the outputs passed in via output_list; no return value is
+ * needed — caller already holds references to those objects.
+ */
+void compute_multi_amax_nvfp4(const std::vector<at::Tensor> &tensor_list,
+                              std::vector<py::handle> quantizer_list,
+                              const std::vector<py::object> &output_list) {
+  const size_t num_tensors = tensor_list.size();
+  NVTE_CHECK(num_tensors > 0, "compute_multi_amax_nvfp4 requires at least one tensor");
+  NVTE_CHECK(quantizer_list.size() == num_tensors,
+             "compute_multi_amax_nvfp4: quantizer_list size mismatch");
+  NVTE_CHECK(output_list.size() == num_tensors,
+             "compute_multi_amax_nvfp4: output_list size mismatch");
+
+  // Locals held for the duration of this call (destroyed at function return). 
+  // TensorWrappers only hold NVTETensor handles (opaque indexes into a global pool
+  // released by ~TensorWrapper); they do NOT reference quantizer_cpp or py::object,
+  // so we do not need to preserve quantizer unique_ptrs past this scope.
+  std::vector<at::Tensor> input_contiguous;
+  input_contiguous.reserve(num_tensors);
+  std::vector<TensorWrapper> input_wrappers;
+  input_wrappers.reserve(num_tensors);
+  std::vector<TensorWrapper> output_wrappers;
+  output_wrappers.reserve(num_tensors);
+
+  std::vector<NVTETensor> inputs_nvte;
+  std::vector<NVTETensor> outputs_nvte;
+  inputs_nvte.reserve(num_tensors);
+  outputs_nvte.reserve(num_tensors);
+
+  for (size_t i = 0; i < num_tensors; ++i) {
+    NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer_list[i].ptr()),
+               "compute_multi_amax_nvfp4: quantizer[", i, "] is not an NVFP4Quantizer");
+    auto quantizer_cpp = convert_quantizer(quantizer_list[i]);
+    auto *nvfp4_quantizer = dynamic_cast<NVFP4Quantizer *>(quantizer_cpp.get());
+    NVTE_CHECK(nvfp4_quantizer != nullptr && !nvfp4_quantizer->with_rht,
+               "compute_multi_amax_nvfp4 requires NVFP4Quantizer with with_rht=false (idx=", i,
+               ")");
+
+    input_contiguous.emplace_back(tensor_list[i].contiguous());
+    input_wrappers.emplace_back(makeTransformerEngineTensor(input_contiguous.back()));
+
+    TensorWrapper out_cpp;
+    py::object out_py;
+    NVTE_CHECK(!output_list[i].is_none(),
+               "compute_multi_amax_nvfp4: output_list[", i, "] is None; caller must pre-allocate");
+    std::tie(out_cpp, out_py) = quantizer_cpp->convert_and_update_tensor(output_list[i]);
+
+    NVTE_CHECK(out_cpp.get_amax().data_ptr != nullptr ||
+                   out_cpp.get_columnwise_amax().data_ptr != nullptr,
+               "compute_multi_amax_nvfp4: output[", i, "] has no amax buffer");
+
+    output_wrappers.emplace_back(std::move(out_cpp));
+    // quantizer_cpp and out_py are released here at end-of-iteration. 
+ + if (input_wrappers.back().numel() == 0) continue; + inputs_nvte.push_back(input_wrappers.back().data()); + outputs_nvte.push_back(output_wrappers.back().data()); + } + + if (inputs_nvte.empty()) return; + + QuantizationConfigWrapper quant_config; + auto stream = at::cuda::getCurrentCUDAStream(); + NVTE_SCOPED_GIL_RELEASE({ + nvte_multi_compute_amax(inputs_nvte.data(), outputs_nvte.data(), inputs_nvte.size(), + quant_config, stream); + }); +} + namespace { // helper functions for NVFP4 grouped quantization (cuda graph safe with shapes stored in device without D2H copy) diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index daee89a038..2a9281bc78 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -144,6 +144,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "NVFP4: cast using pre-reduced amax in output's amax buffers; skips amax compute and allreduce", py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), py::arg("noop") = py::none()); + m.def("compute_multi_amax_nvfp4", transformer_engine::pytorch::compute_multi_amax_nvfp4, + "NVFP4: fused multi-tensor amax compute (writes both rowwise+columnwise amax per output)", + py::arg("tensor_list"), py::arg("quantizer_list"), py::arg("output_list")); m.def("dequantize", &transformer_engine::pytorch::dequantize, "Dequantize", py::arg("input"), py::arg("otype")); m.def("group_quantize", transformer_engine::pytorch::group_quantize, py::arg("tensor"), diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index cb823f8680..6c1297da28 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -238,6 +238,10 @@ def get_rs_streams_for_chain(chain_id: str) -> list: hasattr(tex, 
"compute_amax_nvfp4") and hasattr(tex, "quantize_cast_only_nvfp4") ) +# Tier-2: multi-tensor amax kernel fuses N per-expert (zero_amax + amax + D2D) chains +# into two multi-tensor kernel launches. Independent of Tier-1 coalesced allreduce. +_MULTI_AMAX_TE_API_AVAILABLE = hasattr(tex, "compute_multi_amax_nvfp4") + def _coalesced_amax_static_eligible(weights): """Walk the weight list once and decide whether the coalesced-amax path @@ -281,13 +285,32 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): # Phase 1: per-weight local amax into each w.quantized's amax buffers. # Keep rowwise/columnwise both populated so the group allreduce sees # whichever the consumer GEMM will read. - for w, shard in zip(weights, padded_shards): + for w in weights: w._quantizer.set_usage(rowwise=True, columnwise=True) - tex.compute_amax_nvfp4( - tensor=shard, - quantizer=w._quantizer, - output=w.quantized, + if _MULTI_AMAX_TE_API_AVAILABLE: + # Tier-2: single multi-tensor launch writes both rowwise and columnwise + # amax directly (no per-expert D2D replicate), fusing N per-expert chains. + # w._quantizer is set once by _configure_quantizer and never rebinds, so + # cache the list on weights[0] alongside _coalesced_amax_static. Output + # list is NOT cached because w.quantized can rebind if the weight is + # re-quantized externally. + anchor = weights[0] + quantizer_list = getattr(anchor, "_multi_amax_quantizer_list", None) + if quantizer_list is None: + quantizer_list = [w._quantizer for w in weights] + anchor._multi_amax_quantizer_list = quantizer_list + tex.compute_multi_amax_nvfp4( + padded_shards, + quantizer_list, + [w.quantized for w in weights], ) + else: + for w, shard in zip(weights, padded_shards): + tex.compute_amax_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + ) # Phase 2: one coalesced allreduce across every weight's amax tensors. 
amax_tensors = [] From f7a08f792aa70941ab3a22077128de2a84809039 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 18:36:16 -0700 Subject: [PATCH 32/43] ETP: cache hot-path lookups in ETP to reduce python overhead. Changes: - Lazy-cache ag_stream / rs_stream on self (resolved once from chain_id + group; prior path hit a dict lookup every call). - Cache quantizers / dtypes / etp_group on the anchor weight (rebuilt via list comprehensions on every _all_gather_weight call). - Consolidate _multi_amax_quantizer_list into _cached_quantizers (single cache shared between Tier-2 amax and _all_gather_weight). - Gate the duplicate-output-buffer assertion in batched AG behind ETP_CONFIG.check_param_states (was running O(N) per call). - Drop a dead `out_buffers is not None` check (always a list) --- .../module/extended_tensor_parallelism.py | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 6c1297da28..853e8ad907 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -290,15 +290,12 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): if _MULTI_AMAX_TE_API_AVAILABLE: # Tier-2: single multi-tensor launch writes both rowwise and columnwise # amax directly (no per-expert D2D replicate), fusing N per-expert chains. - # w._quantizer is set once by _configure_quantizer and never rebinds, so - # cache the list on weights[0] alongside _coalesced_amax_static. Output - # list is NOT cached because w.quantized can rebind if the weight is - # re-quantized externally. 
+ # Reuse the _cached_quantizers list already populated by _all_gather_weight anchor = weights[0] - quantizer_list = getattr(anchor, "_multi_amax_quantizer_list", None) + quantizer_list = anchor._cached_quantizers if quantizer_list is None: quantizer_list = [w._quantizer for w in weights] - anchor._multi_amax_quantizer_list = quantizer_list + anchor._cached_quantizers = quantizer_list tex.compute_multi_amax_nvfp4( padded_shards, quantizer_list, @@ -553,6 +550,13 @@ def __init__(self, x, *args, **kwargs): self.pad_length = 0 # Debug self._debug_name = "" + # Hot-path caches (populated lazily on first use). chain_id/group are + # set after __init__, so we can't resolve streams eagerly here. + self._cached_ag_stream = None + self._cached_rs_stream = None + self._cached_quantizers = None + self._cached_dtypes = None + self._cached_etp_group = None def setup(self, weight_quantizer=None): """Set quantizer and create quantized shard.""" @@ -757,7 +761,13 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) # 3. Build gather inputs. - quantizers = [w._quantizer for w in weights] + # quantizers / dtypes / etp_group are stable after model construction — + # cache on the anchor (self == weights[0]) to avoid rebuilding lists + # every call. w.quantized is NOT cached because it can rebind. + quantizers = self._cached_quantizers + if quantizers is None: + quantizers = [w._quantizer for w in weights] + self._cached_quantizers = quantizers if weights[0].did_cast_to_low_precision: gather_weights = [w.quantized for w in weights] else: @@ -765,7 +775,10 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv # 4. Cache checkout — use pooled buffers for both async and sync gathers # to avoid allocating fresh memory each iteration. 
- dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + dtypes = self._cached_dtypes + if dtypes is None: + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + self._cached_dtypes = dtypes out_buffers = [] cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): @@ -781,8 +794,12 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv out_buffers.append(cache.get(p._ag_ticket_bwd)) # 5. Communicate. - etp_group = weights[0].group - if out_buffers is not None and len(gather_weights) > 1: + etp_group = self._cached_etp_group + if etp_group is None: + etp_group = weights[0].group + self._cached_etp_group = etp_group + if ETP_CONFIG.check_param_states and len(gather_weights) > 1: + # Debug invariant: batched AG needs distinct output buffers per expert. assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ "Duplicate output buffers in batched all-gather — experts need distinct cache keys" @@ -828,7 +845,11 @@ def _wait_param_gather(self): # wait on NCCL's completion event into ag_stream; ag_event.record() # then marks ag_stream for consumers (main_stream via ag_event.wait # or MLM drains via main.wait_stream). - with torch.cuda.stream(get_ag_stream(self.chain_id, self.group)): + ag_stream = self._cached_ag_stream + if ag_stream is None: + ag_stream = get_ag_stream(self.chain_id, self.group) + self._cached_ag_stream = ag_stream + with torch.cuda.stream(ag_stream): if self._prefetch_handle is not None: self._prefetch_handle.wait() self._prefetch_handle = None @@ -1052,7 +1073,11 @@ def _wait_reduce_scatter(self): # (not rs_stream) because main produced the RS input (wgrad) and # naturally holds the write→read ordering. Wait-site enters rs_stream # so it observes NCCL completion and rs_event marks it for consumers. 
- with torch.cuda.stream(get_rs_stream(self.chain_id, self.group)): + rs_stream = self._cached_rs_stream + if rs_stream is None: + rs_stream = get_rs_stream(self.chain_id, self.group) + self._cached_rs_stream = rs_stream + with torch.cuda.stream(rs_stream): if self._wgrad_rs_handle is not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None From 9f614f4dd156758035eed67156099b89688ca9cc Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 18:37:19 -0700 Subject: [PATCH 33/43] ETP: disable check_param_states by default --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 853e8ad907..10f5c08479 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -340,7 +340,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = True + check_param_states: bool = False weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert From eace39d5ee15a3f2a21d77006e17c7a2ca7cc0ef Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 24 Apr 2026 07:03:02 -0700 Subject: [PATCH 34/43] ETP divergence fix: revert async AG issue-site stream wrapper from 7cc86fd7 The `with torch.cuda.stream(target.ag_stream):` wrapper re-routed NCCL's preEvent onto an idle stream, so the AG raced the caller-stream writer (quantize / sharded-weight update). Issue now on caller's stream; _wait_param_gather keeps ag_stream. Verified: 5000+ steps clean on TP2ETP2_EP2EETP2 nvfp4, 1/4 Ultra, 32xGB200. 
--- .../module/extended_tensor_parallelism.py | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 10f5c08479..a45aa4efcc 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -834,17 +834,8 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Wait-site for the async AG. Issuer (all_gather_and_prefetch{,_bwd}) - # and wait-site both use the TARGET's ag_stream so the caller-stream - # "preEvent" PyTorch records at issue time lives on an idle stream. - # A busy issue-stream would queue the preEvent behind pending work, - # delay NCCL start, and — even with the sync chain main ← ag_event ← - # ag_stream handle.wait() ← NCCL endEvent — leave the consumer GEMM - # reading a partial AG buffer. (NCCL kernel itself runs on PyTorch's - # per-PG ncclStream, not ag_stream.) handle.wait() here inserts the - # wait on NCCL's completion event into ag_stream; ag_event.record() - # then marks ag_stream for consumers (main_stream via ag_event.wait - # or MLM drains via main.wait_stream). + # Wait on ag_stream so ag_event.record() marks ag_stream's tail — + # MLM's external drains (wait_stream(ag_stream)) need that to block. ag_stream = self._cached_ag_stream if ag_stream is None: ag_stream = get_ag_stream(self.chain_id, self.group) @@ -922,13 +913,14 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): and self.prev_w._need_weight_prefetch and self.prev_w._need_weight_prefetch_bwd ): - # Issue on the target's ag_stream (see _wait_param_gather). 
- target_stream = get_ag_stream(self.prev_w.chain_id, self.prev_w.group) - with torch.cuda.stream(target_stream): - _, handle = self.prev_w._all_gather_weight( - async_op=True, skip_weight_cast=True, cast_noop_flag=None, - fwd=False, nvtx_label=nvtx_label, - ) + # Issue on caller's stream — preEvent then captures the AG input + # writer via program order. Do NOT wrap in torch.cuda.stream(ag_stream): + # that drops the writer edge (ag_stream's tail has no dependency + # on capture_stream's writer) and NCCL reads partial data. + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) self.prev_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -971,15 +963,13 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - # Issue on the target's ag_stream (see _wait_param_gather). - target_stream = get_ag_stream(self.next_w.chain_id, self.next_w.group) - with torch.cuda.stream(target_stream): - _, handle = self.next_w._all_gather_weight( - async_op=True, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, nvtx_label=nvtx_label, - ) + # Issue on caller's stream. See all_gather_and_prefetch_bwd. + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) self.next_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -1069,10 +1059,7 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # Asymmetric wrt _wait_param_gather: RS is issued from main_stream - # (not rs_stream) because main produced the RS input (wgrad) and - # naturally holds the write→read ordering. Wait-site enters rs_stream - # so it observes NCCL completion and rs_event marks it for consumers. 
+ # Wait on rs_stream — mirrors _wait_param_gather for the RS path. rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1164,9 +1151,12 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: - # Async reduce-scatter (not last weight — deferred finish). Issue on rs_stream to - # match wait-site (issue-site invariant; see _wait_param_gather). wgrad is produced - # on outer stream by bwd GEMM, so sync outer → rs_stream first. + # Async reduce-scatter (not last weight — deferred finish). Issue on + # rs_stream with an explicit outer→rs_stream event so the bwd GEMM's + # wgrad writer edge is preserved. (NCCL runs on ncclStream regardless; + # the wrap only gives wait_stream(rs_stream) a useful tail before + # _wait_reduce_scatter runs. Do NOT copy this pattern without the + # event — see all_gather_and_prefetch_bwd.) outer_stream = torch.cuda.current_stream() rs_stream = get_rs_stream(self.chain_id, self.group) outer_sync_event = torch.cuda.Event() From 4ece70558cf590d8e3858b11126ef2ff34e576f5 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 24 Apr 2026 21:17:55 -0700 Subject: [PATCH 35/43] ETP+CG: fix for flaky NaN issue at scale: async AG/RS issue on side streams with explicit producer event --- .../module/extended_tensor_parallelism.py | 149 +++++++++++------- 1 file changed, 89 insertions(+), 60 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a45aa4efcc..884d055ed7 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -3,6 +3,7 @@ # See LICENSE for license information. 
from collections import defaultdict +from contextlib import nullcontext from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field @@ -340,7 +341,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = False + check_param_states: bool = True weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert @@ -803,25 +804,43 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ "Duplicate output buffers in batched all-gather — experts need distinct cache keys" - if len(gather_weights) > 1: - nvtx_range_push(f"{nvtx_label}.batched_etp_ag") - results, handle = grouped_gather_along_first_dim( - gather_weights, etp_group, - async_op=async_op, - quantizers=quantizers, - output_tensors=out_buffers, - ) - nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + # ASYNC AG: wrap issue on ag_stream so both issue (NCCL preEvent) and + # wait land on the same stream — ag_stream's tail then reflects the + # collective's full lifecycle, which is what external + # wait_stream(ag_stream) drains depend on. Explicit outer→ag_stream + # event preserves the quantize writer edge (a bare stream context + # would drop it). + # SYNC AG: stay on caller — output ready on return. 
+ if async_op: + outer_stream = torch.cuda.current_stream() + ag_stream = get_ag_stream(self.chain_id, etp_group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + ag_stream.wait_event(outer_sync_event) + ag_ctx = torch.cuda.stream(ag_stream) else: - nvtx_range_push(f"{nvtx_label}.etp_ag") - weight_total, handle = gather_along_first_dim( - gather_weights[0], etp_group, - quantizer=quantizers[0], - async_op=async_op, - output_tensor=out_buffers[0] if out_buffers is not None else None, - ) - nvtx_range_pop(f"{nvtx_label}.etp_ag") - results = [weight_total] + ag_ctx = nullcontext() + + with ag_ctx: + if len(gather_weights) > 1: + nvtx_range_push(f"{nvtx_label}.batched_etp_ag") + results, handle = grouped_gather_along_first_dim( + gather_weights, etp_group, + async_op=async_op, + quantizers=quantizers, + output_tensors=out_buffers, + ) + nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + else: + nvtx_range_push(f"{nvtx_label}.etp_ag") + weight_total, handle = gather_along_first_dim( + gather_weights[0], etp_group, + quantizer=quantizers[0], + async_op=async_op, + output_tensor=out_buffers[0] if out_buffers is not None else None, + ) + nvtx_range_pop(f"{nvtx_label}.etp_ag") + results = [weight_total] result = results if self.is_routed_expert else results[0] @@ -834,8 +853,10 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Wait on ag_stream so ag_event.record() marks ag_stream's tail — - # MLM's external drains (wait_stream(ag_stream)) need that to block. + # Enter ag_stream context so handle.wait() + ag_event.record() both + # land on ag_stream. That makes ag_event mark ag_stream's tail, which + # is what external drains (wait_stream(ag_stream) in finalize_model_grads + # and cuda_graphs._wait_side_streams) actually block on. 
ag_stream = self._cached_ag_stream if ag_stream is None: ag_stream = get_ag_stream(self.chain_id, self.group) @@ -913,10 +934,9 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): and self.prev_w._need_weight_prefetch and self.prev_w._need_weight_prefetch_bwd ): - # Issue on caller's stream — preEvent then captures the AG input - # writer via program order. Do NOT wrap in torch.cuda.stream(ag_stream): - # that drops the writer edge (ag_stream's tail has no dependency - # on capture_stream's writer) and NCCL reads partial data. + # Pre-AG work (quantize, ticket lookup) runs on caller's stream; + # the NCCL collective itself is wrapped on ag_stream inside + # _all_gather_weight (see the async/sync gate there for rationale). _, handle = self.prev_w._all_gather_weight( async_op=True, skip_weight_cast=True, cast_noop_flag=None, fwd=False, nvtx_label=nvtx_label, @@ -963,7 +983,8 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - # Issue on caller's stream. See all_gather_and_prefetch_bwd. + # Pre-AG work on caller; NCCL wrap lives at the collective site + # inside _all_gather_weight. See all_gather_and_prefetch_bwd. _, handle = self.next_w._all_gather_weight( async_op=True, skip_weight_cast=skip_weight_cast, @@ -1059,7 +1080,8 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # Wait on rs_stream — mirrors _wait_param_gather for the RS path. + # Enter rs_stream context so handle.wait() + rs_event.record() land + # on rs_stream — mirrors _wait_param_gather for the RS path. 
rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1110,27 +1132,43 @@ def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): else: out_buffers = [None] * len(wgrads) - if len(wgrads) == 1: - nvtx_range_push(f"{nvtx_label}.etp_rs") - out, handle = reduce_scatter_along_first_dim( - wgrads[0], self.group, async_op=async_op, output=out_buffers[0] - ) - nvtx_range_pop(f"{nvtx_label}.etp_rs") - return [out], handle + # ASYNC RS: wrap issue on rs_stream — issue and wait on the same stream + # means rs_stream's tail reflects the full NCCL lifecycle, what + # external wait_stream(rs_stream) drains depend on. Explicit outer→ + # rs_stream event preserves the wgrad-GEMM writer edge. Mirrors AG. + # SYNC RS: stay on caller — same constraint as sync AG. + if async_op: + outer_stream = torch.cuda.current_stream() + rs_stream = get_rs_stream(self.chain_id, self.group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + rs_stream.wait_event(outer_sync_event) + rs_ctx = torch.cuda.stream(rs_stream) else: - outputs = [] - nvtx_range_push(f"{nvtx_label}.batched_etp_rs") - with torch.distributed._coalescing_manager( - group=self.group, - device=wgrads[0].device, - async_ops=async_op, - ) as cm: - for out_buffer, tensor in zip(out_buffers, wgrads): - out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) - outputs.append(out) - nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") - - return outputs, cm if async_op else None + rs_ctx = nullcontext() + + with rs_ctx: + if len(wgrads) == 1: + nvtx_range_push(f"{nvtx_label}.etp_rs") + out, handle = reduce_scatter_along_first_dim( + wgrads[0], self.group, async_op=async_op, output=out_buffers[0] + ) + nvtx_range_pop(f"{nvtx_label}.etp_rs") + return [out], handle + else: + outputs = [] + nvtx_range_push(f"{nvtx_label}.batched_etp_rs") + with torch.distributed._coalescing_manager( + group=self.group, + 
device=wgrads[0].device, + async_ops=async_op, + ) as cm: + for out_buffer, tensor in zip(out_buffers, wgrads): + out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) + outputs.append(out) + nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") + + return outputs, cm if async_op else None def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. @@ -1151,19 +1189,10 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: - # Async reduce-scatter (not last weight — deferred finish). Issue on - # rs_stream with an explicit outer→rs_stream event so the bwd GEMM's - # wgrad writer edge is preserved. (NCCL runs on ncclStream regardless; - # the wrap only gives wait_stream(rs_stream) a useful tail before - # _wait_reduce_scatter runs. Do NOT copy this pattern without the - # event — see all_gather_and_prefetch_bwd.) - outer_stream = torch.cuda.current_stream() - rs_stream = get_rs_stream(self.chain_id, self.group) - outer_sync_event = torch.cuda.Event() - outer_sync_event.record(outer_stream) - rs_stream.wait_event(outer_sync_event) - with torch.cuda.stream(rs_stream): - _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) + # Async reduce-scatter (not last weight — deferred finish). Pre-RS + # work on caller; NCCL wrap lives at the collective site inside + # _reduce_scatter (mirrors the AG prefetch sites). + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) # Stash wgrad input buffers — cannot recycle yet because the async RS # kernel is still reading them on rs_stream. 
From a4ce839fba3724443b782ab7ac0a4aebc119a804 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sat, 25 Apr 2026 05:16:22 -0700 Subject: [PATCH 36/43] ETP: disable check_param_states by default --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 884d055ed7..054db87249 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -341,7 +341,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = True + check_param_states: bool = False weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert From f446e0236502aab4fbb24b79e4425595726dbd6a Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 26 Apr 2026 22:13:38 -0700 Subject: [PATCH 37/43] =?UTF-8?q?ETP+CG:=20enable=20cross-graph=20RS=20ove?= =?UTF-8?q?rlap=20=E2=80=94=20main=5Fgrad.add=5F=20on=20rs=5Fstream?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move gradient accumulation from caller stream to rs_stream inside _wait_reduce_scatter(finalize_grad=True). The add_ starts right after NCCL RS (concurrent with Phase 1 AG drain) instead of after it, avoiding SM-saturation that blocks cross-graph overlap. 
--- .../module/extended_tensor_parallelism.py | 88 ++++++++++++++----- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 054db87249..623d89c905 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -894,9 +894,15 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non f"cache.get() would return stale data. Check the chain's " f"_need_weight_prefetch flag and issuer's prefetch logic." ) - # Wait for async prefetch if in progress - self._wait_param_gather() - self.ag_event.wait() + if getattr(self, '_already_ag_drained', False): + # Producer already drained via wait_async_comms; skip the captured + # cross-graph wait (CUDA no-op anyway). Correctness is provided by + # the eager main_stream sync chain in the surrounding training loop. + self._already_ag_drained = False + else: + # Intra-graph or eager consume: drain inline. + self._wait_param_gather() + self.ag_event.wait() # Retrieve prefetched results from cache result = [] @@ -1057,31 +1063,26 @@ def _finalize_wgrad(param, wgrad_rs): param._set_rs_state(ETPWeightState.NONE) - # 1. Strip padding if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - # 2. Accumulation: accumulate wgrad into main_grad param.main_grad.add_(wgrad_rs) if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - # 3. Trigger DDP backward hook (register_grad_ready). - # ETP bypasses autograd's normal gradient flow (returns None for async RS, - # accumulates directly into main_grad), so we must trigger the DDP hook - # manually. 
Do NOT set param.grad before calling — the hook checks - # param.grad and would accumulate it into main_grad if zero_out_wgrad - # is True, corrupting the gradient with a non-zero dummy. if getattr(param, '_grad_accum_hook', None) is not None: param._grad_accum_hook() return dummy_grad - def _wait_reduce_scatter(self): + def _wait_reduce_scatter(self, finalize_grad=False): # Enter rs_stream context so handle.wait() + rs_event.record() land # on rs_stream — mirrors _wait_param_gather for the RS path. + # When finalize_grad=True, main_grad.add_ also runs on rs_stream + # (right after NCCL RS), so it starts during AG drain rather than + # after it — avoids SM-saturation blocking cross-graph overlap. rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1091,6 +1092,18 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() + if finalize_grad: + cache = get_global_ETP_cache() + for w in self._weights: + w._set_rs_state(ETPWeightState.NONE) + wgrad_rs = cache.get(w._rs_ticket) + if w.is_padded_last_rank: + wgrad_rs = w._strip_padding(wgrad_rs) + w.main_grad.add_(wgrad_rs) + cache.release(w._rs_ticket) + if hasattr(w, "grad_added_to_main_grad"): + w.grad_added_to_main_grad = True + self._already_finalized = True # Release stashed wgrad inputs: UNGRAPHED buffers go back to the pool; # GRAPHED just drops Python refs (addresses must stay stable for CG). 
if getattr(self, '_wgrad_input_bufs', None) is not None: @@ -1211,12 +1224,15 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Currently only support reduce scattering in reverse order if ETP_CONFIG.weight_prefetch and self.next_w is not None: self.next_w._wait_reduce_scatter() - self.next_w.rs_event.wait() - cache = get_global_ETP_cache() - for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket)) - cache.release(w._rs_ticket) + if getattr(self.next_w, '_already_finalized', False): + self.next_w._already_finalized = False + else: + self.next_w.rs_event.wait() + cache = get_global_ETP_cache() + for w in self.next_w._weights: + self._finalize_wgrad(w, cache.get(w._rs_ticket)) + cache.release(w._rs_ticket) return ret @@ -1471,19 +1487,45 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None): - """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). +def wait_async_comms(chain_id: str = None, skip_rs: bool = False, finalize_after_drain: bool = False): + """Drain in-flight ETP async AG / RS handles. + + When called inside CUDA graph capture, the drains are captured into that + graph. This is the producer-side hook for cross-graph AG/RS overlap: + captured cudaStreamWaitEvent on an event recorded in a different capture + session is a CUDA no-op, so consumer graphs can't safely wait on + cross-graph events. Instead, the producer drains here and flags the + param; the consumer reads the flag and skips its captured wait. Args: - chain_id: If specified, only drain params belonging to this chain - (ETPChain.GRAPHED.value or ETPChain.UNGRAPHED.value). - If None, drain all chains. + chain_id: If specified, only drain params on this chain. + skip_rs: Drain AG only; leave RS in flight. + finalize_after_drain: After RS drain, also accumulate wgrad into + main_grad. 
Runs main_grad.add_ on rs_stream (right after + NCCL RS) so it starts during AG drain rather than after, + avoiding SM-saturation that blocks cross-graph overlap. + Falls back to caller-stream _finalize_wgrad if no RS handle. + + Per-param side effects: + * _already_ag_drained = True (if an AG handle was drained) + * _already_finalized = True (if finalize_after_drain=True) """ for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', ETPChain.UNGRAPHED.value) != chain_id: continue + had_ag = param._prefetch_handle is not None param._wait_param_gather() - param._wait_reduce_scatter() + if had_ag: + param._already_ag_drained = True + if not skip_rs: + param._wait_reduce_scatter(finalize_grad=finalize_after_drain) + if finalize_after_drain and not getattr(param, '_already_finalized', False): + cache = get_global_ETP_cache() + param.rs_event.wait() + for w in param._weights: + ETPShardedParam._finalize_wgrad(w, cache.get(w._rs_ticket)) + cache.release(w._rs_ticket) + param._already_finalized = True @dataclass From 464c28988b4c3471fb1c88ccb7e95a383ac2131f Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 23 Apr 2026 09:44:09 -0700 Subject: [PATCH 38/43] wgrad accum fusion Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 623d89c905..171d060f7c 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1053,27 +1053,20 @@ def register_grad_accum_hook(self, grad_accum_node, hook): self._grad_accum_hook = hook @staticmethod - def _finalize_wgrad(param, wgrad_rs): - """Post-RS per-param processing: strip padding, accumulate, call DDP hook. 
+ def _handle_megatron_grad_accum(param): + """Handle megatron DDP and gradient accumulation fusion. - Accumulates the reduce-scattered wgrad into main_grad and triggers - the DDP backward hook (register_grad_ready) so the DP reduce-scatter - fires at the correct time during backward. + Do NOT set param.grad before calling the hook — the hook checks + param.grad and would accumulate it into main_grad if zero_out_wgrad + is True, corrupting the gradient with a non-zero dummy. """ - - param._set_rs_state(ETPWeightState.NONE) - - if param.is_padded_last_rank: - wgrad_rs = param._strip_padding(wgrad_rs) - - param.main_grad.add_(wgrad_rs) if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - if getattr(param, '_grad_accum_hook', None) is not None: param._grad_accum_hook() + param._set_rs_state(ETPWeightState.NONE) return dummy_grad @@ -1213,8 +1206,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately - sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] + wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) + wgrads = [ + w._strip_padding(g) if w.is_padded_last_rank else w for w, g in zip(weights, wgrads) + ] + torch._foreach_add_([p.main_grad for p in weights], wgrads) + result = [self._handle_megatron_grad_accum(p) for p in weights] + if poolable: for buf in wgrads: _wgrad_pool_put(buf) @@ -1230,8 +1228,15 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): else: self.next_w.rs_event.wait() cache = get_global_ETP_cache() - for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket)) + next_weights = self.next_w._weights + wgrads = [cache.get(w._rs_ticket) for w in 
next_weights] + wgrads = [ + w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(next_weights, wgrads) + ] + + torch._foreach_add_([w.main_grad for w in next_weights], wgrads) + for w in next_weights: + self._handle_megatron_grad_accum(w) cache.release(w._rs_ticket) return ret From 06a27562454fac8875f9f56880fd1e0666ed09ec Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 01:09:37 -0700 Subject: [PATCH 39/43] minor fix for wgrad accum fusion --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 171d060f7c..354148f692 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1208,7 +1208,7 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else w for w, g in zip(weights, wgrads) + w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) ] torch._foreach_add_([p.main_grad for p in weights], wgrads) result = [self._handle_megatron_grad_accum(p) for p in weights] From e107b05e0b386ab9c22e717acf5d45bc7db00667 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 23:50:12 -0700 Subject: [PATCH 40/43] ETP+mxfp8: reject coalesced-amax path for non-NVFP4 quantizers --- .../module/extended_tensor_parallelism.py | 48 ++++++++----------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 354148f692..a5100343ce 100644 --- 
a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -245,33 +245,22 @@ def get_rs_streams_for_chain(chain_id: str) -> list: def _coalesced_amax_static_eligible(weights): - """Walk the weight list once and decide whether the coalesced-amax path - is applicable. Depends only on fields that are fixed after model - construction (quantizer class, flags, amax_reduction_group, group size).""" + """Check whether the coalesced-amax path is applicable (NVFP4 only). + + Caller already gates on ETP_CONFIG.coalesce_amax_allreduce (False for + non-NVFP4). Here we additionally verify TE API availability, batch size, + quantizer type (must have amax reduction), and the RHT flag.""" if not _COALESCED_AMAX_TE_APIS_AVAILABLE: return False if len(weights) <= 1: return False - - group = None - for w in weights: - q = w._quantizer - if q is None or not isinstance(w.quantized, NVFP4TensorStorage): - return False - if not getattr(q, "with_amax_reduction", False): - return False - if getattr(q, "with_rht", False): - # RHT path does amax on RHT-rotated view, can't split compute - # from cast the way compute_amax_only assumes. 
- return False - g = getattr(q, "amax_reduction_group", None) - if g is None: - return False - if group is None: - group = g - elif g is not group: - return False - return group.size() > 1 + has_amax = [getattr(w._quantizer, "with_amax_reduction", False) for w in weights] + if not all(has_amax): + return False + has_rht = any(getattr(w._quantizer, "with_rht", False) for w in weights) + if has_rht: + return False + return True def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): @@ -565,8 +554,9 @@ def setup(self, weight_quantizer=None): if self._quantizer is None: def _configure_quantizer(q, group): q = q.copy() - q.with_amax_reduction = True - q.amax_reduction_group = group + if hasattr(q, 'with_amax_reduction'): + q.with_amax_reduction = True + q.amax_reduction_group = group q.internal = False q.optimize_for_gemm = True return q @@ -1597,10 +1587,10 @@ def grouped_gather_along_first_dim( if async_op: handle = gather_coalescing_manager - if ( - quantizers is not None - and getattr(quantizers[0], "columnwise_usage", False) - ): + has_nvfp4_handles = any( + isinstance(wh, _NVFP4AllGatherAsyncHandle) for wh in weight_handles + ) + if has_nvfp4_handles: handle = BatchedNVFP4AllGatherAsyncHandle(weight_handles, handle) else: for wh in weight_handles: From 03bb9fbfe96b10e23b5fc06bb0e49aa7681bdd86 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 23:51:12 -0700 Subject: [PATCH 41/43] ETP: add debug_numerics instrumentation for NaN/Inf triage --- .../module/extended_tensor_parallelism.py | 141 +++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a5100343ce..7b47d496d1 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -250,16 +250,35 @@ def 
_coalesced_amax_static_eligible(weights): Caller already gates on ETP_CONFIG.coalesce_amax_allreduce (False for non-NVFP4). Here we additionally verify TE API availability, batch size, quantizer type (must have amax reduction), and the RHT flag.""" + dbg = ETP_CONFIG.debug_numerics > 0 if not _COALESCED_AMAX_TE_APIS_AVAILABLE: + if dbg: + print_rank_0("[ETP_DEBUG] coalesced_amax_static: REJECTED (TE APIs unavailable)") return False if len(weights) <= 1: return False has_amax = [getattr(w._quantizer, "with_amax_reduction", False) for w in weights] if not all(has_amax): + if dbg: + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] coalesced_amax_static: REJECTED " + f"(with_amax_reduction={has_amax[:3]}{'...' if len(has_amax)>3 else ''}, " + f"quantizer_types={qtypes}{'...' if len(weights)>3 else ''}, " + f"n_weights={len(weights)})" + ) return False has_rht = any(getattr(w._quantizer, "with_rht", False) for w in weights) if has_rht: + if dbg: + print_rank_0("[ETP_DEBUG] coalesced_amax_static: REJECTED (with_rht=True)") return False + if dbg: + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] coalesced_amax_static: *** ACCEPTED *** " + f"(n_weights={len(weights)}, quantizer_types={qtypes}{'...' if len(weights)>3 else ''})" + ) return True @@ -339,9 +358,70 @@ class ETPConfig: # guard in _coalesced_amax_static_eligible falls back to the per-weight # path when either binding is missing. coalesce_amax_allreduce: bool = True + # Log numeric diagnostics for the first N AG/RS calls per param. + # 0 = off; 3 = good default for triage (covers iter 1-2 fwd+bwd). 
+ debug_numerics: int = 0 ETP_CONFIG = ETPConfig() +# --------------------------------------------------------------------------- +# Debug helpers (gated by ETP_CONFIG.debug_numerics > 0) +# --------------------------------------------------------------------------- +_etp_debug_counts: Dict[tuple, int] = {} + +def _etp_dbg_capturing(): + """True when a CUDA graph is being captured — D2H syncs are forbidden.""" + return torch.cuda.is_current_stream_capturing() + +def _etp_dbg_should_log(param_name, label): + if ETP_CONFIG.debug_numerics <= 0 or _etp_dbg_capturing(): + return False + key = (param_name, label) + count = _etp_debug_counts.get(key, 0) + if count >= ETP_CONFIG.debug_numerics: + return False + _etp_debug_counts[key] = count + 1 + return True + +def _etp_dbg_tensor(name, t): + """One-line NaN/Inf summary for a BF16/FP32 tensor.""" + if t is None: + return f"{name}=None" + if t.numel() == 0: + return f"{name}:{list(t.shape)},empty" + if not t.is_floating_point(): + return f"{name}:non-float({t.dtype})" + has_nan = bool(torch.isnan(t).any()) + has_inf = bool(torch.isinf(t).any()) + amax = t.abs().max().item() + tag = " ***BAD***" if (has_nan or has_inf) else "" + return f"{name}:{list(t.shape)},amax={amax:.4e},nan={has_nan},inf={has_inf}{tag}" + +def _etp_dbg_quantized(name, qt): + """Multi-line check of a quantized tensor's metadata fields.""" + if qt is None: + return f"{name}=None" + md = qt.get_metadata() + parts = [f"{name}:type={type(qt).__name__}"] + for k in ("rowwise_data", "columnwise_data"): + v = md.get(k) + parts.append(f" {k}={'shape=' + str(list(v.shape)) if v is not None else 'NONE'}") + for k in ("rowwise_scale_inv", "columnwise_scale_inv"): + v = md.get(k) + if v is not None and v.numel() == 0: + parts.append(f" {k}:{list(v.shape)},empty") + elif v is not None and v.is_floating_point(): + has_nan = bool(torch.isnan(v).any()) + has_inf = bool(torch.isinf(v).any()) + amax = v.abs().max().item() + tag = " ***BAD***" if (has_nan or has_inf) 
else "" + parts.append(f" {k}:{list(v.shape)},amax={amax:.4e},nan={has_nan},inf={has_inf}{tag}") + elif v is not None: + parts.append(f" {k}:{list(v.shape)},dtype={v.dtype}") + else: + parts.append(f" {k}:NONE") + return "\n".join(parts) + def update_config(**kwargs): """Update the global ETP configuration.""" for key, value in kwargs.items(): @@ -742,6 +822,17 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv else: use_coalesced = False + if _etp_dbg_should_log(self._debug_name, 'ag_decision'): + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] AG {self._debug_name} fwd={fwd} chain={self.chain_id} " + f"coalesced={use_coalesced} skip_cast={skip_weight_cast} " + f"noop={'set' if cast_noop_flag is not None else 'None'} " + f"coalesce_cfg={ETP_CONFIG.coalesce_amax_allreduce} " + f"static_ok={getattr(self, '_coalesced_amax_static', 'N/A')} " + f"qtypes={qtypes}{'...' if len(weights)>3 else ''}" + ) + if use_coalesced: _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag) else: @@ -751,6 +842,15 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if w.did_cast_to_low_precision: w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) + if _etp_dbg_should_log(self._debug_name, f'ag_numerics_{"fwd" if fwd else "bwd"}'): + lines = [f"[ETP_DEBUG] post-quantize {self._debug_name} fwd={fwd} " + f"usage=row:{fwd},col:{not fwd}"] + for i, w in enumerate(weights[:3]): + lines.append(f" w[{i}] shard: {_etp_dbg_tensor(f'{w._debug_name}', w.data)}") + if w.did_cast_to_low_precision: + lines.append(_etp_dbg_quantized(f' w[{i}] quantized', w.quantized)) + print_rank_0("\n".join(lines)) + # 3. Build gather inputs. 
# quantizers / dtypes / etp_group are stable after model construction — # cache on the anchor (self == weights[0]) to avoid rebuilding lists @@ -884,7 +984,8 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non f"cache.get() would return stale data. Check the chain's " f"_need_weight_prefetch flag and issuer's prefetch logic." ) - if getattr(self, '_already_ag_drained', False): + _was_drained = getattr(self, '_already_ag_drained', False) + if _was_drained: # Producer already drained via wait_async_comms; skip the captured # cross-graph wait (CUDA no-op anyway). Correctness is provided by # the eager main_stream sync chain in the surrounding training loop. @@ -902,6 +1003,17 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non result.append(cache.get(ticket)) result = [self._strip_padding(r) for r in result] + + if _etp_dbg_should_log(self._debug_name, f'prefetch_{"fwd" if fwd else "bwd"}'): + lines = [f"[ETP_DEBUG] prefetched {self._debug_name} fwd={fwd} " + f"already_drained={_was_drained}"] + for i, r in enumerate(result[:3]): + if isinstance(r, (NVFP4TensorStorage, MXFP8TensorStorage)): + lines.append(_etp_dbg_quantized(f' gathered[{i}]', r)) + else: + lines.append(f" gathered[{i}]: {_etp_dbg_tensor('', r)}") + print_rank_0("\n".join(lines)) + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, self._weights)] return result if self.is_routed_expert else result[0] @@ -1083,6 +1195,12 @@ def _wait_reduce_scatter(self, finalize_grad=False): if w.is_padded_last_rank: wgrad_rs = w._strip_padding(wgrad_rs) w.main_grad.add_(wgrad_rs) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after finalize_grad RS *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) cache.release(w._rs_ticket) if hasattr(w, 
"grad_added_to_main_grad"): w.grad_added_to_main_grad = True @@ -1184,6 +1302,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # stable buffer addresses across replay. poolable = self.chain_id == ETPChain.UNGRAPHED.value + if _etp_dbg_should_log(self._debug_name, 'rs_input'): + lines = [f"[ETP_DEBUG] RS input {self._debug_name} " + f"async={self.prev_w is not None and ETP_CONFIG.weight_prefetch}"] + for i, g in enumerate(wgrads[:3]): + lines.append(f" wgrad[{i}]: {_etp_dbg_tensor('', g)}") + print_rank_0("\n".join(lines)) + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish). Pre-RS # work on caller; NCCL wrap lives at the collective site inside @@ -1201,6 +1326,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) ] torch._foreach_add_([p.main_grad for p in weights], wgrads) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + for w in weights[:3]: + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after sync RS *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) result = [self._handle_megatron_grad_accum(p) for p in weights] if poolable: @@ -1225,6 +1357,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): ] torch._foreach_add_([w.main_grad for w in next_weights], wgrads) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + for w in next_weights[:3]: + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after async RS finalize *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) for w in next_weights: self._handle_megatron_grad_accum(w) cache.release(w._rs_ticket) From 550f0746aa84a319aec1d8488d767a902946b594 Mon Sep 17 00:00:00 2001 From: 
Shiqing Fan Date: Tue, 28 Apr 2026 02:18:19 -0700 Subject: [PATCH 42/43] ETP+mxfp8 divergence fix: disable GEMM-swizzled scales for all-gather compatibility. --- .../pytorch/module/extended_tensor_parallelism.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 7b47d496d1..ef66994d34 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -18,6 +18,7 @@ ) from ..quantized_tensor import QuantizedTensor from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage +from ..tensor.mxfp8_tensor import MXFP8Quantizer from ..utils import nvtx_range_pop, nvtx_range_push, round_up_to_nearest_multiple from ..constants import NVFP4_BLOCK_SCALING_SIZE, MXFP8_BLOCK_SCALING_SIZE from .base import get_dummy_wgrad @@ -638,7 +639,11 @@ def _configure_quantizer(q, group): q.with_amax_reduction = True q.amax_reduction_group = group q.internal = False - q.optimize_for_gemm = True + # MXFP8 scales must stay in compact (unswizzled) layout so that + # per-shard scale_inv can be all-gathered via byte concatenation. + # GEMM-swizzled scales from independent shards don't compose into + # a valid swizzled layout for the full tensor after AG. 
+ q.optimize_for_gemm = not isinstance(q, MXFP8Quantizer) return q weights = self.weight_list if self.is_routed_expert and self.weight_list is not None else [self] From 520251c0963ff513a4f39f4aa29baeb57c756343 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 28 Apr 2026 02:34:04 -0700 Subject: [PATCH 43/43] ETP: pad full tensor before sharding instead of per-rank on-the-fly --- .../module/extended_tensor_parallelism.py | 41 +++++-------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index ef66994d34..84dbe05eeb 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -459,24 +459,21 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): delattr(module, name) if ETP_CONFIG.pad_for_alignment > 0: - # Ensure each shard's dim0 is a multiple of 16 for quantization (NVFP4/FP8) by padding - # the last rank such that the total padded length of dim0 is a multiple of ETP size * 16 + # Pad the full tensor BEFORE sharding so every rank gets exactly + # shard_size rows and each shard's dim0 is alignment-divisible. + # Padding stays contiguous at the tail of the gathered result — + # no interleaved-padding reshuffle needed after all-gather. 
alignment = ETP_CONFIG.pad_for_alignment * etp_size tensor = param.data dim0 = tensor.shape[0] pad_length = (alignment - dim0 % alignment) % alignment if alignment > 0 else 0 + if pad_length > 0: + tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad_length)) padded_dim0 = dim0 + pad_length - is_padded_last_rank = pad_length > 0 and etp_rank == etp_size - 1 - # Create the ETP sharded param, pass a clone of the shard so that the original unsharded - # buffer may be deallocated shard_size = padded_dim0 // etp_size - start_idx = etp_rank * shard_size - end_idx = min((etp_rank + 1) * shard_size, tensor.shape[0]) - shard = tensor[start_idx: end_idx] + shard = tensor[etp_rank * shard_size : (etp_rank + 1) * shard_size] etp_shard = ETPShardedParam(shard.clone()) - # finally, set attributes etp_shard.pad_length = pad_length - etp_shard.is_padded_last_rank = is_padded_last_rank else: shard_size = tensor.shape[0] // etp_group.size() shard = tensor[etp_rank * shard_size: (etp_rank + 1) * shard_size] @@ -617,7 +614,6 @@ def __init__(self, x, *args, **kwargs): self.rs_event = torch.cuda.Event(external=True) self._rs_ticket = None # Padding - self.is_padded_last_rank = False self.pad_length = 0 # Debug self._debug_name = "" @@ -666,11 +662,8 @@ def _weights(self): @property def _unsharded_shape_padded(self): out_shape = list(self.size()) - if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: - out_shape[0] = (out_shape[0]+ self.pad_length) * self.group.size() - else: - out_shape[0] = out_shape[0] * self.group.size() - return tuple(out_shape) + out_shape[0] = out_shape[0] * self.group.size() + return tuple(out_shape) @property def _unsharded_shape(self): @@ -680,14 +673,9 @@ def _unsharded_shape(self): @property def _sharded_padded_shape(self): - out_shape = list(self.size()) - if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: - out_shape[0] += self.pad_length - return tuple(out_shape) + return tuple(self.size()) def 
get_padded_shard(self): - if self.pad_length > 0 and self.is_padded_last_rank: - return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) return self def _set_state(self, new_state: ETPWeightState): @@ -1197,8 +1185,6 @@ def _wait_reduce_scatter(self, finalize_grad=False): for w in self._weights: w._set_rs_state(ETPWeightState.NONE) wgrad_rs = cache.get(w._rs_ticket) - if w.is_padded_last_rank: - wgrad_rs = w._strip_padding(wgrad_rs) w.main_grad.add_(wgrad_rs) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): @@ -1327,9 +1313,6 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): else: # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) - ] torch._foreach_add_([p.main_grad for p in weights], wgrads) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): for w in weights[:3]: @@ -1357,10 +1340,6 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): cache = get_global_ETP_cache() next_weights = self.next_w._weights wgrads = [cache.get(w._rs_ticket) for w in next_weights] - wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(next_weights, wgrads) - ] - torch._foreach_add_([w.main_grad for w in next_weights], wgrads) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): for w in next_weights[:3]: