From e49dd752dcd7c83fdd707d2ed7dfbdded5001132 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 5 Mar 2026 14:45:49 -0800 Subject: [PATCH 01/43] ETP Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 147 ++-- transformer_engine/pytorch/module/base.py | 6 + .../module/extended_tensor_parallelism.py | 775 ++++++++++++++++++ .../pytorch/module/grouped_linear.py | 156 ++-- .../pytorch/module/layernorm_linear.py | 76 +- .../pytorch/module/layernorm_mlp.py | 10 + transformer_engine/pytorch/module/linear.py | 46 +- .../tensor/storage/nvfp4_tensor_storage.py | 12 + transformer_engine/pytorch/utils.py | 40 + 9 files changed, 1158 insertions(+), 110 deletions(-) create mode 100644 transformer_engine/pytorch/module/extended_tensor_parallelism.py diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index f269e21b8c..dc5d8b3063 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -6,11 +6,11 @@ from __future__ import annotations from collections.abc import Iterable -from contextlib import contextmanager, AbstractContextManager, ContextDecorator +from contextlib import contextmanager, AbstractContextManager, ContextDecorator, nullcontext from functools import lru_cache from dataclasses import dataclass import math -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, ContextManager, Dict, List, Optional, Tuple, Union import warnings import torch @@ -60,6 +60,14 @@ "partition_stride": 1, } + +_EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS = { + 'etp_model_parallel': False, + 'etp_partition_dim': -1, + 'etp_partition_stride': 1, +} + + _USE_REENTRANT_ACTIVATION_RECOMPUTE = True _FP8_ACTIVATION_RECOMPUTE_ENABLED = False @@ -159,6 +167,19 @@ def set_tensor_model_parallel_attributes( setattr(tensor, "partition_stride", stride) +def set_extended_tensor_parallel_attributes( + tensor: torch.Tensor, 
is_parallel: bool, dim: int, stride: int +) -> None: + """Set ps attributes to tensor.""" + # Make sure the attributes are not set. + for attribute in _EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS: + assert not hasattr(tensor, attribute) + # Set the attributes. + setattr(tensor, 'etp_model_parallel', is_parallel) + setattr(tensor, 'etp_partition_dim', dim) + setattr(tensor, 'etp_partition_stride', stride) + + @lru_cache def get_distributed_world_size(group: Optional[dist_group_type] = None) -> int: """Return world size for the distributed group.""" @@ -908,7 +929,7 @@ def fork(self, name: str = "model-parallel-rng"): def reduce_scatter_along_first_dim( - inp: torch.Tensor, tp_group: dist_group_type, async_op: bool = False + inp: torch.Tensor, tp_group: dist_group_type, async_op: bool = False, output: torch.Tensor = None ) -> Tuple[torch.Tensor, Optional[torch.distributed.Work]]: """Reduce-scatter the input tensor across model parallel group.""" world_size = get_distributed_world_size(tp_group) @@ -916,14 +937,15 @@ def reduce_scatter_along_first_dim( if world_size == 1: return inp, None - dim_size = list(inp.size()) - assert ( - dim_size[0] % world_size == 0 - ), "First dimension of the tensor should be divisible by tensor parallel size" + if output is None: + dim_size = list(inp.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" - dim_size[0] = dim_size[0] // world_size + dim_size[0] = dim_size[0] // world_size - output = torch.empty(dim_size, dtype=inp.dtype, device=torch.cuda.current_device()) + output = torch.empty(dim_size, dtype=inp.dtype, device=torch.cuda.current_device()) handle = torch.distributed.reduce_scatter_tensor( output, inp.contiguous(), group=tp_group, async_op=async_op ) @@ -1271,17 +1293,20 @@ class _NVFP4AllGatherAsyncHandle: async_handle: torch.distributed.Work _synchronized: bool = False - def wait(self) -> None: - """Wait for the async operation to 
complete and post-process the tensor.""" - if self._synchronized: - return - self.async_handle.wait() + def post_process_nvfp4_gather(self) -> None: _post_process_nvfp4_gather( self.output, self.columnwise_data_interleaved, self.columnwise_scale_inv_interleaved, self.world_size, ) + + def wait(self) -> None: + """Wait for the async operation to complete and post-process the tensor.""" + if self._synchronized: + return + self.async_handle.wait() + self.post_process_nvfp4_gather() self._synchronized = True @@ -1292,6 +1317,8 @@ def _all_gather_nvfp4( async_op: bool = False, quantizer: NVFP4Quantizer, out_shape: Optional[list[int]] = None, + output_tensor = None, + grouped = False, ) -> tuple[NVFP4TensorStorage, Optional[torch.distributed.Work]]: """All-gather NVFP4 tensor along first dimension.""" @@ -1348,6 +1375,12 @@ def _all_gather_nvfp4( out = quantizer(out) return out, None + # Construct NVFP4 output tensor + if output_tensor is not None: + out = output_tensor + else: + out = quantizer.make_empty(out_shape, dtype=dtype, device=device) + # Cast input tensor to NVFP4 with required data if not isinstance(inp, NVFP4TensorStorage): inp = quantizer(inp) @@ -1360,17 +1393,19 @@ def _all_gather_nvfp4( ) inp = quantizer(inp.dequantize()) - # Construct NVFP4 output tensor - out = quantizer.make_empty(out_shape, dtype=dtype, device=device) - - # Coalesce NCCL collectives for gathering data and scale inverses. - with torch.distributed._coalescing_manager( - group=process_group, - device=device, - async_ops=async_op, - ) as gather_coalescing_manager: + if not grouped: + # Coalesce NCCL collectives for gathering data and scale inverses. 
+ gather_coalescing_manager = torch.distributed._coalescing_manager( + group=process_group, + device=device, + async_ops=async_op, + ) + else: + gather_coalescing_manager = nullcontext() + with gather_coalescing_manager as coalesced_handle: # Gather NVFP4 data for row-wise usage + out_columnwise_data = None if quantizer.rowwise_usage: # Remove padding from NVFP4 scale-inverses @@ -1446,15 +1481,19 @@ def _all_gather_nvfp4( # Transfer amax to output. out._amax_columnwise = inp._amax_columnwise - handle = gather_coalescing_manager if async_op else None + handle = coalesced_handle if async_op else None # Fixes interleaved data for transposed tensor/scale inv and pads scale inv if needed. - if async_op and quantizer.columnwise_usage: - handle = _NVFP4AllGatherAsyncHandle( - out, out_columnwise_data, out_scale_inv, world_size, handle - ) - elif quantizer.columnwise_usage: - _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) + if quantizer.columnwise_usage: + if async_op or grouped: + # Defer post-processing: either the async op hasn't completed yet, or an + # external coalescing manager owns the NCCL ops and hasn't flushed them. 
+ inner_handle = handle if async_op else None + handle = _NVFP4AllGatherAsyncHandle( + out, out_columnwise_data, out_scale_inv, world_size, inner_handle + ) + else: + _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) return out, handle @@ -1466,6 +1505,8 @@ def _all_gather_mxfp8( async_op: bool = False, quantizer: MXFP8Quantizer, out_shape: Optional[list[int]] = None, + output_tensor: torch.Tensor = None, + grouped: bool = False, ) -> tuple[MXFP8TensorStorage, Optional[torch.distributed.Work]]: """All-gather MXFP8 tensor along first dimension.""" @@ -1528,15 +1569,22 @@ def _all_gather_mxfp8( inp = quantizer(inp.dequantize()) # Construct MXFP8 output tensor - out = quantizer.make_empty(out_shape, dtype=dtype, device=device) + if output_tensor is not None: + out = output_tensor + else: + out = quantizer.make_empty(out_shape, dtype=dtype, device=device) - # Coalesce NCCL collectives - with torch.distributed._coalescing_manager( - group=process_group, - device=device, - async_ops=async_op, - ) as coalescing_manager: + if not grouped: + # Coalesce NCCL collectives for gathering data and scale inverses. + gather_coalescing_manager = torch.distributed._coalescing_manager( + group=process_group, + device=device, + async_ops=async_op, + ) + else: + gather_coalescing_manager = nullcontext() + with gather_coalescing_manager as coalesced_handle: # Gather MXFP8 data for row-wise usage if quantizer.rowwise_usage: @@ -1583,7 +1631,7 @@ def _all_gather_mxfp8( group=process_group, ) - handle = coalescing_manager if async_op else None + handle = coalesced_handle if async_op else None return out, handle @@ -1592,6 +1640,8 @@ def gather_along_first_dim( process_group: dist_group_type, async_op: bool = False, quantizer: Optional[Quantizer] = None, + output_tensor: torch.Tensor = None, + grouped: bool = False, ) -> tuple[torch.Tensor, Optional[torch.distributed.Work]]: """ All-gather tensors and concatenate along first dimension. 
@@ -1679,6 +1729,8 @@ def gather_along_first_dim( async_op=async_op, quantizer=quantizer, out_shape=out_shape, + output_tensor=output_tensor, + grouped=grouped, ) # NVFP4 case @@ -1690,6 +1742,8 @@ def gather_along_first_dim( async_op=async_op, quantizer=quantizer, out_shape=out_shape, + output_tensor=output_tensor, + grouped=grouped, ) # High-precision communication for quantized tensors @@ -1719,19 +1773,20 @@ def gather_along_first_dim( inp = inp.dequantize() # Communication for plain PyTorch tensors - out = torch.empty( - out_shape, - dtype=inp.dtype, - device=inp.device, - memory_format=torch.contiguous_format, - ) + if output_tensor is None: + output_tensor = torch.empty( + out_shape, + dtype=inp.dtype, + device=inp.device, + memory_format=torch.contiguous_format, + ) handle = torch.distributed.all_gather_into_tensor( - out, + output_tensor, inp.contiguous(), group=process_group, async_op=async_op, ) - return out, handle + return output_tensor, handle # Global cache to store symmetric memory tensors diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 9c21141a39..b565a40f87 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -631,6 +631,7 @@ def __init__(self, name: Optional[str] = None) -> None: self.activation_dtype: Optional[torch.dtype] = None self.wgrad_accumulation_and_reduce_hooks = [] self.wgrad_store = None + self.etp_size = 1 if not TEDebugState.debug_enabled: TEDebugState.initialize() @@ -956,6 +957,8 @@ def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> N self.fast_setattr("tp_group", tp_group) self.fast_setattr("tp_group_initialized", True) + + def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: """returns the FP8 weights.""" fp8_params = [] @@ -1242,6 +1245,9 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None: for name, param in self.named_parameters(recurse=False): # Check if 
parameter is a DTensor (FSDP2) or regular tensor is_dtensor = isinstance(param, DTensor) + from .extended_tensor_parallelism import ETPShardedParam + is_etp = isinstance(param, ETPShardedParam) + dtensor_param = param if is_dtensor else None # Need to update/quantize local tensor in case of DTensor param = param._local_tensor if is_dtensor else param diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py new file mode 100644 index 0000000000..cb4e058418 --- /dev/null +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -0,0 +1,775 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from collections import defaultdict +from typing import Dict, List +from enum import Enum +from dataclasses import dataclass +import torch + +from ..distributed import ( + gather_along_first_dim, + reduce_scatter_along_first_dim, + _NVFP4AllGatherAsyncHandle +) +from ..quantized_tensor import QuantizedTensor +from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage +from ..utils import nvtx_range_pop, nvtx_range_push +from .base import get_dummy_wgrad + +import transformer_engine_torch as tex + + +class ETPWeightState(Enum): + NONE = "NONE" # Sharded, no pending operation + ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress + ASYNC_DONE = "ASYNC_DONE" # Async all-gather complete, result in cache + +_STATE_TRANSITIONS = { + ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT}, + ETPWeightState.ASYNC_WAIT: {ETPWeightState.ASYNC_DONE}, + ETPWeightState.ASYNC_DONE: {ETPWeightState.NONE}, +} + + +# Global AG Prefetching Buffer for ETP. 
+_ALL_GATHER_BUFFER = None + + +@dataclass +class ETPConfig: + """Global configuration for Extended Tensor Parallelism.""" + pad_for_alignment: int = 16 + weight_prefetch: bool = True + +ETP_CONFIG = ETPConfig() + +def update_config(**kwargs): + """Update the global ETP configuration.""" + for key, value in kwargs.items(): + if not hasattr(ETP_CONFIG, key): + raise ValueError(f"Unknown ETP config option: {key}") + setattr(ETP_CONFIG, key, value) + + +def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): + """Shard and re-register all parameters of a module using ETP weight sharding.""" + if etp_group.size() == 1: + return + + etp_size = etp_group.size() + etp_rank = etp_group.rank() + + for idx, name in enumerate(weight_names): + param = getattr(module, name, None) + if param is None: + continue + + # delete the original parameter, which will be replaced by an ETP sharded one + delattr(module, name) + + if ETP_CONFIG.pad_for_alignment > 0: + # Ensure each shard's dim0 is a multiple of 16 for quantization (NVFP4/FP8) by padding + # the last rank such that the total padded length of dim0 is a multiple of ETP size * 16 + alignment = ETP_CONFIG.pad_for_alignment * etp_size + tensor = param.data + dim0 = tensor.shape[0] + pad_length = (alignment - dim0 % alignment) % alignment if alignment > 0 else 0 + padded_dim0 = dim0 + pad_length + is_padded_last_rank = pad_length > 0 and etp_rank == etp_size - 1 + # Create the ETP sharded param, pass a clone of the shard so that the original unsharded + # buffer may be deallocated + shard_size = padded_dim0 // etp_size + start_idx = etp_rank * shard_size + end_idx = min((etp_rank + 1) * shard_size, tensor.shape[0]) + shard = tensor[start_idx: end_idx] + etp_shard = ETPShardedParam(shard.clone()) + # finally, set attributes + etp_shard.pad_length = pad_length + etp_shard.is_padded_last_rank = is_padded_last_rank + else: + shard_size = tensor.shape[0] // etp_group.size() + shard = tensor[etp_rank * shard_size: 
(etp_rank + 1) * shard_size] + etp_shard = ETPShardedParam(shard.clone()) + + if is_grouped: + etp_shard.expert_idx = idx + etp_shard.is_routed_expert = True + etp_shard.group = etp_group + etp_shard.ps_size = etp_size + # register the newly sharded param back to the module + module._parameters[name] = etp_shard + + if is_grouped: + allweights = [getattr(module, name) for name in weight_names] + allweights[0].weight_list = allweights + + +class ETPShardHandle: + + def __init__(self, handle, etp_shards: list): + self.handle = handle + self.etp_shards = etp_shards + + def wait(self): + if self.handle is not None: + self.handle.wait() + for w in self.etp_shards: + w._set_state(ETPWeightState.ASYNC_DONE) + + +class ETPShardedParam(torch.nn.Parameter): + + _pending_rs_weight = None + _first_weight_flag = True + _last_weight = None + + @staticmethod + def __new__(cls, tensor, *args, **kwargs): + requires_grad = kwargs.get('requires_grad', True) + return super(ETPShardedParam, cls).__new__(cls, tensor, requires_grad=requires_grad) + + def __init__(self, x, *args, **kwargs): + super().__init__() + + self.state = ETPWeightState.NONE + self._cache_ticket = None + self._prefetch_handle = None + self._grad_accum_node = None + self._grad_accum_hook = None + # Quantization + self._quantizer = None + self.did_cast_to_low_precision = False + self.quantized = None + # Prefetching linked list + self.is_first_weight = False + self.next_w = None + self.prev_w = None + # Grouped gemm + self.is_routed_expert = False + self.expert_idx = None + self.group = None + self.weight_list = None + # Reduce-scatter state (set during wgrad_reduce_scatter) + self.wgrad_rs = None + self.wgrad_rs_handle = None + self.fuse_wgrad_accumulation = False + # Padding + self.is_padded_last_rank = False + self.pad_length = 0 + + def setup(self, weight_quantizer=None): + """Set quantizer and create quantized shard.""" + + if self._quantizer is None: + def _configure_quantizer(q, group): + q = q.copy() + 
q.with_amax_reduction = True + q.amax_reduction_group = group + q.internal = False + q.optimize_for_gemm = True + return q + + weights = self.weight_list if self.is_routed_expert and self.weight_list is not None else [self] + for quantizer, weight in zip(weight_quantizer, weights): + if quantizer is None: + continue + + weight._quantizer = _configure_quantizer(quantizer, weight.group) + weight.quantized = weight._quantizer.quantize(weight.get_padded_shard()) + weight.quantized.is_routed_expert = getattr(weight, 'is_routed_expert', False) + + @property + def _weights(self): + """Return the list of individual weight shards (self for non-routed, weight_list for routed).""" + weights = self.weight_list if self.is_routed_expert else [self] + # Safety: all weights must be in the same state. + assert all(w.state == weights[0].state for w in weights) + return list(weights) + + @property + def _unsharded_shape_padded(self): + out_shape = list(self.size()) + if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: + out_shape[0] = (out_shape[0]+ self.pad_length) * self.group.size() + else: + out_shape[0] = out_shape[0] * self.group.size() + return tuple(out_shape) + + @property + def _unsharded_shape(self): + out_shape = list(self._unsharded_shape_padded) + out_shape[0] -= self.pad_length + return tuple(out_shape) + + def get_padded_shard(self): + if self.pad_length > 0 and self.is_padded_last_rank: + return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) + return self + + def _set_state(self, new_state: ETPWeightState): + """Validate and update state machine transition.""" + assert new_state in _STATE_TRANSITIONS[self.state], \ + f"Invalid state transition: {self.state} -> {new_state}" + self.state = new_state + + def _get_cache_key(self, dtype, fwd: bool) -> tuple: + """Build cache key using output shape + dtype. + + Weights with matching gathered shape and dtype share a buffer. 
+ For expert weights gathered in parallel, self.expert_idx distinguishes them so + each gets a distinct buffer, while same-indexed experts across layers share. + """ + + if not isinstance(dtype, torch.dtype): + return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx) + return (self._unsharded_shape_padded, dtype, self.expert_idx) + + def _quantize_if_needed(self, skip_weight_cast=False, cast_noop_flag=None): + """Re-quantize sharded weight into existing buffer. Returns quantized weight or self.""" + if self._quantizer is None: + self.did_cast_to_low_precision = False + return self + + self._quantizer.set_usage(rowwise=True, columnwise=True) + if skip_weight_cast is False or cast_noop_flag is not None: + tex.quantize( + tensor=self.get_padded_shard(), + quantizer=self._quantizer, + output=self.quantized, + noop=cast_noop_flag, + ) + self.did_cast_to_low_precision = True + + return self.quantized + + def _strip_padding(self, tensor): + if self.pad_length == 0: + return tensor + + if isinstance(tensor, QuantizedTensor): + assert isinstance(tensor, (NVFP4TensorStorage, MXFP8TensorStorage)), \ + f"Unsupported quantized tensor type for ETP padding: {type(tensor)}" + + metadata = tensor.get_metadata() + if metadata.get("rowwise_data") is not None: + metadata["rowwise_data"] = metadata["rowwise_data"][:-self.pad_length] + if metadata.get("columnwise_data") is not None: + if isinstance(tensor, NVFP4TensorStorage): + # NVFP4 transposes columnwise and packs 2 values per byte + metadata["columnwise_data"] = metadata["columnwise_data"][ + ..., :-self.pad_length // 2 + ].contiguous() + else: + # MXFP8 columnwise is not transposed, strip first dim + metadata["columnwise_data"] = metadata["columnwise_data"][ + :-self.pad_length + ] + return type(tensor)(**metadata, shape=self._unsharded_shape, dtype=torch.bfloat16) + else: + return tensor[:-self.pad_length] + + def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nvtx_label=None): + 
"""Quantize (if needed) and all-gather weight. Returns (weight_total, handle).""" + weights = self._weights + + # 1. Transition state for async gathers. + if async_op: + for w in weights: + w._set_state(ETPWeightState.ASYNC_WAIT) + + # 2. Prepare: quantize, set usage direction. + for w in weights: + w._quantize_if_needed(skip_weight_cast, cast_noop_flag) + if w.did_cast_to_low_precision: + w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) + + # 3. Build gather inputs. + quantizers = [w._quantizer for w in weights] + if weights[0].did_cast_to_low_precision: + gather_weights = [w.quantized for w in weights] + else: + gather_weights = list(w.get_padded_shard() for w in weights) + + # 4. Cache checkout (async only — sync gathers don't need pooled buffers). + if async_op: + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + out_buffers = [] + for p, dt in zip(weights, dtypes): + assert p._cache_ticket is None, \ + f"Cache ticket leak: weight {id(p)} still has unreturned ticket {p._cache_ticket}" + buf, p._cache_ticket = get_global_ETP_cache().checkout(p, dt, fwd) + out_buffers.append(buf) + else: + out_buffers = None + + # 5. Communicate. 
+ etp_group = weights[0].group + if out_buffers is not None and len(gather_weights) > 1: + assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ + "Duplicate output buffers in batched all-gather — experts need distinct cache keys" + + if len(gather_weights) > 1: + nvtx_range_push(f"{nvtx_label}.batched_etp_ag") + results, handle = grouped_gather_along_first_dim( + gather_weights, etp_group, + async_op=async_op, + quantizers=quantizers, + output_tensors=out_buffers, + ) + nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + else: + nvtx_range_push(f"{nvtx_label}.etp_ag") + weight_total, handle = gather_along_first_dim( + gather_weights[0], etp_group, + quantizer=quantizers[0], + async_op=async_op, + output_tensor=out_buffers[0] if out_buffers is not None else None, + ) + nvtx_range_pop(f"{nvtx_label}.etp_ag") + results = [weight_total] + + result = results if self.is_routed_expert else results[0] + + # 6. Wrap handle. + if async_op: + handle = ETPShardHandle(handle, weights) + else: + handle = None + + return result, handle + + def _get_unsharded(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + """Get unsharded (all-gathered) weight tensor(s). + + Handles both routed experts (returns list) and single weights (returns tensor). + Supports sync gather, async prefetch wait, and cache retrieval. 
+ """ + weights = self._weights + + # Wait for async prefetch if in progress + if weights[0].state == ETPWeightState.ASYNC_WAIT: + self._prefetch_handle.wait() + self._prefetch_handle = None + + if weights[0].state == ETPWeightState.NONE: + # Synchronous all-gather (no cache — buffers allocated inline) + result, _ = self._all_gather_weight( + async_op=False, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, + ) + result = result if self.is_routed_expert else [result] + + elif weights[0].state == ETPWeightState.ASYNC_DONE: + # Retrieve prefetched results from cache + cache = get_global_ETP_cache() + result = [] + for w in weights: + buf = cache.get(w._cache_ticket) + w._cache_ticket = None + # Post-gather quantization safety net: weight was prefetched + # before weight_quantizer was set + if not w.did_cast_to_low_precision: + if w._quantizer is not None and not isinstance(buf, QuantizedTensor): + w._quantize_if_needed() + buf = w._quantizer.quantize(buf) + w._set_state(ETPWeightState.NONE) + result.append(buf) + else: + assert False, f"Unexpected state: {weights[0].state}" + + result = [self._strip_padding(r) for r in result] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, weights)] + return result if self.is_routed_expert else result[0] + + def all_gather_and_prefetch_bwd(self, nvtx_label=None): + """ + Backward variant: get current weight (from cache if prefetched, else + sync gather) and async-prefetch prev_w. + + Safe thanks to the coat-check cache: get() returns the current buffer + to the pool, and the prefetch's checkout() will allocate a separate + buffer if the pool is empty (i.e. the current buffer is still live + via the caller's tensor reference). 
+ + Returns: + weight_total + """ + result = self._get_unsharded(fwd=False, skip_weight_cast=True) + + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) + self.prev_w._prefetch_handle = handle + return result + + def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): + """Batched backward all-gather + prefetch. Wrapper around all_gather_and_prefetch_bwd.""" + return self.all_gather_and_prefetch_bwd(nvtx_label=nvtx_label) + + def all_gather_and_prefetch( + self, + fwd: bool = True, + skip_weight_cast: bool = False, + cast_noop_flag: torch.Tensor = None, + nvtx_label: str = None, + ): + """ + All-gather current weight and async-prefetch the next weight. + + Returns: + weight_total + """ + # Lazy population of linked list: link previous weight to current weight + cls = type(self) + if cls._first_weight_flag: + self.is_first_weight = True + cls._first_weight_flag = False + + if self.is_first_weight: + cls._last_weight = None + + if cls._last_weight is not None and cls._last_weight.next_w is None: + print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + cls._last_weight = self + + result = self._get_unsharded(fwd, skip_weight_cast=skip_weight_cast, cast_noop_flag=cast_noop_flag) + + if ETP_CONFIG.weight_prefetch and self.next_w is not None: + target = self.next_w + _, handle = target._all_gather_weight( + async_op=True, skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, fwd=fwd, nvtx_label=nvtx_label, + ) + target._prefetch_handle = handle + return result + + def batched_all_gather_and_prefetch(self, **kwargs): + """Batched all-gather + prefetch for expert weights. 
Wrapper around all_gather_and_prefetch.""" + return self.all_gather_and_prefetch(**kwargs) + + def get_wgrad_tensor(self): + return torch.empty( + self._unsharded_shape, + dtype=self.main_grad.dtype, + device=self.device, + requires_grad=False, + ) + + def register_grad_accum_hook(self, grad_accum_node, hook): + self._grad_accum_node = grad_accum_node + self._grad_accum_hook = hook + + @classmethod + def _resolve_pending_rs(cls, expected_next): + """Finish any pending reduce-scatter from a previous weight.""" + if cls._pending_rs_weight is not None: + assert cls._pending_rs_weight is expected_next + cls._pending_rs_weight.finish_wgrad_reduce_scatter() + cls._pending_rs_weight = None + + @staticmethod + def _apply_fused_wgrad(param, wgrad_rs): + """Apply fuse_wgrad_accumulation logic to a single param and return a dummy grad.""" + + # the last rank in the etp group pads the param, so need to remove the padding here + if param.group.rank() == param.group.size() - 1: + wgrad_rs = param._strip_padding(wgrad_rs) + + param.main_grad.add_(wgrad_rs) + # Handle mcore grad accum fusion + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + param.grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + if param._grad_accum_hook is not None: + param._grad_accum_hook(param) + + def finish_wgrad_reduce_scatter(self): + if self.wgrad_rs_handle is not None: + self.wgrad_rs_handle.wait() + self.wgrad_rs_handle = None + + for param, wgrad_rs in zip(self._weights, self.wgrad_rs): + if self.fuse_wgrad_accumulation: + self._apply_fused_wgrad(param, wgrad_rs) + else: + if param.is_padded_last_rank: + wgrad_rs = param._strip_padding(wgrad_rs) + param.grad = wgrad_rs + # Finally call the grad accum node + param._grad_accum_node(param.grad) + + def _reduce_scatter(self, wgrads, async_op): + """Reduce-scatter one or more wgrads. Returns (outputs, handle). + + Single tensor: plain reduce-scatter (no coalescing). 
+ Multiple tensors: coalesced reduce-scatter. + """ + + if self.pad_length > 0: + wgrads = [torch.nn.functional.pad(w, (0, 0, 0, self.pad_length)) for w in wgrads] + + if len(wgrads) == 1: + out, handle = reduce_scatter_along_first_dim( + wgrads[0], self.group, async_op=async_op + ) + return [out], handle + else: + outputs = [] + with torch.distributed._coalescing_manager( + group=self.group, + device=wgrads[0].device, + async_ops=async_op, + ) as cm: + for tensor in wgrads: + out, _ = reduce_scatter_along_first_dim(tensor, self.group) + outputs.append(out) + return outputs, cm if async_op else None + + def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): + """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. + + Accepts a single tensor (non-routed) or list of tensors (routed experts). + + Returns: + Single tensor or list for sync (last weight) — backward should return this. + None or tuple of Nones for async — backward should return this. + """ + batched = isinstance(wgrad, (list, tuple)) + wgrads = list(wgrad) if batched else [wgrad] + weights = self._weights + + self._resolve_pending_rs(self.next_w) + + if self.prev_w is None: + # Sync reduce-scatter (last weight in chain) + sharded, _ = self._reduce_scatter(wgrads, async_op=False) + if fuse_wgrad_accumulation: + [self._apply_fused_wgrad(p, g) for p, g in zip(weights, sharded)] + result = [None] * len(weights) + else: + result = [ + p._strip_padding(g) if p.is_padded_last_rank else g + for p, g in zip(weights, sharded) + ] + return result if batched else result[0] + else: + # Async reduce-scatter (not last weight — deferred finish) + self.fuse_wgrad_accumulation = fuse_wgrad_accumulation + self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) + type(self)._pending_rs_weight = self + return tuple([None] * len(wgrads)) if batched else None + + def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): + """Batched version of 
wgrad_reduce_scatter.""" + return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) + + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.detach: + with torch._C.DisableTorchFunctionSubclass(): + # Perform the raw detach + result = func(*args, **kwargs) + # Re-wrap it in your subclass so PyTorch is happy + return result.as_subclass(type(self)) + + # 2. For everything else (add, mul, etc.), be transparent/decay. + with torch._C.DisableTorchFunctionSubclass(): + return func(*args, **kwargs) + + +def print_rank_0(message, rank=None): + """If distributed is initialized or rank is specified, print only on rank 0.""" + if rank is not None: + if rank == 0: + print(message, flush=True) + elif torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + +class ETPWeightCache: + """ + Buffers are pooled by cache key (shape + dtype). Two operations: + + - ``checkout(param, dtype, fwd)`` → ``(buffer, ticket)`` + Takes a buffer from the pool (or allocates). Ticket is ``id(buf)``. + - ``get(ticket, param, dtype, fwd)`` → ``buffer`` + Retrieves the buffer, asserts key matches, returns it to the pool, + and invalidates the ticket. + + Every checkout is paired with exactly one get (1:1). + Two weights sharing the same cache key get distinct buffers if one + is still checked out, preventing aliasing. + """ + + # Bytes per element for known dtypes (used for logging). 
+ _BYTES_PER_ELEMENT = { + torch.bfloat16: 2, torch.float16: 2, torch.float32: 4, + tex.DType.kFloat4E2M1: 0.5, + tex.DType.kFloat8E4M3: 1, + } + + def __init__(self): + self._pool: Dict[tuple, List[torch.Tensor]] = defaultdict(list) + self._tickets: Dict[int, tuple] = {} # ticket → (key, buf) + self._free_tickets: list[int] = [] # recycled ticket IDs + self._max_ticket: int = 0 # high-water mark for ticket allocation + self._total_bytes: int = 0 # running total of allocated bytes + + @staticmethod + def _buf_bytes(shape, dtype) -> int: + """Estimate buffer size in bytes.""" + numel = 1 + for d in shape: + numel *= d + bpe = ETPWeightCache._BYTES_PER_ELEMENT.get(dtype, None) + return numel * bpe + + def _allocate_buffer(self, param: 'ETPShardedParam', dtype) -> torch.Tensor: + out_shape = param._unsharded_shape_padded + if not isinstance(dtype, torch.dtype): + quantizer = param._quantizer + assert quantizer is not None + assert quantizer.rowwise_usage ^ quantizer.columnwise_usage + + device = torch.cuda.current_device() + buf = param._quantizer.make_empty(out_shape, dtype=torch.bfloat16, device=device) + else: + buf = torch.empty( + out_shape, dtype=dtype, device=param.device, memory_format=torch.contiguous_format + ) + buf_bytes = self._buf_bytes(out_shape, dtype) + self._total_bytes += buf_bytes + print_rank_0( + f"[ETP Cache] +{buf_bytes / 1024**2:.1f} MB (shape={out_shape}, dtype={dtype}) " + f"total={self._total_bytes / 1024**2:.1f} MB" + ) + return buf + + def checkout(self, param: 'ETPShardedParam', dtype, fwd: bool): + """Get a buffer for all-gather output. Returns (buffer, ticket). + + Ticket IDs are recycled so they stay bounded. + If all buffers for this key are checked out, allocates a new one. 
+ """ + key = param._get_cache_key(dtype, fwd) + pool = self._pool[key] + buf = pool.pop() if pool else self._allocate_buffer(param, dtype) + + if self._free_tickets: + ticket = self._free_tickets.pop() + else: + ticket = self._max_ticket + self._max_ticket += 1 + self._tickets[ticket] = (key, buf) + return buf, ticket + + def get(self, ticket: int) -> torch.Tensor: + """Retrieve buffer by ticket and return it to the pool. + + This combines the old get + ticket_return into a single call. + After this call the ticket is invalidated and the buffer is + available for future checkouts. + """ + assert ticket in self._tickets, f"Invalid ticket: {ticket}" + key, buf = self._tickets.pop(ticket) + self._free_tickets.append(ticket) + self._pool[key].append(buf) + return buf + + +def get_global_ETP_cache() -> ETPWeightCache: + """Get or lazily create the global cache instance.""" + global _ALL_GATHER_BUFFER + if _ALL_GATHER_BUFFER is None: + _ALL_GATHER_BUFFER = ETPWeightCache() + return _ALL_GATHER_BUFFER + + +@dataclass +class BatchedNVFP4AllGatherAsyncHandle: + """Handle for batched asynchronous NVFP4 all-gathers.""" + output_handles: List[_NVFP4AllGatherAsyncHandle] + outer_async_handle: torch.distributed.Work + _synchronized: bool = False + + def wait(self) -> None: + """Wait for the async operation to complete and post-process the tensor.""" + if self._synchronized: + return + self.outer_async_handle.wait() + # Fixes interleaved data for transposed tensor/scale inv and pads scale inv if needed. 
+ for output_handle in self.output_handles: + if output_handle is not None: + assert output_handle.async_handle is None + output_handle.post_process_nvfp4_gather() + # release any tensor references just in case + output_handle.output = None + output_handle.columnwise_data_interleaved = None + output_handle.columnwise_scale_inv_interleaved = None + + self._synchronized = True + + +def grouped_gather_along_first_dim( + weights: list, + process_group, + async_op: bool = False, + quantizers: list = None, + output_tensors: list = None, +): + """ + All-gather multiple weights in a single coalesced operation. + + Handles NVFP4 post-processing for both sync and async paths. + """ + # Determine device from first weight. + inp = weights[0] + if isinstance(inp, NVFP4TensorStorage): + device = ( + inp._rowwise_data.device if inp._rowwise_data is not None + else inp._columnwise_data.device + ) + else: + device = inp.device + + weights_all = [] + weight_handles = [] + with torch.distributed._coalescing_manager( + group=process_group, device=device, async_ops=async_op, + ) as gather_coalescing_manager: + for i, weight in enumerate(weights): + weight_all, weight_handle = gather_along_first_dim( + weight, process_group, + quantizer=quantizers[i], + output_tensor=output_tensors[i] if output_tensors is not None else None, + grouped=True, + ) + weights_all.append(weight_all) + weight_handles.append(weight_handle) + + if async_op: + handle = gather_coalescing_manager + if ( + quantizers is not None + and getattr(quantizers[0], "columnwise_usage", False) + ): + handle = BatchedNVFP4AllGatherAsyncHandle(weight_handles, handle) + else: + for wh in weight_handles: + if isinstance(wh, _NVFP4AllGatherAsyncHandle): + wh.post_process_nvfp4_gather() + handle = None + + return weights_all, handle diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index f3e7b57cf1..58c4e3b130 100644 --- 
a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -5,6 +5,7 @@ """GroupedLinear API""" from typing import Union, Optional, Callable, Tuple, List from itertools import chain +import traceback import warnings import functools @@ -22,6 +23,7 @@ _2X_ACC_WGRAD, ) from ._common import WeightGradStore +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( divide, @@ -32,6 +34,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, is_fp8_activation_recompute_enabled, @@ -43,9 +46,9 @@ from ..constants import GemmParallelModes, dist_group_type from ..jit import no_torch_dynamo from ..cpu_offload import is_cpu_offload_enabled, mark_not_offload, start_offload - from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer from ..quantized_tensor import ( + QuantizedTensor, QuantizedTensorStorage, Quantizer, prepare_for_saving, @@ -96,6 +99,8 @@ def forward( skip_fp8_weight_update, save_original_input, debug, + etp_size, + recompute, ) = non_tensor_args num_gemms = len(m_splits) @@ -104,6 +109,14 @@ def forward( device = inp.device weight_requires_grad = weights[0].requires_grad + if etp_size > 1: + weights_etp_sharded = weights + weights = weights[0].batched_all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + # Configure quantizers if save_original_input and isinstance(input_quantizers[0], Float8Quantizer): raise ValueError("DelayedScaling recipe is not supported with save_original_input") @@ -165,7 +178,7 @@ def forward( start_offload(*inputmats) # Initialize weights - weights_fp8: list + weights_fp8: list if fp8 or debug: # FP8 cast to workspace buffer weights_fp8 = [] @@ -180,7 +193,6 @@ def forward( 
workspace_dtype=activation_dtype, ) weights_fp8.append(weight_fp8) - else: weights_fp8 = [cast_if_needed(weight, activation_dtype) for weight in weights] @@ -257,12 +269,20 @@ def forward( for weight in weights: ctx.weight_objects.append(weight) - tensors_to_save, tensor_objects = prepare_for_saving( - *inputmats, - *weights_fp8, - *weights, - *biases, - ) + if etp_size == 1: + tensors_to_save, tensor_objects = prepare_for_saving( + *inputmats, + *weights_fp8, + *weights, + *biases, + ) + else: + tensors_to_save, tensor_objects = prepare_for_saving( + *inputmats, + *weights_etp_sharded, + *biases, + ) + ctx.save_for_backward(*tensors_to_save) ctx.tensor_objects = tensor_objects @@ -278,6 +298,8 @@ def forward( if hasattr(weights[0], "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_funcs = [weights[i].get_main_grad for i in range(num_gemms)] + elif etp_size > 1: + ctx.main_grad_funcs = [weights_etp_sharded[i].get_wgrad_tensor for i in range(num_gemms)] else: ctx.main_grad_funcs = [ lambda j=i: weights[j].main_grad for i in range(num_gemms) @@ -308,6 +330,8 @@ def forward( ctx.debug = debug ctx.save_original_input = save_original_input ctx.input_quantizers = input_quantizers + ctx.etp_size = etp_size + ctx.recompute = recompute # [*, in_features] -> [*, out_features] except first dimension changes for SP return out.view(-1, *inp.shape[1:-1], out.shape[-1]) @@ -318,11 +342,19 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], with get_nvtx_range_context("_GroupedLinear_backward"): saved_tensors = restore_from_saved(ctx.tensor_objects, ctx.saved_tensors) N = ctx.num_gemms - inputmats = saved_tensors[:N] - weights = saved_tensors[N : 2 * N] - origin_weights = saved_tensors[2 * N : 3 * N] - biases = saved_tensors[3 * N : 4 * N] - main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] + + if ctx.etp_size == 1: + inputmats = saved_tensors[:N] + weights = saved_tensors[N : 2 * N] + 
origin_weights = saved_tensors[2 * N : 3 * N] + biases = saved_tensors[3 * N : 4 * N] + else: + inputmats = saved_tensors[:N] + origin_weights = saved_tensors[N : 2 * N] + biases = saved_tensors[2 * N : 3 * N] + + if ctx.fuse_wgrad_accumulation: + main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: @@ -330,10 +362,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None - if ctx.fuse_wgrad_accumulation: - for i in range(N): - origin_weights[i].main_grad = main_grads[i] - # Preprocess grad output grad_output_view = grad_output.contiguous().view(-1, grad_output.shape[-1]) grad_output = [None] * ctx.num_gemms @@ -383,13 +411,20 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], ctx.m_splits, ) - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) else: accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation + if ctx.etp_size > 1: + weights = origin_weights[0].batched_all_gather_and_prefetch_bwd( + nvtx_label="te._GroupedLinear.bwd", + ) + if ctx.requires_dgrad: dgrad_gemm_use_split_accumulator = _2X_ACC_DGRAD if ctx.fp8 or ctx.debug: @@ -421,7 +456,32 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) + def handle_custom_ddp_from_mcore(weight, wgrad): + if ctx.weights_requires_grad: + # Handle custom DDP from mcore. 
+ if ctx.fuse_wgrad_accumulation and hasattr( + weight, "grad_added_to_main_grad" + ): + weight.grad_added_to_main_grad = True + if getattr(weight, "zero_out_wgrad", False): + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + zero=True, + ) + else: + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + ) + elif ctx.fuse_wgrad_accumulation: + wgrad = None + else: + wgrad = None + return wgrad + if ctx.weights_requires_grad: + """Wgrad computation.""" wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: recipe = ctx.fp8_recipe @@ -429,6 +489,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], wgrad_gemm_use_split_accumulator = ( recipe.fp8_gemm_wgrad.use_split_accumulator ) + if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: @@ -476,7 +537,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=wgrad_gemm_use_split_accumulator, accumulate=( accumulate_wgrad_into_param_main_grad - if not getattr(weights[0], "overwrite_main_grad", False) + if ctx.etp_size == 1 and not getattr(weights[0], "overwrite_main_grad", False) else False ), ) @@ -494,36 +555,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Deallocate input tensor clear_tensor_data(*inputmats) - def handle_custom_ddp_from_mcore(weight, wgrad): - if ctx.weights_requires_grad: - # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr( - weight, "grad_added_to_main_grad" - ): - weight.grad_added_to_main_grad = True - if getattr(weight, "zero_out_wgrad", False): - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - zero=True, - ) - else: - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - ) - elif ctx.fuse_wgrad_accumulation: - wgrad = None - else: - wgrad = None - return wgrad - + if ctx.etp_size > 1: + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation: wgrad_list = [ - handle_custom_ddp_from_mcore(weight, wgrad) - for weight, wgrad in zip(origin_weights, wgrad_list) - ] - else: - wgrad_list = [None] * ctx.num_gemms + handle_custom_ddp_from_mcore(weight, wgrad) + for weight, wgrad in zip(origin_weights, wgrad_list) + ] if not ctx.use_bias or ( ctx.wgrad_store is not None @@ -630,6 +668,8 @@ def __init__( save_original_input: bool = False, single_grouped_parameter: bool = False, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, + recompute: bool = False, ) -> None: super().__init__(name) @@ -682,6 +722,13 @@ def __init__( "Because the TP communication is handled outside of this module." ) + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + assert tp_size == 1, f"TODO(shiqingf): ETP+TP is not well supported yet." 
+ self.recompute = recompute + self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -734,6 +781,10 @@ def __init__( is_meta = torch.device(device).type == "meta" self.reset_parameters(defer_init=is_meta) + if etp_group is not None: + weight_names = [f"weight{idx}" for idx in range(self.num_gemms)] + wrap_module_params_etp(self, weight_names, etp_group, is_grouped=True) + if self.wgrad_store.delay_wgrad_compute(): for name, param in self.named_parameters(): for i in range(self.num_gemms): @@ -887,6 +938,11 @@ def forward( weight_tensors = self._get_weight_tensors() bias_tensors = [getattr(self, f"bias{i}") for i in range(self.num_gemms)] + if self.etp_size > 1: + weight_tensors[0].setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = self._get_quantizers() if not debug else self._get_debug_quantizers() if debug: @@ -932,6 +988,8 @@ def forward( None, # skip_fp8_weight_update self.save_original_input, debug, + self.etp_size, + self.recompute, ) out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index ce0581024a..2dda7d8812 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -26,6 +26,7 @@ _2X_ACC_DGRAD, _2X_ACC_WGRAD, ) +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( assert_dim_for_fp8_exec, @@ -42,6 +43,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -71,8 +73,10 @@ mark_not_offload, mark_activation_offload, ) +from ..tensor.nvfp4_tensor import NVFP4Tensor from ..tensor.storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage from 
..tensor.storage.mxfp8_tensor_storage import MXFP8TensorStorage + from ..export import is_in_onnx_export_mode, assert_warmed_up from ..cpp_extensions import ( @@ -140,6 +144,7 @@ def forward( skip_fp8_weight_update, symmetric_ar_type, debug, + etp_size, ) = non_tensor_args # NVTX label for profiling @@ -286,6 +291,15 @@ def forward( # ------------------------------------------------------ # Prepare weight tensor # ------------------------------------------------------ + + if etp_size > 1: + weight_etp_sharded = weight + weight = weight.all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + weightmat = weight is_weight_param_quantized = False if fp8 or debug: @@ -368,6 +382,7 @@ def forward( extra_output=reduce_scatter_out, ) nvtx_range_pop(f"{nvtx_label}.gemm") + # ------------------------------------------------------ # Finished forward GEMM... # ------------------------------------------------------ @@ -400,7 +415,7 @@ def forward( nvtx_range_pop(f"{nvtx_label}.row_parallel_comm") else: out = gemm_out - out = out.view(-1, *inp_shape[1:-1], out_features) + out = out.view(-1, *inp_shape[1:-1], out.shape[-1]) # ------------------------------------------------------ # Output tensor is ready to return... # ------------------------------------------------------ @@ -463,8 +478,9 @@ def forward( tensors_to_save, tensor_objects = prepare_for_saving( inputmat, - weightmat, - weight, + # For ETP, avoid keeping the gathered weightmat in memory for memory saving. 
+ weightmat if etp_size == 1 else None, + weight if etp_size == 1 else weight_etp_sharded, bias, ln_weight, ln_out, @@ -483,6 +499,8 @@ def forward( if hasattr(weight, "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_func = weight.get_main_grad + elif etp_size > 1: + ctx.main_grad_func = weight_etp_sharded.get_wgrad_tensor else: ctx.main_grad_func = lambda: weight.main_grad ctx.grad_input_quantizer = grad_input_quantizer @@ -523,6 +541,7 @@ def forward( FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module ctx.wgrad_store = wgrad_store ctx.debug = debug + ctx.etp_size = etp_size # ------------------------------------------------------ # Cached state for backward pass is ready... @@ -567,7 +586,7 @@ def backward( # Since main_grad can be modified inplace, it should not be a part of saved_tensors main_grad = ( ctx.main_grad_func() - if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad + if origin_weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad else None ) @@ -590,7 +609,7 @@ def backward( if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: origin_weight = ctx.weight_object - if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation: + if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: origin_weight.main_grad = main_grad # Configure Userbuffers communication (comm+GEMM overlap) @@ -640,13 +659,14 @@ def backward( # Prepare grad output tensor # Note: Cast to expected dtype and perform tensor-parallel communication + grad_output = grad_outputs[0] nvtx_range_push(f"{nvtx_label}.grad_output_preprocess") ( grad_output, grad_bias, ) = TransformerEngineBaseModule.grad_output_preprocess( ctx, - grad_outputs[0], + grad_output, ctx.parallel_mode == "row", ctx.grad_output_quantizer, ) @@ -702,6 +722,10 @@ def backward( # -------------------------------------------------- # Make sure required data is available + if ctx.etp_size > 1: + weight = 
origin_weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) if ctx.weight_quantizer is not None and isinstance(weight, QuantizedTensorStorage): @@ -843,7 +867,11 @@ def backward( use_split_accumulator = recipe.fp8_gemm_wgrad.use_split_accumulator # Figure out whether to output wgrad GEMM directly into main grad - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + # When ETP is enabled, GA is always disabled. ETP Wgrad workflow: + # allocte wgrad_out tmp buffer -> RS(wgrad_gemm) -> GradientAccumulation + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) @@ -910,6 +938,7 @@ def wgrad_gemm( "with Userbuffers (tensor-parallel communication overlapping)" ) ctx.wgrad_store.put([ln_out_total, grad_output], wgrad_gemm) + assert False, f"TODO(shiqingf): not supported for ETP..." else: # Call wgrad GEMM now @@ -941,9 +970,8 @@ def wgrad_gemm( else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - # -------------------------------------------------- - # Grad weight has been computed... - # -------------------------------------------------- + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) # Don't return grad bias if not needed if not ctx.use_bias: @@ -992,7 +1020,9 @@ def wgrad_gemm( clear_tensor_data(mu) clear_tensor_data(rsigma) - if ctx.requires_wgrad: + if ctx.etp_size > 1: + wgrad = None + elif ctx.requires_wgrad: # Handle custom DDP from mcore. 
if ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True @@ -1160,6 +1190,7 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1190,6 +1221,10 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -1199,6 +1234,7 @@ def __init__( self.out_features = divide(self.out_features, self.tp_size) elif self.parallel_mode == "row": self.in_features = divide(self.in_features, self.tp_size) + self.tp_out_features = self.out_features if init_method is None: init_method = get_default_init_method() @@ -1382,6 +1418,10 @@ def __init__( self.reset_parameters(defer_init=device == "meta") + if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.parallel_mode == "row" and self.apply_bias: @@ -1402,6 +1442,7 @@ def __init__( if name in self.weight_names or name in self.bias_names: param.skip_backward_post_hook = True + def set_meta_tensor(self, fwd: bool, recipe: Recipe) -> None: """Init scales and amaxes for fwd | bwd.""" super().set_meta_tensor(fwd, recipe) @@ -1445,6 +1486,13 @@ def reset_parameters(self, defer_init=False): dim=1 if self.parallel_mode == "row" else 0, stride=1, ) + if self.etp_size > 1: + set_extended_tensor_parallel_attributes( + tensor=getattr(self, weight), + is_parallel=True, + dim=0, # ETP always shard along the first dim. 
+ stride=1, + ) # Set parallelism attributes for linear biases if self.use_bias: @@ -1516,6 +1564,11 @@ def forward( # Get concatenated weight and bias tensors weight_tensor, bias_tensor = self._get_weight_and_bias_tensors() + if self.etp_size > 1: + weight_tensor.setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = ( self._get_quantizers(fp8_output, fp8_grad, is_grad_enabled) if not debug @@ -1580,6 +1633,7 @@ def forward( skip_fp8_weight_update, self.symmetric_ar_type, debug, + self.etp_size, ) out = fwd_fn( *autograd_ctx, diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index 16e620fd94..f204383166 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -47,6 +47,7 @@ get_nvtx_range_context, ) from ..distributed import ( + set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -1799,6 +1800,7 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, checkpoint: bool = False, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1843,6 +1845,11 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + if init_method is None: init_method = get_default_init_method() if output_layer_init_method is None: @@ -2007,6 +2014,9 @@ def reset_parameters(self, defer_init=False): # Set parallel attributes for linear parameters set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) + if self.etp_size > 1: + set_extended_tensor_parallel_attributes(self.fc1_weight, True, 0, 1) + set_extended_tensor_parallel_attributes(self.fc2_weight, True, 0, 1) if self.use_bias: 
set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) if self.set_parallel_mode: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 31dac4d329..20f4799167 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -25,6 +25,7 @@ _2X_ACC_WGRAD, ) from ._common import noop_cat, WeightGradStore +from .extended_tensor_parallelism import wrap_module_params_etp from ..quantization import FP8GlobalStateManager from ..utils import ( cast_if_needed, @@ -128,6 +129,7 @@ def forward( symmetric_ar_type, save_original_input, debug, + etp_size, ) = non_tensor_args # NVTX label for profiling @@ -249,6 +251,15 @@ def forward( # ------------------------------------------------------ # Prepare weight tensor # ------------------------------------------------------ + + if etp_size > 1: + weight_etp_sharded = weight + weight = weight.all_gather_and_prefetch( + fwd=True, + skip_weight_cast=is_first_microbatch is False, + cast_noop_flag=skip_fp8_weight_update, + ) + weightmat = weight if fp8 or debug: # Configure quantizer @@ -434,8 +445,8 @@ def forward( # TODO(ksivamani): Check memory usage tensors_to_save, tensor_objects = prepare_for_saving( saved_inputmat, - weightmat, - weight, + weightmat if etp_size == 1 else None, + weight if etp_size == 1 else weight_etp_sharded, bias, ) ctx.save_for_backward(*tensors_to_save) @@ -456,6 +467,8 @@ def forward( if hasattr(weight, "__fsdp_param__"): # MCore FSDP creates main_grad lazily before backward ctx.main_grad_func = weight.get_main_grad + elif etp_size > 1: + ctx.main_grad_func = weight_etp_sharded.get_wgrad_tensor else: ctx.main_grad_func = lambda: weight.main_grad @@ -486,6 +499,7 @@ def forward( if in_fp8_activation_recompute_phase(): FP8GlobalStateManager.IS_FIRST_FP8_MODULE = _first_fp8_module ctx.wgrad_store = wgrad_store + ctx.etp_size = etp_size # ------------------------------------------------------ # Cached state 
for backward pass is ready... @@ -522,7 +536,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: weight = ctx.weight_object - if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation: + if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: weight.main_grad = main_grad # Gather intermediate/activation tensors if needed @@ -684,6 +698,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # Compute grad input tensor # -------------------------------------------------- + if ctx.etp_size > 1: + weight_fp8 = weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + dgrad = None dgrad_work = None if ctx.requires_dgrad: @@ -832,7 +850,9 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator = recipe.fp8_gemm_wgrad.use_split_accumulator # Figure out whether to output wgrad GEMM directly into main grad - if ctx.is_first_microbatch is not None: + if ctx.etp_size > 1: + accumulate_wgrad_into_param_main_grad = False + elif ctx.is_first_microbatch is not None: accumulate_wgrad_into_param_main_grad = ( ctx.fuse_wgrad_accumulation and not ctx.is_first_microbatch ) @@ -943,6 +963,8 @@ def wgrad_gemm( dgrad_work.wait() dgrad_work = None + if ctx.etp_size > 1: + wgrad = weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) if ctx.requires_wgrad: # Handle custom DDP from mcore. 
if ( @@ -1098,6 +1120,7 @@ def __init__( symmetric_ar_type: Optional[str] = None, save_original_input: bool = False, name: Optional[str] = None, + etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1126,6 +1149,11 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() + if etp_group is None: + self.etp_size = 1 + else: + self.etp_size = get_distributed_world_size(etp_group) + self.parallel_mode = parallel_mode assert ( self.parallel_mode in GemmParallelModes @@ -1297,6 +1325,10 @@ def __init__( self.reset_parameters(defer_init=device == "meta") + if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor + # For RPL, bias has to be added after TP collectives # So it cannot be fused with the GEMM if self.parallel_mode == "row" and self.apply_bias: @@ -1399,6 +1431,11 @@ def forward( try: weight_tensor, bias_tensor = self._get_weight_and_bias_tensors() + if self.etp_size > 1: + weight_tensor.setup( + weight_quantizer=self._get_weight_quantizers(), + ) + quantizers = ( self._get_quantizers(fp8_output, fp8_grad, is_grad_enabled) if not debug @@ -1459,6 +1496,7 @@ def forward( self.symmetric_ar_type, self.save_original_input, debug, + self.etp_size, ) out = linear_fn( *autograd_ctx, diff --git a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py index e7509f3994..0b6830c1d0 100644 --- a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py +++ b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py @@ -297,6 +297,18 @@ def view(self, shape: torch.Size): with_gemm_swizzled_scales=self._with_gemm_swizzled_scales, ) + def copy_(self, tensor: NVFP4TensorStorage): + assert tensor._fp4_dtype == self._fp4_dtype + + self._rowwise_data.copy_(tensor._rowwise_data) + self._columnwise_data.copy_(tensor._columnwise_data) + 
self._rowwise_scale_inv.copy_(tensor._rowwise_scale_inv) + self._columnwise_scale_inv.copy_(tensor._columnwise_scale_inv) + self._amax_rowwise.copy_(tensor._amax_rowwise) + self._amax_columnwise.copy_(tensor._amax_columnwise) + self._quantizer = tensor._quantizer.copy() if tensor._quantizer is not None else None + + def __repr__(self): data_rowwise = self.dequantize() diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index 47af9fabe1..c225cb3009 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -430,6 +430,46 @@ def cast_if_needed(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: with torch.enable_grad(): return tensor.to(dtype=dtype) +def mask_distributed_columns_graph_compatible(tensor, num_mask_cols, etp_size, valid_indices=None): + """ + CUDA graph compatible version using index_select. + + Args: + tensor: [M, N] input tensor + num_mask_cols: total columns to mask + etp_size: number of chunks + valid_indices: pre-computed valid column indices (optional, will be computed if None) + + Returns: + result: [M, N - num_mask_cols] tensor with masked columns removed + valid_indices: indices of valid columns (for reuse) + """ + # Sanity check + assert num_mask_cols > 0 and etp_size > 1 + _, N = tensor.shape + + chunk_size = N // etp_size + mask_per_chunk = num_mask_cols // etp_size + assert num_mask_cols % etp_size == 0 and mask_per_chunk >= 1 + + # Pre-compute valid indices if not provided + if valid_indices is None: + # Build list of valid column indices + indices_list = [] + for chunk_idx in range(etp_size): + chunk_start = chunk_idx * chunk_size + chunk_end = chunk_start + chunk_size + valid_end = chunk_end - mask_per_chunk + indices_list.extend(range(chunk_start, valid_end)) + + # Allocated during warmup of CG. 
+ valid_indices = torch.tensor(indices_list, dtype=torch.long, device=tensor.device) + + # Use index_select instead of boolean indexing (CUDA graph compatible) + result = torch.index_select(tensor, dim=1, index=valid_indices) + + return result, valid_indices + def check_dim_for_fp8_exec(tensor: torch.Tensor) -> bool: """Check if tensor dimensions are supported for FP8 TN GEMM""" From 4e0e39db354f443db098475770fd89141b36fe14 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 5 Mar 2026 15:42:53 -0800 Subject: [PATCH 02/43] cleanup Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 23 +---- transformer_engine/pytorch/module/base.py | 6 -- .../pytorch/module/grouped_linear.py | 86 +++++++++---------- .../pytorch/module/layernorm_linear.py | 32 ++----- .../pytorch/module/layernorm_mlp.py | 10 --- .../tensor/storage/nvfp4_tensor_storage.py | 12 --- transformer_engine/pytorch/utils.py | 40 --------- 7 files changed, 51 insertions(+), 158 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index dc5d8b3063..20c617ddb7 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -10,7 +10,7 @@ from functools import lru_cache from dataclasses import dataclass import math -from typing import Any, Callable, ContextManager, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import warnings import torch @@ -60,14 +60,6 @@ "partition_stride": 1, } - -_EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS = { - 'etp_model_parallel': False, - 'etp_partition_dim': -1, - 'etp_partition_stride': 1, -} - - _USE_REENTRANT_ACTIVATION_RECOMPUTE = True _FP8_ACTIVATION_RECOMPUTE_ENABLED = False @@ -167,19 +159,6 @@ def set_tensor_model_parallel_attributes( setattr(tensor, "partition_stride", stride) -def set_extended_tensor_parallel_attributes( - tensor: torch.Tensor, is_parallel: bool, dim: int, stride: 
int -) -> None: - """Set ps attributes to tensor.""" - # Make sure the attributes are not set. - for attribute in _EXTENDED_TENSOR_MODEL_PARALLEL_ATTRIBUTE_DEFAUTLTS: - assert not hasattr(tensor, attribute) - # Set the attributes. - setattr(tensor, 'etp_model_parallel', is_parallel) - setattr(tensor, 'etp_partition_dim', dim) - setattr(tensor, 'etp_partition_stride', stride) - - @lru_cache def get_distributed_world_size(group: Optional[dist_group_type] = None) -> int: """Return world size for the distributed group.""" diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index b565a40f87..9c21141a39 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -631,7 +631,6 @@ def __init__(self, name: Optional[str] = None) -> None: self.activation_dtype: Optional[torch.dtype] = None self.wgrad_accumulation_and_reduce_hooks = [] self.wgrad_store = None - self.etp_size = 1 if not TEDebugState.debug_enabled: TEDebugState.initialize() @@ -957,8 +956,6 @@ def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> N self.fast_setattr("tp_group", tp_group) self.fast_setattr("tp_group_initialized", True) - - def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: """returns the FP8 weights.""" fp8_params = [] @@ -1245,9 +1242,6 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None: for name, param in self.named_parameters(recurse=False): # Check if parameter is a DTensor (FSDP2) or regular tensor is_dtensor = isinstance(param, DTensor) - from .extended_tensor_parallelism import ETPShardedParam - is_etp = isinstance(param, ETPShardedParam) - dtensor_param = param if is_dtensor else None # Need to update/quantize local tensor in case of DTensor param = param._local_tensor if is_dtensor else param diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 58c4e3b130..fd55f84b3c 
100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -34,7 +34,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, is_fp8_activation_recompute_enabled, @@ -46,9 +45,9 @@ from ..constants import GemmParallelModes, dist_group_type from ..jit import no_torch_dynamo from ..cpu_offload import is_cpu_offload_enabled, mark_not_offload, start_offload + from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer from ..quantized_tensor import ( - QuantizedTensor, QuantizedTensorStorage, Quantizer, prepare_for_saving, @@ -100,7 +99,6 @@ def forward( save_original_input, debug, etp_size, - recompute, ) = non_tensor_args num_gemms = len(m_splits) @@ -178,7 +176,7 @@ def forward( start_offload(*inputmats) # Initialize weights - weights_fp8: list + weights_fp8: list if fp8 or debug: # FP8 cast to workspace buffer weights_fp8 = [] @@ -193,6 +191,7 @@ def forward( workspace_dtype=activation_dtype, ) weights_fp8.append(weight_fp8) + else: weights_fp8 = [cast_if_needed(weight, activation_dtype) for weight in weights] @@ -331,7 +330,6 @@ def forward( ctx.save_original_input = save_original_input ctx.input_quantizers = input_quantizers ctx.etp_size = etp_size - ctx.recompute = recompute # [*, in_features] -> [*, out_features] except first dimension changes for SP return out.view(-1, *inp.shape[1:-1], out.shape[-1]) @@ -342,7 +340,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], with get_nvtx_range_context("_GroupedLinear_backward"): saved_tensors = restore_from_saved(ctx.tensor_objects, ctx.saved_tensors) N = ctx.num_gemms - if ctx.etp_size == 1: inputmats = saved_tensors[:N] weights = saved_tensors[N : 2 * N] @@ -352,9 +349,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], inputmats = saved_tensors[:N] 
origin_weights = saved_tensors[N : 2 * N] biases = saved_tensors[2 * N : 3 * N] - - if ctx.fuse_wgrad_accumulation: - main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] + main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs] if ctx.cpu_offloading: if ctx.grad_added_to_main_grad: @@ -362,6 +357,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None + if ctx.fuse_wgrad_accumulation: + for i in range(N): + origin_weights[i].main_grad = main_grads[i] + # Preprocess grad output grad_output_view = grad_output.contiguous().view(-1, grad_output.shape[-1]) grad_output = [None] * ctx.num_gemms @@ -456,32 +455,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) - def handle_custom_ddp_from_mcore(weight, wgrad): - if ctx.weights_requires_grad: - # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr( - weight, "grad_added_to_main_grad" - ): - weight.grad_added_to_main_grad = True - if getattr(weight, "zero_out_wgrad", False): - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - zero=True, - ) - else: - wgrad = get_dummy_wgrad( - list(weight.main_grad.shape), - weight.dtype, - ) - elif ctx.fuse_wgrad_accumulation: - wgrad = None - else: - wgrad = None - return wgrad - if ctx.weights_requires_grad: - """Wgrad computation.""" wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: recipe = ctx.fp8_recipe @@ -489,7 +463,6 @@ def handle_custom_ddp_from_mcore(weight, wgrad): wgrad_gemm_use_split_accumulator = ( recipe.fp8_gemm_wgrad.use_split_accumulator ) - if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: @@ -555,13 +528,39 @@ def handle_custom_ddp_from_mcore(weight, wgrad): # Deallocate input tensor clear_tensor_data(*inputmats) - if ctx.etp_size > 1: - wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) - elif ctx.fuse_wgrad_accumulation: - wgrad_list = [ - handle_custom_ddp_from_mcore(weight, wgrad) - for weight, wgrad in zip(origin_weights, wgrad_list) - ] + def handle_custom_ddp_from_mcore(weight, wgrad): + if ctx.weights_requires_grad: + # Handle custom DDP from mcore. 
+ if ctx.fuse_wgrad_accumulation and hasattr( + weight, "grad_added_to_main_grad" + ): + weight.grad_added_to_main_grad = True + if getattr(weight, "zero_out_wgrad", False): + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + zero=True, + ) + else: + wgrad = get_dummy_wgrad( + list(weight.main_grad.shape), + weight.dtype, + ) + elif ctx.fuse_wgrad_accumulation: + wgrad = None + else: + wgrad = None + return wgrad + + if ctx.etp_size > 1: + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation: + wgrad_list = [ + handle_custom_ddp_from_mcore(weight, wgrad) + for weight, wgrad in zip(origin_weights, wgrad_list) + ] + else: + wgrad_list = [None] * ctx.num_gemms if not ctx.use_bias or ( ctx.wgrad_store is not None @@ -669,7 +668,6 @@ def __init__( single_grouped_parameter: bool = False, name: Optional[str] = None, etp_group: Optional[dist_group_type] = None, - recompute: bool = False, ) -> None: super().__init__(name) @@ -727,7 +725,6 @@ def __init__( else: self.etp_size = get_distributed_world_size(etp_group) assert tp_size == 1, f"TODO(shiqingf): ETP+TP is not well supported yet." 
- self.recompute = recompute self.parallel_mode = parallel_mode assert ( @@ -989,7 +986,6 @@ def forward( self.save_original_input, debug, self.etp_size, - self.recompute, ) out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 2dda7d8812..f6f24f6a5f 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -43,7 +43,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -73,10 +72,8 @@ mark_not_offload, mark_activation_offload, ) -from ..tensor.nvfp4_tensor import NVFP4Tensor from ..tensor.storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage from ..tensor.storage.mxfp8_tensor_storage import MXFP8TensorStorage - from ..export import is_in_onnx_export_mode, assert_warmed_up from ..cpp_extensions import ( @@ -382,7 +379,6 @@ def forward( extra_output=reduce_scatter_out, ) nvtx_range_pop(f"{nvtx_label}.gemm") - # ------------------------------------------------------ # Finished forward GEMM... 
# ------------------------------------------------------ @@ -586,7 +582,7 @@ def backward( # Since main_grad can be modified inplace, it should not be a part of saved_tensors main_grad = ( ctx.main_grad_func() - if origin_weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad + if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad else None ) @@ -659,14 +655,13 @@ def backward( # Prepare grad output tensor # Note: Cast to expected dtype and perform tensor-parallel communication - grad_output = grad_outputs[0] nvtx_range_push(f"{nvtx_label}.grad_output_preprocess") ( grad_output, grad_bias, ) = TransformerEngineBaseModule.grad_output_preprocess( ctx, - grad_output, + grad_outputs[0], ctx.parallel_mode == "row", ctx.grad_output_quantizer, ) @@ -938,7 +933,6 @@ def wgrad_gemm( "with Userbuffers (tensor-parallel communication overlapping)" ) ctx.wgrad_store.put([ln_out_total, grad_output], wgrad_gemm) - assert False, f"TODO(shiqingf): not supported for ETP..." else: # Call wgrad GEMM now @@ -970,8 +964,9 @@ def wgrad_gemm( else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + # -------------------------------------------------- + # Grad weight has been computed... + # -------------------------------------------------- # Don't return grad bias if not needed if not ctx.use_bias: @@ -1020,11 +1015,11 @@ def wgrad_gemm( clear_tensor_data(mu) clear_tensor_data(rsigma) - if ctx.etp_size > 1: - wgrad = None - elif ctx.requires_wgrad: + if ctx.requires_wgrad: # Handle custom DDP from mcore. 
- if ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + elif ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True if getattr(origin_weight, "zero_out_wgrad", False): wgrad = get_dummy_wgrad( @@ -1234,7 +1229,6 @@ def __init__( self.out_features = divide(self.out_features, self.tp_size) elif self.parallel_mode == "row": self.in_features = divide(self.in_features, self.tp_size) - self.tp_out_features = self.out_features if init_method is None: init_method = get_default_init_method() @@ -1442,7 +1436,6 @@ def __init__( if name in self.weight_names or name in self.bias_names: param.skip_backward_post_hook = True - def set_meta_tensor(self, fwd: bool, recipe: Recipe) -> None: """Init scales and amaxes for fwd | bwd.""" super().set_meta_tensor(fwd, recipe) @@ -1486,13 +1479,6 @@ def reset_parameters(self, defer_init=False): dim=1 if self.parallel_mode == "row" else 0, stride=1, ) - if self.etp_size > 1: - set_extended_tensor_parallel_attributes( - tensor=getattr(self, weight), - is_parallel=True, - dim=0, # ETP always shard along the first dim. 
- stride=1, - ) # Set parallelism attributes for linear biases if self.use_bias: diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py index f204383166..16e620fd94 100644 --- a/transformer_engine/pytorch/module/layernorm_mlp.py +++ b/transformer_engine/pytorch/module/layernorm_mlp.py @@ -47,7 +47,6 @@ get_nvtx_range_context, ) from ..distributed import ( - set_extended_tensor_parallel_attributes, set_tensor_model_parallel_attributes, get_distributed_world_size, allreduce, @@ -1800,7 +1799,6 @@ def __init__( delay_wgrad_compute: bool = False, symmetric_ar_type: Optional[str] = None, checkpoint: bool = False, - etp_group: Optional[dist_group_type] = None, ) -> None: super().__init__(name) @@ -1845,11 +1843,6 @@ def __init__( self.set_tensor_parallel_group(tp_group) self.set_nccl_overlap_warning_if_tp() - if etp_group is None: - self.etp_size = 1 - else: - self.etp_size = get_distributed_world_size(etp_group) - if init_method is None: init_method = get_default_init_method() if output_layer_init_method is None: @@ -2014,9 +2007,6 @@ def reset_parameters(self, defer_init=False): # Set parallel attributes for linear parameters set_tensor_model_parallel_attributes(self.fc1_weight, True, 0, 1) set_tensor_model_parallel_attributes(self.fc2_weight, True, 1, 1) - if self.etp_size > 1: - set_extended_tensor_parallel_attributes(self.fc1_weight, True, 0, 1) - set_extended_tensor_parallel_attributes(self.fc2_weight, True, 0, 1) if self.use_bias: set_tensor_model_parallel_attributes(self.fc1_bias, True, 0, 1) if self.set_parallel_mode: diff --git a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py index 0b6830c1d0..e7509f3994 100644 --- a/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py +++ b/transformer_engine/pytorch/tensor/storage/nvfp4_tensor_storage.py @@ -297,18 +297,6 @@ def view(self, shape: torch.Size): 
with_gemm_swizzled_scales=self._with_gemm_swizzled_scales, ) - def copy_(self, tensor: NVFP4TensorStorage): - assert tensor._fp4_dtype == self._fp4_dtype - - self._rowwise_data.copy_(tensor._rowwise_data) - self._columnwise_data.copy_(tensor._columnwise_data) - self._rowwise_scale_inv.copy_(tensor._rowwise_scale_inv) - self._columnwise_scale_inv.copy_(tensor._columnwise_scale_inv) - self._amax_rowwise.copy_(tensor._amax_rowwise) - self._amax_columnwise.copy_(tensor._amax_columnwise) - self._quantizer = tensor._quantizer.copy() if tensor._quantizer is not None else None - - def __repr__(self): data_rowwise = self.dequantize() diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py index c225cb3009..47af9fabe1 100644 --- a/transformer_engine/pytorch/utils.py +++ b/transformer_engine/pytorch/utils.py @@ -430,46 +430,6 @@ def cast_if_needed(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: with torch.enable_grad(): return tensor.to(dtype=dtype) -def mask_distributed_columns_graph_compatible(tensor, num_mask_cols, etp_size, valid_indices=None): - """ - CUDA graph compatible version using index_select. 
- - Args: - tensor: [M, N] input tensor - num_mask_cols: total columns to mask - etp_size: number of chunks - valid_indices: pre-computed valid column indices (optional, will be computed if None) - - Returns: - result: [M, N - num_mask_cols] tensor with masked columns removed - valid_indices: indices of valid columns (for reuse) - """ - # Sanity check - assert num_mask_cols > 0 and etp_size > 1 - _, N = tensor.shape - - chunk_size = N // etp_size - mask_per_chunk = num_mask_cols // etp_size - assert num_mask_cols % etp_size == 0 and mask_per_chunk >= 1 - - # Pre-compute valid indices if not provided - if valid_indices is None: - # Build list of valid column indices - indices_list = [] - for chunk_idx in range(etp_size): - chunk_start = chunk_idx * chunk_size - chunk_end = chunk_start + chunk_size - valid_end = chunk_end - mask_per_chunk - indices_list.extend(range(chunk_start, valid_end)) - - # Allocated during warmup of CG. - valid_indices = torch.tensor(indices_list, dtype=torch.long, device=tensor.device) - - # Use index_select instead of boolean indexing (CUDA graph compatible) - result = torch.index_select(tensor, dim=1, index=valid_indices) - - return result, valid_indices - def check_dim_for_fp8_exec(tensor: torch.Tensor) -> bool: """Check if tensor dimensions are supported for FP8 TN GEMM""" From de29fac4f7e466c9770144749fa0394f94d4fa58 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Tue, 10 Mar 2026 10:22:19 -0700 Subject: [PATCH 03/43] fix post_hook not being called in certain cases Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 76 +++++++++---------- .../pytorch/module/grouped_linear.py | 2 +- .../pytorch/module/layernorm_linear.py | 8 +- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index cb4e058418..47311d3080 100644 --- 
a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -410,6 +410,7 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): """Batched backward all-gather + prefetch. Wrapper around all_gather_and_prefetch_bwd.""" + assert self.is_routed_expert and self.weight_list is not None return self.all_gather_and_prefetch_bwd(nvtx_label=nvtx_label) def all_gather_and_prefetch( @@ -453,6 +454,7 @@ def all_gather_and_prefetch( def batched_all_gather_and_prefetch(self, **kwargs): """Batched all-gather + prefetch for expert weights. Wrapper around all_gather_and_prefetch.""" + assert self.is_routed_expert and self.weight_list is not None return self.all_gather_and_prefetch(**kwargs) def get_wgrad_tensor(self): @@ -467,44 +469,30 @@ def register_grad_accum_hook(self, grad_accum_node, hook): self._grad_accum_node = grad_accum_node self._grad_accum_hook = hook - @classmethod - def _resolve_pending_rs(cls, expected_next): - """Finish any pending reduce-scatter from a previous weight.""" - if cls._pending_rs_weight is not None: - assert cls._pending_rs_weight is expected_next - cls._pending_rs_weight.finish_wgrad_reduce_scatter() - cls._pending_rs_weight = None - @staticmethod - def _apply_fused_wgrad(param, wgrad_rs): - """Apply fuse_wgrad_accumulation logic to a single param and return a dummy grad.""" - - # the last rank in the etp group pads the param, so need to remove the padding here - if param.group.rank() == param.group.size() - 1: + def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): + """Post-RS per-param processing: strip padding, accumulate, call hook. + + Returns None for fused (grad already accumulated into main_grad), + or the stripped wgrad for unfused (to be returned to autograd). + """ + # 1. 
Strip padding + if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - param.main_grad.add_(wgrad_rs) - # Handle mcore grad accum fusion - if hasattr(param, "grad_added_to_main_grad"): - param.grad_added_to_main_grad = True - param.grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + # 2. Accumulate + if fuse_wgrad_accumulation: + param.main_grad.add_(wgrad_rs) + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + + # 3. Post hook if param._grad_accum_hook is not None: + param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs param._grad_accum_hook(param) - def finish_wgrad_reduce_scatter(self): - if self.wgrad_rs_handle is not None: - self.wgrad_rs_handle.wait() - self.wgrad_rs_handle = None - - for param, wgrad_rs in zip(self._weights, self.wgrad_rs): - if self.fuse_wgrad_accumulation: - self._apply_fused_wgrad(param, wgrad_rs) - else: - if param.is_padded_last_rank: - wgrad_rs = param._strip_padding(wgrad_rs) - param.grad = wgrad_rs - # Finally call the grad accum node - param._grad_accum_node(param.grad) + return dummy_grad if fuse_wgrad_accumulation else wgrad_rs def _reduce_scatter(self, wgrads, async_op): """Reduce-scatter one or more wgrads. Returns (outputs, handle). 
@@ -546,29 +534,33 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - self._resolve_pending_rs(self.next_w) + # Wait for last reduce scatter if it was async + if ETPShardedParam._pending_rs_weight is not None: + param = ETPShardedParam._pending_rs_weight + assert param is self.next_w + param.wgrad_rs_handle.wait() + param.wgrad_rs_handle = None + + for p, g in zip(param._weights, param.wgrad_rs): + self._finalize_wgrad(p, g, param.fuse_wgrad_accumulation) + ETPShardedParam._pending_rs_weight = None if self.prev_w is None: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False) - if fuse_wgrad_accumulation: - [self._apply_fused_wgrad(p, g) for p, g in zip(weights, sharded)] - result = [None] * len(weights) - else: - result = [ - p._strip_padding(g) if p.is_padded_last_rank else g - for p, g in zip(weights, sharded) - ] + result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) + for p, g in zip(weights, sharded)] return result if batched else result[0] else: # Async reduce-scatter (not last weight — deferred finish) self.fuse_wgrad_accumulation = fuse_wgrad_accumulation self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) - type(self)._pending_rs_weight = self + ETPShardedParam._pending_rs_weight = self return tuple([None] * len(wgrads)) if batched else None def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): """Batched version of wgrad_reduce_scatter.""" + assert self.is_routed_expert and self.weight_list is not None return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) def __torch_function__(self, func, types, args=(), kwargs=None): diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index fd55f84b3c..2f1fac23bf 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ 
b/transformer_engine/pytorch/module/grouped_linear.py @@ -357,7 +357,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], origin_weights[i] = ctx.weight_objects[i] ctx.weight_objects[i] = None - if ctx.fuse_wgrad_accumulation: + if ctx.fuse_wgrad_accumulation and ctx.etp_size == 1: for i in range(N): origin_weights[i].main_grad = main_grads[i] diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index f6f24f6a5f..c3c6cf73d1 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -575,6 +575,10 @@ def backward( rsigma, ) = restore_from_saved(ctx.tensor_objects, saved_tensors) + if ctx.etp_size > 1: + weight = origin_weight.all_gather_and_prefetch_bwd( + nvtx_label=nvtx_label) + # Delete the references to tensor objects once they've been consumed # by the `restore_from_saved` method to construct back the actual tensors. ctx.tensor_objects = None @@ -716,10 +720,6 @@ def backward( # Note: Gradient w.r.t. GEMM input (i.e. norm output). 
# -------------------------------------------------- - # Make sure required data is available - if ctx.etp_size > 1: - weight = origin_weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) if isinstance(grad_output, QuantizedTensorStorage): grad_output.update_usage(rowwise_usage=True) From 9b846dc9172b9a216716c104172d94d05d914fc1 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 19 Mar 2026 15:13:42 -0700 Subject: [PATCH 04/43] Cudagraph Support Signed-off-by: Jieming Zhang --- transformer_engine/pytorch/distributed.py | 24 +- .../module/extended_tensor_parallelism.py | 507 ++++++++++++------ .../pytorch/module/layernorm_linear.py | 6 +- 3 files changed, 369 insertions(+), 168 deletions(-) diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index 20c617ddb7..cea07eb6d5 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -1253,12 +1253,16 @@ def _post_process_nvfp4_gather( handle.wait() handle = None - # Fix the interleaved transposed data from gathering along first dim. - out._columnwise_scale_inv = _swap_first_dims(columnwise_scale_inv_interleaved, world_size) - out._columnwise_data = _swap_first_dims(columnwise_data_interleaved, world_size) + # TODO + # # Fix the interleaved transposed data from gathering along first dim. + # out._columnwise_scale_inv = _swap_first_dims(columnwise_scale_inv_interleaved, world_size) + # out._columnwise_data = _swap_first_dims(columnwise_data_interleaved, world_size) + out._columnwise_scale_inv.copy_(_swap_first_dims(columnwise_scale_inv_interleaved, world_size)) + out._columnwise_data.copy_(_swap_first_dims(columnwise_data_interleaved, world_size)) - # Optionally pad the scaling inverse if needed. - out._columnwise_scale_inv = pad_columnwise_scale_inv(out._columnwise_scale_inv) + # # Optionally pad the scaling inverse if needed. 
+ # out._columnwise_scale_inv = pad_columnwise_scale_inv(out._columnwise_scale_inv) + out._columnwise_scale_inv.copy_(pad_columnwise_scale_inv(out._columnwise_scale_inv)) @dataclass @@ -1409,7 +1413,9 @@ def _all_gather_nvfp4( ) # Transfer amax to output. - out._amax_rowwise = inp._amax_rowwise + #TODO: jiemingz + # out._amax_rowwise = inp._amax_rowwise + out._amax_rowwise.copy_(inp._amax_rowwise) # Gather the transposed NVFP4 data along first dimension. Fix format later. if quantizer.columnwise_usage: @@ -1458,7 +1464,8 @@ def _all_gather_nvfp4( ) # Transfer amax to output. - out._amax_columnwise = inp._amax_columnwise + out._amax_columnwise.copy_(inp._amax_columnwise) + handle = coalesced_handle if async_op else None @@ -1473,6 +1480,9 @@ def _all_gather_nvfp4( ) else: _post_process_nvfp4_gather(out, out_columnwise_data, out_scale_inv, world_size, handle) + else: + if handle is not None: + handle.output = out return out, handle diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 47311d3080..d6a33e9826 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -3,10 +3,11 @@ # See LICENSE for license information. 
from collections import defaultdict -from typing import Dict, List +from typing import Dict, List, Optional from enum import Enum -from dataclasses import dataclass +from dataclasses import dataclass, field import torch +from contextlib import nullcontext from ..distributed import ( gather_along_first_dim, @@ -20,27 +21,49 @@ import transformer_engine_torch as tex +DEBUG_TENSOR = None + class ETPWeightState(Enum): NONE = "NONE" # Sharded, no pending operation ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress - ASYNC_DONE = "ASYNC_DONE" # Async all-gather complete, result in cache + DATA_READY = "DATA_READY" # Async all-gather complete, result in cache + DATA_READY_SYNC = "DATA_READY_SYNC" # Sync all-gather complete, result in cache + _STATE_TRANSITIONS = { - ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT}, - ETPWeightState.ASYNC_WAIT: {ETPWeightState.ASYNC_DONE}, - ETPWeightState.ASYNC_DONE: {ETPWeightState.NONE}, + ETPWeightState.NONE: {ETPWeightState.ASYNC_WAIT, ETPWeightState.DATA_READY_SYNC}, + ETPWeightState.ASYNC_WAIT: {ETPWeightState.DATA_READY}, + ETPWeightState.DATA_READY: {ETPWeightState.NONE}, + ETPWeightState.DATA_READY_SYNC: {ETPWeightState.NONE}, } -# Global AG Prefetching Buffer for ETP. -_ALL_GATHER_BUFFER = None +# Global ETP buffer cache (persists across clear(); never set to None after creation). +_ETP_CACHE = None + +# Global set of ETPShardedParam with in-flight async comms (AG or RS). 
+_inflight_comm_params: set = set() +AG_STREAM = None +RS_STREAM = None + +def get_ag_stream(): + global AG_STREAM + if AG_STREAM is None: + AG_STREAM = torch.cuda.Stream() + return AG_STREAM +def get_rs_stream(): + global RS_STREAM + if RS_STREAM is None: + RS_STREAM = torch.cuda.Stream() + return RS_STREAM @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 + check_param_states: bool = True weight_prefetch: bool = True ETP_CONFIG = ETPConfig() @@ -108,15 +131,22 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): class ETPShardHandle: - def __init__(self, handle, etp_shards: list): + def __init__(self, handle, etp_shards, reduce_scatter=False): self.handle = handle self.etp_shards = etp_shards + self.reduce_scatter = reduce_scatter + _inflight_comm_params.add(etp_shards[0]) def wait(self): if self.handle is not None: self.handle.wait() for w in self.etp_shards: - w._set_state(ETPWeightState.ASYNC_DONE) + if self.reduce_scatter: + w._set_rs_state(ETPWeightState.DATA_READY) + else: + w._set_state(ETPWeightState.DATA_READY) + + _inflight_comm_params.discard(self.etp_shards[0]) class ETPShardedParam(torch.nn.Parameter): @@ -133,17 +163,19 @@ def __new__(cls, tensor, *args, **kwargs): def __init__(self, x, *args, **kwargs): super().__init__() + # all gather self.state = ETPWeightState.NONE - self._cache_ticket = None + self._ag_ticket_fwd = None + self._ag_ticket_bwd = None self._prefetch_handle = None - self._grad_accum_node = None - self._grad_accum_hook = None + self._need_weight_prefetch = True + self.ag_event = torch.cuda.Event(external=True) # Quantization self._quantizer = None self.did_cast_to_low_precision = False self.quantized = None # Prefetching linked list - self.is_first_weight = False + self.prefetch_initialized = False self.next_w = None self.prev_w = None # Grouped gemm @@ -152,9 +184,14 @@ def __init__(self, x, *args, **kwargs): self.group = None 
self.weight_list = None # Reduce-scatter state (set during wgrad_reduce_scatter) + self.rs_state = ETPWeightState.NONE self.wgrad_rs = None - self.wgrad_rs_handle = None + self._wgrad_rs_handle = None self.fuse_wgrad_accumulation = False + self._grad_accum_node = None + self._grad_accum_hook = None + self.rs_event = torch.cuda.Event(external=True) + self._rs_ticket = None # Padding self.is_padded_last_rank = False self.pad_length = 0 @@ -203,18 +240,31 @@ def _unsharded_shape(self): out_shape[0] -= self.pad_length return tuple(out_shape) + @property + def _sharded_padded_shape(self): + out_shape = list(self.size()) + if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: + out_shape[0] += self.pad_length + return tuple(out_shape) + def get_padded_shard(self): if self.pad_length > 0 and self.is_padded_last_rank: return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) return self def _set_state(self, new_state: ETPWeightState): - """Validate and update state machine transition.""" - assert new_state in _STATE_TRANSITIONS[self.state], \ - f"Invalid state transition: {self.state} -> {new_state}" + # if ETP_CONFIG.check_param_states: + # assert new_state in _STATE_TRANSITIONS[self.state], \ + # f"Invalid state transition: {self.state} -> {new_state}" self.state = new_state - def _get_cache_key(self, dtype, fwd: bool) -> tuple: + def _set_rs_state(self, new_state: ETPWeightState): + # if ETP_CONFIG.check_param_states: + # assert new_state in _STATE_TRANSITIONS[self.rs_state], \ + # f"Invalid state transition: {self.rs_state} -> {new_state}" + self.rs_state = new_state + + def _get_cache_key(self, dtype, fwd: bool, reduce_scatter: bool) -> tuple: """Build cache key using output shape + dtype. Weights with matching gathered shape and dtype share a buffer. 
@@ -223,8 +273,8 @@ def _get_cache_key(self, dtype, fwd: bool) -> tuple: """ if not isinstance(dtype, torch.dtype): - return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx) - return (self._unsharded_shape_padded, dtype, self.expert_idx) + return (self._unsharded_shape_padded, dtype, fwd, not fwd, self.expert_idx, reduce_scatter) + return (self._unsharded_shape_padded, dtype, self.expert_idx, reduce_scatter) def _quantize_if_needed(self, skip_weight_cast=False, cast_noop_flag=None): """Re-quantize sharded weight into existing buffer. Returns quantized weight or self.""" @@ -278,6 +328,9 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if async_op: for w in weights: w._set_state(ETPWeightState.ASYNC_WAIT) + else: + for w in weights: + w._set_state(ETPWeightState.DATA_READY_SYNC) # 2. Prepare: quantize, set usage direction. for w in weights: @@ -296,11 +349,16 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if async_op: dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] out_buffers = [] + cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): - assert p._cache_ticket is None, \ - f"Cache ticket leak: weight {id(p)} still has unreturned ticket {p._cache_ticket}" - buf, p._cache_ticket = get_global_ETP_cache().checkout(p, dt, fwd) - out_buffers.append(buf) + if fwd: + if p._ag_ticket_fwd is None: + p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + out_buffers.append(cache.get(p._ag_ticket_fwd)) + else: + if p._ag_ticket_bwd is None: + p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) + out_buffers.append(cache.get(p._ag_ticket_bwd)) else: out_buffers = None @@ -340,49 +398,42 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle - def _get_unsharded(self, fwd, skip_weight_cast=False, cast_noop_flag=None): - """Get unsharded (all-gathered) weight tensor(s). 
- - Handles both routed experts (returns list) and single weights (returns tensor). - Supports sync gather, async prefetch wait, and cache retrieval. - """ - weights = self._weights + def _wait_param_gather(self): + # Since wait() may sychronize against a different stream than the current stream, + # an event is recorded and waited on when the data is retrieved, which ensures the + # AG always finishes before returning the unsharded param + with torch.cuda.stream(get_ag_stream()): + if self._prefetch_handle is not None: + self._prefetch_handle.wait() + self._prefetch_handle = None + self.ag_event.record() + + def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + result, _ = self._all_gather_weight( + async_op=False, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, + ) + result = result if self.is_routed_expert else [result] + result = [self._strip_padding(r) for r in result] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result,self._weights)] + return result if self.is_routed_expert else result[0] + def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=None): # Wait for async prefetch if in progress - if weights[0].state == ETPWeightState.ASYNC_WAIT: - self._prefetch_handle.wait() - self._prefetch_handle = None - - if weights[0].state == ETPWeightState.NONE: - # Synchronous all-gather (no cache — buffers allocated inline) - result, _ = self._all_gather_weight( - async_op=False, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, - ) - result = result if self.is_routed_expert else [result] + self._wait_param_gather() + self.ag_event.wait() - elif weights[0].state == ETPWeightState.ASYNC_DONE: - # Retrieve prefetched results from cache - cache = get_global_ETP_cache() - result = [] - for w in weights: - buf = cache.get(w._cache_ticket) - w._cache_ticket = None - # Post-gather quantization safety net: weight was prefetched 
- # before weight_quantizer was set - if not w.did_cast_to_low_precision: - if w._quantizer is not None and not isinstance(buf, QuantizedTensor): - w._quantize_if_needed() - buf = w._quantizer.quantize(buf) - w._set_state(ETPWeightState.NONE) - result.append(buf) - else: - assert False, f"Unexpected state: {weights[0].state}" + # Retrieve prefetched results from cache + result = [] + cache = get_global_ETP_cache() + for w in self._weights: + ticket = w._ag_ticket_fwd if fwd else w._ag_ticket_bwd + result.append(cache.get(ticket)) result = [self._strip_padding(r) for r in result] - result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, weights)] + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, self._weights)] return result if self.is_routed_expert else result[0] def all_gather_and_prefetch_bwd(self, nvtx_label=None): @@ -398,14 +449,32 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): Returns: weight_total """ - result = self._get_unsharded(fwd=False, skip_weight_cast=True) - if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + if self.next_w is not None: + result = self._get_prefetched_weight(False, skip_weight_cast=True) + else: + result = self._all_gather_weight_on_demand(False, skip_weight_cast=True) + + if ( + ETP_CONFIG.weight_prefetch + and self.prev_w is not None + and self.prev_w._need_weight_prefetch + ): _, handle = self.prev_w._all_gather_weight( async_op=True, skip_weight_cast=True, cast_noop_flag=None, fwd=False, nvtx_label=nvtx_label, ) self.prev_w._prefetch_handle = handle + + # The unsharded tensor has been returned, no pending work so reset state to NONE + for w in self._weights: + w._set_state(ETPWeightState.NONE) + + if self.next_w is not None: + cache = get_global_ETP_cache() + for w in self._weights: + cache.release(w._ag_ticket_bwd) + return result def batched_all_gather_and_prefetch_bwd(self, nvtx_label=None): @@ -426,30 +495,44 @@ def all_gather_and_prefetch( Returns: 
weight_total """ - # Lazy population of linked list: link previous weight to current weight - cls = type(self) - if cls._first_weight_flag: - self.is_first_weight = True - cls._first_weight_flag = False + if self.prev_w is not None: + result = self._get_prefetched_weight(True, skip_weight_cast, cast_noop_flag) + else: + result = self._all_gather_weight_on_demand(True, skip_weight_cast, cast_noop_flag) - if self.is_first_weight: - cls._last_weight = None + # Prefetch next weight + if ( + ETP_CONFIG.weight_prefetch + and self.next_w is not None + and self.next_w._need_weight_prefetch + ): + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) + self.next_w._prefetch_handle = handle - if cls._last_weight is not None and cls._last_weight.next_w is None: - print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") - cls._last_weight.next_w = self - self.prev_w = cls._last_weight - cls._last_weight = self + # The unsharded tensor has been returned, no pending work so reset state to NONE + for w in self._weights: + w._set_state(ETPWeightState.NONE) - result = self._get_unsharded(fwd, skip_weight_cast=skip_weight_cast, cast_noop_flag=cast_noop_flag) + if self.prev_w is not None: + cache = get_global_ETP_cache() + for w in self._weights: + cache.release(w._ag_ticket_fwd) + + # Lazy population of linked list: link previous weight to current weight + cls = type(self) + if not self.prefetch_initialized: + if cls._last_weight is not None and cls._last_weight.next_w is None: + print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + self.prefetch_initialized = True + cls._last_weight = self - if ETP_CONFIG.weight_prefetch and self.next_w is not None: - target = self.next_w - _, handle = target._all_gather_weight( - 
async_op=True, skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, fwd=fwd, nvtx_label=nvtx_label, - ) - target._prefetch_handle = handle return result def batched_all_gather_and_prefetch(self, **kwargs): @@ -476,6 +559,9 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): Returns None for fused (grad already accumulated into main_grad), or the stripped wgrad for unfused (to be returned to autograd). """ + + param._set_rs_state(ETPWeightState.NONE) + # 1. Strip padding if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) @@ -492,7 +578,15 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs param._grad_accum_hook(param) - return dummy_grad if fuse_wgrad_accumulation else wgrad_rs + return None if fuse_wgrad_accumulation else wgrad_rs + + def _wait_reduce_scatter(self): + # assert self._wgrad_rs_handle is not None or is_graph_capturing() + with torch.cuda.stream(get_rs_stream()): + if self._wgrad_rs_handle is not None: + self._wgrad_rs_handle.wait() + self._wgrad_rs_handle = None + self.rs_event.record() def _reduce_scatter(self, wgrads, async_op): """Reduce-scatter one or more wgrads. Returns (outputs, handle). @@ -501,12 +595,29 @@ def _reduce_scatter(self, wgrads, async_op): Multiple tensors: coalesced reduce-scatter. 
""" + for w in self._weights: + if async_op: + w._set_rs_state(ETPWeightState.ASYNC_WAIT) + else: + w._set_rs_state(ETPWeightState.DATA_READY_SYNC) + if self.pad_length > 0: wgrads = [torch.nn.functional.pad(w, (0, 0, 0, self.pad_length)) for w in wgrads] + if async_op: + dtypes = [w.dtype for w in wgrads] + out_buffers = [] + cache = get_global_ETP_cache() + for p, dt in zip(self._weights, dtypes): + if p._rs_ticket is None: + p._rs_ticket = cache.reserve(p, dt, fwd=False, reduce_scatter=True) + out_buffers.append(cache.get(p._rs_ticket)) + else: + out_buffers = [None] * len(wgrads) + if len(wgrads) == 1: out, handle = reduce_scatter_along_first_dim( - wgrads[0], self.group, async_op=async_op + wgrads[0], self.group, async_op=async_op, output=out_buffers[0] ) return [out], handle else: @@ -516,9 +627,10 @@ def _reduce_scatter(self, wgrads, async_op): device=wgrads[0].device, async_ops=async_op, ) as cm: - for tensor in wgrads: - out, _ = reduce_scatter_along_first_dim(tensor, self.group) + for out_buffer, tensor in zip(out_buffers, wgrads): + out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) outputs.append(out) + return outputs, cm if async_op else None def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): @@ -534,29 +646,32 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - # Wait for last reduce scatter if it was async - if ETPShardedParam._pending_rs_weight is not None: - param = ETPShardedParam._pending_rs_weight - assert param is self.next_w - param.wgrad_rs_handle.wait() - param.wgrad_rs_handle = None - - for p, g in zip(param._weights, param.wgrad_rs): - self._finalize_wgrad(p, g, param.fuse_wgrad_accumulation) - ETPShardedParam._pending_rs_weight = None - - if self.prev_w is None: + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: + # Async reduce-scatter (not last weight — deferred finish) + self.fuse_wgrad_accumulation = 
fuse_wgrad_accumulation + _, rs_handle = self._reduce_scatter(wgrads, async_op=True) + self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) + ret = tuple([None] * len(wgrads)) if batched else None + else: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False) result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) for p, g in zip(weights, sharded)] - return result if batched else result[0] - else: - # Async reduce-scatter (not last weight — deferred finish) - self.fuse_wgrad_accumulation = fuse_wgrad_accumulation - self.wgrad_rs, self.wgrad_rs_handle = self._reduce_scatter(wgrads, async_op=True) - ETPShardedParam._pending_rs_weight = self - return tuple([None] * len(wgrads)) if batched else None + ret = result if batched else result[0] + + # Wait for last reduce scatter if it was async + # Currently only support reduce scattering in reverse order + if self.next_w is not None: + self.next_w._wait_reduce_scatter() + self.next_w.rs_event.wait() + + cache = get_global_ETP_cache() + fuse_wgrad_accumulation = self.next_w._weights[0].fuse_wgrad_accumulation + for w in self.next_w._weights: + self._finalize_wgrad(w, cache.get(w._rs_ticket), fuse_wgrad_accumulation) + cache.release(w._rs_ticket) + + return ret def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): """Batched version of wgrad_reduce_scatter.""" @@ -590,34 +705,48 @@ def print_rank_0(message, rank=None): else: print(message, flush=True) -class ETPWeightCache: - """ - Buffers are pooled by cache key (shape + dtype). Two operations: +@dataclass +class _TicketSlot: + """Internal slot backing a persistent ticket in the ETP buffer cache.""" + key: tuple # cache key (shape, dtype, ...) 
+ param: 'ETPShardedParam' # for lazy allocation metadata + dtype: object # torch.dtype or tex.DType + reduce_scatter: bool + fwd: bool + buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() - - ``checkout(param, dtype, fwd)`` → ``(buffer, ticket)`` - Takes a buffer from the pool (or allocates). Ticket is ``id(buf)``. - - ``get(ticket, param, dtype, fwd)`` → ``buffer`` - Retrieves the buffer, asserts key matches, returns it to the pool, - and invalidates the ticket. - Every checkout is paired with exactly one get (1:1). - Two weights sharing the same cache key get distinct buffers if one - is still checked out, preventing aliasing. +class ETPWeightCache: + """ + Ticket-based buffer pool for ETP all-gather / reduce-scatter buffers. + + - ``reserve(param, dtype, fwd)`` → ``ticket`` + Assigns a persistent ticket (no buffer allocated yet). + - ``get(ticket)`` → ``buffer`` + Returns the buffer, lazily allocating from pool or fresh if needed. + - ``release(ticket)`` + Returns the buffer to the pool. Ticket remains valid; next ``get()`` + will re-allocate from the pool. + - ``clear()`` + Drops all buffers and pools. Tickets remain valid; next ``get()`` + lazily allocates fresh buffers. """ # Bytes per element for known dtypes (used for logging). 
_BYTES_PER_ELEMENT = { - torch.bfloat16: 2, torch.float16: 2, torch.float32: 4, + torch.bfloat16: 2, + torch.float16: 2, + torch.float32: 4, tex.DType.kFloat4E2M1: 0.5, tex.DType.kFloat8E4M3: 1, } def __init__(self): self._pool: Dict[tuple, List[torch.Tensor]] = defaultdict(list) - self._tickets: Dict[int, tuple] = {} # ticket → (key, buf) - self._free_tickets: list[int] = [] # recycled ticket IDs - self._max_ticket: int = 0 # high-water mark for ticket allocation + self._slots: Dict[int, _TicketSlot] = {} + self._next_ticket: int = 0 self._total_bytes: int = 0 # running total of allocated bytes + self.key_to_allocate_func = {} @staticmethod def _buf_bytes(shape, dtype) -> int: @@ -628,65 +757,125 @@ def _buf_bytes(shape, dtype) -> int: bpe = ETPWeightCache._BYTES_PER_ELEMENT.get(dtype, None) return numel * bpe - def _allocate_buffer(self, param: 'ETPShardedParam', dtype) -> torch.Tensor: - out_shape = param._unsharded_shape_padded + def _allocate_buffer(self, param: 'ETPShardedParam', dtype, reduce_scatter, fwd) -> torch.Tensor: + if reduce_scatter: + out_shape = param._sharded_padded_shape + else: + out_shape = param._unsharded_shape_padded + if not isinstance(dtype, torch.dtype): quantizer = param._quantizer assert quantizer is not None - assert quantizer.rowwise_usage ^ quantizer.columnwise_usage + param._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) - device = torch.cuda.current_device() - buf = param._quantizer.make_empty(out_shape, dtype=torch.bfloat16, device=device) + buf = param._quantizer.make_empty( + out_shape, + dtype=torch.bfloat16, + device=torch.cuda.current_device(), + ) else: buf = torch.empty( out_shape, dtype=dtype, device=param.device, memory_format=torch.contiguous_format ) + buf_bytes = self._buf_bytes(out_shape, dtype) self._total_bytes += buf_bytes print_rank_0( f"[ETP Cache] +{buf_bytes / 1024**2:.1f} MB (shape={out_shape}, dtype={dtype}) " - f"total={self._total_bytes / 1024**2:.1f} MB" + f"total={self._total_bytes / 1024**2:.1f} 
MB id: {id(buf)} fwd: {fwd}" ) return buf - def checkout(self, param: 'ETPShardedParam', dtype, fwd: bool): - """Get a buffer for all-gather output. Returns (buffer, ticket). + def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=False) -> int: + """Assign a persistent ticket. No buffer is allocated until ``get()``.""" + key = param._get_cache_key(dtype, fwd, reduce_scatter) + ticket = self._next_ticket + self._next_ticket += 1 - Ticket IDs are recycled so they stay bounded. - If all buffers for this key are checked out, allocates a new one. - """ - key = param._get_cache_key(dtype, fwd) - pool = self._pool[key] - buf = pool.pop() if pool else self._allocate_buffer(param, dtype) - - if self._free_tickets: - ticket = self._free_tickets.pop() - else: - ticket = self._max_ticket - self._max_ticket += 1 - self._tickets[ticket] = (key, buf) - return buf, ticket + self._slots[ticket] = _TicketSlot( + key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd + ) + return ticket def get(self, ticket: int) -> torch.Tensor: - """Retrieve buffer by ticket and return it to the pool. - - This combines the old get + ticket_return into a single call. - After this call the ticket is invalidated and the buffer is - available for future checkouts. + """Return the buffer for *ticket*, lazily allocating if needed.""" + slot = self._slots[ticket] + if slot.buf is None: + pool = self._pool[slot.key] + slot.buf = pool.pop() if pool else self._allocate_buffer( + slot.param, slot.dtype, slot.reduce_scatter, fwd=slot.fwd + ) + self.key_to_allocate_func[slot.key] = (slot.param, slot.dtype, slot.reduce_scatter, slot.fwd) + + return slot.buf + + def release(self, ticket: int): + """Return the buffer to the pool. 
Ticket remains valid.""" + slot = self._slots[ticket] + assert slot.buf is not None + if slot.buf not in self._pool[slot.key]: + self._pool[slot.key].append(slot.buf) + + def clear(self): + """Drop all buffers; tickets remain valid and lazily re-allocate on next get().""" + for slot in self._slots.values(): + slot.buf = None + self._pool.clear() + self._total_bytes = 0 + + def reallocate_to_mempool(self, device, mempool): + """Re-allocate all ticket buffers into a CUDA graph memory pool. + + Call BEFORE graph capture so every buffer lives in the capture pool + and no allocations are recorded inside the graph. """ - assert ticket in self._tickets, f"Invalid ticket: {ticket}" - key, buf = self._tickets.pop(ticket) - self._free_tickets.append(ticket) - self._pool[key].append(buf) - return buf + # Clone the current memory pool buffers but into the passed in mempool + self._total_bytes = 0 + new_pool = defaultdict(list) + torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) + for key, buffers in self._pool.items(): + new_buffers = [] + for _ in range(len(buffers)): + buf = self._allocate_buffer(*self.key_to_allocate_func[key]) + new_buffers.append(buf) + new_pool[key] = new_buffers + torch._C._cuda_endAllocateToPool(device, mempool) + + # Map each buffer in the old pool to its corresponding new one + old_to_new_buff = {} + for key, old_pool in self._pool.items(): + new = new_pool[key] + for old_buf, new_buf in zip(old_pool, new): + old_to_new_buff[old_buf] = new_buf + # Replace each slot's reference to its corresponding new one + for slot in self._slots.values(): + if slot.buf is not None: + slot.buf = old_to_new_buff[slot.buf] + + self._pool = new_pool + return def get_global_ETP_cache() -> ETPWeightCache: """Get or lazily create the global cache instance.""" - global _ALL_GATHER_BUFFER - if _ALL_GATHER_BUFFER is None: - _ALL_GATHER_BUFFER = ETPWeightCache() - return _ALL_GATHER_BUFFER + global _ETP_CACHE + if _ETP_CACHE is None: + _ETP_CACHE = 
ETPWeightCache() + return _ETP_CACHE + + +def reallocate_etp_cache_to_mempool(device, mempool): + """Re-allocate all ETP cache buffers into a CUDA graph memory pool.""" + if _ETP_CACHE is not None: + _ETP_CACHE.reallocate_to_mempool(device, mempool) + + +def wait_async_comms(): + """Wait on all in-flight ETP async communications (all-gathers + reduce-scatters). + """ + for param in list(_inflight_comm_params): + param._wait_param_gather() + param._wait_reduce_scatter() @dataclass diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index c3c6cf73d1..13b7a66b75 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -938,6 +938,9 @@ def wgrad_gemm( # Call wgrad GEMM now wgrad, grad_bias_ = wgrad_gemm(ln_out_total, grad_output) + if ctx.etp_size > 1: + wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + # Update grad bias if needed if grad_bias is None: grad_bias = grad_bias_ @@ -963,7 +966,6 @@ def wgrad_gemm( dgrad = reduce_scatter_out else: dgrad = ub_obj_wgrad.get_buffer(local_chunk=True).clone() - # -------------------------------------------------- # Grad weight has been computed... # -------------------------------------------------- @@ -1018,7 +1020,7 @@ def wgrad_gemm( if ctx.requires_wgrad: # Handle custom DDP from mcore. if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + pass elif ctx.fuse_wgrad_accumulation and hasattr(origin_weight, "grad_added_to_main_grad"): origin_weight.grad_added_to_main_grad = True if getattr(origin_weight, "zero_out_wgrad", False): From 9cb1379aca2719854bf308d61b945beead458827 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 01:53:25 -0700 Subject: [PATCH 05/43] debug: make etp link table log human readable. 
--- .../module/extended_tensor_parallelism.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index d6a33e9826..bc921705a9 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field +import re import torch from contextlib import nullcontext @@ -76,6 +77,17 @@ def update_config(**kwargs): setattr(ETP_CONFIG, key, value) +def tag_etp_params_with_names(model): + """Populate _debug_name on every ETPShardedParam with its full dotted parameter name. + + Call once after model construction so the linking log prints human-readable names + instead of raw tensor ids. + """ + for name, param in model.named_parameters(): + if isinstance(param, ETPShardedParam): + param._debug_name = name + + def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): """Shard and re-register all parameters of a module using ETP weight sharding.""" if etp_group.size() == 1: @@ -154,6 +166,33 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True _last_weight = None + _link_node_count = 0 + _link_table_buffer: List[str] = [] + _link_table_flushed: bool = False + + @classmethod + def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam") -> None: + """Buffer one row of the prefetch-link table (flushed atomically on the second forward pass).""" + _W = 70 + + def _layer_id(name: str) -> str: + m = re.search(r"\d+", name) + return m.group() if m else "-" + + cls._link_node_count += 1 + if cls._link_node_count == 1: + cls._link_table_buffer.append( + f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" + 
f"\n{'-'*7}-+-{'-'*8}-+-{'-'*_W}-+-{'-'*_W}" + ) + # Seed weight (first ETP param) as row 0 + cls._link_table_buffer.append( + f"{'0':>7} | {_layer_id(prev._debug_name):>8} | {prev._debug_name:<{_W}} | -" + ) + cls._link_table_buffer.append( + f"{cls._link_node_count:>7} | {_layer_id(curr._debug_name):>8} | " + f"{curr._debug_name:<{_W}} | {prev._debug_name}" + ) @staticmethod def __new__(cls, tensor, *args, **kwargs): @@ -195,6 +234,8 @@ def __init__(self, x, *args, **kwargs): # Padding self.is_padded_last_rank = False self.pad_length = 0 + # Debug + self._debug_name = "" def setup(self, weight_quantizer=None): """Set quantizer and create quantized shard.""" @@ -527,10 +568,14 @@ def all_gather_and_prefetch( cls = type(self) if not self.prefetch_initialized: if cls._last_weight is not None and cls._last_weight.next_w is None: - print_rank_0(f"linking curr w: {id(self)} {self.is_routed_expert} prev_w: {id(cls._last_weight)}") + cls._buffer_link_table_row(cls._last_weight, self) cls._last_weight.next_w = self self.prev_w = cls._last_weight self.prefetch_initialized = True + elif not cls._link_table_flushed and cls._link_table_buffer: + # Second forward pass: flush the complete table atomically to avoid interleaving + cls._link_table_flushed = True + print_rank_0("\n".join(cls._link_table_buffer) + "\n") cls._last_weight = self return result From a5ef6753c7e6bdac16e936331f95c696bcf3f5f8 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 02:12:46 -0700 Subject: [PATCH 06/43] doc: add README_ETP.md --- docs/README_ETP.md | 685 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 docs/README_ETP.md diff --git a/docs/README_ETP.md b/docs/README_ETP.md new file mode 100644 index 0000000000..2dfb3dd227 --- /dev/null +++ b/docs/README_ETP.md @@ -0,0 +1,685 @@ +# Extended Tensor Parallelism (ETP) + +## Overview + +Extended Tensor Parallelism (ETP) is a **light-weight**, **high-performance** and **memory-efficient** 
distributed training strategy implemented in TransformerEngine. It shards weight tensors across an ETP process group and reconstructs them on-demand via async all-gather, enabling training of larger models without sacrificing throughput by overlapping communication with computation. + +ETP applies to any TE module that wraps a `Linear` layer: `Linear`, `LayerNormLinear`, `LayerNormMLP` (for dense models), and `GroupedLinear` (for MoE models). When used with `GroupedLinear`, ETP provides additional batched coalesced all-gather support for gathering multiple expert weights in a single NCCL operation. + +ETP supports all TE low-precision formats (FP8, MXFP8, NVFP4) with a **quantize-then-gather** strategy: each rank quantizes only its local shard before the all-gather, so wire bandwidth scales with the quantized size (0.5× for FP8, 0.25× for NVFP4) rather than the full BF16 weight. + +--- + +## Performance + +TODO(shiqingf): add performance for Ultra model in nvfp4. + +---- + +## Features + +### User-Visible Features + +| Feature | Description | +|---|---| +| **Weight sharding** | Weights sharded 1/N across ETP group along `out_features`, reducing per-GPU VRAM | +| **Async prefetch** | Next layer's weight all-gather overlaps with current layer's GEMM in both forward (prefetches `next_w`) and backward (prefetches `prev_w`); controlled by `ETPConfig.weight_prefetch` | +| **NVFP4 support** | Full 4-bit quantized all-gather with interleaved-format post-processing | +| **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | +| **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | +| **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | +| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. 
| +| **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | + +### Implementation Mechanisms + +| Mechanism | Description | +|---|---| +| **Alignment padding** | Shards padded to `ETPConfig.pad_for_alignment × etp_size` rows at construction via `get_padded_shard()`; only last rank carries padding (`is_padded_last_rank`); padding stripped in `_strip_padding()` both post-gather (before GEMM) and post-reduce-scatter (before wgrad accumulation) | +| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity | +| **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | +| **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | +| **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | +| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; padding stripped and grad accumulated in `_finalize_wgrad()` | + +--- + +## Design + +### Core Idea + +In standard Tensor Parallelism (TP), each GPU holds a shard of each weight and communicates activations. 
ETP goes one level deeper: **each weight is sharded along the `out_features` dimension (dim 0) across an ETP group of N GPUs**, so each GPU stores only 1/N of the weight. Before each GEMM, an all-gather reconstructs the full weight; after the backward GEMM, a reduce-scatter propagates the weight gradient back to the shards. + +``` +Standard column-parallel TP (TP=2, 2 GPUs, weight W of shape [K, M]): + GPU0 owns W[:K/2, :] (first half of out_features) + GPU1 owns W[K/2:, :] (second half of out_features) + +ETP (on top of column-parallel TP, ETP=2 per TP rank, 4 GPUs): + GPU0 (TP0, ETP0) owns W[:K/4, :] (first quarter of out_features) + GPU1 (TP0, ETP1) owns W[K/4:K/2, :] (second quarter of out_features) + GPU2 (TP1, ETP0) owns W[K/2:3K/4, :] + GPU3 (TP1, ETP1) owns W[3K/4:K, :] +``` + +ETP always shards along `out_features` regardless of the TP parallel mode (`column` or `row`). For `row` parallel mode, TP shards `in_features` while ETP shards `out_features`, making the two dimensions orthogonal. + +ETP is composable with TP and Sequence Parallelism for `Linear`, `LayerNormLinear`, and `LayerNormMLP`. The `etp_group` process group is orthogonal to the `tp_group`, giving a 2D parallelism grid. + + +### Weight Sharding + +#### Initialization + +Every rank independently allocates and initializes the **full** weight tensor, then slices out its local portion — there is no broadcast or communication during construction. + +``` +te.Linear.__init__(out_features=F, in_features=K, etp_group=group) +│ +├─ 1. Every rank: weight_tensor = torch.empty(F, K) ← full weight, same shape on all ranks +│ +├─ 2. reset_parameters() ← Kaiming-uniform init on every rank +│ identical seed ⇒ identical values on all ranks; slice is consistent without any comm +│ +└─ 3. 
wrap_module_params_etp(self, weight_names, etp_group) + │ + ├─ alignment = pad_for_alignment(16) × etp_size + │ pad_length = (alignment − F % alignment) % alignment + │ shard_size = (F + pad_length) // etp_size + │ + ├─ start = rank × shard_size + │ end = min((rank+1) × shard_size, F) ← clips real rows for last rank + │ shard = weight_tensor[start : end].clone() + │ + ├─ ETPShardedParam(shard) + │ .pad_length = pad_length + │ .is_padded_last_rank = (rank == etp_size−1 and pad_length > 0) + │ .group = etp_group + │ + ├─ module._parameters["weight"] = etp_shard ← replace nn.Parameter + │ + └─ del weight_tensor ← full buffer freed +``` + +Example: `F=63, K=32, etp_size=4, pad_for_alignment=16` + +``` +alignment=64, pad_length=1, shard_size=16 + +rank 0: rows [ 0:16] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 1: rows [16:32] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 2: rows [32:48] → ETPShardedParam [16, 32] pad_length=0 is_padded=False +rank 3: rows [48:63] → ETPShardedParam [15, 32] pad_length=1 is_padded=True +``` + +#### Padding and strip flow + +Padding is added **entering** each collective so all ranks contribute equal-sized chunks; it is stripped **exiting** each collective so downstream consumers see the real shape. + +``` +FORWARD + local shard [real_rows, K] (e.g. [15, 32] on last rank) + └─ get_padded_shard() → [shard_size, K] (e.g. [16, 32] zero row appended) + └─ all-gather → [padded_F, K] (e.g. [64, 32] across etp_size ranks) + └─ _strip_padding → [F, K] (e.g. 
[63, 32] ← weight seen by GEMM) + └─ GEMM → output [B, F] + +BACKWARD (wgrad path) + wgrad [B, F] (computed against stripped weight, so first dim is F not padded_F) + └─ _reduce_scatter pads: [F, K] → [padded_F, K] (re-pads before RS so chunks are equal) + └─ reduce-scatter → [shard_size, K] per rank + └─ _finalize_wgrad → _strip_padding → [real_rows, K] + └─ stored as param.grad (matches local shard shape) +``` + +#### Wrapping call + +```python +# Called in Linear/LayerNormLinear/LayerNormMLP/GroupedLinear __init__ +if etp_group is not None: + wrap_module_params_etp(self, self.weight_names, etp_group) + del weight_tensor # free the temporary full-weight buffer +``` + +For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. + +### State Machine + +Each `ETPShardedParam` tracks two independent state machines: one for the all-gather (`state`) and one for the reduce-scatter (`rs_state`). Each uses the same four-state enum: + +``` +NONE ──────────► ASYNC_WAIT ──────────► DATA_READY ──────────► NONE +(shard only) (AG/RS launched) (AG/RS complete, (consumed, + result in cache) back to shard) + +NONE ─────────────────────────────────► DATA_READY_SYNC ──────► NONE + (sync gather, (consumed) + result available) +``` + +The `DATA_READY_SYNC` state is used for on-demand synchronous gathers (cold start or when prefetch is disabled). `DATA_READY` is used after an async gather completes via `handle.wait()`. + +Invalid transitions are guarded by `_set_state()` / `_set_rs_state()`. + +### Class Diagram + +
+Click to expand + +```mermaid +classDiagram + + %% ── Enums ──────────────────────────────────────────────────────────────── + class ETPWeightState { + <> + NONE + ASYNC_WAIT + DATA_READY + DATA_READY_SYNC + } + + %% ── Config ─────────────────────────────────────────────────────────────── + class ETPConfig { + <> + +int pad_for_alignment + +bool check_param_states + +bool weight_prefetch + } + + %% ── Core parameter class ───────────────────────────────────────────────── + class ETPShardedParam { + <> + $ _pending_rs_weight : ETPShardedParam + $ _first_weight_flag : bool + $ _last_weight : ETPShardedParam + $ _link_node_count : int + $ _link_table_buffer : List[str] + $ _link_table_flushed : bool + +ETPWeightState state + +ETPWeightState rs_state + +int _ag_ticket_fwd + +int _ag_ticket_bwd + +int _rs_ticket + +Event ag_event + +Event rs_event + +ETPShardHandle _prefetch_handle + +ETPShardHandle _wgrad_rs_handle + +callable _grad_accum_node + +callable _grad_accum_hook + +Quantizer _quantizer + +bool did_cast_to_low_precision + +QuantizedTensor quantized + +int pad_length + +bool is_padded_last_rank + +bool prefetch_initialized + +ETPShardedParam next_w + +ETPShardedParam prev_w + +bool is_routed_expert + +int expert_idx + +ProcessGroup group + +List weight_list + +Tensor wgrad_rs + +bool fuse_wgrad_accumulation + +str _debug_name + +setup(weight_quantizer) + +_weights() List + +_set_state(new_state) + +_set_rs_state(new_state) + +_get_cache_key(dtype, fwd, reduce_scatter) tuple + +_unsharded_shape_padded() tuple + +_unsharded_shape() tuple + +_sharded_padded_shape() tuple + +get_padded_shard() Tensor + +_strip_padding(tensor) Tensor + +_quantize_if_needed(skip, flag) + +_all_gather_weight(async_op, ...) tuple + +_all_gather_weight_on_demand(fwd, ...) Tensor + +_get_prefetched_weight(fwd, ...) Tensor + +_wait_param_gather() + +_wait_reduce_scatter() + +all_gather_and_prefetch(fwd, ...) 
Tensor + +all_gather_and_prefetch_bwd() Tensor + +get_wgrad_tensor() Tensor + +register_grad_accum_hook(node, hook) + +_finalize_wgrad(param, wgrad_rs, fuse) [staticmethod] + +_reduce_scatter(wgrads, async_op) tuple + +wgrad_reduce_scatter(wgrad, fuse) + } + + %% ── Async all-gather handles ───────────────────────────────────────────── + class ETPShardHandle { + +Work handle + +List etp_shards + +bool reduce_scatter + +wait() + } + + class BatchedNVFP4AllGatherAsyncHandle { + <> + +List output_handles + +Work outer_async_handle + +bool _synchronized + +wait() + } + + class _NVFP4AllGatherAsyncHandle { + +NVFP4TensorStorage output + +Tensor columnwise_data_interleaved + +Tensor columnwise_scale_inv_interleaved + +int world_size + +Work async_handle + +bool _synchronized + +post_process_nvfp4_gather() + +wait() + } + + %% ── Buffer pool / ticket cache ─────────────────────────────────────────── + class _TicketSlot { + <> + +tuple key + +ETPShardedParam param + +dtype + +bool reduce_scatter + +bool fwd + +Tensor buf + } + + class ETPWeightCache { + -Dict _pool + -Dict _slots + -int _next_ticket + -int _total_bytes + +reserve(param, dtype, fwd, reduce_scatter) int + +get(ticket) Tensor + +release(ticket) + +clear() + +reallocate_to_mempool(device, mempool) + -_allocate_buffer(param, dtype, reduce_scatter, fwd) Tensor + -_buf_bytes(shape, dtype) int + } + + %% ── External bases (simplified) ────────────────────────────────────────── + class torch_nn_Parameter { + <> + } + class QuantizedTensor { + <> + } + class NVFP4TensorStorage { + <> + } + + %% ── Relationships ──────────────────────────────────────────────────────── + + %% inheritance + torch_nn_Parameter <|-- ETPShardedParam + + %% state machine ownership + ETPShardedParam --> ETPWeightState : state / rs_state + + %% doubly-linked prefetch list (self-referential) + ETPShardedParam --> ETPShardedParam : next_w / prev_w + + %% grouped expert list (self-referential) + ETPShardedParam --> ETPShardedParam : weight_list 
+ + %% in-flight prefetch / RS handles + ETPShardedParam --> ETPShardHandle : _prefetch_handle / _wgrad_rs_handle + + %% handle back-reference to shards for state transitions + ETPShardHandle --> ETPShardedParam : etp_shards + + %% handle polymorphism: plain Work or NVFP4-batched + ETPShardHandle --> BatchedNVFP4AllGatherAsyncHandle : handle (NVFP4 path) + + %% batched handle contains one entry per expert + BatchedNVFP4AllGatherAsyncHandle --> _NVFP4AllGatherAsyncHandle : output_handles + + %% config singleton controls all params + ETPShardedParam ..> ETPConfig : ETP_CONFIG + + %% buffer pool used via global singleton + ETPShardedParam ..> ETPWeightCache : reserve / get / release + + %% ticket slots + ETPWeightCache --> _TicketSlot : _slots + + %% quantized tensor stored per param + ETPShardedParam --> QuantizedTensor : quantized + + %% NVFP4 storage is a QuantizedTensor + NVFP4TensorStorage --|> QuantizedTensor + + %% NVFP4 handle output type + _NVFP4AllGatherAsyncHandle --> NVFP4TensorStorage : output +``` + +
+ +--- + +## Difference with FSDP + +FSDP (Fully Sharded Data Parallelism) and ETP both shard weight parameters, but they target different axes and serve different purposes: + +| Dimension | FSDP | ETP | +|---|---|---| +| **Sharding axis** | Data-parallel replicas | ETP process group (model-parallel dimension) | +| **Target layer** | All parameters uniformly | Any TE Linear, LayerNormLinear, LayerNormMLP, or GroupedLinear weight | +| **Communication** | All-gather before fwd, reduce-scatter after bwd | Same pattern, but orthogonal group | +| **State tracked** | PyTorch handles lifecycle | `ETPWeightState` state machine per param (separate for AG and RS) | +| **Quantization** | Framework-level, post-gather | **Quantize-then-gather** (lower bandwidth) | +| **Buffer management** | PyTorch flat-param storage | Ticket-based buffer pool per shape/dtype | +| **Prefetching** | PyTorch forward-hook prefetch | Lazy linked-list async prefetch across layers | +| **Gradient flow** | Reduce over data-parallel dim | Reduce-scatter over ETP dim | +| **Composability** | Wraps module hierarchy | Opt-in per-module via `etp_group` arg | + +**Key distinction**: FSDP shards across the *data-parallel dimension* (replicas processing different samples), while ETP shards across the *model-parallel dimension* (GPUs processing the same sample). They can coexist: a model can use FSDP for data parallelism and ETP for weight memory reduction simultaneously. + +A further practical difference is that ETP is **quantization-aware**: shards are quantized *before* the all-gather, so the wire bandwidth is proportional to the quantized size (e.g., FP4 = 1/4 of BF16), not the original weight size. FSDP gathers in full precision by default. + +--- + +## Scalability + +ETP scales along two independent dimensions: + +1. **ETP group size (N)**: Divides per-GPU weight memory by N. With N=8 and BF16 weights, a weight of 8 GB is reduced to 1 GB per GPU. With NVFP4, the same weight becomes 250 MB per GPU. + +2. 
**Number of experts (E)** (MoE only): Expert weights are gathered in parallel via a batched coalesced all-gather (`grouped_gather_along_first_dim`), so adding more experts within a MoE layer does not serialize the communication. + +**Combined scaling**: In a model with TP×ETP parallelism, the effective per-GPU weight size is `W / (TP × ETP)`. For example, TP=4 + ETP=8 gives 32× weight compression before training data parallelism is even considered. + +**Prefetch chain amortizes communication**: The linked-list prefetch means that for an L-layer model, L-1 all-gathers are completely hidden behind compute. Only the very first layer's all-gather (or the first backward layer) may stall, and only if the GPU compute is faster than the network. + + +TODO: add scalability perf of Ultra in nvfp4. + +--- + +## Schedule Details + +### Forward Pass + +``` +Layer i-1 fwd Layer i fwd Layer i+1 fwd +┌─────────────────────────┐ ┌─────────────────────────┐ ┌────────────── +│ all_gather_and_prefetch │ │ all_gather_and_prefetch │ │ ... +│ ├─ get W_i-1 (cached) │ │ ├─ get W_i (cached) │ │ +│ └─ async AG W_i ───── │─────▶ ready at use time │ │ +│ │ │ │ │ +│ GEMM(input, W_i-1) │ │ GEMM(input, W_i) │ │ +└─────────────────────────┘ └─────────────────────────┘ └────────────── + ↑ Overlap ↑ Overlap + AG(W_i) ∥ GEMM(W_i-1) AG(W_i+1) ∥ GEMM(W_i) +``` + +Step by step for layer `i`: + +1. **Lazy linked-list construction** (first pass only): Each `ETPShardedParam` has a `prefetch_initialized` flag. On the first call to `all_gather_and_prefetch`, this flag is `False`. The weight links itself to the previous weight (`cls._last_weight`) by setting `prev_w` / `next_w`, then sets `prefetch_initialized = True`. On subsequent passes the linking block is skipped. The complete link table is buffered during the first pass and flushed atomically as a single log print at the start of the second pass. +2. 
**Retrieve current weight**: + - `prev_w is not None` and `_ag_ticket_fwd is not None` → pull from buffer cache (ticket already reserved by async prefetch) + - otherwise → synchronous on-demand all-gather (only on very first use or when prefetch is disabled) +3. **Quantize if needed** (FP8/NVFP4/MXFP8): re-quantize the local shard into its pre-allocated quantized buffer before communication. +4. **Run GEMM** using the gathered full weight. +5. **Async prefetch next weight**: kick off `_all_gather_weight(async_op=True)` for `next_w` and store the handle in `next_w._prefetch_handle`. +6. **Release buffer**: after returning the gathered weight to the caller, the buffer for the current weight is returned to the pool via `cache.release(ticket)`. +7. **Save sharded weight** (not gathered) for the backward pass: `weight_etp_sharded` is stored in `ctx`; the gathered buffer is transient. + +#### Prefetch implementation sketch + +```python +# all_gather_and_prefetch (simplified) +if self.prev_w is not None and self._ag_ticket_fwd is not None: + result = self._get_prefetched_weight(fwd=True, ...) # cached +else: + result = self._all_gather_weight_on_demand(fwd=True, ...) # sync fallback + +if ETP_CONFIG.weight_prefetch and self.next_w is not None: + _, handle = self.next_w._all_gather_weight(async_op=True, ...) 
+ self.next_w._prefetch_handle = handle + +if self.prev_w is not None: + cache.release(self._ag_ticket_fwd) # return consumed buffer to pool + +# First-pass only: link into prefetch chain +if not self.prefetch_initialized: + if cls._last_weight is not None and cls._last_weight.next_w is None: + cls._buffer_link_table_row(cls._last_weight, self) + cls._last_weight.next_w = self + self.prev_w = cls._last_weight + self.prefetch_initialized = True +elif not cls._link_table_flushed and cls._link_table_buffer: + cls._link_table_flushed = True + print_rank_0("\n".join(cls._link_table_buffer) + "\n") # atomic flush +cls._last_weight = self +``` + +The all-gather for layer `i+1` runs on the dedicated `AG_STREAM` while the GEMM for layer `i` runs on the compute stream, giving near-perfect overlap for GPU-compute-bound models. Similarly, the wgrad reduce-scatter runs on `RS_STREAM`. Both streams signal completion via CUDA events (`ag_event`, `rs_event`) that are waited on the compute stream before the result is consumed, ensuring correct ordering without blocking either communication stream. + +### Backward Pass + +The backward schedule mirrors forward, but traverses the layer chain in reverse: + +``` +Layer i+1 bwd Layer i bwd Layer i-1 bwd +┌─────────────────────────┐ ┌─────────────────────────┐ ┌────────────── +│ all_gather_and_prefetch │ │ all_gather_and_prefetch │ │ ... +│ ├─ get W_i+1 (cached) │ │ ├─ get W_i (cached) │ │ +│ └─ async AG W_i ────── │─────▶ ready at use time │ │ +│ │ │ │ │ +│ dgrad GEMM(grad, W_i+1) │ │ dgrad GEMM(grad, W_i) │ │ +│ wgrad GEMM(act, grad) │ │ wgrad GEMM(act, grad) │ │ +│ async RS(wgrad_i+1) ─── │─────▶ finish RS before use │ │ +└─────────────────────────┘ └─────────────────────────┘ └────────────── +``` + +Step by step for layer `i` backward: + +1. **`all_gather_and_prefetch_bwd()`**: Gather `W_i` for the dgrad GEMM; simultaneously async-prefetch `W_i-1` (the `prev_w`) for the next backward step. 
Uses `skip_weight_cast=True` — no re-quantization needed since scales are already valid from the forward pass. +2. **dgrad GEMM**: Compute `dX = dY × W_i` using the gathered weight. +3. **wgrad GEMM**: Compute `dW = X^T × dY` using the saved input activation. +4. **`wgrad_reduce_scatter(wgrad, fuse_wgrad_accumulation)`**: + - **Non-last layer** (`prev_w is not None`): Launch async reduce-scatter; store `ETPShardHandle` in `self._wgrad_rs_handle`. Return `None` to backward (gradient deferred). + - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. +5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + +### Coalesced Expert Communication + +For MoE layers with multiple routed experts, all experts' all-gathers are coalesced into a single NCCL operation via `torch.distributed._coalescing_manager`. This reduces NCCL kernel launch overhead and improves bus utilization compared to E sequential all-gathers. The wgrad reduce-scatter for all experts is similarly coalesced. + +--- + +## Low-Precision Details + +### FP8 (per-tensor scaling) + +- Each `ETPShardedParam` is assigned a quantizer via `setup(weight_quantizer)`. +- The quantizer is configured with `amax_reduction_group=etp_group` (the group is already stored in the param from construction), so the amax is all-reduced across the ETP group before scaling—ensuring all GPUs in the group use the same scale factor for the full weight. +- On the first microbatch (`is_first_microbatch=True`), `_quantize_if_needed()` re-quantizes the shard. 
On subsequent microbatches, `skip_weight_cast=True` reuses the existing quantized buffer, saving re-quantization cost. +- A `cast_noop_flag` tensor (from the FP8 recipe) can signal that no scale update is needed, enabling a no-op cast path. + +### NVFP4 (4-bit, block-scaled) + +NVFP4 requires special communication handling because: +- Each 4-bit value shares a scale with its 16-element block. +- The layout has both rowwise and columnwise views, each with separate data and `scale_inv` tensors. +- After all-gather, the interleaved format must be re-assembled into a GEMM-ready layout. + +The `_all_gather_nvfp4()` function in `distributed.py` handles this: +1. **Pre-communication**: Strips padding from `scale_inv` tensors (padding ensures alignment to communication boundaries). +2. **All-gather**: Gathers both `data` and `scale_inv` for the rowwise view; similarly for the columnwise view (with transposed tensor handling). +3. **Post-processing** (`_post_process_nvfp4_gather` / `post_process_nvfp4_gather`): + - Fixes interleaved data layout back to packed format. + - Re-pads `scale_inv` to the GEMM-required alignment. + - Transitions the tensor to `GEMM_READY` state. + +For async all-gathers, post-processing is deferred into `_NVFP4AllGatherAsyncHandle.wait()`, keeping it off the critical path. + +For routed experts, `BatchedNVFP4AllGatherAsyncHandle` wraps one handle per expert; the single outer coalescing-manager handle is waited first, then each expert's NVFP4 post-processing is applied sequentially. + +`_strip_padding` handles NVFP4 scale_inv correctly: +- `rowwise_scale_inv`: strip to `round_up(M, 128)` rows (dim 0) +- `columnwise_scale_inv`: strip to `round_up(ceil(M / 16), 4)` columns (dim 1, transposed) + +### MXFP8 (microscaling FP8) + +MXFP8 follows the same quantize-then-gather pattern as FP8. The amax reduction for microscaling is handled within the quantizer; ETP configures the reduction group to be the ETP group. 
+ +`_strip_padding` handles MXFP8 scale_inv correctly: +- `rowwise_scale_inv`: strip to `round_up(M, 128)` rows (dim 0) +- `columnwise_scale_inv`: strip to `round_up(M // 32, 4)` rows (dim 0; columnwise is not transposed for MXFP8) + +### Bandwidth Savings from Quantization + +| Dtype | Size vs BF16 | Example: 8B param weight | +|---|---|---| +| BF16 | 1× | 16 GB per ETP group | +| FP8 | 0.5× | 8 GB | +| NVFP4 | 0.25× | 4 GB | + +With ETP size N=8 and NVFP4, each GPU holds and gathers 0.5 GB instead of the full 16 GB. + +--- + +## Memory Savings + +### Per-GPU Weight Memory + +With ETP group size N, each GPU stores only `1/N` of each weight at rest. The gathered weight is transient (lives only during the GEMM) and reused from the pool. + +### Ticket-Based Buffer Pool + +`ETPWeightCache` pools gathered weight buffers by `(shape, dtype, fwd, expert_idx, reduce_scatter)` key so that same-shaped weights across layers reuse a single GPU allocation instead of allocating per-layer. + +#### Data structures + +``` +_pool : { cache_key → [buf, buf, ...] } available (released) buffers +_slots : { ticket_id → _TicketSlot } persistent per-param ticket slots + (key, param, dtype, fwd, reduce_scatter, buf) +_next_ticket : int monotonically increasing ticket ID counter +``` + +Each `ETPShardedParam` holds up to three tickets: +- `_ag_ticket_fwd` — forward all-gather buffer +- `_ag_ticket_bwd` — backward all-gather buffer +- `_rs_ticket` — reduce-scatter buffer + +A buffer lives in **exactly one** place at a time: + +``` +reserve() → slot created, buf=None (no allocation yet) +get(ticket) → buf allocated lazily from pool or fresh; stored in slot +release(ticket) → buf returned to pool; slot.buf set to None +clear() → all slot.buf = None, pool cleared (tickets stay valid; next get() re-allocates) +``` + +#### CUDA Graph support + +Before graph capture, call `reallocate_etp_cache_to_mempool(device, mempool)` to migrate all pool buffers into the CUDA graph memory pool. 
This ensures no allocations occur inside the captured graph. + +### No Activation Duplication + +The sharded weight (`weight_etp_sharded`) is saved for the backward pass instead of the gathered weight. This avoids keeping a full-size weight copy in the gradient tape, which would negate the memory savings. + +### Quantized Shard Storage + +When using FP8/NVFP4/MXFP8, only the quantized shard (not BF16) is stored persistently in `ETPShardedParam.quantized`. The full-precision master weight can reside in the optimizer state on CPU or be managed separately, keeping GPU footprint at quantized shard size. + +--- + +## API Usage + +
+Click to expand + +```python +import torch.distributed as dist +from transformer_engine.pytorch import Linear, LayerNormLinear, LayerNormMLP +from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + tag_etp_params_with_names, + update_config, +) + +# Set up process groups +tp_group = ... # Tensor-parallel group +etp_group = ... # ETP group (orthogonal to TP) + +# Drop-in replacement for standard TE Linear (dense model) +# Weights are sharded at construction time by wrap_module_params_etp +layer = Linear( + in_features=4096, + out_features=4096, + parallel_mode="column", # or "row" + tp_group=tp_group, + etp_group=etp_group, # Enable ETP +) + +# Also works with LayerNormLinear and LayerNormMLP (dense or MoE feed-forward) +ffn = LayerNormMLP( + hidden_size=4096, + ffn_hidden_size=16384, + tp_group=tp_group, + etp_group=etp_group, # Enable ETP +) + +# Weight is automatically an ETPShardedParam holding only the local shard +assert isinstance(layer.weight, ETPShardedParam) + +# Call setup() once after constructing quantizers (FP8/NVFP4). +# Note: etp_group is already stored in the param; setup() only takes quantizers. +layer.weight.setup(weight_quantizer=quantizers) + +# Optionally tag all ETP params with human-readable names for the link table log. +# Call once after full model construction. +tag_etp_params_with_names(model) + +# Forward/backward are transparent — ETP handles all-gather/reduce-scatter internally +output = layer(input) +``` + +
+ +For MoE layers with routed experts, `GroupedLinear` uses the same `etp_group` argument and handles batched expert weight gathers automatically. + +--- + +## Implementation Files + +| File | Role | +|---|---| +| `transformer_engine/pytorch/module/extended_tensor_parallelism.py` | Core ETP: `ETPShardedParam`, `ETPWeightCache`, `_TicketSlot`, `ETPWeightState`, `ETPConfig`, `wrap_module_params_etp`, `tag_etp_params_with_names`, `update_config`, `reallocate_etp_cache_to_mempool`, `wait_async_comms` | +| `transformer_engine/pytorch/module/linear.py` | ETP integration in `Linear` forward/backward | +| `transformer_engine/pytorch/module/layernorm_linear.py` | ETP integration in `LayerNormLinear` forward/backward | +| `transformer_engine/pytorch/module/layernorm_mlp.py` | ETP integration in `LayerNormMLP` forward/backward | +| `transformer_engine/pytorch/module/grouped_linear.py` | ETP integration for MoE routed-expert grouped GEMMs | +| `transformer_engine/pytorch/distributed.py` | `gather_along_first_dim`, `_all_gather_nvfp4`, `_NVFP4AllGatherAsyncHandle` | +| `tests/pytorch/distributed/test_etp.py` | ETP unit tests: state machine, buffer cache, weight sharding, module param replacement, `Linear`/`LayerNormLinear`/`GroupedLinear` fwd/bwd correctness, prefetch chain, wgrad reduce-scatter, microbatches, NVFP4 fwd/bwd (aligned + unaligned), MXFP8 fwd/bwd (aligned + unaligned) | +| `tests/pytorch/distributed/test_tp_etp.py` | TP+ETP integration tests: process group layout, `Linear` (column/row parallel) weight shape and fwd/bwd correctness, `LayerNormLinear` and `LayerNormMLP` fwd/bwd smoke tests; runs on 4 GPUs with TP=2, ETP=2 | + +---- + +## Best Practice + +TODO + +---- + +## Caveats + +- First forward pass always stalls (cold start) + + On the very first forward pass, `state == NONE` for all weights (no prefetch has run yet), so every weight does a synchronous all-gather. Only from the second pass onward does the async prefetch chain kick in. 
For frameworks that benchmark the first iteration (e.g., profilers, compilation warmup), this cold-start stall looks like a regression. + +- Link table logged on second forward pass + + The prefetch-link table (printed via `tag_etp_params_with_names` + the built-in logging) is buffered during the first forward pass and flushed atomically at the start of the second forward pass. This ensures it is not interleaved with other logs, but means it will not appear until the second iteration. + +---- + +## Future Work + +TODO + +---- From 92fc0f0910b39a614b269c3e87f812bfab88916e Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 30 Mar 2026 08:24:44 -0700 Subject: [PATCH 07/43] debug: add default and meaningful nvtx_label for ETP (batched) AG/RS kernels. --- .../module/extended_tensor_parallelism.py | 29 +++++++++++++++---- .../pytorch/module/grouped_linear.py | 4 +-- .../pytorch/module/layernorm_linear.py | 3 +- transformer_engine/pytorch/module/linear.py | 3 +- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index bc921705a9..0cc1979524 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -363,6 +363,13 @@ def _strip_padding(self, tensor): def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nvtx_label=None): """Quantize (if needed) and all-gather weight. Returns (weight_total, handle).""" + if nvtx_label is None: + nvtx_label = ( + self._debug_name + + (".fwd" if fwd else ".bwd") + + (".async" if async_op else ".sync") + ) + weights = self._weights # 1. Transition state for async gathers. 
@@ -633,12 +640,18 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle = None self.rs_event.record() - def _reduce_scatter(self, wgrads, async_op): + def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): """Reduce-scatter one or more wgrads. Returns (outputs, handle). Single tensor: plain reduce-scatter (no coalescing). Multiple tensors: coalesced reduce-scatter. """ + if nvtx_label is None: + nvtx_label = ( + self._debug_name + + ".bwd" + + (".async" if async_op else ".sync") + ) for w in self._weights: if async_op: @@ -661,12 +674,15 @@ def _reduce_scatter(self, wgrads, async_op): out_buffers = [None] * len(wgrads) if len(wgrads) == 1: + nvtx_range_push(f"{nvtx_label}.etp_rs") out, handle = reduce_scatter_along_first_dim( wgrads[0], self.group, async_op=async_op, output=out_buffers[0] ) + nvtx_range_pop(f"{nvtx_label}.etp_rs") return [out], handle else: outputs = [] + nvtx_range_push(f"{nvtx_label}.batched_etp_rs") with torch.distributed._coalescing_manager( group=self.group, device=wgrads[0].device, @@ -675,10 +691,11 @@ def _reduce_scatter(self, wgrads, async_op): for out_buffer, tensor in zip(out_buffers, wgrads): out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) outputs.append(out) + nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") return outputs, cm if async_op else None - def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): + def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. Accepts a single tensor (non-routed) or list of tensors (routed experts). 
@@ -694,12 +711,12 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) self.fuse_wgrad_accumulation = fuse_wgrad_accumulation - _, rs_handle = self._reduce_scatter(wgrads, async_op=True) + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) - sharded, _ = self._reduce_scatter(wgrads, async_op=False) + sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) for p, g in zip(weights, sharded)] ret = result if batched else result[0] @@ -718,10 +735,10 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation): return ret - def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation): + def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation, nvtx_label=None): """Batched version of wgrad_reduce_scatter.""" assert self.is_routed_expert and self.weight_list is not None - return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation) + return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation, nvtx_label=nvtx_label) def __torch_function__(self, func, types, args=(), kwargs=None): if kwargs is None: diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 2f1fac23bf..fe81196f4a 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,9 +420,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: - weights = 
origin_weights[0].batched_all_gather_and_prefetch_bwd( - nvtx_label="te._GroupedLinear.bwd", - ) + weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: dgrad_gemm_use_split_accumulator = _2X_ACC_DGRAD diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 13b7a66b75..824030c3d0 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -576,8 +576,7 @@ def backward( ) = restore_from_saved(ctx.tensor_objects, saved_tensors) if ctx.etp_size > 1: - weight = origin_weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) + weight = origin_weight.all_gather_and_prefetch_bwd() # Delete the references to tensor objects once they've been consumed # by the `restore_from_saved` method to construct back the actual tensors. diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 20f4799167..4c4789461c 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -699,8 +699,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], # -------------------------------------------------- if ctx.etp_size > 1: - weight_fp8 = weight.all_gather_and_prefetch_bwd( - nvtx_label=nvtx_label) + weight_fp8 = weight.all_gather_and_prefetch_bwd() dgrad = None dgrad_work = None From b63927171b8cef3622884db4b3cd93511ba1ec66 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 31 Mar 2026 00:51:44 -0700 Subject: [PATCH 08/43] doc: udpate README --- docs/README_ETP.md | 6 ++++++ docs/etp/etp_ep_nt6_schedule_bf16.png | Bin 0 -> 188377 bytes 2 files changed, 6 insertions(+) create mode 100644 docs/etp/etp_ep_nt6_schedule_bf16.png diff --git a/docs/README_ETP.md b/docs/README_ETP.md index 2dfb3dd227..d32321d6df 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -484,6 +484,12 @@ Step 
by step for layer `i` backward: - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. 5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + +Here is an example of ETP schedule diagram for Hybried Nemotron6 in bf16 as an example (ETP+EP with partial CGs): + +![alt text](etp/etp_ep_nt6_schedule_bf16.png) + + ### Coalesced Expert Communication For MoE layers with multiple routed experts, all experts' all-gathers are coalesced into a single NCCL operation via `torch.distributed._coalescing_manager`. This reduces NCCL kernel launch overhead and improves bus utilization compared to E sequential all-gathers. The wgrad reduce-scatter for all experts is similarly coalesced. diff --git a/docs/etp/etp_ep_nt6_schedule_bf16.png b/docs/etp/etp_ep_nt6_schedule_bf16.png new file mode 100644 index 0000000000000000000000000000000000000000..828f7b14a61d5903dd21dba22b72a978324b5ae5 GIT binary patch literal 188377 zcmZ_01yqz>_dX1WGD8S6bVv+C38Hj&&Cn$wDAFmZgoL1UcY_krAq~Z%F^xKy|p7#IX-l&mHO#tmW&jBAV# zEbt0Fsp}f}g6X2EAdOKrM6(Y5A#1LOwop;QxDCFCU|7G|pWs*DU}9VY zUoo#0!@!WlK+8&LyJP;yxSme3Lfx~oW*?nspDQA( zt|==i{eb8?ET;05+Lt#Ni!6)vjW3Foo@q8C7E5z<*6E_Pr?-B4ibu|nk@vrGT?rRlMp{J|a5GZX7oTWK`8?_q_sfkjk%Bp!SF*@j_* z%Z-61`?-b}8#7z+A){SUN%tNzKU*3a|0tklshlRLqwe@vdE?t`bQnt+cM^VI6?S)c zxIT@b58J~;-ioCgV>x+Dt#_59={1hS`>@tgTuIA)apY{8$TsG;>wh=tG(+;9o4${L z^bPRRX_#b=d_+SN9W`XUn-^H2bV@uyQ=X%8E8k5$3w10p3>YA)E*V+fAr@|iH za_xQyYOcb0PbbaNf;N9FYt4bMP5)-f&p}(&!~4^=+B7%|I$gqa{pVY%Y1bP=fkySt&-UA? 
z2RGwU2^@}Ik*=3D<~{3gZoCy18U9kcvoj0TZ!ob8uvE%LQ43~rBn>^(7qT@S8Yr|8 z*YB-W=nh!0&Fk5ouG{h4Y5u;*Gi~1=&ldOONz%gKZAoM%Qja3Kqw=6Tl8fPwpjmrT4@>RS9%#K#zX!@jwA+Ll6K=$ zR%hc*i$VFL<_}@yJF|^$zb?)aZr`_=Yw;e=6zhDwTVp$s>9Lx!@-ZoBjw4-7@SL_YGw&g=4MqS1Bhz30g`37clq&yC^` zIMeO>MI3O}=(&}JZP$x`t?EC8% z;!caQRo;^cgJ}XhPve+XIw+ZbJeorMYd2ZIZZ4t5v;OX;FHzfOkK--pR6=Og9xZ}s z(7N#~kz-1bY>)rRRJHYJE?4b~i@hE`o3Y+lCMB?jj#{`zLjB2Hj50{2M0{BP4;}Wu z2OTl*3G%4dup#5$TsZj(#HMnFvhzAVR>KHj;8Am2nN*&Ul|mg5NCY1=O)A!FOd z)V5)N?g*p_Uyu{&B)<^-y>2a1JzF-U)S!;4q3jmRyQk}#-^gxj_d4ZN`23ixct;&^ z=aeXg?v~hdUU0*!1};qLvxEE({u>#XMsSd#m8Pha{zOKbD)v;3ZBWB2!~#Zm!YuWVO_w zU8+3<{(JLeVX}>Lli_5?g_c>>YDYi0T8OiIqJPGuq$`q?-8j2TmD>Av?SmQ{0&0F5 zE|adkrJm_(>o^9PK$)PM1XR3MAV9S8l+l@@Ld=w3E7{(B$(F<8kEQC{y%S0&A^y6| zuwlQCZVODS4-1AxV(LOwzDxFOe?E2Sgkwph$f8lNu!`XpT(G9?L_He z`e)baX0aCwULlHpIgEB+n}$}_CyF!(!f2?v-n75M`F(LP{A}~f9sNo(P;pdWi$|Eo zEI9hdJlhzPdMy3$_QUswbT2MOvSe^%a1|t;ufI)d0bzSG_hK#?520Q9peB_3?yt@A zR!x_!ua>U~EZxMVL&15`{aR@rF?)3TzX9dL@VM1-$)`P6N=_2HgF(Id`4Nam5&kwh z7AKS6B6xzS%Q4Sp?uYx34L;PapKPt$E>3DdxopP3yuYB&SY8)S+#9=WHxvwp5>K7d zbN=@Ja8~tQ>n+-RAUOAfg!k__A(8VkUOSDu4%=P=_J%LcU(pI~5}aDS^ZveQ0uB(EK+tIyYZs?lk2lB-sc?wcrzxmAR&P(UdJVwuiXH-805saZ}|8^ z2B=w%&CkJ|+0VY0!?a`k#Ap{p_BHmNMhAMy$9-u_z@+Ewd-qOVN^8Yl_t7Uw{xnV| zCEb#_N!m;!7)z7MUjOrm-+T9_&U?Pav8XyHBU&2y3~p}zUZ37^>*0Ud^xn;TB2`|# zy{Kq>GrpW8^{}TBuT{5!mSdX(d1G)zVSV&c0ZS?Q-c|nYtbJPN8A`&|!T*KqUx}Cr zQ?B?dwzeDYmn`>qp(6k!UzQV`;ps3^cKQCO5MmYtg+@fMG95^Tk{aB>c3&&sY8PrE z_vztCkSQ0IlWMTqyGyAU;7{iE)$*pe`@>J4%P8V=`%3y)4=1=TWKvXjMyBp5Ek-D}|S>rI5*(G0PIs9wx_pdsz*q{!5 z`q3N1NP1hFzU0*!wPsK>uD{jIkI!inI1KiatXtvlbV{_j!i^eut0OhXQ^&hsoZVvQ zf9PsIclhy;?pDNb43AnC;x~S0Rt`l#li&1hX*YK`~aVnCQ{ z#au>)K97X>Qqb2AVezcTvU|Dc*almR#;xMSJHno;@i|}L_UjBqbHy!@8I&)T$-YYy z=|LE%sdR_^K}Fc201#ll2=`=93#CSSc>WV(2FqO{pE9bGW-15 z+t{fYN|%pf`kYVR^M0wghnvWe`5W&yjhk1FrGo*xLP934*o(xw4@TbUziMtgekACb zWiwdx`9=D+pn^yl@p_1z|BAWd4ec+>Tsi5){1LycrRdodE?Ue#CJsjCr`wVJHx0@a 
zuhRfr7tHAwCH^xQLe+Eqe(#TXmaT5#5YxVGV^L?8&p6b5HfCPhEnXfn{EMX;a%+T2y=3%$jgA$6}dzao}|}rotA6Woq)rC-5?K@fK_Pq zMJ+3JHQS?=xJajdtrb0Z$I203V11hH;Fg-imphrqtbufQRgC_9lAK9$Zq_lgV?AP! z3{LsHs0U@0dWpSS5ct@vg-;ii`=rdM(KX`P$G{>=*VVh}66UG#c&i+G4M<+?Qs;5S zyi5Lv(3`HpffY1_*(xme8-2D^Z22#}f9$Z$saSt2OGz1-$@>hot9oTB^!g=P`c8(3 z8z?6)744;_&0_ysHQl2?jo+!be+6%0OrWBV0)R$cI#F&kj2lwT_`XVFDyx$BMd8^Y zoyEn)(r`lYeZ>^v>YH_B#jt`?Z4YIt-F8Xo@yxQz>hXa%v0BD^AhnzI#cq9To_0ieMi z_{Og1?q2@5jqWWzv$6N2n>W0M=1Z}V z!xvGSE?n`2?3?dDdmVj=!uEOndo!jJ=N19|%Nn1h#7Ki`&i~d7#5_4bNf$34z6$ld z56MLnmo^Q_OMZ4BB+F+LI~vncT~Lj5xHa)LM)WI4pLcXhC^ngt64UP)ztnO_e6FNT zaJ)Q}B;wtD$fA~BqW2cdA*2TCO>on@X2N8$$+=JTmyrIWvPCUd?MRdBJ^U=1RgVbb zbC~a>MNgzhRG3_z$9$D%|HVW^L-MmU<(_M993_;PsWQn~mSlP;LnID>7eG2!1kWL$=ppELCKFZ4K7;VqshMj@^`ZCs`>@&k> zgFh?pCkYrgt9n$)Dic->-OAm5x*E$@!{WUVuLBu!)bEef0;$3-`T~j3!$=Ek>*5$2fQ2S=vY} zxf_a8?Hgl@Xz;v-v1+{c!btZs@;Kr7^3s;O5_&JTyl4R%C-JTTb2%4-pd}|Lokn0#?7v zvcf6V18muO32bhQ0W0&bYZA2^5m)D06zO=nyAXLxq~&7&V+oD$(*kg)?=~O8h21ZI z9$9l+4$fxVgZu^D}%6ZVR#?H59fM)hUMY>;Rm~v*={5;_q2_l}}^m!bM z+7S4RLCE3b48$-hELzGIOa;Rjt?s#P8|F}!lMq-43!1DdAZ*AS!{Pr^_{|61uE2#nwzuIx=?51lY6l1!* zGQE%Wbv}iU=Bo{d!dhJqR#m?2-R7R6*vu8+{sxeG+yC#<@36q7bp&xMrT#8a8FqIB zbf7BEYgUg-^uw-cKzIQs3Fe)=hI4cIj`h2% zrB@6iyR-47w9$5=3~s~HOUG41Od=C&uw=}d?>yYv6O$;Gf}XR{hcgy&z5 zT7se%6r`y7dAzl0MZ+m)1dv;`_2=+5UgyVZuf=cG6aa2{vNM;@R34Hw&=KZqQus#9CZC9PUPc=pKKocz8p!eo@u>xWX#B)#Nt+aicYA*2y&ChE*kGA$$w_yQ;En27vi{gq8dx#WWxSS__9ONPZJ!U*!9HiOFxu7d0nP&%GyQ zXJy^{@NweQ(p-emd^)3%p_(walf+qo_tt(Xb)D)hA!0U?FAhz%AN!S~m{il3=Xk}w z)Cqav?F7x__&NCegy`YTQF#B0FOXdk@I@TdYKX5cWTw&0VSl-g$70}3_#uFGP27RQ zA_P#V@uQ~hP-3t7vDNyZyJs`l4g~dUJsBQhEme+NEjS4oO?;tI=Mic1PhXC;7aw0x zvA7z>V)ujw2wNPiJSypG|1dE$*TMVJ^m#$Hh9Otb*nz#QV#IaX7z4QG+Ep3Q@n#zQ6vMbXUS15U4u*37dzoU-t3 z0=q6}kAMHRkp3cZk?+=2O;}qoV7?U{8cR4x@1KW!wqMNT&knxd)h;07sq`;}@ndDB zbd3Y4>EgWIL|dj)B6yBC&jaws=h-?`L4~p*@LYDqL=H*`4oX^1<++y<+{w2jd@dk7 z>N0=I9YA|mDlx1hR~|uxfnBh}1MUeZNdVEjK6fcnGT5zUsZ389Ib8pIRffj0fg*WT 
zonp4y`?p4Jy?5J*T~b%RJRjK?L~J5WjAr?y~9`ukp3}1(#N5a%G=A?iv~FWh_ilj12^ZwC4Qa0 z3i$z)F;Ex5H5;C;!xK6{VlsFmiMjeRx~nPrVaHBa|LaHfj_o7~>WNnx+r{bL=X^CF z`24&$I|y8f5~{pa&&K77OxNTUnB&R1VT zPAD546?5_uIP~F`O!T>m3AmO_%g8UVWXDj08NCSZV0}Z(Q*;K* z;VxKzqw3SWbnY?FJ-iYXK)TnskD^>EZub_uiGCsMwjFLYT52F*6!oNHZ+oUeMll0K z;ewHTcFOVAbPfd>n`W1Z#q*Ces-8$>HL@|NfeR*s!p-GHYY1+19cD3Qu`=9&oy%j> zRQQ-NBgV<6BW?m+N*|D-^aKEy!mHrUv6R?73K|sJjemvxV9CR)JTESDM31GzASwT7 zne=6v`}`arz04}9yLNN8yXD4jGTw3kut+NxwFhMX*wM6}BaN+r8|Oz`(A5E$P33f_+BBIu6na*Rj;ppD zrd^(YXC2#{M1m1AiSYSGADfz~h*Dba?9dM}RoPNPVD^Q|=_0!k;T7(Cix)Q#w<$T5 zP)Zzv>>6+GEp`^Z{aXtmvZ225Ij99qQ`kFJOq2ThoNnE>K;D+e*NsJ@Ic#kG2c_AP zxlFrH_m*PX9Mcy0IvS@c83clMnoqt>7`;#lUjX8``9~r5Ijia$r(VA=Pfp*>r^xx3 zz!|Q$eWktSEGtd9UKbKN1<3G-u{@su+<~!za(pHneA6dP$ zre1l}0zn6snIUzhwWZxU^`UN}G)duw3#r<9%9P7UoyX?SZ^LlNJ6q{%#Sgh3{a-ES zkOo?i+)e)hBEJG6dKUJt;AHM7%s2l;({9d}vOg$G= zs>JM#t{uZIr4zl>Zi%Sx7p?n0D+3e7335m1X3Q=a7F4W*2 z=!4Qjr!=NN7=GaZgvKw(^|xC9Zsd@^Tj0uUpCc$ zWs}>W^7uVB<16=N^7XtbF(C^bltDixFo`-pY(*>x$w^g!!?hsvVemy8ZRp6sk**(6{9x?wx{qt^D%-siK2Ke4fjO%G=N~PvI z&J>hi%dgGzQ_K;;%cDOBjZK^esw;5Z5QgB>ic++%r8%^C0EL;>3BI<}OZ;Yy!q3i( zhGkj_6k}*_kwIPAhMF9$8IiHQ+59(9&@{2iFKAN6XJ|TSP$~Xtnj}9@PC4aMk3*eG-hAYI5BIPJWvSU-q(iEr!*%?@1xOFBUaN`x;e^3Z{A0#?` zZyAYjjHp&@WBH&7nkd<#)vkY=UAyGq;QT_AJIT$%u-(hTByN8AfW;OV_M=q(*w!ou zAPpjzR+e;tf-mG#6kASU6)C1p|K^V^#wH*pds(7z1BJhW{NEJ4o3MNQ#&1^Qxzy?S z%qK_*@v22GS{jwTJX7|tv9~3PoRmQqJOX@_{I)j4_S$*$nT2ZsDsugO2*YfZCEJOw8PN$4De-oL}$)&Ai?BsX+Gz+~$cO=wv8k z+-%R(=KB4kYIrx2A%~2X6?LOFJ^Uz~zjNkzdsZotrpLw}K-VsEs2W;EsZyFxU=+!~ zz+m$(+6v$A)nu{dYi}|mB7 z^m3sj-z0Oz39uk&3kxRrvnvVkwPGaRu*R+__tY-a9j!ckXE^)vcUR?!3?v#J^WPJ6 zm3kh1C?1ldm&SHq9Q5XJtLk!W-s`3|YrW}9!O0!2oGD7_jZIE1n%{e0`9m>e0Wu@+ zuM$&O1f*I<`EYtt)V%5#EsqQbz2iC{tW0i%b0P?L z@TG8dm0LwKQ`Dn?I}k5?6zOq<51kg;w#%!!f^tN39xKEBKaV!i#un9T>4IF}3C)Ey zemd$P(%>7hFwCG)l3KMVf6fvtm;>B<3{&1QVjhhJxb@l&Y#EcunrX}7fQ+fP8Q zM4qk@vAW<^d(5=HcrA-^Z;x(B_f#4(&K-s`DPm8FZr!7^Ki@;)=BHAZF`23C>;6st 
z$>7LurW=u%wdwZ}B{*s!O5vk$MdB4jCw876vZ@^k1E}Jz;)TMl2|9E2y^t?L@s+aw zWc)T7b~(+Wb8WlPm-<$b%x#l5fh&g>FqcFCnWn>fxJVa;yd*a8=;6{bOSE;;&dTMkURRv4Ug*? zOxYp^HMhj6sQK+A$ruEPGqsRBl|^rVRfOfwIJBHG$EfE*S?Y>kgnuf~VDjbn^XCO^ z`bDEho?A75u`n?75i=ACiZ#8aTn$Wr3QCgA+pNRcX3wM|ENz!1``02bI`2Xy=#2s! z7KTlhrARMCOZI z-dduk2B`Kez$HL%Tuj#7^L;v7LNTP+pRn2>la=`&O@!mhY7}rYP#KEC>T-IMNqGlYUOGkV%P&rn$nV1pkWR>tM6zl4pc%wpc1=Ze}(2voa2UTDK&7c%ysDZeWM zd?t(wZ-GnYy9Hdgy+46HPH8&nhE)j!eh*~UA75W3ngJwN7LgOUl)Te9^NGqaHrtW9 zp^Xb#lYx5(O2l$5pqp1!B7prRu{O_d_`I>N>3jS1VEs=Cf_szplk&3yHiq*#@j@;Z z=?VC9(9-Ax(1tsk@QL^%v1>2`cGkaEopV*AqcS3XOq`NETOWPvaj*&)tY?>a;_qa9 zn06`i1@0x@aG}LuYK-I9zRr2OWGrV~Bd!ET(Z{pVxpqRW2(-+icrEBEfR3`($8~@< z%D{$U#7p7-xMmgD;N%%aOli+(k(}0A)R1MP;hd?P*x>RV!O4P(exs7cUtd9Q$AAR+ z6=S%tO`OwDK;&XpStA4@s48m4+%X4qX<2L?X7Xg*oQiqsTNI?>IIGug;9CJH zzB7=(X)H=pjmv~Z@guB7;pf50H_yBB%7TF^@VjuDZGiK5Ou8UNwob!Qp`iyT{LaS{ z>#0}ZL%y+{Iyjk|aiA19feQF%hL>{u>7pGvW}>T4tv}$yg#a#u?2z_;Dbxz8=l3GT z1NZ|S8dJUb_4t!L*6opmfv2;%_dBkl>(&7P{POHwj4xB2bULJs}zG7wuo`UVQv?D>}Mv4-kGWE$Z ztD?KkA}$a&qEAN{S0OAgMqBv%Sl#cNoGP&~RD?oMhL95BM>SFz=dl=?dFv&z7EG$5 zo#xg+uFvh~=4kJOkqdRqv0{dVc73(-Z{4$fHM*>1%a`~Mi<1n$YHxJYK3AEE!wz2P z7+j816NlzfaBgSxZi+rXv|4lm>V(?k2OuKE(07Q=H$LYRqCf2oYgX<`iZ*X}C4M5`QO($ukha@vNSCA;~PTVNb+C$3^DBz754b<7Z@y z1GOnfl-3RT0F3+JP!oeM6a5QF9$;>tIxQp96HW`Wq-iqXOMwnlg*Vu)1x3qV)Kg;i z7+m7Y1vDQ2r}8<q1VTdd?`5CD8KM%vs`6qCFi2mM4)tBNHtOY17Vf2lT9ai(GZYm z(Tu^?mh8IB7T7fJN3R*ZU2?ze^nSlz=B{saQby|$cB4}$9J41hr%Kb&(NXzT-r0Y& z6@N8Qu1~!V3n7BUZCQ_jT>@Y{pI_}SEl4or9DBbH5)e{m7*t#NeJ7D?PtPws*@oIp zz5@!;qk;!Dw;Lj!2CZglDPQ=r{6?YN^OQ4$6CB|!Mi+LDugQq@yOM>{jP0$2{Ye%A zEdm>|7~lq7mkn7=9*hGK$^#HR2s&KVA2h@dFw_LoVwgj`rXgs>I1*D;`3YhsmYghh z0E(jEeDi_^lc*Ibn$2+U=%%yg#eHU-5EapkerZ zx{9~_=n9Ud~oM!>MeQ6CeL?^v9Tk2jPE|de8&>N{&-=1&YnTsleZJ{0mX|`b4 z>&Q`oO4R+{$rffb&0r)>#|}T(m6_-q);---P|oyLwruQk(H(3dx;?z;hmEcu%Kgt@ zyhZPyhZyT$>z%a-RZOqGcp0M~GFb=soeOu|I^oAXjgGjREQYEq-?G_Lvf)@-fZlS5 zh*18Ai|(R8?xYPt5twiS`3P8QBa1cKx*Yc|C|MZ2=-l@Qa)Eg-=df8rytOZA76|Gf 
zw=DURt*BO;hBNv4$RcvVV<$osG)E>1G;$0ZoVgMMnQvl(V^e|#+;2|Xr85INA8EJ{ z&x~dd^p~Fcvfn?9ko~vA_2&_xwE1^olYUTiJwmE4%gAuBpD+$ z<^NH-2(RK6(~Q8F2g`8#4Os+@Q0zx~#=v4!41zyRhQg&m;0^BZ`A+Nb5#{~Pz4U$g z(1?4TH+}>H;yLfg3lvyjG{gCUyj$Xmz$aQdb=@x+%%c?-PA-+DZ9$#=zk~N*}Hq6kv0@_D&CGH1j0o3~8)3tKzMx^9z zttO26fE--SpqsbCJg=ds6<*IxaWfb=59q!JV+qz|T+AcDxChZN3 zsTQm%n!K1cC%aI3?|qi>r#jp=KpUbQQUta-GdHAC5w5zi?*V1`l?jOw=GcU|t--)~ z!2t5iKT2u-)zXuok4#Om3^PD=^~HeN5`{xX2mvYE$aN&|e+De3oxu6vXQy4H%|=a! zv`q#kER6+|9BotRU~o+2U^|vW8^FuoNb(zttAery)I6MxZ|HA#c5z}1R7%76kzeQ%0}2NI1S`A3*D)Qhw6ACnM$ zS93nrP|)GB0g7#3(%m`R@@DzRsr?!5%gLO2@7@51Z#oj#JN8gP8?wNT_s*;ju35^! za2*FHM_2sv@H1$-L-U)(;h1q?;a5RaTBIUpz({FSUnQ|t#{N>fk zOX6QO2LfrVR6eI*s2zn= z{7cOVFjO;UxdU1Bo9B+$<^#5mSdkcfV^G0|fMwd-FEiWDwTRWNU`Bh~nZFrNzzm>) zQ)@(2sVSd;jVw2&eSQu&OQ}f4xmwdy(^%ge^%6o@}l8=b7PLzRE&st#zs|B9T?fo0qS zTFb(u(gMH98ZZ-t+z5p$Ry%;hasUPlBwqOVBQI-kJq@crcw)0EJ`Pp{?So9kg7h`T z`i~VKCB?+%j7U!undWo$cK@EJYwh6<%|~B=YhLJy=(h%862D&f zJTn~>$|9I9@`;znyuT|Y1=NV%zx?gVCApK(+bc1&%8okf? zZ*)aaCrt`?jic$h4W2RqZa|!@*8C**=RfWYi7Pz}I8vs_u?uF}wVv$Az42L6n7 zIXL^kAM@JhCtC$YaGIG^5qM!?;zukh)x!O1+Kh%3)xMNoQXyp!tkkP(26YQ$;yYQ)%8j_!)o!U5}u0 zL@5O;LeK?DUE~gv@BGvSGmT70=Ym16dqFKIyUQT^TR22emTBDj=%dExHvnE}(1Xy& z{Zt(`4vjvITQmV}YS2m*g_@6qW6=kO-ByAa*Hi6@CSp&T9o7CceplrUI@l~fqj>cI zT++u#7j+dUsT9k`c9iQPgqeGQmRc@?`HcPhJqy@(6ucds1g*%Ivj~dS)nw+7MZ#P(!USaOr)Ibf%0|;-9?&1Tk-9^U z_}Tr`k35+a%B5r0@%EeVm%}&u#A}6omEfGy{8~a&7tC3ti!pb2UdJB64>_jtwBbpKcdWIMDAh7AEc)%+HsBE zrYvXE%zbxwXqFBVaU1Z`kT?eM%BuzgT6T;kQ(_1Qy|ZSZ1AxXzz%*W_)|J5kKwVJH z0n>pS(xuOuMKILCG;b~D5C(oGLd=U{E$6iB#u!e`M5rsmnVNy4w!*aM_*?LXK6VGo zV*t4^HC(~!XMdZ#9)$$aj-}q!IRm-gK^`*oWX6+SMe}s|sIVC_|L8nBSU9FSmn z0>O&})au~V=(uny&}7K!q~ZhzO&&YlXJhMGmw72<@+Alb0~%(^^+XIz!x<`Ar<|2M zkYm6h!rL)V53etx<+UPuIf8e$Ns>#j?qC9kmdW=u=qG#25{T1$5uK_KCdPaLr}s^g z-O+|3p9b?Zfi?`VMI7f#Q5j&*_>iHqMtReWHmOBsa2Ve-cPI3&X$YW7R+)cYHY?4zu6n$;gb%w`jlu}i{Z?{<{6;YQVZCJPm`cs z5nugM6~ZuD=0s<@h0?ig`dMl2#7}*_iCte#Uu_92=BT%3pI9GMvxYhhkpij2PofD6Y}oQMSs~5TPg_U0xdc^v+&AZqnNONNWZZI 
z!v@X~1r+_B2$&xrAY7y!c|aE?i2-Y4d^PtP&cEW8o=*$ z%-{<_xgc}dy+8mgh$wFZ76F1gq!UO<_2zgWC#4d_s>grIwL>e&he;x#hV_nqcd;7Y zM_ixqX`v5P|EM5uSz^jG4zgZ1=_YKUQ=eV6oi2DGORd7bdkw{8{8DZd_5SJa zU*36rj5u&vi@w-^bVVpJ^K$B!-tk+hx@_J*+veZsgOYePIr{h^&|_iB?FWvEWki}V zpp%0>RKa7j(G;MX`*4CT3e)P)dmQ>tcg{D8%Y|KkpqTgTzCCCE^u6dTMA9EJ|FRnN z11n3NMuzKhKtBVd{oqfTHb|^QBzPoX3D~{Yp#oKcx^%jv2-~M`GzAh|JrDy@I|FV_bous%%M)V1WF_B8Y{aWQY3X%NV2Hl{9CbU1Q6Vx} zNtTC@9xKw=eoYa(#Hnbm!6k@>^FnBzj2y?(Klk1EOb0rytL|DpveF)?vh=>{*0&Kv zcd&qo3ix``uN0{2E;(_Z6G1M@hoF+21ndfMvsciwP~v4OhLT2*d2exGJpxs|7f*fM zTpJqg6!>zjm%ZS2dGj$<%L=CYiXU&aJ4FQOVbCDToAz;2VP#DkOu3h$k{H7_n(w2> zgFTlpdJ|VL5gXE=tkNIgiN~VONa_~rih;;#xR{TM=}>WbLWbFRm6HvCjZ%M*09pvl zau6Bd+V}?Fu}~fbt)rR4f3a5BcGy?o(m_f}ZwIv~X7wd}a`fm>e8V~? z8_RyxIB18(qSV?yqhX~zBQlHi1JU!;Jcw)%#+?XG-8&gbyH>Ol44?nx_(2U*L{*6Pk=#>(8gNwIQ5Y=!0QW+Z#LC9N zEN3XWCIHH`J_WzSBYV$w)8hK&}MUpFm_U3G`_R#XBZSmPPd-8x%)|ZcK!3_)Ud6B z{H-1ckaA4AjBDqSh3*|&I9!InTEb&e?{4*23b;;3fsr>n%oET;cl^<^ha#b_ayD*! z$G0)lTUlddR!e?eo_ju{f5wT0spi|1zdZV=%3=^Wydj*c-?zc+8dINAo0=gp2__5H zOy%~AcaC&Q2;@G`BBvB}Q^cvJV}Ab~_VF2arJwh>pD+q-0V6*^ou{m@_rGRbh8Y+@ z=o8;>CiP<_s(0I=U61>eaLQd7tq!$hYRLky(OQ1}YM8jIh2dy_b{P+~Ta%*GHwh1r z{GBxR=e+=&YQi90G*{0!R8iZddrZm`nrPt!s0LZ=GAVImI-JzYEd zr(ij`ZoZNpkwRdfKV^bNhE~BV%VEC)3*v#hv3OYhIK=nl${8na8nNr^3+v`EL1Iv~{!6)J;v)7lAmJWZQ4cdH5>r_=b zsmMmU=U*N0u)JiPbfi_KA0U*|N+ZoO&PW0+xO$+Wn@pyih*b7kJ}1 zJqzRjj;CB>TanMp6>-~kR14dIS8~)8)a$Qsk?v*52HsZD^hXzGL1e>xVVIV|iX{hw zZ_Bb5N@VBxAL8KP#9d_4qy{UVvc`za5hb|G!BoMRUUlHa5aKZtI5j0td9gl{WmBk! 
z^}cK`zjHzej?G)`(%F1eAQo>9KslQAe&09TRW-QAz{+c%?^Xa`yG17jr zd9y=$e|BEr4w^u|I;_(ex%GM!ANgWsK}gpweudlSy+WPV){Z$AfcJ8h%gC;kh+><& zQF+@SS<|ZcHX6N@*Kl_GF}e41vmBJ)R9RHN&`&Ez8wRZ3OyKx~DeIQG7r!&oG*cam z(e%_p`n{vlFo9ZDmC+Y{>lqNe0E{_(8C~fE^XnQRCL;B0V0Hsng1J-i0s1{y0pw7#J*Je4J-=q_xR#t*z{K-}dCn50lh+5uYzoMDdfyM`wlXlxT2 z%x!75xo(r?M_k1XF^>cEG^7=rT;@r$5~s>%Uaa=TDOksv%;;xBr~NS^{R0cS*gYgT zA2op_`iPrdqX71-FB#j-z)S{UCh?;+#e#0Hau0I`<{@U{gp^t&z#EA{gTnH|DLlZ% z2jHK-0||LmiAh5&pZ?sapxtc~s zpdN!+rg|JO+nbzU=+LwWvJ@Za$dk8wk1K=s1);$L);J43fl7Oqpb)FDIKi6@qNJTF zA#c^Qq?eU6Q*#}oHyB~NBABAM5l?6RTPr}?$6-1&(dqT>PNKpKabi5aY7~PiFx(UY z5$rSyITdUjmcB80^_C_EbmFa)!`C=X>cPAUIavB<8(mr)(u5Zl5kWVY(=bL{QG^$# zv6rc6C!v+Bmhp(Y5U;}#1<)GfM@krl=W_uSB3or1=>HdN46j*V-t{56*rjr-8rjyn z1%UBI&{ggscK~g`4~>3gGd7DcU5a4x#bvqIFSHBSNdE24C82mT4AYjlO#~{yB%_Uh zL_!1&6Cs2>)G-j9PFK;`DiU2B(t@6 z0>j?X_jgV8`XuFXKB!&;{D=y?(}2C!O!Yms!hKFDeOJZ+4cK}8Z)BKIQd44$Rv<9M z*E9MGh=)X@rlc#Ibd9_do=$;2HP*yK&nG~*3ueCR@V>7A?;#=8_LjNuD+`Q^go;J~ z3Ur_h(DP|$F;@c@cCz!p=OM@rWXy61x@pl93|Grzn;WJia@MY z28(==u&d3Dm@imB9SQhgyK^eB^vXke)_Mce^9+)Rv{g^Zywse+jn_j(_j%dB1Hd@cp-WNur3zB4xm zK|T%)dfnGF2~Wzl!9rdr_;R2G{cqk9j{m5$pX8LUDE#DiN=}R%eIUGz-EWQm*OX@e zAnLQN6?%pXJ|#bi_h1xs7|?l0A!zp}E!odE)v-iP1V*0$M?@Lmd`ypkt3e`Wt2-Cn zJUplk5L3i}VqiYJT#%h@k#Bbh57=?tl*RZNIlrPQ4}c?@n#<5i9<<<2^mcqUf~}Y} zpD)>rfj3iKRM9NHrAk@L3;JE%MqpP*ZprH+nxUbw3+4W0vVk?d z_yx6pB0*E0{@#+AY#tPs1*FjbkEyeas;c|KJtZk1DBay4-KBIYA>Ae29QugTjdUyB zDc#*5-5k2RIdI@^-~YIG+%J4%4EAR2wdS19^ZZuR=Uf@2cHQA_M#7JDiuAmIo5X*& z2yqoC^H)8k;B)gm9)N$;;P^j6iT(zwMM@-KJUB;&Z{3fJ#b?!)`0EB#*!P%w#lbF> zo``kOcL8!Zl)|{yKY@ApW^j>|jG@PXW~>y7hASXa5W3*TIe60jqWjIl)Jmu3IjfCi zC%X^6C9!EjC)2BmgD>(`fnD#gH+1tOtHFb6fsbHvN>>{XH1>N1z~GRDL>wC z#Q;YV3~(0mIj)K1JQigFQ$>k4ly~S8eE*yq4(eX5T@14rM4dq2wCC_L)9rdl&pf1yyno~Sqh z0jYf8?QRtVkN8SLH*w$W+6G%Kx;1?MFq1;uKO<5cSiDl#yrzwziVe+^F*PxYZvjiB zV34VDzDJGG5p?VGsNIzy19M%{xf`8@@Q{GLS;K39IC4%8Z&M9?6Oq||?d&s1W?-m&iswee6LkIXZl3_bVpA;R>{7Gz*v%bi+(JbQd96D~y3n7fdcc 
zyH|3s41B`BjiH@~4n}_4@__pswyd2%Kjoe+>NA%@Do)TywUsON_qC)Gpfok(>fL)) z3&JU;aQt^h+{plMzXX+srlJZ>30PVjDQ9En$Du$LrQh`sP!#Ne;?hAK_}FgjUJl1P zkmPbD=E{NY-Vb_EC}MtI$@wqN+i8RJH^Et&mgiq;lR?rEhz^B)+Rd7_@faotWGh*O zz*0(DApG;gcNuV1qwcDoUPtbIERwV1amkIv%?D=EE2PY$V>rL&fDP>}<%hY*Z#_n6 zRx*I(`b|eaDN;8U4VSh^yPCSop^{?c(=|77|W@S!%L^>SAjb@*Rz!*AtUsTN_fvJ=&; z`Vi90)(R_9+j{J4)P;VtM|H{%r9r%y*XLEk{tc8BB9w6}cIjITAIR(RW#yp2 z%lVwpXl{YAVb_1d+%5-vPhnGlX1xv^9>Z1uZ3@T4Ar2b9*VdnMRC6T<(}~eAlW|*I z%O@hYKIV~4&YhNtK)sKJ>v9OL0Bj6m3V>Z+EnWwW13iNj^CN23&Mr=a#Wt^jSJ)1Lz{F4 zc97P|zHCiUlq{J1+C`Qivp0Ol)GXan5~Ac*y_&ysN*n{@+D}wr7jhPq1x3;D$|gru zckYm`!iUX}8t_^6hPbx+@tbenZ87EZ7z(I}pYeN%FUbYJ_+hw9D@1xgB((`2~sge66?zijk<#(4<> zY@2EOgwN-3&Tm6rDQ2#{Jcv$`0IN$2=}j0g77WVxmv5S)bsmweI|Hh4*-j}}Zfw$G3wSm>U-4(yIwfdMh-CZb?(Q@Qh^oJWO&-xaxPKD{ zecTkvboZ|tsQp1AS(epi_(7U06X?>AlJCDR7Ma1WdeSpdSH$lE?(UmL`|hLJ%g${>o<~=;gEKlzV%^c$m70s z1Xn-)3#O2-NTRyxDRJeJsQp34iAt2$GRtwnuU|BU^-7+`Xqwh!ew2DyOFwy)d}^zw z88ki4O4t7!bm=9#y3Gryxw&i93y|-NG3;-bYqr?&*hNosI##n%z5^CnT)&ahVk<}o zAbJeYoejY%1>3=m&kuvu!kcT@=vd)5Mn;Pq{QhlO&m?L|_E8`$0ch#_dti zZU9XP_k6Qsrs!kfRTDXMt#lsFNM)^+0fzTnCd__X0l#9<+?ZncDYm8Gk=)5S)9hIX z#Ph>Fc^=LVJU08P#}(zSsZ4mi#<@LW590Y=d$l(V->8M?`tV4SOh`EDG`T@PJh^=N z_qKWzjH}rZdFR7-YhWR@6D{%QKGj)mRpvr27iWtvV$t%76jA4*U0?4!b#BDjDd_7G z2GNj_AdL!jv%1J=`@06@PR5B=2HEm(?d4_#O92r7`@3=&>qb% zUHI$KsDep@)o?P4`a-z_F#{I5%i_aa>Z_jHE_~t_XH$b?ZRg7Z4ii3s7n`oD%Cxg* z1LwsnGFGpe=Z+}y3h9yynmy0$V+Wz}G+Pvo-ZIs`Ko;=v4RPTy*vM z3Zh2(B{i902Y+6EG{~6cz@3i(w4M4=djAEPuO~@6ew)ZqKu=Ox7TA^-An5CXJhQObWL_o3(s5Z zcrThhR@vx_xb;`aCj75noe0&~ob7sojYM$yJyv++xl@$&`NaG#))llYwOT#jXEyEi z;QhBtf}(&=?S}-p$|mFm;9?ZNE%dpBz%u6fI2Qap<}nM)t)tI4PXGHlqaN~_OZ!pg zfTq_;b3s*sTr8b=bnYr-!Xe|&;b!s3-Dz;%__d>$R&vy$PZuqL=-#Tw^~HNpV7n?7 zY_>gih|WU5_2Bbzq<`HK$5vY~zWB$Lj3p*t0n+N99@b-};Yb8ccAF4}IlN=p`lFcl zTqav`@f`15nlze9Yts4>y$>bsT+LU^t8m57@TW4`>Q9}fyFUy4+=T3O7|;N&$81w@ z)73XJ_N}cO~7E7T9`44;q&`R^#_+uhVCVwTGVPiXts9J^DgY>H_H%Num>b1i5*|-gvR2Q zh9rBz73V=j>-p2q7`tQh-$%CRL|Z3?`Wi)b#C-~_k*StM|iAH;9*MJb8UBj 
z72Ct8g;{-z9Vv@PS(WJYz3C}+mx;a$`z|Z#(!Kxt;kYeBc9IZXqJGQz%%KzpofNd@ zf9pqxuN`!yGT0W|7`8QC+al=wTCV8kMQ3HK8hm7EE(txl-{_Hj8BG&xp~ zO!0@ijca+YFS&FwIC71)t|h0H+&f1RAF^3&faa{~RlrE{xX#$PEzwlggXy1m&7h1m z5=U;7Os^981y$-*JjG=$(ni1>&XJ2)Yv2 zj23&}f&{?`bIhA#CsA2`(H4iu^9}M)j9MDLfg2Qpl*cSKqZ4ST<7h*FxUHGOcB|Ti zj#*CIzlEcvAOKP{tEjya#I$T>o4ip-Ig#1I#ArZSMIYAcrecI>n%#O#*V@+7%r4xJ zl6|w&ZM`}SvQ_1K?Y`~IFRKcV*2Ba&*o8Zw8YPJDkGfmg#y~1Q+x(v$Qo;@OiQh}l z&3meydi^5;XsNYzKVOSL>~m*`nGK`(8@aol83ZEqs|jqgff=glYyo`r!V0hd7qFE| z?hH%tt|BoAUNS;rmHW1bqzgUs_&(^p3qhJQM*j;SUK%I6i|VKP5Dj{o|4Qm zpDv)Ay@4OWX#~bd09ptLddggMB{01(m5eL{AP`{mwDLZ=0T^)aP6IBdS{Fbp1T0|Z zNhh@1(Qg^Xfyuq;Ky)UZBEI1w-S-yR?4X3)eQpix|GBh*pB~h2a9M8ac;t`GtL>u1 z0+NNZv|Ep6l5*j5NnucBf2?_pvs4kndj@u= zFTJ3ez!`DwI2a3Pe{r4xYb0RGBJ31~G z8Psj6V*7&E!*YM~N{(jWhBh_J>rt9n~FaL15=>BSeQ#4x|x)|~26jB+$?rD+dfnDe< zCHB+h9gczx4S@1;y1XAEu;pj@QnT=D0)<@E#qN5LQ|`k_db`Vc=kKm;Y#W$OI(g-- zcD3iL;mJ#gR_o2uGG^MG<3j6(R!@g+gN;E&Yg5v=QL~BiSB1f&`HC1)0WP0tyY}}> zduA()+P*Ik(EUL81j_uke~SSyK-3mIt99riW-X@hFh+Q)Hp&*i zRQvt}+dD%P6wKDap~tgXswxY9)GXCXySQBlGh2n6T$-0(8bA(w*Vp0ezXE$CV3|25 z4*(W}0U*)?u8UAvz-rc+%?L#10kSf-RWCqg>W?6ZdA$e>BDm8wtO8wnJu+gnO+ej{ zm76ynAP^f(qrs#N4pP459Q7N}(?X$@%0w*l@4C_OEM8YDSQ4+#+&tv)9VuxBr)$4; zy#HFW^>VwxBfhTh} z=jBZWO_p+lYCe;kzO-7;_>?;hB)DMK+qoZ^ar%E)yCZ$tu-g}&o$ai&(#)_H9?Iu?mn>cVZ4$^ zD0G^SOm93*WN*8;BP5DcqtQ6-w%n-A`{ZzXLR6>CW=ygzgk7(#Om|g*MC({CojJ

C$H;grA)kTfh4v04S~PX!LR$po#(HZH{B}3tZHe$%StAU0EQ3 zXSo{b$K4Q8+p&6*F^?5Z8K0TRxF~_Cyb)T{*ws6wPMnbG8NA8&6w?&d!;C z;wD~z-spBcJz->aLrCmJaast$XJI)l3du~i1g;i^sG*%75u_L!>>xQRuq8V}0h`Xa z9>5VpeSiGhd8GfTT&K-WITN{#U|dJL)L)r4FNAFQ{slNC`n&7w0e7bmVil4wiT>b_QxZZ;u~?vRJ9!j z)kl+^l6<`3FG7C1Jj?NP%1OK zHnXMHJkzEBJDim$s~qpzMP5C7{nRP(Ee;+d&Jch$OxK3Q52z9`W@L$;=l#}MCfI{< zot>@0V;@lZgH(rI#Mw3irh>AK7BbA)ntu#+M ziEU0t5^-O7pXz`8@FizCHcDIYdj2;Ff5jdf08;->RD(DC4b^@3c+tAnzht^d=tb;t z^X<|Cs42DK91v3;tRPH-E;w%jI%#4BUH2<501{4QF%|8}O4GEHARQ|-`-L#Ueq|#4%pCGK6T#_nrycm9)ai&5r0`Aemx8@82%&jvf%sxka*}m!ekx<_F=5>Du~G~PTyXm5G7{| zInb8GC&dYYNkMn(&}Y%dAJ0c{DYw;snAC#iEM+&4C1n5BpWh?ndJLOwiMPY^dGpDLATpI}$w2F3bHH|~ z0PO`%d~}ZONPHEscBgTv)-M7B(vL1=2;ZHPF2l# z%~sRpa=l>GiqHD#e%%PgY`{Afmx+;L-!; zFTJY))yc|6j%%&e&L<_n=pK*#3$x_+z<$E7Cm$OrY3JR=K z2nmPecZ63FDOzOFQ9VW=!5lC+e$_zEsmUK;%T-NpeCX!LI{xxl7t?LDP-~@f1~VII zsx9QQ9M}CLr6UWDugWN|>f0d&%8%#yjUOtzqFFQqDiLq@!q2zlHGhBj+8*b; z)qk@;iC7+jVAEQzRY_h6P;J)TdqZ&m+KN%DCuiS`5{wow=%dP`V3A7f{~7~eUH_2Z z&2H9dzz{r+XE|;uvwu>D{G#DE;6(-$ATAWox^4kANA#=O+Njr5}l{n5a z6LiCg``l%qv_ICtF^_q+Ee_eYj|ov@OT0cg3*sAgyULZhMDB<454RB)9QD?_jgyK; zpDCLxdXh~1#20J)6`_#_!Z*h=>(YB>I5_I6q~fC-2y>uHQA=pO;gzTk10*j#=={BV^P%et$1@33Kg? 
zU&Jq1Nu@{hl1-Bk8JqlgffdOA0>r#bQJ?w?*};QcAPXwsaM2|sPW%r*1*E+m$Q*!> z>OYZ{28kk)Wc-ajqc6XQ`O95PzS5O8Kx$RV641?)TC4;PCn4pc`paYW-&AbTBi~d9 zclXCv0U6xOc_8}F~k5Jpp<*fF$wG7T3 zVU?S=16Ytoe^?(M*~|Rh+bxZlZc;FGBfU}~Gu0 z41s_*$Hxd~3mjMDP&ShqQpviEj!Sl8J%7L?lKDB>`^LY;HH2TCR<+u9c1p76G3s&C z=XK?S+X<8S%elsmjhE|Q`o+;i;;<$?+?ewR3&lj3=b)@&(#x}3;$i7rh284!#pB|u zsXP@DVSl>;OZckig@cd$91Q5gw<(;-&PM)RMR$4!+?E@aaC&R|50gi$F9QlM*gotT z!mG~h6Z{Ump1u4lxObjyH-{D&4@4r3VE&HMHZk%w=nj~JkNiQab2aqyz4BVC$=G`P znS)^m?qkdKQ`JUvyGII| z@qUMzz^d~Taift&3`^{}T`-zF^X9h^wjWf|!8-S?hEVRf)pO_FWTe0m zv~9Wa<5l#C2dlJKUhn8I9ZRv!m1QM=F1Lf!?a2&`x3?H*71IUjOXiE)FV=%QVqP-& z40NY4ZRUqEEgfmvA4c^jj08HiWOqF{%B8~12Ulb9-Syg`yQOE?Pms+A#kDi2f0Kg6 z*gF-R2B+b>-08vzo}2j{0OGvcg_`WvgW~u8X+-|Ivu>ul?xpeD<($)D>h($Tpp5ua zO_})t`XI`3z1>&+7T>r{i={u{t(C>;2!hdP|CPPpxh+G|1MG-mhl{tf>>R+T>1CCX(-QV%7LR}8NwSDD5J?##TG9I4XuCd}pXZLz!~sr0z(`@y`3=6`NFWCD%|mn& zYg&XIkLijk?@{1k;(gg{xvt%(Y)~WTd!fhcBZ|^l@A2al*;8N~V)Nx}hd{@^)O zu3ul|S1TkPo@nzoPxIYwqw8|N;i@6gZC|gQpi30YMLYRcIoVF2vofDzZy(r8?w?_K zg&*D`N%=N;ZS79>{05~*k8BB&4?yDC0twGn>+V(-uaLguv0X^ng-9VspR_eRYeX@3 zj6qKx3O`IZ;o{-cl8Q?{g^K&Eqg?gzDL=N&cX)`-PlStDc-<4mkP5jNp9KJ3CaZ}= zHlwb;N7e&$%@g5Tl?q%|9?h4t6~Z3G0XOLl4yOZUA5o_A<&%Z2$V*a)gfoR)FUK=m zQYiv|RPPuLw!u(t8RXtj9AL^O4%Aw5Khl3cuFM4k@RwOseC`HWD>}2_NcE2N$(<9G zcrX?WV{qyYjQA-++w11#I1|P9ois^5M;n06{RWC7Dq_Ddm<-VZ(R|Iqm%`}$pB{D2CLOd#9i2$sWNZK+g;ii#Sg zHwR;S{5#FYaEVL?D)g#29OnM8-J2ZgaApg3D-(<++ZTg;iSR_48^noG!l&1}RBtV< zOCk}G5%_eg9kgn9%Txcn)H~eFTzoXzU#L^4OEdR%?X0>`-y5bASW?Vx38O(?!+tH` zv>6O8<_Zf7>v+0{{XsjMv;F(WqfMqi5+T@@$9hFxLBTgy@rsIuUOD~hVJ%aSA)0jI zkC$y%_|IiKHA-&j??j|K8GuQ6doZp{uFG~;u#Yl+a`%;|XQq z>+nE?&kac=+>bMq_NCv83DqCl_1aS1dqq1Om#Qs9Q7v86a#ReO+|w*5?qIIv+Va)| zL$pyAM>8n)T}Ju<2zHyp zHvtf}Uv9vJj^Jj}RaaH3RI`-+7ZS1@ZTBqT3mL+Agp>c940z=*oaN1>>iTl9(US(H z1-11kS^4?-ELg?HS*=Vmwn<&LA@ynuhXXCqo&X%7Is7*9qFh2>0)S#F6 zm()m^{;BJsl}BF1$bt3rx7(f>8#USNoDqOK9Ov5&VRU&bd1lMI?mfi3Oukz#N>R`< zccnD3m+p?j9bEm4=`4O~uLVn0B2kZFePbFGFZ-i8N0Gf+i#VK(jBXY2M#tokkUTsK 
z4-sF&QLf3VNs4eU;~x5nF}>FAfrhG#wLD_%QwXoilkE3DTwfO!E_xm({hJuC5PZLD zrvauv!8nrjDr!_jUoUkUhGvTG!o~#y%0e967z$;C%uFo74o#_ z*cCi^4yXA4kVRVv&7myzYqIOf%1kE;?!AKyr5=ErL##%i**b<W zBg=b}Ka9H{L5|q1J|y^Rd?c2|$Ap_~u)}E%Eju{A93`O}-y;Hs3H^6?iI!j!H44?I zD^N&s3&?8+Q-Orj9~v=MVju!)w#-CMvZ>l|vAb$&c0TDZJ!wAr<2%wd=9}GM|Kzt4 z>l_r0-~B{XI1>v>dVVrMd(p5DxyZkAH4uBSG=$0AxYy{NCj@MbP_(0Z5q6&+o~A<0 z)BfE4tp%$-vo7Slju=*ota2cjFnEFyjKms#m1BHod|oLSVfCZ?oX$wVc4OD5;mu-! zS^7||7~^R&^YZJ`D!Xkke?=Sg3%>|_K)HU4Ki-{0e7_Re!X259aLZ0<(pt0ME( zjizeyekga|Ubn?z){#LqJ4F=SpQz5iQ#m}Bsn@UM{nFdziZ2yi^hds@Amom;gMu{S z4MmJN4UyNy?WNAm>e8?E&ilqjz;`8(_l;7trDG_YHKV-^+Oc#~_v=yeFIB4uX4DTC zd~O8pU6s{pSzQJK+5Ey1-|Mc(y)0|z$MJH4mO5|TOz6vHCD<Zq2VlvOsWxZ`#L&*2|o!D7o>D;}?MTTEXee3Lt z!Q_nHGw~RJw&x~(xyrPA*q`mLT9Kv|X2S<^aE$ha1-Jsrm=+!ui__=;&^q&IM?b&-bP)AAvkFx(=j_JqKN8(uC2(@{#CUpY0~K$g z^(TN}{IH)J6Uy-oCB+ahoyDB@@4D}6Rv~+U+GGdV76-Y4lg* zDYWwZ*bPRYN<_X73ZIHn6uM%$6$p{P z(&#t5a<-r0V&ocP2$QPvq1oejCF(*bE_=bZ+8vfJUy>|moyJA9cHK?p#f2l1S=sm8 z-CRU`E?KK4w-@8r_8*aJ%hn&eM1`(1_&k>8OFNJ+7p@-H4D>t@GJ9GQTy;1)9nyt@ zmUowav=eIiH=r+ko!!;EA(=OW9YEAyZub{zJzzp@5P#bP0tg}r8&n11mz6T~K7FF& z(dpJa)4|e}FpO4BA$Z(1|wPvBgP<~WR$&$7ys@#!u+zV&?kSJ#cKPVj#Vou1xK)HUfp>t5I zFidD0h|n$bz{tHpzy_dr@esddKJK!@5IE5>a{fyc% zVSX&B*ddb@49kRnF@?CSWtTUwe% zF_|r9>l32ir~#WP&>JXy=weCnM9_ybGIki+)<_BFOaN|HL3R7yzF#={dTyw%C0+frTLG-mAnt%eO>?rl_cqYbmk6*U{{yp;G zE|7YnCSa1pgo|`b%K@WTxE;ytUVj}FAYhZrr%l!*v5de&aDSk*cP4(~?QgHvb zH}JG^WU)s;hQjj)4V?2X0Bv@IN748t&V$WIzkvT9-<)w*#8|U4*2$_fav!0<>3p~- z!Hakh)vy@Aa0VxmR^jA-<_J>C7Rz*ZC(8<58GpiQ1lnITPgAsizTqT~ex&o#T-GNK9W!++g^JK|0Ede^Z1s7z)k1iF$++bpol z=sHic5I^y+Y4E#Qnw57PXAlFz>pBO6j6(@%gQem z5Vka%LkpX= zes}&^y9liW=Q@XEzH(UFeg$wxXfyVzJ=$p>5dU}XS8viiN3$MgOkAhuh94}XB9 zhPtazrTz5;2pdSi1^%$9qNPMIh7VxdFZ^S%jfzAMf=6+Ur7>~nCqRMBj=nvwq@rg7 z00W4;a9Ew>*wezNZ?CES|LM>!GYyVL-Bsnl{g7xL9|cAb@8wrS&dI)g%)Bl$>k)dq zU3R<(Z*$1x`O`z*{o|b~92XwGIh+Jn2L?5+hX?HsiCzv${uwF+W72!GZZCC!```5q zE)i)m>{2qCT+~~N*xC1P74!!wMx0Xim&W}WC3z9<*GLQSprb18ZqJK}nKPp>asfS! 
z?B8CAqsH(ibk_IAyzmX0MW5h@LiJZ9^}nO}sCe28CM8g()+~^sPQYALNP@CyY|1_z z6emI|GMbzUjCnQROREM%!&V-Kh=c&%8xf7oVxNQ`NPcaNe zA{NP=Q~WmMLIQRld@Cd_z3E7V9abzWRdM&{i~?m6!G5Q2mFz~{z)pv*XR9F*mqnO_ zP8oQa7J(_tt!Ulo8==pZ#zhWT5lNGXUOB9IX{2Yo^E|_1FLiRFE4jZO(L51qEi0?| zkYB>mZXn>Z0WytBRuu6+nqzgGYsKpaZbGhq!=Cg(sL}Vr=&LiGBJ@oBmZE&A#dTzK zh-P~?bxC$q5arH~?%x0Jow&W@T1lq93KkNVkFMQD6ZCO{CP=M*OA0H0IH^QCf3uoq z0HsY_jtRePD$s=ZsV+#-WSDk_DU;3PTg~|s&KypzlltF#q65stn2O56_Mck?QEYmK zQw)ORSwa{pRW-~T7*~G)yJ79!-vDvC>sGH?w*(0ql347YbPNpe6_at_zO@|jSM=;x zYH#HHruylDsqzl*EoR^G`0BDatQolxllUFi*+PXPm63Hj14YfK`)N>y1kw3w)zr=? zr>VnDMp2|y3^!}=nwKLf*pt#!8MRP4ky-0#@s8~04H1v&)U|5jfnsubC-jZePnW8- z2A}kXh8QJ}F9q}1Bo&ju=wAVqcrXdmwo2-B7F0!~kWdper%qc$cNunvZxKZlZ2w@( z!N1UoQ*RR`^zsB_Xt5vM5kbXTs`Yc7KWz)dYbABW`1ggM-hHG~fU@Y>HpVm+N&l+i z%ncEsu0m}Z8--5lUc>{?9Lo)4hS~%d{B(3w;VgZ-pXP$5hXGfVHSdW!JKn*c!Y3J# zmIavY(b2qess4r7NR6=Y;~8;a8xU_U^X-sR^1ta-GomMi`jIXm-CCsD)6za3ih)i^ zZ@xc=T4ED%1*#8zQo`<}k!%kgaKKXgS>xrq^erlsipYdsdb8vc=z39qxb=bZ_4z=Q zIynORJA?QrwpUo_W%Qc?Cn=s0@88ECBfq9llL|+X*B@Jj49_wn+UU2qC_D5QZxt3s z!3G;*Q=1F)RU|P8W6j;nua_NPqhnw$gDoNHh~9z%LMc?F$g6iP^>{=&$br|wzQ-XM z!6HA!6rdrM9FJJZ0#$dQ9XkEBmG(sRugqKAuXAbUO1C3QRVuiq8f;(GvIBYo_rAZL zCaG|~yH)mU={Udmo-gjOkUJyA%{nYWOQH7VvjC8bL%>1g%dSG?jqM{&?kvBr;%2_u z8K`Wn5-y-N_P$SfBW^;1w!m1MF!wdlKQ-2(^Zt3E#X$d(Lo>}jkwSFJ;CgN-w1;x{ z^?ZeX?3OCoi_h>vmspTndB8SO6XBaI;b<~JIg;}sX_SW+zoXy{dZK+Al43XC_AeuI zWX}TrY#}DOqgnqBJ+bVohhyx$(2kE_(xeQS=}bOi*I{+s1QEQjmXQi@bJ~iAVRi_ z1_%8e-Ow>HAC5-JSmSEXen9Zwp^>~Sjohua;^2eOk3CABSncjDW(#g??cVU^6yl$m zvD@HyuJ>`F1)`>5jH{*8N9cuNsH`JsbJ#6Y8|RLy4L2BK?#p$OZlfCERk_D{>Svhu zp>LoPkZc;4ytf=B%oM8nVl2}&AD|biNi`QNgx1T{k44mPMHuGJuEc#USAnq8i3=dM z5*D!Ft{7pgB0QJPoL=(hc|%h#JS4$5F$DX_y*a=o-{j11@wJCt+^#?5Z&_K?JP`*A zjc*2DQp%i`Me|Tb3?*;FZ6@|?;->(uT(ViQM2m_&pf&JW_vVcdu*=FyZyCZD;=zJ% z6YsccHKZTfNfCdF_;24AgH2Qiqh{cj3!9ks)cpCdO3gwlP9u(cLL)&XoX?yMjt_NS zrEXONmBjOq+6M1|)PiD$_~bScTwxPwt75?ZlJ7gk8H z_%jh{lf2*^E?eZXTjTv5!p~7ki1%`6D2|6JiMLdFO#?h 
zrclSfq`dE1O0?C9{UlRjl8@xpyT5lfFbh_d$MCsyzJNqDk`olB2sV@~?;hrt>S5bk zKiQcHMp*|~PRMv1lIpX&JCCHaS|}u9qPCXCb5X8iaUPpYa0n-#tW<#(lUYHgv5=)lmcQDk!&q~`Q6D1P(t@N-U~#^ z$G~6=!YlJ7>BU>%^Sg$SsB}^L>+DC54Zkqn!s;Gqb+vbH0b69*>I^ZaZ@doSue!6S zEK(dC_V%GjOBueN6!WGWx$V*UwUkp56|*%|jJhb(!e}tTA?CzHbUDfHM25hATXpc! zaCzzfzNBsw&I}sMG&}yt$9JL9=Xg#^kpb`Qgny@PYB2O92zp?A#T%&*{E{inJZv27 z*r?B^+bN(4^(-*6Qe3@`pY z=Tt9pl8DhLTQYAYb#-}?Q-$McIYPvrw`Z7^nVFL{tlWFms#l|G%Y z5Ym?l<2d#iTMfsP@Ves)t9KV<+8*lUEXl&dKdLlrcWLZR>e#dl~wvD%83nrUz*V_{c6kyV;=*N+V`#N4_CePT>ny9#+6l zmWgsq=J2XNa8*q5TRo~-yS1Yokw_E ztjBP}P0fy1WJEy$nR?FWK6askP)n=b;&y}X8}Aq>ZGQ5WE#D_&iW{{!*IlY7O0$2; zD;MT7>An>Z$ysxq1;Oe&Y<3EJ$i;U{M(GEue5pD79;Oef0@dgu5@x>sMd??~^%X!@Qq}DE&6Ae(FoI{(ux7sJqZgm=< zKA-y%VhF?`&21{~Wur&E+jGSMV&M#uFJABe4(QKNQ+{8EH3JzyIoxh^F|~ysDJM+= z(iVw*7`EmUS)x!T=jov%N`_iPILGlfMiO#Pxb|(jta>hUgE;ioc1kkkN{^`->-)q8 zU;uI4`^RP$UHH`oHp$r_KmrLeM0(DdmTLX7g@Z=IDHiJFmGW6aXS2fisgHnSmk=KA zl~*a3g@>eN7lDunC8;F~N1XHZ{-o*DcV{A!BKXg7a3PCN;njg`A=hNI8&yOgIl|&e zjIvQcnhH8X>ZZnc>3b3pP4A~(E<@&gRl%mNx5)D1Wt%ESaokK9i<1uGddZ` zs#%0Pcv^4k@Zpy$5&n@;Klotj%(%qubD|`n^6HXvCj;4&#Jii!}zq8 z=>vE=uIe#6l1sH6_KT?QQKBNE^lG09$2uH5+}!rpN`3Acnt@=DYgc3tS&O?q3})jo zOC(?ir*PYz`N*PKEMR?0%NJHFBg{0dA>YpLPY7OgR0>loh6FG{DzcULpYTfe~VSVQ!xae>0vo1+bVgGH##&8j8B$*E8dJOpJ5^aFqfz6E*fMXf=e;q z_4Az6h<5!OX%z)liWy$ljYtrLxIYVz%V z?hvnk*~#3*CwPY@Gc^;#5Jhqk@WTL087tQ;#}U$GFx-*A`TQ;6Wa7rD`dx^dr<8Y- zYl9$$I60s9c_BA}Mu|4wW0_-;pV!rG)EPd<-9=)7tMJ`vr^tCj`IJbN-GNK6ou%TqUw+pwm|?9(U)a7INc{J2ghzRb1apCbx;MWTRgg85w`cO_~zs zu~g!YySy7LlTq&&qUl%NQZ4kTRjt*a1&9w5D`Ur(Q4=Xz?YFy8RmTvQ^?UH$4}`b4 zn@@^~v(-b!h8c@y#;tH{DN_;@rSoO!I_lgr3#cLv`=#wO`p;xX*9@xF49#e6vdcHv3+*BdAcWY7Q2%uph zJkZeX8h85KS0l%E7r*IuApONhDKILOKvbOISz>lVyn@cZtT1WzhX&T$yZjZ`4`u89 zuV4E-+JzOquHHO5e-E&>&=bTn<#vkI1Xge|z*C|69&_yZQe^(>umx(*KX!+1_m1+HBi4ce6HccAeSY%uSnZx4E<0TpKr= z_gug4{eRf22XoHMNAt#UxSez{kb!bUPNBczB>r%DdbW1Z!n@+zeNH=ltu<^{ak>FE zpFGfJm;8*i48)<7An!S;;MYaPy7I=SiHus8kN&rdz(>#B%yESo1nd%N_g?Sw@DZ}_ 
zLe}pN<7%lSotyr(4_>0o$F~-?TvfbmlXbms*E>_uR1xt6Pfmt?yOr&bl_N_L?s{99Wwyk_>)7MIm(P(R;~7xfEtfF0 zL!e!$mA0U2!#koBO(D6NTo*Q4)~>1W~<4wj%y>69rUrlcA}UOzR{K;V)i4-8Va*dR6J>VU6i2j_kxQj9Q5s#nI0*_? z8K$ZEXXU#0FX!JZ5#Z4!TJLKRUq4Hg+&|F$nu<`&D2bq{lDh(pFnZY-dd;|g z;yPw5lTV3)PX`5tyDqf-Xtsi{Sc89f376}EyS{RLm@&rtd`KxB+r{Okwt%bbJ@549 z#l$Iubkw5R8n3EbcVam7@ZS9*2N4!E9k;i;t6xGQ`kh$+r}5LpQGcK5tAcin8896I%pu(3G@_G*YfgtlC) zA!%`&mq_(xByg5a{1G6dFTlzaa$*dez^{(@;rCu|!$0E66llQVKZang>UBQQzF?~= zKc;Rmzp@qhq%rEmA5HxIM!zaAUS+gFY9UE`&=<6yxjU>-GV!`;yl=`kjA591Y0$dD z0p0cOC8F2#80yr+NH+7N^1qw%F@^%#w}#b08RG>xPAIqG8d9>i885HII#1C>aCNz_H zBucJ3fe(YGdh@mri`viY^1zy9hem)6V-OFWABSzB*=E^gpg`9rJ@FXgRb>hPE)dY% zHj-d}@v8>UJm-zOLA$qhQJG*ahzRp)qA41dxYau_8g+0Q|$uJxE_}ds-j3 zAtRIk6nxEvvXPftE*&|3D=$X}_eGB?7^D)IY>^eK&6S%+XbQs1a|ldNH>g+FO<$ME zbIZq8e}IWQHl3Bv|7y0Y?yuAPH)OPgu)f^fZ?ynEk$g$TPz;KLloXd< znnePbI>E#l5ce~!&I(5mvOuJ;Z)MX)^hLENDV@@$)FC_`*2CKy26q;e%{u@^IO?O0 zsSXz0qeb^Ste<5+%Y%jvsrTyfc;~#IVW2tKDag4I?ZJ3t>-A|vFXR5c(1Pk{*rV2~ z5*l$w=)BgFRP$(Kx74K!D!1=hY_Pw`{~mq(v}VYMh}7ogwONz^9ST*hoJ&f8!(*Ne zI)AWIxLBBv=$?PMZ})pGm9C`FhN^!|7ndF2X)8S8Jvqt4gz}0!xjiP(dtB>^)sf$pJ-DMoY^ht(+EjtPfSC#+w6#l!UnKq)?{(snJhs~}hz-0xT*u21AIkTt^F~LWUWbc8Z!dK zF+d|tOhP-8$>%iI_SLeC1M-N!&~_u@8;1tb^WVn1>9;m6PO(yqYw!ylp$$fB9N?+d^GMuaIKF9yCg_eyqp5V? 
zZqde*;M1(?-y$!}^vQA~v^I|kc;-S+Vmesnu^&}qNyE}rxD^V0{)n$_K53j>PW6V( z_T!AM?MMekMl^C2c6U{s@2fs?;N8Szfqv;W!m-@9l{qsK8#S7i%x&WU=o1?L^Y4Ze5*W_jz~E zAbRd99dqFY{HAEz2f5I4gfbBlA~v?);b5dtt0T*1jk3pk+tsn()#*rqU~6Fehar~K z*wv~oGT3$+!fpRdi@Xd%n64*`n^ifpv@%)XlOkrqbMgOvjA2qA&~^;AL(!E55h)I~ z)+?kC!LNVm4}s0$b)+l(LHkOu3ZB^Fk=pxk;x_p}iG>#Qdhuv=3q|zsoR;}%HC&3g|Jw9&0T-fTosXkB#EGk+mCE?kHxG9rZz|DU- z>>+XP-s}|Lh}f-@eK7GuJM(=!g7n1bN58O)v$@z7W0D-C7D)E}f0Hc;Bqw%;f+3>hbE^2`tx93rte!=wh{nZGR@9XT9auv6u`i+k{BuhD-G0)(p{&!!; z9fs9*+sK?8zd^Wpw_kkOpZ)Ut&cAmOE<5|bE{Zy%o%N?%v3dEH9tN$%=e^4MY^Ckc zwJ&}Aw=(@VtNJ>y5_`M(Mn>cu0q7&p-&_DN3ISa@qjnWwjbi52EQSSuUzj;%&CXjU z++)MT)_{64Y}MRxyDxY)DFTIna$c`>xGK`Yeo~iZO93m&@cOL`jm@|{u%$DNvEHbG z(T_&3k-6tD-ovMIhMoCL-A2v$Y~BBj=O+^=f%}=g@&qzI|Lwim(L%X z71X88RX8#f!B1c7y8UhsR97*17uvpws0|qj1Q@4i>>l3q z0YDg;kT&6GbaND()tx=nJWl$K|rH z@-&P6w?MclL7m&T$bG9EQ;(PXvvYP#_(8nH?U+r~T$%QNOQ5+-Yx`R;eYQ08gx z(;t@pT+VEzfe;#^c!ux>dOHvYz%}LrEtaG400r^%v3mro9q0nn`PY<@aqj(tQdS42 z&7keS;~j2_n7>Bq@WezTc`ffGwH?&|*0c(&dTT^%!v1q0mbdZ+3G@(bSx0qI^e_4~Z0Q-_#s})miAH&;IZ<4uos>^*&X&scr!5`9*w*T? 
zy9xs4MhotayB$?sIWNaucnk=n&C9tm$nPoW)=>V`nXoB>bu(O+&y;u2Y1DORx5Q5l zX)F6RaK6iJKWR9&t6>OBv&y3=@4JMA#{ET4aW|rZ%1q$Aqf}DjfrBQQzd_oKmlUpa zJS^5Huz<#Oc)?3#cZ&6TVk-0TYsQC6d_F}^qNkq~kXZpYzYi|8 zOOWJIi7zN2=GXa}jTHXYO;10}SQ--gjwz=PD9#5wmAQAW%=Pnx|WT$A&U zTgFjdMb2zBfo?PRajR0Wir+~^Zk_jp@8P+A8YBp3XT)=gcBmIiOhR%`+&*9$za%I9 z^6_YYk)Fn{UmV=w@9+OyMhsm3lEyK~;K?Y-#RL5IfCKs{NbCN6SnqIGjU3N%cRHPM zyVvx|{q(y1@sY3epQES3WPnL7#?JCH06&`uP}fff%naeo3;Rk?IQ)$rxhbD0q2J$B zs$?uww^>)i?qYt8$WN*ofZAU!Qhz0UeHTrD5yS(z&;89)O zqGu)IJdON?8!;7^*qW0f+Z4lQqod-vOD6RvRyB@EyF$Vfslje@6f;`U3C6>Bv!}is zW{#S!la|(ppFd)CHA_ZZz?-oKNOol}DSvx$uH=3BqgOmfT=)x3wHJQ$=u5JWVa3{f zQo*OZZlx?#_9`tlU(i zDk-$3FNFs`fzIw4W4b5hH`mfKJ055T_cM*{!HUbv(cipv1(rO2{iSa?SUD>F<;Q#8 ztorUR!SDpl6xP^g7)kF_eDjUE^jdXBI|eq`NqNj{S~*EIOX-a1Cqo>9()!dF`N%{e zlA-Tizmnz$EJhD!FS+1Uy+3i%v6 zd=LXRX;921pAb2(NM6HD(!Y61StAc4f1@=ycz9H}3i9CWHB3?*&vE(m^q~ciI5~oo z$ybqxIA&g6&gIQ<#i%Q3+mXp;ayK9f_z{cw$#dyZ0Udnwmtfl(c;d|*jlAi zK%Fs$JUp^f<~7X7-$r$Hb%jcJ;4VGT-`}G)1Pl6Z7%8D_L~Mb?FNzY)Mp|HnwlP1- zKeVYrM3^z7oS*bCB61!dIF>Z0%)Qcek%Ta4_WqMqF?UG_8p|7tGfP!wcxk_$Q zvaIA-UBR_=beyDqWh%hWvpJ~N^eZ2Gk&O{vLM)~MrSLl!%H$NvdzZLgF7svbAkav+ zwGUIv{9!vVp6aewx6Ee_)T*cltMP4;s>ERG_S7Jp^>$qaf?oVOaWsbXKYMfitZ^?% zDNr4``3Ng{oCT|yu1L(F>3biiXvm1tJCeAJDhwN_H9C3DdO~GuIcbbd(opKWuxl7j ztjw&=?Ze+a_S*XY@mIiObnb7Fmu|&gq%lj`eeVI%$Zd>aPU_0b9Lg1Z&9_dmdEbPiGlZGYnsTKiC}8N zpF4IuP`95eTVS3|#Hg^%LH{Jp!l4db_cb>KLoi!=LM%GjO{i`wcimctKZpB-diqzp zwyv$4GMfgwXT1XMUkF*L3|i9V0l`nRxN^L?bl?B|cY)e4*b4DBZv1G_2VPY>5Q$$+$jY-}_MOgsKS!ift zqc_>?Pj)KLA=Xy!9XkU>bCPXH5sS9#Qqj+!tNFONNcWikFdv>xzF3$$uh`m^7U8=_ zuLaHSW0~BOjvS*DRY=-G=jIWoZrMQ_^KxbdV<~l~-tZ7QnFBm%6gss!10$$i0)*F< z@tMtbt0%W?FcD$v;ReZ+!FW_>?VRsdBc@EQO2rt&PM`aRV$1faB?qpT^F!lmyRTd~ z26To1ZoNjc{$IM?jGRMfcf(G)4|dhIPFFiWs#5&YkkLZhqdl=^`D__ZGDe9!!a|$& z@*e`}kweBDF~t7Tsl592C8O?`4%>b$B#>-g^{9FT$`PQ9Yx5qe{d(cl)n$%H;95oX|tr z*6xy>lAl6<@6hrLYv^b3=QP+V;3I~Y9x~7Qo7khVAE>*al{ME&Y0?s_$T@t&Y2SEl zL3*BCTt03*?P7tR=Jg>^kVyB7cBz$P`^rwkj(AeAwf|%Xf3_IDew&}s64p2SR6FvF 
zlyd;d8&Q=!0&|{`S6hYJ(CRMKvZ)=QB^6(SZi|ol_ zPQ#{Bj8s&kCqWibKN;0vB~jY-X$S=GCDDmuhvUXZOlxZ1^q2pN0}c4Lm3p5<(`@R#` z$ud-7fgW!J{5d}LNx8Kco+fJgtR=PLf zE*PzS`f{hFCVl$KQZp*SNIuQSdj#n|nS7Z(8Gf4n+dKarG0n4{GKJOKhWV@@&;%ho z(XlOCCWpG~IHGHaW2#94mkP1?PV$OCHSw%Tx_nijO=M-gy3Ikw)mN;?dqsvCtoV8A z31&-TYZ|NW;1uKk@+*l@#pM9 zoFhBh1fPxwxJE{4{Y9Tv&Uoj{0+{+`;7+F^@nWCay60x--HfzydFhSjKO5!E9}}%W z5B&VoP+u)YaTJxQ#i-9~PIX$7a4;AtexmX5xpZr%Sj$)$x32>BUbW61?;_arK?3XE9c08?D|NEv6lsH*i9x2HEy8jv&w>V$v8o^2b6OxD@ z{5AI}V=oekNZ({1LWa@C%BLaQve;4Aj8m1cw0`byoqAn)!^M`Nwi*uquo^**3qc2*cye z+;$iHKd*p+cM<EZ&84m0unfKgkjgJM*!Dj)yCoFG`6YaSMXi6h394bYh6>C zxyrKmyJA4*nN!{JItxY%Odsj|1$Y$x(uXdgjwKyoZBnsKAh#NO+|L@54e5vNF>VOF z6LPHb@`POeMB_B!QeQd7KiiDr@W&FW(Inbq)w^rxkPs3+U-W(eh3a^RngFT6c3tpb z=~w;!I=!Ea9iu}k?X*A9MEH|QPg*IgVA+6788Vk(y}CmP&S$UB-AwI?&|c6si;Q3y zA2ePI5P-Lvi+iuq*Fprm7+T=5n0b@u<_OW(1BQ6uVLS}b$j-suxZJok)Ffp$4~>!F zwZ6;4dZPWU=!f$s5@yhQGu4G^M(4>-6pv z2L()jQL>)OA6iyF0nhfW#|oa8vzwPEk~&xjiHL^!>!78!E$j)G!zq>;FuXQ@h+o(} z3I=HXq!FN4L&*TUd74pm39hf>sO2fF%r!9MK?hHLCn6Y2-B9$F+wUP0NzHQuZW-nH zfYPC)p>-wyZRO=&JH=BY<%QmiDv%65&4qeW3Uwa`5$I2dn(hUJEBT3uE@$9|qoX6h z=hkAsf`+8g!73CCiW2&EvIuk*%p3FgCu67J`}oX+9H0*_;^+@1PtfTiSEO5ZdZ~tk zlzKm}hJlK9shVMzhWb3r%m0o^gXx8qf8{Nveuv>jLRjkH|Eh(f#(>M?pb319ds%({ z<#1k7)-l9;oNvdoHk*l|>7|iUjZvTEBe*plj$I@6vri@+qX8Iz6vNnHdi^b$76R$4 zjA0ShWt!I{0v~PQ27grQJIF#6_+QHdADMKj`DaJ;KIN|N?o?$8X^cQ?=g#dQy6B%@ zUsGsdc@4VXF98A*IWgc7#os>)MHk_7-5t8z8SH(3J_T$O&0pp$fVLnjegHQPP=%!7 zEiEhryiQymp8EqNZ9j2&9L)lUqG-hvXOZ-v7~<9W>FK6=b#EhYr*V#)`nk8L!jgTf z&X5i@hfU*2X>e4ne+pIc8&bn*Q+Ir+eEN6HdUbQvjW?slsR~+P^bBJ4(GLjWLyaBl zfE|HBuor-HLx2AF$VNs6_)Gtb!E~q~!9|IUn>v4vcsOV(@!=DA%*2+Chbx9t6uBz& z7+4vySh;v29pxd+jsR+Zt0r^0fS{n)>GD5dK>qmgy4mAu@W|T{@Fviv3dl4BfExgG z;GZ|W_jU#&3+2q@GuzG2{;y9%F@a1poj^X?HEBIy3o}LU%Taw&>OD}%4(!EiEzfl= zBS=_k!d8aj6@PKEY|V@7!S#um7&iir0Y7(&a|cvuX@om)JgaM*`O$6BxniJrV^mdc zJ@4XUkh!S*`lpf?>M@-d!lcJjzH1;!i?B;xb2{w*tLk86mkh8=Cr%5hPU$4Ls|1!b ziBJ0M-0q$(yi}dQ-7){d29K>ykC$hbkNddugl5$z@h3|Eyk96vTZeHdnUiYEeys=g 
z0Z+299^od#kZe&uEQq5Y5V^$1*n9i?*rfT5n+4dlkXqkr$?50;9Sz``k&uu8ATA{M z_;Rm+R6-x%H~^FePax_D7~3<-4^wm*7-T+qfnW^a3uHhOm=(Hp`E_-5MyZ5jby#Zr zvMsK*H$VpzIVX5)QndU)ueE2nPAe`}lGnVLQ-o2>X`puUD>$kQk_a(@nqgXtIX83G z0eGF_dSc%$+m*^aved$}7lDUqPQ`@46HBQLrbmW6Hl6VSN1L~)4JS8S1F#Ww<@^|?+W;>;xjQ_@LAIr>Mz51%e!IWX`bkETS@kY}i zFVLEi;v~-6CqvuJmNEpav(aXYN3T?nIJ^^2ew70Q!%1Q`(WmXWwbfT{m;+tkPIVZQ?xJ0~ySt5%(p>8ousv5e z_lj;)a$LK~=C2 zMKBPC1ytRtXXO!GK#hlJF2N3{!hxW7Q@F#tL%wYZd>=S3Tr8Cz865 zQnuZv!_Vf234ckTCTdB@0%6~w_a=u{Vq)sN5b zx#J1Wj@nRX7Lvn7gY$)lPzL_eF)WGDfy%5nZaqV@0ggKeMUZk(W_*17K64*?+Y)e& z0xB>DQAWm-{YtLUkGDFB zEG(7+`x;ke+wZj}fsH6jK_JB-yng_XW#KC|64?<;QY)0x`I}z>t%hN##uA|WjCs@a zz^X1MkR#{Ur{mA#5Q^Lhq<)|^nu;Z>e$Kwfmz7RfX!m zIGm-|=^L97XaNv;3O)RBCs&{j&7%e^<_J9R!+55|f}YGAeUdDiO$N1 z(eX;>;i)iuq94GuqC}37tuyIfYjKQCSMkKGeE(~R*eAyI*NhMjRs2IvNy$p4oQ%z4 zcW+O=u!|5A)4WYUVX*TBfQsTL&t;<0&3w{L!Lnp~eK+tp`nIu5JN$*}V~JMidF$i7 z{KK2#^TzG*Z!%aDY5>=>eq78YY}jZCo}s!KJ{T$)!V~N*)MS9B(-qG)^%n0K>Ko4p zfjA0kR7;-D<+_;&ootA1aPyb5y1ZoxC6$dF3>1up4c26GW}d=)Rs!dwmCU7;8cY!S-h3i^!>ePIQM#fPEP6>FwYN* z@H@fEZXtu{dVqfBIMyJZu!bm`YL!NITGt8W^}p6%8!UimI>3>BJZGxS+bAa-ujP4& zHH|+`c5TVKUyPnQe+S&&v87;}&ra7U$3Dsbp;PA=&jXqbP@}NDwvv^n$*l7PD8_lE zyPk?AeSYHkuPzrnnmF;IH{H&b|EI~43&E+@!I<{IgCd7C`3<0A#t4943&6J7pRZbe ztc3iY9AAXM%542UG-51D#hNEMmNl%m>-s>xZ**|X*q93pcRx?Qzl>CW0e?%k*$UX* zD*=x?>0Hm}a=UEGFRuo*2L}r`H~c&!Yj%KC%Adz~I0J#N7}(#FB(#vMa*JJsH+M7le$+1m^PT;^pS-4#~vVbU~ z_~PN6U?3aN@I-mk_n(@P&#fD<<>TSa5gcCuU82!DHf$K#U#1*_-o>)#w_7=KBqlFvTZRzK8TkHu$6 z@O0hpf6ZuwJpALf;b<=O&DsUM;a@^+`On-q zP&|&Cjgr++SWq8^euITH-^ciP)vwad>9!!YC9=TRtBb)v(_RL?@P5xt+i@4qfjZjWH`bs_=83%-xfSHjg zXg=FNkC34ka5P_vOhwB?|Ht>2ew;Xf?UM~Xz!gD@ms1E^VGhQ7dN=EM+)4{nC0-4r zJEC~UP<&eJ6j-HDf+-T(pG&{}f(;}VttVahu^7Uo46pC_W5V}8(TuJRH!HU0|F_XE z`4HO&zV26k?+o~@HogO`s16+&f6ZnH%Y$F27J+v2Hmxm~DM}3ir&S%35ZT!TpsPDZ ztUk>osgU^$@JmtOKxMYD#*eoRjkG3>sJnd|3UHh{_~zE%$-JFTEC4lLS5B)}eFX-V zI#pQ=7mrwM290tR63ne`Kd@vR&7dE^exp^y_|cI6M|~_ zws2&r&Xi2pdms$)i-S<(Z=oBMOFC$-7=$=Jb#KylWptuBOb+#xW6V$^z>)Ynm*4$z 
zyB|P+TA0QNW1cL&3_jofo7K|vrP<8C7oqdO*? zBZ%x2ED{L0F^GtYqhN`Zjf>2L%m1AuHv7B?Sti#U09`3||@xa0Gq zLSe@`4_(RxHos4`x<`!bvf!(o1O^!y!}VU6{L#FbU@Ag->-^X|;i5BjX&~c^LyD$>afV7 zs`8*oKZOzwTE)>w(OSke2|c;_@|cytcWj% zZP^7k)9J?|mtv!AZzjUwK>0%k zSq8_b-bSDjS?R}-iN22|1>V25^y+;jY)%dlN0%$`{R(ML4Oy@5?@-j~WoKglS zTXEBAxn_jN`Rg?>3Ak0xpKUG%DC2_h)eN`Jg!zQ@J*E)@-u(oRG_IQ&H;yb zlgTH8KMMngX`>LeQTV(^yQL~bcnql+XxMN05jgx_E%j^Z!{ z0JMbpvN~Du%^iKLCQUvmJXT}t=0$bi3F;K?O}zX*BRy4oz7p}s4DM1zXgB$1(RY2p$ zh<)vQ$ZTudx@*6ufB#$DD%VR^Z>iFs5Jx%X?pniF#r&aE7}1m(q%reYC!QB12t6rw zjAfY4RfQHSP^7eMjDXFi%{R$-?FH38D$eEO3mTs6^)zVqT%X;T{i4I` z^ha;T+Osfq`CQbm(=-Cbd^~Ht)x*=y&x+;{VCrD!M|oB(QZLNtcNK*^$1Yg2pQOn=tz4Pgrs8FeEo#zsUa zq(??A~}P}!dk}HSkr(~C{a02AZ<)U*fb7Q@!9oy^t00@Ru7KYR{anK%@N|y z3sg#3&$;+bP6sL6|33L*d{K*`8xm9GS$XaE9i#&foV&EcM|2ZZ6mX8+G zW?9;@>d?>?YSuIRyh-}{-uByT@v=wIci)9vtASAH%h6bty9mqQG@I!O>X>i5q+I6P zw9ms;g+N&3b-4P&7OMz6yYH4tU8p424M+%?b^xyhh=-j6)zf-}3>Bgo0qY+o6z8 z)jH!dl3^VYTOv$L=@|u%LsvXSl(zkBB-fIBf%Bdo_ z-anAJZuEp25vMi}c40s+5!90@@0WocP*NUThqQ@VKc@GCcmCYT%aRjXH$E_D3u7zi z(Ca2l&dgmB*7}yENeAVe8+lxaU6ZvFENNlCbhq+_mfHtT*!Egiuozl}_4N^2f1Ch- z5CZ{7iK1T|h~w`fXr7*gg9>O5-`Bz(?j5$Gv|&smjP#_rG%?yU9ES-j&w=>-nELrf zXlnL!2uNeSrV-@!^moc&n z_l9Lk1ZxpKc+o$G8{sC|hTOK(wC?j{1_v+;p>IFoFS7q&e+;RZvckvWG%+)yvIXPa zP;mu==rOA9ftsuuQ#auH($rjG5#N};Vhxn!Et;Xx(nNdvY=qJfPBq;r%7KE5zm&Tw zsbDxzVKTs}Sf@|HsmOYb#xxf+GHl6A*bp5>)%B|SyIByB$zvv&qj3j`hr*d3&~XQe z);VdW>xd1I`Cmes9RMi7vN>?OAI{0=!aZlAWk{5{w@jGv;tDKYN(~5^$E@@uDrQw5 zRSxd@mLaP|vG)lP$@0vReR8ciFt>z+&fK73pO z3a`NXR7`kJM~ypC`z^j^e`JNAJM_RnAT{F@zQOtrNa+oyTw(UWF(J9JU@#Ia;!ons zAIMolfazH|R0xVAzEP9!*TmJBG(9It%F3V-Fvv0*-*bt16O^%Y5xbNlUpP{u^+|4MhT)Hik9zaD`<*BFjaBzY4Xo7{ zm@P;s%rAJ~eA;|Q!fAmRBNfjk)zQfiSIBl<IY3SE^QiX|u5Ph`OaT?uVvatEX-}%9EvCCZ$)tuXK+( zIySq>kIqdUNKBzAy*_2oDa4OVc3vV+ZwdssbgH>YWhnrZHfRO3MJ`(G26?nkBJ1~? 
zepnNgq(Iel1PrihJ|6YwZZFLwI(qC5A*L(c<`G+yqdHc-eTB2LU zREWLK0j>oX+_Er_B{YzK=$X(fiG62kISj z94sZH!vogLr4cb&Vh@v)D&km_;4}hX4k0JTCXoumm2vwbP9V|LgK4Vv@V*822i3yM z_c55^qUFHo%gc`>E!Z*@f)@V&JhHM`Q+d_~4V_(bF^^LkkLR%yL5p9Jc72U?rhIFj ztFR~E|6>94$#CfI>4@u^PgZ9f9p^K<;yqwa_spc@r<+veW zV3|mpaptp7Sm~Sa!q`2COiv4snkfQt)aku{DeZy1oX@anWJkBI< zXhe$tQ?5#u>B?g8tG=ZTqJ2n$8^^~NFW?T;;{YmO{D{ghr*hU6bf651%*+}?wd6(6 ziIU}YeHbK*oO!g-74Lazv|1D+O%B51Em~!+t_S_nOUCR5O6Oyh<;0I zOx9GEtJ*zIp7hDpsxL(J*1Xe5@>_1Ag`pnx!Qfa^RS#T$d=u&Nm>=$!^o(YCMnMd3 z7RHp({uJg^bCHH6-fAV+;m$?AZLr;;+^HhrWD7|A zUHttw!xh$@eS{Epg=X@L1#Sp(54fl8t62`e(N;{-S6BzDtDj>EXnr$dOG56u6~I`O zZb1ABtR|LXoGQh22TQ#n2 z#1MUpKg0o_j-dmRIpU7v-c243@CoiFw1Fe~;Tc7edT=r;dr(hHfnew$Xe;`auKAYy z94G?3#D}GbEoD#dnr|GV08+tE6PR#N*M0Iq>(jG*4v+OQ+Dz|tdmj!HG%dlbQHs9; z!DHr{bSkyR%`LyVHa)C>z$?_>J`fFI-_GECx*>5%ATi-jdH7Lleg6Qtpk`XO?5^s! zzw_4Y9}Y{$-{1v7)y2jUN)GtPU3i>gi2h9gK84>p`4=d9x{;s{nYvz^8mc}3MQu@QVnXh}jMD&1&)MlHQ; zHMN67laEy*!|DN*1@i4hmNIdra`kwVpX`39=QG-_{9^nLef@_(3^6c4BIJ9FyaUyf z*h_LAarBcA*QZ&3??Matgj8mULR=@K2g9rwELlCB%Y6yvPmlh=a8!gw?WL{bpal88 z)}$%kiF(#_Az+mqrP<``a@n1LUzfo)_%;3h{rTl3ja0y53|+LQublK z_5GlLg38>$u$tQtqn1$-uf9E7++r#Y5IL@w=N^Z92*}!!{!~Kj9V1qmC~@*bp^ulw z4A((QH&d&l6I|LSQ#!0-r+=UDK(bK4{Tt^hm@v)(X?u!MQT`wn1~?Dg>gLmAuX)jC zHAbFIUB{5DhCxd>Oc(ZEN5d-)*a=APBOv{vS11(1h$p{^%>+__l6*mh7&;C3BT>Po z@PtJ*!l1bu7pG+MqTE<`a1Xc3AN zo~_iGt|>CnC7V-~C%femk6k*Qpyb}=EN*pJSLdYUb*jcGp0C(% zU--2=8edwPS$qy9{-moR;g(*H3Hp(j_=Q}x0V&*6PJpu7k;LTe@g;sMc-kepj~WRz z{T4-xg{XDB!DL@SI!MIZZ(ByQGd6a8@}CKuk>%XyrjfCsdx?NF*mb$f{#vVh*3{hf z{LSjkj~dCP-Ra0KnL(gZu5lOdJ2*-)i-G?vcPu#aOjLC%2i&P{)2+FujR=l`Nv0f9 zv1{YlNr4?f9h*Eq^argj?Eqa!_^8{1!On!eQ;if$*Z;5tJ?aeFYYe0>GE9$2ZrK51 zmbls{axBjBh*@%oH=sbD-Q4!9m|n}CKJxOJ!oQCt&A~euRIyPRHlkvXr=9Rh2f2_) zReete3x$fWDyar7CH;l8PdJ3sq0r7h!=)KeO%I$AkFIx7yP)bJNbo=+^{-7oQ;d^hoM*R5L!dbQ*Hn&}^^L_2q1soc+ zFJXe^kZQf9eiVL9R}Pddc<1h)Wg`6AyLFoDV=?)_cRd${R zWMp3@&Cy7DdvtFyL^ zU)jLmmoSi>aLNE56iQFCbT0NRZE_Z5%2NlIfhC%J`fAzYv9mcl^yWnalYI&Z%oa9r zk19n6sUxT2BSs?$9@$Qhw}=kZ1eMkSRzU2 
zwTh{Sr^cib^z3+Pz5tjAmgpE+p-(121nn}d#%()s_fcKp9-l8v74+MbA--GV{!9*^ zhe8r8=LAwGjq8>8)5=Q5j{hVadW{SDc?c2^hxIWPm`IW zt^=*=wHD1_GnibtdpL-;KIZUg`N9Ay`mCn72tY{XaD}(l1^!{3VF5TdbQwmZV6l*z zF!f88(Y~bodah4OO6yo_*Zrwhs+T1pz}j#XYh{J#BTRtIH0!CkD7pAxBbblD;1j@f^Mu=odl z6wato{DR4o%aa&l_|45sryG~1q;8a zaRGOP#hf@Yi%77t;gImSUzv#eOOUB!K)51RJBEH5g$zOZr+tY&J}~`Wgf%)ZQY@+; z27v=mzp_EfN4+y|qiiLhWeCqZp~q)e^Am>Jx}rXkpUjtl_gb!jIL+VK;1zk^XVB_AU07jMLBaE&pbh_b!yF&2W041d7GIlbMFN4m>u7)og*4%*CxS*@$aa95g!a zq{4t(t6!$`2?3G18ZFuVL*48S`w{{F3xKce(s2=L5E(4OrQ-Rs1kKo>ixT-~)>|17 z)Lk-b`F*6^5Dc+H9O2xYTL#c$DN*dB8GP)7SR1s#a_Q`N8 zfux7-x5=DLZ3R-2I=30r7?$vP8VcEyBtM`yr#Ls`J5bTHZr=J2`NW{Qsq-i2culC} zpP?x}sX5FB1@IW7SJy`lLB~+~DHMP5;0U}HSdM{-Hl5~mLO=jKUY~SbMVxMI@yYU= z$m4vS=1Q){Wvmrm2F&N*XFv;(ljk`Dla-s-BG9}mL;>lY=k2W58z5XmxswOd=QTn# z-$cubGPdR(=h*4Dy@0dN6OM^h@$vH;#EXuJ8Q4t0|2xN7O%=K##II5fQeTzv+4UI< zpqi#2wCdG?reTK?cY|}D2UZMR2$hD1tN=uzi8g^MppXfLD4IL7J(y{Hrv9pkCGrlr zAsQd3<$2=;$`66G=b~OHv!yTFwOYJU7zm3bHwY9{^Rz4|FoY5vE@Qu!^6T#u&Qyic zbdf@z2O>4~$dx)!^d%^{B{Py*-Uy0IN><5EJi-tJov4 z1B^cXP!oQA(h6PE|Hsrfhv(HTZ#Qb}G`4NKv2ELEY&EuRyRjNJwi`FL?KI!c`+nz~ z-*r8Iq}S7D?X}m;S~F|ro_nILte-`C#Nz9;MtWpbLFc5!_!_97MaV>egV^O_cw`6r zFSgX*G_FP})_wp+sz4lk^?VP41;6p;dF<(OK&6Mb-ImO)Uwtu z3}_%)TE-eWnL?!+-M0dI-JjE(bv3M+S`i_=LNEvjw5rVoego_W^g5>6M98tL=y1^w zXRD#nIIEP#sbFAW_+p>`nxphFDV@yTMFZ`vw&4fSUuv31W>KceFtvYrH}NWy#sR>* zo-pUXAu>l!u5)In>_725$Nr44u=7*5pHVVCIia#kDZcR<695?A>xZUp0KE&b?=Vq_ z=i=u0;LK#M?wD+Aa*DTjv!%M!SzG7v<;djYMn+YXAj-ICK(bQow$Wf={X7N$#PsmR zviG=cihdc`zG2x8IG;1r+x!IHn~Iq2YfjuJbS}*7@RIEg{iE1on{uO}cO4gwCHj#` z)C&61COOuY+j?csq7&{XhqUM z1t5_}OFq%1@=f)Mxe;u*G)`qhQqeQf^K;cj$y>76fqh_^jB*ApxPPX_cN&hl3T_B>| z-jB6Q=5(qSM}~!j_2~i+P5w^9zH*lnmnAu-A z(zV^6Qsj=Ep4yxF-eh;K{xL5#!Q=h8XbnHMNr)~}UdK~x2{touF|KM)GAh>90GnxZ zyU{+cL)zt4*-~UvRJx_S^iHqKZz^5LOd_~kHa?wUF0(wG6}hM}DyDbUzw&idtZt#; z?*io5s$rjwBY*eqO$+!6q(mO;vV>WrWK$>to|O`~DG4g21Dfa_&anXn*DBs1Oatx!d!Q}Zcpd6*Umrh&ONHYhg316{ 
zh8x^eJSNTVSVL+`;zl0DM22LxJUmaaLCiNI)>T~c1zH3IWK?W#O(H28^bC40Hfr{fLg#S~DPZ--Y(P#R2rF8wx1(ZnTGhZ35= z2xy$kWRZ=HzWmxsA1@zQKNFDE@d@TGO{BTqUY4N9rqTT5SAP+BY)=1SfPqP2Vsm}4 zk!K=FByD4)Zm7)>l}2Wd(s8xivggQ}Bq18O4aFZ{uCu3@*XR@&mta1{Rsy=F>BM%t zdySw}xXGP+#@0wqwz4i-FH(~jj?DjTUpd|s0 zWifI2{+-IrxLQ+kWFh~0MFseo=FSvc5-RWWM4@HMPn*9+#dr{5BT4)n`N*-jgreSs zw6hx;nc9Ahi5;mbS3x7+vJa3XPUAs|v2!aPE~Too7rroVO$KD>+w(o%>7N8vSvNcf zz-htNE09xw_(9{YaYOk1ErF@_vHHy-lu6_ii3F+=Gg?mMs1`##gx-O1b)lQmXSnf=^C0ku0d2)ra)DG@@Q*OeDTErDO?1*6Imd5q*TJy?1HK^t zsvfmC3)x0b|CDsKG2z*z*vYDd>)mQRA*Cr))3?FtqrFB5?W=z>OLFZ5N4I9{nv(8+pvvCc2aC7}ka|ilpAn7JXfK@d8V6ML8 zXz!983)YYVIt0^{BRLWkiXfMIyG~>ShFR00{tvQ@FAD}6E8x;X#a49V2qC0DtR)<; z_!5%}DYWcq8lCAc9DN=bwP>x))gsWW$lt7icH88m7=)PDCRQZWF=1NfoHL zcB-CDUm8NJ17aup1D8j7Q9Cnqu_`ji2P(5t`8>jNh)Nzu1RROk3bz;TcFo`O62J#=_*M@Pr& zYaP^_-{r~_bab+wovvqESQ7H`fEV2imudyq0HAE8b^uu2>DrMeP1%%YZaA}8LXXsZ z`+<76r?C&Y=Yx)&G3!g#vNeI9*aD5upNy3a-}36=myLE_tOK1@j+*e3WaByY5KHJG z+4)pp1bdx>1A-`m^D}5+mC#vSUKNE@NczWz+!N=mZ9!K%h$U}~qMMyXNc5ht z2P8TN*EC=+=$M(GJfICh`>|>`7W{akTyCwZrbK*_y{QzjNW8kGR0bu=n6H{dSbrdS z$ZGGJ{B}^2PP*=N!Y)VgVM4%^c;filLq<&)z#6cNZ&7%X*|R9-mQu#CU|wKf;~#Ly za&?kLn~0L#P1JORxX ziK{%lOGNqMm0HpMbWDAuQN(B9?)+%y4YXj2v=%6GXwsIzyGu`4HsM(M|D4wgf9fcx}@Dcq9 z>DtkG5XWjS3WB5TG zxJuugOk4N$gXXjv6{JbZzpLaeX-J!|-I^VzS(3(7JSxfCn1-u-=wt<_e0OQqW zEue8NS2ZFM>0swG39oOZqaqOAR5OWn>+r(-$8r4CmPM9$cjdQ+o&U-zy=+`n&_2Aj zoN4tE#V0TpmB2Yk{`}nQb+_)^sUHV~P@rUXioQ|L>d-)W7 zsMZ^x4}d;<6o7h40|G?8Sh@UbJi6<=Rtt}N`dp8J_sh>GhgwUt!vEK7qey4nFTK4} zs`WUvl@k(a;Ek#nCToBP_$L4JA`ktEp(R(;N)T|%$I|nwNlxL<1$Ty>IG?9}6MZ}o zSf|g=wd+eCKY5l>*{9BH5m}nYyn2?+ZJu{Gtpb%OK+FVY$hed$93C?J@c<++!+WlX z05Ov<5m3Wk{RMoBe)5%NY1kyu4fc^lhym?LPn@=#23Aq#_HEx{f4L)zsW-9pTl&Pv zMDx)-8?$8!%_i1a;_-o8RRSl#$Btt9l)j*tuh!2?#zB1V<@b_9qR3ro`6uJYk1*dw zO)^Taj*2pWupxOge9m3mRitm|$cc=_MoLXl_1WWOM#tEaF~bB{^4(eSyHq~V5&j)e z6_q)L9A#DMoZq3#w9(eMf{w z<6bQh2#^@H$OyB7M8G)gN#)tyNeMqUUMcO)cn@`v+*eUA?Dh*jQ=IYJ$k&D)ADQtD zcO)J21`WhIAuKn{R+o 
zWYD%l0-miNKH)R!g$T(344)0_T8^n0DqCx_B)wbhckNTJ<{7)EOL*JP+aEApo!|V@ zP%ONb@jxHoic$XZ=bS%F6@Lx9CKB8$^X6Oxc)8WwgNJ2O2sU&UT|uE5@q#R;hdtaw_joTQ+x+afYDdQ40UL>7^zdD55-q{~n@_}j?pJp}hj^b-OQ^%8y=2Kl(#^YN~)y0G7@)F9& zjntSb4$XPxu*K_+`T`={gNoEWDfscfZ7oJW`D0e7b+un}H?b_{#9K|s(CET`dq6lr z-Os6I`;m;9X4803%f~gMRq045+RlS&%#?FZ3@q{*!Dn)u{7!0>S~Cb|KTPB5zGD$` zkl0wU_#^sGVi1%tH_%or_zfD7o=~_tjw!{`3NnLb%CBMmriFNI_bXV?NzvZ#=-?ZA ztvd)B?7i?7=&NH73oz})`aVjR+Pg=~OP$1~=27*pj_s#WV?Sbi_f8p*mR^$mma$$= zJXqOHcju9FlEy+RKdlQx(BhnN+f?FM*pmz+9omX457)RNbaftsudG6y1*-;8;XM7; zW2H9-Mav(<;*2B#W@iNYYYZtVdbHsw(4GOf;VkRXMuJP^0o|m2QocISh<|)NLyaSP zFoN#wrN~Y_RD%%-0|t0ohW?YplK$WY@q(AkB51>6?%6S^dXK_{qb-}Ok@bp6xuDLu z;1WHOpJK1lh;U%r#DG@RXibL!Br%U?wn2&TevWd_l$I=`Vv4g0&MaBroLY2Qx0>wG zae|FVPM4j7VzQ`gl4!9NL1(98RZ@QN=7a&Pr)e(oKqh zO;e~*F<)ZDQQpV<0vj|HFh(X|c1zHCmB@oI4qza8J?Lb8x4o1&*_6&`gT-jb#r4Q5 zCx?>aQV_l)l6gANn+WMR&~cxjP$)ds*tX zw+#*W%G}DdqUtSMi2oq@g)Ko*UdA6>pw7PHF&&?P=63ewH-)wh=M=wq-Aqgs1avb>S7}O2$DQE2 zIw^6|6HRHM0ktn9y`~c~`H6vE^VcgClVhfdf4=}?`}{9iN6!hm9kL`Lib-LkznN)- zL>50+6C%xq=+>GTW>LtxSX}gWb*HzL3U)Fd$UPUe*C1^GxCG$}6eq5ceNLgAs$w&PyXQuto1dHZ`>};j+t=^;6}omocM?I;<=-l7LJx0)*NxrP){NjE4aF z*qfUh?ItVLI;}v`$|kW8SMhqI0T@UX%0FsAS;qLdj0i5DC(wkt7^v~7Qms@2LLL<= z<==;QAY%UA(#Z)fmZL0`?aulAVcloSDz6FA2zcD!r(9fIrMbSIH~SNb2?>NZ`Q@#b%$h+~#Cd;5X$QN5t5I{JA_?@*+7* zV41q~^R?c$t65bi^lF>5a-T{ue#2z1a|d94Y+t>1 z)M@~5AynhZ5>^WobKA?*GexfVci~8Mx`dVx-Ji|@vQ}2dw?}ia*^M=;%20UV1t|3y zd|u75{1#Y0mfvCh8~KG}^2 zfZ75|vd<$CN6qweY!5@1Z`Ah7Fh5K9gTE3IGJsz@omvQ>31b{e~qSb@K-K)dGJ{y(Cw zuFY=d(|WDHnHm=ll+}H%DqWeskpwuBY$lE${q-?``#6{S89_GmMP+k%pZ>mFyaUze zL`>tlU6$*!O_gf2UZ-_KXjwozQWT%~dDk(zmb}%mWuSEXY5-$r-EK0As}=8Pt`yOA z(T*ocv+mnOOy;4uEn7QVtG`8c0NZ z-lcNwSZn&Dbj^|AbD*kFbt0$-sCxzH2P!VbL+gPO-tk-k0(cIfG0v_(qbH8;VyS%r zWluJr9nyyTPBc)iyX=^P zc^9N;H_mT|&%^=#$F{dCGz@a-%%;orrmc)yzMDWnIu@VjEszS&&-x%O_`5O>)VX=w z?Em@J2j92h?lyIvH4uq$vDU@{@EZfg{1TueN0l{FZIP6#>tdr9OEA1QYp}|`>CApJ zv&l3QnImsz^6YbEiiOh?Pm4#esa4B4ervyFfKqpibNU>wbFMYHEYIU|Km|oF%;bBP 
zR)@@1T8IJ||JhvUtre1h#zNp=&UYM*^9f|qd2o{%Y|{7S zfxiGAIcOrw#dWYR&3p7&Jg^VX<~bf_02yY9Ty_M`+xzQ*K-XqMJ+hhP|>TTHmiK7zKj6l_2__#*wxZ9KveSfk1^n6mY(+rY|(uhbekD=>4 zGaOHn4_?0S*d6Ka@OWbWzSSsYodE=7k`{$!-j{O5T>`y!L`|a=!R^{_~{BhV+2l z%FS%Y^`g6`r-15eTOBxDkOTirX7f}^kosMx8}c}vsV2w?D)!1qat39(tU(B=D4FSj zCPjDjY!RVcCii@ABTqq!t@YOEm#;P2_JArRk3uyTK)G4lr-;#}jaPRd!%>x~AFh-E z%9d|SKXLaQyD?4v^t?M}-cM4N8>w2Q^|~J~&Gi(rpZncLR02SQ^v;PXagOqzFvYwk zwCk}4JEu;H<&7-JA2K_!Sj>2@`cakG6P$3$&3e5`aO|C3z=F(zY?yqK`JJ#z{#oiR zGGM8HZb#4nA+G&%T;FcOJWsA?z+QDfZbzO&BZHMU`My27f14!;MuXtGe>%ubt=YH> zolZq`q&0(!k7w-3n+AF;wg*YeSPraLYE{B()HdrmhN!TG`Syi7Br7Z+XMNHwn9 z?sD>Z2Pn(ma2UeF9CigNPbwR804eas8}dR2g$SeBwfb2>M;=Zn0Om;3V& zA=23Azmm(tZfc`S&5Pl}Twz!1G9j6KYebtN(+ODt>tfGTb*7WYO{jwd8!n5wH67P5 z>$DX@rU0202$UO1fWP=2)TE(M&7z*y#3tE7VLVe#jL~x5LbbLuPRHiJsXjyJZNl2X z^ViiDd+|r0tV{?KWX-lPQQ+rI{!M&7EgTFVP#Mouh}jhZ7^wlmb)e<|R|ut~WGJgX zjR2e>h(a#QeNL8RWc&$mu8{xSdV}M_IcWQKEGm0EsWyeyF*1M$bL6oz0j5ccsLx#q1?jeXP1igQSzg zcs2t}{l@S8!LI@}ogdOCtv){Ia~~-Z8c16$cD?&PG?Pv1;|XG&U33c3_Kqg+^EL~U zcUtRcsbRy{EAwrCC`=QQ-GBhIM-h3G4hhtg?)PIf9hgr>36th!3 zkNkIq%|IWrUaql}NGim7*y&2J;dPqC%o|%rW4=w1#uI+rS#bUy{E78zCr9GTgx(m- zWCn}VpiiKkI@xh1v=EUULK9V-@0%)pfBrZUkVh*0^=%jFvILjSZtZbF+x~91vb69K zISF-`gvhs>u_nm(7_uDq>+>%I_;eQE{Y&BSr#`sR8K7y0;U(Z|jN;tR$??GJrVtJm zQlJeY3}d{c0liz#7wcf_&`)S~s_SvdsdG0Fa5<+7L}AH(kD*+V);jJ(5=M_~-RrvTwGIKdv~@nO^{M@RoanMrvR;EE6j9uwZkeTO~%YJF@xdGy%? 
zLt<_~;kKVK8^kx5al+{LGo7_{r+R;t<}%o4xSpE0^>MG(Dw9Hq?8#XbV5V1V0cT#~I~+ju5he_97Vx;ne_89X z=OC99wI_!XGtYAP>`P1EaVg|Z>9k4l;m}EdI@%R76|N7Qt$1B@LS|#5u%|3jyUfr2 zR@eX2toy=a*KNF!@}=OC(vv#>VwWVASIMgMi)dGs35n*X{=%@M-^CTU-kbaH2w%yk z_}MYP&%~36Un;31=-ni+{6N+qM@$WQ&*XJEYhHVO%e?q`!`^{#G*bu}g3Et1&Z2<{ zp?tT6js%>TIrV_84Tsy2Bt!yr^t8ECSShVZvu8h3B-9kQ&G<6bpY0aTv4O&7EozHr zwCtd~4E1F&aSWi2dfiQw{|k7c?uh}?(4Va~$wC8y^#EbD`O@q^_qaeJe%1K%rH>ik zN99D~q$~Ix(eNl9%d1{&EZZMlWhsWBwVP>@@#HEq1|N&I0j~* ze{f2!@3Wgw{sKdG_9)Y{S5>Ob$d{+Do27I1@pE&X~Un%cEk?2}YDr_Z9lxAV2GciuzPPvGv_Sw5&WD zs>7SW;9gJ9>`?sZI5mO>+d(4Y`lTdi8Lo~DgJSlnpV!|?9Rg5iG`KiH3n&9m8!{ZC zA%yIHFya0Nh7j@_%85O{+(>nOv{YTJ;`5>A)f_-~tUCe(KiQY>(dz~5CW@K$Wz3yT>Y4h~?n|4<% z=&H9vZANb8(D{Ne8OYj`_i3oY!|GKsnM#B?bR#V~pw&FoMdo~kGOi~Lx{9Pyegh2A zQZ}RE_chTCvR(gDh|>Z{)Eq1mUNzf3mcT0HT_Vx|8&X7Fk7C!O0RxrO?E&WALu6{i zt3)7Du@612QSb8@o##)j6LUj1_A)MX#;jpn5o)+u!{e zxbiv|E~wTzFBr987V-2G4sGxAI}to-58Gnv52|V3vS!SO0UyXezULPhLgYZ*^lEG~ zlVsGH-(^mr3+zyU50hH;w2Jwa5ai!cA209mDzcONJPn<52V^c^6lFAz)o&EFR==rv zXE39_U9Fh9&V*w9FiKNLPGwmOz*HLf^9uSB(%IaxQ;b$U+r{d;ern1yZ>Ip_O?JiM z>a38SB)g3Y(fo_#?@E@j2%U7|S+I1s0PeMNa#LQ} z_2`l;H3`Hrz(DMMV)805gYb(>g=DUL(LU?3J2&*ZGgd_caD@*>VeQyO#RxGg(Z_&EA4)NQ7M9peGrZLj8j^EX{yt5!fmw)7O(RoG z+6RPa2He5RHAZGi0mnqa4DREG%L4LyRRR$taOpQ<82>sFdsR`%M3Zvk{EuI=&Z(Wz zGIj2;B3@gfwz3^>PIm5)+AVgv7n1Xef26FA*jO^l-3h=>ZFfm1GvWmmz$;X~1VV<} zj7L|~b)vZ844%XPRDZp0D9gZdb9pH4O=H5#>`KgFC>Ju>>1>JbLgIxSxof_S5BX~K zEDW{d7^0{WBGELqE(Q~P0Wt@8Z@VQEMQ{;bIOS!t!1C3%#0&irBt->G|1bMjYLBb+ z7exl>8EEB+xU&(NAHyN4O9uXRW^rDOnDAOC+^Ff);%QkDW@M`P1Y%P5G-XxjZ4%CGN#8@ULF?5{=7Uqq%s;+ zA1}1!)kBE$l!KJQ4Fr>&>5WIl0u+p~QJ|s(Gue&qqk0m=d@5?l!{bT%gMXBh7Ibkl zsRW=x7%duV|M|!nkd^FQ)}HFg_j!9mL}_LcQH|FOUID%g<5xyA zqOYVSGFo*;!IlV7q(FdD2Q)7F5(e2%Nq|~h900D6oFbu#E>nUdr9(iAKm`2wGtfFU z5Ob!uWkZ>b%Om#6CA;P-PI+UZM?@UwH6H@;hYBYc%6!>Oc)85xiTOc;XtR0>S zd{M=x&MZA^dhA8fl;GQ^0z@f+gDfXTEqJ5pzi%(yuh-JFe1tE=Blu#NGNIv(EX>Te z#e~r0a=pxHF7GIAY 
zRXHRu!JHvCmtq#dj)-ed3pfN~37DMUHi09M!I)8crn06hf~%n}GN2?bYC30klXJYL5eS8=YE?_j(OtTbkN5!7lS_>O`>LcH zdN6-U0f|EH>(~g9kZ;%6fukugq%Pp|y8r0xz|zpa>jzK^dhI2v6Uu$1X5I~dss`+_ z{%r)e<1W-syb;M51t}mFP|FX5IIQdI>pvp8m{2m9;E-TG2*a9f;Nen!5(6xPzdaYE*2UVlJZnRLG$YA4pg64x# z#%m)GB_Mv=h5+J&PA!K2MCT+xl|srC8<|t}H8{zd^;?CEWd5_B1OaloqU#P<^39y> z;&nia;1bRYRE)v%g->ZY%j z3JBoDx?I$d;I!XAVj42g;8en=!%d5m5@owucQcLPvGF>R6C~JEkl$@;vfJ*f1Xm+z z%02AR@reQ)iToj4A^#BVPlWsImDLLJJ``c{%Q#=C(wh`^#8IYyqd2Ws%%+A`T%Y~h zmV2S|grJ-T$%~P!8&>77NAtNY0oxPjD|>}5S-{VDKUujG1*EXGrr$z}Jo{fikTW1C zP)Z>H9FFioFcD%|4}ju9OgWPxF@byL(D*)68Ff&xh>LYsn!Zi%n5qfdm_|XnU1BWH z-G62O4yOpD?GR0YbxSvsti%jUAT)&<7E2LS62B_1e-CTE)&+FSVL^a4VyY3-DcdK; zsOqaG9!i)CXzP35?!?mQwb-;7Mr&Xt*7PbVf)dTUGNY7f;bW$9!K*xc3Z^{xPiIt) zHQI%mFQshJ%T`mb(rfRvX?((@*Pr>tur)`;3{N{zLhF&`vayeVjGxgiO}|wV35cHi zC^CB_kd%Ug4yd1xiok7DV5b&BK6)=yVeef_S}engIYNZf)xm)$ykW7Klq}7+l7Yi9nxPl}1rPVQAb}7l2*kaPcBi3*SeJ~{* z<1FY(@ehIdqCyqgs2Dim@}d`<(;!6^h_DF3iZA(@&m=W+pMdT;nHxti2xLt}8Trv$~PKq8s#yN8^v}bTzrd)}ieYYSL+@r+S zj^_VJhHwQ$^Emo2>;%c4PBfUS)t~bpc@-Fqt$9Oo!>iVqEV~zR&JHQ$>lzR>H}1Bc zbTj*%eWhejRi+ca_7+jK3AtZMHLpRp@WLB)Lqm%=woHi@kG#2M3ZiPf$FD1;(6)Wz zFUi9KJgX^iZfBIzdDQoDbPSzSrwj|maUR+ncb$%#5J8BxBWPMpmFHE2r8K*e8c885 zn~r#m7By~1#_EzkMib+!O~@~@yfR~|COW+jtS0AOJ^~X<-+FCYP6#}_hA$yugiujEU3L}S9CtxudsNLBik(kjQb-)6r6#8 zT>%ryOUYmuM|P^+FCC^${6v|o6L;_1IjKb9A7v8*q3AD84O^*9qnC?aOVtH&rQFXD zMU)N0Pwxxrkt9Ru{(M|Dwr~D(%tX-Og8rk5pQh7-0N;5HoY8%K?kwX)Tcm@@dUd_u zcx|-}2XlS_I40)ANV&buVzIN#?!eEgJ#wnV{-frJV~nbaIdrzJyqASW?8oX+>U%5PdrP+ ziv~_PZ_??)RQG#J+a9fPn{GLDM1dnM6;?CqFBK7+#4$7&=a6~u>7&Bu@%hv%*8B|c zhgO_+@u`-=RY)nPnlTHue+nip=tnQr`aL$3{O!i;x>O|^2gm7x`hIDLX$a_(RQamJ z5LbOYL;ymRt_Mak$v7;8HsjA@F+Z|z0L?Y{h2G7*o7hjQFQx*ye+){~XBc(_y~Y#TLwTkNEIZR*%4l*sf?Y#>{^FOuVP25oRovSr45Df&X)MY*9-?(briDi6aQ~z@Eud4)u&XBIh8iWFmT9YA>ulJ|6Y~fEY*+vVsF3 z?+_9vqW{hqSaB%OIbYP42UP6|^wB7&_YUqlULy zo7C`^&s<)ZcPQ}K8R}jm#52u&UO4G}hO{3cr=ahjk_#0SOz{8jype@AhmqssiR#V6 
z<#b8_;kn9>GvgB7JUP&4y{)Ha>;?Gdxmudqjg*#O&uzZcIg!{&e1me&v6(%o!!Gj3E0=tmZ?0|M~DDTX<)*9&y`N6j~_8@V~2cT$O12 ztto{O_2Es42sCIk$a#EZaag@8@DUHNtk%x{a(8D0@$qv){tvD7Ze+0rY{q{!mxJhL z^wHdReU?WbU*-BH2Wt=acr8%)05FXP)*g0DWGSwLXY36pVb6a6gm zze9h5223UrZsNbw|5xB5tO2F?Uy@n+3+2UsBv!tEDa|G7dePExvEAi9`qQu$XtV%# zHHCZi44}#@Fx^WJPtl@gXFE!F6#pHQlDG$}zObzBkox~<5a2s7Rw9riMl8X17KnAy zgdj#rEn6(gF@d3CnnogDgtKHu;Mk#czmxyGM+%2V9?OjRcTV6k{ys6FVnadyl@IU- zvJ9blXqmQtl@7gDW0V2|MGQtK12UVnSee&3z@V7-uMY94-L`2tpOQ|V({sBzaG3ma zd59CFMFtKO>FSZ6|I_bJ(1%cPhZNi{$ZwFx1(>(FJ>4{{zbpV7f9~Y43SS^BKGRvs zc|-foVg&Q=OO}!L%vdQd+c;0HF{|DB=rzu0RC3lDx#1I ztauxf&b|W?ebm-*ofRn(IcQ=~8BR%ori}(P+)wSIqM+Y_Q1q{xAWYUjA1Soq{DkMr zsm@Os#Qe&j#geH-Yv-J!|5=~B4rpMbd7zqmkSe5r_ZC20Dq0Xgpa4@DeNl40b?{w&e3r%(6{0X&+JKfL{Lhv;Od+`T5=;@Je3K z>mqbzrhy!l+e8J8Lrz~_BC>&fq?FekdKSzRTR5)NqjMm%jDmuakeE2unB+e8`si?S z^$Pl+WnQEW3ri;+gGNwCzEq(m**ii~Xv6;UD{#OZg4obsB|JQ3n{`rQl71}Y&gfvW zm)iwJo@XK_S=A;km!SjAa9I4jcgy7jrrEZ^%rtyX4mFtu;~=CeF8@W1X&_#0~K z?9Z&J@y5}JP130empz5z2(ZbE)q_z&vfp}Czg2(rh1yccTg57kj4a$btCP=8Zq@$K zP*kYVGx}YD@K6vKyX@D~MKd&8AX@O-3YL>2?yGF23YFQHZ)#goVJUR9XqJ}iqXOTi zQ?hIntM@08h9n4cplx(&=;A}LYKFXNpY^I`4Ols}oe$BbBsARqU2uJNJ-HBuX^^0NSJlZ)2f-e`vCH!V6Y0Cf+ zHo$%z3uISEcAfUeHJfcT4KziyDu{$(m@AH(=Dp(z{2HU;N0U#`2<>1DXl1)AGrZT&ec_49fpW9@e}HqR&$ zwx&E6Ylmgi`}?yq)rzGJ_dWz!5v{Jrf*G=-S8aQ%&!_?nKgI@SgPBKekAzgeZ3)rI zo$3(+3TwdrQ_ztnQ1qnD3@aIB!6WLxaew8opR1{2>GM(Cy^`go}e3O zi+XeK1D?J=5$Mq0-PqN-s~;o8`;Y}DptG)gcbIe8zeCP$x)tjt@16usoZ4fIXfams zm5X~vrp~!8?Y7!|Q*l%*epT-_>SH?*hXY$)!V3Ob2$UOi4uB8Gc;sL`D?EoxncGs5ZljufRPj?OD1 zBl26c9oQM$#qw^uKanNRYxh{u@wZ?wxLTIn?4wgSa7F~H^!vVdo{Wo!W}lvJHx59L z^50G@VmFFL2BrG4iUYkoJR`AKq&W5Oe|XU3)9M`0it@fCwFq_M(!lz4pbZI=BF%tj zuw$GoggKxN=stwAXp1*iI>-ET5)t-7wc5pyjMMxS2ECH&v1UOxncsT zER&pt0L%%qu~=Lg=+QBR?{zvOPT;~h?vkY}SGVSA{r&HSz@kIPY znU?lk2{b_6P9omq&G^}T6m6!flZ|EWPGqI9^lp@f?y=oZqIbh@wWeK85KW_V<<;X$ z;AwqB0|u?ew;~J1S%67_@l@{O3E;p-z~`kjhJk?r0*XyQp5%5@fqi%v=nS)@+vZrD zzYrF*2hiVg*evByBN$OaTn)t%h|d7Ac~#9LFiKdnu&+k|xKf#)GPBd`VKP@hfPq1N 
zkkIdl>-CrGW)BEwpy%0g;Bs~&{GQ!%J%poPmt7TFKe-B$_>3|Y1qI~&%frPGeJ?Bq zeG-C2INLnJW0e&V$?t>F?(b>3T`ZvV zer|`w;6A+%VyhfzwA`l{DNZdwmwfJKw~aVx8y=JX#)&fR>GNX6?|mJv`z89%DOcHd zvs&v^RV(3(uF~|kL|Ys#B~+dB_Nkw??R%N_RoX2pknlo5?pI`WCXx=qB!gbajDlZG z>>f7kmz(K8+H6;n{BKZ02}KPhFhCu7E_frmh@lXD27`YLi)2Eo+0FfBh5TOF?zZZ- zZy`soy+1RU9Jws;Nj#D>)LUx?ysW*xS6wD1Tv(h)6C7bHvfjPU_Qln-*ay{ZUt^`& zh1Jex;6V|8sa<{SZ1d~l1HzS;tM!gF;(o-MPHHY@A&kyg%611ey$ZRw!_C3!Zn=_< zk|Gq4>)9v#{nLd#OM*f!sjRg<^cz?ymK8ONf8Wc)PLlLDH%aWgg$fJK@Wx{iAvH?f z3?kuI;((Cz7gG9;X~(I}?c= z>o-A8fp#({=_E6Wo_arerU2pRx=5tv&5 zDuUaSMZ{cApv5u_xLx;~tA_<});yB}O_iXx4Z+5MWW!k1W%%UR>2MR2J1)C5PTZdt zT`x(?k5@Z;ij*StoF0b&vvgoYqvagADYPGBWzt$Om_!r6A?Jg_8aHkQu#3xPvL_dN zzg`U~Jk-LztEzPy6{R@60JuBkaP!Y?+P5CG0vjvce$a;J>G{~U%d8UhcG(NK?Aj3N z2l{@N?Q!q5Jm2zY4CP{^9pab{A9rkOwq27-L6S&>d0l6^W%g-J@%p{rnrJ>3ti@zs z8`L?12ew}(ESpou?E8W+mIja^ z&}lb%^4+JYaZh?AOA$OC8cz^-RVd&e8 zcjO651Kl~q{AtuIuAb3FzR00lh;CX3n|;sw4CW4DYkiBEiWnAv+icse_QOW1-+;$) z{=2~Ms<5wpcswbM4$4=wW^d1y{DQ+^Wah+lm)-Ae!-P=_saKEvsd$4S0zCno?hBgq zP!{65h&b~21q8(Py)}?Fa#=_cVM_Z93c*lE3Dgj}c#1r2uuw}~ud@TyhF~=Qo74Lf zYRe=_``-tX?=5sp==*jCzw9JX0L`yfh#q4XMAVq+Ud8R~!kRIB1|9|Qp~nGuWe)x^ zYWFSxcwoH6?*6vP9juPQ{Lxcu=4nyd@u|XKyFa{yUEtM!Kr)i!G0?Z)Q@UfsVmdFN z-v1ZSxMK!P-M$aV0{?Rz15&GNEe;=L9-N?wGw;A37<4&buKWvr1-@5FWtJsr z6xkW~7DXsrUEhaoe0m3(JxvvcW3T(|9*3LQCp=V(NGui~{!9`9(HnrfDo}AoHGTkS z&56-q9;S)YY+)vO0rZK)_Yh&VdgoyofUEW%fKBG(ZXp$weV{P_aE}mg1xaTHzYljs ztelc31zwD?yO|h@&0%Tb|6}j1qoVA(K44x^oyn z6v?3^q)S1%q)R{=Boq}85fSw}<8y~T-~0aiegC{`v2I!Su;x0~Is5GR?cd&eDxul) zVjGu=Fe?@me+|%ob^G8f{K}8*!gF`D>?_^-_U^Ov*5~>eZaS1)zDn>gLO)SCp<#_L z)SPvci$3=@9Bt-axG8f|v=gU5zec*I=US??O{Gqt9@E|&yYqVb!Uhw)#v5p+Lg(1e zH=B+s;N3qNOE%nj(c|~Ta;z|})>MM=q^?MMg$h=`;?Y}@N36>LA%@()!*ZZ*^hitZl-mcp}lMt*k{U)EeKTeJ7Y4pk#%5y($ zJ!15BcLJ>Y7cYG!=--2ops#Y5wQz~P%#L{B9eaiU`F0ipReAkase_b}y8{bg6Q#90 zZl3HeH64_E`t%?$z7H!eOTLQtg#3I1D?@*uGv;04C}s9mp>zp`y{;T{Gx^aE9Uhf_ ztrvL$FlUShRwi6fTa)?vV9KifsOb%(It;O3z_T{-@H;O|uI!uNaHKp5*Jnwt;aI!> zy}mn!{3c~7hCNIp*}%wXVh{d$)&N{$Ci;3 
z?bmYH)ewaRMB}+NG8ltIYj<%77+&S2@R1Sw9r{A9TlJ}^&(>ju}g`7<}pzvpV z!1DV0t&dlQ-19k}%Chu|1~j}Rn)ELh6oB_&_zRBCHRM+FiSYL-S;=_iiu zlh8!jb9trg>H;&?ik?pL8h`i~U3EB!fM#@IuX^u4Y`{`TaMk}$#-yxB8Oo9A^=%0T9c>s&8<3`}BWfU3l4?R0Bvd`oR zObgAnBg7Jm(se`A`Nq%BPNkzQRhldmNSEG^K6Lnq9S(8%csKEgE2x7gU9Z6Tc7r*( zAe_|yb#;2Px(zh9Sc)TbXQ5Paq)JXK|HI<5;^JbBZ(tq<8;i;k`FW}1%4Z)sF|uIr zm118Oj!Tw|2_TCrauIm=fv9m4jD|oPm5xf@eOu@9npPkoL%=LiB8==B%fM|%`r8#Z zdyF4sCJ zfhrOF@B>T|;P%mDBk+apPqm5OUL8!E$I_cTa}2L$`}SqIc?1IH7G@^&jXrX%xmj^5 z-@*0ycqnS?Dylt3)T_k%Mt_Bk2KJ&*UqXBz)fG(cm5+Q4ic(X_<$QhPR%>Ur7=tH& zHasIjrCk7$z?u)PM6`Dnd5?USW)76~bQw}v1UawK$?`X6dXWRO+7MRq-a4mh|W@Z7= zGoQu*i@XV}!Wnp#rGB>2C7t}44AZyyEj~#%`W|1YxOj_ttKKHg`*74D|4Rgat&PU` zrRdV@s~TBx0?NOov?-_+plE_MG!fK2@=Wf{JUU zs7o>(#qBy(-*xiWT@(tJ>e~*~g{*IuqJHL4uJ+2zc^h>*J)QIaq8!y7F7vZafxXOl zuI2_)9$>i{ zB3B*<_k!MguGJC znkSZC+>bdUs&J5bsoub@zQZnN>;8hptBTt<8P+QqSL^C72b$iUzOI`nh~blvF#51| zZRwjSsK|l7Bww{1emEd_xhQ{o?PrtKhuLcOCB(Wn!($)8If!kkZj^g%WoJ|24aW3X zOb7-;&Cq^{FdKhwFL)8Odt%Tb=qHyp7xtLj8A#TB5l7cEtJKP3JK&2fH*T&W>!NBE z+!&y`7x}?LYCyH~!B6_fieL;&f&E7HoQtn{gSq$F=CYsVCbwS{M>x%RU3Flh5dXp? 
zQuFzJ*_}RwUw(j|KV$ZdU8Rb=Mrwh!mF)$kxQ>lM1_mOi`S}n0CQ5O%56hBM1)XXh z+MVlyzu{X{*H8K3D?lJxaZ4hX*+O=5+{&BpvUdm3wg;P_TtFOc@0%mxOYe0xts9DR zDsDS*35tOlG12RR6%Tta3)_{VJe~Kg`F{^fR0$cjBb?vYP&iw2R?YF7r^5cd)0dpUh zNbcL-4evjVR@_M(+shWutM8e<88d&^_;N6+K=>@TxGd}8b!{F$($w@DWX3mz!;re6 zcr8R03e+<~$8R#s#+0C~X6`h>Y=kb*OOqzen{Qa3I_P;45|K2I0vYkz$f&TZz9%VHDk zOex*dJn(V`u%<}JNz8)Tr_5QH(geY7W_M}lE=34URuDwM(_ea74O4VMkrE0 z`sX7S5qU9A$aY`Lmsu;dFIeregyR72)?_pQJK{ReH;er z!D=Y6B>+8Z)ya|GyD=@FqkBR6^VGHK*H|^G9jT>@>qjP)XNtv0S+bIoql08sqm3N) zV9i*^>j4Rj4a-5bd7e>2ZmVn+14TNoxX&EOmdIS1pqiLWbKVg0na#~Pd-l!(_MWMC zsW+`f*HpEK(#7%|5w~RCnlD5Lx$kPH*j$^Ht}wCcEL@po^9*x7TnXd28|5F$?{p&5 zoSez?)bTo5mS%8pMC#8_(%5MR8y3QIl8z|thggYSW=2c#50f7zZ1^wnp(0~mj$D0p z{azZqB}rCD$<)&ez57OcXJ{Ojqyik(W1fiE=RPJIdxRyIrlJy($T+u#9V7|i&=Sd^ zlh7f91f7msRIl`P7?wz@m0El}AM`nmJbGiV@%4!VJ6o%r@3<1};l_zq=$rgFhyl-# z#ODQ1n0>TofJ1hU_mj=H4qCf456x@b!>)d;@>X@ZcuPF!C;7bvWAZ4QGt;cZs_iNS z$=~uKtjbKD9i8)82x(K&U+&P6#ISd-9E3Yj6){ivY_@`uC?SPqB0A&Xn?$}s$tfuo zaQm^l(0hr3y$p|Cm_!_>3REbL*KNLvN^rD*-l5qG%5)~o=vV3xK~luY#1rysA6XHe zq-!WD|Ehdlub!TH(xN-YQZyaD|ADs=dyi$gzxcJ;DQbYk8XVEC%DcgFhq=`*E@H_d zxG{HWcp=GqciaD@QH`Jd+&EX@4gqzR9{c6~AAbLxVr7MfcZtyI<|3Hi{qy_w4CE@*=V$Z_z$w(a#2{#nr`#g95qb(C5VQ1? 
zudk22-p(+KFTOxlQ~Un&yVr;J=`aSR<^>S;ErVOP8a_OFOGC7)-{juNEwUN%E$_Pn z>Cm!!R3w2F@nWjf3D-<vyluY@nYaU%EGt2?Y-#mLBs8y<@;8V&V2<*X+b{%_EkJXZ=X`O`|^7UpqYY(gg(_V z8zzWd7z-QwO1&7E`{FbqRkP8=b6BM1YHBW{8u>7Ez5I={4~Mss-oYYa5id`y+xNb@ zdk;8ZAH%J0T26nXf9z5?>d_$Wkf5#Hq`LZSF}lN5-9gy;rO97%3r&}`FVe8& z2Us2}DmdulvFkg|1I(P2rS|i4({vF%&E_%Jd$Uq{Jik~H8x6^YohSF;F`CrTBnhkY z(DQU)2{_%(=+?gm?Lsh}|FlQ;^V^-1%%nKG{zwW#S*=@=2~J>u@WG9_^V5)o4iuMMwj74|%F6ehpHuYmN=-KNvc zi0VyhENRYcrOT^H!{3O6Et*IJ_Ri4y2jc>Yb7OTe>2Kaz4ayR^q1Q_uvNBn0wfN1u zm&-+#mKGKUMLPTWzx-gb{UIF5u57Oq0RF&Js!RE2qh*Axz$+UCJeTk{qgZ^7ySrhM<>XKL|cQxULxMJZAC>2B|lj(hdmWU7}&CLp8NPp=e5$sspgQK>NlFNS;AUM z5GrkDUnBF&YV*Mv_f_K@bae z{WFB^h(hsqZg<<0q)csBzT@*N2)gC0F*2s!x=l_;$w!WV>A8oj2eLy6?nysT?VKF4 zkor=-*!;Aj_GkIhnQ);6vi&AwHY&)}8rdM301sS-08!)F)jbPRPYnDz(cPPE$TIc* zzxf{!1-vhb3%DFs3SP@w)zZ#qo?*n~oi!hnL{aC>rau`noKfde*@K7;J_v#;Gzw-Y zrdTcgSWn~L&FkFGc(9+|z8YK;d0uvc*k^Xz7nR0-=pgS34dA)Oqf*szBJQXqcAa+9 zME({T0wZMDrxhV|4xvBb3(Cqq1(Ox9XkNk((d;Q%2!-adRB%_5Orv||{T&^9xEPYo z+~tc1I<7i8KDT;5*;dq5z0?0wNyIHVKJ%QsTJsMcB{Wb-mYM_-gvLXeG0CcM{^{F% zs!>FpQn0{5i_4Nj-kVLA6=_}sh_(1PYZF5~$Fk*vT#vW&N;P;UGJ>7gl;R*aZhJN^ z*K#^6`d>7yJlzgFcfLgw{VXn1rd%|{u z<13%~TSfjIUK>Evy<%Q1-EhO9p=1QxjiD(<+ox<-CLT#f=-r}ULYdut$QX`7ocjSf zf4m78fADl(AYh~1E;PW&SvS-=rhho2I$%2MQHL6t=4A2pYi3=V74eZ(L4{018nLB` zWM9J_#n{>N>}Np^{0<*&z@p@~g#L-{V_iRloAiUOV~q0AHOA%^cA~5#&1eUoSml-o zjxLt(+#(XnbRfCgQug3psmZg6;j!(z^>eccgkz2~ilf}G29lG_uht7W%Ln4Nk|5tl zP3Z(NCQ=>q^|@ceOa9t|45N=rM-{g(>PTk>YqMt4;*S+m%wD+1LDnaVA20k!$cotX zyg3OLz{zkbsC<3MA-hWC+jo_{d^vubnLMTm*xq&_FPrP^rJu``xvU_`uoEf^xN)QF zqTt(RP=QS^7FGRP967lrAGV~keKOVTX}2v$zerpSyLl=Q*P^xdHZ1eApf2P0N4b7^ zy4W81J16?|9RJ}dgy>R^`B!-2He8kfeob)o&1(-oPKqCSH>FG^*3!qj+?4@)H(7QU z-n4;+kQHgEJsxA$mF1fUvlneUAY$&;A)O4)?ETWni~p-i%fIU^ptAE@D5 ze`bDTp=M~g__@#o`Mu9V3wW~6k$6I6`0Yb8+2_wc$6oAH;0%9oEwQJ%1WqOHzGkgG z@}vCLG*6Luan)J(qy6l~5W*-S<_6Q&w+T$I*jK`9?56LDuNSfD z)7LDEq9XG^Um-eF3Po-NK{dpX92|o|%1wasgS0?~2#~)%@c=_SZ!P z+qa#?Cjn!`tLpl`Fka5M4vE5&E|(-fL!w8-&GcXPGPvckr#R(z-(`d`RaT9l 
z?|^jk`?E%C`o#e9MU-JQ`4&5J+cN&WQVs6WoF_<#S%Ism1grX1ae^9fmmq13lNv3F7W z|0*r}S4`n5IMbkKRr}=M|M#2S0i4uJ5J~dnZJNfswKRgC# zc8Xb(z2^UXMcfsES!5OwEmQxWKMw+Xzd^^(_`kd%Gwx_q5{94rabo{GKG+Pr+t-M= z-v9a4z^fzHODb_omlcQ3l8`2CUGNc*=y23htJpYyK}je=)Sxog(xZp#U~cjHvtfQF zE)kJw%j#|N#B*0S;fb*W1F)FqhT9#2f4&`fse6fQCDlgP5m3^Qp1941s})baEmyoU zp{KnsZ@7JEbpNW%X(rs3KD}JD{Po!Wl=kuMYU_xl5OzPmbK38tj~!n8C{40^@Rw`y z7sr>S61peiVJsrXt3hDF?)L7n!mVt-^foZ8-07`P0q^sGkCjXxw_hJl**b}p?}%-u zYY4QQwv?SH-lV}a(yqR>JM6>Aj5vE1#pYbPdOL#QzxvcA8DuiW1@LCnS)v`O0PjG$DrwnFE-|f-T`=&o~ zUKTX)cxC<90<9slTU^#$nUp}&(Gk#=^*9z9ypPa5rl=RMRhoW#{r;oV*B`R3?iKYl zFMc|^6|sEL|M%++euv)x2j3Z+6Y(oL>4pc_`AohSyC((ON;`=jia!mQ9(n@Gxr*Gk zp?~Or@C4v*XNH+RU)R>wZfhC6ctRi9rM>c8W8~h)&tEo2vcA3D zzS;GG?A(6~9Rxg^EstiK5Y{b>A6J#@pL6f9G7rsNT56C;rrFsPHy+N_s`iP)=xz;Y zTz1#{sq1VHy~6v+{CgfC(E9z`qIGccTf1oFgy7;>fhVp#7KYZpo4xzKvQxPKfl*{+ zyTh=~?PjNlAyT&SI;Hz1f$zl1pPnsM#q~D){bwx}kFz|XY}3}}uVDJmk5$_#z=KYM zm9G8q_9}oR!0pRIr1x=2wH-0ZE`E3QKEV6I< zDGdK1_x<^hkS1IlQTgyu+TY~AKR?Hp0rGr6NUQZfzZy_6K-~YodG$_JmT^;|_`+KI zKbIJCj7+A_nYOI}OGJjis5qt^X9!m#e!9j9an#bDJ~neJcYNb0bnWS4z|jX|F;A0% zyLTC_QGnW+;O}DCuwI-s;G(m96Hu%`E+2_P^(eO*lK%0CXa;aXmPOn~l!(2g_Kks5 znUihyz#ze$FNc^XptcDOcb~i}o5>$8t$#N)K1zWsS8`~zAGv$Op}j_`mX~Sj12Bsb;D?l45a~GDUV~yy$hp| zegNX;YdTj&eD_w$%gdp6FWWbQeq3VrP7aAkX#+n`-=yI&X!`cy+3h;}@mjl4kkuu6 z>@zD2sNwCPi6CgST5I!?7V_it(`Xj6mbU}Q2qvAsn%kV^J?{V&1Kl`9L`0lTPmHRD zZC?Y1g!fhhVB{|>EPzLRoUbPxzNxDlJOs)uWg{>O#;VRwgcA;t%DJ z9DE;Vd;8)*f%P;bUBFDbx0#$lcmse~v5xpXPb0;>w#PuPuC37=3Yf#^x6{ctU7zpw zdcU8Si;u?N=}1ivKZ;2A8ssjPRSHdqC~?QuA*b7(`FxBlfk?n=CIEFB9-My&I_O=v z>Q5~SNFBS=Wf{AVA$LnbZ`I`Vjdrm#Lc;&D`yFuLE|6U!#c&H{1uTqzhNPcR}w5B63w#Knozht7q}@&d(p;w-dv) zjx-*Mx-d@B`dW>lbGZsZZOrh?6%yqa$Z0KDy#gnny&5=x$cgayiW|I#9f{|zh&oO| z374%epd|LlTSr5AGfV$CbCx#T-Ftdu>$0AVcfLv`>3l+n131{`Pa{_VIp?o1%PWOq*SOU%y&4zQ65Vl)TQnWjY8(8!dq*2!n4ota1x=}X8c)jmfZ(N_ zdZZ?_-hP~IPM{R<>q|t!X(W2dbLc5)%^6ZKL#~Zih)^z~g%!6O<@HP&oEciUzsm&J z;h$HkQCeL=LwtOq@H!Sj91I%EZogk-=85y&kHaVnZqr&5hu@2t)z_-zQj 
zoGcs;`R5-Ec9x-(LduMN2>wf!D(ATxA*)`N5E<{?Ss)UIwqyN5B#9DLFksk@9lxHO zh`KJ&lfX>gul2;KB*KUzf8bDpPG%y4L}mnp4M~7I z&F8*CN#9dBwBCXa#(*EqzHzxTYJ4Nl>UF&n9f9P`%nVPKR-qv21>%RVNbd2+1Ph#^ zsWUK~I@!WDgSo89)_GkYt2Ddc5Gx!#X`6IVpYO>1^Q~|oh#alp=g~O=&Ws~kJZsgP!+D7w| zP@@Y9*56K-snI7f>w}B6Cm$+XU0+|1p|OxTE6$RNJw+c&0kIKzVAGDGT#qc%o_*wW zo^(%fk{+dsqDS$=*nX^}?HyPR8I6n7~hnhJPl;2XZ6z9gwK=+WcHj~%u&-oaUd zi6s(MAvLRaYVLjW&U((w%zUslN^vC_h#B0Axmss>3q7VWaf)`Enn-f*65s}cM;Y)$ zt|7Zg=1q}euh^d6D#%_}X&mC8jZHOS(*3&^fZke7hX9Nl6^M!Uw~C`=S57{Y68Y!Z zg$5hJ;hX(@RqDl|!9;LbcU!=`hZKR%&@~zv{Md+^_FQ2_S!nWuT|lV^-SrIH0>1*e zAfe`x6@G?5skf(fgMG`b%~$>YX=}8ezHzcxXFUUw+PP35oPkYWdj0xQ8Ihz~s=j2v z5uago8RLqKsxB6-AF4Wm#cbrqvMry>Ec#oral*8U4}__+izk{qG}r<9{N6X(5qrSW z$63RH2%X81j2T?N@F!)-;%2Yb36tmx>e?Kmy(z*T3&6TF{bD>^sPqM z$81Wmz<9Gie|*Vq1iX>bxfjUqJ>T|L!#D~RpF9HuJ_K1m&70lbAsQtOmQW0Gh+Gn8 zgzDki=hjyTR}pN@ze+%k#Q^LSmw`D(8>j)+;A~tkURcaqKrt`7QNbrx6>lv`@NxM? zALR~UYDfLWBmnkcrs{@qQ{&=7X+50MQR8m4%m|vQ zW~TdXbvT6;kJD4`#@HKhXcQDw%bX>5&MpBBzGe$llHBa`9#BIGAgI-7ul=xUYI>RGOjQM{68{9Q${KoxBu;S#ZfWdfc_g3#T$+A|~efU$qA zNONbdwz$6jZn%&MnMfqwDdpD&&|tF)be^>z&tJU)jH=LOw4RFajPctxw%`Yt0}Zxd zjb(vS(0i9SNUbx{o^JTvtEVE$Kto-)aDnDCapV(nY6KaF6h{^!W~r`Sta;R)Uj6qw z7{ZkpEU2P)N|3x0 zk^^zf2tEY9O3@*;((6tX_Y0&rC0hf~-*bb(DR^Sv>BUyjWc(>W_B*+_^qpV3mKp@phH4@n(`ol)?JeZ0Vg}qg%k)aGVGE8Dw`BEPvodMZb zKQGpUnl8~}H){)YQ-j{&eeWVC!eY>&E2O^XKBnfZ%Dvd+z78C;EQEmRbI{BmwCul? 
ziaRPSHp2U#mP3o_7dR3o_8xK1A<5 z74m2^_#TB9qy~ISoahi-$38pmb{H8RPT&9&-Br{}9jBj&bFnTK#&|05C=EmWes*yD zB>CO=tx|$Z*eQ^w$s)|!PIxhkGQ zWnz!DzH=vu9WnPVY&QRH-HCyQH>hxIOoOgdEFadhwX8gH4X7gR6SAU}J49 z^w9(`rq?ONQ4ad*gGOr2+;j8u#O_cBjz4X2B<>g^S6+^Vwu?c4ZWQ+1)D&kIc99Sl zZ)d)Dg##tmP7wm6JFkrVew}#-!JRuFwI55`jASv8#E@sY;BzCP$&s370&1T$9A8yF z&M}^7q!-)QAdwu&k-n!Nk7i4QGKCuuIOYojZ$(GO-ox|hwVfq%c@geH>j5;;J!%K; zKNe)}3hte-Sm(qGmOU#7yxp|Li<_bn6n=$?p6tUW*aq3%HisAtknR!7D87TFU}E(`7fk63ozp>qV9e(W3}{Zu)4M`#%<5U=a2SF%wK3 zt{{xiT~dR3P#qVkqOh27BsuZCN9^yKoK;cwK>?hAtC*C4Ro)z2seabf$M-<74scmx zfWC`vNlh&{PY`ZIITFb>aq{_{-yTP!bGmi!iN(kzVVFe0$~D`L}nup6M-NIlu+utz3kVYG)Sjl zKm0iGZIvny?}dp=z>(#US8%;Npy@bG|RH($cj7Pe=^6?&kHM~!~AuupM|L~VU^}GoYoygEXI6Op#ojso-(6*o>mHiou!ze(8I z!a3qC&5_Hn4ix1|?=9Wn{#g)t0Xwp5%TEtHg1Sz{HfuiP6mV{?_G-XK( zTDcWya>@W{%X`*xD_{%*Ykrj56Dm&a+4*^Sd&CHF@qm@5b*5!_W>hcEz8sFwWEj-i zYSE(n4mOY@*MwXtZPh8qbejxlL&=q6lEp+(g^;I*Uq9J#V1#YdkzM+Q>O^tf#fUAA zQ?0-+Y%7j&1ONJV=8RxfnUNWFv=B{sJfQ-hUDxHtK*SDOnm_WYyi5z>3k!bqh>7nn zuRLZu;*9SE`atEo1z^fCS~h{8 zGQ0@&9#j2KI0sG-%8ENQxU7T;Tu{**#HJuo4`--vwuFioK z<%q3n)$0B53HH;pu?>!m_h4WPUuMl^g-9|mv}41mhrb*&ljGM&R&NrBs4b0_`1kZ! zpxeOLRIMbGW0``tcSY~j0e{VNdn^w$Z`=@KnQBQ-goIQn1T${|T_pn;+Wy|LPNX1K3c9ay%F1>_EE zlptg<=_!qc>*<#~dK5{rx*8f;otr6)`CXgi44rj!^e!nuhivdx9k_GD*;ko5OMu4& z;^im!QN6UtO0>j7az_ba^tq~wvbz|+q@htEnXmWI{TLYLGDyy7z z#V7I9?3|=mRc$SMR>GNiK@lxt3Nn0S6S7luvR|Y|rvS{|Ya$BV##J-{;dp4@^XJda zF^7d1Xdt^frA0tF3r9*C&#k%=wEkLIu_~(OMNi_n*P*t9y1y-J0Jki(K$YfbAoMw? 
z*as6Yx+2M)KoA2QFVG*~wXGU!C&wbi9xOwes7fbbiZqyMglDU($9!au zM=-^sRX;|Q-KDnrTi`>YLvcQxO=0)QUPL=U_@XB}x=O&!-LN`q)*B-dDm%-HcVE%F z0W@{rU21)VeAACd$9n_RW_Xl1MLpINu5o}Sb7(r;aq?GlQ?n+yw5KtMM#VXNS9kiv z9^BAsi8&6ZGGJI9)3cNV!q|T7rl!hGxk+3fUdG~uAezFMXAIqW9RC}NK}iFGJhr!k zF^d)v?Xfhx>HH>xkG{IE_flrl-mvX&3Y8UC!K(~$6FpFK0c{B+VRlgf&O=-yj0?we z6Bjcbx+E*>uD0+cTl!wC=|uL`fTh-eGZ|A)&1C4$knr?sH6aNXI;Z)61UIGS6?AWY z_@E78E{kc*ZEO_tX3L){yD89w9OUDgTgHfs<*Yi=TE_n?>ftb)YJlwu+lMO#FKooo z2`n|_@#yZNJJxu#LC>qZgG5`7` zNUg1{%?+#+M3$J)0}TZKPLyh7DsdAzkFwg1Cogv)2%Xz`xk+JuPQZmjBh_YYVy>?+ zEkyv_os9Du{L!=RzlM-=r-}jz^B8{g(q77K1Xw9oFB~IG;udv zIr!BRtoAq1n1=GwThJ>5tcm^5tJ@d1N3!xZ-|8S zj47?@QJKj|T^r00$hl;v{gRQbz=#092}!eF^cA(**x9m%~p@ds6nob})Q!Di!sR75Mii=f{53qXFa>1PGnV0Mm>4ow^ zz;*WQSpcucu~B6>kPxiG@x8A1j}WKSS#%LUH;AU``6du>vxqi4(laJsQF83)VgZB5RP9bUem)%t(9fVygMTG{&=F`dp+f2$}wZG-auAkpfrKwy@1QG+B%s-#J_3ir04V+v0s?eVm7E5z_wn zijV4rAkeX-rnc4ZQxWhU0L~a8&n9=<@BzEkE$R>uIKTadVy&rcLKbF(XD8xBc_6>c za4jg1st+cC>DA}Z?4lA55pD)-g_ht{1cW}R z%LzXRQ}p%q?gZwSQ*ZXh;s3Ka-AZ6{qMvzN`f;&T-T(Omy!ZYo%q~1Db^~#iNVAYs zu3aCxSKd!m0Pt0yWZcx*LQDVlztQHj8W-h#8KRCkfd74n#+V_f(Q@Z;5vBks?Y#H# z@v;5d0w`yC6Pe2ov*!?@=0oIvGpey(mGOU`YOunTQ#G3B5_ib?cbuXqkHdAs<9M!- zkPvJTaK=s6*u3mt!`0g`FNS%f*HJSt1Q!!`Hh{SI?(S>~X&6+T-Y!8;Jxknu_7;Bu>EBVpDhUojoc4$oN|(heHOanc z2Xb5%-Rm|WJm(X763+_WC0%|6Xmk+rg>dJZJ2}NwdM`cq8yHdk!J(1KvL3Di4iQA; z`o`!GQ^W}O>bQS3cO50m5P1@E8!0`m7pk{L5)!7YmXNonv{n=nODm8L-`Y$~@VYY< z-oQGM^S2g)m8>_9^$nH!~2;fJb*VhpalC= z4Go*dV%!X`0e8-CBPi5m8|Q#l=oKhqzjj?4NYb4UQg<%li)h}r{3nNC&Vf^pR8=11`8w9o5z?nbn#A7)B0t$Cc|3Qj zg)>m=E1sKVdu91^tfAyY8Cc7Oev=flE7Gqjr;rL)Pfc_oQAI5Nnyys5-T z2+O#qW-n|nM`KB7c(Ceev2MWMgA{DgeMz4oYf#PjM;k=r1WIvlU|^Lt1n5M5a8&_L zT*3$xIO$@no6{3|96$v?bS~RK$Vm3pXCRY2nj=$?oAeMidUs51wL_|!Rf;`#6lK!t zPjlD%mv3pT5$$djMh6|Og}>&D#5pC&I-M#pVK4=)$C3!Scw;I{1U?n3QSIqw5C6@DQKPYh_yZPil%;UlQ=6oH50p`>)2c6J(u z%LLHIkc*+DKc2W4Yn+z#b`+3vqw<}oI5bNRK#Wibu#E60tP`1kUq16_aE_7D{XyMI zD*yoiZqfsp`#~Vq1EvDFj7QN|=&FsuS#}Z0t8p-gNsV2esn;f;Gf0iWN}D1d0?QS$BQsQ|dw 
z3)K!augsnOk>*Ore*pPZ5M$G{WK>H8l>E8gMaWgWS9u%R~S=!M0A0I3p-X^H{CX+Eh#$aD(p zFAbP6zTAj6UUmDh^T zmd=5Mvy%92kQcq^98X3!02f?AgPv;8Yo%XNU`sV7fuq2UzS#P;X#L&0G9i$Y6;waw zTKyJ@L1fUbccR?StW!zG=p^W|QI7=PP#gX0Dz09@S(%@VF%imeLjaD0`ZOp#!xeA7 zL3ZU^s%gK3FkNW{!1^FhuVfGa(6TXQYr8H5Y~a@e5HVMmvkKqBDh1mGoIfBxMYB9` zc5wlmjF1xY5Y<6}n7$vXaiN++QqXM;?v603ACX}Q))oLMTFF)l0bTl$>14s!A=Usb zh1@hE|4-&K#4QR4)~c&ocWyi7EC4gWd`e#_&Z{EiEmD&)EJ4B!8;* zDM^4`Q34`;?_>jLi)DeB!`r~Ucm*~s&38V@W4k|Gfdb#v{-O96fa+l|2m&`qN2wf8 zJCJ(=Per<$>)XXjhCuoydx|}M@Lnv0J#zRv;x8)bC5(|rE2u~XzCWCAv^|TT#Y|NR}pU&X;Raul4OpN zl9J+DsSm%o38FN5A~P&*2691oqCSWd!G4Js>DK)N14GY)h?s~%OycD(bmgm5z>zb- zWWVx4p#~k>l_6;?*?$829?B+=t)$TIXC2Ow$>}QwZB+mfpbn&6v0^n%7lqBIl<@dr z?_Guk8~{JYnhbI_dlfi)7Zdw@r9C(LS(EnlzVHaW9-`cs!D4`On3oDb=5C^xZiH2S z1_mHzAejsQ(MvQ?*dLjM5Z)`{%N*v~G5pW2ty}@-=j80!IT!_WZx_rrDPQf10(~uB zb#$zPx+fBf8!;fC49@-mQwY#y9c(F&$sOMN?ys#nW{TZ0HN=7x<)Z-m9W%I~Y9znp6vKD>8d2LPHo&%UgjCzlPjC9`OUrs)^Yk1Y>!13Ee3{V_A)6?$ry$^r+Ak zXJ=>mSBM?0|Ex&%0edrXCmlg|Q38mvbo1oTUGeU=lMZCyocYfgQbJU3K_WJ`x0`H- z8MDy7(s$v@`EFmEP+gl?JJrb={+r|N({gVb2x2~U}<7GLnZ^{L0 zP%6hHl!%GagZw^#Ue@7Oh$GT1pgjBSVFi%K;g1k zZKt4I;Wp`MlH=?^GSgqe=gimI5H+_d|I>33Lxp(mjfWrm#E(orj z0MIiK`nIe$FEgUvbl*Me%?%LHSUtAAiw4RWAeRozVstl7=ws-_IyGIsd|1R`E&!^= zs-=R&C~D^x`_N!a9$2Kg5kxyn96#<5^kCR`0B$?6G4SclQ-ncw-$=pwvRu13L5Q0g z7z{Is3y+1*=1e^#$3I1zr(hldzeQg?Fk8wGVT9_9KYvv+J`hY42ijsujc1QsnHzku z?Im?d+Fy5&4QMPq#ak8qcD!>D0A9gO0b`r;KicK~h-6@t-@0c5486 zrsZco6cShvFMgU;5C@Qrgfd2&C}f2dF9S=kaQOwYb77k#iNaD42>eGvWf*@9Y9E#JM5n(1P!m@Qxyz#DR-! 
zmA9;`GO-tt2>Iy}5WIC4?UxA&52ejyZBZ-+c(6qE-CBSW6>1O*R0`KaR+C>TA#)3G>1za6^!&w_W=p0Vv>sl@ zQ7=HBru@sy!hXNpt)WOf#Q7>!Kq52)h%|*LAVHP`o+Ze3LG7q6O4R1@pNnE7Xu7+S zp%~1>m1l;L@FTh*bU&oiRx!9X9WRxfR#0v=x$HQk^G{;bvLNdDfGlg^>ODbiW z<|V+f>_rcBR8h>BDdGZZz}s^#qv%_oIypaW8ZaD6EErz}Z-g^`ZEf$u5!*ARP|z z2&pyzK1c4ZtpP8Gu3Y|2EucezG)aVBetC>1r#gnK+&SI-7go53OWpktN_nWOYz{nI zkmp*x--jb@ZjZ5I0x~}Qx_WR(Xe8d~(y?o@_BKdRAnO&A!UAd@KR!1%=lk({eNx<3 z1qq$d#B!&^jkAmKQz}WWSmE5>bGMAXH}_WEdq#Uh<0iw6nmctupL1lG9G?4TU*9^f z{5C=KtbO9vs1XgHGnOiK)4r}+&6BL;KmQq*Acm;QzO(mA#1|25NyHx>w_nGj{XTg* zC)D!g=!Gfw=_%S{qd&w9A*Wz}=N;CL;9-HD?5PhA0IKbQO|a`3cm# zHpPJ^#8agK`gZLx{hy$3L3d64 z;n5(d`lY9#X6(|q5Gq^324}Y=EBKyrU!kYTWua*wfH`#`5{$+TQ?dmWcG{P2Rg~>6 zE<;nHgf@rN1qnM4{-K|?`s@0I38@XqVVpYjM=ttS+W!wxe;L)*_qE}|El_BS6)456 zMT!+#+@%zEcXtTxE(MANTHFK0-Ca`LU4jR9chA|s|9Q^)@`b@5j6sr}z1E!by0674 zy^zW-5l77SS1KUcK3d!x3tu>hUt%5R^3o!sSxZ&a-Ib3cbb_o?2z(BGT z7h*cO87*)=Q}S)1^tk6&Lj9aZj8bl#{sO+ vH-GbBoJj5d~#|Go4RHZ#*d5cYoO z6~HPeW{J|h^I4R|;Fcp34gz>lN|*w0Sb78j3ps#@uvXQ1T-8$u5qtBI(at7w7hc3A z83_6hWPTC7XAp9_@O*eSxvs^TynPsJ74o@{XEJ&egRl~=D+40Q4L${ZzvuHG z2xIg@N8!ZH`Gyt5uR!)|pS#d&DcoMz=-b??S#hZ0B&qvmMMsI%)0;^(VQ2OQwj*T7 zM#dtnLZ93DqIatq&AqVTpa#sBC!HY+H$OSNYXz5?r23LvrWI97ftRPup(HGzAnMur z@Up9xZwqXgnX2}G;j#%Jz4FDC_I>DzDj`S~BF_BPDmr^| zE|({?^$a*ghI}sfV#J`I^Q=KekIEKHc;&YB!pT={_dYwh>WbVKp`(-|gBQ=!j<|hZ z%R#%LSgvv7g{%z-z#iP&^dJ*9=_qgT@?=xk%~JP?RC+Z*n^?kWkvavQTf>i#HwFl~ z%}w3HHOk<@0Adw`uL{6XCVIW@&n=?ri;dxNnshKB>)NxS-h{E3qaxs6M~_a;^tgbd zUcKayhg!MUPefu8*LZKM!}=NPqhDi+yb_fJujYRbGGle zST+O33#5Qok~_y%8b2ILHGq99{*RP z=%{88vFl(k&$j^%H6l6)up+$czyvKK4sNTip!D=(8AIQ^(-HDt5cSVSK3y!T_h?+zvZOfOz3e3d zj$$AD%aBt)fuHI)K+!MO6RMPB#b)pEa>oL;UsFCyJg$-EESfVZMnijhYIf>Uxl+hS z^D2JxIy-fh>n>r~XjUKMKWXe+2QA%R6pWSMEagZ7}X>!(dFaDd_bUgQt({+Hus5a&@UcuiH;x>*?6waXFrqaztt4G6ck%V|hV8j5U)+VSNgnZVH zpAS1I1m9m|QP%o!;xD?t$idt`Xr;3O;`iFvBz3~0YXzNi*3l+m&%;!J?j$%Z8A9kMfnvxtGS>Pln%TKp9bM%rMz z)$7*?5V8KG_J2iM;OWnR$8>as5~_M&D<*u3^A3`;YC>o6f4rg3_kWhW7xrO!Kt=xl 
z%@+Q*_kE3ogZuJ~C*t=It@!_M>hOQR`Tu`2dNP|^)G`8eRC1q#%h9`douDw6AOY7{%q%Y1W)4j31kny0I$ zzo?nXjA$O@x6AqU0}b2mU?+iTr3O_NOGQV+fT0^zlOCZ9{-s`|uN=96=jO&n4n`ya zmt8MLN|0r4OFnSsdlcIN>djO#J9QOG+PGpxV{cEC3+hCe1)Ga9l73 zvEzeBlW@roCs*CVMG)&gfNT3H>}IN9%#du1z3zk#8`64x_oXf!Tqd{gQbcq%)u+&4 zzv9;9n}p-MHIhK(_mlBM*djrH479?<@zGlo=)&2>=tVmhL-EB%TRM@GIX^7SCpB(*@ajL_MrxceXjhYhxyrSg4QCPLOKlmjQNPRFl#oMk zKFZhK8K#!FUJTw-n7j7SRcdVtuQUD6<$swV1#-TCkS`4xEA5_a#+{e-SZ(Z|P;ZWZ zGb%h|={;C+B(gl#`A#*i!DTyN2)A#ZM+}+tRzR6j1rpGEr{XML!65rw8N3%`npMX1 zpSa4j>g{i4f0VdN`G16eh(bSqoL5ol!{+nfhNxOi-Vm|p8(p}GgwYU|oGxAMB~!Iw z9WD2Iao8`Gos3sS&|(c{bu{V~+S&8!m8n#%Zh5Z&D9IGX=g-s5l~CGzeX9+BvYMM! zgH+G?>PrU1q%T|vwv@7z3Dml6` z+N{nU7K7rMl17`ZmB;M90aSMN_+rH6)v(-8GR%$qfy&GMn3v014?Z=N zk>}#pgo)rbUL_23)5}UvEYtNtbOd|)Gv(r0=1_e?vt4TU+ZY7xu<*)1`#zST->i2( zr}Eju+B1+PE*<&()J&q+S6XAUja)S!-Te`GCR+%?R$}BzC4nyF53M~%XqQ&e*UKslmS)RU6 zQ^+hQ%T?~3mErfG^`02_(XB%Opf`WwwHY|f({eIBJHI@4x+&Iz_Qq?)0Z1sj@A@TK znwzp#lls;lys!evrv$C36Ny0)F-k(2EU)$>mlpk=`<)Y>-f)}bs;{XDXRl+t@G&aD zZ~SEo0kOdpeu4Z5UrpAlLv&rY)?p4-|Et!IG?9gYjdRvXMHkRlD(D2vq}@_d|7{uG zp@_Z$x)GwU#P_JBDIIR3oX=I8xxm(uWQ^B&7ZzDyafQ^GYF~}aV~+(})tc#Dk!6on z1NH|l%bfYhS-;y`$H!mo7Z%U=Lz^jx4B}jWPk&1@d|pA7sGGi^x&)iq8PsT^Px%cu zLmz~aD)d_>=ICqIdm4HxES63Mv6yWjjgPncdCs5i-JgAri)FQ{nKN&^A@U>9>8`z6 z;(W`UA?jq_SVR{koR-78yfUFzt`5HUw}l(a>{e3%am+pJ-uW~`I{DAJLOem+$YGqN9lH0$2YqGC?I&|sq}6ozZYNDr-{6gAps zxICU-n&s@6MYcF;;l4SVTq<{{N_8bhb!3_u&Rq-%d${O&a4*yRCgEOw97&NrGF=fq z!3d5A*&jyeH|`o=Q45dNMVIl#kn?`PJNvxHQTcFNWO4&~I-Xc4yt=aHD%EewpZFfA zkuRUHeMu#;aNB;P*Xpy@Xdp3_uswr2-ZcIe_!uZ8U6*i43y4#qyPQ}1K{8=DwNEw>M{To^r3Il9&~Lqz-rSkJ-#$u{Du1cfQRc`6%(+m1T_O7FZJul*?)EOc z29d~E=8e-??+e=|R z#<0ZYXg!_?XLtt`>j>U-G9PCF5kmEdob{^FU0e4oNttr#x;nO}i3m&V(i6u;VFCv@ zeps4R@An82_7FAfORQr)&^SICk3#ZeOtq>q?%RHqe~tF4()73%xON)$$K@to{tS?V zKzYA!mp%sY`I-7dsZ7q`^lJ6WJDI_=Old=rB4vyh_~p9%usTpMC%9xon$^z~igu$n zt%=%uabUW$?RvWxmClN}@C%^Q>Vw7Q?56+bge^Mvu@zW9FEz>so8O^-*yA%@s-eTq zDA#9)jA)P9jX}VK<9U8p>umYadc6MkRb3C$*}y)=x>DsA4qN_IolGCr6UTFd=eT$+ 
zm1w1v4O((Y5{rJN_vXJ$w438q=D%0AfkW!f!S-MDzWlOOnr zZfnG#aCQuQzy)eCxzH}72u3n8Mx?Q}>aGp@?2TsL>+o5#cKov^aB+G(^|P^A4t7-J z%yDMFvcVlsxaPE9lh||YxjpNh@-Ep1o$@+fxIk(wXL~^;;?We>eY(pRUD43`89U`1 z#gxbI;@B=X4msP8Th5D!h5HS8WV?>HoS?a&4T5rRK@Sh83?v#YhRpE&{f_>`cdoO(*RB!j>p6CaxT0NOw6x7A)XH5*NWJSvsg!SIqml=Jr4mMyK^LDx zdy{PNE*KF@++dBNbGhHcSWqHAeMvg~$s#jd&&z##0CD=7N*elj!l)>%vckK5HcW(S=ozConu4c0XOZ)v_lPfP?#wD(vb@d@g z1?@m(5AyFMnIYrERqixl#|Xv9a#QH9eX z1b-?9GZYN{eq)vDuRw7!Yw_#3|GG!|Ok!l*Am4oWQZ4O84DM@MpbroA-A#n>`^{B~ zhkt1c-Db9g8g9qiB#|C)*_kCMarM*mu|1xvb440Q3xY=p3Sx;gxmh4A) zoQ#&`ph4S$nf2NS^>%co2^e&BYtX)wLLd|<99@1XgBC6M9gYs;yzF(XzUR9Fe4NG? zBQ{B4Hz%r1;sW@eZ_Y!)+RyF|4}IZnc^B(&v|Dz|1#z%spwL_*u&kt6f9JM^+QSm{ z7H1V6LC*q7i^$%!0~Q*DD3w#W=-=AwIF5w|503UW!h9GnHm3wc?%>Bw+#9Hk#uqPh zh4UZ3eO60#0g<}ke5Nf!68+H_Y&MV>hi4Q>HkNKXyU>BqEH>{fd^ws@C&B-1wIv zS~t3&e1*vJ{SB#Zxl71A`m-gh?CRRJg@V7)|15UY^)q1_@o`UN z(g9b++U3Vrk^HW{;^Rw1eQbJsE_96*MkSDq9x#{lzAd&w3^{W1oaSO8qsm$ZGGB}P zK2FryW_-XqkK0|rFzKXePZBGg&`Y8>nk~LN^~H|9&ry^d7D+l5G5WHx+*QT0hG|(! z%e1HALI_nzs(`n+}oSurH6-AF6BnmkXZMOlZjxxH6(2z z{?i-sg-E@I9lJCvThpX-phC)i;k1K|i(TS+b>1WMw08bs_~2nSDW^yJ!_0zWA?(tk zj_)=}gy@@@A~2-*LcpSv{*Lo0QJE9}rApEZvcG$g1qS@?Rc0W)#=laFG<`A)G2d-; zNFLTh?0J_NG@435K`7cn1F>`@f<}!CW|YYyEiSzXv}xeHoa5l*q)^-cmDxW({zqzm zOrBIT6%JtiOa3H5kB<*`Z@>S@F;x&1eMA>N!hgmly7}6ZSZDF4cmbH;%@u$`By)PQ zV@dE25`#WSnqe#}C`_%Z8-98Bv?t`le?mQF(Vtr}QtPx=+Sbxg2t2dTubnryjuW}} zG(Hah!^js0e?a0i+%zRgvllYPAP$@T^0<(p1A`s$hf2Hey;U=euFZ60X+Ai=s`@!t z_MiauSqs^ax~xRcnzB`SMH-zr0;#usYM)l|D7QdQw&$rZfC$VgM zL9&@2@%>%A&u`cKmbJUOJ}@YYawS5KfXjZm?dD1GuWSg7jr+#i?9}Aiw~iY<8pno` z$=G{S6N?WsV_@xl?7WJ-huyX1#LnD5;#=ZWER~UQIJZ-;<(_B?r_xumT;Sg(xE4a$7L2yKcHacSQg(zwb4*%(Kh> zXN(>S=qT;_o9$nzJvB-51WfB5XQX}#blB-KX|&7<7Nm&fT3_VhKfFH#zvI08S-vN? 
z5nP+O7I2$dxU7Yq&j>2Qw@Jc26wtI$sgMs=&q$@32+m~Bw!T1UmXf{AUERNDw?DS` z`E3o%gk{!7bTTB?I&vPApdWJ`M>|vShtwy(sj3z!CgdP81@ziwUy05vG#nDzmbolN z9yjfNW~&1&L7!)al5T@Pd+kE^D-g5_f6|5AuL zvg^JYtUc@PAcaJTkbD&@Kk)}4I4#U(QVk507?8hN4BhQ??!^_bD3_C}o}0}#nrjtR z?`M4K^E z){wH|E&~O84BY{MpQ`6lM?3%U_=Yr+DW14ov#7cN!IMAu`H+-n(2OnR2)))QtLb!_m^g`kgA^&yR>5C_q*@U8rkT>E>cQa*mZ^{ey@PHmd{rXF7?!SyC0Ph z1BXsF%8z)pT!Y5q=qR$SpS2H z1|Z)}P@k*4hmMW#!hkjlQ&w7n+h38Fe=-CLo+m4S^zbqxLAm+B^AQ*8$H6b7?*7FU zEwB#dE89iJ7`;qAI=B6_TzGhVaI`RMBIRxdN$sQSX#AiTuLS8xI`dFO2qUd5Bugm{ zBILW$YR)$CYmaTO^TO@Cydy{|) z;y8@?iHqCo@^cq2AD{Cs5K29~w+BS@pCkb4E1t!m9bo>p0PGc@V-IG_RG5M6v_Fuq z3d44Vs+!giBBhc?^oXk>(@4c${(*cKY#~3k3yjfkv@9w+NZLrt;aIFT0$QgwSzxTg z`IU0VI;p7-6$?&^kmYgh)STa}M!4s#C7A~8(`}?!rb8)w6q}J3We3`6=sWUY@@WpE zb!zIcVl_ek5NVE>4Ci_b&`(T{_0_Jc?JFBIY^G%MWQ1s>hiHq6Q0Q z{hNxo1AAwUY$7OVV?V$gr~lR6LSKPz4WzY|AJKIAd$&e7y*N|AcQOxVDB3$~DTvzx1A);4Fjv

y99^fS)l+jIQj>)xmGqeC>3hq~pH=m%bqJ z_E`Fk?+`{+NVVGR#Fc9)9-hdFhp8t38K95YKp|F(6ZYOhjtaEzqL3+|1XlA=(Qjk@ zs77(AP`fqlV5RIYK{65SL{wQ{)zO48VK%b^PJ;!LCB1hk!O+H4kZ4^Q6}gS#(8Z0!ThOPORxfCo%#7RrqPV_u-F{RchwXCbs!DGnP=F4nVmA+X3#Nmvd0bGS;>Wpb71Z*XZO)0xiTVb8NCKtV~5eRA;J z$5Xqb*w>4-kq_2s-x~WYN&@tL_7?JN?(K7~{@fW6EoBOMKX;kmV?*f0B43?IE5w_Vv5hYIbn(@rs^~AB45E2EKS}6(_-AGsUAJ$d|;|Kxur7 z-cxWEKx=H!dI{|5p9NbfDiTwmVC=$_;tK5&@@&xe2j0_Cbc0}YS%h5~Su^$~kLN)% zzSX2zF)VjxLjpVg>|VMn(4FKkr|Dwe@wixMr^6GZ-;my4C)_NZCTLQS8BB-x7BCRR zpFWh8c{Jc)P%G9XJo*iuEhZKF?S3%NSr;N8;^DD&Xn;&%bAB1(U2m`36fEHVM7HlRN;Ia7qtP;fgK7ONKBUIX~+?%75k6@}x#65L#tjDSx)JwnaeY&7<1 zgai_BkFgoV?|si5JJW_5Mg37+@3;QT+sbqv&svGF-jxdd1p-FBQZX%DwpZ+=S1-$u zBEMbuo7`g%3bO!(4yhKS}bh8T_8wxVJ@WMXUZ?>MuV)fndrr#4SuwZp{s-luF z=doX$qa>d$SU#Oom-FHC?**&F$~PB@!+{-CCB)VNH3Xx(3$Q>J@03Z4Nv?p3-sv-_ z5}WbiJ7yijqkY$y3~tfFRAs(NdkV!S{fy94K~PI=v&$9GiOkeLt|L~X;jV)DxBplO zCj(%Eh@Z_ZscYASu+o6*# zj5>)Wmn`Uaa@}T=Sf0&=?Hb+A=Jb@&{$==nO z`=Va1m=N`|~pcAUkJiMFmI#odC{l=r`5(!FYN#%rSF)ef?YZGZF%YLSXM{$SKcw`+b5= z$w_8BpAwW0rwi96J!g_LaTRj}fjhREgCkX~`^hUxq3zVZY^TgYLfs~SBQi@xZ zRqyXPo2%I_WUt!gTgZOju2jvSM)Av=muJ~3M_N^iHa>UlF@BTA|CKmEk0O~HwAP5@ z3k;qx?SG7Mi}@TG2=#VVL>z#w*D*h#x|Qkd9SqHFA2qw42&D(pMN@R&rkY*&3cv=X zw_;#!BTAHPWY&ro7V=M^$MMddLE-N1h^rZooRw#RhZxErwH_KH>;fjurjg1#s8(8+ z)BWZ`^36`-59eo%dPCI5@Yi*bWORweu-dB2uyWO_0!Nh{F!>Vb@pNU{`K>eB%O}o6 z7J&7`_wofo+^s`Lz~B&&1OU7qnJC)=_vQZ5{tkYm`M5$#1P3wu6HDzBd=kY;MBn8x zY3gbwXFwKK^zZ^vuU4*P(Y2qKho1Y18|(T4M#RfQuHKiccDu7HGDeEh}Bm{iSZacK@jfmlt1cpg$pA&r>xD$DQP{A#^O ztXEICws%59vK+yoH{ad&rbQ%L{|fru*Sc-&|JeweBVaJzk+6E@Jx86DnbuslUm%}y&j6d>LI>}2V7VbW_^PCWw;z4>g<5aA;l0J=PFwvouCPS$ ztJKxo?_jsBPk!vq`%Cg|3!s&DrTK7U9Hoqk*0x9=iZ|rHXnx@xEqfV`|9W;s40#A+ zm)n)gB(jilI#35&?W_m=L{(3*?&*~$etIxn#tdUDP>bV?OdLZLd4bAO)IE~epna8a zfRI?e8^$UT^!?jlV?OimHqNtl7cM>_e=-YQ^oHN)X}hR#ghb${wm4xKfMue2E57^Y znL7P-JzNpt`+6D#A7Q9RaN-xM6SBEp&UI|iV!`w_8!59b;y;?bJxN;m(x7|0*l3*N zHXHJ=BZX3OB;7%c0@QcGqjd-HEWs>2#1RfRq{^drH_(Jw)w*!hdZFA3_H0nWUIww8 
z$gwe_|9_cpB9y;chcn%PGVDL8Qu2E+k8vmvJvf}qL$5Lf0yf)<02)ugq?u*DQlwHq z#AYshv-&pIdN7eWa$^jLlAvJ_3Y#$e7do{VCB^Sa1O#z_Si}>6e2c%X$N~i4U;jA> z;O6=_BnuJDaGp`3qbmSX{6*g&DEVtfV!^m|76p-zy6o6&{F&Mj1k-Sk^Rr`|TuYN5@W%gC$E>nf;FIqWo#_^WYuFj}+w z!fK`Sj=b+L)wI_)Z*m(Gni?BX?09wqkyU6#E5qTaEohyTWFM4)@d~PLXE>LV)lre__j$gtb8of?{+nf z7KG|6{_V8J&%*^VkoMVBgYoJ}^l6R$R;o$R*HojCFo}YAGVi>g$zzAcvj|D(4_7iU ztp_r&5`^5Ri6TN?m!%g+VaFN7@vrac#i;d~9Z(zSub*Og>#?Rq#5;f*yCTXlxF^suZ5-(c3;YNWsK)3n9 zB8EcLIt#Le*fCk%L3Nql=s9RdM$H3nv;&TefCWWtX8}O1$uFF}dR9dG&Ga>a4#Tnzb18{-626&<~r#!}4o=+M{Hj2ZX6WGV(_q2OSj%-w& zyKNQrK&DAebz&Vqax`6_$zc|Tz`G)OX!UWny?}1*6V%9&4zKOX?KoUsem(Dg^H}|1 zH8Z&Ux*XFxLA&PJAznb&AbrI*TX{`q7Gd4IgBv(U*#x*j3sM3}7960jK5+HZ|F>c?5t z_?M_170@t#xNE;TS}M2E6S`^mH-a@dp^+)r*O0*sFKm=U>Ov4FZ^Y5Xob}5mab~#s zq+#m7@aa{lZMJPi_BgXRSM&w_?FDq&jU3Z{hITo=O_?Sa@Uyf@qmmiU!nEid$i zlY~`P5UH5!L!AT^p820bZ)osH+hv^n(o~KLtQkL)UtR$qq>cAhnrAtR?^re620Qyp@(;hv2PE=!lxB}Jr6WffCYOtR>+X)i7>J#wa?#x#s zVM%YZG^lo{0gkxCAGUDO4IZ62mZ?a}b4nuTw>vE#;e)qbftH#bhpw4#i9MS=T}j3h zrcHc#uS@Qnlv#>+x$` zqZvrdfL0T@b4NP~|1Msiz{kcJ@=bvD0#KJm-NBAKnYP!RFR8M5h}{w+>a^YbI!Q=gtKN%CsUEu{NRZ`eoBWfu;zwF59G70gIO_*jb$aCvq`5X5t3@ zRv#)^OuBljiRiHpa@qup+SSJMgPzV?!PMuTN0=EXAofa_p|uz0etw9}#GEXNvn+(u zx16-ewiKj!kdK)f@|*KMJENYnzR7u8Xy?U#tvJ2eaTiE4b1`AiVvIl3LY=-qfI?W)1ywCOU%x91oGHzF!*Oq|ueAHi> zi25>kG-)01Q72#VZhUTZnQOK?a7md&a3cI~&$0J83H^CEyOO}3 zT&FAiLGUHc#2*b&KShL>wlbVsIeGv*`U zlA=5v#f*+M0+%on*8UL@s{WtMPS}#laLRL$E9u>gyt#oY~xhKnbp3+OPm9}43h+)L=H7H5NgQdHCU$LYhKxgW|J<(`} zOl~jWAehMpEI;43-aUtYo&IO!*c{;wU~$`k@uE;ngQSjDe-u`t5u>O2j|$4ITB6R} zPoMm03;PEhAS(RlF~bG{gjG41uU4)vhEZ*L4=3~G_?$K+MQbff>!OxIfIPS;MiqOb z``0Nz3l|L9o8+y)6TNfbUhlL74mE9g)+u&<ADFuDLX!pwR(lv&jG^@Z!} zz#o@O&jNg;W9c7%5Yrb9bEy~JU_e=K#ztBi>@-`9sm9hw+$pfdQn;rjusy?;@3EGv zJV)v=OO*4ZBKe&IMR~RPm+Zv0arrQQ$Z$&eP&LJL5aAF-ba>o3ehTUq^3lAxSg@4Y zxX|X7&@ia1oQc-6_iY6?guk!rzY&Y@)U*Sq1{AzX-~-37e@UHKT1liwJ)5kl1mY|>u3vm z_XVY0F108!fKfAdHB^lxn&O$xB9rv+*?whu+Dc{PZLQ~3BH9i4^kY+RkkE&$-FQKh 
zdrU^o-yPp5sVM|}-qYzH<_^TV?ZKMFn$~Z(E3~VdT?^FjH~#W+_@31=aFo1r(*M`~ zue!;<-Rn-;)sodtzoqphCcEL~i_-O7ft?m__rnZUP^+|u`@k8lZziwX+sXama5l`; z8UN$^$LbBU1dnnj@Zf{nF?OLs(f`SIU!w%Fnn#~@b#t%B#xLTKb9ul&J1gOQq{dW# zT|o*$0)*%Ex{a(=dhZ0HqoWzj<@ktWkz%Ewcc9N7vVY4ZleFUhAy(vnZkf>^>Nin4EKkuyF$hI4BJ%8W9`+;~-@UTXW5R_Rw#uZqqPg|keP1P%)y zU!MUMKU5d;-ydrxf%OAOmVQoT%}UJ1!Xba@O2WRwGVNV28}>scabxV8Nyz=@&!2Bd zSwBv<)X9dO>Xz|cu8w`{3UPRUT;kKyxK&J28tajxV7)(E7Rk2*LYsSRpysDsLRr;#Nvh)v2@peJ14n=!c2mfH|&5t3y59)~aft^%-OpsF$e zfmq@n@l@(?C<;mg4U1J>>h!xQFB;O`gOHu}E^nTjxdD@(D-N$mJ|%xD10zHSRhtPJ zV^h!Wo`a9y1057(l%-?n@XGZ;%-G+&h;xbFa_JdY)%=Cmn!%<@!%D!zS(|TX{WPW@ zZdkb`1LELinnlLg$AaTR2hbYmGkP#vCqeysHhL$7E?*#~>O8on?bPFX#)Ksz@0$mwrH?o5lrnfFM0L1dGz@FpxUc8_N z_lNdyHd^jXK?)YXR)F}L$h37gV@!FMY0g@B`2VId6L4OeY+C{yl`IHwjzL0q*gMvx zU(;Sd4mm1F|sawR7Ko5U;h zY6+kLERl;b@)aKwM^Ps5KdZ@mB&}};L2!i(o+7bQ`kel$D(|$bAx=E%*IzHUk4S#( zE;V|#0jR(})74W1#aQYx5p=XjNo^$i6mQ_u)8PoQUh5bQA)jX(=kX>(YFYp*FeYI(tF}8wzz|5_Yu0o=VD*smea=? zUt>8_MWZdcQsLugeTv5pwl+U~Gx5g0<=|ExR#4s-LrTHr;qwRg$Km0TE~KeA(OP#m zjYnmtbgunxmT%yLJH<5p!wAU)q7n7GWedv7-1NDE(9S{IJ)0= zNyKW}cU;RYr_Lb|((E*<#n0$EFN$)l5ZiZVC<$lcA~y@m%X)k$Cs{|h1m>}PxT7Ls z7OS^qSD~6rj-ev8r)2w{i`vt2v*QTgxX(%lVaD)T zt}S+NcpwDHa_(_Gq`Tx=pcc0KPLqI$-XklT&Cb4$SA4`A)Dp4J(F32XwECT|axVPa*X1|{ z(hf6UHjG1K#$bxSvTjbGc^x5_t1D!npzJIBDO3)^gXKk7lvyB;-}!~Z#4zRYUT+!6 zehky0wr}Bketv8r(~vyQHZN2?ru&-=}lF7E|9$+s>C`yY63zMG*A z@-6dT6AE1=nIK57JdeI3e8Vn>6KT+t#a=x*l`!kni509oZJw1s@eL*3-`mM%pi8Od ziPp(2&TjZ@9zE?Rg@3XbK?IY`RI5eu)w)%(Xo@}6*)F7BBhm%Q1wy~jV#QDk-X)wb zE#uSE%p_#q z?uihyrgV`bCAF4#ZTQJmtnHlj{=aiX)PKzOI_BmnbqH6sd=*wzJv^SRRh92le~S~h!?#vF+I|A_%I&o#K;C$s!N#`pIEL^wa55Uw;z{a9+UBrZ z;!gL^jt%W&-<7!xk@;Pq=!4Aqm+v;}yTuXEup3;BmsS#5qn{(yPbWmp`uU%F3^J1| zg|`{?xzeFq&C!q~P53ZASy);tJ0ui^t4K@zk)vEG-zU3td4WSQ)G$_S+5e*81YXCJ z64E_7y5hPhj+HjcgrB6FDUgqj0JK57+?uRhE=)AToZ-ovkM;J-N8z~NV$=Vso*yr@ zisSl$S~`15FRyLkS+TSO1t-hP+Osw24_;E~o0IWuALmeKq>Za3-D zp(2&5E^n?p?}&Kmov*39j_lFH(*q^)#8&M0cV44UZAZ%_U)x_{oxnxf`mpHRnMfNd 
z4Aw8Z9x4*wt}7e+tD)k4xXi;+r_WfN;G}qAW;_(d(k89fzCu2ofa;5J`ta76M8M9Y z?E^=TLCC(@p-qBC8<{~G9*O_;deIyIzSbhFbU2Tf=3}ug4_`9V z=FT@{p_cPO0uLs=;^}W<-`&|DaL9CA@AK@#a1-9DmDtA&)un?taebD zXXef6(o}r2pr=`!6USl|u5?ro!u3^X6HGGa$ z2jC0^C#x#9`opEmvp9+FhgbKtyOXE0%V{4+IzBxV-SnRYuU~GL9i;o6-c}>U=*6dg zJrQixnQk}AEb2;y=;LY)h-_?M^@~7bx!BorJx4yQS7?E^-9i{{tYJDt<4QG;x3SD= z(8GV_yT_)Jnw(B!1PTTb73pFLft`0>aZy$$ofTuAGf?BJ8I}va!;8=hf@&m#Rf>Ofo7VJitHR%O7bqu0 zOwuI0F#Rj^eA8!->{r8RtH4a=?S>LjtW_aCq5LD^plq?R-c|76?H%KJSW_rzlanDe zU%QuD@v3Sr%H2o4#QFn{A{(0mkLtGs%(54S(X84|kF%vZaRx(&i8aq@b-=8zCFt58 zoT=L6W%e=8TPdA8ZY}k;Q^1l^azmUPX6X*t?V@DY5Jxg_cu*eC`#>&WL&J2X5O!L*5hC|LTQWG z43kofj#(T-Osi9F)Mo0L5#=z}`l|no78>52g#m%EEN3NPELLxG8io4$m0K5$y4Yg` z)8MJfa~K z6mls&%?+K;Q7_XD_wWi_*6Q?V5>JuOGSPjtgY@uk*tsv7EI~%`S(za$Ve0Xp(VF=vkzFBVt4|YQ$;>&7FsNY?ynQf@Uxt?kZMw8&-_m&1tQu zVWPr@kyx-3x~Q)DdiHKjk9s(nr`n(@wb}1ZcjfWb8(m)iSe=~5x6(qsFTY68U6M1-%^eRiU4_GX2;&IWh0wdmNpIbhK&PSXl$xFuAA0{oLAqqq33`|D9P5 zqqT#>qOQ%(0lk{pjMqy|t^zfIFKKJ1=^3i5P-D_rJvO;FuSb_b$Cqa>;w|jkI<@qT zhwjY1hIBo5e-K|zv`7w+-I5$3!lrYxk;Sn{xRE{%w_qRF?8wjN5q-Hh-~Ih>Mwx+? 
zFrU_3dGSR+dPPgCtG|tv5!GY6y+|zkV$>f&QgcM&_mq=Pp&7 z7rY*ZRPCOeO<`byC8Y*Rv)K*~7z;G8Wa_Ae0tSwrbnq1tV=8WrpISnn1?T3v*?UGb zv{HA5OCdx<>u&gmMF*EnagUk2?!`aiW>M^xTSATsd_3R2lmFA@02lm@30`edaUEO` z|MAnRvFv@%7kG1NwaM16Q-`Di7IEzRGM%AgZ}}s2gGlL&`ZK-H$H_OnJ%mw*_jw1K zysZ!Y25iqrKAw-lFE8iRBRYs?=oK^A3ClCve6%&QP{r(9JHwf}5Tq;4h6>|f8EqH^ zd_(9_Wr+nmZbJA=NcAi4he+p^XY`Ckr0dv1x^Ue_csVNLGU)yvs{S%6&Zg_aMF|iR zf+V<0u;A|Q?(XjHF2UX1-K}x=0Kp|lSb=YHOApRvdON5g2kX{u^Tt$9s7e1*LH z{H+2OxGwIlBG2#*7sFr2jv6%e$mc9YX7c(!XpfUlh~SUzipQkN?J4wajA#@E>wYRK zX7AwZxV4cbY(2}gG3(b`HdT0fxX;LtP@|DTLWv?^vKvC!U4K6(*q6)wzTkAc0P77d zCy~0gct>8?b@AP`)|yFiaB==%kge70j{oI{51BftX9+G4c+R3EDHt0ooak2tCDP!N z37R`9YCvMiKw5<2_I*c2&3_aQ%4L!N`BfN*`|emm!7@cbaYWT-pkF?KLhGxTa6$+) zIUDR5eNCXs736SHSB=&ccaM%Ln z4DwH3;WH)G2TfzY{u56~WwXBRWfj~BjQOe#{Tmh{0D6>EzJ3llquN^{;-e0#pR@I_ z(2?D>Ho2BE)TmV27aj@_dnNcuXDwCB05f+}?8;8LJ0IBXgr&DYwK+#xpMCI`H=xz zPTcoIS*lEX-a_j`c6{sB`A#;Mea`O?Jm~b{d+OceK2q;ytIIN#vc@UrdwsgrW-->j zr_rD}XA6c$9E*CQ0tAHOD{138>|Tz0C2s(O1KjRA2uJ`ZRce;EQoD;sRS6f*GSxmZ z`36?T8XRt1E6_?OInGz=n7Z7CO9q`v190oow7c!^!wbEnQY)1!8mn`2ri4-xg2#@%MCVv z%X;>wvgmErzcu>PW^*l7xtQ5M&Ebfn#%K-HePni7jb)tSs4);Kb<(<%-V)h7v$IfI z`p%~;v!Pn49rfODj$>TL+juowuXkG<{@x<5oyg$Z@xSr;HLN)#3P71rhpf~$^njl) zn+~7T;H|tH>Xl#D&wt=?IQ*q=5NM)gd49kyD=rTvLaxe1F^fT?kSDQ9p(EIaY(qQG zniiqdsyN-N#K1ePaPWLuHi-lJN(9KX1ZP!0_zZ8}vu`NK%RF{cVzVwD4reo^{H8MI zN=#nbB28;`J89WTjPIV&hGp?6G+WgZVJ5H_?Ia<~%MsD=AWB%SDSoe^@*SFHC==RXNT+2P_|dg8BgoO#L6N|CCA;_%m&f3jqAyI_2AgP-|fBS zLRs(}D}FKAU4{{JgfV96QitwV04kvMsv?8)<838uvI| zpext<@_hM&?=>eV)qJ*Y8pq*e(ovyUeD&EK7K2jtumxD9%||YG+bS=X_YZa(dfaBP z*ox7b%$~2)ZBll@=v(7GrcI~QhBH+nk&ASQ2#7j8hHvG>r6XYxDVMLokaGb?a3Fx! 
zF*x~B7|ZIO&Gc*X9&Z`rz(g)97nMrm^n9YDm3u5(jDhn0f*a(r zwg6!J=PcDbv4_M_~x3LYd2g^md~z}ZB{;J@$Jla3xZ5; zQoX!9;9^Zjeg24aT~in+IpK1rRS5gu+`g9I5Evf)o3~S}<==wrRSb@#$T8m|cI|4b zatep;EOQxmPC|*acUO0Z7dbhxT6=L}npP#$LpB9GV_K&`K)LqNbf_MpcdxzV_pxEu z(Q>-fZE^5>>rF^VP>ovKe1?{8AGQI^>z7}^wFoAJzZ*kxI7oC zv0FEE?|AurPN<41x|>t&Rq&*d%-#eVwbV~|TCaW~6r22x&lJ?UwmPher&TZ+(y#wg zb0$HCfHu55ui?-OSGlijBOa@l*<>M^xcoi~bF*8SQpM|V$c_W6VyO&BQGbzR3Yi}T zn3TrmP%0P6TmKY$puY37-IzOMl2<^WRW;GWDyOq+gVBK8kb=zEsN8-$9-L!Ni-g7* zOoz`%_#NH)ZJCG^24wc3jl~*~om~k`ZSCCrctKWuv-b#*AwmO-7a0?ic57g}|0s!} z|4Y*+N(9%`YmE#-VyK!>M*d?Ph?`hLB41(cv3&D-MBLJ1r#at3r`+m3Z2QJqrBQGD z1j$EWXO23IcrI*ItkBRb8Y1cke{nPUHZlIW@|fA2v(YZJrKWa-E6>YHn(Pkmb!9jf z{}*B@C?bfY(s5>`VG5rQ9`V`v`mZkbrgAOCZwy%4T)wqD3969ox&bdS#KYNon^a3# zm*MEs4zClLuuG<4dmSmszXO#8iv0>a0#v;Sns(To(85Z2r86o zecnc&W&%4{ES7wiSNCEG*0dP?IALjXxFn24Ix1XnSaZ2-v?-g=`-s*nG0?c_pku&R zi+25G5Uj|SJ|097<$x@1FOVW)rRTA5`__KEt7Xc3N$5`WMFc zj?aBPMS4?$%T*r>J%&E6+2aWdRsyussUKWa$?b6|gUg~$8`^qZ;#V{3Amc%%Ux)=E zubVN3-+Qcb1zveO-6BoS2APH_)hY)qEWeYWpfoV2$l;%#tSYZQWrS!aLL zS@;VvQU5AJ#EPx@e745w{q^&Q7G9T_OjU>|Ys2Y2**^6PY%aS`V~tW5C(B6=w^No9 zCV5(9b94O=pulQI_=Z9GEEQUF!dk1_Wbk?4pFUWqxNr#3PO&FF-+!izFSMh_EtkpW z)8y%wn8QeypOS|>t;0K|lTe-r*Vn3#)Wo8^2^9VrF-?R44DhCt+07O&&p+QnG=Q;p z9$Rn|PAbLIn%Bb;T*(DbV~0uutD2oI*8OmtX>{gP4&&6*$bg6xS`pA?xh!M5HTth z!yNat+Pq>?l*uht@}*Z*PsfY|QYkb7P@ulk_k}KXgt=QXvlDoTDUu0CIu0yHa`$)`<0Exrxc=ZgY`GH`8FVmzMkdSV) zdIyRhyiEEbzWp29DGW5)!%?%1!Qw747!&oI$_5$-N{vVo_D`!OL}M$dwf{hfrukt8 z6AE>z=_eSqDEsJ6h1ti8#=8(WTkbL9uYVkNJR8qpo7v6h^4RFs$FtaP?xY5p6O7@1 zo^-asp;Cfu2cMz1zQey5f%D>_QYW-r0R;>)wI0auUxmzBhwEQ8@{bBXH56ikdg%WO zU2#&_BD@3gWKKd1Dk;KGu)@MCZ!&}e{i)OS{`dh7|Af3+@}*e3$3N0WG*)(P6b>R2 zKDJd>Y^W%m_d9Me`5``2p(q7cK(ZrtnClaS(s7UL<=JOH%Y46BNwIgKR3j!LYHHm6 z&90K*U}=*5ON{oUgRS=S<4Zk?N~3T(ka>MS+xX-ZTD<>?7U9WOrO9po*s6-xj$~TT zXb}~_L>hC)CHot}0LlCPy4nSsd0<`X=-QPZ&9|F(u~w(3_vl6yB6fdDugqbhK9{NZ z`2%?dMgS5DN|olxY(9lprD9PqBo==atvRcFJ>N6?X^aiU{cihd8E;r!4azr}+Qe(S 
z4APauY1=;1RcD5T2tE%&U6hao@r9BlvJ29K0Lg4f1mv?7ylCVipV$5xhVnR2mcH4LFJ0S_N4mBq{Eee`p*=IMI( z`@_{g%|;b#D=xmyowGes6KQsSBW6_2FBV>)J@ZP!asvjE?ue}`2UvBQK{2CsK@)PU zI6yl44dPY9vfbq0k*5r=DeXh{EH@mwWIH(kNK0(Cy&&7bM`}kwm5Oc<9qnZXxt} z<*?-b4GBw#B2n_jQLWc;wO5(i3eA6caM)8&K=hHpim1|5$wtd&1&)G#`FCuD6&gp^e|}MZg*=f)xt&&o&<2o>K42?_I5riDAVYE{8g}@=>jI4Sbuq2u-uT z?kSSYGrm$Z-h1n`o=5<$ly;a9K^HSSUlb%aw3$yg4GfmOZ${6O&nh0DUQ(zPp1~u9 z#+{CD-#pD4N+tvC27Ss`I!or_7>iLUHRW>o+5nB~&hrGJI!o2R-Pw6Kuscz=_R=@l zs4mN7Jd7zFNyfvKx?z0U<7H;6&|^~A{(iwJX>6Iz%+UBchV{e1fPn88Jc^ytb@@gi zqH<7GT8j!%1`gEfEPCqmM{|Y5pMbf_a@f=yUf+u1LYIf3wKt6WJ*v1`)i+JCdy00t z80r)Y(z(7#MU+F+bz$?--bMdkNIH7*bADTqI z&N2AK4WFM~v%&bRo<<6e!ajAb(Boa?Vx?CZW~$LEvpdqB40a?+Mh(~w_JDTiB#j1ZkKyV1?r7R1f`}lgHcM% zb}miMwqT`B;xTD8LVJr9VU2E&H$GSdQ{DeTcda}>iE0qa1*a*<)9GmS$TM1N;J9{o z)BR~RTx*e5QQ^s4A8x!@K7PAkov~Rjj_$>>>34M4Fdwsdst{Htqw6^d_n~E_)ejKpVBH&!n}a|E2v6 z-;ac}Rn5slp}tV!bm6)^JCE}%P7rfgK&Qr3L!I zO%MKiq1{(&VVWP2%@^w#rNWbn_aZH)^7?PMW^`G$%B2h-q%FWALu`2Pb$2|6$}C2E zGwP6IA<7hPp-T7U@0F0-`NpJG;`ihc=pkK}Q0#&vuu<$Io>$9?S_KiZc%Xb1*=1y= zN12dF1R6m9K%-2*yt&_qX6LCCeVuOg&kvi4ALWcp@W|U4Ql95UKkbOol|~ zTe;{&(zoZNcFbBcK$35`7oj$(BXAK_2^rB(quJNbFq&LE>XYsTR%t*9iNIU|Zh_sX zmGHD?oXDoo27#3n59uGx9$ZF5w5NARdU+qMg|#w^f5G#Y=p66U>$C*v@6#|)e%;nN zO^4JjxrJWQ_2fi{@b3EA3nk|t?+&Szq17fx(i;af_rryGtv#yL16MZNwMT&eqRv&Z zsC@PA?Dv9*CZbQ=->JMsiu6M){adjJ#LJ%2kj!9~$G~Vbe|^{=Rn$Wj3a!B!_%A}@ z(7SpXkpIQ-q7Z1~U2>Kp%G9SoaR?*+LoQmT_VTSR{j7}0_&q(JYxUa0{!pB|X!|Shh*;K;3dHt*O_IiG*g<9|M@C9=mEK>Mj40HG5KhQEmhNdOb_Xlo69OEFmz@zj8Qb{ZwuuV98;Nr@>nbxH)CAh-5@o~4lCt8?D-_mj1oRNmMcz8?2i z&PYq-ljCCr>&;S#_^X9393sLGG7ZQrBqRVJSF2e5?bsi6`~JL|{5a5TJbO=BN~vz9 zzs`EFXVFfN%kA+x^2OFK>QPBJ;Ivl+*&tTEdlq}qs@>&)tn+b6MOpvk9_`d1`fRnM zO8{?>TyhyqTo_(gH>V7Isv*$6biywcUpN?9SUjFyi^XAU#W*tC4J?!)BMK#}>%!qNdz7UR@|2d#a`+ll_KIZ|-3Tnf> zq29OCG5Ql3Y(Y_byTuD6#|2oC`_*Y~i+A@0O>iQRls<2a`-MkXW2o`UVYp#RkeE1} zod@<;jDx|lSshyARRcR~Unshm-_-9eDC4>Z)DYL4I=H;60tCkW!-?G3f(wX3r`d_) z250BUauL{4wcwGWFvWfi3w3MJ^Q`3R%xt@2vRh^!rOVL{GmTky7vslO3PX(**kH^Qt2EucM5K7 
zw@WP%5|x3*Cbv`j{B3F}QA{9gf9hH9wA<%aUYa5=Yx9@wVoDQ)DA_sy5C%N zV#`~uRc*bLrlzrsi3HyBwQgQUO5~Rf`I?1l_1DQanxV}5vxuciwUNwsIX%$N_frNo z!~I=`G4#n^bTk{;=$Wsw!#WV@qc(}DlpwH38s;aG;#vOd$ITkdM!D>%EbduwIX8Sa zb`wsKqK-2q!ZT*8(}1|HNo1*FHamZ0PLKQj_}LjzzcUd{-H4_!NJuC|Jif#CNg0YzTxB&#a~mf3#}1DJ(b#vLoHp*W`L3R} zbPhMOi4c_wo^^**47ykW+9WvOCfPqGU&Yqy`_u!_FhEYlJ*{*!i_PHuRtCA*$*|I%1#0lV#PYWb0C2+iw{sH+|^vc4TpZnBJ;c!XyNjR+a?4PyfjD>0( z_kh2sfIYN0BAc2UhUej{^FfIY6z6wbc27dv=Fyep&>zn4yB>b`hLNy?|`ecRCnX)eMzm* z+^S+rHT69Qz^T`K;ZGJ>WBtG*_?C?h)m@%odsKOeuZVn)`AljIkTQIt<}RNxDM2KBAxS;E3UTdy=e$4WH3e18kn&40Rn;FJ0Vz@ilu z#{P!c8#+5PkQ9QI(7XiH%PjttnRqQ{>}NZ1IRo}**~LZ^F$vCo9AI{QZCn{!9FUW$z%N^pez3WkpWN#l(5!{| z4JcPc`W6ci+(pZfe2VA|skX|-R;!#ZS`MNxT4}Owai8>aPwWjx8?vI+=8XI{3yduO z{vL*UWF*>DvQSi~1c(TuGiJjWdJxaK;8(zEwK^GWJ~O~!aAvu@V7H|_A<&1Y#CyC@ zF=Y|#$Pj^#R-htK7eg|e*THNw%eV7^uhj0yU~my7Cei6HQrt>!wNar2OrqSRj{5As zmXjf5Jm6l!jTeeEd<^iZg-ZqxNxvc}v5&Hh8nBQ;u>YmstTmcy-em7!Js)O;BZxzb zRNWygP>gm2DlP#FhUc*|e*Pju1_z(5YmX%KO1829G^bGm5_>8DInJlOT~*#|Zc>OeII}pm%LCCI2gl*(zPugv-ACb__)oVj|c8b+UNp(tE&2|iCM||v^ zLgsL4bo9)OWf}_>y>w6t1OtW-soXR9dY*0qOY-@=`_o^kj(T~Df>nt&t!ta7cKf%GSin)h1jN68f6pfkD6+Si_?I4PNK0YHVZ1pkc zih1%HFPixf;;=oP=DV7NNeiAi$AT-}==5wbZPTQ&Y<9MH4l9>_{{N* zrxCm%uLE%5k)HXG6O#>h!>K_-eRszc8WQaL4Zhcl(<1L| z^B#>^>}~1b$Z9I5`|5O!aVSRaG-58c&GE|FUvMy5Mn)rXK&!7}=C?V!TpP5tV2{ts z^5`hcPodb^x$z(z_k|Stzv8i;NsilgA+>9MXQ$WsQc8m)bWdN-J~;8~{=)J_EDuAT|(qsQm06`yKPD=uObs6F>R`Y>t#fMA(u>E8+^ zZ%j+m)9&^gddVn1LZ0sKt1FXLY=fr$&dmKK;hyh*gGGQdbN`w*vca`^T4Ob@wS<*E zq=i>ZYIbU1{a&3(C68fEA)@duP}-t%e&bua5JID{wk&6Bp=7pk@3dFhb6jQ z>L7i7G=e(GGUqa@M>Pve6R0B!qx#MZ?GI5hlo6YI*y6v%ovm-*;29Z|HhX^P_6rR- zFwya@`sQ}V@Tlvx7@O^kM@+SQ6Z#LhE$%W>$uE~!de64HUSWTY2@i)=>hL4`Fw!>$ z+m%6zD)Z8Np4~6dq^i)`(vr|AV;J}|y!|FHHaGt25dP;EzBS`4TMJbl|IiNCU%2rv zP0`S+wAu1G7K!XJ_dfl})lUHQ)IVA>5SvQYinqWZR9XF$yP`AY0K z1J$(dPcG_57bL)A$K@*6AV4Vz6p()Oxy;)#+O~$ZT^8nc(pxhJhbh5D29-2Bq$CE! 
zm&V{_@ZI>WJ-MwT}X_Fsh7pMs|O#gx~JqS9d~-LVD!vq>aKDN$@Ztz^d~}+ z@3M}if~gMBDr|^Ezd@m2mS#yKmWBZ)TO6!rGgUb{FWp`rh>SDqG@*tVaQZ<*>0ubA z4S<=VY9u(k2P-kFt~A_j&K#lDZULF=)4$)c(Oj_^?38+aoj9u2f0k6O!C~4O4L(a{ zas8q*VyBEfSUbM_cC zkfpKNy7W#4E!{c_>VcaLPtzq}k55rmH{^4a96?KtkkWir)~GN1#{>|5^vnD)S;vc= z-skl#sct|w*2v6kP6BFbZdI9Nvso&o)5t8|=>G=%;uMkNn&uTH6C64C>Y?=mtidj` zYiDQoGL?80^REXRELb%yvM$)s+01ae+dv=Bh-~t3)rVj7Nbn;csJCYw9~SOtrwHs; zo!Xrn@5-WP-Irl2?k_8byN76+IjZ^UmG zUPanUd#hNQ7wz#&j(u zeqtq9Et73$1GQF=mc9-dbJfl~usz!Cbm{nd`>5HaV=PHoJy~*orz48T6MD>OqGtYL zZT$?~`ma0VEy{0)zCCptuU4US=pxa#$MO97sIXK_4tK52YaEVskSYN#N(u5UVk@~4n#_ht=EYKIVW5>mMaNSQp2+0IUWWLd@lpFjmphEq1n~HkE7WV7 z&XPbFf~%FjzbB;dShRJIfeXvQ$U)=uRH~(*O{Fna&ft)$vcem_5 zbA?2W=zkA$MIjj-Z~v`YwiW*0?OELa3WeYQ_v`*Y&lHyAUr?1Q>fPN9kMI5c>8L+# z@@h;D`QPzv!omt2d1jjcj7_H=KfPnoRnEHq)>b?PkrES%k53ooN-l+>zIU+ui!X9? zES*3|*hjaV-gN12VvKAaUYL@ws!P>G9;?p%=z)Hs#@oh!*An4TQP-DGjlLVvsE0aa z2<>E91QxBW7Q#bUAk3g3%i}{_zdPE`6%8Z#`I*yh8|TTPyY3@LsX(bM*Da{d_< z{qjl!WY4hgyEv%W$dnYXumrVia_e&GmIYEXa|_HPmbiNUNCpwKIy~N7-Ov;$wdo_{ z+bUe3RlmKBR*iTLTRopl#7U{e@-I3kr{ql za7hZSF=~KEaQ0i0(+Duk8#hTBHx192&8B0k%bCg#C_sDc;Ix`72R*6XnxjS+5x&md z!hNMSFSf_e+KRaZNh^qQE=4u*`Hb%3py!w3>_n))#cZ0bf$5#(&E3CJNh3tN>E?dZ z#5y@C9;BjT4jLuCT*&6=V`+Z&k=RN4?38K7L_hm<`pyw;Rmz##(%n|&Xv#M4^Eyi` zmJ)<(EoJj>@!$O7{5{;)ew$`giqD|y0rdn~704(Cvf-sD1eI;23ub!xfB)G2fcaA? zqu}qJ5WXt&7l-rMD_k6J=@HeY^T&7#1vcHHnN;9zFOJ||Gm|*#WAMYw|j(tu>Tn6r4D}AycqY_T|*KiyGKz1?6$Mn(MHe?v1Z${mr{5~8CY<&x}YM>#xLOl4+igl9?*wuJut z{6!Pn#Pz|$w17^aKB6cdTj7~AL^ItzNF~>*#?tD9VO&TMiimhV;c2+nZLr>NfYpQl za@ZbE=uDb(IRn-wrm4~S)+H*SACd?~7?(P_+dbi}{RHcXnq-`hnVX&vywd+yQg4c! 
zn~0g3TEgxc`a(_q@t2a%&E@50)_(X+bBebDOvt0vC*2{xY1Zzx5KdU0w5z?Cu!z=& zdqs&#Nd{Qk4P|JMJNf_mjjyf9kCwELJFZIS8k$mhKDY4;O`~_ke6YA(*T+XVx&GBV zmqNMxMgznJ>(9dJSjLPKRrCm_X9SGCPsK3FH>ONuW$gNp*)msBHd&^{?&%WbKZr(_ zxa?LIDYXe(H2Rl(0t>wPL^RS(W@jS?kZ%`IVg(YH+0i6h-lG-_!XIjCd@#LJovYC{NH06A)lNMre@rdAck$2Q z#x%FJT|N#;=}mFriLxom+*aQZe(2M&@xkFZt&PL_!+CvhrC0to30@{eA-_oUyUgfu zoxf0>(B`=$XC@!tiEi4}h%GL?ek86sgc=UlUNwj51 zVPRpH?q^2k8Bg=Xix}UvPf{H{Z{Jglw#Z1UMl0jB27ZB8n5LOxHFWW+gJ9Z}lOjWJ zN-A%TQ!VKR|6s9u+c~KVt0&gaCgO=vo~KrBWOAE&FrNI0d?e^@A?0R#&72=p@NAEY z2!qdcw9m6y&-hFeX~yt3BZ&1G`o$f_9h`PBGams3-TAgZo6UyN!5JkkI5IL-|FU4| zjl@d5#xPm1{?rjFCbG+_(3yzh6%-WOdFl9xBVy#MwXv1}l5V?rq1wlv6tdD(AVv`u ziW5@YtVPYTwPpVoxr~Kj%X&qz>fjtR=TrInLGjOfMLwf)Gm;PzAA748gZoOl7n@5nvLy^U| z8(Hw~g>4i5i=%wMp^d@hh3mr2xL(exudt)-o@;+wLGg!rj{3ycrVCFIMrgP;c&C{O zi3Wvs5#%N)uj3%31m@}JRNsW(S)|P~Q}8&Xgt_c=K5pQ@WCG0!SvKzsL^8KBiPP~x zOQI6w^b_?Y1g!PUqqYSP2&@hnE@Cy}{|okszoT%A)64cE{)^;~z_l9q5C>B7AXUw7 zZd2}zxNtyJf@vrq0o$5bpIE4?R-c$3kiH4N`43G9{3z=;SZUl1lJiNwMKCC= z$XOsHI>G**A`Yvd3Qe~vY%>xlUf@;I@@ntRVrM=y8ZyA=UviQnG%KSF@3l|K5hU2kc*v!sr+?UTWHmL19t&Jy*Xd5|V?vO=KYkGN5}BD)kkJ z@7qd8wptOV{HT^AIS)>7HI@g9&n&T5b^Vfo!J=4{gU5L?ToETPC%q3TX0=Y@_X0lR zPg{T7Bz(Z+-N@Z)o$xq5bR%PTqsWg8B!t9bd_z7-UO%M>jimTPq8absq>NAe=)s5o zO635Ss#?52zF_d}9a{GQU=8bcvCiY`q`m}3El;D0TvxL$Z1o-~Mo`Dp!m$;OtDye%pSG^3UqXI4#z+2*nW4%$+@|D z>f)`7TzpJ$MqSjbl&W=r#cf+@O-;?y^&XXOXKQ(RMa3B4N~PD@85QviG#9b(SZ7%m z*nw9BKo5W!ZJxJTF1LrFt?eWsQsDw%Bg&lR`}!E)C)c~f0*dDa%>X_(?qI1T^8E7h z^EEj#>s_Qvfod}ep@GGfT!+dmc4o#o`c!{?9DwU{3fPhU{P`2`QK!|qr;@YbzLo%? 
z)@aYfqE5DgW}3b%dcS!$;FvrtT%x=E`SXWFE{8ja;bao*{`$y}OLN{Grrd3n_GhRW zNz#&WMO=g0Q9Y%1itLjMpEX~m_U`-We<_VAa-2aSGm496+2NPK=pPgfXR~!N0A=-KD(E%@6;^6H}#7w%P|N0mM)U>@ev~aWd=Mt)>=# zhkn|57x@@4&u_LF|kS7$HyXHm^^ZWdeaPz@Mtr)&Tz-ARmNvoNjIPvD*PsD`Vr`+Mf4i zU~$5-67|{jye$Bx?aYDEx-XqGGpb{sacsEUZWBfav?lVgLGi9h_ZK}Mh~SC81dPVw z0Fw)FOvXennYMLub1S`Y9-f}IpDR=7>yvQMq}#nWdi6zN-Cm2K-{|y6Tz`gJG4D@< zb-_I*kM#$f>}O>`lYrm3Ayn4 zY2B@Du^xIh^1lS%?k+mNuPdz z;(^j_481%P8i090elE0}ub^(r$J6O_M~W*3=&@P69y@R=hv!X9ey>-uN5oRmOs6RYcZxFYAY1 zEam?wk;x2zVf5VTXdr=Nt+IqJ%Rjc*+E?%uU?8)-b$5H$x#Kx@7Kqav&F@tzmpl0ws_<3W88`-HGFg!% z?g2CNPr<959!>xWxA$9bN!{41hhaA?&d+bG$Un%mbDT-h^NWVM@VVVV*DGGY(;6Fq z1%Il0iDHHk&Jag#K+D0!HI>F;bXY4iAVh?0h?BqOebc^O8=%vE+`5YSzv@yI z_#Y?LUnfZ={uhCf{@bp?U#WI}$zWS2tv~x5%@zZG#qfvfPLsO^(oMztN5{v<6nK|1 z>Hug)rBdwT{k;2IBKY%-dL4dX8Wg6(?cvPF+l629DW3JWR00-9)WZbuR;pwoBn2c6 zR{F0&9#whyX~4ezgoC3hcgAb*+N77fK!uWLes;D%PoKQ19RVH{4GRY+Fu3Vtc5Lc} z$GQ;e3u4|xX=UcuL&|iJg1dMt2|o)|Rhh}{J ztsql1)>+)o+|Mn@i`qzSZE8N0X$ClE>o5H%7axlpo18>^h08%A#BTTE9s?=#1^_s( zZU-YGZ^Azn90S6=9b4H$bdp+~=~tLtcO5;qW`h$C zKo3Wq{zdhhM0AKM^6`n-O=%*At7+ zquJqVQ;QEcxVxX=9107EIMqr(Lf&lH2D;S8NC*)t0sP9%c_~v-%+z~D%_EM?2d_4V(?vKGLD2~iI3Av!=c^3^#M6PD zw99nv1qONYjP7NQPdXj0d#LP~M2f(BfxU_1Z-r9&3J*6uFPU;&=_qnU|9Za0rBBQ` zmWVjb=W%TuRmW8d7wTXCvz%G}Y0_8TBq~Tk0>+8A^(6WX`flAgm9IWLF?LsGe zktrX$&P)>Qj{GUTa~(}|{V6$&+B~C1R##~9r-t`KCtEKU%=s|0 z#{&>Mqhj8V4_^93nS;?j7eS>di;OMi@D^07)L^?eS zRq>nSlK2EBmlV>HTWgbP(baiEkvW-faJzY~o^Z-AKQhnH9QetC#SS3~(~^T3g)4DI z1aTaiM*udM!yyg1-oF5G%J+K zW~=)K{^eH+s31-kQlyZ4y7j7yHit5D2VZWrXSDYtqa3HxLx*GZj7F2d2h0D6r_Vu{ zLFOspoUr1%>!6j8CVh)_h#1Ak`$RA-jQ1D&fk&9w`6K=f7y8AdF%fw+>3lt9l`#j!cOR-{XTcqp= zn{+AGa2g2-#Q(M_p#=ZhQgV{O|2>f6mw}~E@3!AJ4@=Py7M93|>}^bbQWZ9y^Xeft zdXx5j{w73N*Cj82WcEe+Fi_lu;Zs7tBVo4GZv+o%qt?})L#1D_cB)Dnatf-Z>DoGa z4ZW(iPi%M{!WQPxLFVSAI9-;?$^fNl5lB5n9cLI_0byoJdmuValJ5~SJ9UM5IHOx_ zHVf{mPJ}TV&g{1k|FrS}MF1XE?rBT7#G<`k|9QE`8hvKr2f)N z(GCaEGkQJqRWCOcK@W_J{^UjK6%~|-_JVGRa@UsWBk#-N$e^eyXmgMsnO+SqXL%;* 
zxB{e$_jlJ*rZ&$zeWJX&-J_~mhYUusEKHyD3nSfVAKRbTV0$uW@Ek2n76$*sUfYn4 zGX+byBf&Dng<`lA@%lU`(WvIzzH#<9#}^h77eX4T2RMa#xF-uL%lb#t>h`4lBTsF} z6daSl?c_vum4D>O7fo0aP5Oa^mQJjOXm%i$?$TXW`>mMd2Xr3|GFraZ=Y|rWm;0;$ zRH(?j#wOR^F zSD)GExiZ(CAZr|n^1c7HmHA?!QB!{I2>}HvP#jZGep0RY(d-ewKdBX`N48)+ssLp! zc-lhcNu(V>FiC1^WVjHlW`v$tpm2jh%RsMk`LT3u;*(QAlME;20h_lr;ImtDVxn5L z1<0+A4+z9rsx_Mw+g`{$e|*m&O3_-?bBu@C5d9-2D$?c|TFY_UIz0W651)LEAmbfg zRi9?S)LBZqu2gDarU?fp5oOt&QG6XBNSx~>e0#o<+pRAu9FfavqX{?%un*?M6l8wX z;4;6EJS!3j1S5YFVLjSJVh3{T)Ia#GsC1Clc#+!bOFyhI@o|54UG;qIogC5CMCjF# zym#Gf2>ADnUO5D%zD0i64C*PPlD<4Xt~&WH<*j^BQ`Z}fhKgI3UQuH8sfP799mG_+ z1(o&`aHWa;gmI9dxWT9eJ8jx+&pkayHYK`6()Z)?`ErLF(4f&N20-9_q1d6k{8376 zC?bg}#6Dctdz z+?7N`N&@lU=iaw3U@Kri{y{2=zdg39HAs4Ta|fB?(;7dk+q$;H<$1|1Sx{Www4azY z{K<66rweWEtq9#!4fNSn85n$>%>m(tBjR=`=zhTg%lR^)9?T#`x69(-XcF7@YtYtl z*G+88IIs9-u2JDdhwiUUyxqGwrF`5XM}GSqN;~AABS3zT_*-)_i$kc>IggFmVO01? z89K+$o!R3fb*sWrg>%iz$|eW~i-NV`KcL(UM{oQW2J+v=W5USQ|0bqeq1CVI>rJ}a4R5X}#^m&w zkVA#JL0j<=$p_hDQ^{RkYBJ=u{9`0T#!7bz1At}63xr;c~aENr=9Y_zelXPRxkSEvvf z!Yste$c62YEs#V}GTr@XxLN8VAYlF$t~ox_PT6%w+AroQTgtq~KpdwY&62u`9%yyQ zklc%5tq)2iUB6%MU!k}@5D>F%b2XhG->hE@P|*?C5oi-rWoit%Wu^tlquzdr)97~a zUbeQesmq2&3H>rWwdnU=C6ZVo18%o!kgQCwk6<8cbaZxhzDWiFBGf$43yK*>FLa)- zPWzqvxwjgh(hpe=2tD)8`n^JSCggA(8`V;PzS!<^&a1PD@H^s1JN|Yd-QG5rEE@IX zmbZZkj2D$TjD%V&9{Whw;c`O*YT?yV&O>UNyWoER@)cEv*8R}*$8zNd`{;w@lU}nX#_iQxQ^iutiK7Z3)62*Y4uwF ze_XwFSd>w>J}lh|2nY;4bmx%L-JQ~nLw5_3!XO>e-KlhkbVzrngmfc(kMH@N?>guF z2iL${&wln^vDbaCdr?UU?o740a@`K-NPir>N|LQ|z$2FkX z62AC-?sjd^X2Ve|G{2QbgNXtY9RGwUsIU>EAa7Towk}4QEk1QgO2Q7^xKUcZ=A>Ee zkIiZb$jIPZ2>yX0__xcHfUn@|`Z5oL3ViEeQlQ;Snm z7!A7;mhLh3?dD5h&Niw-wI4&v-f7n^qHT|T)h0ft*rPUTabxK8BPQeuHvQXw3(G31 zs_n29NI}LsT@QG~lKc%+V*Ea2Wb-v1tyIaLQ(@Wh$nLYGr2iujb$P~qN{uYqWg{cm zI;LCdX=~VOlcY|7CV3ag4u#;i33)t5T71-L=Jtp!@E2EHKd<_lsSp$Q?~*A+{mP>Y zKArB*;BE>=<0_Ef@@Sj1wfVovP_IEwTqMd9u-a!1ODf-^Bn1S>{jg*=#ta(Bt|ov1 zH7d+CL*viIF7o^tRn!y*EDPOm^BYy~15QFWO=rAqEHXe@pO98G=Cak<2qCYSKprF) 
zS6?kSv}^TOq5Ku*j$ZP=r}?@t(~*x3Eg{^HV(%H&mRBosx-~$Zb5nugH9bO)DV83e zZu+EPE*X_sfw8cYQ_K?22_m$>#6U`_mT!|Wgt?nPJ{HW17EGp#yu&<|Lm+SAr5m#B zsuEZ&Dx3oS%=G2XFCsl&cN_=9w)7mREWN8&C6(?LdH1yYvfIkalY#dIv}IzbblLGC z=1Z{a%*S7+RMJ}{XrfLJiNCAHPbr@x%Ecu@3M=~faiQQ}RKLNj%o1_&%;c+n;h2QO z7~zz3@~TcTF>CmncL7NIHe+?y!%%P>;ex^jyt>hvax6w4gU|bT-2>=n0W74yh-b>2 zT$sMQcjK2LnEMgW@_7^+Zp!>w18`%saJ8%LAaEdw3x|rr*Nl7vUc~nC z+XIG43^%T%Q)XEj|3p5SR9+X)Ae+J*74+7-t@HiFpnj$%zEfN?Zj>Kqz{ZzNFE}<>1{}0cach%YqL8JWfl9VVG zXiq!1zmknYZAnbvE1|icmKM!JDmv_!A(Oa!)nJWXgKuxZjT z@)21u*(?_9-k}AyUq^Drtg*tTG4L0vToCdyRT&aJy^=vZ_VpJb;*Oz(%$*BT>dFk^ z(@BWB?loIxTn&LQ&#Y^DLZsi#Ii>9=%Z%3Dj`KjcE3JZYUg#)+dtCmBvgW@-N=^^H3P}f*475~yhB%woWPD!H_bq$jgdXw z+7q-Msce=YssCYi*J-+mllnU{BHF}}ybnu`N`(T19)8g)CXF5f?kdJ&Qp+LCsnV4V z8mC)?m?@BqaVB#V6-BQ9`7>(_&_wbT(>U2bDra_f?5;c_<%517+2OG0v*OBjEBmk12~=;{B!yYv&kSl8KXos32suCHeH zO$>M@wBXJA)nat)p>g|`(C|<|ku_1pFT)Q#i8x7RQnO66#Rb|h`EY%dlz_9BMDvr~ zd*jb1yZ3=lj476Xgrr=)_~goqVZ!}3*NR+CShmjaUoXPQ%b2;KMro>?G6#_mB<8Ee zRJ7-WWHiyo?q3@)2HU$(jUA?GF03vq4JDsj$LTNY3#s(^7(_NP-^SkjxhS7r=H-Fn%a->~Nlm@mDM!4B{0~|cg4h9a>1z>h^V8}ZWK`>8S3}^B zSB}K1lBnGr;58@+q8t^%y(15ct0PiIeQhGAn`m;RRj`BK>3M3@5Gfkd*4Ac{&V^*H z1LvFPhk3>zYMzt%StatG)gVHVf_=19ydivW_=YX@o$gy1ACdrt?TkzI)ps7|JQJ)6L&*I4{KrY^&$S6wSR}tD6pU-dW2P~*{K`L& z-RqEv#o;*WRDOW?^i>PZJ9h{KAeUN22w#{<>csi$D-M|b}~?J!uxj(HzjcJo~pgIcgar=MvQ0aI0=MlK84dhW3t?1U)hPQ5?I(|23h zA~*h?2GG(l9b+zG%76z2z2zV`(Wa=(=-!}!us%BK?a!@dH49AZnd;Jc;G+$s>Jy?R zETGXv`D+Vj z^kHJcutF)rl)-W=HWWI53XacYQ@7l#V{aI*Fn}uE0jd zBt#HdzLiqxdv_`HRai_-C97SZH~%!cG(pWK^iS?T3LqGb*|?(C&-t#NsV%lP4lZEE zb19dG31W7WB-LR^`FPs=9vhx60|SGH!da2h3~bkc366)_!`9GYlS#K&r; zod&~x-7muYMhH_rlAWxN6-o151HJBI3=vnDLd6w(_SJJzN*x`~Ieqsgk{^z-L}@JE z8~R@9NEV?%5nW@6&GNy42?c((Yuv$mrjTJ}Qz< zUNVHDk9TLJV%g)NEWi?rQu7d1K~%LaGM|3)w5}-fnzFun6Zgpyb9iWBarZmR)Mq$T zWhoOmLM_wwFPx;Vo^HEmOqj)y78!YdL57O(uM}&(seMc6IKTVZHG%P+J`qEAQJoZ2 zFf+m|0y#C*WPAYt-5{D*ucNe3C&gSx+NO(?=|;c5HBRw{ct?0u6(W9|t1xvS{l;*4 z;yVuLb+M`T{F(FLcN7px2Kvr_B)_t_7Dl*l_1hFuL=?lBot&*UY=qOUMG~VOr$tlg 
zLVqWoy(qne6R@D6Lx$;Wr6>}IzA#9Ko3IccuOGi`Wd|MJkCrfdFDuo(3x{c@gV4^h zLUw+?9p9e$-k8Yq>%84vp{bnGrM&ZB46>ta)G`UH(`3@ zU)g_?`fQd=R=%3pt<2^m5vGC+@na5E;Y|B(Ke%IG=c0xsa7<@?Jr8`|%h@3W6#DrSprTHz$ z{#`@Em63I_sbs`ruG70ZTz@+v1K@59$s2^-FE=bfUy3maUghNBVr|Fd$|K!!6 z#ehuxiinQDYk+n6``jF((T5q@5ghv2>xpoYZQS(K#(xW9q4W66-qtgx{78^Vq_5|a zcG3UoOyL`lr6@3H@SM5!NXMzg0ibFuyTt2|loTNkgq%oc<5av_QhE}X$`8%Bb`1;J zml(HEw0K2_nR53%31}cBACyqUFi~Fh@0O!#i#|JgqBWS$2|CJHzqurpP(tmrbFDZ{ z@=;-^mdnzNL=xbcr30;@q;ZvP4Rww0ivXfil(S>;9}bNTnjh~DDoB*EU;dNt3=Yk! zgdeSgqp3*9zBRR6d5A#)h$#5At61mZrCbhxYHXq+RK7W<8v*3Z$UU+!k~GJkDrG@{ zA&$5flC7LvPa&2lQIMGVO2N#Tapsu@A3@Q|j8Kce5SVQEFSHA#Tz_`5R|s@g=tC!{ z%4T=2G$RT@JP_^@=uYJPR2@4p%8zkUHalWj1Wb%R9~gf_F6g;hvcbae`YJW#9?hBY zrHBbl*J2{z?#vXr3lI0J20sl80Jbs1Yy672{rn9eSbBn-aQo>mc|m$GKh5^R#BkVvHfASEFC%3mdq&Vhsh0 zI7E-52V;+U#zkv8>gzD&3P+5Ml9Log6}Ywr$MN&6Lp66Iv&Ba{Tl8gVXiLu8;HOPr zmNIcq{8z#DgvW|+!R{nckGi%xIi9RPHwz)rw(oe@K`v- zeDlJsZi#|NT4;!aphL6wshs_@x>y(%9F@(XjXV@wN=|5o!qU@=XC5s}FUz;nQ^VaP zQamLDb4ohYk{*>4R8O7h=|fH&5RKfT32p;pDqX|A6H=Fva3Fu#VcR>s9vWO>7v7-1 zLG2bIq7Wp&C49+~{-mbKqhFd{i`39MO4aUuxS#@{A!T{M=#WxZ#ySAUF%S{FS?6#%L~oGk2C+)cg0j)@p8P*qo8TpnbisPk=o z!1DD+mgM0MIC@{D+U!@`ikv39TUuLgaeA|&u>j$mVi&bV?O_%qKX#(8)_r9uYOY2B zkA-%FwaWRZdfbzMouIoA!RU0_?mF8htt{B7c}qlPPJ=1i7oG(7*`J%w2HKS)KE4m4 zikBZ4h$a-Vy5n1r&Lep!kKa(Zk3_2_2&Ns<$$FP4+GzXf438H7QfVO~5z~Z1Nb{7E zL4kmbc4-jMcql>w91MZn8+UBhMyE2+Lq*<5=%aeA`v zI|gmfM~7XG8jH!P4g}4`^L1zgAwo4-%PynM{rRd6a$7-Lyk7w!j>(z%$m#f{U{D8~ zWmK|8nr?hpIiuc;NW1H9_fD=Wdt-<#|1T_b6e&jIjy4xG;H1g={oTs!ZH;;3z@yD? 
zF8$^a$$NPTPXh&#SIk!@vS*SuRK4}mUdR4zIwtX3i zl?V(@{%n`K){U|>zUJ^}nE&SmzDk8Xn43#i+WlmHR^$rhoHXsGQKCvMS+qZ?%Tppo zUCRPHSht?@ZKYOuvL?>jFH{AJIJ{~SY+qDA)zCwt6aeW$5m&(+m*hi=4dzl2T}V|O zXA1v!h*jfWy-@&~%d$Gu7IfA2Unxm{M^0FvFGX9B&HUi<&5a&Td%VqM?0e^=e5@kl zB4pSOG1deRG^Y}7nD7mc@Eh|2J^hDmE-xzlQf<+n3Eu{L=TMgiYcIc_P!^!O;62*m zngN(`sRqUelbf!>#3?%6FO;9`!MO!RBxz6VQSTKQ;Pk|!;XTbI0)R1z73UkT6%f=( z7wGR<yYiJzPo)m28Jb$_c!MhYUMpNc;#NvxR32 zV2?CKqAaD2NVI*5TGcWEuP{VZ-U4!*NsTNbfrr7ckG!*&IbP`?|I@1q3Lc5H2Al?NVZ;<r+1 z${_|*_@mkFD+?6yxJWQ@cAib|3wOJu$*TS90KAsh`M5at zl`W=Iel8e}jaCR5(9S&>*4oRWBl^coQo>TP1H#ux5}R*|m^~7h%r{ffjcC~oyD!L%zf}!@C9T}RBO1HpMeR##7i~~ z-x^PQ)|L-&@_$h%<`n%k2&%^n{{$27k@)f54w%=sd6)ZkV=q?AdpjuU<$B)uIOcLq z`Zc01xoJ=I=eez$@VJTjN`M;SxVmojy}PzxOlPLKuHnV{RA?i=q(MZuH9b}wy*-4{ zP_%%7f}RomrV91qX_*WnTAQyTIb$t$Y*wvOVarMR1C1uFM zz#}E(nATeO9{Oj6|8~56J#Ez@z<0cvSfmaOj#70I4KY!_1WH*#Vc+NYyvpr(=sL%j zV$IxGhDO~FdwJ2FepiX|;`)5Ky*@{msi_8>y9{$z&&)sR~0yB2W^ur}AoL3AD`3!D#=D6_YRe*Q|z#vK=%jx;L7 z%*yiWe$OA`*S~1ajqZlNGx9pBY}8Fi?CkM6=abLOa{PRe0OQ*S7-87gs0Uo*D%>b< zCJMCjtj>FAJYh{>E;a_qW#JLTCp#>fiX?aVdwX@d>RLCl0&0#-YV7ZDNz zE%mOY_#e)VIxNpBNtfL*ciNv$=tROH-R#g%1zOxdf){T-66WEcE0r$hw6Z(d{poaU z6XDjgG6~CVu8iX^2h2PCKQXiOZ=Y}fMC9ILwp-E%!Vi8tA7Sf^WlP%dA!yy{%?^M4 zKxvEo-C?zqzuJ%Nf?b^wI~8w)mR)%~23p_(GITyqk~I`6h#gDTtz$I7AZO?FEe;I04Z_S`S@Z6!nghqAre{563Cu z6!dpb6QV*nWeMK9SbmdGMd+7;UPjd+S9w;rH#i{thfc`-tT;r3+b=XqKg2Bs4O28Nn#U5?wLGnlya&!Clu)Uk=?LG2 z-~RZM_*Qgd{=&;5e9=&(l;WX(9TdVom<_qq%RtZ*ufSM%ZZAsh!^Vdj+T! zOY`a=b-WhYj^?6w^;PX+QlcTD-W6FQGVZT7?(Fc`Udnv*cy_+{iqC_?qv?$gFP;OB zrfD@scuL(NMO|E<4lQuHtUa*iA$UeUOX-b;MKek2$V2kNIuWyw!wT_2K4GI~0FET+ z`xmznc?ys-T7#qBe3LhmJBzMRcN2Y65f!L*gej}?seL2rKrLTd&6RH%7~%XQXx^~c zPwn1k-A2o96WIKUUVi>GO4fpQ>)Wc-rh*n=9OJY4=T$7bKFgCOV+wqE{?w(sH!hi-}xc}`{hJ*4M~ZOI&MvE7f}RFex%%#0fx zOvMzexx`tthXnTDr+X8S|A>kUf33A7NLKruIO!6{&UUPAXNhG(-Nrj z_I1m^as*$#H~H}X<+iIh+s|jIv=|J+DX>~WkX}~D7yyIfocBzWe{Y)KZB!e!7{M$? 
zQ#qt=??l?{-|Ulk*t7_mP@G!MJ|+GU^K(iPxJBocwM|q{bqu`RMT_WswfSg*-e*sV zyD2J9!FcKkNh*vGg-m?`P*R6}HmsJigIAd6gall{z|81i_T7lA% zAx2BT=ivL{<%q8~4xjj4_t?p{AHB?JK0KA3OX+A{gSaBU@=SFvKIVk1o&Gr1Wr> z+C)PnCwNlTyqcR7x6a)0nzE)hzkXjc`AcQ<&N6?rx4E8}`hh<*TLKL&QpT0bj622n zGrAVnvLMN8_^$iIEOyxOF|?y_pPu+_wL#;l7{pGqx6X1J*)`$S2%RXzjG8e7u^4ms z!2j?u*HR&6+g_6M4I@mZ1E9ul7mVWXY|4k0IVV=<$P!2d!FZOu>fbHBIp!mIt5aF_ z(^awrY9S=#xY`^!V!eV8CWxFA@F(cFx4BJb%5_@P0qp+f7j@SOaJCGI1dp^ly+`l) z!Nh-h(nc;XuXjHW?RP1S)8WgbM52Odcp$_N-fY{E)uSHvuK3Z~IEJ#esdctL$H&qe zAY&)X_!d=Jow|MN<16un2K(hO zlRh;~YSd$eWf=!>@Qm!N4(0dQe~umdHKHUxdU!d7IX1jj3Setm`%5}N&{yzSBKjMU z{eU&(po-OI)wucM@_e#KgkYV#Z|oOJ>;9ITv*6+{i#YB}N`eJ-tb#FZomzVlk9`C7 z%Zlf53ESe$__8BTp40^fN$E=1*k|4eLZNT*B3=O1)K6jJ5`Wz#d#d!Zyt>_VXz{!A zs$CsLb3LQ`t(OJ37>w#jz5*xfr!Xf~MO({gBf7(%78N%gz?2V-N0gENJFe?tV+QKY{ox!+;w)-m-R4Tta zY(w|^*_KO_v8Hc(4IOJEZDOcS;1>kDC321ZZ|RagY3x6erg=rq24z3cIx}ps2zeaq zuHD^z;pEZ&DP{(~MapgRo0%|zL=Ase7Ksvi*!jEpy1NuX}}rl;}qz^7+HN_~qkt2#d<`z$sUZ3}(NbH1X$5thap_{qx& z5s{PI#fkQt^E##Jb2KJVpjGJi$WGSN5ol*mT}ESmuNUf>6JwQt_u| z9U2=-=d0z*OqhIQ{4-%6)prGJFu2bZ9GM~eQT&?|wYX1-vC8dpL^V;0s>VxI$00P1 z9%RduEOmY8{+sUWQme<;Z+E-=(*eBe2nBTD#`b-D?sWv64?epOSVtoFhkw<+#&G@# z>PjF?jD+Hmlm5!=-=DxB7pXOy)PGXCI}eZImOEI!wtGRn{4{o! 
zbZuxGUOigNZI|r5ZhV=ouiuf4?s)!MPT`sh6u3UkNl$Y9{BG^9^U9;fIo%eM9w*Hu zP38PsR^wL7ku~Pm=M=*!?ZaWa@9vfegbh53KbM>>_--8O1n6;mY@bvy4f3Bz{+wpJ zD6OQ5L_C$DtXVGM`&y+bP~E}9NHx=>Enp;T5~2|C}~pYT4*wC)f7F_ihBGnca4I3k?da_DL-!&DS- zuU`DKXCls)3kK)(rox~x=5em%oVmo8;;(3b))>m`Vp7OD0`&mLa&=bg=E%85skiNe z#77Sn4)Ub{FB!W%VOOP%-O;vD_s5s0Wi?>;uAI;3*dA%=iN5?X+ISJMdn+bE!O}ptgoM29mWcJK{2@he0pzH{UY0V_y3a)FH+uAEY zxl<~MjVFDHA4~a_9l^Mmvx;(cVLRrQ-*rCD-BG)MN~_n~U*i^Iwkga|CZT86b&$wx z7iF~gj5SFnG^)W`c2gIYkIshM>b_Mb%9cc2K`vmCv+j$2R`=B=s()iePEF%6KW8{(FZnj3Y(d_h3Hjfvb%4+Ht> za@`l{7g+o(z|sy;Ut0VQxGF@i#syEYN2Q1OOx0-WVyXhav5Yt$ zR@>ZBi+b2XzbISS_qr^X5U3caHlsyMsO@%i0ISCKK>SdrR>w0F-EPVYCW=*l)oo)F z=TirbBpS?RmMJHmTSCkA!lINYuo5UAuyLO4VO?Z5Ud#6WQ~3Eo#3rjkjWr(j1Gq4h z63tf3d(EtVHY-oc3)Zq+j7VIFUmk3FVREK)vCPklQa7Wi?_x`!1TmZ@9db~z&Jf-a z_4Z9^K-3YAG=AKZQ`h0BWz?I0JXLua=(KDi#b2afsfvh}K@E8Wcg~NV{gD!k1D}*F zS_{=2may5CQIoPFwEs@>6`B|iMPoQCq54gh029BzJaiJcIpxbBEW?a*=cBuHs_KYe zLMsMxCaE=7x~CONFOXK$7cb)^Y~YGlA+)`vnbR0YEaB(oZq6R`Zp9+M3nblA$hJry z>&*b9DK67?lQO!d2&XOq~RM-u6hNWMLj8pK4tVuWX%Y4oyLZgwxZ5e%}PD$&=SUngo~pJ;v%}cwzB}^!Haf>KHZ4qN9=!^pN;;7$xYZ4u({j z>(PnipD$21D%+9f5K(H}Sets;Haku$N^CVm=B03vY8HAof~EQeyMXwKhb+xqJ@|!N z5~kmAdQ6^|Z)bLnJy@!{h;+ux-rvkhkA*n7sg1DIxSFRICmVM$x;!X3RAJ^FyIB=j zj-{(pscC*1!_eEQ>@s2z7AWA=9ZK*Ml3RZ{<8nPI_vV<%}IW(@q7~K%C8?= zY`qMN!P@P_ArKpxV2y~0s+00wp$yt30D=|iuP4l@dCEEIp$Uc>spUL*jIGOAiGmsH zogyWphb)ezmEaG<&Xdm4N;?F{jvXClhw}UJ_B@^Up?c6iuBFw@HPX3%OWO6XZAP9{ z*H%|p1g9ZngA1GGfg;zTG2*ES`(Bl^+qAdSqszH|zFGI`K3oiH)}9Nl=&ok=wUZ6F zr*R>=*=ghQKfY#UN-YQM4-m4U<;r4^g7D@0f8-q{y?tl+_pUmHBWs&shrj~GYw{}r zndMoDp~BaFzbS20)F62!io};$Mjq_63?TSh%5~-GeVb+boPmm<{82x-H@;&y`xR4q zdbNJ#*his>w7If;YZ@Mh{sswcd#Nyo#pd#NkMaD?QCP#%e5|xzvbS9ElUBkiV;-{2 zTfcp?+~w5X?`e|i?ii&#S$)uKjWl?<(a&D&iE3xtCjQ$-*lNjrE2N+dsCWwa5F_ep zu<^(2TU#E_WZ1Vr-qe^zSQWnCcq=_K%M^<}v0s=rtGz_g#*JQGR?x&bjaresjW0Y* zsI*^&WG{dRF=yVua)(#Zs^j*?&0$55orbILGJEnOSlyssa^PH} zOpgND)RO^$=*Jt7aN@&{mcbJfhVr9ky_5z6x?EFR+Ff@NqLL4Pv*dGHOOMxucYjJf 
z02K%dO?ZNBk!DrjSdU%V4bhh8U86qx$n#nGu|gj++Ile0^yTkAeHjmY6Y7o9i%;R6 z8NGa2dpJI~(%kpx7Do2h7dsvPO{OuU-_ir+-!%#j@EvMrUmlh(zO;;Cxc06Nu6znq zR1oNCdX1}JxMBg%K})P)c>w`wc<`+-UKtwqTfp5W7Gl@5c-&4!>X;w5ji0ab)-V6L znvzp>y|Pm*78{!^uT6=QOTvzDf>Lr2#OShgrCD&Qxz5m!1nS;_+nv$yGD z6Li>b@3K>-N{o59KK?S(b7Q0IAQfQtW4TeTQsHGWvHmD!r&WzJre$`*;K#PpW9Izz$8Bx4 zR(zFLrsaya=F5K(9g{FU8`6QX|K9?zJAfM}kCLN}%~^HEr!Vpx=8YQbS#x z7~;lX2XyjraB*qOdLGRw;Y)v*4On+bxl!v3`0R6iRQh0u;^OLBcNueR+9)-WJX_v+ zaaL(mFHc0Rslr!kiJ`TY-6F!{p3OKdr+(s7Dsy~|YyarsFg84{eqHS5B8R`YxM?C7qSlWg?l*&QA@#w|`c@%aDT%fd2lsxVV~&M%8hUwF=MF(;?B4Wb{5i z(vef#PnN1LF>^i9!N(GNzEhWw-l?~;R#jZ!iL~qvCMGUJ97xr>*1`|)3Mql(S)ozu zSj9(hmSG&bdCESPhHDITOib?UwQur}v9{jg=4l^g;~ITpHi%D)tMzPT5>q!*SMQas zy3I@h`S$rQMLa^P3}kemR_2%PZieteJ}tL7)(<*2_R7L~`0yN7gjdxhF)M5WMzSNx zslE_UihxXt^Q7u%KkyBw{K@DctO(iW!}Hm6ej-|Mo3s8g{cw(lO@% z9_!uCD;9FeCO5yNF^O3cbfm7*k8JlQ^i78>yZ+WHd%IoPSFE=eHNj~9z1=&eka+sn z5s3m4?hMIT|0u`#pTP2FuS|0a(co|Siwl0F3Yq4r{d@e$CsMo!Xys9V?)kQN*`Sk2 zH`kN<*S&L3%52VPZSO^qT%@-RyL5vE@=X;b@)L5#fwdUR7`t-+`@wU|BONIK4ft@8 zZ-*;+NQ(DUh##7tq1&P5rIctQ*w|vYD2q4gFygkkS*H4Wp((ADwo=&+>J}fH6GidJ zNm3urtpKgkRuGw~31G48Dlg_pQqv#n^O~=ZiBwS@`D!;yQ6%u5bT^P)DdrXP=gHwU5CX-!nBu)?SFxA~<9uAbE0=cnUeHe+m1o;Jb-InS|EVdQd1-ang;s zd2WXot+u}Gh*L6g&oKs=l1njW@I%N>DcZGwZoayhRZNZUr{-T{_$r+->dKqiqn~Zj z%6M9hTy-Bue_}HuEOX{;G2IG^G7FsHoX3v58xiPlD!)1 z=jM&R_Ng~D?%b>;6&5wb@+YEeu*)X}d6=>HC&%})l6BRZrtTk%G|d2KUQ)!swioh= zPwjxUMHef!JhrFTZc^Lq%xl`qVb!-Fl`Y70qc`=n!d2<<#6jixcCD%Hhk-3mv0Jil zjpJkeG|HP}3sd#JU!0v`f+28TT+*)0)%+f(-Kwxjf{9w@Pl!ANIng7aL9~ton<56s zj^aNZTMAvR#cJU18FnjKkvea7IxOa)8C`Vga=tv#8Uh}}LXRH+ea?#EZlcqLDsQ(e zub)y3jc>S=E%0y?;@=DIC##-7 zSDTl*V)or3esJT$7UdCgps;kGTt${{g!T(A`6brXek?j1St#KJr3 z4A>}*=lD?B6u0u`C^)d;bcE2yyZHU@nOd1`V`1E~Iy*x#6FAsKp^H?RN9aSw*h zGRi~DAX?h|W%P@vdoPdf5{f_QWdha}TvY!hA{2=KDK-83er~wHaWMXS2!E$w&{+y> zMTP|VQ3j(PH6GUUN>}Q2F1#hUDARRjQ=hAiiYVj2RVN3U=sy$rZYMIUCPU?ae;H~J z1)6MDK1GmLMl*1AsinK4r3#)APdBu*+hqXRuXnuP)+ILQnJy?O0MK-`hUF7L(}i~0 
z4{%6;Zb~V2$uCrOSAooeCUkV=LmYp#n~fzz_uv}tn?xAA_1X`;@3Eg}J_SJ&zx}Q- zsw|^?Gx8{kO!NOQ+F6R!ZD3xmbO)Z!(Z%iT8ay+MD-IryWWPvZlgL8(mLNPnILelomNmN@<*hn(^`w zb%xQe1lss_Mwa=n(u&FXFOKhcvkBdSQA!FnbwD#*ijj{kARz)|QH4|eU9m8;{GXfj z*d}J+|NWqhsh=dvwa$>}udkS`=h$iebkv#LpKz>IYwISn++@s{v($0IS+3xi77qq< zXXvuYzx&<0WJJa%Bmh*qz;2{>zic%%H5C;jLtp7s=$-==<%vweM%(Ek!X5l2*5*GA zGycAHoYQFb>41U?0PHLC@8Su?2mHOBmrI~EbRFXfK(5yCMPRT@4Go=Ae=6s_t%Nz87D7a8UXw%<#3axE{=+}`oAs(|80vDb*K?8cz@>0 zwXapF(9wLB{^JnKjd~2hzlOFB5*_E&<0bgrbM{e1TAgBH;;lxT1DrWHL}GSFI;A#REOlqjLm zrwc-Lq+w~K8`-fF<~lX+l`jDg+yL97W4YPY`tQSa9IU2VyK-qa#n^$Hgx`f4ql5?) z1f|P#!|jk_nuD@ytM?K*z6+rP#*fV#eOztfNvj{~oX!zL$Y*lR{W2mAF`ysaXI(T= zj|MvLhBi;%slQv{2fh=w~Zcw^=)ndTjJ{fkv z%)k)2I~Dyj?1g#HF9BQ%_yydTn0uE|*YgzE+7r*X57Ba|pmsy?G>&cOVexrFt+2;+GxbsvC&5*aXEAE98#+`f z{$V?pV?f73E9M^Zxj&i3U`6030~i8Hmd^raEf|e`{BAX-?;*5m8_9`M%AcvMVDkIh zIYb=Le0qQ3BA9Lo&I$*b>zjcNDNgPLRyqLzeP?qbwmP6v z?xzcFz*C^SA!|GBcoc9uaGKMzCgpQ70rZu~1MW=4#o-;~-Fl>)Td9CUFhat?C(wKz z5%)$!_^{)({c74{v-)A|pmZS>F2euWioJhZafyLT}|bkiuf|EVy7f~T4X<>F&()-i z$m1~=USWB4$HT0SvHeU*oWYwtLkWod7a05QAT#Li^W$lXp$E|=6&d<@Z~;FcQ}qWJ z`oaUa_m`cagfWed-$m)VxBg>$?Y&{o>lJC!fIEE#J_S#8+x11@gzb*;u8#^B%Yvj~ zYS@c|qTMIAz*7p{uDHGtc{)q8&Wgmes3?{e2KPq8|1*5+Y+_RD5uK=NHMs=xz$YAq zKc*f-P8Qt1=d0HNa!j;g91If=4lsK4W>}WL&yQK}nS|!a-1*>tlq%Umpv3b(tUiJn zo+fLU>f-^Y{^w7t4xL-W<>qhg9M;U1dAn)#{z>k z4}zk`uJ{#FD%X=z(!B>rJxyMEFxV(S{@1fQLhcF7dyCfDJmb51B_wq?r5-@xxXt%Z zGu`|5{$t;G3QPe-J2nUA8Z8YnVZWwG-LB|~j8&#~Ut0Q@ha`-0XpST;P-v$-zjLWx^&x4*(pkl- z=Zg_GYbz@k*P_Bg%CmZ#NqFGGuuWsVCJ7OUKN5=HMkH>AVaDUZn>q{v|3O*cW9e`e z*)sgz@mL+$0Y!=7|M7-^*G1v-ueiKp)VdWH_}Gj06b*r{O>!5q)=M4;UHy`nP6@Oc6g`yV z+;%mk(9epL!FD)Pnr|1;FL1w;a6Z$;AU!0_*f>cf1%hn%!#>WIC2{uipaqq>j7X2V zJwM&g8~Fl}ZKe+=dM)75UTe=C_$!TKWv$k5*NpuP*R}4|2fK4Xr;!_K6LVAGk@S7^ z6C6Txei^{u0%3b>-kdb9=jnG`jte5*IbE$+635D(B+9FkHDdl+3ZLGfM=%1DmtqyA%bxDG zzm_L19B)}m#)Dl57tInbPQLNeG;scz%tt}mYIwfPe!(7>eod_97*NT*CT*uq-QV%2 z3NM0#m58CO$*?x}!n2RW!! 
z%SZva*3ZDgY6>1#CgXo^Un9Qmf14sUeEdFr@F^S(h)1!m-fj}pP-1tQGW0}53dA2h z>iT$8-6`u^0HOM<%Td|Okhwpa1%#HHljZPf6x%FY*-o4fn{Ps)+&A;5ilDxT3&|j>{xtgGzn#k=TxoMC{pBj z;F#AS0D`aVe+A#6Ff(aL|IzFID+vFCiGg3};pxQr1$?gPo1yP{>oz%2 zIkA1o^1`fLjKlflac!1bc(~(4 zl@|>T+DI(;p+diYc)|@;{m8?ox|-HsQ5)QQoS-6{#^+4sg2t4e!1`@=>>P5TNTYFY z0Y=@h`U!|tXHi$Y2@AQT#~Is@Z~(8dU&ntqC6|rbO`lTcOWM{d|0zJ@W=q zHPBn@_5bnq)=^ciUH7meD&5`PAl)6C?rxOsMnFPJIt8S=yBk3|MOr|RPU-IN;*IBd z&N=Vz-*1dP)D4ck@B6y1SZmI?=Hg2Vbn;sWpA}Q98R0>`wv!xu>ou*eS^Xnn%QWlc zPwg1hd*lN)hAe|xGkJ}4u6U}GlB-#5e!jAwc(pb?QcCm#H2i>G27a3XK|3f$L9H>B zA|<~dijh9hFUV#;evXaxvpX3ekB&i3mGPg_GKmy)EK;_U`LrjWA0L^-$|MoiSE1jk z2-1R@Ef&@oSx}gUW?9}`M zH~Z&b9y)lVL<+5-mqCu!zl>y@VFGN&A{IX!wl~UtVd`TF0#=j9C=WoyTq?6(2n1j_ zt;~$*PE%dYjc4&rz31*Rd;uBl(G&u%CFjTJoP}d4UA2>!V5AzHj~9&m5uT} z^kkK?n44}7S{3>B*;(gtX%+b@?sOw~F@GPNCv64WTA{~Wo_c03`IS20b9Oyb+ibqrW^=td;=K0|Hg zHptG+=A`pvZqzRS)EJ9YYJo4{$<9TD>TmU>6@GA#@l-jxyM}skoMiCRdoA^acR6+F z4EVHNlJIH2&NF69>5eKhEvo1IXY7D5#h)${LEFPb4t=hO?$RrFL%yf-JQ|?f{=kbwhx_5?J*Wt~~ z@UK+MbvSm_OUU(t5u&r-%@q~lF0sJa1}0|}kfvt|E04k;ewc3dEbpm_O(7}m5;Hzr zY7Y@%C^Lf*hA#F#>!@rWLit4mjy%@AV&E8>n)=$1lZ|EQx8imE5SWHSJWLjqkfjnu z{_Cj~@;RWpcdAI^V>S(ZQMEIE7sMeZ*cM3xmA%mXQT8&B*=$aA`vJvhZ-GUO6r7hn3R$xpIPBoroY&jxTu$EFe;Ro zkmpX&Gz~&Zh%hX4`O{iO*mR0T0lVrJ?Dml)B_%PbXo8R>OC*fOLP-hz{jlx&Cy9Nh za}OH;OEejP#OY*c$d_`L&c4o!ap5=H-S^8S4|qi}7p)W&8IO$wO(*@BNW1oP4Kg^7 zR#Mzr&#W*7$}>z!(O(BDDfQ_`^G!)HPz8V7`0Y!6HF)<&O)B!0i`^n!B6V1I1;U2$ z$ncKXx9)SooPt&(JlWs2H!qhSkD>%#o8O&yyBtLUvt8Pvu#wVV@>J+pe+e!fps2rZ zZG+HR<$>>2HTavCst>c1&HKg$E0haKxswp$v6nLdru_`93lWLi zO!iK)h{7ctYDEepi%$VHTzHLkgAMLrbV$Mn5U{HwBIg(J%gA=62X6GithHRNhGV}m z81cWXx1Ob%Ky0KUQk5?BpGEV>Ky|he_bH}nHuSj;Lg3EyzQ9#{*W=G}3^qzE)37S4 z)g9pXXhdZ2aYMzF{PEsBA@YukB??kwvp&#h{d@@0=M^amB9ga%G{8ytbS`Gf-;@|{ z*}aLpeX~RlY}U5t>r2PO8n^B*e~%qhz3oPEJC0CQAyh7E`|VLlN`%>#j^o zXDGRBoO?Gk?{3yKNn@l&e+2pbw@T|j&SDs6C~5^BGV@w|_&OXh9?ysCt;Hy`46G2R zo>Pj{X2$0joBBgn(I^gn{@q<~Lj-qem=MzHK=MONOH0G&RcR+kNF)uNQmuyNBP}JG 
z7h8adTy#ZzoX)?CjI3})a2(VYCMdR*18X={tHqaSVzDaDAG+vsyPx(RSjD78-GDs4 z7w01$KZe!0 z?5NCivu;gjAck@PI-?Bgt=8)35!$29&lR@ebY)`D(rWQh8w~Vv#K%(>o&`&sRr?@ak^t*9fy&#|&5d+b~p7Dp7tt**K2NpV=DAwEv1= zvS2!k5;_tbQ3DrDN|Q?y9{b>LR%f@!YbAKK{9*(p@8Ik00mw-Xm%jx-N^2PZjA6_b z*@Pxe==Kgl0Bnuts4mOJYC`T0_XY$XOWJwJ-Weo}*lhz9(TWyPH- z+*!Y#AO3#CnNslyw^4&5;bHjeMDL333_ax$Z544~K^JZXpg?BH^>IHjECCxa<;#gk zt|f~K*k-vh+`_LwaE9|jHhV?7M~sR|G);ellWd2h1# zj+xJWa5{^-!NcE}sxtZpm6icq!V3~DjXgdjYz_PxbWGf%(h@uUgkqnIGN9RH6R*5xX@d^pT?#Gs$mvNG^VrA`4f2E}75CBO4`6yRdF)+#k@RZqqW1e{;QfZO&+p@K z{6K0iaA?suhHGde&Z%P3#!6i=Lw~y6v_2)b6~1Oztjo94vRBdV;r#e;zdn5taeW@9 zZ$e(Fm8bI8yd;pv4?UZp_)W!O`=tRF))i9LVo`2hx*OlnTBbISikTU{I=WPZZaS~4 z)~k^98Hzd$fvHNv%(7|Yz6fPxeh{OHknYaaQp49JD~cqaEr9;x+#*Gu8j7m8O*^}( z_FtO;&|YLeS%8S^v6BbfNjLi$k(v?tI=ARuna=UBRkBhCAZDMQCw7Z)s)&^vsw&p1 z?{IO=gD|Kvb558ShJA(43O7DjebaJD&SMTK!m-0W0bFKQjDHqzSs|HX~Liqw}2&(R3< z^V}@Xbyplr6-bX}_30_}b==k40vYPQrVCyD;}O6n^je%}{r-?**U`Ibzd`j=ABRDBjZPb89FfV{y-3-zaz_e^X}%=j7Iob>tbZ# zpT8|-8OXBu6+sXzYa6Ikwl(M%TMwW0R1f{xvuSChk>%cVejdL!S9NDar@=WivUbgW z(g5y&ELvZR{DKmH9CvOzGVtu{4HNR*z-%*}WIOR34a(*Q1-?2`11tQ0Zk-_g*#VNd zNbme$v}W`&hq(}T0aQsvlU#JK!8^bJMC9q2=H^{l`9+Y{*xU_6^1tV|ndS!mAajx> zY{dCTDg}WiJLKtzpfZ4Mg?&9YfMv<0<~{_`27W6?uNYHQxQ8JLaUMxee>Tg>{B@eE zu|lcS=rD>_rgazZ&u6OdE>AlDdFdv*D85&5pFiJ%A7gIfo5;qaF!%pdDjE{|!% zGL9TMNAXTL0jX-@3(M^FA?aZ!i)@3+JJGENImvycN=*qggp*%xlA($SW`xF*ackwR zPX2$SCV>b$*s0DFHZdSrti@o?O#yfQmVUch=5LrS(5Xg#q>QqF_9Z$^&ancX>)_Ja6FIB@lP7# zzYK1g6qLG!Z9RHuecAVf4|P>#ExfV!Kg(vU#^0hgx@}7tc-gEE<_5V*zFVjM724A} zyW*e`yEsSg_$wUoZ|t^&2w-(3OB`Qt{|bFQ z|JNsyFazH9WuJ7}U*e$uc%yU<=pME6islimzYg5LN6`-zWrK!U;oaXT&VPM~Ux_5R z&XV_|rhWhSXn+#D2=HY69MS0ibzy%SHZ=ioo$Bq@Ol*Hi9{zo+fx<>lkbcBxEl`g9 z?;-m4Rf=Zqg6k~W)pt&={(p@Z)YFg~rS<%61^xR&)T6<5svo8dM}S(i|M>EMzd^wP zLk_h@q4EE^)gn)Doh6>LdS-t+lm9(lolguK4t4e4X4StwQMeI%R_uSh0RI1)Cdz)v z|25GtWsO1$rny(ht1NVe(S1~Svj2~P2t);=(6#+;Kq>H}J(Mc@zU6Pp;hg~r<*1?1 zG8cO2f8Fcv#ba}n+)X*eKT3Stj#|`dY3Ge&r zed~6(w7+LzSHZzndsSHV!%|CSQ_1J+Vo~)1;neyDE@sMPabtepN>%c0_QE^z=(t5Y 
zA@?IA5fQ(B)I-dn49&w}`Falz4{dGjg0R(hIyyQE3JS+Q9*JoRKTKb-E}swtoiYC7 zvS8joJrH4ZTj#Ojkt&74cmWdVa`AoUbVM{WRk{~#xv>1NFL;d4rg%*N(u z>KOMLf?>{Jc_S7TB_JEFR(lA;8D%qDU3iz}Y0Q%fu z6u=RWfb;MaP}@uxfJjSA8v^zt`I0~?Snm>-&CG$#C@nKn(E|t?{Ma%6#zuqnBixKe zj*Ss0h}C{q8I)ejkfJ}}$-WL~3LwEJ20ZMFxo7EmG|?SQRD;6t%Y3P}s@tFvzA zcx>bZ=pL3opz8+iD-7m=(b4?ieAzE4d+N#7)$YgdPW65yBoPWsK9R_LZhww!BoHO1 zJ(4-7O=m}K_GZfUJju_HKf%G2aToKFJF^sYd#GzlMM5+QMh6Cruwm-n`CkQ9!>bf@ zb0f;8l!lv5N&Sr+0y`xN$yPISEfsL}7TF!SV|*nblg@nS2W!L=Dx3|#|4Z8K*_!s`tYO-faWY9er0xC74I&3f4GFL z=ioQ*ebBtMJ6$R%4?I4ea4HJ4T~gsufy-Z$b(WJ)sE@I-pEI>ncEFv&o=Kev=aGwzL3t0thz1q@9A<^LD=x9UUFl4WP{m z0E*-be)3km+-?^ON0rMowB5jCrYYZXe*Vh=_?5 zg{E>na3>PRyE{PhAkc8gfdf1mft|e1Q$c;tP$Q9a8*B!v9NAw$ZQTGq*}iGzRTNr| zl*`f}b*5Z$2GHF|KOco5W}jBq)B6S3R542Yl}PqNSd=F744)gzAp|~3%Ag(Q@0M42 zn6?1c4+#rXVf|bZtND-fQuyQ3S&9;$1nXBrnw2IKqp$Y<36egfA|(~se>j^|!~i%| zCw)Q@H4Oc3%!0L9!Np3jo6l03QOBvm?ZAinVB&_E3UF`j-$OQC$I04z|1`JoPc>F@ zTEsY?G`4JwS1c;j<;gP5M0+l-t`gk8=04IP``l3Y4L}9=z;ECI;ABnW_h5a)V=)Nl z8Kr%40?&y`&*-3O4QV>@*!OBB_-eE5s0*Ih>*yOmgiCfogNG29^C9|5Jy2NMryaxj zWB7-oLsyERcT1(X5NI1Fn4Aq{FOMrOwV-WAn`_0_8>_qbfU9GQr}FYp3wH7U>I^0c z>nAXD=;zRj!Otfd@z9;(Sj85)`cSmhHULpSDAD_|*?=37<-t7Ddj^O4yyM{pjDiA; zz*RVIa0C>}56`0!l@wH#(4kWp9t41*;qG<=hJN0NDOd28WWVnjWDY1ruCc8mpMn9x#1a zjh#>UKb{y0w7XZFJYY4^Ax1#p(h{_DFG1{&I{V3c<;I8(}Lo<9V`h7`HeIoE%aZ5aY~YKi~KIQM9ohb9pD zBn#S}{giN6`2n74KGJw$O2I@z$h_*g^VauXghAjS?PzAAu&03;LKFg$bE+$ zO}8of7>|n<45N}e@(SYCEN7)jIk6gHrD^cb=q?-eM<%{0I;edLL{)NBZ<;mrsn%lr*W2X%$U@tWRyKLOH^;AM zk{aW1Zo`IMw^Ek8&$w>enxi^4_c--iTgYZ#ZbM=kWj`OTk6utD9@a#a1HdwDNVqT8 zagvrrjHiS{Ss%Ei=m62qh$P~LhgJgF==6ILs{^BDyvPm-Dn(s*lD&~mP!tn6K&iYu zrSGP)K#Hzn(r?p8;#{cwGh&XH5%$NHk)U`b zn#BN1^7cpiwMRxLO;_xZ?ls`-Lg-jC5vE@-W=)Zh2Ns9~*yIv9Dx_^Q3?=5Ke_+xH zxcK(Eh2Q03uElAp%CFYhe4)_RdoE^hz>Cb8M_@5@>8$g`X3Hw;8toE~hSfp? 
zV}*)%W^7b&ouvB>0bkA=bhG%_WFQBeX1XWI#R*mu$)Drg96nn__^HmBcguoAi%dwF z^~-%NvS$+XUkOvOMCWg;R4cn*?B7Gb~^L`2pp*ku7R|9ltaLQ zaZ)lV$}Qv-=srva<;>ju(?n1GDLfNJ5M(27_2eniX3#qMQs$qI^Vc%Lr3*Ky-e$g*b zV;pPP#LIW6w_DUf%s7tWlXlWiMU~e6TRQ6JL`fnjxe=zxiw$)E{e!${zRiQ}fCobj z2LrZN{e;joazZ2q!<3bBO+1%w(ZErk3=t`T!ZU-d-;bh))4fU-t;F#6M0SAfzRoQo z7k5sMUWI^pX7{cBr6}*7+~UulSWHeO7-Kt& zLb2S_anfcP0tr8q?hNQNFSeIGBE1CYtSlHC081+#A#L<^$({E8xo z!#Sv&Ax0R4gz^xF3H@BF*ce^h+%p#Pu5W6~3jlL&ck|aorKY6jq77~;pa~e7MNkgi zJd?H|L>z?8j}um=8bGp?YZi&chFy#~U(@o7x?T+n;)S?5++{0ewPlQJQYH^ro~%U+ zrL^aJ;mWrEBgM;t_WLnGF8!3Ri&A}DHCj*RtYk22VO&Ou#_$l{Oo;m9^2EQF9uj1%qCI124+DyuJnKR&hi z5u^KuUtRWe+M%ok(Lc%K)Ruf~hwAqN*ArR-n8~ZQPlzw)C>(=^*3g&6aR^7ynHDPA( znt*d8YXmliJ=ln5weZONc#_95V89|EQ&w3el_lk(j(F{ahh2KI0={hv&}J{^LKtQ# z&R81X+1BhYGy>QV8HFtcWnF^k7{sh41Y?kPT|e=0_q`bNl+pc|S$!!ooad?@)E@OL znkRDu%-eb!P}Hete)g?y`()b@+iW%OKNsL}k&W5NyNFutFSfp46YlA8ByqwgC4YN_ z5YPZ!7F(K%15u|UM-!ARQOx5#!J@TDa^&q1{}mNMA-Op&^Y_&U;7$b)GtyC4O_ZY? zNq^k(ReLd0d)ZR*#p;ua!)|dQT8rz5gsB`iy=2o}jBpERmIDNw*-nYaJl8Nn!Xe2j zQEFA$m2)^lv2R`!|8F58$V2}pi4%wOL#ucm%W8pJh!SD}mG_)%XRJs$_l`m;gVPp| z$MF*qAs#B5_p;yXn8=WocPup_I8_LL!~6dW#(|;$yTo&UG`9b$i0~m(tgTcC{Yj=X zG;oz`+5DAZxHiLO-e4y6lAoMKTfB(Fa-!!{iL#W`7KPL)aQr&DjT|SEUkUn`dd<$T z%l}^8{{los{!iGBKem1WK?8J!IqG#4p&ljN$N`~utJIP^+_&qM}11*ElqJ3LpOz(0ppGFBo|rLWL% z@aa1{Sf@N_XmvleoF$s$ot^%8NCH=tGT3R^v_)>*7q?r)!<(ZzDsiWISfCCv?2EUF z>Gg=*n{pdY+0hh*(}=jw<>TW>(_7hM(04oKQ_#@_j#wDmt;)*Z@i_F_H#T)@Yh6Et zPb)8wyEzPNNXl_oul1F5u=*vtglb;h{-TU;X7Pv(Eo!gN zE{29Cnuhw+ORq$RVg5R1!s_5q6FM(}7TKhOEgKFZmYIVnW!cI+A9os4%0`*kI_M-TvNQQl`5rTWmjgLa%SK(0cPZ{zmwl8N`{& zMm&zs)C=lk+v2Xf+-mo__B*PrrwWDhwzCyi>u;jy)e>B-a)q0HiI9j31n%oDpJTsA zRRIYGkf9~IQ7?R9Je_?sJS8mEXf|6v=Vvq9HUa7KZ0PK#Z1v5p1#`K^!(Ti0Qr|B4 z_qohfRK5u<+}D4&u`IkUNtTzDFyT!6his1&#lcqvE*svG3}7pH=t=x@b2&i`Z@qg* zj87V)oX4h}oCuDOXNH*?T@K2qKX@Jr?C+K`N@34D9pP*U6fYp`OE78tY zqlvFFT!+iFMZTM^Tyqet}?{6e0%Y+DP&2~8Z5-8KH!;|RyXjcmh6d^0;HRW$#LNAUG#{#ejoP6b^g@j1@YZ_dw$ 
z#srO7#PNl=H>++(CR@%o$tSVvm(=*)C0#+-I(8k4y}!I*j0=jH!gig13$}OT zvYkx|Tr#}DE6g?fo<_6guad;2$feqnWEaTGIqZ+cxxzS$QM9wP*y8-f&g@K4%3laVoONa!J}Xwmgc>Hg7*PHll_vMzkmO?6HrpNPIZN6@J5*)fA& zQFzkO@+W-uq007(M$^0a>GGl||CF1!kO#Qv_tg=ie~nSqIH7ux(RP13{+CT*4vdNU z!tLd3K!f|hmPe}X8sv5$_BrC#@S9^r$0HI#?5!}Lld|JU9CKZT`7) z_Mc|kUsok9jzgWMqPNsGRU4DeNVwa<+ySSjttI}X*VS9Z(5+U`bUj~^$nkmJB6+Th z_SU$ht-DUThuPahK?P`Xj85n8jxk-ZzG?l$vxrqa{b3gMhCZ}ydkU{tpQHmcrhMk7+spv!6WMMY~JfK{`GZ59gXX~%>Plh_KC5S=%PXAV31zes+$Bf5=# z@r4Emu7e(<2)}nc&cv)VDfUH--=1-F)sD?9eigitx+L%JwY+oh&&{o)uLvw(+5q5d zUnIRoY|+m9jHT3}Id=OBJ2sD+hoOsU%^7^VA1nN)@fY9u6s913(RxzstE$HOYj0Sm z98!6=2xMFoAReg4QRJEc(%rB?`}#H7A!|T-=K1df2w|RlKjgNO(OC9iD0r_sHgJkcM!kiM}YxUCHRM zcl(vb8iKxSo|D-x%zAl4%p)uNBJr|0wiS0gniDk*C(ES-PP^beKTZc>Ysbf!*zq}% z3370T_lh#lH#sJ{?K5m;$daUPTyMG3pM@~$hH^|wvRa}gYScx{l%GS3Z7Qf;XYPBN zrgB!mqWOx~Sal=Zul8bxZ$(*{66D0hev_>Zbo4~NU(N~o9*@yJ=Cj4U_2!55h^|*! zket9|A?mS7I-73hxn<#(k7k?~s*prH9u;D(H3)C1<60do`nEhV5tG8@V&UcW$mEUq zl9#@9`Plc1U26(XCx!H|H_`b90{2Rj1vk$!ypb*`hr`zx$CcG|3_N;yEKtU6FFq?a z^!C?Iq~$sqFKjX@VDKtsIwQ%~^-J=-|MNqvB_~V}wCrJO7AGFLzuYPLoyho-P7{M; z({_Twyxp_mv*OG_)8g^ER?aR1vG2BZAl}4?qR^aO+c++M?uX?2Ek|pY_*_f2_Kp@; zj>hR%GIIf^2aREadGxV3Q{2o@&Yp1hC=$;M%ib>ZyFUdHR!& zUd>2Vpv6z8Vi0D_=Q@EOch=&F&g)1*;Xx;fljfaR5wt=QaEX{Al|J7~h;^y1Q+$cr zi*+Sjy+0QYm7|#L>swG3ZBkDJ*=o{#PfVXU;&zy$SLPN=X?2k1rB^Y<>zG9Jvl4xw zwm+=L*k<8zJUm^|HMW?@ULwv|ENr#j7T@4pb+)3s!_0h7pvg8Z2TT(nFz6+^g4cl#Lf-488kUZZo??c&auGqxtn9g{m&?%wD_A{?troHES;9ibV;SSA zoR~^iYwyb+pJ9io)1Oc2Xlq}mY~t-Fj(T3&F6>s?2zZwKuF&kOG$FsRo}K1L>eV|IaENZ@9 zFM7tU^`Udj%}5A&y`rYXay+MpDItSnXyP=XvxwMt{g;Ax>CL%55{U$E)Or8Ii{gW= zRa`-)?;JB*8Ls?-I)`yr#e&{24wH+JpB$fb;!T^2GG83rZ;!HtN+95$dE8m;d2Ce6=wm{`Pr#>yKI=tH%Aa$#c}Nml;8y z4IKQiV|z!J_UuGmc^!AKiX}qg(5|_sC5n=)+IAN=H+NL=UA&KRXd10`S7mqzcpXIR z&E zH2u!~Q8#RNITD^%IK%d_hkd`8>sa?}QJ>0hyhqT3=rmEQL32h=3pB#7Ct*Rk_*~pV z4^3KENBra1hP!==k<%F;zE9MJ%5vruTubvm?kzO_=5Z{2+srlQB|Vh&2u!KgNyQj? 
zOq84LGoAl@7U%IuUON!D*){GJYSo|kv>P_-bAPrvYq(dTON)43?O~U$Fuf=EsAs=W zn`azeSY4U&5nUwEeYY9DQ-JZyHZJ$BpQ+H>fWUQjv(99d{2U`=>-m?1#OVHFdpi+>yMLl#H>TR?7>0}?$m7P5sTxzONj z^LvHKEosc<;reF$(c29FzUj1PSWglTzV34h-;|6N`QfxqnwtBBG!swfFO7ubqMIt3 z1*Bw0okk~ii)EXiF;hZI>Msyd*sHN0+1>I8nZ{3d1oU#-|FC9uQ_UjNCQi={4y^Iy z30p}Go$gwjkk>?bPe}h+AV>aydldOtI863_af9UZ_{@U3);Oub5R?^`{W1Xx|!3n z!_tvy zb80`z_EnBC=85h<_9rT(RpLw51Kkz?Es@|rRXfi7_L=^|3dH{=`7eB%mbIgvVjm@0g@Sg>sxsj|by4M%0&4jan!nRqSq)*rn(0Lr+4OXIEiX-d+kjqU=) z_vKX;dXZXv;qj24y5%U-w-Ft3{n0#Vs?d7(EOuDXO~&r!E%_|Dv&&npNC7{|Z;IEBpZA{s?l@DBt^GR?j*g zM}6Z4h+{>^!^DDtV^Ba<|1D%!va}z2@tI&>w4Yx{jms28(_ zvjQSZJ`06iqRcv@Dc{#kzCetKpuW=v1$kD*#fgpt{~F5ECBtFdC!vp>i{O{BCic7h}8&G1!dJ@nbz9G&TlOt=vo@-u}0NtPHtz_p1Y6 zPpR_y_l>odZ16;i(`&ZDNIW4kX?(Lla!UJI;FAT>cyJ#{{$Dm9@J3i0^~VdcC&ZM* z3x4)G=#=+-fHDSLWjt{ z<%>s-iwy)lle9QfY%&~}4asVN(4MUa;d|82*SQLE%-dFCCaL$R=0f zlNqP|Gt85F(#}!gxNO+wyLg5Y- zBJ-x>9W~3$nuM~A3$yX3dbe^FlicMoa>l=>7BEr3_2Ol}Y*~=72b#O#WZY2gBgu_m zissQ}i+*+W_-N{Mcz7k2_%ICT=w#iqnMP{oNuk$qacfpH47hSQ-_VhBv$r-j-xz*n z6cjAbZ-uZ_OYGD`5$eOs?-3Q1|NQ$IfWrKw>3~pv`umLi7x4KGnnI^MOy42d&K-SV zWP}}=nU0Q*iRrtZ*xZW4Q&~9B8HI+1g1#+fpi@ig^I#iJU74^9g@TdBva&MZ&5VJC z<+lBU^qa5McIWr>)YR1Ubf99{pPEu@U0f=P&Db7rUTAT(28bpev23adrx7#IViW;g z5b$sqJ>PUvP(YG<_wL=v$;mSuVq$wiJe%a_wfxCj`LGR{Aq_$-C=-*D%`AYL)=*Mf z#?)l+akSbHstgvOYu0VrXnNj6%#W0)f_kFAe({x4p_^~uUG-7MyJEO8QWgcs-nm%k zcD4%E6bEZTA3h>XpPdH=D=X`j*3lgU$9GpC*a4pEj_nG9*Cvg(`&V!5e0&~&P^y`9 zQSb{6+97LSXE6Td5~#)m!V+@?>Hnk#xMQKcm=aycUX%Wph7?tbuKAc`9Jr~!U-IZ* zt_7m;+A+aaplku9&7FCxvLh&D+ZCe+9t+06q&&MGaZIDg4^UO+#m1>Zz7N0!C4Mf{&tX>oy*E(2 zjq#t#4ZepQF9CUkEMddka^r8GT$V#}XVZ@xKMu%xhE`V+8A!Dptd?ZdI*$AB?YeQ8 zjh>yi?flxMAFa4{V^!P*nhy?V(PEyxvRV+@nKaNYoB+{;*C@xV`@S(pu_sfh-HB$)BKF3J9+u}@@d@1QG5r>+3yBoNV%zw+)sh`LI*Gx z0O~fX-)gGv7jx!&Q^n3euvAP1>K2_qXD0U_bh;#7x8A*&nsTX}`+l#9aJU^v(#KM{ zkjHo9RTbuHOl8Bd@{)L#$`vyO&)9tKuAVF#j=sk5yQhe%wSe-F`^MGr;SP~yN&BzY zW^G0fneC^~zT^zdA4#x;!!o}f2@89xIbu2b)>Q)R?X^kD;HQ``Hq&01>8Yu8euPyP 
zN60F=+fJD*3FkTLIQmw-eUQwDZ3p$$=K{C1R3W{@r_?O*=Ad|YXdl001Bi@##wD<6 zJG15Nkp<3_*1N1L+p$tTKmO02F2%SfY#LAGb%PJ&7Dc$fH7ZiLQa2L*zm+Ypg=y-HVfa1IJ#> zL1GT8sm1rd5!+X~gP(*R9>Xoe*Sn>aF*~uf=bvu@wZ{*NFkKLO?|{2XxKz;~fgTjj zIZRO_V7*NFzrR4{N3mSZzIwZ^c>pSTQUV>CkJbWVlRo~>OOEXMOwL!iPG~&X=U@AR ziGAGXaTzzYIbsLl-~#LxTY{dCLu|%HB~-oxxy~n%N`v;UkGqv^Cvf2e8f`;&fQHvX z3VN=L%n+QzYfX{kUWE2a#sU#2{l4T~KiSVTgEgC29Ca!Z@M>H}C-w|X)JSNEuG9?D#g5j3?wK&YXdyO|yWA^i_x)(d*!l5;>mv{9djl_Wx%_@Nh+0>JbKB*fy^{% zTlpC}^)wsO$(RoCLLBnUmuD}!j##$*l07?X%?K6Cz51Z2cHTVefSnqWE1U}Td3~iK zPNhoNi-L!sVY2%AUEM(bimEQ&>Qvkt)Ld!HDavmHXr2&M*r{t!w#3b-egkUa^OV2B zSfUP*CxgR*yjJ6t=5gG}^Lt1!V@~qGN)DrmIP0|s>F67SdE7$zPG~THo?cl#Fly6^ zY<9iipJ-4F2(0iDGx;V~65q&z&)gPlzf_pjA0#-FNiv6UFb_L{cE=RswdRT=TJWTt z66Oj5=W1b163Sf>-P4pEq@jGb1U}{6xD5S>d z2f+#JWVpT?0Dduq59G>41lh4c#QsXmUpEqU$%40@Ol3AmJ2t6_U8Rvr>BQ}0J^FCX z9q5~1uY6)BHFE0m;Q~z^oclhTl!1AV^J0p|WBC*a^S&GdvmlHk)CVa;w>eWf8KFse zw{JxZa+zHV17+} z+e$^&DT#F_?nVsiPo5ZiEP{w29{NdG;%L@#7muv!S*|3JN1$@2xdD&p4^m*aa7lf! z0_=spAqQ)%5Ss?>!T{xEJh2UiXYMA>J1-|J1kMr#{vau|b1~PMmi1h|tmq%~znrQO znDPlgiJFIdE`E=xMU~d5e-Lg6ec-EKIkhCXRhqHnKbsL@c(Yz3hD#5l{J@g9igglq zjmI`2A^Kd`NfvJexx`8TtiT)Ub=_mD$4`D;Z5HKqgFA$ptU`)u_=)DDuDsf=6U&^t z6-zu`9G&C{yA2Lsiyw(-in5Vl%<>0GDoaS9?jYi|b|9WCtuthC0Tgm?6SY9j5(9f) z&Wq;BzS(iaS@xS*d10WQXUU4Hbd;zWH8kRE<+zb~1LQiCnT6yElCNJm3aZTyvX3gF z7&fv~%fn;v!m|XNp)4DHdnPXDPDNd}BNWHZ$Qptn&G=+@ldQ~-dLb7*;v$nuqZV}( zEw&>f;CV(eiXCRi$=H+$fTdPhfLi{m5Axi?@cS5hIQ5euR4AJ3 zfspaWt6uW#5glATTFH<+_F$|xV|P9$0u#h9R&_~&SbP|BjTQcN0wxLTqgOep%?9rO zhNdFxq)TTeV?sm6jA(tk5z{fz@mR~f6=6<`T4RDa+CbGzRzPloJyV3!lakmGQ@d%- zsAj-Jr^hluQ!(K!R@Oh^Q&S|iMnYZw-KiSr&v$Y5)MY}kje2`0VlkEZ`QSOj-Z6?l zvV=@XE%S8fMos{cPwcJ^Q5n73TpDU zjOeJ0){+wL0P?svYhYsk`f1yg>DxD3VKO9!@bhg>6fBfELNPX4IJJie)HEV$Q~6s^ zP6a%$#Blj<)zqANoynN(eZ{D%e))$|1-)>7On=73k;i3=Zv zC4^}|D+=(Tm4v`5A{%oT<^Q0dVD#76_H|eCe@5`fV^>ZPQ*{G|1DY)ZmCc2B zLg&M0sSo*tkUGqvScTZF1&YTAsH5aM8a(LXo>|o{Xv60tC$&&$|b zOg}P5@_9btUWecblYDa%a-!%PrzGO6BuGf!a;NdKl9)JlDH^+ajW@~QnIyd|bWdh3 
zli$JIG=xq3N&2}b!9|&ucQ%=kB#X?myx@V58tFpSd6{gdz~{e}ToD-VQY0{&H#7{{ zzssniwy&N;(de~E7#6tNoMEns#3P%ZoDRjd1@#mdK4Wk#rUo06O4JqE3-O-y#7bcZ zPYaOTTk0DUxGrT<8A1KeOGya2FLRiC<+tcVC2W%LBHFUWl1HC@zJXBmO~AA4aMKri zr=Vap`OBXoU7&NPmQ{VQiMp67R3uKsy$j0J6tol>n4mFOD05W+5UhG^Zdqvbvb=3_ z|5`BBsO4;hzGo@>VGh-|E~+p9cz9x|=`m!IeHPbAPrzAGSpPv|PZft7oLbeXFxoI@ zmOAxq2PJs;p8sGY94OP^BOy@H0=P+!e?C5GM(NGc? z&laqhh2vtxdu2CHDqt{`&47wu`iE~Znn-IER^kA3pk)<;#yKET$XijoyBRB!RjD#d zD_ZC>zh#ntQ18)u=PA{@Ho9R_gRUg8eyTy0&+!^Y_lIqLCWB$|aOj`9VZOuR4rfH~ zur1SxPr4k1MYyc411zxZCwiqW>l9YtEC^zr3{YL_4nUK0%E+1T{^+rW`7AkX>IMkP zN2v{8dknBmNwd(YBYS8qu0mghIKneEk<|K2(QLk;)}lX<qTteIfFm*)#cxa{6|ZDMtyCR-vJ>DX~mKWAK@H$_VxZPGggfL#$oUmEufSozl@Nsi&AR}kx)PV5G?SmeIH_q%O&#l?=C^-$d0+kRp&8DQgOH#VKyQLfH5)hH@?(Pl+r33_| zJ2oLAAYB4dBBwiv2boZwDi)C7EQVILL?7rPzs0b6_~GT?{^`o`0shEw`GYFFkhek?HCX zNZ{>qum3!?JqfCC(tM3$-!|rOExI*@Kw(8;4UDiuKLwWxVS(4jZ`=BIP&= z4)IXJ2@ZawkHjE1p)`-E6Zs-RBi>T|Xj#;*4t=mYBADsbVM~WH-Zjy|wvwrVDS|5iM-k;jKO5I5D3_K@(i_Z{m}^=M_uNNNiN%g_d!oYU&~{!3rr6SMbV9fwj> zkRySQLcY}MX&rgyd-m`5jBKE;a~@J{rDXiZoagK%G>1P3N}n{BEZpj~at4q=x>Gy~HhDF9Ih!5D?d&2=IH5N=^21SP zEw&WO<0S2qmTWis50A zSU|M-!~F04GAxt!PCASY)Qt43x829i+q?aIVb$Zbq}0JX-ap*AUMX|w3dzX zwrB(2GFzxZI0k5G1gPB`Cx+rL%mwXbInIZlE$$sf)5L}h-gpdCC`cmnCZPH>B%}7MgvDY?w zoFrw)0?VVuiQI(baM1L{`?)cd{P>IH^G$Xp{jlT`9AQjmb5kDNK*KS-leb{1(LOSX z?`3e7b(IlPn$l9rJhO2ASDoxv3;HIbWS%H=OQ5q$npgd?e6eL6#_`MbDx7M<0n!=T zoazV8>Ki_$)UX@I2z6!K&De{_n(}u-H6jZO?}X!WArm0W&i7WU5TH$ucv}hJb~Zb)-D)x|ov5EB=)-eWT-UorHIhukXF6 zk-nLkqq?^-v6WUFO{0H=xiYg{U2z=S6+%ftb6AOXDX3YNoT{X%Y@A*>dwy;iiAxlDK0bZXSG9|PW-!3SCposVh%v%b$h7BMmhfQnUSy6W@x2b3Gjq%$ z^^8rEUUu4alzHYs=BMG*+-MhbEXQ|vsd%m8vK80mh-5wF!-nL0Xpa21hjCrlF39lI z`Mjv7yS0Rixj2I`bBrHeqUS`qQfJ6#sD_rcZ;YPEsTaaDINNTp;#&mUnO(mTvd-7U za8!^vN?KL+ck}`K@S}pDTW>%?*{KMN&qXi*kHtF-dBnP9S{8{5-)!HPufcw8lu7p)C*-^#S5Cp3z!Qxx&= zW)PCIew zaarX}Ij0j8jdsiMj#pirzudNBHVXo>atbBS4JO;0F`GkM6VKNco~me}Eb)Z7n>9Gq+5IT#V>Jb$;wbU$FmFqA6bbVfxAC>1WM08l~WJJBpt0 
z3C^ctHHwV9jq|5xWO(Kb@i%s7GE%Cz8;Z?_Y+-d=0twNrIKy9aTOHC>#YA4|_K1e` zq5W31t2snI%ev-4z|Yo(G!EGfjvI&05K|yhh}mOJ`*J19gZ{lY>7yuE-dI1M7YgMf z`@|T=kh%n?$`H`@MOTlY*Dj0M)-%nrFAbIUyFymhsc9B*4kq&Z1Fv(>w(XmZxSg=Y*9SXd1i39rZ2`Zrpo+B8K3K*LH?FB$fE#*WaSU>a{ zf@EoU^XqQrM_pZ6Oe^%ELlo0K>W>U|1g^Y?a&a-*!QxCmp=#^QrqzQp>Ee@4RNQrl)y>dw%fV{yybJ*EEPdTKIk!X9VAT)NWRJr$6- zaMuMbeBZxhofQy6Ay&De8%8cUJZkB1i7yulUxbZG7AXjbeAr%e@ifsu88sifEVS6` zS4CqF2teg!O+byngng^Xh6e38$8?2orYo^CV}Wl*ahqG^J}-j`17P3u<65qzp=JtvbLZptHG1zgSsa;L z8`yC?n1z@&?cGb@=|ds2#93wd_Yw=K$?h#L+7Tj@IV;QQO;-t?Q-zStej2T+ud;~3 zW)h%q??uqjZ^L+^qS+5R>-)4bvne=j=n!cS6&|nx>!{pO1o@-raGRW;7nNLy+=8h! zQsGF_htYAR>*xJ2uUZuN|$&iYQ1an z?~6geuckvx6so0b1UD2*Uh+j5t5=_E*q3_ee&I-Y`>}NXd@FQ++5)Npr8i$#*cx9& z-aaPR79UJzUDay+-eeSiuQZEiwsZSEyL`KvQ-36*lUiTU#Jp0AqgA3G8FkA2gUWHI zM+*f~x1v6)UwY&V{S6DiK7Ocp!UXGtkf9RBYU z_7Z3mkAg`dmDf?HNZGyVGV8#M)coJ`FoQ8xcFeqq*^EH5674OI7bKa-}J3oHAIwZ;=tY`&$Z(KkcuKljs3~VHmkq}13b2$1Na)`JT@a{ zPmX=&wG^J->|_D6vt@^}gw|!JIxspVco~(#d&c-tgPf0ZE1*b#_cUuza(nsYgAbTz z)$Nvh|7C|tszLnPjMW_8FCsJM1^d?7uCRCPm&%esW#$31s`lXhHn6=H{kC7$Bt_W) zZUBTDX4jlTr@#sP;S3&n3<4TbT@yA$<`y_D_7%7uixvf*kAgcg+8|J4+xzV=Rn)(# zYzrPVK`u)kgpLk0RdfXD*yUE2x?BwQ#$I54APPwINV?F-boh*tcP4Ic1p6V5F+ac( zdORhxC%O@i-vKN=!CVb2ffaYa@VUKvUI0e0?Ei&T{a0+0bb$h7*@XUh7}-hkju^B0 z$do~z#}G+|QuC}d1_uEOd5M~FluZ$Miq1B`e)1z2EfX*RRltqr15PDz3U9~$l4JcV znui$Eta~Lo-7PD~mMp+Ew9U{f^xKR9drE8VeX4wDTKpP##$bevAAkwByy?@+*4{ix zBIXW#>h1$W6ZtdM|6{{Mg20G_HtWVTqLg*shYd_@|u&<0aW%4w{9@4%6sk1M7ehS-+Bm)CQ z&hG2YF3Scxbr+ZaSoRi(gy3M%MWE07?Kt}5jOyi6D)t+n;EUl;_D2ua~NRo+SAPxU(-ObwSn z$x#&pD(J);&DW^H({wX+KL~19VmSWDKH$$AzNK~@{;{Rz#X}6t{3~|aAHBEDk_)K5 zMXBUW0yIvr{%|4%*#~Qm#$H5VPObs0EzaRrSL8Xy;x)mITI9c%u?==E6MQL57mBfQ z7b53@FPCj^?oblJSmMf|e*?C9@c_ToK=iq5lJFt3aNur^eWpO(HcWx)X}W7Y2txel zjq}5mM#Wo~V>_DLL*#prC?n%R8++6GcG(ktM))FcJjtQ7=xIhmXRRI0#WUI{uns8+}>Y)51g4aeqcAG zA6Oo~ZfuivdjlMuvKt!v(D}ZB&x9YC)UmObX{`=G5&h2Y)dH{tLfhs7)2gn?+`8Tt zwr3ca8LeIe?*@;NNcO_S;a?zFJXrJFI?ZI<|20!0gVy`@<$P;dpZK^^i*-0V4~S*r 
z?_ak9E6+-`w8}5w3+1iBE&w{(WEsnJ3%tGPcmukD56xa%mu2t8rWeqA&=M;QV-iQK5;>-;WIHY#O)DAc z$0rHx63220{y6Q|GbjRw=iQrqhYm5Izfl6`JocevVC{tig4W}HEP-+|V%;Tsc9jHZ zzDFHpAqYNBn{QF6;ukYCWo8kO?-4u)oUoXuVy}=6+X5%|5*T47l#ND4Q`&LFe_~(Y zL(~S5gsYI+7XriX$SKkvV7^7sXD%nmc>5f61w7RVs2cie6bq?Z^8VTz5p{Urhlv^+ z2Y_5niy-Vdg5D|+J{7#ct2k%@BhqwYYnjQ!{SDKloX?R=os=!$gGHiH84CS(NzSPP z_Gw%XXdFFt2To_xfPRCoL7Hl&kZTK19@G1i#_!Jw7t#6`-luT@w4dQhVp@@mD(CsU zTucK>LA|eqf#*t|hKjFrPk`IFvKEoxEBOO(f?E3n9)CREFlMI2@TKp`icy@qvQKc6 zD+`#^k|tUVwb8`gETNq+oTx*Ob1DzlOysiQr=0qJdC(tV_~?ra5Q~9WLhE{$j6N^^ zf@ihA#~XbG=4EUkKmbYmrlgn6^9BowAOXZ}fr9kC$Zih8=RM##4umOYy#^vTy@6K_ z-tD$$RH32DT3Il&6l{rj#;ggrS~8aPh!>T9Tq!7CS}4^u^n%Q_on{UJ8Nwgi>I0Xm zJ>7`7hets~2O?9}1-0X#MKs<(V{PXA5YL#FqAs&7&D@N?C@+Z|O$d!2~ug`F2r~lma5Ufobhem%uRo`DLgrGAklQn;_>g=(Ee`y8XvKISsx{#+W<-se9%LiZ8jc5Mng( zd4wLjW%w>l9gP({@CNEgFCD@vFjhxUKgqQf{53t{3JIXrJg-N+Eqf0q!&?-C=L`+C zh6lMNN%oo+BH9{oKxG0{3j^zwd9Is$BzV8Rz3Z{*0s2Ny(J=&6@?PZ(8K;q>nG&QT zfscW_q;EkLrJRc;#k0t?ho3T2n}Iz31iL^h(JZ)SkjSLb2`*3Nse&Fj7gIc5(YjG2 zgh2$U6bf%10>SFdwkEMlrA%Y5{F8RWH*POS8OdE_wh8PxEaxQcp zBRNE0PB0lA6B3l(7N76Oj0~lqNq4OqAWYVp@uRs#A#=uU8MDqJAx*f{acz5VN9L`q zUb30aU=~3Xls?m+%6O=7k^jPEss7cJaJi58J+$<0W@Y?948e`)e4lrw1~(8%OI&!g z<0{KOhDef+4!mRCt~$NL?83P7Xq?1m+iO2HAkK5X)y1R`Tg%WKRoSO>5#)MQl)opO zA>Wv|d@a}EF)s+x=xHZn*vzm)>#C%9Nz(*YPx;LVC$Rss7K<^H*}i*5%KjOO2yt>) z>jK5C+w*L?#flNtz?Vcj*L$auGB>=BjV7WH(CC@0Tnw*7RQDv?UUp`4|~jH|$2<->cR9FTYQ zaA9lSi4v>Orm_@C8t+DtI5kL@!E}NS-=-sFO+H^GoX1ueJdBKZ=;b~o4X=qyiykxG zp(HRTX9#KHMn{Nye8qCkaBL>tAb-;z5$irBSL8#^p9AdqtOOoz<5rfJf@;7RO?+Bt zkDOvzjt@T;6P+4+^nQ}*f>w7Qj>yksXUK+Bc&Jd5pO!45mPVVmyz(PqJ)Vy(`|gJ! z-rgpZBeVM>>?j@c>5>@orJDy!FQO*G`H1qaOKB3`f#>wD$y%YDcK5-Cg+up1w+EOh zJ3vwbfy;HveIz@}D^AycFAnE<5r)YZc-;jA)bI@f5&ixdfk9WBil(8jI2cG&E@T>! 
zjAd$kcBr_eqb?M|Au32y9kq@zXw}#j!K#Q7(BIKo+hks^N)VqrmzxiP3}!zB<3~Ve zfbL8J=H*RW&us$oEPiaVbicOvXJriD2ec{jE-746ei$@D;VeyY3H^_kGWvx{koqFD z236&-UBHW8#v&Q+jRy{A!8a=vuf( zTL;lnSM4HUJ&zb_IZJD9ug)+hkX0ke&3gTHTA7Adi#t4F<=#l;G3#1Tw^0#GtY{1c}TZ?fIoka%~t;uYG zV-|}?@~ve|-GX=pkP_0q;b%8yN;z4a4JF~lJNhd1CRIS(S_SGD-ViNMBOU(y(3zSP zM%L2zEazmyJPK=zYnfq^Rk)TdrI19ovJLtj30bdPLU3NC9b!#W@Y;Jy3&M$<>OLBI znrt|Y56L{xKBRg=s%=sZ6u~g`?+@+by)lymi1FzXk+o5!e@cFx#hpPQz)}YFh^-jJ zONmJd1UW%1==UBgh+hkdoW~VzKdDCfj+QoKz3XIG+zLrGMb48fnmbW`=B3ljY zy`gPm9fwuTJhv>7sAkrwFuD9E2TtwE?FYLwSTshvsHS%N4W490!h4Dpvr69NnEyCy z-1k#O-C6a?Urn}QGI>ZpIH*oUCvvPL;x=;14HvFdNX(*Po%@TZ3Y5>9)5w9+zG6s^ zy%;r9gEyZp;bj;PTyb8#YK*#DLA^LpuO)PxeGPf9Q>({TH#sujp^M=+AuVJTGtMz( zouX>0-pnF;kb@L3cgu6Bd9KE+5H48bC~mE$zOBB`zCDNAxd2n3GwpsNYad?DfU{3Q zg0pf6p)_l)DAj(Gfc2PUCt8z{wRHCMI(T!_qxrkWu(Zw?r}X(2$!i83DMK5eX@q69 z(40lPM?9^}(6%o3s`vjy{y;RhBP&N#G@UYBy|kN!GNFW?J$f?cZKfe*ez`&Ipr*kl zDauGz$!61MZIk={M6vF~9e8@UG1uh=1Rm}h5MlC>3Zu93DzWfvE7CKZ`vq~FRk9nnj)vtDupW|kW2*`xEQ&TJ05!Lg&3E|V5Bw1 zdGRxOFBQAcysI>q8H%{~d{njIV74sE^=~N_mUJ;%L*jRX=Mtb=uf#-T|85lRd4l<& za<}lwN%I_B>m}0XN8#7yCC0&_`sf36+yGlkc{*a0m`)Bl)t^X{*)EI*K|;*(=Hi9Q z&)a^-`yS?9@2NW+Y0LGYVBe*Z*?Ek~fuH~b2MD(;$g6d`l`Y-IbvL{zFBvhH&_;|N ztUPaqDLeDiiYq?3!$NDUtV|*@vZ+|WBR)qfwF{NLR#eI0oZCYmX8x=rbgHlz@@(yn zEiywKFlv;HVwhGD#QBB#c3?#GeiXv22_@6#6Lv zK*eQ)S7|TTG#GFCBwD9huFE%pE|zPA>L|lQZJnXOM%K6)zA__{hX(U10-kaY@_n0&Ca@2NKQvS zvG1mS!(NJ80qR}ds<@nl#z&8kh`dRAT6LBe1Mag=c;dqMvQeZBpdURsBMQUx?z2E1q_V`cx7GfW+(3 zlAz_^k-<40a6+q%mwmQ(^G4pyzLxG;|6=y;p_iI>I(`knefujijh*1i4O0t%UVe>X zDvTb(M*+it%m8Na5!Co2qf9`BB~s58qF>PITT~IBL)|50m^J{3zFi)qfr(u(6%48yklYOm z>cpxJeLny?(QjQvJVIc6Z9nJ(S^+-p4Xw+E+i4mr1NQDNUy2Sg5IZwHcBZz`Y9m!0W=5s z)63b~P`{=Lp7DC{=?mw=z-1BiCg$Y=(8hRW4J6QDl;HgX&~le&5dY?UjLpFOsSiMP z?Eob8?DmuS%e@9C(2NG9#s1u=AvQ1btrrPL_WlQeSj|j1Ew2z7JbbHD-lCbid6jA6 zs>M79?>cgB2IYSll?!;hmYR(43FFo`pj?IbPyvHXihbf}KuZWx-Rcg-#QnK4|GCPF zx$!JG4&ixk3huhV=4n5UhKLM3M$SIe1|UhX-XKxw3wEGubuFP~K^;X31MQFz2zb+= 
zjqlg@kH1p;3+n-AQZN9XanyT2qLACANY>HLIA-YSzZK*m6o1u+&cQ&p%zxv#Jg!`i>fyDPKAr+GZSmj+g`l6BK6SLRVr{-@p&+rA}Rg>7^3Pa|A3z9 zlUTR(d94CGqz1o&&}87#hYPvw>h$4w?dT2kGiFa1@|ivun{0(-YJ)cvaSIv-tH{6g z9>W(A$_=a5_V1Y`fU)`P0M$G6Z7K_m&kPa5>Fo--Ubb~QVer|#K|1r+eJ0UfQYtVP zfW9i?wv?O9V}&|ong{uZdP9sh(})=eGrk>x$-xTL1wM;LPFi+H`gG^ff}}pd9L~t( z&@Vb;Xg4D=WX_FJb@~lNCeBU3<=ZC>a5TVabc6~tCWeRtyKscJf}r`7Au5>hDy2lP z>26YRF<50es+`{B(xuf?EGpL|!rs;AH!Fau+Mfn9S1E;B3TM)WYBV;e5F7yRhyr<9 z_CfR=SUz$rw;Nu7CdjI=>K06l{h+O9bMQN&0|Jf1HgcCqLkdoEr@$msl9iTEJ}Kd{ zca0nN!KIC&2*MnG&jSKQD@_K|uM2u^kKI5K_mGPgG5Xe4ST`@4_{s}-qvmoLSCfIc z<=VTtM?Y`BE6Dg60w3&Iku@-Z7RL6x-RfF5G_pN?&82Ns5%*Jhw=KJmO7sj^>fhl2 zsM{KO&H+HD!3^Hx{+D1x=KvOs9W>n=Y-N1AL06G&u6bro-ysiqeza}ktH1`JHOiOq z2A2o*qQ?QypI{8=l#Jxf?z3lz0T>`tX@yM>P(M!a&8aHme;be!kWVC!DU_>J*H;!R z0=ct;p@!%ga+J~;93g^798tvSKH@?Bb^urD#4&S-tLQ1M3jYUdk&p@a26+*_lcjm3 zU{m8*VkcO+(guxSepCTv4Ax!WC?z4f&vl-W|k#HEAdV=rTv)7D_3_^_V1?{2n&BGB>HP&@0bi-iX!*yW0p;9m8f|GJTC{r%q;7?PJ2gztT>q|k>arj z?lgcA2iT@HQ&4ay;|;@}%MxHZ#T_1gaq6N{YkTzkwOu6*_Ra=}MM_P9kuw$@>R!$g zIr4qPEXWlNDRXVQvO`uT0h{wa@8>X;1Z#=aP>>b@h7^kP1@M%o1W>}k_wq{$$xSFTcPPVs z+v5pt21vN00o}=avr}gydi@_fCn*_R72+bhQdE;8yGgdQe0_oLvb}VdTa2N`5 zMe{ajCHiHNa4Q^%TT76l6Aa(F)A&-`6ytlS<30zh5|hxjvooIXizr#vl#x58GT)cI z>of*Ap;KvSy+Qym-WSp~kk?ZFB%k}LPHEUuwR}z|4Q6sx=3#;R*~9u`74)2r$#dCo zV+tzfOSNWfV@qtIVZ%OE6{mR@HL{vdw~B% z^PZ~fJOlsR2Yjw)lGA918#5K~UX_NQa5X~RMb+F7r#biWX!Cv7nYz|eOd>-IgxXh4 z*kb!2K#PO90V8`h%$IbD_OxnCpm%+h!CL%)hE=sNimw5ov26h?o&##$^ZXX_CvxLSA`y7VHmaqy(=gy^ge+S8svYAwQGIPI{IDr`8 zECsg*?RmzZT^+sWUEs&{Si;KU{FX?awFMU&S6?Kb%YE3E0 z!3S^%Z4R@c)WskPX*2H$f|~IOQApNoyfyRAt*!hWdfQi zH*)lxr5DMpm^0BQF#Fy9fx^2-DxGLiws^<_9;`a#)bI%-w5AzNFS_;te@4%YrdE5- z+WKEaJ~n{GWh5zJ%ax>KI0ZQF>?|9Sa;{K?e<&31KqLM@T>Tlw~< zY(SC&>R_xAbOea}Vj{?o&(D6Tyg41>b`e(L^=~7+U%oeJKbgzFg=Kz|U_qb;Oc%QR z6Rabbcrq6mR>9a}(E2aN6Q6bnm}Ncq;QWfCX#kJBu;4MzjZlD7VG2`0pqoFAbJmueTkrI}1!mOU-l%O% z-U-dJ*qww&!o@pp^vr69b3lfX^CaUPYqL!pgByx7n0G;uU}0G{^|zbNEZ57eP%$F- z5t%f&mmJ?bU!xg}vor(jIAj(Vvj>2O`3Kd%Z3e2X( 
zoex{&bzW)&ND6wx9xys!c>8D^vL*y%OL{vXw3*rE*e;qsd%9XGW|`I+#pa6UY7FE% z;y+GhjzC$cBP~h^lyN}6mEl)`fP{j(e+d+8E+87Q)i)wytpaux&Y-wCiGXqs2b=@j ztj+WU74t4Pmq1$G3YdOa4ye7{z%&a=A}O)B=>YDW6m}pxLX0}2vWWDs!m}2 z;XYW{qKf~Q9*lp)Y6o0IJuFx)8PIDn)Axo7m6111Fs2bx9aZpCo0QjLb@m!okCV;v z5x>%Z=R&dr0prop-cTQ-blj=M$2NMiwFd~LSQtE^CSSbSE_O(k--*#?0A0U$T{C$+ zSKD*>-n0Z!OY%`(h&47VEsqSOt1YB=fuOB7tcpe#Vo3G$QmehBr4bXiUt z+S@3NL#icRnwwSzj^u-l^A zpgoq7UI|Js0c)x@fD?#Kp-F9SyU{)_x7tt1*ku9oM7+LG_;{NDY<$_G5G2&pq((g9 zTv>)o{(wZ16a}YH1^Mh^w9vxn=omik`^*Z4h_Mb?pn%)LMPL@z&z2M_MIXcP&=|3Y zXv`zd-o5uwQXENvBR|>h9gX7#+~DGXD+k#NYA}c*`K{l@OsM0>`S@rLNFm+j@B`#K zSa+gnbAvWxm|)#{JjEm2&DTIjxvns_%sosVhR>qVEX|I`t#Byx06y8qWfp;dpt(F? z0JWlljbjHwHqXtNWL4~rNb&>D)nv~iJ91{bV^9qaI1bYu(odEcj17%xQNH{=Toy}# z#sVkYBUkTbhT;SjS-c995X7j(iJa*-U)Mx1HSsl>Y1i+n5FoTtj|TImMk}Ik@eR=j z;`K_WqqdRr5@g!q!KHiSPmUONELb9#k27ksv=dKyeg)M^0L@s1v1J$;3PhDh5>0x9 zIv~P?MS=Wr1MNIRtQAsLD?iP*>hvJbz;>87PSL(>#}1#XY(k*%nATC!RymW?8TCu{ zAF@KFII?rqx|XuhQX`!63}`u z_O%Wze9@Q;iW;mt5GD0B0uxy;^!3zDx-Y1avsU6XI}lRVP#T;*%5`h|tA7SHskqra(za1n#rNb8u)88x3F3!3G=%kh9$xDgGdKN|zy4W97o z&*!U8)-3SsGMsdMjyI%lW@^PQvBH%Aw09meX|yK6nt5gi18wBwql3Zmi%$x6<<3Xiz-V8)6-vrhFFu(n2)o_QuqO#+WtwV^v_5s!OIl)J!@ zK>$!gC!TfFqx{p!pDm4hE*|A6#JCR8Ov~^yhw3r>Jl(Ei3jfsw%1==OlQ->tA*6Dt za(x<>i0`d{K1&apii_s5$i}D7Xzw-+2P|H^-p!?7;)~J^XOn&YHVic_j4NDGHXT3Y z!H5_p`3o&;5tHVYwEUmAuDez+I`i^Hq-WVvt|KRfZU^nGRF4Eh9;|)V&l1 zT$TwAP(9E&>^cs9rY32b=|A+4n?kdq!sq6x|KXn|X3`7E9Q~?Kt1{%Z;R4{Z5e1Q6 zOD9RLDR8Kc)2JQtnkHz66A$pO(hgJ0B+K`$<1+wM%@<7tPlfX}eGwQ~sLNunk_LyR zZo0tSH5hOF$f!9;E643Kr)kXCF$t2&)PV^cf(P;rIM)h(O`dScKkRJ6E#EAuevaF4 zT87*SQ!JOM410blFB*G}BxTov1dkc^hjeZ7Uuw!E4C6_~7Phd*Ri;(q#EnkQmXXzp)!mlE=$#i2fG zXa!)Dy{?_dk(qDhO7Td6z8c#ul@&7C4m72>bg)>U^FwZf!x-o2PS~G*G&k$!PCne1 z75*&$V4p|WQsGI&KIKbjiby_s`tw6>#_pI7sHt{s%!u1;>0R;nHUgg8BO|g_?En(d z;)Dcg^<9Wo9j@a*p5(>$t$#v6a^NpqC zXrxok0L4l?YqYmkN;TDbnS{K$cu7ugjEd|VACSJfB~Qml$ue$*dt%WRFr6Hgr>XZ* zLL8iA$qGO+k4fuU651%3E!qVi%9-N(`(dS8{1)EMa!TWl3XWHVhz|~^BqGg*$6)GW 
z>j)i$c0>R$5mKpM-K3oXlk0)J}|nzNWjt*a1r_1@$}*K zVSXWNAp*V=gB8)li91863w16Gm~fv77+XU#-rhQV%A^Cjvrg$RKl!h-C-jN4Qq`O8 zwggo1KDZiwnwx->kny&_q3}U7+LUu77g+_)Z0h%j~ZE@FcF7J1c2b|EE!aRY1~ zmpJ_hB!%hNML@a8tclJ%{y{2V+HkmxJoW^T^rX=8dMv#amLo(*lXSyfYl#Eo4l5N_2 z+6IofrltXEhCeWyaMYt@@N4mN91#&PaKh`7M`wj<1YpZH;l*jA?>P2|DlC;R$zLnB*?1oRnp* zP3iAqSs5Z?sU=h{VBwKB6HwAq)-YibJs|I+s90{TjKyTOmFzJz3)AsBG9>mfXyULw@zOS zk`m3%E487qY*M2Rwr-GA1NR~&sDif6#!+Fv;l3^S?3x)k(RX2O+gK*x`# zR!2`jqu4BsE(qJ#g!mpfszvDW^mybp%VNx%m2i&QXg ziO^>5on%}=r#8K9yft)%J6fc@+MQ~A?Ep>uUEy!M{~p6SV9Wa-*dh$x)B#hNx&1rz^G_rQ z^m%h#N$S8$yZ`=QGHAzRo*oeT@Bh~V(Hgw}`Tu9ux1QV|`%mEHuT*<58UfslX=3p8 z|N3P8L{Zeg)#rcw7z?h7b$9dO-*3mikykc_LZtuozJuVo6MA_({~zd1a0cD<-%zwa zegqv;`iZfx1>gOC{QXIDkEQJW{w4qX@{-(ORSZ7N|4-oTUyo*I#Ou#D`uzinvq4Q` z1zhj{xRT#rGAhU9ne|8A5`$pO3)EvQt~umVt)wObb>+AUjW^(SY4X%H#iFH^TTLhf^z-{ z9$Ntf)(cv_wIgM+I8BL+Y*GL!S00Zg2bk%%pcTr0*|AFO%e)TBxE#AD>0(V*?;10<<#Q$dS25#cJ;xSyJB)1AmZ|uEi-#>h$ty&%cTr+%{ zvIQtq_SAx2z;;2J-wBKyL-N4|!l$#U$BmW4sax;{_re~yEI$M|rY-w8!7AU$81V$? 
z!2~9}Yk)odb(Zvu&k-hD0hbZXD82wV3#bJqbaP7Go`J>$z)f!gZh%VE&;6C%B)oNV z`6>AdCFsw*h7ZX649cPT8-SKQ1rEijuK-#GPf$5nGj_jMQ)+vFi{r4>&z^RGPt_)% zcyiQ4_`qO)4-mke;Br}n0o(}pz3Q3_MnHnM9PIE4<&;|D%K<+BMv19_;AYO)Y9gDv zKN1VxGnGw=0X%v@qYifRK_%PICm%r5+G~J=Vax(e#f3yk2CVbuTkX%M!=#UeYio~l z86MsKR9H9`(EH+Fez@>fZ*sRf;LBtTd(qQMr~NuT?=QXmwUtFdrKKN(>H@qR7K*cK z#lJN=&FbqtdSTMir;JbX*VZ|7v{N@a__5|AwGKM5SbxBPz7=acwVfkFvbf|4-d*MQ zjmq`2bhv!Y0B*b&@EwRez#s^!iZxtscng!XHIG)h_Ev7Ck;gHBG%NDU^z=^Lf$$`) z2y{WcJ~z%Vgm+TR_A4LRUVtxIy9i2T1OJBj9TNL|sM;Pr=-K)L;Lc)JAlXPopv(5) zsrYC|uw)G8H&H%O-Jj9movL8gicL>rVfBf__x}z!ES<4R-5^nxZ(1yQ%I8`Y3sJy+ z;pNQFk|@elgrdOC8`4N4fQ#!6+~egT<9hoJwfvzuP;xx=-2&(&maKAd!|*$fIsAc1 zuHAS1I>uoic-1NA%14)eiCwwBHWn}ySW_LUIk?{wZ}jQ&mddm592_JUb4k`$E4HPv zx;^XpG0uV3e`W!4$g#fyh<@Gi1c`F-31mWWE52oioFv-TX=lxlfQ4R4Z-&$`n?-GD z3Q&3Y#%N^258m6P0cg3-JAzQ3njrK#?z&8vTN&8K)^#1k?*J-HIX{BO@hk9@T>(nz zej%&`av>g_!8WP&6a`8lARqD(8E1dhoA3xwR%l^0bZ_Zr!|Gp-v7(RW2p6*bA^?qn zQ79?16IqK|*dJXPwJC&(0^5!(ZUa{$y7+I_STBP&7lBcAJU29JAjpnqOCM?2etX_> zv*t`WiPFcf{CrPogD>zkEHLS2WgO(_oero#og3%_DwJpXzZ zgusFBshDqx*rImFC<@EyrVca*fwEOSm+Bmg%!{_jd$|m#19Vxrp1tn|`=qow^^}36 z3~NKy(Iyx8mgATA|IBrWo_qv~_Z&u4gz2-0Jv+DM}bL2 zJvOm0lXAWZcrnGt3j=)89DMD4`}88q6Y*JVo(JJok?LYoPDQ;ZPtL4WE-uxvqw>Xg zIp8P#NOz>itz~I#j)1z~j=j1aOi%vlwnC`bPo`rF${x@<=K(z07X+kGM9ATwK8U3zVN0*Q_EewM7V!4k4AU;~60*IPdsgpz zy-(ji`~np#-V3Y$ptVjAGRs_{lWk;Q)c z$2Q?ogRWDV!CkWtLnV*&ty9cPerYBO(y{0;_|`>u(gR)nWm?AjwQ~?$44xq|F*0wl z%A(Had}LgkatD0TDA|U@ymHnhd3Neqzt#O94oCHh{Iitz%ktUUAG@FO`fO$1>U+X~ zfA$D{h`c299_os$IFndJDS?I>lP!=m3-ZUvm$?C`+q`2KnS;eE6tltRL0@r}oDY{L zweCu7!)cOWMZUKbGigNUf?qhXZD6Y1oRz4BIJ9#C^oe2By_bBuNY7*4MjC5&4Z0NU z`&1{J?#6JVBkgm`Y#vBLQH8ikxG)Y9{HVIlVw!vvUMkw@%bMaj?*U~oD)YAa&^9ju zb1FkpeCroLe!Rp`5(<1tZU36`O)PR=t@gwCyKST%Lwvl6*RS_WmYk z8GrK$))tIG3cLFF>={p4zqj}<+EZ)hGPmUUS2|ht288slml}pUBF;9XIS0^`&U zvP%|JD|Z@GSnt;-Uf;S^kMZo=uRkBzzb`?JzwuphbM6@)uJ7nGzfs}fxdJ8*Lo4qt zUZ}cQZt>pw50{)%WjEgSy9BonCF%C#Y1_}kV4Xip4NX~#qmL(nprv5`Zir?g?o;tm 
zQ<55KifCKaoiQzTwYdt$HPFUO)}SZ2EOg;b!!XmY3T>jFPU_T*V3xLuzKavWL9+FO zV}p>rw^TV762YLuhHdi}j^=MqpEq?(8G7z&9df~F5+XbaF{`J-CzSp~yx2%`Ao{utG(z0+!$YwB7tmXbgdl<}PI)m(;rO6|I zS<0myiEoN9&lCosSZFKOU$81F+meFBSh2!0aYGLx8tIJ%r;iUBpj>MAnI@)q?^+c2 zwg6E?4FztzH(A;mz{xU+b`Rtv!bjlWs-qyKQ!^0bL?!(C$72TtNch`!(>kBXd*rhq zx6-G9n_MUoRbE}Wi!z2sQ+T-(ubicWdBVLDzFclilU~4a^e0#qLt2T^YhU=MO@e9~ zL_erGmhRYvZoG^)Q#>`Gd}F?*W_>Lp&w7&ivd->730>58^4L4`4E@xqNtc}p%5T>HpVUR7e=Wo#R9Iq-=5g zRl?*G1hYFAswHyvOJvc{60`-5nt?ppgZ1PB`a=TIGfBtl7UxO3eR$SG*E66Hd8q9K zQ29ZfJUHc5S>8d}CwpvA-aHxD6x@dkSJ0Q$b_bkW;ZFhXq_*p57_e8kMOk^C8QgI<*i3kZ-v#ujSz@ahU2sv;AZERdTJUVXF;4t>lP*=^0gUm~rmwnJ7e>ZbpMzz=w zb5xdz;6}g7fY3{%VIL$3s|3T48V}?qy6)@Cs%iA{+giem0__EuJw00{Rn^@OA;FII z0D99RHQ8_TPJFOCy4F0##cC>nu$1%$tF?8^H<41!=8|klHXmVLz@Wa}^~p|DjaYA> z$KI28dW4@$^J@X|E8bPl-D+OJx}_&wre>$3@y4$iYt5e_t0{|W%zgW`IKOuJ@9h(b zCj7LHzh{3v-?eGYFg5dob@0P}^P^cjT4Vo(O4q#oYIFXF{`(o1AYpO8rGMt}1o$vM zwu>$8$SYBbzEU>KY=~w3HUFW}aAamBpCy985Q4CWLo#IS0aY*7D z@TfAvjjoC{y=hmG%(4w#X(Gtb-h0TLrWQZeNVkd0wTR)KS>WQlG-B|Le5~v*(C5KY z&+~nB$Er}Y$k8s*AUxXqBAeBZTF)D8Fc7we@MN7%?q>y@ z*Vkx2Un44SVLm5hum&k*N9^rm0wcPM{8rCxAb669pPh8M@B}v-!N{++G)s3(wuF#Uaarn4XSC6HG_ncMkm-7JL2_SQx8Od^mg&#niA&DcBv6s9ZKaWAknIlZ9wCT>24buXB&Pq$iMNg z1mH9qASM`)$1nJ0u-G#kW3MLipw`E&e6~>bDVa>qqHw(S%U=X`psICta{B_q)4j=B zvu0=Xk|-RosPLq2#>AOBl3ibZkvEP@hHgy($_F6VeOnM%FRYdNH8(S$Fsnp!Ku2r> zi_nUc6NR;i5EfN#;$rrSSaGQb*k3A60eN6v<_&Rh87w~_6=7EGw5)CK$1C7X^sW#E z<~Mn90db8vpP&J%gu4r9lzGyoXGd{eW2`sT?ip+VSh0wyWbJ+dSZa9NL)H&3u4NKf z-D7OxfMPSr$_rW{cl-DW_iUN~2!-W46K_u_Ac7uB_Zw+ zl4G7htB7s6ZOMhTxxfHYv2+a6|9sDGJA(lm$=-0EkXnC7H(#wAfQ6ha6^*3;(ulIZ z&W~r8>$Y&jafYbBKdrK*>(~u>9#yW1R4swnD$0vl4!{m28qvA-NYj_qbs%5Sezr_I zDJzaYr>P;h*pL|TLCUS3PFujCo=l%i3yvUdM5NZdP{xRarN*JI_ zv#y-LfO{7M^S^53&{eRHv;Wa8u31$g(pvp^<%MfS4weQcX*EksUNP6RhY|R~Yt2tB}i=temgE0kfAHV&4lz;AKYnsrl_>k6;ly!L&ofRdwv*DsDWUAO9% zib25wFGkvuml20+>az);uTY}G7m9^F4)vaV^cdxQ+wyKX2qXU32fhmUzG5;ifjO>I z6o-Epy%<6b&B)*8tY6?y|6oUL8>o{ZAR>%_q?{ma(9=A88cgNevW{i;NHy~S!;xU= 
z=J#W?7_I|LZWc40K7s6ogC=0fb2E=MX1=@?aFc3O&tWFOkmUw@Tyob^9ETf}T9M*{ zh`3uu9k9bWZ&9V{4t%_?9BE?Ar00NTrqmxaT)An7^ zjBSAu|45oJ%SM;wI0brcw}8NDRv|AWL+HM-R7$yY0MVc+-Pv}??1>v0o+BAhxK$#U z`}_IVq_GTh?3uJ5HkqbBjH*RFttz;9<5?hSHUKR2YXHKOD6#G%{dC>3>COsbL(54I zaj<-)3OiKKCL{y}QObd;yFgSTjpUcXp zMtybw_rd#onZ=q2iLjEWl7}+9m2%@>7YvXLT*{pKHghJ41zDqEZIElZigkAiw;yJ^hoiTaB6|Rt=-i zG0;+J&N~3z+_DH5UT0gqNCaTSi~LLP^rm0#L@0zU!RaqcCnGr=J&rie{lJ-m7F+|I z_t^o(gg*}foy1rR!nZ=(Py8fnfv9Q@sJ->Df_8R{nm9jh2L|G3^t9i#OE<~;*jYOm z8(7YBiLY)8?Lj7^5843kP_?5k5>FbFRaq25LoMAx+ZH;ki)U1Fj4xl-Qb9a@k(u&L zc^7nj*>%8=su4>Ej}XXYI#tN4=kYCHDda|YQ-p!u4$j6y%WPl?2PKTbClpj1!W->PFar(h`{YHUV^szM!#j$u}fmG zZczZ2&vIllpgDQP6E*tg_0lqR`$)vSlr1!Bj8b(_J$B|eJAlko|8&92B39DfbFaS=SKKS@!ayS0Q8bA0>N}I-k(r05u7njv?sf_6PIPS!-ZNb-ec&?I+@hgX6AM_Ba(c`9 zCWR5S+Z_OIFrG|<80ME;*0?65Q>7-5pOJoahr<*D2 zq+yUDOA-X}Kq_QRo#kchs&77#N(=s1_u+508V#Pl>UU~1>mlk8d+K)ge*N73KBA`{AWhRgcpQCJ<-wy_L?DBcZ+)c6U=Np-*t7JQaLQkKCpVX4{--xtFtNVE)q zg!YGRWh0)(!N^9%x^ecZ)*+t|8S%cs1|tUBu3w0(GXXJWksvDrAWoD&v5ff4Gjnx> z^eK}Z^mkKI1!xUVo|s||5d`O^OuZmj!jWaBPuUiyB*t*+bVs^70kHV8p$j~y zQ}LsFJR6mp$z6aOLrnr7X+ZW3Pt4i&c_~HCAG)P^`o{CL7H$1BHO@lSw5?R^Am?kw z_T}D({y1zn%S->-ic4cO?FP@AkAPD}+j% zR01N>I&!XSPw@~3(_4--cM8f!J7*`Jh_n)TX}z}4#^Mg$-GO}HehR3V42r-4Dc80M zqOqr*ts-BmraDUv+BKW!-q9)Lw<&RgIULNWNW%{S7Lw_CfG)Kz<(&b0pD-{W5GsG% zd%*K#RvM^UIW6Zf)x>>Vwy?YJ1rHD+sWOieEF3H>9Qk;?EEBpa9mhp1Sd#dOn3+Z4 z=#Q7)Dk_T4lLPR1&hKJ%71Ev*tE;HlUzkN$9#!I1b~5!+G9DeT z2Wm(DcDZes8>}<_V8+d2lmAZ+EcMkEIAC9O^8&;yza-Nk}R?SH(YEN=Wgfw$u{kAG^$@>$xUp- zkZzuWF0sRcOEEHOV7lbbBe<4h<<|p{T5ee|d^Yl8JP-$^+8c2na=tAGp|O1UaD7Io zP3aX=m_X5%GZ#wq)|Dlwd&Jyn+n60y+&JNzz%7JpXE?_fQ|_TwixT23*D97tl<){DbI-aC&aIi~ z>pFC9IYsm*TkUH))9jwTi5x0doc98+9GF%rl#0iyb&qoWGL&eXP*_MiRw==Z-p)}b zT!f<>em@EOIbK|bjH+Gq5se3nhZv&mi(eni{0dR|(Sx7+iEdpd#s)r~%)vNNHai*6 zuptkqWJdTnqdEq>QP+26aLWZfY7`jUnq-Wn#syxNCz}L57Utl+7v!n4$U730u_T$U zzcgE}Oh8|sL2p8mLz0!_LBJNq`oZwwfomV_Ia3zeHAC*zXrv)mUgrnh_m?$$10>V) z*6(MFo{9eaHVsQfaen= 
zszg65VcsV0;KxxFS1eWGm)Etm=uSY9ubs{Rg)7A)@@2+ZP3^u1IzgEr4Rb4Yf&54r zS>uGpjY?UL8lU{3+WR5K@hGq1JeWj~7&%DLL@zJo09u{jwmZH}eS`e;4hfT81vcDf z%wa3}t7(X94u-R^sK_ujSR#7cAGHC-lEmhu<-HOOn9stf9MQck(P4QMvf%SI2(wf4 zQl5pEn>E(uo%U)FVy{YZoSZ!JasV+HK$IJ&SrUjdiFBtp=_VJ;=AV0 z`}Y0!>x%p0^9P-}em_aueom6|&ZKP`_Mbt*6t2us?Bh42`=M1! z0q3@M*ytRET(s_OTZnoLbO9_qD0$!V;8nKvW$?sM1!(hbY+9LoMpUkI6FoTe_So|2&vTHs=|cm2f8=DUx{g=}x4+Zv01ZFeM5ag*PdP{h zJ=5(hzP}$en>PgaJ#>a~EY9i$E*sF+3K>LJ_H{>xrcbFJvp(@+aXJ#GjGweOU5hR; z9jVkG@z+s*jr5*wvWYLw)%C>ALzy@tYZ}=i&df3N!_D<3AU;e-kzP`IRZ}Ki&QP6>wCs z&nJdoZ<~MZ`rplL&SD?2@d5MSzvurx1i0M)dscGKec(0*3e0K#t9$=!R?)q>{of7! zZvHc Date: Thu, 2 Apr 2026 23:56:29 -0700 Subject: [PATCH 09/43] ETP+CG: fix '--overlap-grad-reduce --overlap-param-gather' --- .../module/extended_tensor_parallelism.py | 46 ++++++------------- .../pytorch/module/grouped_linear.py | 2 +- .../pytorch/module/layernorm_linear.py | 2 +- transformer_engine/pytorch/module/linear.py | 2 +- 4 files changed, 18 insertions(+), 34 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 0cc1979524..8969ccfa6d 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -226,9 +226,6 @@ def __init__(self, x, *args, **kwargs): self.rs_state = ETPWeightState.NONE self.wgrad_rs = None self._wgrad_rs_handle = None - self.fuse_wgrad_accumulation = False - self._grad_accum_node = None - self._grad_accum_hook = None self.rs_event = torch.cuda.Event(external=True) self._rs_ticket = None # Padding @@ -600,16 +597,12 @@ def get_wgrad_tensor(self): requires_grad=False, ) - def register_grad_accum_hook(self, grad_accum_node, hook): - self._grad_accum_node = grad_accum_node - self._grad_accum_hook = hook - @staticmethod - def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): - """Post-RS per-param processing: strip padding, 
accumulate, call hook. + def _finalize_wgrad(param, wgrad_rs): + """Post-RS per-param processing: strip padding, accumulate into main_grad. - Returns None for fused (grad already accumulated into main_grad), - or the stripped wgrad for unfused (to be returned to autograd). + Accumulates the reduce-scattered wgrad into main_grad and returns + a dummy zero grad to autograd (DDP backward post hook is not used for ETP params). """ param._set_rs_state(ETPWeightState.NONE) @@ -618,19 +611,13 @@ def _finalize_wgrad(param, wgrad_rs, fuse_wgrad_accumulation): if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - # 2. Accumulate - if fuse_wgrad_accumulation: - param.main_grad.add_(wgrad_rs) - if hasattr(param, "grad_added_to_main_grad"): - param.grad_added_to_main_grad = True - dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - - # 3. Post hook - if param._grad_accum_hook is not None: - param.grad = dummy_grad if fuse_wgrad_accumulation else wgrad_rs - param._grad_accum_hook(param) + # 2. Accumulation: accumulate wgrad into main_grad + param.main_grad.add_(wgrad_rs) + if hasattr(param, "grad_added_to_main_grad"): + param.grad_added_to_main_grad = True + dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + return dummy_grad - return None if fuse_wgrad_accumulation else wgrad_rs def _wait_reduce_scatter(self): # assert self._wgrad_rs_handle is not None or is_graph_capturing() @@ -695,7 +682,7 @@ def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): return outputs, cm if async_op else None - def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): + def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. Accepts a single tensor (non-routed) or list of tensors (routed experts). 
@@ -710,15 +697,13 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) - self.fuse_wgrad_accumulation = fuse_wgrad_accumulation _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - result = [self._finalize_wgrad(p, g, fuse_wgrad_accumulation) - for p, g in zip(weights, sharded)] + result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] ret = result if batched else result[0] # Wait for last reduce scatter if it was async @@ -728,17 +713,16 @@ def wgrad_reduce_scatter(self, wgrad, fuse_wgrad_accumulation, nvtx_label=None): self.next_w.rs_event.wait() cache = get_global_ETP_cache() - fuse_wgrad_accumulation = self.next_w._weights[0].fuse_wgrad_accumulation for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket), fuse_wgrad_accumulation) + self._finalize_wgrad(w, cache.get(w._rs_ticket)) cache.release(w._rs_ticket) return ret - def batched_wgrad_reduce_scatter(self, wgrad_list, fuse_wgrad_accumulation, nvtx_label=None): + def batched_wgrad_reduce_scatter(self, wgrad_list, nvtx_label=None): """Batched version of wgrad_reduce_scatter.""" assert self.is_routed_expert and self.weight_list is not None - return self.wgrad_reduce_scatter(wgrad_list, fuse_wgrad_accumulation, nvtx_label=nvtx_label) + return self.wgrad_reduce_scatter(wgrad_list, nvtx_label=nvtx_label) def __torch_function__(self, func, types, args=(), kwargs=None): if kwargs is None: diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index fe81196f4a..d11947be0e 
100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -551,7 +551,7 @@ def handle_custom_ddp_from_mcore(weight, wgrad): return wgrad if ctx.etp_size > 1: - wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list, ctx.fuse_wgrad_accumulation) + wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list) elif ctx.fuse_wgrad_accumulation: wgrad_list = [ handle_custom_ddp_from_mcore(weight, wgrad) diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index 824030c3d0..6f9ac58460 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -938,7 +938,7 @@ def wgrad_gemm( wgrad, grad_bias_ = wgrad_gemm(ln_out_total, grad_output) if ctx.etp_size > 1: - wgrad = origin_weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + wgrad = origin_weight.wgrad_reduce_scatter(wgrad) # Update grad bias if needed if grad_bias is None: diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py index 4c4789461c..145c253cdc 100644 --- a/transformer_engine/pytorch/module/linear.py +++ b/transformer_engine/pytorch/module/linear.py @@ -963,7 +963,7 @@ def wgrad_gemm( dgrad_work = None if ctx.etp_size > 1: - wgrad = weight.wgrad_reduce_scatter(wgrad, ctx.fuse_wgrad_accumulation) + wgrad = weight.wgrad_reduce_scatter(wgrad) if ctx.requires_wgrad: # Handle custom DDP from mcore. if ( From f4b5a5e57c2a99bc5aa7ed0d93e8c374ba14436a Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 3 Apr 2026 08:16:00 -0700 Subject: [PATCH 10/43] ETP padding: fix stripping for rowwise_scale_inv and columnwise_scale_inv. 
--- .../module/extended_tensor_parallelism.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 8969ccfa6d..83b6dc98c0 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -17,7 +17,8 @@ ) from ..quantized_tensor import QuantizedTensor from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage -from ..utils import nvtx_range_pop, nvtx_range_push +from ..utils import nvtx_range_pop, nvtx_range_push, round_up_to_nearest_multiple +from ..constants import NVFP4_BLOCK_SCALING_SIZE, MXFP8_BLOCK_SCALING_SIZE from .base import get_dummy_wgrad import transformer_engine_torch as tex @@ -354,6 +355,38 @@ def _strip_padding(self, tensor): metadata["columnwise_data"] = metadata["columnwise_data"][ :-self.pad_length ] + M = self._unsharded_shape[0] + if isinstance(tensor, NVFP4TensorStorage): + # NVFP4 scale_inv shapes (see NVFP4Quantizer.get_scale_shape): + # rowwise_scale_inv: [round_up(M, 128), round_up(ceil(K/16), 4)] + # columnwise_scale_inv: [round_up(K, 128), round_up(ceil(M/16), 4)] + # ETP shards M (dim 0 of the weight), so strip to the unpadded sizes. 
+ if metadata.get("rowwise_scale_inv") is not None: + m_rows = round_up_to_nearest_multiple(M, 128) + metadata["rowwise_scale_inv"] = metadata["rowwise_scale_inv"][:m_rows] + if metadata.get("columnwise_scale_inv") is not None: + m_tiles = round_up_to_nearest_multiple( + math.ceil(M / NVFP4_BLOCK_SCALING_SIZE), 4 + ) + metadata["columnwise_scale_inv"] = ( + metadata["columnwise_scale_inv"][:, :m_tiles].contiguous() + ) + else: + # MXFP8 scale_inv shapes (see MXFP8Quantizer.get_scale_shape): + # rowwise_scale_inv: [round_up(M, 128), round_up(K//32, 4)] + # columnwise_scale_inv: [round_up(M//32, 4), round_up(K, 128)] + # ETP shards M (dim 0 of the weight), so strip to the unpadded sizes. + if metadata.get("rowwise_scale_inv") is not None: + m_rows = round_up_to_nearest_multiple(M, 128) + metadata["rowwise_scale_inv"] = metadata["rowwise_scale_inv"][:m_rows] + if metadata.get("columnwise_scale_inv") is not None: + m_tiles = round_up_to_nearest_multiple( + M // MXFP8_BLOCK_SCALING_SIZE, 4 + ) + metadata["columnwise_scale_inv"] = ( + metadata["columnwise_scale_inv"][:m_tiles] + ) + return type(tensor)(**metadata, shape=self._unsharded_shape, dtype=torch.bfloat16) else: return tensor[:-self.pad_length] From 89d8ae77d041e3ee14b912e0a5cb93a0c1295011 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 3 Apr 2026 08:17:53 -0700 Subject: [PATCH 11/43] ETP: add UTs and doc update. --- docs/README_ETP.md | 30 +- tests/pytorch/distributed/test_etp.py | 1411 +++++++++++++++++++++++++ 2 files changed, 1426 insertions(+), 15 deletions(-) create mode 100644 tests/pytorch/distributed/test_etp.py diff --git a/docs/README_ETP.md b/docs/README_ETP.md index d32321d6df..a0ef835614 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -28,7 +28,8 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. 
| **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | | **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | | **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | -| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. | +| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. And kernels on sidestreams are no longer required to synchronize at graph breaks + | | **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | ### Implementation Mechanisms @@ -40,7 +41,7 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | | **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | | **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | -| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; padding stripped and grad accumulated in `_finalize_wgrad()` | +| **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; `_finalize_wgrad()` resets `rs_state`, strips padding, and accumulates the result into 
`param.main_grad`, returning a dummy-zero grad to autograd | --- @@ -129,7 +130,8 @@ BACKWARD (wgrad path) └─ _reduce_scatter pads: [F, K] → [padded_F, K] (re-pads before RS so chunks are equal) └─ reduce-scatter → [shard_size, K] per rank └─ _finalize_wgrad → _strip_padding → [real_rows, K] - └─ stored as param.grad (matches local shard shape) + └─ accumulated into param.main_grad (matches local shard shape) + └─ dummy zero grad returned to autograd ``` #### Wrapping call @@ -159,7 +161,7 @@ NONE ───────────────────────── The `DATA_READY_SYNC` state is used for on-demand synchronous gathers (cold start or when prefetch is disabled). `DATA_READY` is used after an async gather completes via `handle.wait()`. -Invalid transitions are guarded by `_set_state()` / `_set_rs_state()`. +Transition validation is implemented but currently commented out in `_set_state()` / `_set_rs_state()` (guarded by `ETP_CONFIG.check_param_states`); both methods unconditionally set the new state in the current implementation. ### Class Diagram @@ -204,8 +206,6 @@ classDiagram +Event rs_event +ETPShardHandle _prefetch_handle +ETPShardHandle _wgrad_rs_handle - +callable _grad_accum_node - +callable _grad_accum_hook +Quantizer _quantizer +bool did_cast_to_low_precision +QuantizedTensor quantized @@ -219,7 +219,6 @@ classDiagram +ProcessGroup group +List weight_list +Tensor wgrad_rs - +bool fuse_wgrad_accumulation +str _debug_name +setup(weight_quantizer) +_weights() List @@ -240,10 +239,9 @@ classDiagram +all_gather_and_prefetch(fwd, ...) 
Tensor +all_gather_and_prefetch_bwd() Tensor +get_wgrad_tensor() Tensor - +register_grad_accum_hook(node, hook) - +_finalize_wgrad(param, wgrad_rs, fuse) [staticmethod] + +_finalize_wgrad(param, wgrad_rs) [staticmethod] +_reduce_scatter(wgrads, async_op) tuple - +wgrad_reduce_scatter(wgrad, fuse) + +wgrad_reduce_scatter(wgrad) } %% ── Async all-gather handles ───────────────────────────────────────────── @@ -479,10 +477,10 @@ Step by step for layer `i` backward: 1. **`all_gather_and_prefetch_bwd()`**: Gather `W_i` for the dgrad GEMM; simultaneously async-prefetch `W_i-1` (the `prev_w`) for the next backward step. Uses `skip_weight_cast=True` — no re-quantization needed since scales are already valid from the forward pass. 2. **dgrad GEMM**: Compute `dX = dY × W_i` using the gathered weight. 3. **wgrad GEMM**: Compute `dW = X^T × dY` using the saved input activation. -4. **`wgrad_reduce_scatter(wgrad, fuse_wgrad_accumulation)`**: +4. **`wgrad_reduce_scatter(wgrad)`**: - **Non-last layer** (`prev_w is not None`): Launch async reduce-scatter; store `ETPShardHandle` in `self._wgrad_rs_handle`. Return `None` to backward (gradient deferred). - - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — strips padding, accumulates into `main_grad`, fires grad-accum hook. -5. **Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to strip padding, accumulate, and fire the hook. The RS buffer is returned to the pool via `cache.release()`. + - **Last layer** (`prev_w is None`): Synchronous reduce-scatter. Call `_finalize_wgrad()` immediately — resets `rs_state` to `NONE`, strips padding (if last rank is padded), accumulates into `param.main_grad`, returns a dummy-zero grad tensor to autograd. +5. 
**Deferred finish**: At the start of each subsequent layer's `wgrad_reduce_scatter`, `self.next_w._wait_reduce_scatter()` is called, which waits on `next_w._wgrad_rs_handle` and records a CUDA event. Then `_finalize_wgrad()` is called for `next_w` to reset `rs_state`, strip padding, and accumulate into `main_grad`. The RS buffer is returned to the pool via `cache.release()`. Here is an example of ETP schedule diagram for Hybried Nemotron6 in bf16 as an example (ETP+EP with partial CGs): @@ -576,8 +574,10 @@ A buffer lives in **exactly one** place at a time: ``` reserve() → slot created, buf=None (no allocation yet) -get(ticket) → buf allocated lazily from pool or fresh; stored in slot -release(ticket) → buf returned to pool; slot.buf set to None +get(ticket) → buf allocated lazily from pool or fresh; stored in slot.buf (idempotent) +release(ticket) → buf appended to pool (slot.buf stays set; production code calls release + only after get() has emptied the pool for that key, so the duplicate-check + in release() is never triggered) clear() → all slot.buf = None, pool cleared (tickets stay valid; next get() re-allocates) ``` diff --git a/tests/pytorch/distributed/test_etp.py b/tests/pytorch/distributed/test_etp.py new file mode 100644 index 0000000000..39afe69b00 --- /dev/null +++ b/tests/pytorch/distributed/test_etp.py @@ -0,0 +1,1411 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""Unit tests for Extended Tensor Parallelism (ETP). + +Test groups +----------- +1. TestETPWeightState – state-machine transitions (single-process) +2. TestETPWeightCache – coat-check buffer pool (single-process) +3. TestETPSharding – wrap_module_params_etp: shard content + padding (multi-GPU) +4. TestWrapModuleParams – wrap_module_params_etp: param replacement + weight_list (multi-GPU) +5. TestLinearETP – Linear forward/backward numerical correctness (multi-GPU) +6. 
TestLayerNormLinearETP – LayerNormLinear forward/backward smoke test (multi-GPU) +7. TestGroupedLinearETP – GroupedLinear forward/backward smoke test (multi-GPU) +8. TestETPPrefetchChain – linked-list next_w/prev_w wiring (multi-GPU) +9. TestETPWgradRS – wgrad reduce-scatter shape + multi-layer deferred path (multi-GPU) +10. TestETPMicrobatches – output consistency across microbatches (multi-GPU) +11. TestNVFP4LinearETP – Linear + NVFP4 recipe: quantized shard setup, fwd/bwd (multi-GPU) +12. TestNVFP4GroupedLinearETP – GroupedLinear + NVFP4 recipe: coalesced AG + fwd/bwd (multi-GPU) +13. TestMXFP8LinearETP – Linear + MXFP8 recipe: quantized shard setup, fwd/bwd, padding (multi-GPU) +14. TestETPConfig – update_config: valid/invalid keys (single-process) +15. TestETPShardedParamProperties – shape computations, get_padded_shard, _strip_padding (single-process) +16. TestETPCacheKey – _get_cache_key: expert vs non-expert, fwd vs bwd (single-process) +17. TestETPCacheRelease – reserve/get/release pool semantics (single-process) +18. TestTagETPParamsWithNames – _debug_name population on ETPShardedParam (single-process) +19. TestFinalizeWgrad – _finalize_wgrad: accumulate, strip padding, rs_state reset (single-process) +20. TestETPGroupSizeOne – wrap_module_params_etp no-op when etp_group.size()==1 (single-process) +21. TestETPPrefetchDisabled – weight_prefetch=False: single-pass forward still works (multi-GPU) +22. TestFuseWgradAccumulation – fuse_wgrad_accumulation=True: wgrad→main_grad (multi-GPU) +23. TestETPGradAccumHook – main_grad updated after reduce-scatter backward (multi-GPU) + +Multi-GPU tests use torch.multiprocessing.spawn and are skipped when fewer +than the required CUDA devices are available. 
+""" + +import os +import socket + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn + +import transformer_engine.pytorch as te +import transformer_engine.pytorch.module.extended_tensor_parallelism as etp_module +from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + ETPShardedParam, + ETPWeightCache, + ETPWeightState, + wrap_module_params_etp, +) +from transformer_engine.pytorch import fp8_autocast, is_nvfp4_available, is_mxfp8_available +from transformer_engine.pytorch.quantization import FP8GlobalStateManager +from transformer_engine.pytorch.quantized_tensor import QuantizedTensor +from transformer_engine.common.recipe import NVFP4BlockScaling + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(autouse=True) +def reset_fp8_state(): + yield + FP8GlobalStateManager.reset() + + +@pytest.fixture(autouse=True) +def reset_etp_globals(): + """Reset all ETP mutable class/module-level state between tests.""" + yield + ETPShardedParam._first_weight_flag = True + ETPShardedParam._last_weight = None + ETPShardedParam._pending_rs_weight = None + ETPShardedParam._link_node_count = 0 + ETPShardedParam._link_table_buffer = [] + ETPShardedParam._link_table_flushed = False + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def _dist_init(rank: int, world_size: int, port: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + +def 
_run_distributed(fn, world_size: int, *args) -> None: + """Spawn `world_size` processes each running fn(rank, world_size, port, *args).""" + port = _free_port() + mp.spawn(fn, args=(world_size, port) + args, nprocs=world_size, join=True) + + +def _requires_multi_gpu(n: int = 4): + if torch.cuda.device_count() < n: + pytest.skip(f"Requires at least {n} CUDA devices") + + +def _requires_nvfp4(): + if not is_nvfp4_available(): + pytest.skip("NVFP4 not available (requires compute capability >= 10.0)") + + +# --------------------------------------------------------------------------- +# 1. ETPWeightState – state-machine transition tests +# --------------------------------------------------------------------------- + +class TestETPWeightState: + + @staticmethod + def _param(): + return ETPShardedParam(torch.zeros(4, 4)) + + def test_full_cycle(self): + p = self._param() + assert p.state == ETPWeightState.NONE + p._set_state(ETPWeightState.ASYNC_WAIT) + p._set_state(ETPWeightState.DATA_READY) + p._set_state(ETPWeightState.NONE) + assert p.state == ETPWeightState.NONE + + def test_sync_path_cycle(self): + """NONE → DATA_READY_SYNC → NONE (sync all-gather path).""" + p = self._param() + p._set_state(ETPWeightState.DATA_READY_SYNC) + p._set_state(ETPWeightState.NONE) + assert p.state == ETPWeightState.NONE + + def test_rs_state_full_cycle(self): + """RS state machine: NONE → ASYNC_WAIT → DATA_READY → NONE.""" + p = self._param() + assert p.rs_state == ETPWeightState.NONE + p._set_rs_state(ETPWeightState.ASYNC_WAIT) + p._set_rs_state(ETPWeightState.DATA_READY) + p._set_rs_state(ETPWeightState.NONE) + assert p.rs_state == ETPWeightState.NONE + + +# --------------------------------------------------------------------------- +# 2. 
# ---------------------------------------------------------------------------

class TestETPWeightCache:
    """Coat-check buffer pool: reserve() hands out tickets, get() lazily
    allocates, release() returns buffers to a shape-keyed pool."""

    class _FakeGroup:
        # Stand-in for a torch.distributed group; only size()/rank() are read.
        def __init__(self, size=2):
            self._size = size
        def size(self):
            return self._size
        def rank(self):
            return 0

    def _param(self, shape=(8, 4), etp_size=2):
        # Build an ETPShardedParam with just the attributes the cache reads.
        p = ETPShardedParam(torch.zeros(*shape))
        p.group = self._FakeGroup(etp_size)
        p.expert_idx = None
        p.pad_length = 0
        p.is_padded_last_rank = False
        p._quantizer = None
        return p

    def test_reserve_returns_ticket(self):
        cache = ETPWeightCache()
        p = self._param()
        ticket = cache.reserve(p, torch.bfloat16, fwd=True)
        assert isinstance(ticket, int)

    def test_reserve_get_roundtrip(self):
        cache = ETPWeightCache()
        p = self._param()
        ticket = cache.reserve(p, torch.bfloat16, fwd=True)
        buf = cache.get(ticket)
        assert buf is not None
        # get() returns same buf on second call (buf cached in slot)
        buf2 = cache.get(ticket)
        assert buf2 is buf

    def test_buffer_reused_after_release(self):
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        cache.release(t1)
        # Reserve a new ticket, buf should come from pool
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1 is buf2, "Buffer should be reused from pool after release"
        cache.release(t2)

    def test_two_simultaneous_reserves_are_distinct(self):
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1 is not buf2, "Concurrent reserves must get distinct buffers"

    def test_tickets_are_unique(self):
        """Each reserve() call returns a new unique ticket."""
        cache = ETPWeightCache()
        p = self._param()
        t1 = cache.reserve(p, torch.bfloat16, fwd=True)
        t2 = cache.reserve(p, torch.bfloat16, fwd=True)
        assert t1 != t2, "Each reserve() must return a unique ticket"

    def test_invalid_ticket_raises(self):
        cache = ETPWeightCache()
        with pytest.raises(KeyError):
            cache.get(9999)

    def test_different_shapes_use_distinct_pool_slots(self):
        cache = ETPWeightCache()
        p1 = self._param(shape=(8, 4))
        p2 = self._param(shape=(16, 4))
        t1 = cache.reserve(p1, torch.bfloat16, fwd=True)
        buf1 = cache.get(t1)
        t2 = cache.reserve(p2, torch.bfloat16, fwd=True)
        buf2 = cache.get(t2)
        assert buf1.shape != buf2.shape
        cache.release(t1); cache.release(t2)

    def test_fwd_bwd_tickets_are_distinct(self):
        """fwd=True and fwd=False reserves always receive distinct ticket IDs."""
        cache = ETPWeightCache()
        p = self._param()
        t_fwd = cache.reserve(p, torch.bfloat16, fwd=True)
        t_bwd = cache.reserve(p, torch.bfloat16, fwd=False)
        assert t_fwd != t_bwd


# ---------------------------------------------------------------------------
# 3. ETP weight sharding: shard content and alignment padding
# ---------------------------------------------------------------------------

def _worker_sharding_aligned(rank, world_size, port):
    """Shard a K-aligned weight and check each rank owns its contiguous rows."""
    _dist_init(rank, world_size, port)
    K, M = world_size * 32, 16  # K divisible by 16*world_size → no padding
    full_weight = torch.arange(K * M, dtype=torch.float32).reshape(K, M).cuda()
    dist.broadcast(full_weight, src=0)

    etp_group = dist.new_group(list(range(world_size)))
    mod = nn.Module()
    mod.weight = nn.Parameter(full_weight.clone(), requires_grad=False)
    wrap_module_params_etp(mod, ['weight'], etp_group)
    shard = mod.weight

    rows_per_rank = K // world_size
    assert shard.shape == (rows_per_rank, M), f"rank {rank}: unexpected shape {shard.shape}"
    assert shard.pad_length == 0
    expected = full_weight[rank * rows_per_rank : (rank + 1) * rows_per_rank]
    assert torch.allclose(shard.data, expected), f"rank {rank}: shard content mismatch"
    dist.destroy_process_group()


def _worker_sharding_padding(rank, world_size, port):
    """Shard an unaligned weight; the last rank's shard carries the padding metadata."""
    _dist_init(rank, world_size, port)
    alignment = 16 * world_size
    K = alignment - 1  # deliberately unaligned
    M = 16
    full_weight = torch.ones(K, M, dtype=torch.float32).cuda()
    dist.broadcast(full_weight, src=0)

    etp_group = dist.new_group(list(range(world_size)))
    mod = nn.Module()
    mod.weight = nn.Parameter(full_weight.clone(), requires_grad=False)
    wrap_module_params_etp(mod, ['weight'], etp_group)
    shard = mod.weight

    padded_K = alignment
    rows_per_rank = padded_K // world_size

    if rank == world_size - 1:
        assert shard.is_padded_last_rank
        assert shard.pad_length > 0
        # The shard tensor holds only the real rows; get_padded_shard() appends zero rows.
        padded = shard.get_padded_shard()
        assert padded.shape[0] == rows_per_rank, \
            f"rank {rank}: expected padded shard {rows_per_rank} rows, got {padded.shape[0]}"
        n_real = K - rank * rows_per_rank
        assert torch.all(padded[n_real:] == 0), "Padding rows must be zero"
    else:
        assert not shard.is_padded_last_rank
        assert shard.shape[0] == rows_per_rank, \
            f"rank {rank}: expected {rows_per_rank} rows, got {shard.shape[0]}"

    dist.destroy_process_group()


class TestETPSharding:
    def test_aligned_shard_content(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_sharding_aligned, 4)

    def test_unaligned_shard_padding(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_sharding_padding, 4)


# ---------------------------------------------------------------------------
# 4. wrap_module_params_etp: param replacement and GroupedLinear weight_list
# ---------------------------------------------------------------------------

def _worker_linear_param_replaced(rank, world_size, port):
    """Verify te.Linear built with an etp_group exposes its weight as an ETP shard."""
    _dist_init(rank, world_size, port)
    in_features, out_features = 64, 128
    etp_group = dist.new_group(list(range(world_size)))
    layer = te.Linear(
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=torch.bfloat16,
        device="cuda",
        etp_group=etp_group,
    )
    shard = layer.weight
    expected_shape = (out_features // world_size, in_features)
    assert isinstance(shard, ETPShardedParam), "weight must be ETPShardedParam"
    assert shard.shape == expected_shape, f"unexpected shard shape {shard.shape}"
    assert shard.group is etp_group
    dist.destroy_process_group()


def _worker_grouped_weight_list(rank, world_size, port):
    """Verify te.GroupedLinear's first weight carries the per-expert weight_list."""
    _dist_init(rank, world_size, port)
    num_gemms = 3
    in_features = 32
    out_features = 64
    etp_group = dist.new_group(list(range(world_size)))
    layer = te.GroupedLinear(
        num_gemms=num_gemms,
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=torch.bfloat16,
        device="cuda",
        etp_group=etp_group,
    )
    first_weight = layer.weight0
    assert isinstance(first_weight, ETPShardedParam)
    assert first_weight.weight_list is not None
    assert len(first_weight.weight_list) == num_gemms
    expert_indices = [entry.expert_idx for entry in first_weight.weight_list]
    assert expert_indices == list(range(num_gemms))
    dist.destroy_process_group()


class TestWrapModuleParams:
    """wrap_module_params_etp: param replacement and GroupedLinear weight_list."""

    def test_linear_weight_replaced(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_linear_param_replaced, 4)

    def test_grouped_linear_weight_list(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_grouped_weight_list, 4)


# ---------------------------------------------------------------------------
# 5. Linear forward/backward numerical correctness
# ---------------------------------------------------------------------------

def _worker_linear_correctness(rank, world_size, port):
    """ETP output == (all-gathered weight) @ input, and dX matches."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    batch, in_f, out_f = 16, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )

    # Reconstruct full weight from shards (all-gather)
    shard = layer.weight.data.clone()
    all_shards = [torch.zeros_like(shard) for _ in range(world_size)]
    dist.all_gather(all_shards, shard, group=etp_group)
    full_weight = torch.cat(all_shards, dim=0).float()[:out_f]  # strip any padding

    # Shared input across ranks
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    inp_etp = inp.clone().requires_grad_(True)
    inp_ref = inp.clone().requires_grad_(True)

    # ETP forward
    out_etp = layer(inp_etp, is_first_microbatch=True)

    # Reference forward (fp32 matmul, then cast back to bf16 for comparison)
    out_ref = inp_ref.float() @ full_weight.T
    out_ref = out_ref.to(dtype)

    assert out_etp.shape == out_ref.shape, f"Shape mismatch {out_etp.shape} vs {out_ref.shape}"
    # Loose tolerances: bf16 accumulation order differs between the two paths.
    assert torch.allclose(out_etp.float(), out_ref.float(), atol=0.1, rtol=0.1), (
        f"Output mismatch max_diff={(out_etp.float()-out_ref.float()).abs().max():.4f}"
    )

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")

    # Backward: compare input gradient
    grad_out = torch.randn_like(out_etp)
    dist.broadcast(grad_out, src=0)
    out_etp.backward(grad_out)
    out_ref.backward(grad_out.float())

    assert inp_etp.grad is not None
    assert torch.allclose(inp_etp.grad.float(), inp_ref.grad.float(), atol=0.1, rtol=0.1), (
        f"dX mismatch max_diff={(inp_etp.grad.float()-inp_ref.grad.float()).abs().max():.4f}"
    )
    dist.destroy_process_group()


class TestLinearETP:
    def test_forward_backward_correctness(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_linear_correctness, 4)


# ---------------------------------------------------------------------------
# 6. LayerNormLinear forward/backward smoke test
# ---------------------------------------------------------------------------

def _worker_layernorm_linear(rank, world_size, port):
    """Smoke test: LayerNormLinear with an ETP-sharded weight runs fwd+bwd."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    seq, batch, in_f, out_f = 4, 2, 64, 128
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.LayerNormLinear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    assert isinstance(layer.weight, ETPShardedParam)

    inp = torch.randn(seq, batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    out = layer(inp, is_first_microbatch=True)
    assert out.shape == (seq, batch, out_f), f"unexpected output shape {out.shape}"

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestLayerNormLinearETP:
    def test_forward_backward(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_layernorm_linear, 4)


# ---------------------------------------------------------------------------
# 7. GroupedLinear forward/backward smoke test
# ---------------------------------------------------------------------------

def _worker_grouped_linear(rank, world_size, port, num_gemms):
    """Smoke-test GroupedLinear forward/backward under ETP for `num_gemms` experts."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_features = 32
    out_features = 64
    total_tokens = num_gemms * 4
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.GroupedLinear(
        num_gemms=num_gemms,
        in_features=in_features,
        out_features=out_features,
        bias=False,
        params_dtype=dtype,
        device="cuda",
        etp_group=etp_group,
    )
    assert isinstance(layer.weight0, ETPShardedParam)

    # Equal token split per expert; fold any rounding remainder into the last split.
    m_splits = [total_tokens // num_gemms for _ in range(num_gemms)]
    m_splits[-1] += total_tokens - sum(m_splits)

    inp = torch.randn(total_tokens, in_features, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    out = layer(inp, m_splits=m_splits, is_first_microbatch=True)
    assert out.shape == (total_tokens, out_features), f"unexpected output shape {out.shape}"

    # _finalize_wgrad accumulates into main_grad; allocate one per expert weight.
    for idx in range(num_gemms):
        weight = getattr(layer, f"weight{idx}")
        weight.main_grad = torch.zeros(weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestGroupedLinearETP:
    """GroupedLinear forward/backward smoke test under ETP."""

    @pytest.mark.parametrize("num_gemms", [2, 4])
    def test_forward_backward(self, num_gemms):
        _requires_multi_gpu(4)
        _run_distributed(_worker_grouped_linear, 4, num_gemms)


# ---------------------------------------------------------------------------
# 8. Prefetch chain: next_w / prev_w wiring after first forward pass
# ---------------------------------------------------------------------------

def _worker_chain_wired(rank, world_size, port):
    """Two ETP layers: after the first forward pass the weights form a doubly
    linked list (next_w/prev_w) used for async prefetch scheduling."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(4, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First forward pass builds the linked list
    l0(inp, is_first_microbatch=True)
    l1(inp, is_first_microbatch=True)

    w0, w1 = l0.weight, l1.weight
    assert w0.next_w is w1, "w0.next_w should point to w1"
    assert w1.prev_w is w0, "w1.prev_w should point back to w0"
    assert w1.next_w is None
    assert w0.prev_w is None
    dist.destroy_process_group()


def _worker_chain_async_prefetch(rank, world_size, port):
    """On the second forward pass, w1 should be in DATA_READY before its forward runs."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(4, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First pass builds chain, second pass uses async prefetch
    for _ in range(2):
        out = l0(inp, is_first_microbatch=True)
        l1(inp, is_first_microbatch=True)
        assert torch.isfinite(out).all(), "Non-finite output on second pass"
    dist.destroy_process_group()


class TestETPPrefetchChain:
    def test_chain_wired_after_first_pass(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_chain_wired, 4)

    def test_async_prefetch_second_pass(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_chain_async_prefetch, 4)


# ---------------------------------------------------------------------------
# 9. Wgrad reduce-scatter: shape and deferred async path
# ---------------------------------------------------------------------------

def _worker_wgrad_shape(rank, world_size, port):
    """After backward, weight.grad shape must match the local shard shape."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
        fuse_wgrad_accumulation=False,
    )
    inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # main_grad is still required: _finalize_wgrad accumulates into it even
    # when fuse_wgrad_accumulation=False.
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    layer(inp, is_first_microbatch=True).sum().backward()

    w = layer.weight
    if w.grad is not None:
        assert w.grad.shape == w.shape, \
            f"wgrad shape {w.grad.shape} != shard shape {w.shape}"
    dist.destroy_process_group()


def _worker_multilayer_deferred_rs(rank, world_size, port):
    """Two-layer ETP: async RS deferred for layer0 (non-last), sync for layer1 (last in bwd)."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    in_f, out_f = 32, 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)
    l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False,
                   params_dtype=dtype, device="cuda", etp_group=etp_group)

    inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # _finalize_wgrad always accumulates into main_grad; allocate before backward.
    l0.weight.main_grad = torch.zeros(l0.weight.shape, dtype=dtype, device="cuda")
    l1.weight.main_grad = torch.zeros(l1.weight.shape, dtype=dtype, device="cuda")

    out = l0(inp, is_first_microbatch=True)
    l1(inp, is_first_microbatch=True)
    out.sum().backward()

    # Both weights' main_grad should have been updated
    for lyr in [l0, l1]:
        w = lyr.weight
        assert w.main_grad is not None, f"No main_grad on {lyr.__class__.__name__}.weight"
    dist.destroy_process_group()


class TestETPWgradRS:
    def test_wgrad_shape_matches_shard(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_wgrad_shape, 4)

    def test_multilayer_deferred_rs(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_multilayer_deferred_rs, 4)


# ---------------------------------------------------------------------------
# 10. Multiple microbatches: output must be consistent when weight unchanged
# ---------------------------------------------------------------------------

def _worker_microbatches(rank, world_size, port):
    """Same input, same weight → identical outputs across microbatches."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    batch, in_f, out_f = 8, 64, 128
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
    dist.broadcast(inp, src=0)

    # First microbatch
    out1 = layer(inp, is_first_microbatch=True).detach().clone()

    # Second microbatch with same weight (skip_weight_cast=True path)
    out2 = layer(inp, is_first_microbatch=False).detach()

    assert torch.allclose(out1, out2), \
        f"Microbatch outputs differ; max_diff={(out1-out2).abs().max():.6f}"
    dist.destroy_process_group()


class TestETPMicrobatches:
    def test_consistent_across_microbatches(self):
        _requires_multi_gpu(4)
        _run_distributed(_worker_microbatches, 4)


# ---------------------------------------------------------------------------
# 11. NVFP4 + ETP: Linear forward/backward, quantized shard setup
# ---------------------------------------------------------------------------

def _worker_nvfp4_linear(rank, world_size, port):
    """Verify that ETP Linear correctly quantizes, all-gathers, and computes with NVFP4."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # batch=32: NVFP4 wgrad GEMM (K=batch) requires K divisible by 32
    batch, in_f, out_f = 32, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # Forward under NVFP4 recipe – triggers setup() and NVFP4 quantization
    recipe = NVFP4BlockScaling()
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out = layer(inp, is_first_microbatch=True)

    # After the first forward pass setup() must have created a quantized shard
    w = layer.weight
    assert w.quantized is not None, "NVFP4 quantized shard must be set after setup()"
    assert isinstance(w.quantized, QuantizedTensor), \
        f"weight.quantized should be QuantizedTensor, got {type(w.quantized)}"

    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 ETP output has non-finite values"

    # Second microbatch reuses cached quantized weight (skip_weight_cast path)
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out2 = layer(inp.detach(), is_first_microbatch=False)
    assert torch.isfinite(out2).all(), "NVFP4 ETP second-microbatch output has non-finite values"

    dist.destroy_process_group()


def _worker_nvfp4_linear_unaligned(rank, world_size, port):
    """Verify NVFP4 ETP when out_features is not aligned to 16*world_size (padding path).

    out_f is chosen to be divisible by 8 (satisfies NVFP4 GEMM alignment) but not by
    16*world_size (so padding is needed). The last ETP rank receives a shard that is
    zero-padded to reach the shard_size boundary. After all-gather, _strip_padding
    removes the padded rows from the gathered weight before the GEMM, so the output
    has the original out_f columns.
    """
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    alignment = 16 * world_size  # 64 for world_size=4
    # Choose out_f divisible by 8 (NVFP4 GEMM constraint) but not by 64 (ETP alignment).
    # With out_f=56: pad_length=8, shard_size=16, last rank gets 8 rows padded to 16.
    out_f = alignment - 8  # 56 for world_size=4
    in_f = 64
    batch = 32
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=NVFP4BlockScaling()):
        out = layer(inp, is_first_microbatch=True)

    # After _strip_padding removes the padded rows, output has out_f (not padded) cols.
    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 ETP (unaligned) output has non-finite values"
    dist.destroy_process_group()


class TestNVFP4LinearETP:
    def test_forward_backward(self):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_linear, 4)

    def test_forward_unaligned_padding(self):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_linear_unaligned, 4)


# ---------------------------------------------------------------------------
# 12. NVFP4 + ETP: GroupedLinear forward/backward (coalesced batched all-gather)
# ---------------------------------------------------------------------------

def _worker_nvfp4_grouped_linear(rank, world_size, port, num_gemms):
    """Verify NVFP4 ETP with GroupedLinear (uses grouped_gather_along_first_dim)."""
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # NVFP4 split_quantize constraints: in_f % 128 == 0, tokens_per_expert % 64 == 0
    # (Hadamard transform requirement), and K=tokens_per_expert % 32 == 0 for wgrad.
    in_f, out_f, total_tokens = 128, 256, num_gemms * 64
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.GroupedLinear(
        num_gemms=num_gemms, in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    assert isinstance(layer.weight0, ETPShardedParam)

    m_splits = [total_tokens // num_gemms] * num_gemms
    m_splits[-1] += total_tokens - sum(m_splits)

    inp = torch.randn(total_tokens, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=NVFP4BlockScaling()):
        out = layer(inp, m_splits=m_splits, is_first_microbatch=True)

    assert out.shape == (total_tokens, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "NVFP4 GroupedLinear ETP output has non-finite values"

    # All expert weight shards should be quantized after setup()
    for i in range(num_gemms):
        name = f"weight{i}"
        w = getattr(layer, name)
        assert isinstance(w, ETPShardedParam)
        assert w.quantized is not None, f"{name}.quantized not set after NVFP4 setup()"
        assert isinstance(w.quantized, QuantizedTensor), \
            f"{name}.quantized should be QuantizedTensor, got {type(w.quantized)}"

    # _finalize_wgrad accumulates into main_grad; allocate one per expert weight.
    for i in range(num_gemms):
        w = getattr(layer, f"weight{i}")
        w.main_grad = torch.zeros(w.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None and inp.grad.shape == inp.shape
    dist.destroy_process_group()


class TestNVFP4GroupedLinearETP:
    @pytest.mark.parametrize("num_gemms", [2, 4])
    def test_forward_backward(self, num_gemms):
        _requires_nvfp4()
        _requires_multi_gpu(4)
        _run_distributed(_worker_nvfp4_grouped_linear, 4, num_gemms)


# ---------------------------------------------------------------------------
# 13. MXFP8 + ETP: Linear forward/backward, quantized shard setup
# ---------------------------------------------------------------------------

def _worker_mxfp8_linear(rank, world_size, port):
    """Verify that ETP Linear correctly quantizes, all-gathers, and computes with MXFP8."""
    # Imported locally so the module stays importable when MXFP8 is unavailable.
    from transformer_engine.common.recipe import MXFP8BlockScaling
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # batch=32: MXFP8 wgrad GEMM (K=batch) requires K divisible by MXFP8_BLOCK_SCALING_SIZE=32
    batch, in_f, out_f = 32, 64, 128  # out_f % (16*world_size)==0 → no padding
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    # Forward under MXFP8 recipe – triggers setup() and MXFP8 quantization
    recipe = MXFP8BlockScaling()
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out = layer(inp, is_first_microbatch=True)

    # After the first forward pass setup() must have created a quantized shard
    w = layer.weight
    assert w.quantized is not None, "MXFP8 quantized shard must be set after setup()"
    assert isinstance(w.quantized, QuantizedTensor), \
        f"weight.quantized should be QuantizedTensor, got {type(w.quantized)}"

    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "MXFP8 ETP output has non-finite values"

    # Backward should complete without error
    layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda")
    out.sum().backward()
    assert inp.grad is not None
    assert inp.grad.shape == inp.shape

    # Second microbatch reuses cached quantized weight (skip_weight_cast path)
    with fp8_autocast(enabled=True, fp8_recipe=recipe):
        out2 = layer(inp.detach(), is_first_microbatch=False)
    assert torch.isfinite(out2).all(), "MXFP8 ETP second-microbatch output has non-finite values"

    dist.destroy_process_group()


def _worker_mxfp8_linear_unaligned(rank, world_size, port):
    """Verify MXFP8 ETP when out_features is not aligned to 16*world_size (padding path).

    MXFP8 requires tensor dims divisible by 32, so shard_size (= M_padded / world_size)
    must be a multiple of 32. With world_size=4 this requires M_padded % 128 == 0.
    out_f=120 gives M_padded=128, shard_size=32 (32 % 32 == 0). The last rank has
    24 real rows zero-padded to 32. After all-gather, _strip_padding removes the padded
    rows before the GEMM, so the output has the original out_f columns.
    """
    from transformer_engine.common.recipe import MXFP8BlockScaling
    _dist_init(rank, world_size, port)
    torch.manual_seed(0)
    # out_f=120: M_padded=128, shard_size=32, last rank has 24 rows padded to 32.
    # 120 is divisible by 8 (GEMM constraint), not by 64 (ETP alignment → padding needed).
    out_f = 120
    in_f = 64
    batch = 32
    dtype = torch.bfloat16
    etp_group = dist.new_group(list(range(world_size)))

    layer = te.Linear(
        in_features=in_f, out_features=out_f,
        bias=False, params_dtype=dtype,
        device="cuda", etp_group=etp_group,
    )
    inp = torch.randn(batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
    dist.broadcast(inp, src=0)

    with fp8_autocast(enabled=True, fp8_recipe=MXFP8BlockScaling()):
        out = layer(inp, is_first_microbatch=True)

    # After _strip_padding removes the padded rows, output has out_f (not padded) cols.
    assert out.shape == (batch, out_f), f"unexpected output shape {out.shape}"
    assert torch.isfinite(out).all(), "MXFP8 ETP (unaligned) output has non-finite values"
    dist.destroy_process_group()


def _requires_mxfp8():
    available, reason = is_mxfp8_available(return_reason=True)
    if not available:
        pytest.skip(f"MXFP8 not available: {reason}")


class TestMXFP8LinearETP:
    def test_forward_backward(self):
        _requires_mxfp8()
        _requires_multi_gpu(4)
        _run_distributed(_worker_mxfp8_linear, 4)

    def test_forward_unaligned_padding(self):
        _requires_mxfp8()
        _requires_multi_gpu(4)
        _run_distributed(_worker_mxfp8_linear_unaligned, 4)


# ---------------------------------------------------------------------------
# 14. ETPConfig / update_config
# ---------------------------------------------------------------------------

class TestETPConfig:
    """update_config: valid keys round-trip; unknown keys raise ValueError."""

    def test_update_pad_for_alignment(self):
        original = etp_module.ETP_CONFIG.pad_for_alignment
        try:
            etp_module.update_config(pad_for_alignment=8)
            assert etp_module.ETP_CONFIG.pad_for_alignment == 8
        finally:
            # Restore the module-level config so other tests are unaffected.
            etp_module.update_config(pad_for_alignment=original)

    def test_update_weight_prefetch(self):
        original = etp_module.ETP_CONFIG.weight_prefetch
        try:
            etp_module.update_config(weight_prefetch=False)
            assert etp_module.ETP_CONFIG.weight_prefetch is False
        finally:
            etp_module.update_config(weight_prefetch=original)

    def test_invalid_key_raises(self):
        with pytest.raises(ValueError, match="Unknown ETP config option"):
            etp_module.update_config(nonexistent_key=123)


# ---------------------------------------------------------------------------
# 15. ETPShardedParam properties – shape computations and padding
ETPShardedParam properties – shape computations and padding +# --------------------------------------------------------------------------- + +class TestETPShardedParamProperties: + + class _FakeGroup: + def __init__(self, size=4, rank=0): + self._size = size + self._rank = rank + def size(self): return self._size + def rank(self): return self._rank + + def _make_param(self, shape, pad_length=0, group_size=4, group_rank=0, + is_padded_last_rank=False): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup(size=group_size, rank=group_rank) + p.pad_length = pad_length + p.is_padded_last_rank = is_padded_last_rank + p.expert_idx = None + return p + + # --- _unsharded_shape_padded --- + + def test_unsharded_shape_padded_no_padding(self): + # shape=(8, 4), group_size=4 → 8*4=32 rows, no padding + p = self._make_param((8, 4), pad_length=0, group_size=4, group_rank=2) + assert p._unsharded_shape_padded == (32, 4) + + def test_unsharded_shape_padded_last_rank_with_padding(self): + # shard has 15 real rows, pad_length=1, last rank → (15+1)*4=64 + p = self._make_param((15, 32), pad_length=1, group_size=4, group_rank=3, + is_padded_last_rank=True) + assert p._unsharded_shape_padded == (64, 32) + + def test_unsharded_shape_padded_non_last_rank_with_padding(self): + # Non-last rank: pad_length metadata set but shape just multiplied + p = self._make_param((16, 32), pad_length=1, group_size=4, group_rank=0, + is_padded_last_rank=False) + assert p._unsharded_shape_padded == (64, 32) + + # --- _unsharded_shape --- + + def test_unsharded_shape_no_padding(self): + p = self._make_param((8, 4), pad_length=0, group_size=4, group_rank=0) + assert p._unsharded_shape == (32, 4) + + def test_unsharded_shape_strips_padding(self): + # padded = 64, strip 1 → 63 + p = self._make_param((15, 32), pad_length=1, group_size=4, group_rank=3, + is_padded_last_rank=True) + assert p._unsharded_shape == (63, 32) + + # --- get_padded_shard --- + + def 
test_get_padded_shard_identity_when_no_padding(self): + p = self._make_param((6, 4), pad_length=0) + result = p.get_padded_shard() + assert result is p # identity – no copy needed + + def test_get_padded_shard_identity_non_last_rank(self): + # pad_length > 0 but not the padded last rank → no padding added + p = self._make_param((16, 4), pad_length=1, group_size=4, group_rank=0, + is_padded_last_rank=False) + result = p.get_padded_shard() + assert result is p + + def test_get_padded_shard_appends_zero_rows(self): + p = self._make_param((6, 4), pad_length=2, group_size=4, group_rank=3, + is_padded_last_rank=True) + padded = p.get_padded_shard() + assert padded.shape == (8, 4), f"Expected (8,4), got {padded.shape}" + assert torch.all(padded[6:] == 0), "Padding rows must be zero" + + # --- _strip_padding --- + + def test_strip_padding_identity_no_padding(self): + p = self._make_param((8, 4), pad_length=0) + t = torch.randn(32, 4) + assert p._strip_padding(t) is t + + def test_strip_padding_plain_tensor(self): + # Gathered weight [32, 4] with pad_length=1 → strip 1 row → [31, 4] + p = self._make_param((7, 4), pad_length=1, group_size=4, group_rank=0) + t = torch.randn(32, 4) + result = p._strip_padding(t) + assert result.shape == (31, 4) + assert torch.equal(result, t[:-1]) + + def test_strip_padding_multi_row(self): + # pad_length=4 strips 4 rows + p = self._make_param((12, 8), pad_length=4, group_size=4, group_rank=0) + t = torch.ones(64, 8) + result = p._strip_padding(t) + assert result.shape == (60, 8) + + +# --------------------------------------------------------------------------- +# 16. 
_get_cache_key – expert vs non-expert, fwd vs bwd +# --------------------------------------------------------------------------- + +class TestETPCacheKey: + + class _FakeGroup: + def size(self): return 4 + def rank(self): return 0 + + def _param(self, shape=(16, 32), expert_idx=None): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup() + p.expert_idx = expert_idx + p.pad_length = 0 + p.is_padded_last_rank = False + return p + + def test_non_expert_key_same_for_fwd_bwd(self): + """Non-routed params produce the same cache key for fwd and bwd.""" + p = self._param(expert_idx=None) + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) == \ + p._get_cache_key(torch.bfloat16, fwd=False, reduce_scatter=False) + + def test_expert_key_differs_fwd_bwd(self): + """For quantized (non-torch.dtype) recipes, expert fwd vs bwd keys differ.""" + p = self._param(expert_idx=0) + # _get_cache_key differentiates fwd/bwd only for non-torch.dtype objects + # (e.g. quantized recipe dtype descriptors). Use a mock to trigger that path. 
+ mock_dtype = "fp8" + assert p._get_cache_key(mock_dtype, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(mock_dtype, fwd=False, reduce_scatter=False) + + def test_different_expert_idx_different_keys(self): + """Two experts with same shape but different indices get distinct keys.""" + p0 = self._param(expert_idx=0) + p1 = self._param(expert_idx=1) + assert p0._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p1._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) + + def test_same_expert_idx_same_key(self): + """Same-shaped experts with the same idx share a cache key (cross-layer buffer reuse).""" + p_l0 = self._param(expert_idx=0) + p_l1 = self._param(expert_idx=0) + assert p_l0._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) == \ + p_l1._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) + + def test_different_dtypes_different_keys(self): + p = self._param() + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(torch.float32, fwd=True, reduce_scatter=False) + + def test_rs_key_differs_from_ag_key(self): + """reduce_scatter=True key must differ from reduce_scatter=False key.""" + p = self._param() + assert p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=False) != \ + p._get_cache_key(torch.bfloat16, fwd=True, reduce_scatter=True) + + +# --------------------------------------------------------------------------- +# 17. 
ETPWeightCache.take() deferred vs get() immediate pool return +# --------------------------------------------------------------------------- + +class TestETPCacheRelease: + """Tests for ETPWeightCache reserve/get/release semantics.""" + + class _FakeGroup: + def size(self): return 2 + def rank(self): return 0 + + def _param(self, shape=(8, 4)): + p = ETPShardedParam(torch.zeros(*shape)) + p.group = self._FakeGroup() + p.expert_idx = None + p.pad_length = 0 + p.is_padded_last_rank = False + p._quantizer = None + return p + + def test_release_returns_buffer_to_pool(self): + """release() puts the buffer back so the next reserve+get reuses it.""" + cache = ETPWeightCache() + p = self._param() + t1 = cache.reserve(p, torch.bfloat16, fwd=True) + buf1 = cache.get(t1) + cache.release(t1) + # New ticket should pop buf1 from pool + t2 = cache.reserve(p, torch.bfloat16, fwd=True) + buf2 = cache.get(t2) + assert buf2 is buf1, "Buffer should be reused after release()" + cache.release(t2) + + def test_without_release_pool_stays_empty(self): + """Without release(), subsequent reserves allocate fresh buffers.""" + cache = ETPWeightCache() + p = self._param() + t1 = cache.reserve(p, torch.bfloat16, fwd=True) + buf1 = cache.get(t1) + # Do NOT release t1 — pool stays empty + t2 = cache.reserve(p, torch.bfloat16, fwd=True) + buf2 = cache.get(t2) + assert buf2 is not buf1, "Without release, a fresh buffer must be allocated" + + def test_get_same_ticket_returns_same_buf(self): + """get() is idempotent — calling it twice returns the same buffer.""" + cache = ETPWeightCache() + p = self._param() + t = cache.reserve(p, torch.bfloat16, fwd=True) + buf_a = cache.get(t) + buf_b = cache.get(t) + assert buf_a is buf_b + cache.release(t) + + def test_release_invalid_ticket_raises(self): + cache = ETPWeightCache() + with pytest.raises(KeyError): + cache.release(9999) + + +# --------------------------------------------------------------------------- +# 18. 
tag_etp_params_with_names – _debug_name population +# --------------------------------------------------------------------------- + +class TestTagETPParamsWithNames: + + def test_debug_name_populated_for_etp_param(self): + """ETPShardedParam._debug_name is set to the dotted parameter path.""" + class _FakeGroup: + def size(self): return 1 + def rank(self): return 0 + + model = nn.Linear(4, 8, bias=False) + w = ETPShardedParam(torch.randn(8, 4)) + w.group = _FakeGroup() + model._parameters['weight'] = w + + etp_module.tag_etp_params_with_names(model) + assert w._debug_name == 'weight', \ + f"Expected 'weight', got '{w._debug_name}'" + + def test_nested_module_debug_name(self): + """Nested module produces a dotted debug name.""" + class _FakeGroup: + def size(self): return 1 + def rank(self): return 0 + + outer = nn.Sequential(nn.Linear(4, 8, bias=False)) + w = ETPShardedParam(torch.randn(8, 4)) + w.group = _FakeGroup() + outer._modules['0']._parameters['weight'] = w + + etp_module.tag_etp_params_with_names(outer) + assert w._debug_name == '0.weight', \ + f"Expected '0.weight', got '{w._debug_name}'" + + def test_non_etp_params_are_skipped(self): + """Plain nn.Parameter instances are silently ignored.""" + model = nn.Linear(4, 8) + etp_module.tag_etp_params_with_names(model) # must not raise + + +# --------------------------------------------------------------------------- +# 19. _finalize_wgrad – strip padding, fuse accumulation, hook invocation +# --------------------------------------------------------------------------- + +class TestFinalizeWgrad: + """Tests for ETPShardedParam._finalize_wgrad(param, wgrad_rs). + + Current behaviour: always accumulates wgrad_rs into param.main_grad, + strips padding when is_padded_last_rank=True, resets rs_state to NONE, + and returns a dummy-zero grad tensor with the same shape as main_grad. 
+ """ + + class _FakeGroup: + def size(self): return 2 + def rank(self): return 0 + + def _param(self, shape=(8, 4), pad_length=0, is_padded_last_rank=False, device="cuda"): + p = ETPShardedParam(torch.zeros(*shape, device=device)) + p.group = self._FakeGroup() + p.pad_length = pad_length + p.is_padded_last_rank = is_padded_last_rank + p.main_grad = torch.zeros(*shape, device=device) + return p + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_accumulates_into_main_grad(self): + p = self._param() + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert torch.all(p.main_grad == 1), "main_grad should equal wgrad after accumulation" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_returns_dummy_zero_grad(self): + p = self._param() + wgrad = torch.ones(8, 4, device="cuda") + result = ETPShardedParam._finalize_wgrad(p, wgrad) + assert result.shape == p.shape, "dummy grad shape must match shard shape" + assert torch.all(result == 0), "dummy grad must be zeroes" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_strips_padding_for_padded_rank(self): + # Shard has 7 real rows, pad_length=1, is_padded_last_rank=True. + # RS output has 8 rows (7 real + 1 pad); strip to 7. 
+ p = self._param(shape=(7, 4), pad_length=1, is_padded_last_rank=True) + # main_grad must match the real shard shape (7 rows) + p.main_grad = torch.zeros(7, 4, device="cuda") + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert torch.all(p.main_grad == 1), "main_grad (7 rows) should be fully updated" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_rs_state_reset_to_none(self): + p = self._param() + p._set_rs_state(ETPWeightState.DATA_READY_SYNC) + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert p.rs_state == ETPWeightState.NONE, "rs_state should be reset to NONE" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") + def test_grad_added_to_main_grad_flag(self): + p = self._param() + p.grad_added_to_main_grad = False + wgrad = torch.ones(8, 4, device="cuda") + ETPShardedParam._finalize_wgrad(p, wgrad) + assert p.grad_added_to_main_grad is True + + +# --------------------------------------------------------------------------- +# 20. wrap_module_params_etp is a no-op when etp_group.size() == 1 +# --------------------------------------------------------------------------- + +class TestETPGroupSizeOne: + + class _SingletonGroup: + def size(self): return 1 + def rank(self): return 0 + + def test_no_sharding_when_etp_size_one(self): + """wrap_module_params_etp must be a no-op for a singleton ETP group.""" + mod = nn.Linear(32, 64, bias=False) + original_weight = mod.weight + wrap_module_params_etp(mod, ['weight'], self._SingletonGroup()) + assert mod.weight is original_weight, \ + "etp_group.size()==1 should leave parameters unchanged" + assert not isinstance(mod.weight, ETPShardedParam) + + +# --------------------------------------------------------------------------- +# 21. 
weight_prefetch=False: forward still produces correct output +# --------------------------------------------------------------------------- + +def _worker_prefetch_disabled(rank, world_size, port): + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 64 + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + etp_module.update_config(weight_prefetch=False) + try: + l0 = te.Linear(in_features=in_f, out_features=out_f, bias=False, + params_dtype=dtype, device="cuda", etp_group=etp_group) + l1 = te.Linear(in_features=in_f, out_features=out_f, bias=False, + params_dtype=dtype, device="cuda", etp_group=etp_group) + + inp = torch.randn(4, in_f, dtype=dtype, device="cuda") + dist.broadcast(inp, src=0) + + # Single forward pass: builds chain and verifies output is correct + out = l0(inp, is_first_microbatch=True) + l1(inp, is_first_microbatch=True) + + # Chain should still be wired even with prefetch disabled + assert l0.weight.next_w is l1.weight + assert torch.isfinite(out).all(), "Non-finite output with prefetch disabled" + finally: + etp_module.update_config(weight_prefetch=True) + dist.destroy_process_group() + + +class TestETPPrefetchDisabled: + def test_forward_works_without_prefetch(self): + _requires_multi_gpu(4) + _run_distributed(_worker_prefetch_disabled, 4) + + +# --------------------------------------------------------------------------- +# 22. 
fuse_wgrad_accumulation=True: wgrad is accumulated into main_grad +# --------------------------------------------------------------------------- + +def _worker_fuse_wgrad(rank, world_size, port): + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 128 # out_f % (16*world_size)==0, no padding + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + layer = te.Linear( + in_features=in_f, out_features=out_f, + bias=False, params_dtype=dtype, + device="cuda", etp_group=etp_group, + fuse_wgrad_accumulation=True, + ) + + # Allocate main_grad on the local shard shape + w = layer.weight + w.main_grad = torch.zeros(w.shape, dtype=dtype, device="cuda") + + inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True) + dist.broadcast(inp, src=0) + + layer(inp, is_first_microbatch=True).sum().backward() + + # With fused accumulation, wgrad was added into main_grad + assert torch.any(w.main_grad != 0), \ + "main_grad should have been updated by fused wgrad accumulation" + dist.destroy_process_group() + + +class TestFuseWgradAccumulation: + def test_wgrad_accumulated_into_main_grad(self): + _requires_multi_gpu(4) + _run_distributed(_worker_fuse_wgrad, 4) + + +# --------------------------------------------------------------------------- +# 23. _grad_accum_hook is called after reduce-scatter +# --------------------------------------------------------------------------- + +def _worker_main_grad_updated_after_bwd(rank, world_size, port): + """After backward, _finalize_wgrad must have accumulated wgrad into main_grad.""" + _dist_init(rank, world_size, port) + torch.manual_seed(0) + in_f, out_f = 32, 64 + dtype = torch.bfloat16 + etp_group = dist.new_group(list(range(world_size))) + + layer = te.Linear( + in_features=in_f, out_features=out_f, + bias=False, params_dtype=dtype, + device="cuda", etp_group=etp_group, + ) + + # _finalize_wgrad always accumulates into main_grad; allocate before backward. 
+ layer.weight.main_grad = torch.zeros(layer.weight.shape, dtype=dtype, device="cuda") + + inp = torch.randn(8, in_f, dtype=dtype, device="cuda", requires_grad=True) + dist.broadcast(inp, src=0) + layer(inp, is_first_microbatch=True).sum().backward() + + assert torch.any(layer.weight.main_grad != 0), \ + "main_grad should have been updated by _finalize_wgrad after reduce-scatter" + dist.destroy_process_group() + + +class TestETPGradAccumHook: + def test_main_grad_updated_after_backward(self): + _requires_multi_gpu(4) + _run_distributed(_worker_main_grad_updated_after_bwd, 4) + + From 1d771ff3003068ebfb5d6f5da387fc6853065597 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 7 Apr 2026 00:38:55 -0700 Subject: [PATCH 12/43] import fix --- transformer_engine/pytorch/module/extended_tensor_parallelism.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 83b6dc98c0..07660fa540 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field +import math import re import torch from contextlib import nullcontext From fd55ede25e6d0b305cae8bca32677a0377487136 Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Wed, 8 Apr 2026 20:11:47 -0700 Subject: [PATCH 13/43] move ag init to first pass Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 07660fa540..dda92b1bac 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ 
b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -431,8 +431,7 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): if fwd: - if p._ag_ticket_fwd is None: - p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + # The fwd ag buffer is always initialized in 'all_gather_and_prefetch' out_buffers.append(cache.get(p._ag_ticket_fwd)) else: if p._ag_ticket_bwd is None: @@ -597,11 +596,6 @@ def all_gather_and_prefetch( for w in self._weights: w._set_state(ETPWeightState.NONE) - if self.prev_w is not None: - cache = get_global_ETP_cache() - for w in self._weights: - cache.release(w._ag_ticket_fwd) - # Lazy population of linked list: link previous weight to current weight cls = type(self) if not self.prefetch_initialized: @@ -609,6 +603,17 @@ def all_gather_and_prefetch( cls._buffer_link_table_row(cls._last_weight, self) cls._last_weight.next_w = self self.prev_w = cls._last_weight + + cache = get_global_ETP_cache() + + # Set the fwd ag buffer + quantizers = [w._quantizer for w in self._weights] + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, self._weights)] + for w, dt in zip(self._weights, dtypes): + w._ag_ticket_fwd = cache.reserve(w, dt, fwd=True) + cache.get(w._ag_ticket_fwd) + cache.release(w._ag_ticket_fwd) + self.prefetch_initialized = True elif not cls._link_table_flushed and cls._link_table_buffer: # Second forward pass: flush the complete table atomically to avoid interleaving From 62379f5083e964e7df0a1c96daeb1e250ac94b21 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 9 Apr 2026 16:31:28 -0700 Subject: [PATCH 14/43] ETP+CG: 2-chain(dense+expert, no cross link prefetching) + shared ag/rs streams --- docs/README_ETP.md | 72 ++++++++- .../module/extended_tensor_parallelism.py | 143 ++++++++++++------ 2 files changed, 163 insertions(+), 52 deletions(-) diff --git a/docs/README_ETP.md b/docs/README_ETP.md index 
a0ef835614..d6c0a367da 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -37,9 +37,9 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | Mechanism | Description | |---|---| | **Alignment padding** | Shards padded to `ETPConfig.pad_for_alignment × etp_size` rows at construction via `get_padded_shard()`; only last rank carries padding (`is_padded_last_rank`); padding stripped in `_strip_padding()` both post-gather (before GEMM) and post-reduce-scatter (before wgrad accumulation) | -| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity | +| **Fine-grained weight scheduling** | Each weight has its own `ETPWeightState` lifecycle and is scheduled independently via a doubly-linked list (`next_w`/`prev_w`), enabling per-weight AG/RS overlap at single-weight granularity. Two independent chains are maintained: one for dense params (mamba/attn/shared_expert) and one for expert params (grouped_fc1/grouped_fc2) | | **Separate AG and RS state** | All-gather state (`state`) and reduce-scatter state (`rs_state`) are tracked independently per param, allowing forward and backward async ops to proceed without interference | -| **Dedicated CUDA streams** | AG and RS run on separate global CUDA streams (`AG_STREAM`, `RS_STREAM`), decoupled from the default compute stream; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result | +| **Shared CUDA streams** | AG and RS run on shared CUDA streams (`get_ag_stream()`, `get_rs_stream()`) across all chains; completion is signaled back via per-param CUDA events (`ag_event`, `rs_event`) that the compute stream waits on before consuming the result. 
Streams must be shared because `ag_event` is recorded on the AG stream during CUDA graph capture; using a different stream at replay would cause `ag_event.wait()` to see a stale recording | | **Ticket-based buffer cache** | `ETPWeightCache` assigns persistent tickets via `reserve()`; buffers are lazily allocated on `get()` and returned to the pool on `release()`; `clear()` drops all buffers while keeping tickets valid for lazy re-allocation (used for CUDA Graph re-capture) | | **Wgrad reduce-scatter** | Async reduce-scatter of weight gradients, deferred to overlap with next layer's wgrad RS; `_finalize_wgrad()` resets `rs_state`, strips padding, and accumulates the result into `param.main_grad`, returning a dummy-zero grad to autograd | @@ -143,7 +143,7 @@ if etp_group is not None: del weight_tensor # free the temporary full-weight buffer ``` -For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. +For `GroupedLinear` (MoE), `wrap_module_params_etp` is called with `is_grouped=True`, which additionally sets `weight_list` on the first expert's `ETPShardedParam` so all experts' weights can be batched together in a single coalesced all-gather. It also sets `chain_id='expert'` so expert params join the expert prefetch chain (separate from the dense chain). 
### State Machine @@ -193,10 +193,7 @@ classDiagram <> $ _pending_rs_weight : ETPShardedParam $ _first_weight_flag : bool - $ _last_weight : ETPShardedParam - $ _link_node_count : int - $ _link_table_buffer : List[str] - $ _link_table_flushed : bool + $ _chain_state : Dict[str, dict] +ETPWeightState state +ETPWeightState rs_state +int _ag_ticket_fwd @@ -214,6 +211,7 @@ classDiagram +bool prefetch_initialized +ETPShardedParam next_w +ETPShardedParam prev_w + +str chain_id +bool is_routed_expert +int expert_idx +ProcessGroup group @@ -277,6 +275,7 @@ classDiagram +tuple key +ETPShardedParam param +dtype + +str chain_id +bool reduce_scatter +bool fwd +Tensor buf @@ -378,6 +377,65 @@ A further practical difference is that ETP is **quantization-aware**: shards are --- +## Two-Chain Architecture (Dense + Expert) + +ETP maintains **two independent prefetch chains** to cleanly separate dense and expert weight management: + +| Chain | Params | NCCL Group | CUDA Graph | +|-------|--------|-----------|------------| +| **Dense** (`chain_id='dense'`) | mamba, attention, shared expert | `PARAMETER_SHARDING_GROUP` | Captured in graphs | +| **Expert** (`chain_id='expert'`) | grouped_fc1, grouped_fc2 | `EXPERT_PARAMETER_SHARDING_GROUP` | Runs eagerly | + +Both chains share the same `ag_stream` / `rs_stream` (see "Shared Streams" below). + +### Why Two Chains Instead of One? + +The original design used a **single global chain** linking all ETP params: + +``` +Single chain (old): +CG(mamba.fc1 -> mamba.fc2) -> CG(shared_expert.fc1 -> shared_expert.fc2) -> EAGER(grouped_fc1 -> grouped_fc2) -> CG(next_mamba.fc1 -> ...) -> ... + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + crosses CG/eager boundary +``` + +This caused two problems: + +1. **Cross-chain prefetch crossing CG/eager boundary**: The single linked list linked dense params (captured in CUDA graphs) to expert params (running eagerly). 
The prefetch chain crossed the CG/eager boundary, causing the captured AG event sequence to include expert weight prefetches. At 64+ GPU IB scale, this interaction corrupted NCCL communicator progress tracking across graph replays and caused deadlocks. + +2. **Complex fencing**: Numerous `_drain_etp_side_streams()` fences were needed at every CG/eager boundary (forward expert compute entry, backward dispatch/combine, finalize_model_grads). These fences were fragile, hard to reason about, and didn't fully solve the 64-GPU hang. + +The two-chain design eliminates both problems: + +``` +Dense chain: CG(mamba.fc1 -> mamba.fc2) -> CG(shared_expert.fc1 -> shared_expert.fc2) -> CG(next_mamba.fc1 -> ...) -> ... +Expert chain: EAGER(grouped_fc1_L1 -> grouped_fc2_L1) -> EAGER(grouped_fc1_L2 -> grouped_fc2_L2) -> ... + (never crosses into CG, never uses PARAMETER_SHARDING_GROUP) +``` + +Each chain uses its own NCCL communicator and stays entirely within one execution mode (CG or eager). + +### Chain Construction + +Each chain builds its own doubly-linked list independently via per-chain state in `_chain_state`: + +``` +Dense chain: mamba.fc1 -> mamba.fc2 -> shared_expert.fc1 -> shared_expert.fc2 -> next_mamba.fc1 -> ... +Expert chain: grouped_fc1_layer1 -> grouped_fc2_layer1 -> grouped_fc1_layer2 -> ... +``` + +The `chain_id` is set automatically: `wrap_module_params_etp(..., is_grouped=True)` sets `chain_id='expert'`; all other params default to `chain_id='dense'`. + +### Shared Streams + +Both chains share the same `ag_stream` and `rs_stream`. Per-chain streams were considered but cause correctness issues: the `ag_event` CUDA event object is recorded on `ag_stream` during CUDA graph capture. If expert params used a different stream at replay time, `ag_event.wait()` would see a stale recording, producing Inf gradients. 
Shared streams avoid this while the chain-level isolation (no cross-chain `next_w`/`prev_w` links) provides the key benefit of preventing prefetch chains from crossing the CG/eager boundary. + +### Buffer Cache + +The single global `ETPWeightCache` serves both chains. Cache keys already include `expert_idx`, so dense and expert buffers never collide. `reallocate_to_mempool()` only migrates **dense-chain** buffers into the CUDA graph memory pool; expert-chain buffers remain in regular allocator memory. + +--- + ## Scalability ETP scales along two independent dimensions: diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index dda92b1bac..00ba434379 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -47,20 +47,32 @@ class ETPWeightState(Enum): # Global set of ETPShardedParam with in-flight async comms (AG or RS). _inflight_comm_params: set = set() -AG_STREAM = None -RS_STREAM = None - -def get_ag_stream(): - global AG_STREAM - if AG_STREAM is None: - AG_STREAM = torch.cuda.Stream() - return AG_STREAM - -def get_rs_stream(): - global RS_STREAM - if RS_STREAM is None: - RS_STREAM = torch.cuda.Stream() - return RS_STREAM +_AG_STREAMS: Dict[str, torch.cuda.Stream] = {} +_RS_STREAMS: Dict[str, torch.cuda.Stream] = {} + +def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: + # All chains share one AG stream. The ag_event CUDA event object is recorded on + # this stream during graph capture; using a different stream at replay would cause + # ag_event.wait() to see a stale recording, producing Inf gradients. + key = 'shared' + if key not in _AG_STREAMS: + _AG_STREAMS[key] = torch.cuda.Stream() + return _AG_STREAMS[key] + +def get_rs_stream(chain_id: str = 'dense') -> torch.cuda.Stream: + # All chains share one RS stream (same reason as AG stream). 
+ key = 'shared' + if key not in _RS_STREAMS: + _RS_STREAMS[key] = torch.cuda.Stream() + return _RS_STREAMS[key] + +def get_all_ag_streams() -> list: + """Return all AG streams that have been created.""" + return list(_AG_STREAMS.values()) + +def get_all_rs_streams() -> list: + """Return all RS streams that have been created.""" + return list(_RS_STREAMS.values()) @dataclass class ETPConfig: @@ -133,6 +145,7 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): if is_grouped: etp_shard.expert_idx = idx etp_shard.is_routed_expert = True + etp_shard.chain_id = 'expert' etp_shard.group = etp_group etp_shard.ps_size = etp_size # register the newly sharded param back to the module @@ -167,13 +180,22 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True - _last_weight = None - _link_node_count = 0 - _link_table_buffer: List[str] = [] - _link_table_flushed: bool = False + # Per-chain state: each chain_id ('dense', 'expert') has its own linked list. 
+ _chain_state: Dict[str, dict] = {} @classmethod - def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam") -> None: + def _get_chain_state(cls, chain_id: str) -> dict: + if chain_id not in cls._chain_state: + cls._chain_state[chain_id] = { + 'last_weight': None, + 'link_node_count': 0, + 'link_table_buffer': [], + 'link_table_flushed': False, + } + return cls._chain_state[chain_id] + + @classmethod + def _buffer_link_table_row(cls, prev: "ETPShardedParam", curr: "ETPShardedParam", chain: dict) -> None: """Buffer one row of the prefetch-link table (flushed atomically on the second forward pass).""" _W = 70 @@ -181,18 +203,20 @@ def _layer_id(name: str) -> str: m = re.search(r"\d+", name) return m.group() if m else "-" - cls._link_node_count += 1 - if cls._link_node_count == 1: - cls._link_table_buffer.append( + chain['link_node_count'] += 1 + if chain['link_node_count'] == 1: + chain_id = getattr(curr, 'chain_id', 'dense') + chain['link_table_buffer'].append( + f"\n[{chain_id} chain]" f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" f"\n{'-'*7}-+-{'-'*8}-+-{'-'*_W}-+-{'-'*_W}" ) # Seed weight (first ETP param) as row 0 - cls._link_table_buffer.append( + chain['link_table_buffer'].append( f"{'0':>7} | {_layer_id(prev._debug_name):>8} | {prev._debug_name:<{_W}} | -" ) - cls._link_table_buffer.append( - f"{cls._link_node_count:>7} | {_layer_id(curr._debug_name):>8} | " + chain['link_table_buffer'].append( + f"{chain['link_node_count']:>7} | {_layer_id(curr._debug_name):>8} | " f"{curr._debug_name:<{_W}} | {prev._debug_name}" ) @@ -219,6 +243,8 @@ def __init__(self, x, *args, **kwargs): self.prefetch_initialized = False self.next_w = None self.prev_w = None + # Chain identity: 'dense' for mamba/attn/shared_expert, 'expert' for grouped experts + self.chain_id = 'dense' # Grouped gemm self.is_routed_expert = False self.expert_idx = None @@ -480,7 +506,7 @@ def _wait_param_gather(self): # Since wait() may 
sychronize against a different stream than the current stream, # an event is recorded and waited on when the data is retrieved, which ensures the # AG always finishes before returning the unsharded param - with torch.cuda.stream(get_ag_stream()): + with torch.cuda.stream(get_ag_stream(self.chain_id)): if self._prefetch_handle is not None: self._prefetch_handle.wait() self._prefetch_handle = None @@ -597,12 +623,15 @@ def all_gather_and_prefetch( w._set_state(ETPWeightState.NONE) # Lazy population of linked list: link previous weight to current weight + # Uses per-chain state so dense and expert chains never cross-link. cls = type(self) + chain = cls._get_chain_state(self.chain_id) if not self.prefetch_initialized: - if cls._last_weight is not None and cls._last_weight.next_w is None: - cls._buffer_link_table_row(cls._last_weight, self) - cls._last_weight.next_w = self - self.prev_w = cls._last_weight + last_w = chain['last_weight'] + if last_w is not None and last_w.next_w is None: + cls._buffer_link_table_row(last_w, self, chain) + last_w.next_w = self + self.prev_w = last_w cache = get_global_ETP_cache() @@ -615,11 +644,11 @@ def all_gather_and_prefetch( cache.release(w._ag_ticket_fwd) self.prefetch_initialized = True - elif not cls._link_table_flushed and cls._link_table_buffer: + elif not chain['link_table_flushed'] and chain['link_table_buffer']: # Second forward pass: flush the complete table atomically to avoid interleaving - cls._link_table_flushed = True - print_rank_0("\n".join(cls._link_table_buffer) + "\n") - cls._last_weight = self + chain['link_table_flushed'] = True + print_rank_0("\n".join(chain['link_table_buffer']) + "\n") + chain['last_weight'] = self return result @@ -660,7 +689,7 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): # assert self._wgrad_rs_handle is not None or is_graph_capturing() - with torch.cuda.stream(get_rs_stream()): + with torch.cuda.stream(get_rs_stream(self.chain_id)): if self._wgrad_rs_handle is 
not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None @@ -798,6 +827,7 @@ class _TicketSlot: dtype: object # torch.dtype or tex.DType reduce_scatter: bool fwd: bool + chain_id: str = 'dense' # chain this slot belongs to buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() @@ -878,7 +908,8 @@ def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=Fal self._next_ticket += 1 self._slots[ticket] = _TicketSlot( - key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd + key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd, + chain_id=getattr(param, 'chain_id', 'dense'), ) return ticket @@ -909,17 +940,26 @@ def clear(self): self._total_bytes = 0 def reallocate_to_mempool(self, device, mempool): - """Re-allocate all ticket buffers into a CUDA graph memory pool. + """Re-allocate dense-chain ticket buffers into a CUDA graph memory pool. - Call BEFORE graph capture so every buffer lives in the capture pool - and no allocations are recorded inside the graph. + Call BEFORE graph capture so every dense-chain buffer lives in the capture + pool and no allocations are recorded inside the graph. Expert-chain buffers + are left in regular memory (expert compute runs eagerly, not in graphs). 
""" - # Clone the current memory pool buffers but into the passed in mempool + # Identify keys that belong to the dense chain + dense_keys = set() + for slot in self._slots.values(): + if slot.chain_id == 'dense': + dense_keys.add(slot.key) + + # Clone only dense-chain pool buffers into the passed in mempool self._total_bytes = 0 new_pool = defaultdict(list) torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) for key, buffers in self._pool.items(): + if key not in dense_keys: + continue new_buffers = [] for _ in range(len(buffers)): buf = self._allocate_buffer(*self.key_to_allocate_func[key]) @@ -927,17 +967,24 @@ def reallocate_to_mempool(self, device, mempool): new_pool[key] = new_buffers torch._C._cuda_endAllocateToPool(device, mempool) - # Map each buffer in the old pool to its corresponding new one + # Map each buffer in the old pool to its corresponding new one (dense only) old_to_new_buff = {} for key, old_pool in self._pool.items(): + if key not in dense_keys: + continue new = new_pool[key] for old_buf, new_buf in zip(old_pool, new): old_to_new_buff[old_buf] = new_buf - # Replace each slot's reference to its corresponding new one + + # Replace each dense slot's reference; keep expert slots unchanged for slot in self._slots.values(): - if slot.buf is not None: + if slot.chain_id == 'dense' and slot.buf is not None and slot.buf in old_to_new_buff: slot.buf = old_to_new_buff[slot.buf] + # Merge: dense keys get new buffers, expert keys keep old ones + for key, buffers in self._pool.items(): + if key not in dense_keys: + new_pool[key] = buffers self._pool = new_pool return @@ -955,10 +1002,16 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(): - """Wait on all in-flight ETP async communications (all-gathers + reduce-scatters). +def wait_async_comms(chain_id: str = None): + """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). 
+ + Args: + chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). + If None, drain all chains (backward compat). """ for param in list(_inflight_comm_params): + if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: + continue param._wait_param_gather() param._wait_reduce_scatter() From ba909cc543b4b3106307e1623d668b680b07fc86 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 9 Apr 2026 20:37:28 -0700 Subject: [PATCH 15/43] fix the case when ETP_Config.weight_prefetch is False. --- .../module/extended_tensor_parallelism.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 00ba434379..843d184043 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -554,14 +554,14 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): weight_total """ - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: result = self._get_prefetched_weight(False, skip_weight_cast=True) else: result = self._all_gather_weight_on_demand(False, skip_weight_cast=True) if ( - ETP_CONFIG.weight_prefetch - and self.prev_w is not None + ETP_CONFIG.weight_prefetch + and self.prev_w is not None and self.prev_w._need_weight_prefetch ): _, handle = self.prev_w._all_gather_weight( @@ -574,7 +574,7 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): for w in self._weights: w._set_state(ETPWeightState.NONE) - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: cache = get_global_ETP_cache() for w in self._weights: cache.release(w._ag_ticket_bwd) @@ -599,15 +599,15 @@ def all_gather_and_prefetch( Returns: weight_total """ - if self.prev_w is not None: + if ETP_CONFIG.weight_prefetch and self.prev_w is not 
None: result = self._get_prefetched_weight(True, skip_weight_cast, cast_noop_flag) else: result = self._all_gather_weight_on_demand(True, skip_weight_cast, cast_noop_flag) # Prefetch next weight if ( - ETP_CONFIG.weight_prefetch - and self.next_w is not None + ETP_CONFIG.weight_prefetch + and self.next_w is not None and self.next_w._need_weight_prefetch ): _, handle = self.next_w._all_gather_weight( @@ -776,7 +776,7 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Wait for last reduce scatter if it was async # Currently only support reduce scattering in reverse order - if self.next_w is not None: + if ETP_CONFIG.weight_prefetch and self.next_w is not None: self.next_w._wait_reduce_scatter() self.next_w.rs_event.wait() From fd65c96c88cdc3fdfdbb6ac20d703f1ce7825114 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 12 Apr 2026 22:17:42 -0700 Subject: [PATCH 16/43] ETP+emb/output layers: remove these two layers from the prefetch chain. --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 843d184043..a9689db253 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -644,11 +644,11 @@ def all_gather_and_prefetch( cache.release(w._ag_ticket_fwd) self.prefetch_initialized = True + chain['last_weight'] = self elif not chain['link_table_flushed'] and chain['link_table_buffer']: # Second forward pass: flush the complete table atomically to avoid interleaving chain['link_table_flushed'] = True print_rank_0("\n".join(chain['link_table_buffer']) + "\n") - chain['last_weight'] = self return result From 650461137c60367e82438a719966bd326e7c6faf Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 13 Apr 2026 23:38:08 -0700 Subject: [PATCH 17/43] ET+CG mem fix1: use 
pooled buffers for both async and sync gathers to avoid allocating fresh memory each iteration. --- .../module/extended_tensor_parallelism.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a9689db253..e18bef56fa 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -450,21 +450,22 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv else: gather_weights = list(w.get_padded_shard() for w in weights) - # 4. Cache checkout (async only — sync gathers don't need pooled buffers). - if async_op: - dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] - out_buffers = [] - cache = get_global_ETP_cache() - for p, dt in zip(weights, dtypes): - if fwd: - # The fwd ag buffer is always initialized in 'all_gather_and_prefetch' - out_buffers.append(cache.get(p._ag_ticket_fwd)) - else: - if p._ag_ticket_bwd is None: - p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) - out_buffers.append(cache.get(p._ag_ticket_bwd)) - else: - out_buffers = None + # 4. Cache checkout — use pooled buffers for both async and sync gathers + # to avoid allocating fresh memory each iteration. + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + out_buffers = [] + cache = get_global_ETP_cache() + for p, dt in zip(weights, dtypes): + if fwd: + if p._ag_ticket_fwd is None: + p._ag_ticket_fwd = cache.reserve(p, dt, fwd=True) + cache.get(p._ag_ticket_fwd) + cache.release(p._ag_ticket_fwd) + out_buffers.append(cache.get(p._ag_ticket_fwd)) + else: + if p._ag_ticket_bwd is None: + p._ag_ticket_bwd = cache.reserve(p, dt, fwd=False) + out_buffers.append(cache.get(p._ag_ticket_bwd)) # 5. Communicate. 
etp_group = weights[0].group From 14db8142dfa667aa184ec72cc6c63c86fb67b127 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 00:46:49 -0700 Subject: [PATCH 18/43] ETP+CG mem fix2: fix wgrad tensor retention and eliminate redundant AG allocations 1. Release NCCL Work C++ tensor refs promptly in ETPShardHandle.wait() (self.handle = None) so wgrad buffers are freed when RS is waited, not held until optimizer.step(). 2. Use cache buffers for sync all-gather path (not just async). The old code passed out_buffers=None for sync gathers, allocating ~22 GB/iter of fresh tensors. Sync gathers now reuse the same ETPWeightCache buffers as the async prefetch path. 3. Add standalone wgrad input buffer pool (_wgrad_buf_pool) for expert chain. get_wgrad_tensor() draws from the pool; buffers are returned after RS is waited via _wgrad_input_bufs stash in _wait_reduce_scatter. Reduces expert wgrad peak from ~4 GB (held until optimizer) to ~640 MB (16 buffers reused across all MoE layers). 4. Stash _wgrad_input_bufs for all chains (not just expert) so ungraphed dense weights (output layer) also drop Python refs at _wait_reduce_scatter instead of surviving until calc_params_l2_norm. 5. Fix tensor comparison crash in cache.release(): use identity check (any(b is slot.buf ...)) instead of tensor == which returns a multi-element bool tensor. 
--- .../module/extended_tensor_parallelism.py | 68 ++++++++++++++++--- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index e18bef56fa..1700d14129 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -50,6 +50,31 @@ class ETPWeightState(Enum): _AG_STREAMS: Dict[str, torch.cuda.Stream] = {} _RS_STREAMS: Dict[str, torch.cuda.Stream] = {} +# Standalone wgrad input buffer pool, keyed by (shape, dtype). +# Separate from ETPWeightCache because: +# 1. Wgrad buffers are expert-chain only (never graphed) +# 2. They need true release-then-reuse (the pool shrinks/grows), whereas +# ETPWeightCache keeps slot.buf set for CUDA graph address stability +_wgrad_buf_pool: Dict[tuple, list] = {} + + +def _wgrad_pool_get(shape: tuple, dtype: torch.dtype, device) -> torch.Tensor: + """Get a wgrad buffer from the pool, or allocate a fresh one.""" + key = (shape, dtype) + pool = _wgrad_buf_pool.get(key) + if pool: + return pool.pop() + return torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + + +def _wgrad_pool_put(buf: torch.Tensor): + """Return a wgrad buffer to the pool for reuse.""" + key = (tuple(buf.shape), buf.dtype) + if key not in _wgrad_buf_pool: + _wgrad_buf_pool[key] = [] + _wgrad_buf_pool[key].append(buf) + + def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: # All chains share one AG stream. 
The ag_event CUDA event object is recorded on # this stream during graph capture; using a different stream at replay would cause @@ -167,6 +192,7 @@ def __init__(self, handle, etp_shards, reduce_scatter=False): def wait(self): if self.handle is not None: self.handle.wait() + self.handle = None # Release NCCL Work and its C++ tensor references promptly for w in self.etp_shards: if self.reduce_scatter: w._set_rs_state(ETPWeightState.DATA_READY) @@ -659,12 +685,7 @@ def batched_all_gather_and_prefetch(self, **kwargs): return self.all_gather_and_prefetch(**kwargs) def get_wgrad_tensor(self): - return torch.empty( - self._unsharded_shape, - dtype=self.main_grad.dtype, - device=self.device, - requires_grad=False, - ) + return _wgrad_pool_get(self._unsharded_shape, self.main_grad.dtype, self.device) @staticmethod def _finalize_wgrad(param, wgrad_rs): @@ -695,6 +716,14 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() + # RS is done — drop stashed wgrad input buffer refs. + # Safe because handle.wait() above guarantees the RS kernel finished reading them. + # Expert-chain buffers go back to pool for reuse; dense-chain buffers just drop refs. + if getattr(self, '_wgrad_input_bufs', None) is not None: + if self.chain_id == 'expert': + for buf in self._wgrad_input_bufs: + _wgrad_pool_put(buf) + self._wgrad_input_bufs = None def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): """Reduce-scatter one or more wgrads. Returns (outputs, handle). @@ -764,15 +793,26 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights + # Expert-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). + # All ungraphed weights (expert + output layer) benefit from the stash + # (_wgrad_input_bufs) which drops Python refs once the RS is waited. 
+ poolable = self.chain_id == 'expert' + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) + # Stash wgrad input buffers — cannot recycle yet because the async RS + # kernel is still reading them on rs_stream. + self._wgrad_input_bufs = wgrads ret = tuple([None] * len(wgrads)) if batched else None else: - # Sync reduce-scatter (last weight in chain) + # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] + if poolable: + for buf in wgrads: + _wgrad_pool_put(buf) ret = result if batched else result[0] # Wait for last reduce scatter if it was async @@ -927,10 +967,18 @@ def get(self, ticket: int) -> torch.Tensor: return slot.buf def release(self, ticket: int): - """Return the buffer to the pool. Ticket remains valid.""" + """Return the buffer to the pool. Ticket remains valid. + + slot.buf is intentionally NOT cleared: get() must stay idempotent so that + CUDA-graph-captured buffers keep their fixed address across replays, and + reallocate_to_mempool() can find every dense-chain buffer. + """ slot = self._slots[ticket] - assert slot.buf is not None - if slot.buf not in self._pool[slot.key]: + if slot.buf is None: + return + # Use identity check — tensor == tensor returns a multi-element bool tensor + # which crashes in a boolean context ("Boolean value of Tensor is ambiguous"). 
+ if not any(b is slot.buf for b in self._pool.get(slot.key, [])): self._pool[slot.key].append(slot.buf) def clear(self): From e9acc1b10923ea634ba8bda92716cc7e893fde78 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 02:54:58 -0700 Subject: [PATCH 19/43] ETP+CG mem fix3: release gathered expert weights after dgrad GEMM in backward For nvfp4, batched_all_gather_and_prefetch_bwd() returns NVFP4TensorStorage objects with internal sub-tensors (columnwise data/scale_inv). The local `weights` variable kept them alive until function return, wasting memory through the wgrad phase. Delete `weights` immediately after the dgrad GEMM (the last consumer), saving weight_sizes for the fuse_wgrad_accumulation=False fallback path. --- transformer_engine/pytorch/module/grouped_linear.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index d11947be0e..3a3e8f86fc 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -453,6 +453,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], use_split_accumulator=dgrad_gemm_use_split_accumulator, ) + # Gathered weights are no longer needed after dgrad GEMM. + # For nvfp4, the NVFP4TensorStorage and its sub-tensors (scale_inv etc.) + # would otherwise survive until function return via this local ref. 
+ if ctx.etp_size > 1: + weight_sizes = [w.size() for w in weights] + del weights + if ctx.weights_requires_grad: wgrad_gemm_use_split_accumulator = _2X_ACC_WGRAD if ctx.fp8: @@ -464,9 +471,10 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], if ctx.fuse_wgrad_accumulation: wgrad_list = main_grads else: + sizes = weight_sizes if ctx.etp_size > 1 else [w.size() for w in weights] wgrad_list = [ - torch.empty(w.size(), dtype=ctx.activation_dtype, device=ctx.device) - for w in weights + torch.empty(sz, dtype=ctx.activation_dtype, device=ctx.device) + for sz in sizes ] if ctx.save_original_input: From 96515f1d4c2d839fb0520e93392e0055fae5ccae Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 08:46:34 -0700 Subject: [PATCH 20/43] [Conservative] fix ETP+CG+DDPOverlaping hang: serialize DDP RS and EETP AG on IB in expert backward ag_stream.wait_stream(main_stream) before batched_all_gather_and_prefetch_bwd in grouped_linear.py backward. With --overlap-grad-reduce + CG, DDP backward hooks fire a reduce-scatter (IB, main_stream) that races with the EETP all-gather (IB, ag_stream), causing NCCL deadlock at 64+ GPU IB scale --- transformer_engine/pytorch/module/grouped_linear.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 3a3e8f86fc..0aae97541c 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,6 +420,16 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: + # Drain main_stream before launching EETP all-gather on ag_stream. + # With --overlap-grad-reduce + CG, DDP backward hooks may have fired a + # reduce-scatter (IB, on main_stream) that races with the EETP AG (IB, + # on ag_stream). 
Making ag_stream wait for main_stream serializes IB ops. + from transformer_engine.pytorch.module.extended_tensor_parallelism import ( + get_ag_stream, + ) + get_ag_stream(origin_weights[0].chain_id).wait_stream( + torch.cuda.current_stream() + ) weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: From d78bd538941a8a42b10fa94361cc427dfcd6fbd9 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 14 Apr 2026 19:40:46 -0700 Subject: [PATCH 21/43] ETP+CG+DDP final fix: ETP: restore register_grad_accum_hook + _finalize_wgrad DDP hook trigger Re-add register_grad_accum_hook() to store DDP backward hook on ETP params. _finalize_wgrad now calls the hook after RS wait + main_grad.add_(), firing DDP register_grad_ready at the correct serialization point. This replaces the previous approach of skipping DDP hooks entirely for ETP params. param.grad = dummy_grad is a Python attr set (does NOT trigger autograd's grad accumulator); the explicit _grad_accum_hook() call is required. 
--- .../module/extended_tensor_parallelism.py | 31 +++++++++++++++++-- .../pytorch/module/grouped_linear.py | 10 ------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 1700d14129..830ae7f6e0 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -261,6 +261,9 @@ def __init__(self, x, *args, **kwargs): self._prefetch_handle = None self._need_weight_prefetch = True self.ag_event = torch.cuda.Event(external=True) + # DDP backward hook (set by register_grad_accum_hook) + self._grad_accum_node = None + self._grad_accum_hook = None # Quantization self._quantizer = None self.did_cast_to_low_precision = False @@ -687,12 +690,24 @@ def batched_all_gather_and_prefetch(self, **kwargs): def get_wgrad_tensor(self): return _wgrad_pool_get(self._unsharded_shape, self.main_grad.dtype, self.device) + def register_grad_accum_hook(self, grad_accum_node, hook): + """Register a DDP backward hook to be called from _finalize_wgrad. + + For ETP params, autograd may receive None (async RS) so the normal grad + accumulator hook never fires. Instead, _finalize_wgrad calls the hook + explicitly after RS wait + gradient accumulation, ensuring DDP's + register_grad_ready fires at exactly the right time. + """ + self._grad_accum_node = grad_accum_node + self._grad_accum_hook = hook + @staticmethod def _finalize_wgrad(param, wgrad_rs): - """Post-RS per-param processing: strip padding, accumulate into main_grad. + """Post-RS per-param processing: strip padding, accumulate, call DDP hook. - Accumulates the reduce-scattered wgrad into main_grad and returns - a dummy zero grad to autograd (DDP backward post hook is not used for ETP params). 
+ Accumulates the reduce-scattered wgrad into main_grad and triggers + the DDP backward hook (register_grad_ready) so the DP reduce-scatter + fires at the correct time during backward. """ param._set_rs_state(ETPWeightState.NONE) @@ -706,6 +721,16 @@ def _finalize_wgrad(param, wgrad_rs): if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) + + # 3. Trigger DDP backward hook (register_grad_ready). + # ETP bypasses autograd's normal gradient flow (returns None for async RS, + # accumulates directly into main_grad), so we must trigger the DDP hook + # manually. param.grad = dummy_grad is a Python attribute set that does NOT + # fire autograd's grad accumulator hook — only the explicit call below does. + if getattr(param, '_grad_accum_hook', None) is not None: + param.grad = dummy_grad + param._grad_accum_hook() + return dummy_grad diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 0aae97541c..3a3e8f86fc 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -420,16 +420,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], accumulate_wgrad_into_param_main_grad = ctx.fuse_wgrad_accumulation if ctx.etp_size > 1: - # Drain main_stream before launching EETP all-gather on ag_stream. - # With --overlap-grad-reduce + CG, DDP backward hooks may have fired a - # reduce-scatter (IB, on main_stream) that races with the EETP AG (IB, - # on ag_stream). Making ag_stream wait for main_stream serializes IB ops. 
- from transformer_engine.pytorch.module.extended_tensor_parallelism import ( - get_ag_stream, - ) - get_ag_stream(origin_weights[0].chain_id).wait_stream( - torch.cuda.current_stream() - ) weights = origin_weights[0].batched_all_gather_and_prefetch_bwd() if ctx.requires_dgrad: From 1fd09f513ff6560b97d9c191c66faa205ee6105f Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Wed, 15 Apr 2026 03:12:09 -0700 Subject: [PATCH 22/43] ETP+CG mem fix4: del main_grads after batched_wgrad_reduce_scatter to drop Python refs to wgrad input buffers immediately. The async RS still holds C++ refs via NCCL Work until _wait_reduce_scatter. Reduces peak memory during graph capture warmup (~320 MB per MoE layer). --- .../pytorch/module/extended_tensor_parallelism.py | 6 +++--- transformer_engine/pytorch/module/grouped_linear.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 830ae7f6e0..535f178988 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -725,10 +725,10 @@ def _finalize_wgrad(param, wgrad_rs): # 3. Trigger DDP backward hook (register_grad_ready). # ETP bypasses autograd's normal gradient flow (returns None for async RS, # accumulates directly into main_grad), so we must trigger the DDP hook - # manually. param.grad = dummy_grad is a Python attribute set that does NOT - # fire autograd's grad accumulator hook — only the explicit call below does. + # manually. Do NOT set param.grad before calling — the hook checks + # param.grad and would accumulate it into main_grad if zero_out_wgrad + # is True, corrupting the gradient with a non-zero dummy. 
if getattr(param, '_grad_accum_hook', None) is not None: - param.grad = dummy_grad param._grad_accum_hook() return dummy_grad diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py index 3a3e8f86fc..fca5b4ee61 100644 --- a/transformer_engine/pytorch/module/grouped_linear.py +++ b/transformer_engine/pytorch/module/grouped_linear.py @@ -560,6 +560,12 @@ def handle_custom_ddp_from_mcore(weight, wgrad): if ctx.etp_size > 1: wgrad_list = origin_weights[0].batched_wgrad_reduce_scatter(wgrad_list) + # Drop Python refs to wgrad input buffers. The async RS on rs_stream + # still holds C++ refs (via NCCL Work); those are released when + # _wait_reduce_scatter calls handle.wait() + self.handle = None. + # Without this del, main_grads keeps the tensors alive until function + # return, wasting memory during graph capture warmup. + del main_grads elif ctx.fuse_wgrad_accumulation: wgrad_list = [ handle_custom_ddp_from_mcore(weight, wgrad) From 66fb81c3b8d2dd045d0179fc38da5ca554988ee8 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Wed, 15 Apr 2026 19:54:51 -0700 Subject: [PATCH 23/43] update doc --- docs/README_ETP.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/README_ETP.md b/docs/README_ETP.md index d6c0a367da..52368579fb 100644 --- a/docs/README_ETP.md +++ b/docs/README_ETP.md @@ -28,8 +28,7 @@ TODO(shiqingf): add performance for Ultra model in nvfp4. | **FP8 / MXFP8 support** | Quantized shards with ETP-group amax reduction | | **Routed expert support** | Batched coalesced all-gather for all experts in a MoE layer (GroupedLinear) | | **Composable with TP/SP** | Orthogonal to tensor parallelism and sequence parallelism | -| **CUDA Graphs compatible** | ETP is compatible with CUDA Graphs. 
And kernels on sidestreams are no longer required to synchronize at graph breaks - | +| **CUDA Graphs compatible** | Dense-chain prefetches captured in graphs; expert-chain runs eagerly. DDP RS serialized via `register_grad_accum_hook` (called from `_finalize_wgrad` for eager params, from `_CudagraphReplayNode.backward` for graphed params). Forward drains at CG/eager boundary prevent IB races. | | **Debug naming** | `tag_etp_params_with_names(model)` populates human-readable names on every `ETPShardedParam`; the prefetch-link table is printed atomically at the start of the second forward pass | ### Implementation Mechanisms @@ -434,6 +433,42 @@ Both chains share the same `ag_stream` and `rs_stream`. Per-chain streams were c The single global `ETPWeightCache` serves both chains. Cache keys already include `expert_idx`, so dense and expert buffers never collide. `reallocate_to_mempool()` only migrates **dense-chain** buffers into the CUDA graph memory pool; expert-chain buffers remain in regular allocator memory. +### Excluding Params from the Chain + +Setting `weight.prefetch_initialized = True` at construction skips chain registration entirely. Megatron uses this for the embedding and output-layer weights, which perform synchronous all-gathers and must not join the dense chain (they execute outside the CUDA graph boundary, and linking them into the dense chain would cause the chain to cross the CG/eager boundary, reproducing the same NCCL deadlock as the old single-chain design). Setting `_need_weight_prefetch = False` in addition disables the async path so these weights always do synchronous AG. + +### ETP + DDP Serialization (`register_grad_accum_hook`) + +ETP bypasses autograd's normal gradient flow: `wgrad_reduce_scatter` returns `None` for async RS (chain interior params), and `_finalize_wgrad` accumulates directly into `main_grad`. 
As a result, autograd's grad accumulator never fires for these params, and standard DDP backward hooks (`grad_acc.register_hook`) would never trigger. + +Without proper serialization, DDP reduce-scatter (IB) and ETP reduce-scatter (IB) can run concurrently on different CUDA streams at 64+ GPU IB scale, causing NCCL deadlock. + +The solution: `register_grad_accum_hook(grad_acc, hook)` stores the DDP hook on the `ETPShardedParam`. `_finalize_wgrad` calls the hook **manually** after RS wait + gradient accumulation: + +```python +# _finalize_wgrad (called after RS is waited and gradient accumulated) +param.main_grad.add_(wgrad_rs) # gradient accumulated +param.grad = dummy_grad # Python attr set (does NOT fire autograd) +param._grad_accum_hook() # manually triggers DDP register_grad_ready +``` + +This fires `register_grad_ready` at exactly the right serialization point, ensuring DDP RS launches only after ETP RS completes. The hook trigger differs by execution mode: + +| Weight type | Hook trigger location | When | +|---|---|---| +| **Graphed dense** (mamba/attn/shared_expert) | `_CudagraphReplayNode.backward` in `cuda_graphs.py` | After graph replay (Python, not captured) | +| **Eager expert** (grouped_fc1/fc2) | `_finalize_wgrad` in `extended_tensor_parallelism.py` | After RS wait + `main_grad.add_` (Python, every iteration) | +| **Eager chain head** (sync RS) | `_finalize_wgrad` called directly in `wgrad_reduce_scatter` | Immediately after sync RS completes | + +For graphed params: `_finalize_wgrad` runs during capture but the hook returns early (`is_graph_capturing()`). At replay, `_finalize_wgrad` doesn't re-run from Python (captured GPU ops only). `_CudagraphReplayNode.backward` explicitly triggers the hook after setting `grad_added_to_main_grad = True`. + +### Forward-Path Drains at CG/Eager Boundary + +Before eager expert compute starts (`_forward_mlp_expert_compute`), two drains ensure no in-flight IB ops race with expert backward: + +1. 
`_drain_etp_side_streams('dense')` — drains the dense ETP AG prefetch (e.g., `AG(next_mamba_fc1)` launched by the preceding shared_expert GEMM on `ag_stream`) +2. `_drain_param_gather()` — drains async DDP param all-gather from `--overlap-param-gather` + CG (the forward pre-hook `finish_param_sync` is skipped during graph capture/replay) + --- ## Scalability From e09983cad09d93ad008b6d3ef327f58990fa8281 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 16 Apr 2026 07:15:47 -0700 Subject: [PATCH 24/43] ETP+CG: re-enable bwd ETP RS overlapping across Graphs. --- .../pytorch/module/extended_tensor_parallelism.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 535f178988..007a15ab2f 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1076,18 +1076,22 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None): +def wait_async_comms(chain_id: str = None, skip_rs: bool = False): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). If None, drain all chains (backward compat). + skip_rs: If True, only process AG handles (record ag_event on ag_stream) + and skip RS handles. Used to record a CUDA graph completion event + after AG but before RS, enabling cross-graph RS overlap. 
""" for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: continue param._wait_param_gather() - param._wait_reduce_scatter() + if not skip_rs: + param._wait_reduce_scatter() @dataclass From d72bcced27e3d20c4f85e9d13aea7dd6d4f6ff45 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 16 Apr 2026 19:37:49 -0700 Subject: [PATCH 25/43] Revert "ETP+CG: re-enable bwd ETP RS overlapping across Graphs." This reverts commit e09983cad09d93ad008b6d3ef327f58990fa8281. --- .../pytorch/module/extended_tensor_parallelism.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 007a15ab2f..535f178988 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1076,22 +1076,18 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None, skip_rs: bool = False): +def wait_async_comms(chain_id: str = None): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). If None, drain all chains (backward compat). - skip_rs: If True, only process AG handles (record ag_event on ag_stream) - and skip RS handles. Used to record a CUDA graph completion event - after AG but before RS, enabling cross-graph RS overlap. 
""" for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: continue param._wait_param_gather() - if not skip_rs: - param._wait_reduce_scatter() + param._wait_reduce_scatter() @dataclass From 7cc86fd7de159aeb9f537bc52b3c39abc4293248 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 19 Apr 2026 08:35:14 -0700 Subject: [PATCH 26/43] ETP: fix iter-2 NaN + unbounded wgrad pool growth; partition streams MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes exposed by adding embedding / output_layer to the UNGRAPHED prefetch chain: 1. iter-2 NaN (consumer-side race on AG output buffer). The async AG prefetch was issued from main_stream; NCCL's caller-stream preEvent queued behind pending CG/compute and NCCL started late, leaving the consumer GEMM reading a partially-written buffer. Fix: wrap the async issue in in both all_gather_and_prefetch (fwd) and all_gather_and_prefetch_bwd (bwd). Added a state guard in _get_prefetched_weight that asserts an AG was issued for this consume cycle — catches silent stale-cache reads from misconfigured _need_weight_prefetch flags. 2. Unbounded _wgrad_buf_pool growth. _wait_reduce_scatter pushed the wgrad input buffer into the pool unconditionally, but callers that don't acquire via _wgrad_pool_get (Megatron layers.py wgrad GEMM, aten F.embedding backward) never popped — every iter leaked N fresh buffers into the pool. Fix: tag pool-owned buffers at _wgrad_pool_get; _wgrad_pool_put no-ops on foreign buffers, letting the caching allocator recycle. Side effect: throughput 80 → 580 TFLOPs/GPU (pool thrash eliminated). 3. ag/rs streams partitioned by (chain_id, NCCL group). UNGRAPHED chain can span multiple communicators (ETP vs EETP); sharing a single user-level stream forced cross-group NCCL ops to serialize. Stream dicts are now keyed on (chain_id, id(group)); adds get_{ag,rs}_streams_for_chain() helpers. 
--- .../module/extended_tensor_parallelism.py | 326 ++++++++++++++---- 1 file changed, 253 insertions(+), 73 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 535f178988..87c39b99b5 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -27,6 +27,111 @@ DEBUG_TENSOR = None +class ETPChain(str, Enum): + """Prefetch chain identifier for an ETPShardedParam. + + GRAPHED — fwd/bwd captured by a CUDA graph (MLM _CudaGraphRunner). + UNGRAPHED — fwd/bwd runs eagerly; includes embedding/output_layer and + routed grouped experts always, plus router/shared_experts + when their scope tag is not in cuda_graph_scope. + + Chains never cross-link (prev_w/next_w stay within one chain). CG + disabled → single UNGRAPHED chain; full-iteration graph → single GRAPHED. + """ + GRAPHED = "ETP_graphed" + UNGRAPHED = "ETP_ungraphed" + + +# Module-level cuda_graph_scope, set by MLM at init via set_cuda_graph_scope(). +# None or empty → CG is disabled; every ETP param classifies as UNGRAPHED. +# Value is a set of scope tags; e.g. {"mamba","attn","moe_router"}. +_CUDA_GRAPH_SCOPE: Optional[set] = None +# Whether shared_experts are run with overlap (cannot be captured). When True, +# shared_experts stay UNGRAPHED regardless of moe_router scope inclusion, matching +# the transformer_layer.py guard that excludes them from the captured submodules. +_MOE_SHARED_EXPERT_OVERLAP: bool = False + + +def set_cuda_graph_scope(scope, moe_shared_expert_overlap: bool = False): + """Record the active cuda_graph_scope for ETP chain classification. + + Called by MLM at init, BEFORE classify_etp_chains(). ``scope`` may be + None, an empty iterable (CG disabled), or an iterable of scope tags. 
+ """ + global _CUDA_GRAPH_SCOPE, _MOE_SHARED_EXPERT_OVERLAP + _CUDA_GRAPH_SCOPE = set(scope) if scope else None + _MOE_SHARED_EXPERT_OVERLAP = bool(moe_shared_expert_overlap) + + +def _classify_param_chain(param_name: str) -> 'ETPChain': + """Classify an ETPShardedParam by name + active cuda_graph_scope. + + embedding / output_layer are always UNGRAPHED. Other kinds (mamba mixer, + self/cross_attention, shared_experts, routed experts) are GRAPHED iff + their scope tag is present in cuda_graph_scope; otherwise UNGRAPHED. + """ + n = param_name + + # Always ungraphed — embedding and output_layer live outside any CG runner. + if "embedding" in n or "output_layer" in n: + return ETPChain.UNGRAPHED + + scope = _CUDA_GRAPH_SCOPE + if not scope: + # CG disabled: every ETP param goes to the single UNGRAPHED chain. + return ETPChain.UNGRAPHED + + if ".mlp.shared_experts." in n: + if _MOE_SHARED_EXPERT_OVERLAP: + return ETPChain.UNGRAPHED + return ETPChain.GRAPHED if ("moe" in scope or "moe_router" in scope) else ETPChain.UNGRAPHED + + if ".mlp.experts." in n: + return ETPChain.GRAPHED if "moe" in scope else ETPChain.UNGRAPHED + + if ".self_attention." in n or ".cross_attention." in n: + return ETPChain.GRAPHED if "attn" in scope else ETPChain.UNGRAPHED + + if ".mixer." in n: + return ETPChain.GRAPHED if "mamba" in scope else ETPChain.UNGRAPHED + + return ETPChain.UNGRAPHED + + +def classify_etp_chains(model) -> None: + """Walk model.named_parameters() and set chain_id on every ETPShardedParam. + + Call once at init, AFTER set_cuda_graph_scope() and BEFORE the first fwd + of any graphed param. Raises if an already chain-initialized param would + be reclassified into a different chain (its prev/next links are already + wired into the wrong list). 
+ """ + conflicts = [] + for name, param in model.named_parameters(): + if not isinstance(param, ETPShardedParam): + continue + target = _classify_param_chain(name).value + if param.prefetch_initialized and param.chain_id != target: + conflicts.append((name, param.chain_id, target)) + continue + param.chain_id = target + + # Bwd-prefetch opt-out: embedding.word_embeddings.weight does not need + # an AG in the bwd pass (its wgrad is a scatter-add on sharded rows + # and its input has no dgrad). Skipping its bwd AG saves one collective. + if "embedding" in name: + param._need_weight_prefetch_bwd = False + if conflicts: + raise RuntimeError( + "classify_etp_chains: the following params were already chain-initialized " + "with a different chain_id than the classifier would assign — this means " + "their chain links are already wired into the wrong list. Move classification " + "earlier in init. Conflicts: " + + ", ".join(f"{n}: {old!r}->{new!r}" for n, old, new in conflicts[:3]) + + ("..." if len(conflicts) > 3 else "") + ) + + class ETPWeightState(Enum): NONE = "NONE" # Sharded, no pending operation ASYNC_WAIT = "ASYNC_WAIT" # Async all-gather in progress @@ -50,55 +155,84 @@ class ETPWeightState(Enum): _AG_STREAMS: Dict[str, torch.cuda.Stream] = {} _RS_STREAMS: Dict[str, torch.cuda.Stream] = {} -# Standalone wgrad input buffer pool, keyed by (shape, dtype). -# Separate from ETPWeightCache because: -# 1. Wgrad buffers are expert-chain only (never graphed) -# 2. They need true release-then-reuse (the pool shrinks/grows), whereas -# ETPWeightCache keeps slot.buf set for CUDA graph address stability +# Wgrad input buffer pool, keyed by (shape, dtype). UNGRAPHED-only: GRAPHED +# wgrad bufs need address stability for CG replay and are not pool-recycled. 
_wgrad_buf_pool: Dict[tuple, list] = {} def _wgrad_pool_get(shape: tuple, dtype: torch.dtype, device) -> torch.Tensor: - """Get a wgrad buffer from the pool, or allocate a fresh one.""" + """Get a pool buffer or allocate fresh. Tagged so _wgrad_pool_put accepts + only pool-owned buffers — callers that don't use _wgrad_pool_get (e.g. + Megatron layers.py wgrad GEMM, aten F.embedding bwd) fall through to the + caching allocator on release.""" key = (shape, dtype) pool = _wgrad_buf_pool.get(key) if pool: - return pool.pop() - return torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + buf = pool.pop() + else: + buf = torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + buf._from_etp_wgrad_pool = True + return buf def _wgrad_pool_put(buf: torch.Tensor): - """Return a wgrad buffer to the pool for reuse.""" + """Return a pool-owned buffer for reuse (no-op for untagged buffers; see + _wgrad_pool_get).""" + if not getattr(buf, '_from_etp_wgrad_pool', False): + return key = (tuple(buf.shape), buf.dtype) if key not in _wgrad_buf_pool: _wgrad_buf_pool[key] = [] _wgrad_buf_pool[key].append(buf) -def get_ag_stream(chain_id: str = 'dense') -> torch.cuda.Stream: - # All chains share one AG stream. The ag_event CUDA event object is recorded on - # this stream during graph capture; using a different stream at replay would cause - # ag_event.wait() to see a stale recording, producing Inf gradients. - key = 'shared' +def _stream_key(chain_id: str, group) -> tuple: + """Key for the per-(chain, group) AG/RS stream dicts. + + Two partitioning axes: + - chain_id: captured (GRAPHED) vs eager (UNGRAPHED) ops must not share + a stream (eager ops would contaminate capture/replay state). + - group: independent NCCL communicators (e.g. ETP vs EETP) get their + own user-level stream to avoid cross-group serialization. 
+ """ + return (chain_id, id(group) if group is not None else 0) + + +def get_ag_stream(chain_id: str = ETPChain.GRAPHED.value, group=None) -> torch.cuda.Stream: + """Return the ETP all-gather stream for (chain_id, group). See _stream_key.""" + key = _stream_key(chain_id, group) if key not in _AG_STREAMS: _AG_STREAMS[key] = torch.cuda.Stream() return _AG_STREAMS[key] -def get_rs_stream(chain_id: str = 'dense') -> torch.cuda.Stream: - # All chains share one RS stream (same reason as AG stream). - key = 'shared' + +def get_rs_stream(chain_id: str = ETPChain.GRAPHED.value, group=None) -> torch.cuda.Stream: + """Return the ETP reduce-scatter stream for (chain_id, group). See _stream_key.""" + key = _stream_key(chain_id, group) if key not in _RS_STREAMS: _RS_STREAMS[key] = torch.cuda.Stream() return _RS_STREAMS[key] + def get_all_ag_streams() -> list: - """Return all AG streams that have been created.""" + """All AG streams created so far, across chains and groups.""" return list(_AG_STREAMS.values()) + def get_all_rs_streams() -> list: - """Return all RS streams that have been created.""" + """All RS streams created so far, across chains and groups.""" return list(_RS_STREAMS.values()) + +def get_ag_streams_for_chain(chain_id: str) -> list: + """AG streams for one chain (all groups that chain has touched).""" + return [s for k, s in _AG_STREAMS.items() if k[0] == chain_id] + + +def get_rs_streams_for_chain(chain_id: str) -> list: + """RS streams for one chain (all groups that chain has touched).""" + return [s for k, s in _RS_STREAMS.items() if k[0] == chain_id] + @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" @@ -170,7 +304,10 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): if is_grouped: etp_shard.expert_idx = idx etp_shard.is_routed_expert = True - etp_shard.chain_id = 'expert' + # Grouped routed experts are UNGRAPHED unless the "moe" scope captures + # them; classify_etp_chains() will fix 
this up at init time based on + # the actual cuda_graph_scope. We set UNGRAPHED here as a safe default. + etp_shard.chain_id = ETPChain.UNGRAPHED.value etp_shard.group = etp_group etp_shard.ps_size = etp_size # register the newly sharded param back to the module @@ -206,7 +343,9 @@ class ETPShardedParam(torch.nn.Parameter): _pending_rs_weight = None _first_weight_flag = True - # Per-chain state: each chain_id ('dense', 'expert') has its own linked list. + # Per-chain state: each chain_id (ETPChain.GRAPHED / ETPChain.UNGRAPHED) has + # its own linked list. Chains never cross-link: prev_w/next_w only connect + # params with the same chain_id. _chain_state: Dict[str, dict] = {} @classmethod @@ -231,7 +370,7 @@ def _layer_id(name: str) -> str: chain['link_node_count'] += 1 if chain['link_node_count'] == 1: - chain_id = getattr(curr, 'chain_id', 'dense') + chain_id = getattr(curr, 'chain_id', ETPChain.UNGRAPHED.value) chain['link_table_buffer'].append( f"\n[{chain_id} chain]" f"\n{'node_id':>7} | {'layer_id':>8} | {'curr_weight_name':<{_W}} | prev_weight_name" @@ -260,6 +399,11 @@ def __init__(self, x, *args, **kwargs): self._ag_ticket_bwd = None self._prefetch_handle = None self._need_weight_prefetch = True + # Per-direction prefetch opt-outs. Default True. The embedding weight + # never needs an AG during bwd (its wgrad is a scatter-add indexed by + # token ids, and its input is non-differentiable, so no dgrad either). + # classify_etp_chains() sets this to False for embedding.word_embeddings.weight. + self._need_weight_prefetch_bwd = True self.ag_event = torch.cuda.Event(external=True) # DDP backward hook (set by register_grad_accum_hook) self._grad_accum_node = None @@ -272,8 +416,11 @@ def __init__(self, x, *args, **kwargs): self.prefetch_initialized = False self.next_w = None self.prev_w = None - # Chain identity: 'dense' for mamba/attn/shared_expert, 'expert' for grouped experts - self.chain_id = 'dense' + # Chain identity (ETPChain.GRAPHED / ETPChain.UNGRAPHED). 
Defaults to + # UNGRAPHED as a safe fallback; classify_etp_chains(model) walks the + # model at init time (after set_cuda_graph_scope) and reclassifies + # based on param name + active cuda_graph_scope. + self.chain_id = ETPChain.UNGRAPHED.value # Grouped gemm self.is_routed_expert = False self.expert_idx = None @@ -533,14 +680,22 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Since wait() may sychronize against a different stream than the current stream, - # an event is recorded and waited on when the data is retrieved, which ensures the - # AG always finishes before returning the unsharded param - with torch.cuda.stream(get_ag_stream(self.chain_id)): - if self._prefetch_handle is not None: - self._prefetch_handle.wait() - self._prefetch_handle = None - self.ag_event.record() + # Wait-site for the async AG. Issuer (all_gather_and_prefetch{,_bwd}) + # and wait-site both use the TARGET's ag_stream so the caller-stream + # "preEvent" PyTorch records at issue time lives on an idle stream. + # A busy issue-stream would queue the preEvent behind pending work, + # delay NCCL start, and — even with the sync chain main ← ag_event ← + # ag_stream handle.wait() ← NCCL endEvent — leave the consumer GEMM + # reading a partial AG buffer. (NCCL kernel itself runs on PyTorch's + # per-PG ncclStream, not ag_stream.) handle.wait() here inserts the + # wait on NCCL's completion event into ag_stream; ag_event.record() + # then marks ag_stream for consumers (main_stream via ag_event.wait + # or MLM drains via main.wait_stream). 
+ with torch.cuda.stream(get_ag_stream(self.chain_id, self.group)): + if self._prefetch_handle is not None: + self._prefetch_handle.wait() + self._prefetch_handle = None + self.ag_event.record() def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_flag=None): result, _ = self._all_gather_weight( @@ -555,6 +710,20 @@ def _all_gather_weight_on_demand(self, fwd, skip_weight_cast=False, cast_noop_fl return result if self.is_routed_expert else result[0] def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=None): + # Stale-read guard: state must reflect an AG issued for this cycle; + # otherwise cache.get() would return the prior iter's AG buffer. + if ETP_CONFIG.check_param_states: + for w in self._weights: + assert w.state in ( + ETPWeightState.ASYNC_WAIT, + ETPWeightState.DATA_READY, + ETPWeightState.DATA_READY_SYNC, + ), ( + f"[ETP] _get_prefetched_weight({'fwd' if fwd else 'bwd'}) on " + f"{self._debug_name} with state={w.state!r} — no AG issued; " + f"cache.get() would return stale data. Check the chain's " + f"_need_weight_prefetch flag and issuer's prefetch logic." + ) # Wait for async prefetch if in progress self._wait_param_gather() self.ag_event.wait() @@ -593,11 +762,15 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): ETP_CONFIG.weight_prefetch and self.prev_w is not None and self.prev_w._need_weight_prefetch + and self.prev_w._need_weight_prefetch_bwd ): - _, handle = self.prev_w._all_gather_weight( - async_op=True, skip_weight_cast=True, cast_noop_flag=None, - fwd=False, nvtx_label=nvtx_label, - ) + # Issue on the target's ag_stream (see _wait_param_gather). 
+ target_stream = get_ag_stream(self.prev_w.chain_id, self.prev_w.group) + with torch.cuda.stream(target_stream): + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) self.prev_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -640,12 +813,15 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - _, handle = self.next_w._all_gather_weight( - async_op=True, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, nvtx_label=nvtx_label, - ) + # Issue on the target's ag_stream (see _wait_param_gather). + target_stream = get_ag_stream(self.next_w.chain_id, self.next_w.group) + with torch.cuda.stream(target_stream): + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) self.next_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -735,17 +911,19 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # assert self._wgrad_rs_handle is not None or is_graph_capturing() - with torch.cuda.stream(get_rs_stream(self.chain_id)): + # Asymmetric wrt _wait_param_gather: RS is issued from main_stream + # (not rs_stream) because main produced the RS input (wgrad) and + # naturally holds the write→read ordering. Wait-site enters rs_stream + # so it observes NCCL completion and rs_event marks it for consumers. + with torch.cuda.stream(get_rs_stream(self.chain_id, self.group)): if self._wgrad_rs_handle is not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() - # RS is done — drop stashed wgrad input buffer refs. - # Safe because handle.wait() above guarantees the RS kernel finished reading them. 
- # Expert-chain buffers go back to pool for reuse; dense-chain buffers just drop refs. + # Release stashed wgrad inputs: UNGRAPHED buffers go back to the pool; + # GRAPHED just drops Python refs (addresses must stay stable for CG). if getattr(self, '_wgrad_input_bufs', None) is not None: - if self.chain_id == 'expert': + if self.chain_id == ETPChain.UNGRAPHED.value: for buf in self._wgrad_input_bufs: _wgrad_pool_put(buf) self._wgrad_input_bufs = None @@ -818,10 +996,10 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): wgrads = list(wgrad) if batched else [wgrad] weights = self._weights - # Expert-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). - # All ungraphed weights (expert + output layer) benefit from the stash - # (_wgrad_input_bufs) which drops Python refs once the RS is waited. - poolable = self.chain_id == 'expert' + # UNGRAPHED-chain wgrads are recycled via the standalone pool (_wgrad_pool_put). + # GRAPHED-chain wgrads cannot pool-recycle because CUDA graphs require + # stable buffer addresses across replay. 
+ poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish) @@ -893,7 +1071,7 @@ class _TicketSlot: dtype: object # torch.dtype or tex.DType reduce_scatter: bool fwd: bool - chain_id: str = 'dense' # chain this slot belongs to + chain_id: str = ETPChain.GRAPHED.value # chain this slot belongs to buf: Optional[torch.Tensor] = field(default=None) # None when released or after clear() @@ -975,7 +1153,7 @@ def reserve(self, param: 'ETPShardedParam', dtype, fwd: bool, reduce_scatter=Fal self._slots[ticket] = _TicketSlot( key=key, param=param, dtype=dtype, reduce_scatter=reduce_scatter, fwd=fwd, - chain_id=getattr(param, 'chain_id', 'dense'), + chain_id=getattr(param, 'chain_id', ETPChain.UNGRAPHED.value), ) return ticket @@ -1014,25 +1192,26 @@ def clear(self): self._total_bytes = 0 def reallocate_to_mempool(self, device, mempool): - """Re-allocate dense-chain ticket buffers into a CUDA graph memory pool. + """Re-allocate GRAPHED-chain ticket buffers into a CUDA graph memory pool. - Call BEFORE graph capture so every dense-chain buffer lives in the capture - pool and no allocations are recorded inside the graph. Expert-chain buffers - are left in regular memory (expert compute runs eagerly, not in graphs). + Call BEFORE graph capture so every GRAPHED-chain buffer lives in the capture + pool and no allocations are recorded inside the graph. UNGRAPHED-chain + buffers are left in regular memory (they are never referenced by any + captured graph). 
""" - # Identify keys that belong to the dense chain - dense_keys = set() + # Identify keys that belong to the GRAPHED chain + graphed_keys = set() for slot in self._slots.values(): - if slot.chain_id == 'dense': - dense_keys.add(slot.key) + if slot.chain_id == ETPChain.GRAPHED.value: + graphed_keys.add(slot.key) - # Clone only dense-chain pool buffers into the passed in mempool + # Clone only GRAPHED-chain pool buffers into the passed in mempool self._total_bytes = 0 new_pool = defaultdict(list) torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) for key, buffers in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: continue new_buffers = [] for _ in range(len(buffers)): @@ -1041,23 +1220,23 @@ def reallocate_to_mempool(self, device, mempool): new_pool[key] = new_buffers torch._C._cuda_endAllocateToPool(device, mempool) - # Map each buffer in the old pool to its corresponding new one (dense only) + # Map each buffer in the old pool to its corresponding new one (GRAPHED only) old_to_new_buff = {} for key, old_pool in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: continue new = new_pool[key] for old_buf, new_buf in zip(old_pool, new): old_to_new_buff[old_buf] = new_buf - # Replace each dense slot's reference; keep expert slots unchanged + # Replace each GRAPHED slot's reference; keep UNGRAPHED slots unchanged for slot in self._slots.values(): - if slot.chain_id == 'dense' and slot.buf is not None and slot.buf in old_to_new_buff: + if slot.chain_id == ETPChain.GRAPHED.value and slot.buf is not None and slot.buf in old_to_new_buff: slot.buf = old_to_new_buff[slot.buf] - # Merge: dense keys get new buffers, expert keys keep old ones + # Merge: GRAPHED keys get new buffers, UNGRAPHED keys keep old ones for key, buffers in self._pool.items(): - if key not in dense_keys: + if key not in graphed_keys: new_pool[key] = buffers self._pool = new_pool return @@ -1080,11 +1259,12 @@ def 
wait_async_comms(chain_id: str = None): """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). Args: - chain_id: If specified, only drain params belonging to this chain ('dense' or 'expert'). - If None, drain all chains (backward compat). + chain_id: If specified, only drain params belonging to this chain + (ETPChain.GRAPHED.value or ETPChain.UNGRAPHED.value). + If None, drain all chains. """ for param in list(_inflight_comm_params): - if chain_id is not None and getattr(param, 'chain_id', 'dense') != chain_id: + if chain_id is not None and getattr(param, 'chain_id', ETPChain.UNGRAPHED.value) != chain_id: continue param._wait_param_gather() param._wait_reduce_scatter() From a652a63e952045c3faafccdf5cd3dc59e63f5c29 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 21 Apr 2026 04:51:47 -0700 Subject: [PATCH 27/43] ETP+nvfp4: coalescing amax reduction for fprop groupedgemm --- transformer_engine/pytorch/csrc/common.h | 21 +++- transformer_engine/pytorch/csrc/extensions.h | 7 ++ .../pytorch/csrc/extensions/cast.cpp | 66 ++++++++++ .../pytorch/csrc/extensions/pybind.cpp | 7 ++ transformer_engine/pytorch/csrc/quantizer.cpp | 52 +++++++- .../module/extended_tensor_parallelism.py | 113 +++++++++++++++++- 6 files changed, 262 insertions(+), 4 deletions(-) diff --git a/transformer_engine/pytorch/csrc/common.h b/transformer_engine/pytorch/csrc/common.h index 6aab9938b3..e5ead50d09 100644 --- a/transformer_engine/pytorch/csrc/common.h +++ b/transformer_engine/pytorch/csrc/common.h @@ -365,11 +365,30 @@ class NVFP4Quantizer : public Quantizer { */ void quantize_with_amax(TensorWrapper& input, TensorWrapper& out); + /*! @brief Compute (and D2D fill) local amax only — no cast, no allreduce. + * + * Writes the local amax into out's rowwise and/or columnwise amax + * buffers. Callers are expected to perform a coalesced allreduce + * across the amax reduction group afterwards, then invoke + * quantize_cast_only to finish the cast with the reduced amax. 
+ */ + void compute_amax_only(const TensorWrapper& input, TensorWrapper& out); + + /*! @brief Cast to NVFP4 assuming amax already reduced externally. + * + * Skips both local amax compute and the internal amax allreduce. + * Callers must guarantee out's amax buffers already hold the reduced + * amax (e.g. via compute_amax_only + allreduce_coalesced). + */ + void quantize_cast_only(const TensorWrapper& input, TensorWrapper& out, + const std::optional& noop_flag = std::nullopt); + std::vector get_scale_shape(const std::vector& shape, bool columnwise) const; private: void quantize_impl(const TensorWrapper& input, TensorWrapper& out, - const std::optional& noop_flag, bool compute_amax); + const std::optional& noop_flag, bool compute_amax, + bool skip_amax_reduction = false); }; std::unique_ptr convert_quantizer(py::handle quantizer); diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index e4d4e5094c..fc26d025c4 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -285,6 +285,13 @@ std::vector rmsnorm_fwd(const py::handle &input, const py::handle &w py::object quantize(const at::Tensor &tensor, py::handle quantizer, const py::object &output, std::optional noop_flag); +// NVFP4-only split-phase quantize: compute amax, coalesce allreduce externally, then cast. 
+py::object compute_amax_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output); +py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output, + std::optional noop_flag); + py::object dequantize(const py::handle &input, DType otype); py::object group_quantize(const at::Tensor &tensor, py::handle quantizer, const size_t num_tensors, diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp index f8f793f036..2c261c3c6d 100644 --- a/transformer_engine/pytorch/csrc/extensions/cast.cpp +++ b/transformer_engine/pytorch/csrc/extensions/cast.cpp @@ -80,6 +80,72 @@ py::object quantize(const at::Tensor &tensor, py::handle quantizer, const py::ob return output_py; } +/*! @brief NVFP4-only: compute local amax into `output`'s amax buffers, no cast, no allreduce. + * + * Pair with an external coalesced allreduce of the returned amax tensors, + * then call `quantize_cast_only_nvfp4` to finish the cast. 
+ */ +py::object compute_amax_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output) { + NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer.ptr()), + "compute_amax_nvfp4 requires an NVFP4Quantizer"); + auto quantizer_cpp = convert_quantizer(quantizer); + auto *nvfp4_quantizer = dynamic_cast(quantizer_cpp.get()); + NVTE_CHECK(nvfp4_quantizer != nullptr, "Failed to cast quantizer to NVFP4Quantizer"); + + auto input_contiguous = tensor.contiguous(); + auto input_cpp = makeTransformerEngineTensor(input_contiguous); + + TensorWrapper output_cpp; + py::object output_py; + if (output.is_none()) { + const auto shape = get_tensor_shape(input_cpp); + const auto fake_dtype = input_cpp.dtype(); + std::tie(output_cpp, output_py) = quantizer_cpp->create_tensor(shape, fake_dtype); + } else { + std::tie(output_cpp, output_py) = quantizer_cpp->convert_and_update_tensor(output); + } + + nvfp4_quantizer->compute_amax_only(input_cpp, output_cpp); + return output_py; +} + +/*! @brief NVFP4-only: cast to FP4 using pre-reduced amax in `output`'s amax buffers. + * + * Skips both local amax compute and the internal allreduce. Caller must have + * already populated `output`'s amax via compute_amax_nvfp4 + coalesced allreduce. 
+ */ +py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantizer, + const py::object &output, + std::optional noop_flag) { + NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer.ptr()), + "quantize_cast_only_nvfp4 requires an NVFP4Quantizer"); + auto quantizer_cpp = convert_quantizer(quantizer); + auto *nvfp4_quantizer = dynamic_cast(quantizer_cpp.get()); + NVTE_CHECK(nvfp4_quantizer != nullptr, "Failed to cast quantizer to NVFP4Quantizer"); + + auto input_contiguous = tensor.contiguous(); + auto input_cpp = makeTransformerEngineTensor(input_contiguous); + + TensorWrapper output_cpp; + py::object output_py; + if (output.is_none()) { + const auto shape = get_tensor_shape(input_cpp); + const auto fake_dtype = input_cpp.dtype(); + std::tie(output_cpp, output_py) = quantizer_cpp->create_tensor(shape, fake_dtype); + } else { + std::tie(output_cpp, output_py) = quantizer_cpp->convert_and_update_tensor(output); + } + + std::optional noop_flag_cpp; + if (noop_flag.has_value()) { + noop_flag_cpp = makeTransformerEngineTensor(*noop_flag); + } + + nvfp4_quantizer->quantize_cast_only(input_cpp, output_cpp, noop_flag_cpp); + return output_py; +} + namespace { // helper functions for NVFP4 grouped quantization (cuda graph safe with shapes stored in device without D2H copy) diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 8302a13010..daee89a038 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -137,6 +137,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { NVTE_DECLARE_COMMON_PYBIND11_HANDLES(m) m.def("quantize", transformer_engine::pytorch::quantize, py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), py::arg("noop") = py::none()); + m.def("compute_amax_nvfp4", transformer_engine::pytorch::compute_amax_nvfp4, + "NVFP4: compute local amax into output's amax buffers; no cast, no 
allreduce", + py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none()); + m.def("quantize_cast_only_nvfp4", transformer_engine::pytorch::quantize_cast_only_nvfp4, + "NVFP4: cast using pre-reduced amax in output's amax buffers; skips amax compute and allreduce", + py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), + py::arg("noop") = py::none()); m.def("dequantize", &transformer_engine::pytorch::dequantize, "Dequantize", py::arg("input"), py::arg("otype")); m.def("group_quantize", transformer_engine::pytorch::group_quantize, py::arg("tensor"), diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index 0214f7ff71..ac8fa26bc3 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -2121,7 +2121,7 @@ std::pair NVFP4Quantizer::convert_and_update_tensor( void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& out, const std::optional& noop_flag, - bool compute_amax) { + bool compute_amax, bool skip_amax_reduction) { // Nothing to be done if input is empty if (input.numel() == 0) { return; @@ -2225,7 +2225,7 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou } // amax reduction - if (this->with_amax_reduction) { + if (this->with_amax_reduction && !skip_amax_reduction) { std::vector amax_tensors; // push amax tensors inside if they need to be reduced auto make_amax_tensor = [](void* data_ptr) { @@ -2378,6 +2378,54 @@ void NVFP4Quantizer::quantize_with_amax(TensorWrapper& input, TensorWrapper& out this->quantize_impl(input, out, std::nullopt, false); } +void NVFP4Quantizer::compute_amax_only(const TensorWrapper& input, TensorWrapper& out) { + // Nothing to be done if input is empty + if (input.numel() == 0) { + return; + } + + // Only the non-RHT path is supported for the split-phase API today. 
+ // RHT path's amax depends on the RHT-rotated view, which is produced + // alongside the cast; decoupling amax from cast is not meaningful there. + NVTE_CHECK(!this->with_rht, + "NVFP4Quantizer::compute_amax_only does not support with_rht=true"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + QuantizationConfigWrapper quant_config; + quant_config.set_nvfp4_2d_quantization(this->with_2d_quantization); + + // Mirror the compute-amax block of quantize_impl exactly. + auto rowwise_amax_ptr = out.get_amax().data_ptr; + auto columnwise_amax_ptr = out.get_columnwise_amax().data_ptr; + void* amax_ptr = rowwise_amax_ptr != nullptr ? rowwise_amax_ptr : columnwise_amax_ptr; + NVTE_CHECK(amax_ptr != nullptr, "Could not find amax pointer"); + + out.set_amax(amax_ptr, DType::kFloat32, std::vector{1}); + NVTE_SCOPED_GIL_RELEASE( + { nvte_compute_amax_with_config(input.data(), out.data(), quant_config, stream); }); + out.set_amax(rowwise_amax_ptr, DType::kFloat32, std::vector{1}); + + // Replicate amax into whichever of rowwise/columnwise slots were requested. + if (rowwise_amax_ptr != amax_ptr && rowwise_amax_ptr != nullptr) { + NVTE_CHECK_CUDA(cudaMemcpyAsync(rowwise_amax_ptr, amax_ptr, sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } + if (columnwise_amax_ptr != amax_ptr && columnwise_amax_ptr != nullptr) { + NVTE_CHECK_CUDA(cudaMemcpyAsync(columnwise_amax_ptr, amax_ptr, sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } +} + +void NVFP4Quantizer::quantize_cast_only(const TensorWrapper& input, TensorWrapper& out, + const std::optional& noop_flag) { + // Amax is expected to already live in out's amax buffers (e.g. from + // compute_amax_only + an external coalesced allreduce). Skip both local + // amax compute and the internal allreduce. 
+ this->quantize_impl(input, out, noop_flag, /*compute_amax=*/false, + /*skip_amax_reduction=*/true); +} + std::vector NVFP4Quantizer::get_scale_shape(const std::vector& shape, bool columnwise) const { size_t numel = 1; diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 87c39b99b5..9012e8ab55 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -233,12 +233,102 @@ def get_rs_streams_for_chain(chain_id: str) -> list: """RS streams for one chain (all groups that chain has touched).""" return [s for k, s in _RS_STREAMS.items() if k[0] == chain_id] +# Cached once per process: whether the TE build exposes the split-phase APIs. +_COALESCED_AMAX_TE_APIS_AVAILABLE = ( + hasattr(tex, "compute_amax_nvfp4") and hasattr(tex, "quantize_cast_only_nvfp4") +) + + +def _coalesced_amax_static_eligible(weights): + """Walk the weight list once and decide whether the coalesced-amax path + is applicable. Depends only on fields that are fixed after model + construction (quantizer class, flags, amax_reduction_group, group size).""" + if not _COALESCED_AMAX_TE_APIS_AVAILABLE: + return False + if len(weights) <= 1: + return False + + group = None + for w in weights: + q = w._quantizer + if q is None or not isinstance(w.quantized, NVFP4TensorStorage): + return False + if not getattr(q, "with_amax_reduction", False): + return False + if getattr(q, "with_rht", False): + # RHT path does amax on RHT-rotated view, can't split compute + # from cast the way compute_amax_only assumes. 
+ return False + g = getattr(q, "amax_reduction_group", None) + if g is None: + return False + if group is None: + group = g + elif g is not group: + return False + return group.size() > 1 + + +def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): + """Replace the per-weight (compute_amax + allreduce + cast) loop with: + compute_amax loop → one coalesced allreduce → cast loop.""" + group = weights[0]._quantizer.amax_reduction_group + + # Materialize padded shards once; on padded last-rank get_padded_shard() + # launches an F.pad kernel, and we'd otherwise pay it twice per expert. + padded_shards = [w.get_padded_shard() for w in weights] + + # Phase 1: per-weight local amax into each w.quantized's amax buffers. + # Keep rowwise/columnwise both populated so the group allreduce sees + # whichever the consumer GEMM will read. + for w, shard in zip(weights, padded_shards): + w._quantizer.set_usage(rowwise=True, columnwise=True) + tex.compute_amax_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + ) + + # Phase 2: one coalesced allreduce across every weight's amax tensors. + amax_tensors = [] + for w in weights: + rw = w.quantized._amax_rowwise + cw = w.quantized._amax_columnwise + if rw is not None: + amax_tensors.append(rw) + if cw is not None and (rw is None or cw.data_ptr() != rw.data_ptr()): + amax_tensors.append(cw) + torch.distributed.all_reduce_coalesced( + amax_tensors, + op=torch.distributed.ReduceOp.MAX, + group=group, + ) + + # Phase 3: per-weight cast using the pre-reduced amax; skips the internal + # allreduce inside the quantizer. 
+ for w, shard in zip(weights, padded_shards): + tex.quantize_cast_only_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + noop=cast_noop_flag, + ) + w.did_cast_to_low_precision = True + + @dataclass class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 check_param_states: bool = True weight_prefetch: bool = True + # When True and the weight list in _all_gather_weight contains >1 NVFP4 + # shards that share an amax reduction group, coalesce their per-expert + # amax allreduces into a single NCCL call. Requires TE with + # tex.compute_amax_nvfp4 / tex.quantize_cast_only_nvfp4; the eligibility + # guard in _coalesced_amax_static_eligible falls back to the per-weight + # path when either binding is missing. + coalesce_amax_allreduce: bool = True ETP_CONFIG = ETPConfig() @@ -614,8 +704,29 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv w._set_state(ETPWeightState.DATA_READY_SYNC) # 2. Prepare: quantize, set usage direction. + # Static eligibility (quantizer class, flags, amax group) is fixed + # after model construction — compute once and cache on self so the + # hot path only pays the cheap per-call skip_weight_cast check. + if ETP_CONFIG.coalesce_amax_allreduce: + static_ok = getattr(self, "_coalesced_amax_static", None) + if static_ok is None: + static_ok = _coalesced_amax_static_eligible(weights) + self._coalesced_amax_static = static_ok + # Per-call: match the skip_weight_cast gate in _quantize_if_needed + # (fire when either skip_weight_cast is False or cast_noop_flag + # was provided by the FP8/NVFP4 recipe). 
+ use_coalesced = static_ok and not ( + skip_weight_cast is True and cast_noop_flag is None + ) + else: + use_coalesced = False + + if use_coalesced: + _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag) + else: + for w in weights: + w._quantize_if_needed(skip_weight_cast, cast_noop_flag) for w in weights: - w._quantize_if_needed(skip_weight_cast, cast_noop_flag) if w.did_cast_to_low_precision: w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) From c0538ec56465a19113d465c394e5bc315a1f1c1b Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Tue, 21 Apr 2026 07:40:52 -0700 Subject: [PATCH 28/43] remap quantized params Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 9012e8ab55..ea00d602b2 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -9,7 +9,6 @@ import math import re import torch -from contextlib import nullcontext from ..distributed import ( gather_along_first_dim, @@ -149,6 +148,7 @@ class ETPWeightState(Enum): # Global ETP buffer cache (persists across clear(); never set to None after creation). _ETP_CACHE = None +_ETP_PARAMS = [] # Global set of ETPShardedParam with in-flight async comms (AG or RS). 
_inflight_comm_params: set = set() @@ -403,6 +403,9 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): # register the newly sharded param back to the module module._parameters[name] = etp_shard + global _ETP_PARAMS + _ETP_PARAMS.append(etp_shard) + if is_grouped: allweights = [getattr(module, name) for name in weight_names] allweights[0].weight_list = allweights @@ -1350,6 +1353,29 @@ def reallocate_to_mempool(self, device, mempool): if key not in graphed_keys: new_pool[key] = buffers self._pool = new_pool + + # Now remap the quantized params: + torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) + for param in _ETP_PARAMS: + weights = param.weight_list if param.is_routed_expert and param.weight_list is not None else [param] + for w in weights: + if w.quantized is not None: + if isinstance(w.quantized, NVFP4TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) + w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) + elif isinstance(w.quantized, MXFP8TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + else: + assert False + torch._C._cuda_endAllocateToPool(device, mempool) + return def get_global_ETP_cache() -> ETPWeightCache: From 7949e509e694947b448cdd86dd36f15a8e684749 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 21 Apr 2026 19:48:32 -0700 Subject: [PATCH 29/43] 
=?UTF-8?q?remap=20quantized=20param=20patch:=20(1)?=
 =?UTF-8?q?=20Remap=20quantized=20params=20into=20the=20CG=20mempool=20?=
 =?UTF-8?q?=E2=80=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

but only for params on
the GRAPHED chain. UNGRAPHED-chain params (embedding, output_layer,
and MoE paths whose scope is not captured) run eagerly and don't
need their quantized storage in the CG mempool.

(2) _ETP_PARAMS already contains every individual expert (appended per
weight_name in wrap_module_params_etp), so iterate it directly — no
weight_list unroll needed.
---
 .../module/extended_tensor_parallelism.py     | 42 ++++++++++---------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
index ea00d602b2..69ffc1b7b9 100644
--- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py
+++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
@@ -1354,26 +1354,30 @@ def reallocate_to_mempool(self, device, mempool):
                 new_pool[key] = buffers
         self._pool = new_pool
 
-        # Now remap the quantized params:
+        # Remap quantized params into the CG mempool — but only for params on
+        # the GRAPHED chain. UNGRAPHED-chain params (embedding, output_layer,
+        # and MoE paths whose scope is not captured) run eagerly and don't
+        # need their quantized storage in the CG mempool.
torch._C._cuda_beginAllocateCurrentThreadToPool(device, mempool) - for param in _ETP_PARAMS: - weights = param.weight_list if param.is_routed_expert and param.weight_list is not None else [param] - for w in weights: - if w.quantized is not None: - if isinstance(w.quantized, NVFP4TensorStorage): - w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) - w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) - w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) - w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) - w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) - w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) - elif isinstance(w.quantized, MXFP8TensorStorage): - w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) - w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) - w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) - w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) - else: - assert False + for w in _ETP_PARAMS: + if getattr(w, "chain_id", ETPChain.GRAPHED.value) != ETPChain.GRAPHED.value: + continue + if w.quantized is None: + continue + if isinstance(w.quantized, NVFP4TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv) + w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv) + w.quantized._amax_columnwise = torch.clone(w.quantized._amax_columnwise) + w.quantized._amax_rowwise = torch.clone(w.quantized._amax_rowwise) + elif isinstance(w.quantized, MXFP8TensorStorage): + w.quantized._rowwise_data = torch.clone(w.quantized._rowwise_data) + w.quantized._columnwise_data = torch.clone(w.quantized._columnwise_data) + 
w.quantized._rowwise_scale_inv = torch.clone(w.quantized._rowwise_scale_inv)
+            w.quantized._columnwise_scale_inv = torch.clone(w.quantized._columnwise_scale_inv)
+        else:
+            assert False
         torch._C._cuda_endAllocateToPool(device, mempool)
         return

From fcef5903d215a138485d93a70e9a50a88b9dde4a Mon Sep 17 00:00:00 2001
From: Shiqing Fan
Date: Wed, 22 Apr 2026 02:46:11 -0700
Subject: [PATCH 30/43] ETP+CG: launch etp async rs on rs_stream to match
 issue-site invariant.

---
 .../pytorch/module/extended_tensor_parallelism.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
index 69ffc1b7b9..cb823f8680 100644
--- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py
+++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py
@@ -1116,8 +1116,16 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None):
         poolable = self.chain_id == ETPChain.UNGRAPHED.value
 
         if ETP_CONFIG.weight_prefetch and self.prev_w is not None:
-            # Async reduce-scatter (not last weight — deferred finish)
-            _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label)
+            # Async reduce-scatter (not last weight — deferred finish). Issue on rs_stream to
+            # match wait-site (issue-site invariant; see _wait_param_gather). wgrad is produced
+            # on outer stream by bwd GEMM, so sync outer → rs_stream first.
+ outer_stream = torch.cuda.current_stream() + rs_stream = get_rs_stream(self.chain_id, self.group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + rs_stream.wait_event(outer_sync_event) + with torch.cuda.stream(rs_stream): + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) # Stash wgrad input buffers — cannot recycle yet because the async RS # kernel is still reading them on rs_stream. From cd88d3b76ed0be0795e5c910a1f8aaddf04fce35 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 03:47:18 -0700 Subject: [PATCH 31/43] ETP+NVFP4: fused multi-tensor amax kernel Replaces the per-expert (zero_amax + amax + D2D amax replicate) chain in the ETP coalesced-amax path with a pair of multi-tensor kernel launches. The compute kernel writes rowwise and columnwise amax directly (atomicMaxFloat), eliminating the per-expert D2D copy. --- transformer_engine/common/CMakeLists.txt | 1 + .../include/transformer_engine/recipe.h | 20 ++ .../common/recipe/multi_amax.cu | 274 ++++++++++++++++++ transformer_engine/pytorch/csrc/extensions.h | 8 + .../pytorch/csrc/extensions/cast.cpp | 76 +++++ .../pytorch/csrc/extensions/pybind.cpp | 3 + .../module/extended_tensor_parallelism.py | 33 ++- 7 files changed, 410 insertions(+), 5 deletions(-) create mode 100644 transformer_engine/common/recipe/multi_amax.cu diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a105a0343f..032d635e61 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -163,6 +163,7 @@ list(APPEND transformer_engine_cuda_sources recipe/current_scaling.cu recipe/delayed_scaling.cu recipe/fp8_block_scaling.cu + recipe/multi_amax.cu comm_gemm_overlap/userbuffers/userbuffers.cu) list(APPEND transformer_engine_cuda_arch_specific_sources diff --git 
a/transformer_engine/common/include/transformer_engine/recipe.h b/transformer_engine/common/include/transformer_engine/recipe.h index cad27a2992..2244056823 100644 --- a/transformer_engine/common/include/transformer_engine/recipe.h +++ b/transformer_engine/common/include/transformer_engine/recipe.h @@ -99,6 +99,26 @@ void nvte_compute_amax(const NVTETensor input, NVTETensor output, cudaStream_t s void nvte_compute_amax_with_config(const NVTETensor input, NVTETensor output, const NVTEQuantizationConfig config, cudaStream_t stream); +/*! \brief Compute amax for a list of independent tensors in a single kernel launch. + * + * Unlike nvte_group_amax (which requires a single contiguous input split along dim 0), + * this API accepts arrays of independent input tensors, each with its own allocation. + * Designed for the ETP grouped-experts case where per-expert weights live in separate + * buffers. For each i in [0, num_tensors), computes amax(inputs[i]) and writes it to + * outputs[i]'s amax buffer. outputs[i] must be an FP8 per-tensor scaling or NVFP4 1D + * scaling tensor. All inputs must share the same dtype. If the list exceeds the + * per-launch batch capacity, it is internally chunked. + * + * \param[in] inputs Array of input tensors (unquantized). Size num_tensors. + * \param[in,out] outputs Array of output tensors. Only the amax is updated. + * Size num_tensors. + * \param[in] num_tensors Number of tensors. + * \param[in] config Quantization configuration (for noop_tensor). May be NULL. + * \param[in] stream CUDA stream used for the operation. + */ +void nvte_multi_compute_amax(const NVTETensor *inputs, NVTETensor *outputs, size_t num_tensors, + const NVTEQuantizationConfig config, cudaStream_t stream); + /*! \brief Update an FP8 tensor's scale based on its amax. * * This is only supported for FP8 tensors with per-tensor scaling. 
diff --git a/transformer_engine/common/recipe/multi_amax.cu b/transformer_engine/common/recipe/multi_amax.cu new file mode 100644 index 0000000000..5420dde587 --- /dev/null +++ b/transformer_engine/common/recipe/multi_amax.cu @@ -0,0 +1,274 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include + +#include +#include + +#include "../common.h" +#include "../util/logging.h" +#include "../util/vectorized_pointwise.h" +#include "recipe_common.cuh" + +namespace transformer_engine { +namespace { + +constexpr int multi_amax_kernel_threads = 512; +// Per-launch capacity. kMaxTensorsPerBatch * ~40 bytes per slot keeps the args +// struct within the 4KB kernel parameter limit with comfortable headroom. +constexpr int kMaxTensorsPerBatch = 64; + +struct MultiAmaxArgs { + const void *input_list[kMaxTensorsPerBatch]; + void *output_rowwise_amax_list[kMaxTensorsPerBatch]; + void *output_columnwise_amax_list[kMaxTensorsPerBatch]; + size_t input_numel[kMaxTensorsPerBatch]; + size_t num_aligned_elements[kMaxTensorsPerBatch]; + int num_tensors; +}; + +// Zero out every output amax slot (rowwise + columnwise, deduped) in a single launch. +// Respects the noop_ptr contract shared with the single-tensor amax path. 
+__launch_bounds__(multi_amax_kernel_threads) __global__ + void MultiZeroAmaxKernel(MultiAmaxArgs args, const float *noop_ptr) { + if (noop_ptr != nullptr && noop_ptr[0] == 1.0f) { + return; + } + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < args.num_tensors; tid += stride) { + float *rw = static_cast(args.output_rowwise_amax_list[tid]); + float *cw = static_cast(args.output_columnwise_amax_list[tid]); + if (rw != nullptr) { + *rw = 0.0f; + } + if (cw != nullptr && cw != rw) { + *cw = 0.0f; + } + } +} + +// Per-tensor amax with one block-strip per tensor. blockIdx.y selects the +// tensor; blockIdx.x is the work chunk within that tensor. Each block +// vector-loads the tensor, reduces across threads, and atomicMaxFloats the +// result into BOTH output amax slots (rowwise + columnwise, deduped). This +// subsumes the per-expert D2D copy that the single-tensor path does after the +// amax kernel. +template +__launch_bounds__(multi_amax_kernel_threads) __global__ + void MultiAmaxKernel(MultiAmaxArgs args, const float *noop_ptr) { + if (noop_ptr != nullptr && noop_ptr[0] == 1.0f) { + return; + } + + const int t_idx = blockIdx.y; + if (t_idx >= args.num_tensors) { + return; + } + + const InputType *input = static_cast(args.input_list[t_idx]); + const size_t N = args.input_numel[t_idx]; + if (N == 0) { + return; + } + const size_t M = args.num_aligned_elements[t_idx]; + + VectorizedLoader loader(input, N); + InputType max = InputType{0.f}; + const int warp_id = threadIdx.x / THREADS_PER_WARP; + + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const InputType val = static_cast(loader.separate()[i]); + __builtin_assume(max >= InputType{0.f}); + if constexpr (std::is_same_v) { +#if __CUDA_ARCH__ >= 800 + max = __hmax(__habs(val), max); +#else + max = static_cast<__nv_bfloat16>( + 
fmaxf(fabsf(static_cast(val)), static_cast(max))); +#endif + } else if constexpr (std::is_same_v) { + max = __hmax(__habs(val), max); + } else { + max = fmaxf(fabsf(val), max); + } + } + } + + // Reduce amax over block. + max = reduce_max(max, warp_id); + if (threadIdx.x == 0) { + float *rw = static_cast(args.output_rowwise_amax_list[t_idx]); + float *cw = static_cast(args.output_columnwise_amax_list[t_idx]); + if (rw != nullptr) { + atomicMaxFloat(rw, static_cast(max)); + } + if (cw != nullptr && cw != rw) { + atomicMaxFloat(cw, static_cast(max)); + } + } +} + +template +void launch_multi_amax_batch(const MultiAmaxArgs &args, size_t max_numel, Alignment align, + const float *noop_ptr, cudaStream_t stream) { + // Zero all amax outputs in one launch. + { + constexpr int threads = multi_amax_kernel_threads; + const int num_blocks = std::max(1, DIVUP(args.num_tensors, threads)); + MultiZeroAmaxKernel<<>>(args, noop_ptr); + NVTE_CHECK_CUDA(cudaGetLastError()); + } + + if (max_numel == 0) { + return; + } + + // Grid: y = tensor index, x = work chunks within the largest tensor. Blocks + // that exceed a shorter tensor's aligned element count bail out via the + // bounds check inside the kernel. + constexpr int nvec = 32 / sizeof(InputType); + constexpr size_t threads = multi_amax_kernel_threads; + const size_t max_aligned = (max_numel + nvec - 1) / nvec; + size_t num_blocks_x = DIVUP(max_aligned, threads); + constexpr size_t max_blocks = 65535; + num_blocks_x = std::min(num_blocks_x, max_blocks); + num_blocks_x = std::max(num_blocks_x, 1); + dim3 grid(num_blocks_x, static_cast(args.num_tensors), 1); + + switch (align) { + case Alignment::SAME_ALIGNED: + MultiAmaxKernel + <<>>(args, noop_ptr); + break; + case Alignment::SAME_UNALIGNED: + MultiAmaxKernel + <<>>(args, noop_ptr); + break; + case Alignment::DIFFERENT: + // Heterogeneous alignment across tensors — fall back to nvec=1, aligned=true path + // which is safe for any pointer alignment. 
+ MultiAmaxKernel<1, true, InputType> + <<>>(args, noop_ptr); + break; + } + NVTE_CHECK_CUDA(cudaGetLastError()); +} + +// Fill one MultiAmaxArgs batch from a slice of the full input/output list. +// Returns (max_numel in this batch, worst-case alignment across the batch). +template +std::pair build_batch_args(const std::vector &inputs, + const std::vector &outputs, size_t start, + size_t count, MultiAmaxArgs &args) { + constexpr int nvec = 32 / sizeof(InputType); + size_t max_numel = 0; + // SAME_ALIGNED is the most optimistic; degrade to SAME_UNALIGNED if any + // tensor is merely same-layout but unaligned, to DIFFERENT if alignment + // varies across tensors. + Alignment batch_align = Alignment::SAME_ALIGNED; + for (size_t i = 0; i < count; ++i) { + const Tensor &inp = *inputs[start + i]; + Tensor &out = *outputs[start + i]; + const size_t N = inp.data.numel(); + void *rw_ptr = out.amax.dptr; + void *cw_ptr = out.columnwise_amax.dptr; + + args.input_list[i] = inp.data.dptr; + args.output_rowwise_amax_list[i] = rw_ptr; + args.output_columnwise_amax_list[i] = cw_ptr; + args.input_numel[i] = N; + args.num_aligned_elements[i] = get_num_aligned_elements(inp.data.dptr, N, nvec, + sizeof(InputType)); + max_numel = std::max(max_numel, N); + + // Fold this tensor's alignment into the batch decision. CheckAlignment on a + // single pointer yields SAME_ALIGNED or SAME_UNALIGNED; mixing the two across + // tensors means heterogeneous — switch to the DIFFERENT fall-back. 
+ if (N > 0) { + Alignment a = CheckAlignment(N, nvec, static_cast(inp.data.dptr)); + if (batch_align == Alignment::SAME_ALIGNED && a == Alignment::SAME_UNALIGNED) { + batch_align = Alignment::SAME_UNALIGNED; + } else if (batch_align == Alignment::SAME_UNALIGNED && a == Alignment::SAME_ALIGNED) { + batch_align = Alignment::SAME_UNALIGNED; + } else if (a == Alignment::DIFFERENT) { + batch_align = Alignment::DIFFERENT; + } + } + } + args.num_tensors = static_cast(count); + return {max_numel, batch_align}; +} + +void multi_compute_amax_impl(const NVTETensor *inputs_, NVTETensor *outputs_, size_t num_tensors, + const NVTEQuantizationConfig config_, cudaStream_t stream) { + if (num_tensors == 0) { + return; + } + NVTE_CHECK(inputs_ != nullptr, "nvte_multi_compute_amax: inputs is NULL"); + NVTE_CHECK(outputs_ != nullptr, "nvte_multi_compute_amax: outputs is NULL"); + + // Convert, validate, collect into plain vectors. + std::vector inputs(num_tensors); + std::vector outputs(num_tensors); + DType input_dtype; + for (size_t i = 0; i < num_tensors; ++i) { + inputs[i] = convertNVTETensorCheck(inputs_[i]); + outputs[i] = convertNVTETensorCheck(outputs_[i]); + const auto &inp = *inputs[i]; + auto &out = *outputs[i]; + NVTE_CHECK(inp.scaling_mode == NVTE_DELAYED_TENSOR_SCALING, + "nvte_multi_compute_amax: input[", i, + "] must be unquantized, got scaling_mode=", to_string(inp.scaling_mode)); + NVTE_CHECK(!is_fp8_dtype(inp.data.dtype), + "nvte_multi_compute_amax: input[", i, + "] must be unquantized, got dtype=", to_string(inp.data.dtype)); + if (i == 0) { + input_dtype = inp.data.dtype; + } else { + NVTE_CHECK(inp.data.dtype == input_dtype, + "nvte_multi_compute_amax: all inputs must share dtype; input[0]=", + to_string(input_dtype), ", input[", i, "]=", to_string(inp.data.dtype)); + } + NVTE_CHECK(out.scaling_mode == NVTE_DELAYED_TENSOR_SCALING || + out.scaling_mode == NVTE_NVFP4_1D_SCALING, + "nvte_multi_compute_amax: output[", i, "] must be FP8 per-tensor or NVFP4 1D"); + 
NVTE_CHECK(out.amax.dptr != nullptr || out.columnwise_amax.dptr != nullptr, + "nvte_multi_compute_amax: output[", i, "] has no amax buffer"); + } + + const float *noop_ptr = nullptr; + if (config_ != nullptr) { + const QuantizationConfig *config_cpp = reinterpret_cast(config_); + const NVTETensor noop = config_cpp->noop_tensor; + noop_ptr = reinterpret_cast( + (noop != nullptr ? convertNVTETensorCheck(noop)->data.dptr : nullptr)); + } + + // Chunk across kMaxTensorsPerBatch launches (single launch in the common 8-expert case). + TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(input_dtype, IType, { + for (size_t start = 0; start < num_tensors; start += kMaxTensorsPerBatch) { + const size_t count = std::min(kMaxTensorsPerBatch, num_tensors - start); + MultiAmaxArgs args = {}; + auto [max_numel, batch_align] = build_batch_args(inputs, outputs, start, count, args); + launch_multi_amax_batch(args, max_numel, batch_align, noop_ptr, stream); + } + }); // NOLINT(*) +} + +} // anonymous namespace +} // namespace transformer_engine + +void nvte_multi_compute_amax(const NVTETensor *inputs, NVTETensor *outputs, size_t num_tensors, + const NVTEQuantizationConfig config, cudaStream_t stream) { + NVTE_API_CALL(nvte_multi_compute_amax); + transformer_engine::multi_compute_amax_impl(inputs, outputs, num_tensors, config, stream); +} diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index fc26d025c4..c9b5674426 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -292,6 +292,14 @@ py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantiz const py::object &output, std::optional noop_flag); +// NVFP4-only multi-tensor amax: fuses N per-expert (zero_amax + amax + D2D replicate) +// chains into a single pair of kernel launches (one multi-zero + one multi-amax) that +// writes amax into every output's rowwise AND columnwise buffers. 
Outputs must be
+// pre-allocated; amax is written in place, no return.
+void compute_multi_amax_nvfp4(const std::vector<at::Tensor> &tensor_list,
+                              std::vector<py::handle> quantizer_list,
+                              const std::vector<py::object> &output_list);
+
 py::object dequantize(const py::handle &input, DType otype);

 py::object group_quantize(const at::Tensor &tensor, py::handle quantizer, const size_t num_tensors,
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
index 2c261c3c6d..e2602ed133 100644
--- a/transformer_engine/pytorch/csrc/extensions/cast.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -146,6 +146,82 @@ py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantiz
   return output_py;
 }

+/*! @brief NVFP4-only: compute amax for N input tensors in a single launch.
+ *
+ * Each output's rowwise AND columnwise amax buffers are populated directly by the
+ * kernel (atomicMaxFloat), fusing the per-expert zero_amax + amax_kernel + D2D
+ * replicate chain into two multi-tensor launches. Caller pairs this with an
+ * external coalesced allreduce and then N calls to quantize_cast_only_nvfp4.
+ *
+ * Amax is written into the outputs passed in via output_list; no return value is
+ * needed — caller already holds references to those objects.
+ */
+void compute_multi_amax_nvfp4(const std::vector<at::Tensor> &tensor_list,
+                              std::vector<py::handle> quantizer_list,
+                              const std::vector<py::object> &output_list) {
+  const size_t num_tensors = tensor_list.size();
+  NVTE_CHECK(num_tensors > 0, "compute_multi_amax_nvfp4 requires at least one tensor");
+  NVTE_CHECK(quantizer_list.size() == num_tensors,
+             "compute_multi_amax_nvfp4: quantizer_list size mismatch");
+  NVTE_CHECK(output_list.size() == num_tensors,
+             "compute_multi_amax_nvfp4: output_list size mismatch");
+
+  // Locals held for the duration of this call (destroyed at function return). 
+  // TensorWrappers only hold NVTETensor handles (opaque indexes into a global pool
+  // released by ~TensorWrapper); they do NOT reference quantizer_cpp or py::object,
+  // so we do not need to preserve quantizer unique_ptrs past this scope.
+  std::vector<at::Tensor> input_contiguous;
+  input_contiguous.reserve(num_tensors);
+  std::vector<TensorWrapper> input_wrappers;
+  input_wrappers.reserve(num_tensors);
+  std::vector<TensorWrapper> output_wrappers;
+  output_wrappers.reserve(num_tensors);
+
+  std::vector<NVTETensor> inputs_nvte;
+  std::vector<NVTETensor> outputs_nvte;
+  inputs_nvte.reserve(num_tensors);
+  outputs_nvte.reserve(num_tensors);
+
+  for (size_t i = 0; i < num_tensors; ++i) {
+    NVTE_CHECK(detail::IsNVFP4Quantizers(quantizer_list[i].ptr()),
+               "compute_multi_amax_nvfp4: quantizer[", i, "] is not an NVFP4Quantizer");
+    auto quantizer_cpp = convert_quantizer(quantizer_list[i]);
+    auto *nvfp4_quantizer = dynamic_cast<NVFP4Quantizer *>(quantizer_cpp.get());
+    NVTE_CHECK(nvfp4_quantizer != nullptr && !nvfp4_quantizer->with_rht,
+               "compute_multi_amax_nvfp4 requires NVFP4Quantizer with with_rht=false (idx=", i,
+               ")");
+
+    input_contiguous.emplace_back(tensor_list[i].contiguous());
+    input_wrappers.emplace_back(makeTransformerEngineTensor(input_contiguous.back()));
+
+    TensorWrapper out_cpp;
+    py::object out_py;
+    NVTE_CHECK(!output_list[i].is_none(),
+               "compute_multi_amax_nvfp4: output_list[", i, "] is None; caller must pre-allocate");
+    std::tie(out_cpp, out_py) = quantizer_cpp->convert_and_update_tensor(output_list[i]);
+
+    NVTE_CHECK(out_cpp.get_amax().data_ptr != nullptr ||
+                   out_cpp.get_columnwise_amax().data_ptr != nullptr,
+               "compute_multi_amax_nvfp4: output[", i, "] has no amax buffer");
+
+    output_wrappers.emplace_back(std::move(out_cpp));
+    // quantizer_cpp and out_py are released here at end-of-iteration. 
+ + if (input_wrappers.back().numel() == 0) continue; + inputs_nvte.push_back(input_wrappers.back().data()); + outputs_nvte.push_back(output_wrappers.back().data()); + } + + if (inputs_nvte.empty()) return; + + QuantizationConfigWrapper quant_config; + auto stream = at::cuda::getCurrentCUDAStream(); + NVTE_SCOPED_GIL_RELEASE({ + nvte_multi_compute_amax(inputs_nvte.data(), outputs_nvte.data(), inputs_nvte.size(), + quant_config, stream); + }); +} + namespace { // helper functions for NVFP4 grouped quantization (cuda graph safe with shapes stored in device without D2H copy) diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index daee89a038..2a9281bc78 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -144,6 +144,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "NVFP4: cast using pre-reduced amax in output's amax buffers; skips amax compute and allreduce", py::arg("tensor"), py::arg("quantizer"), py::arg("output") = py::none(), py::arg("noop") = py::none()); + m.def("compute_multi_amax_nvfp4", transformer_engine::pytorch::compute_multi_amax_nvfp4, + "NVFP4: fused multi-tensor amax compute (writes both rowwise+columnwise amax per output)", + py::arg("tensor_list"), py::arg("quantizer_list"), py::arg("output_list")); m.def("dequantize", &transformer_engine::pytorch::dequantize, "Dequantize", py::arg("input"), py::arg("otype")); m.def("group_quantize", transformer_engine::pytorch::group_quantize, py::arg("tensor"), diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index cb823f8680..6c1297da28 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -238,6 +238,10 @@ def get_rs_streams_for_chain(chain_id: str) -> list: hasattr(tex, 
"compute_amax_nvfp4") and hasattr(tex, "quantize_cast_only_nvfp4") ) +# Tier-2: multi-tensor amax kernel fuses N per-expert (zero_amax + amax + D2D) chains +# into two multi-tensor kernel launches. Independent of Tier-1 coalesced allreduce. +_MULTI_AMAX_TE_API_AVAILABLE = hasattr(tex, "compute_multi_amax_nvfp4") + def _coalesced_amax_static_eligible(weights): """Walk the weight list once and decide whether the coalesced-amax path @@ -281,13 +285,32 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): # Phase 1: per-weight local amax into each w.quantized's amax buffers. # Keep rowwise/columnwise both populated so the group allreduce sees # whichever the consumer GEMM will read. - for w, shard in zip(weights, padded_shards): + for w in weights: w._quantizer.set_usage(rowwise=True, columnwise=True) - tex.compute_amax_nvfp4( - tensor=shard, - quantizer=w._quantizer, - output=w.quantized, + if _MULTI_AMAX_TE_API_AVAILABLE: + # Tier-2: single multi-tensor launch writes both rowwise and columnwise + # amax directly (no per-expert D2D replicate), fusing N per-expert chains. + # w._quantizer is set once by _configure_quantizer and never rebinds, so + # cache the list on weights[0] alongside _coalesced_amax_static. Output + # list is NOT cached because w.quantized can rebind if the weight is + # re-quantized externally. + anchor = weights[0] + quantizer_list = getattr(anchor, "_multi_amax_quantizer_list", None) + if quantizer_list is None: + quantizer_list = [w._quantizer for w in weights] + anchor._multi_amax_quantizer_list = quantizer_list + tex.compute_multi_amax_nvfp4( + padded_shards, + quantizer_list, + [w.quantized for w in weights], ) + else: + for w, shard in zip(weights, padded_shards): + tex.compute_amax_nvfp4( + tensor=shard, + quantizer=w._quantizer, + output=w.quantized, + ) # Phase 2: one coalesced allreduce across every weight's amax tensors. 
amax_tensors = [] From f7a08f792aa70941ab3a22077128de2a84809039 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 18:36:16 -0700 Subject: [PATCH 32/43] ETP: cache hot-path lookups in ETP to reduce python overhead. Changes: - Lazy-cache ag_stream / rs_stream on self (resolved once from chain_id + group; prior path hit a dict lookup every call). - Cache quantizers / dtypes / etp_group on the anchor weight (rebuilt via list comprehensions on every _all_gather_weight call). - Consolidate _multi_amax_quantizer_list into _cached_quantizers (single cache shared between Tier-2 amax and _all_gather_weight). - Gate the duplicate-output-buffer assertion in batched AG behind ETP_CONFIG.check_param_states (was running O(N) per call). - Drop a dead `out_buffers is not None` check (always a list) --- .../module/extended_tensor_parallelism.py | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 6c1297da28..853e8ad907 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -290,15 +290,12 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): if _MULTI_AMAX_TE_API_AVAILABLE: # Tier-2: single multi-tensor launch writes both rowwise and columnwise # amax directly (no per-expert D2D replicate), fusing N per-expert chains. - # w._quantizer is set once by _configure_quantizer and never rebinds, so - # cache the list on weights[0] alongside _coalesced_amax_static. Output - # list is NOT cached because w.quantized can rebind if the weight is - # re-quantized externally. 
+ # Reuse the _cached_quantizers list already populated by _all_gather_weight anchor = weights[0] - quantizer_list = getattr(anchor, "_multi_amax_quantizer_list", None) + quantizer_list = anchor._cached_quantizers if quantizer_list is None: quantizer_list = [w._quantizer for w in weights] - anchor._multi_amax_quantizer_list = quantizer_list + anchor._cached_quantizers = quantizer_list tex.compute_multi_amax_nvfp4( padded_shards, quantizer_list, @@ -553,6 +550,13 @@ def __init__(self, x, *args, **kwargs): self.pad_length = 0 # Debug self._debug_name = "" + # Hot-path caches (populated lazily on first use). chain_id/group are + # set after __init__, so we can't resolve streams eagerly here. + self._cached_ag_stream = None + self._cached_rs_stream = None + self._cached_quantizers = None + self._cached_dtypes = None + self._cached_etp_group = None def setup(self, weight_quantizer=None): """Set quantizer and create quantized shard.""" @@ -757,7 +761,13 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) # 3. Build gather inputs. - quantizers = [w._quantizer for w in weights] + # quantizers / dtypes / etp_group are stable after model construction — + # cache on the anchor (self == weights[0]) to avoid rebuilding lists + # every call. w.quantized is NOT cached because it can rebind. + quantizers = self._cached_quantizers + if quantizers is None: + quantizers = [w._quantizer for w in weights] + self._cached_quantizers = quantizers if weights[0].did_cast_to_low_precision: gather_weights = [w.quantized for w in weights] else: @@ -765,7 +775,10 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv # 4. Cache checkout — use pooled buffers for both async and sync gathers # to avoid allocating fresh memory each iteration. 
- dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + dtypes = self._cached_dtypes + if dtypes is None: + dtypes = [q.dtype if q is not None else w.dtype for q, w in zip(quantizers, weights)] + self._cached_dtypes = dtypes out_buffers = [] cache = get_global_ETP_cache() for p, dt in zip(weights, dtypes): @@ -781,8 +794,12 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv out_buffers.append(cache.get(p._ag_ticket_bwd)) # 5. Communicate. - etp_group = weights[0].group - if out_buffers is not None and len(gather_weights) > 1: + etp_group = self._cached_etp_group + if etp_group is None: + etp_group = weights[0].group + self._cached_etp_group = etp_group + if ETP_CONFIG.check_param_states and len(gather_weights) > 1: + # Debug invariant: batched AG needs distinct output buffers per expert. assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ "Duplicate output buffers in batched all-gather — experts need distinct cache keys" @@ -828,7 +845,11 @@ def _wait_param_gather(self): # wait on NCCL's completion event into ag_stream; ag_event.record() # then marks ag_stream for consumers (main_stream via ag_event.wait # or MLM drains via main.wait_stream). - with torch.cuda.stream(get_ag_stream(self.chain_id, self.group)): + ag_stream = self._cached_ag_stream + if ag_stream is None: + ag_stream = get_ag_stream(self.chain_id, self.group) + self._cached_ag_stream = ag_stream + with torch.cuda.stream(ag_stream): if self._prefetch_handle is not None: self._prefetch_handle.wait() self._prefetch_handle = None @@ -1052,7 +1073,11 @@ def _wait_reduce_scatter(self): # (not rs_stream) because main produced the RS input (wgrad) and # naturally holds the write→read ordering. Wait-site enters rs_stream # so it observes NCCL completion and rs_event marks it for consumers. 
- with torch.cuda.stream(get_rs_stream(self.chain_id, self.group)): + rs_stream = self._cached_rs_stream + if rs_stream is None: + rs_stream = get_rs_stream(self.chain_id, self.group) + self._cached_rs_stream = rs_stream + with torch.cuda.stream(rs_stream): if self._wgrad_rs_handle is not None: self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None From 9f614f4dd156758035eed67156099b89688ca9cc Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Thu, 23 Apr 2026 18:37:19 -0700 Subject: [PATCH 33/43] ETP: disable check_param_states by default --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 853e8ad907..10f5c08479 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -340,7 +340,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = True + check_param_states: bool = False weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert From eace39d5ee15a3f2a21d77006e17c7a2ca7cc0ef Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 24 Apr 2026 07:03:02 -0700 Subject: [PATCH 34/43] ETP divergence fix: revert async AG issue-site stream wrapper from 7cc86fd7 The `with torch.cuda.stream(target.ag_stream):` wrapper re-routed NCCL's preEvent onto an idle stream, so the AG raced the caller-stream writer (quantize / sharded-weight update). Issue now on caller's stream; _wait_param_gather keeps ag_stream. Verified: 5000+ steps clean on TP2ETP2_EP2EETP2 nvfp4, 1/4 Ultra, 32xGB200. 
--- .../module/extended_tensor_parallelism.py | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 10f5c08479..a45aa4efcc 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -834,17 +834,8 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Wait-site for the async AG. Issuer (all_gather_and_prefetch{,_bwd}) - # and wait-site both use the TARGET's ag_stream so the caller-stream - # "preEvent" PyTorch records at issue time lives on an idle stream. - # A busy issue-stream would queue the preEvent behind pending work, - # delay NCCL start, and — even with the sync chain main ← ag_event ← - # ag_stream handle.wait() ← NCCL endEvent — leave the consumer GEMM - # reading a partial AG buffer. (NCCL kernel itself runs on PyTorch's - # per-PG ncclStream, not ag_stream.) handle.wait() here inserts the - # wait on NCCL's completion event into ag_stream; ag_event.record() - # then marks ag_stream for consumers (main_stream via ag_event.wait - # or MLM drains via main.wait_stream). + # Wait on ag_stream so ag_event.record() marks ag_stream's tail — + # MLM's external drains (wait_stream(ag_stream)) need that to block. ag_stream = self._cached_ag_stream if ag_stream is None: ag_stream = get_ag_stream(self.chain_id, self.group) @@ -922,13 +913,14 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): and self.prev_w._need_weight_prefetch and self.prev_w._need_weight_prefetch_bwd ): - # Issue on the target's ag_stream (see _wait_param_gather). 
- target_stream = get_ag_stream(self.prev_w.chain_id, self.prev_w.group) - with torch.cuda.stream(target_stream): - _, handle = self.prev_w._all_gather_weight( - async_op=True, skip_weight_cast=True, cast_noop_flag=None, - fwd=False, nvtx_label=nvtx_label, - ) + # Issue on caller's stream — preEvent then captures the AG input + # writer via program order. Do NOT wrap in torch.cuda.stream(ag_stream): + # that drops the writer edge (ag_stream's tail has no dependency + # on capture_stream's writer) and NCCL reads partial data. + _, handle = self.prev_w._all_gather_weight( + async_op=True, skip_weight_cast=True, cast_noop_flag=None, + fwd=False, nvtx_label=nvtx_label, + ) self.prev_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -971,15 +963,13 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - # Issue on the target's ag_stream (see _wait_param_gather). - target_stream = get_ag_stream(self.next_w.chain_id, self.next_w.group) - with torch.cuda.stream(target_stream): - _, handle = self.next_w._all_gather_weight( - async_op=True, - skip_weight_cast=skip_weight_cast, - cast_noop_flag=cast_noop_flag, - fwd=fwd, nvtx_label=nvtx_label, - ) + # Issue on caller's stream. See all_gather_and_prefetch_bwd. + _, handle = self.next_w._all_gather_weight( + async_op=True, + skip_weight_cast=skip_weight_cast, + cast_noop_flag=cast_noop_flag, + fwd=fwd, nvtx_label=nvtx_label, + ) self.next_w._prefetch_handle = handle # The unsharded tensor has been returned, no pending work so reset state to NONE @@ -1069,10 +1059,7 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # Asymmetric wrt _wait_param_gather: RS is issued from main_stream - # (not rs_stream) because main produced the RS input (wgrad) and - # naturally holds the write→read ordering. Wait-site enters rs_stream - # so it observes NCCL completion and rs_event marks it for consumers. 
+ # Wait on rs_stream — mirrors _wait_param_gather for the RS path. rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1164,9 +1151,12 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: - # Async reduce-scatter (not last weight — deferred finish). Issue on rs_stream to - # match wait-site (issue-site invariant; see _wait_param_gather). wgrad is produced - # on outer stream by bwd GEMM, so sync outer → rs_stream first. + # Async reduce-scatter (not last weight — deferred finish). Issue on + # rs_stream with an explicit outer→rs_stream event so the bwd GEMM's + # wgrad writer edge is preserved. (NCCL runs on ncclStream regardless; + # the wrap only gives wait_stream(rs_stream) a useful tail before + # _wait_reduce_scatter runs. Do NOT copy this pattern without the + # event — see all_gather_and_prefetch_bwd.) outer_stream = torch.cuda.current_stream() rs_stream = get_rs_stream(self.chain_id, self.group) outer_sync_event = torch.cuda.Event() From 4ece70558cf590d8e3858b11126ef2ff34e576f5 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Fri, 24 Apr 2026 21:17:55 -0700 Subject: [PATCH 35/43] ETP+CG: fix for flaky NaN issue at scale: async AG/RS issue on side streams with explicit producer event --- .../module/extended_tensor_parallelism.py | 149 +++++++++++------- 1 file changed, 89 insertions(+), 60 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a45aa4efcc..884d055ed7 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -3,6 +3,7 @@ # See LICENSE for license information. 
from collections import defaultdict +from contextlib import nullcontext from typing import Dict, List, Optional from enum import Enum from dataclasses import dataclass, field @@ -340,7 +341,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = False + check_param_states: bool = True weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert @@ -803,25 +804,43 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv assert len(set(id(b) for b in out_buffers)) == len(out_buffers), \ "Duplicate output buffers in batched all-gather — experts need distinct cache keys" - if len(gather_weights) > 1: - nvtx_range_push(f"{nvtx_label}.batched_etp_ag") - results, handle = grouped_gather_along_first_dim( - gather_weights, etp_group, - async_op=async_op, - quantizers=quantizers, - output_tensors=out_buffers, - ) - nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + # ASYNC AG: wrap issue on ag_stream so both issue (NCCL preEvent) and + # wait land on the same stream — ag_stream's tail then reflects the + # collective's full lifecycle, which is what external + # wait_stream(ag_stream) drains depend on. Explicit outer→ag_stream + # event preserves the quantize writer edge (a bare stream context + # would drop it). + # SYNC AG: stay on caller — output ready on return. 
+ if async_op: + outer_stream = torch.cuda.current_stream() + ag_stream = get_ag_stream(self.chain_id, etp_group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + ag_stream.wait_event(outer_sync_event) + ag_ctx = torch.cuda.stream(ag_stream) else: - nvtx_range_push(f"{nvtx_label}.etp_ag") - weight_total, handle = gather_along_first_dim( - gather_weights[0], etp_group, - quantizer=quantizers[0], - async_op=async_op, - output_tensor=out_buffers[0] if out_buffers is not None else None, - ) - nvtx_range_pop(f"{nvtx_label}.etp_ag") - results = [weight_total] + ag_ctx = nullcontext() + + with ag_ctx: + if len(gather_weights) > 1: + nvtx_range_push(f"{nvtx_label}.batched_etp_ag") + results, handle = grouped_gather_along_first_dim( + gather_weights, etp_group, + async_op=async_op, + quantizers=quantizers, + output_tensors=out_buffers, + ) + nvtx_range_pop(f"{nvtx_label}.batched_etp_ag") + else: + nvtx_range_push(f"{nvtx_label}.etp_ag") + weight_total, handle = gather_along_first_dim( + gather_weights[0], etp_group, + quantizer=quantizers[0], + async_op=async_op, + output_tensor=out_buffers[0] if out_buffers is not None else None, + ) + nvtx_range_pop(f"{nvtx_label}.etp_ag") + results = [weight_total] result = results if self.is_routed_expert else results[0] @@ -834,8 +853,10 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv return result, handle def _wait_param_gather(self): - # Wait on ag_stream so ag_event.record() marks ag_stream's tail — - # MLM's external drains (wait_stream(ag_stream)) need that to block. + # Enter ag_stream context so handle.wait() + ag_event.record() both + # land on ag_stream. That makes ag_event mark ag_stream's tail, which + # is what external drains (wait_stream(ag_stream) in finalize_model_grads + # and cuda_graphs._wait_side_streams) actually block on. 
ag_stream = self._cached_ag_stream if ag_stream is None: ag_stream = get_ag_stream(self.chain_id, self.group) @@ -913,10 +934,9 @@ def all_gather_and_prefetch_bwd(self, nvtx_label=None): and self.prev_w._need_weight_prefetch and self.prev_w._need_weight_prefetch_bwd ): - # Issue on caller's stream — preEvent then captures the AG input - # writer via program order. Do NOT wrap in torch.cuda.stream(ag_stream): - # that drops the writer edge (ag_stream's tail has no dependency - # on capture_stream's writer) and NCCL reads partial data. + # Pre-AG work (quantize, ticket lookup) runs on caller's stream; + # the NCCL collective itself is wrapped on ag_stream inside + # _all_gather_weight (see the async/sync gate there for rationale). _, handle = self.prev_w._all_gather_weight( async_op=True, skip_weight_cast=True, cast_noop_flag=None, fwd=False, nvtx_label=nvtx_label, @@ -963,7 +983,8 @@ def all_gather_and_prefetch( and self.next_w is not None and self.next_w._need_weight_prefetch ): - # Issue on caller's stream. See all_gather_and_prefetch_bwd. + # Pre-AG work on caller; NCCL wrap lives at the collective site + # inside _all_gather_weight. See all_gather_and_prefetch_bwd. _, handle = self.next_w._all_gather_weight( async_op=True, skip_weight_cast=skip_weight_cast, @@ -1059,7 +1080,8 @@ def _finalize_wgrad(param, wgrad_rs): def _wait_reduce_scatter(self): - # Wait on rs_stream — mirrors _wait_param_gather for the RS path. + # Enter rs_stream context so handle.wait() + rs_event.record() land + # on rs_stream — mirrors _wait_param_gather for the RS path. 
rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1110,27 +1132,43 @@ def _reduce_scatter(self, wgrads, async_op, nvtx_label=None): else: out_buffers = [None] * len(wgrads) - if len(wgrads) == 1: - nvtx_range_push(f"{nvtx_label}.etp_rs") - out, handle = reduce_scatter_along_first_dim( - wgrads[0], self.group, async_op=async_op, output=out_buffers[0] - ) - nvtx_range_pop(f"{nvtx_label}.etp_rs") - return [out], handle + # ASYNC RS: wrap issue on rs_stream — issue and wait on the same stream + # means rs_stream's tail reflects the full NCCL lifecycle, what + # external wait_stream(rs_stream) drains depend on. Explicit outer→ + # rs_stream event preserves the wgrad-GEMM writer edge. Mirrors AG. + # SYNC RS: stay on caller — same constraint as sync AG. + if async_op: + outer_stream = torch.cuda.current_stream() + rs_stream = get_rs_stream(self.chain_id, self.group) + outer_sync_event = torch.cuda.Event() + outer_sync_event.record(outer_stream) + rs_stream.wait_event(outer_sync_event) + rs_ctx = torch.cuda.stream(rs_stream) else: - outputs = [] - nvtx_range_push(f"{nvtx_label}.batched_etp_rs") - with torch.distributed._coalescing_manager( - group=self.group, - device=wgrads[0].device, - async_ops=async_op, - ) as cm: - for out_buffer, tensor in zip(out_buffers, wgrads): - out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) - outputs.append(out) - nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") - - return outputs, cm if async_op else None + rs_ctx = nullcontext() + + with rs_ctx: + if len(wgrads) == 1: + nvtx_range_push(f"{nvtx_label}.etp_rs") + out, handle = reduce_scatter_along_first_dim( + wgrads[0], self.group, async_op=async_op, output=out_buffers[0] + ) + nvtx_range_pop(f"{nvtx_label}.etp_rs") + return [out], handle + else: + outputs = [] + nvtx_range_push(f"{nvtx_label}.batched_etp_rs") + with torch.distributed._coalescing_manager( + group=self.group, + 
device=wgrads[0].device, + async_ops=async_op, + ) as cm: + for out_buffer, tensor in zip(out_buffers, wgrads): + out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer) + outputs.append(out) + nvtx_range_pop(f"{nvtx_label}.batched_etp_rs") + + return outputs, cm if async_op else None def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): """Reduce-scatter wgrad(s). Sync for last weight, async+deferred for others. @@ -1151,19 +1189,10 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): poolable = self.chain_id == ETPChain.UNGRAPHED.value if ETP_CONFIG.weight_prefetch and self.prev_w is not None: - # Async reduce-scatter (not last weight — deferred finish). Issue on - # rs_stream with an explicit outer→rs_stream event so the bwd GEMM's - # wgrad writer edge is preserved. (NCCL runs on ncclStream regardless; - # the wrap only gives wait_stream(rs_stream) a useful tail before - # _wait_reduce_scatter runs. Do NOT copy this pattern without the - # event — see all_gather_and_prefetch_bwd.) - outer_stream = torch.cuda.current_stream() - rs_stream = get_rs_stream(self.chain_id, self.group) - outer_sync_event = torch.cuda.Event() - outer_sync_event.record(outer_stream) - rs_stream.wait_event(outer_sync_event) - with torch.cuda.stream(rs_stream): - _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) + # Async reduce-scatter (not last weight — deferred finish). Pre-RS + # work on caller; NCCL wrap lives at the collective site inside + # _reduce_scatter (mirrors the AG prefetch sites). + _, rs_handle = self._reduce_scatter(wgrads, async_op=True, nvtx_label=nvtx_label) self._wgrad_rs_handle = ETPShardHandle(rs_handle, weights, reduce_scatter=True) # Stash wgrad input buffers — cannot recycle yet because the async RS # kernel is still reading them on rs_stream. 
From a4ce839fba3724443b782ab7ac0a4aebc119a804 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sat, 25 Apr 2026 05:16:22 -0700 Subject: [PATCH 36/43] ETP: disable check_param_states by default --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 884d055ed7..054db87249 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -341,7 +341,7 @@ def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): class ETPConfig: """Global configuration for Extended Tensor Parallelism.""" pad_for_alignment: int = 16 - check_param_states: bool = True + check_param_states: bool = False weight_prefetch: bool = True # When True and the weight list in _all_gather_weight contains >1 NVFP4 # shards that share an amax reduction group, coalesce their per-expert From f446e0236502aab4fbb24b79e4425595726dbd6a Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Sun, 26 Apr 2026 22:13:38 -0700 Subject: [PATCH 37/43] =?UTF-8?q?ETP+CG:=20enable=20cross-graph=20RS=20ove?= =?UTF-8?q?rlap=20=E2=80=94=20main=5Fgrad.add=5F=20on=20rs=5Fstream?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move gradient accumulation from caller stream to rs_stream inside _wait_reduce_scatter(finalize_grad=True). The add_ starts right after NCCL RS (concurrent with Phase 1 AG drain) instead of after it, avoiding SM-saturation that blocks cross-graph overlap. 
--- .../module/extended_tensor_parallelism.py | 88 ++++++++++++++----- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 054db87249..623d89c905 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -894,9 +894,15 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non f"cache.get() would return stale data. Check the chain's " f"_need_weight_prefetch flag and issuer's prefetch logic." ) - # Wait for async prefetch if in progress - self._wait_param_gather() - self.ag_event.wait() + if getattr(self, '_already_ag_drained', False): + # Producer already drained via wait_async_comms; skip the captured + # cross-graph wait (CUDA no-op anyway). Correctness is provided by + # the eager main_stream sync chain in the surrounding training loop. + self._already_ag_drained = False + else: + # Intra-graph or eager consume: drain inline. + self._wait_param_gather() + self.ag_event.wait() # Retrieve prefetched results from cache result = [] @@ -1057,31 +1063,26 @@ def _finalize_wgrad(param, wgrad_rs): param._set_rs_state(ETPWeightState.NONE) - # 1. Strip padding if param.is_padded_last_rank: wgrad_rs = param._strip_padding(wgrad_rs) - # 2. Accumulation: accumulate wgrad into main_grad param.main_grad.add_(wgrad_rs) if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - # 3. Trigger DDP backward hook (register_grad_ready). - # ETP bypasses autograd's normal gradient flow (returns None for async RS, - # accumulates directly into main_grad), so we must trigger the DDP hook - # manually. 
Do NOT set param.grad before calling — the hook checks - # param.grad and would accumulate it into main_grad if zero_out_wgrad - # is True, corrupting the gradient with a non-zero dummy. if getattr(param, '_grad_accum_hook', None) is not None: param._grad_accum_hook() return dummy_grad - def _wait_reduce_scatter(self): + def _wait_reduce_scatter(self, finalize_grad=False): # Enter rs_stream context so handle.wait() + rs_event.record() land # on rs_stream — mirrors _wait_param_gather for the RS path. + # When finalize_grad=True, main_grad.add_ also runs on rs_stream + # (right after NCCL RS), so it starts during AG drain rather than + # after it — avoids SM-saturation blocking cross-graph overlap. rs_stream = self._cached_rs_stream if rs_stream is None: rs_stream = get_rs_stream(self.chain_id, self.group) @@ -1091,6 +1092,18 @@ def _wait_reduce_scatter(self): self._wgrad_rs_handle.wait() self._wgrad_rs_handle = None self.rs_event.record() + if finalize_grad: + cache = get_global_ETP_cache() + for w in self._weights: + w._set_rs_state(ETPWeightState.NONE) + wgrad_rs = cache.get(w._rs_ticket) + if w.is_padded_last_rank: + wgrad_rs = w._strip_padding(wgrad_rs) + w.main_grad.add_(wgrad_rs) + cache.release(w._rs_ticket) + if hasattr(w, "grad_added_to_main_grad"): + w.grad_added_to_main_grad = True + self._already_finalized = True # Release stashed wgrad inputs: UNGRAPHED buffers go back to the pool; # GRAPHED just drops Python refs (addresses must stay stable for CG). 
if getattr(self, '_wgrad_input_bufs', None) is not None: @@ -1211,12 +1224,15 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Currently only support reduce scattering in reverse order if ETP_CONFIG.weight_prefetch and self.next_w is not None: self.next_w._wait_reduce_scatter() - self.next_w.rs_event.wait() - cache = get_global_ETP_cache() - for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket)) - cache.release(w._rs_ticket) + if getattr(self.next_w, '_already_finalized', False): + self.next_w._already_finalized = False + else: + self.next_w.rs_event.wait() + cache = get_global_ETP_cache() + for w in self.next_w._weights: + self._finalize_wgrad(w, cache.get(w._rs_ticket)) + cache.release(w._rs_ticket) return ret @@ -1471,19 +1487,45 @@ def reallocate_etp_cache_to_mempool(device, mempool): _ETP_CACHE.reallocate_to_mempool(device, mempool) -def wait_async_comms(chain_id: str = None): - """Wait on in-flight ETP async communications (all-gathers + reduce-scatters). +def wait_async_comms(chain_id: str = None, skip_rs: bool = False, finalize_after_drain: bool = False): + """Drain in-flight ETP async AG / RS handles. + + When called inside CUDA graph capture, the drains are captured into that + graph. This is the producer-side hook for cross-graph AG/RS overlap: + captured cudaStreamWaitEvent on an event recorded in a different capture + session is a CUDA no-op, so consumer graphs can't safely wait on + cross-graph events. Instead, the producer drains here and flags the + param; the consumer reads the flag and skips its captured wait. Args: - chain_id: If specified, only drain params belonging to this chain - (ETPChain.GRAPHED.value or ETPChain.UNGRAPHED.value). - If None, drain all chains. + chain_id: If specified, only drain params on this chain. + skip_rs: Drain AG only; leave RS in flight. + finalize_after_drain: After RS drain, also accumulate wgrad into + main_grad. 
Runs main_grad.add_ on rs_stream (right after + NCCL RS) so it starts during AG drain rather than after, + avoiding SM-saturation that blocks cross-graph overlap. + Falls back to caller-stream _finalize_wgrad if no RS handle. + + Per-param side effects: + * _already_ag_drained = True (if an AG handle was drained) + * _already_finalized = True (if finalize_after_drain=True) """ for param in list(_inflight_comm_params): if chain_id is not None and getattr(param, 'chain_id', ETPChain.UNGRAPHED.value) != chain_id: continue + had_ag = param._prefetch_handle is not None param._wait_param_gather() - param._wait_reduce_scatter() + if had_ag: + param._already_ag_drained = True + if not skip_rs: + param._wait_reduce_scatter(finalize_grad=finalize_after_drain) + if finalize_after_drain and not getattr(param, '_already_finalized', False): + cache = get_global_ETP_cache() + param.rs_event.wait() + for w in param._weights: + ETPShardedParam._finalize_wgrad(w, cache.get(w._rs_ticket)) + cache.release(w._rs_ticket) + param._already_finalized = True @dataclass From 464c28988b4c3471fb1c88ccb7e95a383ac2131f Mon Sep 17 00:00:00 2001 From: Jieming Zhang Date: Thu, 23 Apr 2026 09:44:09 -0700 Subject: [PATCH 38/43] wgrad accum fusion Signed-off-by: Jieming Zhang --- .../module/extended_tensor_parallelism.py | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 623d89c905..171d060f7c 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1053,27 +1053,20 @@ def register_grad_accum_hook(self, grad_accum_node, hook): self._grad_accum_hook = hook @staticmethod - def _finalize_wgrad(param, wgrad_rs): - """Post-RS per-param processing: strip padding, accumulate, call DDP hook. 
+ def _handle_megatron_grad_accum(param): + """Handle megatron DDP and gradient accumulation fusion. - Accumulates the reduce-scattered wgrad into main_grad and triggers - the DDP backward hook (register_grad_ready) so the DP reduce-scatter - fires at the correct time during backward. + Do NOT set param.grad before calling the hook — the hook checks + param.grad and would accumulate it into main_grad if zero_out_wgrad + is True, corrupting the gradient with a non-zero dummy. """ - - param._set_rs_state(ETPWeightState.NONE) - - if param.is_padded_last_rank: - wgrad_rs = param._strip_padding(wgrad_rs) - - param.main_grad.add_(wgrad_rs) if hasattr(param, "grad_added_to_main_grad"): param.grad_added_to_main_grad = True dummy_grad = get_dummy_wgrad(list(param.main_grad.shape), param.dtype) - if getattr(param, '_grad_accum_hook', None) is not None: param._grad_accum_hook() + param._set_rs_state(ETPWeightState.NONE) return dummy_grad @@ -1213,8 +1206,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): ret = tuple([None] * len(wgrads)) if batched else None else: # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately - sharded, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - result = [self._finalize_wgrad(p, g) for p, g in zip(weights, sharded)] + wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) + wgrads = [ + w._strip_padding(g) if w.is_padded_last_rank else w for w, g in zip(weights, wgrads) + ] + torch._foreach_add_([p.main_grad for p in weights], wgrads) + result = [self._handle_megatron_grad_accum(p) for p in weights] + if poolable: for buf in wgrads: _wgrad_pool_put(buf) @@ -1230,8 +1228,15 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): else: self.next_w.rs_event.wait() cache = get_global_ETP_cache() - for w in self.next_w._weights: - self._finalize_wgrad(w, cache.get(w._rs_ticket)) + next_weights = self.next_w._weights + wgrads = [cache.get(w._rs_ticket) for w in 
next_weights] + wgrads = [ + w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(next_weights, wgrads) + ] + + torch._foreach_add_([w.main_grad for w in next_weights], wgrads) + for w in next_weights: + self._handle_megatron_grad_accum(w) cache.release(w._rs_ticket) return ret From 06a27562454fac8875f9f56880fd1e0666ed09ec Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 01:09:37 -0700 Subject: [PATCH 39/43] minor fix for wgrad accum fusion --- .../pytorch/module/extended_tensor_parallelism.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 171d060f7c..354148f692 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -1208,7 +1208,7 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else w for w, g in zip(weights, wgrads) + w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) ] torch._foreach_add_([p.main_grad for p in weights], wgrads) result = [self._handle_megatron_grad_accum(p) for p in weights] From e107b05e0b386ab9c22e717acf5d45bc7db00667 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 23:50:12 -0700 Subject: [PATCH 40/43] ETP+mxfp8: reject coalesced-amax path for non-NVFP4 quantizers --- .../module/extended_tensor_parallelism.py | 48 ++++++++----------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 354148f692..a5100343ce 100644 --- 
a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -245,33 +245,22 @@ def get_rs_streams_for_chain(chain_id: str) -> list: def _coalesced_amax_static_eligible(weights): - """Walk the weight list once and decide whether the coalesced-amax path - is applicable. Depends only on fields that are fixed after model - construction (quantizer class, flags, amax_reduction_group, group size).""" + """Check whether the coalesced-amax path is applicable (NVFP4 only). + + Caller already gates on ETP_CONFIG.coalesce_amax_allreduce (False for + non-NVFP4). Here we additionally verify TE API availability, batch size, + quantizer type (must have amax reduction), and the RHT flag.""" if not _COALESCED_AMAX_TE_APIS_AVAILABLE: return False if len(weights) <= 1: return False - - group = None - for w in weights: - q = w._quantizer - if q is None or not isinstance(w.quantized, NVFP4TensorStorage): - return False - if not getattr(q, "with_amax_reduction", False): - return False - if getattr(q, "with_rht", False): - # RHT path does amax on RHT-rotated view, can't split compute - # from cast the way compute_amax_only assumes. 
- return False - g = getattr(q, "amax_reduction_group", None) - if g is None: - return False - if group is None: - group = g - elif g is not group: - return False - return group.size() > 1 + has_amax = [getattr(w._quantizer, "with_amax_reduction", False) for w in weights] + if not all(has_amax): + return False + has_rht = any(getattr(w._quantizer, "with_rht", False) for w in weights) + if has_rht: + return False + return True def _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag): @@ -565,8 +554,9 @@ def setup(self, weight_quantizer=None): if self._quantizer is None: def _configure_quantizer(q, group): q = q.copy() - q.with_amax_reduction = True - q.amax_reduction_group = group + if hasattr(q, 'with_amax_reduction'): + q.with_amax_reduction = True + q.amax_reduction_group = group q.internal = False q.optimize_for_gemm = True return q @@ -1597,10 +1587,10 @@ def grouped_gather_along_first_dim( if async_op: handle = gather_coalescing_manager - if ( - quantizers is not None - and getattr(quantizers[0], "columnwise_usage", False) - ): + has_nvfp4_handles = any( + isinstance(wh, _NVFP4AllGatherAsyncHandle) for wh in weight_handles + ) + if has_nvfp4_handles: handle = BatchedNVFP4AllGatherAsyncHandle(weight_handles, handle) else: for wh in weight_handles: From 03bb9fbfe96b10e23b5fc06bb0e49aa7681bdd86 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Mon, 27 Apr 2026 23:51:12 -0700 Subject: [PATCH 41/43] ETP: add debug_numerics instrumentation for NaN/Inf triage --- .../module/extended_tensor_parallelism.py | 141 +++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index a5100343ce..7b47d496d1 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -250,16 +250,35 @@ def 
_coalesced_amax_static_eligible(weights): Caller already gates on ETP_CONFIG.coalesce_amax_allreduce (False for non-NVFP4). Here we additionally verify TE API availability, batch size, quantizer type (must have amax reduction), and the RHT flag.""" + dbg = ETP_CONFIG.debug_numerics > 0 if not _COALESCED_AMAX_TE_APIS_AVAILABLE: + if dbg: + print_rank_0("[ETP_DEBUG] coalesced_amax_static: REJECTED (TE APIs unavailable)") return False if len(weights) <= 1: return False has_amax = [getattr(w._quantizer, "with_amax_reduction", False) for w in weights] if not all(has_amax): + if dbg: + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] coalesced_amax_static: REJECTED " + f"(with_amax_reduction={has_amax[:3]}{'...' if len(has_amax)>3 else ''}, " + f"quantizer_types={qtypes}{'...' if len(weights)>3 else ''}, " + f"n_weights={len(weights)})" + ) return False has_rht = any(getattr(w._quantizer, "with_rht", False) for w in weights) if has_rht: + if dbg: + print_rank_0("[ETP_DEBUG] coalesced_amax_static: REJECTED (with_rht=True)") return False + if dbg: + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] coalesced_amax_static: *** ACCEPTED *** " + f"(n_weights={len(weights)}, quantizer_types={qtypes}{'...' if len(weights)>3 else ''})" + ) return True @@ -339,9 +358,70 @@ class ETPConfig: # guard in _coalesced_amax_static_eligible falls back to the per-weight # path when either binding is missing. coalesce_amax_allreduce: bool = True + # Log numeric diagnostics for the first N AG/RS calls per param. + # 0 = off; 3 = good default for triage (covers iter 1-2 fwd+bwd). 
+ debug_numerics: int = 0 ETP_CONFIG = ETPConfig() +# --------------------------------------------------------------------------- +# Debug helpers (gated by ETP_CONFIG.debug_numerics > 0) +# --------------------------------------------------------------------------- +_etp_debug_counts: Dict[tuple, int] = {} + +def _etp_dbg_capturing(): + """True when a CUDA graph is being captured — D2H syncs are forbidden.""" + return torch.cuda.is_current_stream_capturing() + +def _etp_dbg_should_log(param_name, label): + if ETP_CONFIG.debug_numerics <= 0 or _etp_dbg_capturing(): + return False + key = (param_name, label) + count = _etp_debug_counts.get(key, 0) + if count >= ETP_CONFIG.debug_numerics: + return False + _etp_debug_counts[key] = count + 1 + return True + +def _etp_dbg_tensor(name, t): + """One-line NaN/Inf summary for a BF16/FP32 tensor.""" + if t is None: + return f"{name}=None" + if t.numel() == 0: + return f"{name}:{list(t.shape)},empty" + if not t.is_floating_point(): + return f"{name}:non-float({t.dtype})" + has_nan = bool(torch.isnan(t).any()) + has_inf = bool(torch.isinf(t).any()) + amax = t.abs().max().item() + tag = " ***BAD***" if (has_nan or has_inf) else "" + return f"{name}:{list(t.shape)},amax={amax:.4e},nan={has_nan},inf={has_inf}{tag}" + +def _etp_dbg_quantized(name, qt): + """Multi-line check of a quantized tensor's metadata fields.""" + if qt is None: + return f"{name}=None" + md = qt.get_metadata() + parts = [f"{name}:type={type(qt).__name__}"] + for k in ("rowwise_data", "columnwise_data"): + v = md.get(k) + parts.append(f" {k}={'shape=' + str(list(v.shape)) if v is not None else 'NONE'}") + for k in ("rowwise_scale_inv", "columnwise_scale_inv"): + v = md.get(k) + if v is not None and v.numel() == 0: + parts.append(f" {k}:{list(v.shape)},empty") + elif v is not None and v.is_floating_point(): + has_nan = bool(torch.isnan(v).any()) + has_inf = bool(torch.isinf(v).any()) + amax = v.abs().max().item() + tag = " ***BAD***" if (has_nan or has_inf) 
else "" + parts.append(f" {k}:{list(v.shape)},amax={amax:.4e},nan={has_nan},inf={has_inf}{tag}") + elif v is not None: + parts.append(f" {k}:{list(v.shape)},dtype={v.dtype}") + else: + parts.append(f" {k}:NONE") + return "\n".join(parts) + def update_config(**kwargs): """Update the global ETP configuration.""" for key, value in kwargs.items(): @@ -742,6 +822,17 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv else: use_coalesced = False + if _etp_dbg_should_log(self._debug_name, 'ag_decision'): + qtypes = [type(w._quantizer).__name__ for w in weights[:3]] + print_rank_0( + f"[ETP_DEBUG] AG {self._debug_name} fwd={fwd} chain={self.chain_id} " + f"coalesced={use_coalesced} skip_cast={skip_weight_cast} " + f"noop={'set' if cast_noop_flag is not None else 'None'} " + f"coalesce_cfg={ETP_CONFIG.coalesce_amax_allreduce} " + f"static_ok={getattr(self, '_coalesced_amax_static', 'N/A')} " + f"qtypes={qtypes}{'...' if len(weights)>3 else ''}" + ) + if use_coalesced: _quantize_with_coalesced_amax(weights, skip_weight_cast, cast_noop_flag) else: @@ -751,6 +842,15 @@ def _all_gather_weight(self, async_op, skip_weight_cast, cast_noop_flag, fwd, nv if w.did_cast_to_low_precision: w._quantizer.set_usage(rowwise=fwd, columnwise=not fwd) + if _etp_dbg_should_log(self._debug_name, f'ag_numerics_{"fwd" if fwd else "bwd"}'): + lines = [f"[ETP_DEBUG] post-quantize {self._debug_name} fwd={fwd} " + f"usage=row:{fwd},col:{not fwd}"] + for i, w in enumerate(weights[:3]): + lines.append(f" w[{i}] shard: {_etp_dbg_tensor(f'{w._debug_name}', w.data)}") + if w.did_cast_to_low_precision: + lines.append(_etp_dbg_quantized(f' w[{i}] quantized', w.quantized)) + print_rank_0("\n".join(lines)) + # 3. Build gather inputs. 
# quantizers / dtypes / etp_group are stable after model construction — # cache on the anchor (self == weights[0]) to avoid rebuilding lists @@ -884,7 +984,8 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non f"cache.get() would return stale data. Check the chain's " f"_need_weight_prefetch flag and issuer's prefetch logic." ) - if getattr(self, '_already_ag_drained', False): + _was_drained = getattr(self, '_already_ag_drained', False) + if _was_drained: # Producer already drained via wait_async_comms; skip the captured # cross-graph wait (CUDA no-op anyway). Correctness is provided by # the eager main_stream sync chain in the surrounding training loop. @@ -902,6 +1003,17 @@ def _get_prefetched_weight(self, fwd, skip_weight_cast=False, cast_noop_flag=Non result.append(cache.get(ticket)) result = [self._strip_padding(r) for r in result] + + if _etp_dbg_should_log(self._debug_name, f'prefetch_{"fwd" if fwd else "bwd"}'): + lines = [f"[ETP_DEBUG] prefetched {self._debug_name} fwd={fwd} " + f"already_drained={_was_drained}"] + for i, r in enumerate(result[:3]): + if isinstance(r, (NVFP4TensorStorage, MXFP8TensorStorage)): + lines.append(_etp_dbg_quantized(f' gathered[{i}]', r)) + else: + lines.append(f" gathered[{i}]: {_etp_dbg_tensor('', r)}") + print_rank_0("\n".join(lines)) + result = [r.detach().requires_grad_(w.requires_grad) for r, w in zip(result, self._weights)] return result if self.is_routed_expert else result[0] @@ -1083,6 +1195,12 @@ def _wait_reduce_scatter(self, finalize_grad=False): if w.is_padded_last_rank: wgrad_rs = w._strip_padding(wgrad_rs) w.main_grad.add_(wgrad_rs) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after finalize_grad RS *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) cache.release(w._rs_ticket) if hasattr(w, 
"grad_added_to_main_grad"): w.grad_added_to_main_grad = True @@ -1184,6 +1302,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): # stable buffer addresses across replay. poolable = self.chain_id == ETPChain.UNGRAPHED.value + if _etp_dbg_should_log(self._debug_name, 'rs_input'): + lines = [f"[ETP_DEBUG] RS input {self._debug_name} " + f"async={self.prev_w is not None and ETP_CONFIG.weight_prefetch}"] + for i, g in enumerate(wgrads[:3]): + lines.append(f" wgrad[{i}]: {_etp_dbg_tensor('', g)}") + print_rank_0("\n".join(lines)) + if ETP_CONFIG.weight_prefetch and self.prev_w is not None: # Async reduce-scatter (not last weight — deferred finish). Pre-RS # work on caller; NCCL wrap lives at the collective site inside @@ -1201,6 +1326,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) ] torch._foreach_add_([p.main_grad for p in weights], wgrads) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + for w in weights[:3]: + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after sync RS *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) result = [self._handle_megatron_grad_accum(p) for p in weights] if poolable: @@ -1225,6 +1357,13 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): ] torch._foreach_add_([w.main_grad for w in next_weights], wgrads) + if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): + for w in next_weights[:3]: + if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): + print_rank_0( + f"[ETP_DEBUG] *** main_grad ANOMALY after async RS finalize *** " + f"{w._debug_name}: {_etp_dbg_tensor('main_grad', w.main_grad)}" + ) for w in next_weights: self._handle_megatron_grad_accum(w) cache.release(w._rs_ticket) From 550f0746aa84a319aec1d8488d767a902946b594 Mon Sep 17 00:00:00 2001 From: 
Shiqing Fan Date: Tue, 28 Apr 2026 02:18:19 -0700 Subject: [PATCH 42/43] ETP+mxfp8 divergence fix: disable GEMM-swizzled scales for all-gather compatibility. --- .../pytorch/module/extended_tensor_parallelism.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index 7b47d496d1..ef66994d34 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -18,6 +18,7 @@ ) from ..quantized_tensor import QuantizedTensor from ..tensor import NVFP4TensorStorage, MXFP8TensorStorage +from ..tensor.mxfp8_tensor import MXFP8Quantizer from ..utils import nvtx_range_pop, nvtx_range_push, round_up_to_nearest_multiple from ..constants import NVFP4_BLOCK_SCALING_SIZE, MXFP8_BLOCK_SCALING_SIZE from .base import get_dummy_wgrad @@ -638,7 +639,11 @@ def _configure_quantizer(q, group): q.with_amax_reduction = True q.amax_reduction_group = group q.internal = False - q.optimize_for_gemm = True + # MXFP8 scales must stay in compact (unswizzled) layout so that + # per-shard scale_inv can be all-gathered via byte concatenation. + # GEMM-swizzled scales from independent shards don't compose into + # a valid swizzled layout for the full tensor after AG. 
+ q.optimize_for_gemm = not isinstance(q, MXFP8Quantizer) return q weights = self.weight_list if self.is_routed_expert and self.weight_list is not None else [self] From 520251c0963ff513a4f39f4aa29baeb57c756343 Mon Sep 17 00:00:00 2001 From: Shiqing Fan Date: Tue, 28 Apr 2026 02:34:04 -0700 Subject: [PATCH 43/43] ETP: pad full tensor before sharding instead of per-rank on-the-fly --- .../module/extended_tensor_parallelism.py | 41 +++++-------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/transformer_engine/pytorch/module/extended_tensor_parallelism.py b/transformer_engine/pytorch/module/extended_tensor_parallelism.py index ef66994d34..84dbe05eeb 100644 --- a/transformer_engine/pytorch/module/extended_tensor_parallelism.py +++ b/transformer_engine/pytorch/module/extended_tensor_parallelism.py @@ -459,24 +459,21 @@ def wrap_module_params_etp(module, weight_names, etp_group, is_grouped=None): delattr(module, name) if ETP_CONFIG.pad_for_alignment > 0: - # Ensure each shard's dim0 is a multiple of 16 for quantization (NVFP4/FP8) by padding - # the last rank such that the total padded length of dim0 is a multiple of ETP size * 16 + # Pad the full tensor BEFORE sharding so every rank gets exactly + # shard_size rows and each shard's dim0 is alignment-divisible. + # Padding stays contiguous at the tail of the gathered result — + # no interleaved-padding reshuffle needed after all-gather. 
alignment = ETP_CONFIG.pad_for_alignment * etp_size tensor = param.data dim0 = tensor.shape[0] pad_length = (alignment - dim0 % alignment) % alignment if alignment > 0 else 0 + if pad_length > 0: + tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad_length)) padded_dim0 = dim0 + pad_length - is_padded_last_rank = pad_length > 0 and etp_rank == etp_size - 1 - # Create the ETP sharded param, pass a clone of the shard so that the original unsharded - # buffer may be deallocated shard_size = padded_dim0 // etp_size - start_idx = etp_rank * shard_size - end_idx = min((etp_rank + 1) * shard_size, tensor.shape[0]) - shard = tensor[start_idx: end_idx] + shard = tensor[etp_rank * shard_size : (etp_rank + 1) * shard_size] etp_shard = ETPShardedParam(shard.clone()) - # finally, set attributes etp_shard.pad_length = pad_length - etp_shard.is_padded_last_rank = is_padded_last_rank else: shard_size = tensor.shape[0] // etp_group.size() shard = tensor[etp_rank * shard_size: (etp_rank + 1) * shard_size] @@ -617,7 +614,6 @@ def __init__(self, x, *args, **kwargs): self.rs_event = torch.cuda.Event(external=True) self._rs_ticket = None # Padding - self.is_padded_last_rank = False self.pad_length = 0 # Debug self._debug_name = "" @@ -666,11 +662,8 @@ def _weights(self): @property def _unsharded_shape_padded(self): out_shape = list(self.size()) - if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: - out_shape[0] = (out_shape[0]+ self.pad_length) * self.group.size() - else: - out_shape[0] = out_shape[0] * self.group.size() - return tuple(out_shape) + out_shape[0] = out_shape[0] * self.group.size() + return tuple(out_shape) @property def _unsharded_shape(self): @@ -680,14 +673,9 @@ def _unsharded_shape(self): @property def _sharded_padded_shape(self): - out_shape = list(self.size()) - if self.pad_length > 0 and self.group.rank() == self.group.size() - 1: - out_shape[0] += self.pad_length - return tuple(out_shape) + return tuple(self.size()) def 
get_padded_shard(self): - if self.pad_length > 0 and self.is_padded_last_rank: - return torch.nn.functional.pad(self, (0, 0, 0, self.pad_length)) return self def _set_state(self, new_state: ETPWeightState): @@ -1197,8 +1185,6 @@ def _wait_reduce_scatter(self, finalize_grad=False): for w in self._weights: w._set_rs_state(ETPWeightState.NONE) wgrad_rs = cache.get(w._rs_ticket) - if w.is_padded_last_rank: - wgrad_rs = w._strip_padding(wgrad_rs) w.main_grad.add_(wgrad_rs) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): if bool(torch.isinf(w.main_grad).any()) or bool(torch.isnan(w.main_grad).any()): @@ -1327,9 +1313,6 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): else: # Sync reduce-scatter (last weight in chain) — RS done, recycle immediately wgrads, _ = self._reduce_scatter(wgrads, async_op=False, nvtx_label=nvtx_label) - wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(weights, wgrads) - ] torch._foreach_add_([p.main_grad for p in weights], wgrads) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): for w in weights[:3]: @@ -1357,10 +1340,6 @@ def wgrad_reduce_scatter(self, wgrad, nvtx_label=None): cache = get_global_ETP_cache() next_weights = self.next_w._weights wgrads = [cache.get(w._rs_ticket) for w in next_weights] - wgrads = [ - w._strip_padding(g) if w.is_padded_last_rank else g for w, g in zip(next_weights, wgrads) - ] - torch._foreach_add_([w.main_grad for w in next_weights], wgrads) if ETP_CONFIG.debug_numerics > 0 and not _etp_dbg_capturing(): for w in next_weights[:3]: