pathways-utils/pathwaysutils/experimental/reshard.py at c27e1fb3e67a1f7d67f2ac5ebd2ec473a1580ee8 · AI-Hypercomputer/pathways-utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experimental resharding API for elastic device sets."""

import base64
import collections
import json
import logging
import math
import operator
from typing import Any, Dict, Sequence

import jax
from pathwaysutils import lru_cache
from pathwaysutils import plugin_executable
from pathwaysutils.experimental import split_by_mesh_axis


_logger = logging.getLogger(__name__)

INTERMEDIATE_SPLIT_SUFFIX = "_intermediate_split"
INTERMEDIATE_REPLICA_SUFFIX = "_intermediate_replica"


def _identity(x: Any) -> Any:
  """A helper function that returns its input."""
  return x

class ReshardingPlanWrapper:
  """Wrapper around PluginProgram(reshard_request)."""

  _plugin_executable: plugin_executable.PluginExecutable
  _avals: Sequence[jax.core.ShapedArray]
  _out_shardings: Sequence[jax.sharding.Sharding]

  def __init__(
      self,
      avals: Sequence[jax.core.ShapedArray],
      source_shardings: Sequence[jax.sharding.Sharding],
      destination_shardings: Sequence[jax.sharding.Sharding],
      donate: bool,
  ):
    def ifrt_hlo_sharding(
        aval: jax.core.ShapedArray, sharding: jax.sharding.Sharding
    ) -> Dict[str, Any]:
      result = {
          "devices": {
              "device_ids": [
                  device.id for device in sharding._addressable_device_assignment  # pylint: disable=protected-access
              ]
          },
          "xla_hlo_sharding": (
              base64.b64encode(
                  sharding._to_xla_hlo_sharding(aval.ndim)  # pylint: disable=protected-access
                  .to_proto()
                  .SerializeToString()
              ).decode("utf-8")
          ),
      }
      if sharding.memory_kind is not None:
        result["memory_kind"] = sharding.memory_kind
      return result

    request = {
        "reshardRequest": {
            "donateInput": donate,
            "inSharding": [
                ifrt_hlo_sharding(aval, old_sharding)
                for aval, old_sharding in zip(avals, source_shardings)
            ],
            "outSharding": [
                ifrt_hlo_sharding(aval, new_sharding)
                for aval, new_sharding in zip(avals, destination_shardings)
            ],
        }
    }

    self._plugin_executable = plugin_executable.PluginExecutable(
        json.dumps(request)
    )
    self._avals = avals
    self._out_shardings = destination_shardings

  def execute(self, inp_arrays: tuple[jax.Array, ...]) -> Sequence[jax.Array]:
    out_arrays, fut = self._plugin_executable.call(
        inp_arrays, self._out_shardings, self._avals
    )
    fut.result()
    return out_arrays


def _get_resharding_plan(
    avals: tuple[jax.core.ShapedArray, ...],
    old_shardings: tuple[jax.sharding.Sharding, ...],
    new_shardings: tuple[jax.sharding.Sharding, ...],
    donate: bool,
) -> ReshardingPlanWrapper:
  """Returns a resharding plan for the given sharding task."""
  return ReshardingPlanWrapper(avals, old_shardings, new_shardings, donate)


_get_resharding_plan_cached = lru_cache.lru_cache()(_get_resharding_plan)


def reshard(
    x: Any,
    sharding: jax.sharding.Sharding | Any,
    *,
    donate: bool = False,
    may_alias: bool | None = None,  # pylint: disable=unused-argument
    cache_resharding_plans: bool = False,
) -> Any:
  """Reshards `x` to `sharding`.

  Args:
    x: An array, scalar, or (nested) standard Python container thereof.
    sharding: A `Sharding` or a (nested) `Sharding` in standard Python container
      (must be a tree prefix of `x`), representing the device(s) and sharding to
      which `x` should be sharded to. The result will be committed to the
      device(s) of the sharding.
    donate: If `True`, donate all input arrays, which may reduce the amount of
      memory needed for resharding. Buffers donated to resharding should not be
      reused.
    may_alias: If `True`, may alias the input array with the output array. May
      reduce the amount of memory needed for resharding. Not used at the moment.
    cache_resharding_plans: If `True`, uses a resharding plan cache to avoid
      recreating plans for the same resharding operation. May improve
      performance for use cases where the same resharding operation is done many
      times. May degrade performance if most reshardings operations are
      different, since the cache will cause Pathways Components to remain loaded
      for each cached plan. `False` by default.

  Returns:
    A copy of `x` whose sharding is `sharding`.
  """
  flat_x, tree_def = jax.tree.flatten(x)
  flat_sharding = jax.api_util.flatten_axes(
      "reshard sharding", tree_def, sharding
  )

  # We must split the arrays into two groups:
  # 1. jax.Array
  # 2. non jax.Array
  # For jax.Array, we will use the ifrt client to get the resharding plan and
  # execute it.
  # These arrays must be further split into groups based on the device set of
  # the sharding, since plugin programs only supports execution on the same
  # device set.
  # For non jax.Array, we will use jax.device_put to put the array to the
  # destination devices.
  #
  # We need to track what index each array is in the original pytree, so we can
  # put them back together in the right order.
  array_info_lambda = lambda: {"arrays": [], "indices": [], "dst_shardings": []}
  jax_arrays = collections.defaultdict(array_info_lambda)
  non_reshardable_arrays = array_info_lambda()
  for index, (arr, dst_sharding) in enumerate(zip(flat_x, flat_sharding)):
    if not isinstance(dst_sharding, jax.sharding.Sharding):
      raise ValueError("`sharding` must contain only `jax.sharding.Sharding`")
    if not isinstance(arr, jax.Array) or (
        hasattr(arr, "dtype")
        and jax.dtypes.issubdtype(arr.dtype, jax.dtypes.prng_key)
    ):
      non_reshardable_arrays["arrays"].append(arr)
      non_reshardable_arrays["indices"].append(index)
      non_reshardable_arrays["dst_shardings"].append(dst_sharding)
    else:
      device_set = frozenset(arr.sharding.device_set)
      jax_arrays[device_set]["arrays"].append(arr)
      jax_arrays[device_set]["indices"].append(index)
      jax_arrays[device_set]["dst_shardings"].append(dst_sharding)

  if non_reshardable_arrays["arrays"]:
    non_reshardable_arrays["arrays"] = jax.device_put(
        non_reshardable_arrays["arrays"],
        non_reshardable_arrays["dst_shardings"],
        donate=donate,
        may_alias=may_alias,
    )

  for array_info in jax_arrays.values():
    get_resharding_plan_func = (
        _get_resharding_plan_cached
        if cache_resharding_plans
        else _get_resharding_plan
    )
    array_info["arrays"] = get_resharding_plan_func(
        tuple(arr.aval for arr in array_info["arrays"]),
        tuple(arr.sharding for arr in array_info["arrays"]),
        tuple(array_info["dst_shardings"]),
        donate,
    ).execute(tuple(array_info["arrays"]))

  result = [None] * len(flat_x)
  for arr, idx in zip(
      non_reshardable_arrays["arrays"], non_reshardable_arrays["indices"]
  ):
    result[idx] = arr
  for array_info in jax_arrays.values():
    for arr, idx in zip(array_info["arrays"], array_info["indices"]):
      result[idx] = arr

  return jax.tree.unflatten(tree_def, result)


class NoIntermediateShardingError(Exception):
  """Raised when no intermediate sharding is found."""


class NoIntermediateShardingNeededError(NoIntermediateShardingError):
  """Raised when no intermediate sharding is needed for optimization."""


def _get_sharding_spec_dims(sharding: jax.sharding.NamedSharding):
  """Gets the sharding dimension sizes from a NamedSharding."""
  mesh = sharding.mesh
  dims = []
  for spec in sharding.spec:
    if spec is None:
      dims.append(1)
    elif isinstance(spec, str):
      dims.append(mesh.shape[spec])
    elif isinstance(spec, (list, tuple)):
      dims.append(math.prod([mesh.shape[ax] for ax in spec]))
    else:
      raise ValueError(f"Unsupported partition spec: {spec}")
  return dims


def _check_sharding_divisibility(
    in_sharding: jax.sharding.NamedSharding,
    out_sharding: jax.sharding.NamedSharding,
    src_dims: Sequence[int],
    dst_dims: Sequence[int],
):
  """Checks if source and destination shardings are compatible for optimization."""
  src_largest_dim = max(src_dims) if src_dims else 1
  dst_largest_dim = max(dst_dims) if dst_dims else 1
  src_total_dims = math.prod(src_dims)
  dst_total_dims = math.prod(dst_dims)

  # Not able to handle resharding with undividable shardings.
  if src_largest_dim % dst_largest_dim != 0:
    raise NoIntermediateShardingError(
        "Resharding with undividable shardings is not optimized with"
        " intermediate sharding."
        f" in_sharding={in_sharding}, out_sharding={out_sharding}"
    )
  if src_total_dims <= dst_total_dims:
    raise NoIntermediateShardingError(
        "No intermediate sharding is found because the source sharding is not"
        " larger than the target sharding."
        f" in_sharding={in_sharding}, out_sharding={out_sharding}"
    )
  if src_total_dims % dst_total_dims != 0:
    raise NoIntermediateShardingError(
        "No intermediate sharding is found because the source sharding is not"
        " divisible by the target sharding."
        f" in_sharding={in_sharding}, out_sharding={out_sharding}"
    )


def _get_split_candidates(
    in_sharding: jax.sharding.NamedSharding,
    src_dims: Sequence[int],
    dst_dims: Sequence[int],
    gcd_shards: Sequence[int],
) -> list[tuple[int, str]]:
  """Finds dimensions that are candidates for splitting."""
  split_candidates = []
  for i, spec in enumerate(in_sharding.spec):
    # TODO(b/1234) - Support splitting a dimension that is sharded over multiple
    # mesh axes.
    if (
        gcd_shards[i] == 1
        and src_dims[i] > dst_dims[i]
        and isinstance(spec, str)
    ):
      split_candidates.append((i, spec))

  if not split_candidates:
    raise NoIntermediateShardingError(
        "No intermediate sharding is found because all of the"
        " gcd(src_dim_shards, dst_dim_shards) are 1s, or no suitable"
        " dimension to split."
    )
  return split_candidates


def _build_intermediate_mesh_and_spec(
    src_mesh: jax.sharding.Mesh,
    in_spec: jax.sharding.PartitionSpec,
    src_dims: Sequence[int],
    dst_dims: Sequence[int],
    split_candidates: list[tuple[int, str]],
) -> tuple[jax.sharding.Mesh, jax.sharding.PartitionSpec, list[str]]:
  """Builds the intermediate Mesh and PartitionSpec."""
  # Build a map of mesh axis to split information: (dim_idx, replicas)
  mesh_axis_to_split_info = {}
  for dim_idx, mesh_axis in split_candidates:
    src_dim = src_dims[dim_idx]
    dst_dim = dst_dims[dim_idx]
    replicas = src_dim // dst_dim
    mesh_axis_to_split_info[mesh_axis] = (dim_idx, replicas)

  # Build the intermediate mesh by expanding axes that need splitting.
  new_replicated_axis_names = []
  new_replicated_mesh_shape = []
  new_axis_names = []
  new_mesh_shape = []
  for axis_name in src_mesh.axis_names:
    axis_size = src_mesh.shape[axis_name]
    if axis_name in mesh_axis_to_split_info:
      dim_idx, replicas = mesh_axis_to_split_info[axis_name]
      dst_dim = dst_dims[dim_idx]
      split_axis_name = axis_name + INTERMEDIATE_SPLIT_SUFFIX
      replica_axis_name = axis_name + INTERMEDIATE_REPLICA_SUFFIX
      new_replicated_axis_names.append(replica_axis_name)
      new_replicated_mesh_shape.append(replicas)
      new_axis_names.append(split_axis_name)
      new_mesh_shape.append(dst_dim)
    else:
      new_axis_names.append(axis_name)
      new_mesh_shape.append(axis_size)

  new_axis_names = new_replicated_axis_names + new_axis_names
  new_mesh_shape = new_replicated_mesh_shape + new_mesh_shape
  intermediate_mesh = jax.sharding.Mesh(
      src_mesh.devices.reshape(new_mesh_shape),
      axis_names=tuple(new_axis_names),
  )

  # Build the intermediate PartitionSpec.
  intermediate_spec_list = list(in_spec)
  for dim_idx, mesh_axis in split_candidates:
    split_axis_name = mesh_axis + INTERMEDIATE_SPLIT_SUFFIX
    intermediate_spec_list[dim_idx] = split_axis_name
  intermediate_spec = jax.sharding.PartitionSpec(*intermediate_spec_list)

  return intermediate_mesh, intermediate_spec, new_replicated_axis_names


def find_intermediate_sharding(
    in_sharding: jax.sharding.Sharding, out_sharding: jax.sharding.Sharding
) -> tuple[jax.sharding.NamedSharding, list[str]]:
  """Finds an intermediate sharding to reshard to before target sharding.

  This function tries to find an intermediate sharding that can be used to
  reshard the in_sharding to the out_sharding. This is useful when resharding
  from an in_sharding to an out_sharding that requires an all-gather, which can
  be expensive.

  For example, consider resharding an array from in_sharding (e.g., [fsdp: 8,
  tp: 1]) to out_sharding (e.g., [fsdp: 1, tp: 4]). In this case, the source
  has a larger sharding factor, 8, than the target's largest sharding factor, 4.
  To avoid an expensive all-gather, we introduce an intermediate sharding, e.g.,
  [fsdp_split: 4, tp: 1, fsdp_replica: 2]). This intermediate sharding
  allows us to reshard the source array by still sharding along the fsdp
  dimension and replicating it on the remaining devices. Then we can reshard any
  replica of the source to the target as normal.

  Args:
    in_sharding: The source sharding.
    out_sharding: The target sharding.

  Returns:
    A tuple containing:
      - An intermediate sharding.
      - A list of axis names that are replicated in the intermediate sharding.

  Raises:
    NoIntermediateShardingError: If no intermediate sharding is found.
    NoIntermediateShardingNeededError: If no intermediate sharding is needed for
      optimization.
  """
  if not isinstance(in_sharding, jax.sharding.NamedSharding) or not isinstance(
      out_sharding, jax.sharding.NamedSharding
  ):
    raise NoIntermediateShardingError(
        "Only NamedSharding is supported for now. Got"
        f" in_sharding={in_sharding} and out_sharding={out_sharding}"
    )
  src_mesh = in_sharding.mesh

  if len(in_sharding.spec) != len(out_sharding.spec):
    raise NoIntermediateShardingError(
        "Source and destination shardings must have the same rank (same"
        f" PartitionSpec length). Got in_sharding.spec={in_sharding.spec} and"
        f" out_sharding.spec={out_sharding.spec}"
    )

  src_dims = _get_sharding_spec_dims(in_sharding)
  dst_dims = _get_sharding_spec_dims(out_sharding)

  _check_sharding_divisibility(in_sharding, out_sharding, src_dims, dst_dims)

  gcd_shards = jax.tree.map(math.gcd, src_dims, dst_dims)

  # If all of the gcd(src_dim_shards, dst_dim_shards) are 1s, an all-gather is
  # needed as the single replica of the source cannot be presented by any
  # sharded form on the target devices.
  if jax.tree.reduce(operator.mul, gcd_shards, 1) != 1:
    raise NoIntermediateShardingNeededError()

  try:
    split_candidates = _get_split_candidates(
        in_sharding, src_dims, dst_dims, gcd_shards
    )
  except NoIntermediateShardingError as e:
    raise NoIntermediateShardingError(
        f"{e} in_sharding={in_sharding}, out_sharding={out_sharding}"
    ) from e

  intermediate_mesh, intermediate_spec, replicated_axes = (
      _build_intermediate_mesh_and_spec(
          src_mesh, in_sharding.spec, src_dims, dst_dims, split_candidates
      )
  )

  intermediate_sharding = jax.sharding.NamedSharding(
      intermediate_mesh,
      intermediate_spec,
      memory_kind=in_sharding.memory_kind,
  )
  return intermediate_sharding, replicated_axes


def reshard_with_intermediate_sharding(
    x: Any,
    in_sharding: jax.sharding.Sharding,
    out_sharding: jax.sharding.Sharding,
    *,
    donate: bool = False,
    may_alias: bool | None = None,  # pylint: disable=unused-argument
) -> Any:
  """Reshards `x` to `sharding` with intermediate sharding."""
  try:
    intermediate_sharding, replicated_axes_names = find_intermediate_sharding(
        in_sharding, out_sharding
    )
  except NoIntermediateShardingError as e:
    _logger.debug(f"No intermediate sharding needed or found. {e}")
    x_to_reshard = x
  else:
    x_to_reshard = jax.jit(
        _identity,
        out_shardings=intermediate_sharding,
    )(x)
    for split_axis in replicated_axes_names:
      x_to_reshard, *_ = split_by_mesh_axis.split_by_mesh_axis(
          x_to_reshard, split_axis
      )

  return reshard(
      x_to_reshard, out_sharding, donate=donate, may_alias=may_alias
  )