Commit a802f42

SW publisher authored and Jenkins committed
DeepSpeed content for 1.23.0
Signed-off-by: SW publisher <sw_publisher@habana-labs.com>
1 parent 720787e commit a802f42

128 files changed

Lines changed: 11535 additions & 705 deletions

.pre-commit-config.yaml

Lines changed: 0 additions & 89 deletions
This file was deleted.

CODEOWNERS

Lines changed: 2 additions & 51 deletions
@@ -5,55 +5,6 @@
 # Learn more about CODEOWNERS syntax here:
 # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
 
+* oelayan@habana.ai lbachar@habana.ai mkovalenko@habana.ai nsonnenschein@habana.ai snahir@habana.ai mmalekan@habana.ai
 
-# top-level repo folders
-/.github/ @loadams
-/azure/ @loadams
-/benchmarks/ @guanhuawang @tjruwase
-/bin/ @loadams
-/csrc/ @tjruwase
-/deepspeed/ @loadams @tjruwase
-/docker/ @loadams @guanhuawang
-/docs/ @loadams @tjruwase
-/examples/ @jomayeri @tohtana
-/op_builder/ @loadams @tjruwase @jomayeri
-/release/ @loadams @jomayeri
-/requirements/ @loadams
-/scripts/ @loadams @tjruwase
-/tests/ @tjruwase @loadams @tohtana
-
-# deepspeed
-/deepspeed/autotuning/ @loadams
-/deepspeed/checkpoint/ @tjruwase
-/deepspeed/comm/ @guanhuawang
-/deepspeed/compression/ @tjruwase
-/deepspeed/elasticity/ @tjruwase
-/deepspeed/launcher/ @loadams
-/deepspeed/module_inject/ @hwchen2017 @loadams
-/deepspeed/moe/ @tohtana
-/deepspeed/monitor/ @tjruwase
-/deepspeed/nebula/ @tjruwase
-/deepspeed/nvme/ @tjruwase @jomayeri
-/deepspeed/ops/ @tohtana
-/deepspeed/pipe/ @tohtana @loadams
-/deepspeed/profiling/ @loadams
-/deepspeed/sequence/ @tohtana
-/deepspeed/utils/ @tjruwase @tohtana
-
-# inference
-/deepspeed/inference/ @hwchen2017 @tohtana
-/deepspeed/model_implementations/ @tohtana @loadams
-
-# training
-/deepspeed/runtime/ @tjruwase @tohtana
-/deepspeed/runtime/activation_checkpointing/ @tjruwase
-/deepspeed/runtime/checkpoint_engine/ @tjruwase
-/deepspeed/runtime/comm/ @guanhuawang
-/deepspeed/runtime/compression/ @tjruwase
-/deepspeed/runtime/data_pipeline/ @tjruwase
-/deepspeed/runtime/domino/ @guanhuawang @hwchen2017
-/deepspeed/runtime/fp16/ @tjruwase @tohtana
-/deepspeed/runtime/fp16/onebit/ @tjruwase
-/deepspeed/runtime/pipe/ @loadams @tohtana
-/deepspeed/runtime/swap_tensor/ @tjruwase @jomayeri
-/deepspeed/runtime/zero/ @tjruwase @tohtana
+CODEOWNERS oelayan@habana.ai oabramovich@habana.ai

accelerator/hpu_accelerator.py

Lines changed: 9 additions & 0 deletions
@@ -301,6 +301,14 @@ def get_op_builder(self, class_name):
         else:
             return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None
 
+    # shall be removed once moving to torch.compile
+    def wrap_in_hpu_graph(self, module):
+        if self.hpu.is_lazy():
+            module = self.hpu.wrap_in_hpu_graph(module)
+        else:
+            print("Warning: hpu graphs in eager mode is not supported, ignoring")
+        return module
+
     def build_extension(self):
         from torch.utils.cpp_extension import BuildExtension
         return BuildExtension
@@ -309,6 +317,7 @@ def export_envs(self):
         return []
 
     def visible_devices_envs(self):
+        # TODO SW-195658: remove WA to not return HABANA_VISIBLE_MODULES once SW-195657 is resolved
         # Current way deepspeed set this env var is not applicable with all HPU instances
         # User has to follow instructions in:
        # https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
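A minimal usage sketch of the new wrap_in_hpu_graph helper added above (the model and device handling here are illustrative assumptions; it requires an HPU system with habana_frameworks installed):

    import torch
    from deepspeed.accelerator import get_accelerator

    accelerator = get_accelerator()  # resolves to HPU_Accelerator on an HPU system
    model = torch.nn.Linear(16, 16).to(accelerator.device_name())
    # In lazy mode the module is captured as an HPU graph; in eager mode it is
    # returned unchanged after printing the warning above.
    model = accelerator.wrap_in_hpu_graph(model)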

accelerator/real_accelerator.py

Lines changed: 8 additions & 2 deletions
@@ -67,7 +67,7 @@ def get_accelerator():
                 f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
     elif accelerator_name == "xpu.external":
         try:
-            import intel_extension_for_deepspeed  # noqa: F401 # type: ignore
+            from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
         except ImportError as e:
             raise ValueError(
                 f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@@ -224,6 +224,12 @@
         ds_accelerator = CPU_Accelerator()
     elif accelerator_name == "xpu.external":
         # XPU_Accelerator is already imported in detection stage
+        try:
+            from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F811
+        except ImportError as e:
+            raise ValueError(
+                f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+            )
         ds_accelerator = XPU_Accelerator()
     elif accelerator_name == "xpu":
         from .xpu_accelerator import XPU_Accelerator
@@ -258,7 +264,7 @@
 def set_accelerator(accel_obj):
     global ds_accelerator
     _validate_accelerator(accel_obj)
-    if accel_logger is not None:
+    if accel_logger is not None and accel_obj is not None:
         accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
     ds_accelerator = accel_obj
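For context, accelerator_name is typically taken from the DS_ACCELERATOR environment variable; a hedged sketch of selecting the external XPU backend (assumes intel_extension_for_deepspeed is installed):

    import os
    os.environ["DS_ACCELERATOR"] = "xpu.external"  # must be set before the first get_accelerator() call

    from deepspeed.accelerator import get_accelerator
    ds_accel = get_accelerator()  # with this change, also raises ValueError at instantiation if the package is missing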

build.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+hpu.synapse.v1.23.0

csrc/fp_quantizer/fp_quantize.cpp

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 at::Tensor quantize(torch::Tensor& out,
                     torch::Tensor& val,
+                    torch::Tensor& scale,
                     int group_size,
                     int stochastic_rounding,
                     int q_bits,
@@ -59,6 +60,7 @@ at::Tensor quantize(torch::Tensor& out,
 
 void dequantize(torch::Tensor& val,
                 torch::Tensor& val_q,
+                torch::Tensor& scale,
                 int group_size,
                 int q_mantisa_bits,
                 int q_exponent_bits)
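Both entry points now take the scale tensor explicitly. A rough pure-PyTorch sketch of a scale-based grouped quantization round trip (illustrative only; the real kernel targets low-bit floating-point formats, and this helper is a hypothetical integer stand-in):

    import torch

    def fake_quant_roundtrip(val: torch.Tensor, group_size: int, q_bits: int = 8):
        # Per-group absmax scale, mirroring the explicit `scale` argument above.
        groups = val.reshape(-1, group_size)
        scale = groups.abs().amax(dim=1, keepdim=True) / (2**(q_bits - 1) - 1)
        scale = scale.clamp_min(1e-12)  # avoid division by zero for all-zero groups
        q = torch.round(groups / scale).clamp(-(2**(q_bits - 1)), 2**(q_bits - 1) - 1)
        return (q * scale).reshape(val.shape), scale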

deepspeed/autotuning/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def __init__(self, args, active_resources):
         if not os.path.exists(self.results_dir):
             try:
                 os.makedirs(self.results_dir, exist_ok=True)
-                logger.info(f"Created autotuning results directory: {self.exps_dir}")
+                logger.info(f"Created autotuning results directory: {self.results_dir}")
             except:
                 logger.error(
                     f"Failed to create {self.results_dir}, please check results_dir in the autotuning config file is accessible by all the nodes in the job."

deepspeed/autotuning/constants.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@
     "zero_optimization": {
         "stage": 3
     },
-    "memory_break_down": False
+    "memory_breakdown": False
 }
 
 DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}
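The corrected key matches the memory_breakdown option DeepSpeed reads from its runtime config. A minimal config sketch using the fixed spelling (surrounding values are illustrative):

    ds_config = {
        "train_batch_size": 8,  # illustrative
        "zero_optimization": {"stage": 3},
        "memory_breakdown": False,  # the key the autotuner default now agrees with
    }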

deepspeed/comm/ccl.py

Lines changed: 5 additions & 20 deletions
@@ -77,27 +77,12 @@ def run_collective(self, name, **kwargs):
         return CCLHandler(self.ccl_comm_op)
 
     def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
-        use_caching = False
-        if use_caching:
-            match_id = f"{tensor.size()}-{op}"
-            name = "all_reduce_caching"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op)
-            else:
-                return self.run_collective(name=name,
-                                           tensor=tensor,
-                                           op=op,
-                                           match_id=match_id,
-                                           group=group,
-                                           async_op=async_op)
+        name = "all_reduce"
+        if name in self.available_coll:
+            group = self.get_all_ranks_from_group(group)
+            return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
         else:
-            name = "all_reduce"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
-            else:
-                return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
+            return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
 
     def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None):
         name = "inference_all_reduce"

deepspeed/compile/inductor.py

Lines changed: 10 additions & 8 deletions
@@ -4,16 +4,18 @@
 # DeepSpeed Team
 
 import torch
+from deepspeed.utils.torch import required_torch_version
 
 try:
-    import torch.utils._pytree as pytree
-    from torch._functorch.aot_autograd import create_aot_dispatcher_function
-    from torch._inductor.lowering import register_lowering, fallbacks, add_needs_realized_inputs
-    from torch._inductor.ir import TensorBox, FallbackKernel, Layout, IRNode
-    from torch._inductor.virtualized import V
-    from torch._inductor.scheduler import Scheduler
-
-    original_create_aot_dispatcher_function = create_aot_dispatcher_function
+    if required_torch_version(min_version=2.6):
+        import torch.utils._pytree as pytree
+        from torch._functorch.aot_autograd import create_aot_dispatcher_function
+        from torch._inductor.lowering import register_lowering, fallbacks, add_needs_realized_inputs
+        from torch._inductor.ir import TensorBox, FallbackKernel, Layout, IRNode
+        from torch._inductor.virtualized import V
+        from torch._inductor.scheduler import Scheduler
+
+        original_create_aot_dispatcher_function = create_aot_dispatcher_function
 except ImportError:
     pass
 
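required_torch_version gates the inductor-internal imports to torch >= 2.6. A rough re-implementation sketch of its semantics (an assumption for illustration; the real helper lives in deepspeed.utils.torch):

    import torch
    from packaging import version

    def required_torch_version(min_version=None, max_version=None) -> bool:
        # Compare only the major.minor part of the installed torch release.
        installed = version.parse(torch.__version__).release[:2]
        if min_version is not None and installed < version.parse(str(min_version)).release[:2]:
            return False
        if max_version is not None and installed > version.parse(str(max_version)).release[:2]:
            return False
        return True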
