
Commit d8d865f

Authored by delock, loadams, Liangliang-Ma, Quentin-Anthony, and dashstander
[Fix] Fix cpu inference UT failure (#4430)
This PR fixes the CPU inference UT failures described in #4419 and the following test job:
https://github.com/microsoft/DeepSpeed/actions/runs/6341645987/job/17235544538

It skips `TestModelTask` if the dtype is not supported by the accelerator, or if `InferenceBuilder` is not implemented by the accelerator.

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Liangliang-Ma <1906710196@qq.com>
Co-authored-by: Quentin Anthony <qganthony@yahoo.com>
Co-authored-by: Dashiell Stander <dash.stander@gmail.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
Co-authored-by: Xie Zejian <xiezej@gmail.com>
Co-authored-by: Conglong Li <conglong.li@gmail.com>
Co-authored-by: Michael Wyatt <michaelwyatt@microsoft.com>
1 parent 75db3d7 · commit d8d865f

6 files changed, with 94 additions and 29 deletions


.github/workflows/cpu-inference.yml

Lines changed: 43 additions & 6 deletions

@@ -1,22 +1,43 @@
 name: cpu-inference
 
 on:
+  pull_request:
+    paths-ignore:
+      - 'docs/**'
+      - 'blogs/**'
   workflow_dispatch:
+  merge_group:
+    branches: [ master ]
+
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: [self-hosted, cpu]
 
     steps:
       - uses: actions/checkout@v3
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
+      - name: Install gcc-9
+        run: |
+          sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
+          sudo apt install -y gcc-9 g++-9
+          # set gcc-9 and g++9 to default
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
+
+      - name: Check gcc version
+        run: |
+          # Get gcc version
+          gcc --version
+          g++ --version
+
       - name: Detect instruction sets on instance
         run: |
           lscpu
@@ -33,8 +54,16 @@ jobs:
 
       - name: Install oneCCL Bindings for PyTorch
         run: |
+          pip install torch
           python -m pip install intel_extension_for_pytorch
-          python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+          # the curl line is for troubleshooting
+          curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+          python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+          pip install py-cpuinfo
+          # check installed version
+          pip list |grep \\\<torch\\\>
+          pip list |grep intel-extension-for-pytorch
+          pip list |grep oneccl-bind-pt
 
       - name: Install oneCCL
         run: |
@@ -62,14 +91,22 @@ jobs:
           pip install .[dev,1bit,autotuning,inf]
           ds_report
 
-      - name: Python environment
+      - name: Python environment check
        run: |
          pip list
+          source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
+          # check whether the environment is properly setup
+          python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')"
+          python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
 
       - name: Unit tests
         run: |
+          # prep oneCCL for CCLBackend comm ops building
           source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
+          cd tests
+          # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner
+          LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+          LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
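The new LOCAL_SIZE=2 variable is what lets a single CPU host stand in for a two-device setup during the tests. A minimal sketch of how an accelerator's device_count() could honor such an override (illustrative only; the real logic lives in DeepSpeed's CPU accelerator and may differ):

import os
import multiprocessing

def device_count() -> int:
    # Honor an explicit LOCAL_SIZE override (as the workflow above sets),
    # otherwise fall back to a heuristic based on the core count.
    local_size = os.environ.get('LOCAL_SIZE')
    if local_size is not None:
        return int(local_size)
    return max(1, multiprocessing.cpu_count() // 2)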

csrc/cpu/comm/ccl.cpp

Lines changed: 2 additions & 2 deletions

@@ -537,7 +537,7 @@ static void parallel_memcpy(void* to, void* from, size_t n_bytes)
     }
 }
 
-void inference_all_reduce(torch::Tensor& data, py::object op, std::vector<int> group, bool async_op)
+void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op)
 {
     static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp");
     static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value"));
@@ -562,7 +562,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, std::vector<int> g
                           data.numel(),
                           get_ccl_datatype(data.scalar_type()),
                           get_ccl_reduce_op(op, data),
-                          _get_comm_from_group(group))
+                          _get_comm_from_group())
                .wait());
     return;
 }

deepspeed/comm/ccl.py

Lines changed: 31 additions & 15 deletions

@@ -61,7 +61,8 @@ def is_initialized(self):
 
     def run_collective(self, name, **kwargs):
         if name in self.available_coll:
-            kwargs['group'] = self.get_all_ranks_from_group(kwargs['group'])
+            if 'group' in kwargs:
+                kwargs['group'] = self.get_all_ranks_from_group(kwargs['group'])
             if 'dst' in kwargs:
                 kwargs['dst'] = kwargs['group'].index(kwargs['dst'])
             if 'src' in kwargs:
@@ -71,23 +72,38 @@ def run_collective(self, name, **kwargs):
             return CCLHandler(self.ccl_comm_op)
         else:
             func = "super(CCLBackend, self)." + name
-            return eval(func)(*(kwargs.values()))
+            eval(func)(*(kwargs.values()))
+            return CCLHandler(self.ccl_comm_op)
 
     def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
         use_caching = False
         if use_caching:
             match_id = f"{tensor.size()}-{op}"
-            return self.run_collective(name="all_reduce_caching",
-                                       tensor=tensor,
-                                       op=op,
-                                       match_id=match_id,
-                                       group=group,
-                                       async_op=async_op)
+            name = "all_reduce_caching"
+            if name in self.available_coll:
+                group = self.get_all_ranks_from_group(group)
+                return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op)
+            else:
+                return self.run_collective(name=name,
+                                           tensor=tensor,
+                                           op=op,
+                                           match_id=match_id,
+                                           group=group,
+                                           async_op=async_op)
         else:
-            return self.run_collective(name="all_reduce", tensor=tensor, op=op, group=group, async_op=async_op)
+            name = "all_reduce"
+            if name in self.available_coll:
+                group = self.get_all_ranks_from_group(group)
+                return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
+            else:
+                return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
 
     def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
-        return self.run_collective(name="inference_all_reduce", tensor=tensor, op=op, group=group, async_op=async_op)
+        name = "inference_all_reduce"
+        if name in self.available_coll:
+            return self.ccl_comm_op.inference_all_reduce(tensor, op, async_op)
+        else:
+            return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op)
 
     def broadcast(self, tensor, src, group=None, async_op=False):
         return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op)
@@ -120,11 +136,11 @@ def all_to_all_single(self, output, input, output_split_sizes, input_split_sizes
                                    input_split_sizes=input_split_sizes,
                                    group=group)
 
-    def send(self, tensor, dst, group=None, async_op=False):
-        return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, async_op=async_op)
+    def send(self, tensor, dst, group=None, tag=0):
+        return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, tag=tag)
 
-    def recv(self, tensor, src, group=None, async_op=False):
-        return self.run_collective(name="recv", tensor=tensor, src=src, group=group, async_op=async_op)
+    def recv(self, tensor, src, group=None, tag=0):
+        return self.run_collective(name="recv", tensor=tensor, src=src, group=group, tag=tag)
 
     def gather(self, tensor, gather_list, dst, group=None, async_op=False):
         return self.run_collective(name="gather", tensor=tensor, gather_list=gather_list, dst=dst, group=group)
@@ -170,7 +186,7 @@ def get_all_ranks_from_group(self, group):
             while True:
                 results.append(super(CCLBackend, self).get_global_rank(group, rank))
                 rank += 1
-        except ValueError:
+        except (ValueError, RuntimeError):
             pass
         if tuple(results) not in self.groups:
             self._new_group(results, group)
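Taken together, these changes give every collective the same dispatch shape: call the compiled CCL op when available_coll lists it, otherwise fall back to the torch-based parent backend. A simplified sketch of that pattern (class and fallback names here are illustrative, not the exact DeepSpeed API):

class CCLDispatchSketch:
    def __init__(self, ccl_comm_op, available_coll):
        self.ccl_comm_op = ccl_comm_op              # compiled comm-ops module
        self.available_coll = set(available_coll)   # collectives it implements

    def inference_all_reduce(self, tensor, op, async_op=False):
        if 'inference_all_reduce' in self.available_coll:
            # Fast path: the fused kernel. After this PR it takes no group
            # argument; the C++ side resolves the communicator itself via
            # _get_comm_from_group() with no arguments.
            return self.ccl_comm_op.inference_all_reduce(tensor, op, async_op)
        # Slow path: forward to the torch.distributed-based parent backend.
        return self.fallback(name='inference_all_reduce', tensor=tensor,
                             op=op, group=None, async_op=async_op)

    def fallback(self, name, **kwargs):
        raise NotImplementedError  # parent-backend dispatch, elided here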

docs/_tutorials/accelerator-abstraction-interface.md

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ To run DeepSpeed model on CPU, use the following steps to prepare environment:
 
 ```
 python -m pip install intel_extension_for_pytorch
-python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu
 git clone https://github.com/oneapi-src/oneCCL
 cd oneCCL
 mkdir build
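After these install steps, the same import check used by the workflow's "Python environment check" step above confirms the stack is wired correctly (oneccl_bindings_for_pytorch is the import name of the oneccl_bind_pt wheel):

# Sanity check mirroring the workflow step above: all three imports must
# succeed before DeepSpeed's CPU/CCL path can work.
import torch
import intel_extension_for_pytorch as ipex
import oneccl_bindings_for_pytorch  # registers the oneCCL backend with torch.distributed
print('done')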

tests/unit/inference/test_inference.py

Lines changed: 15 additions & 3 deletions

@@ -5,6 +5,7 @@
 
 import os
 import time
+import pickle
 import torch
 import pytest
 import itertools
@@ -65,7 +66,13 @@
 ]
 
 # Get a list of all models and mapping from task to supported models
-_hf_models = list(HfApi().list_models())
+try:
+    with open("hf_models.pkl", "rb") as fp:
+        _hf_models = pickle.load(fp)
+except FileNotFoundError:
+    _hf_models = list(HfApi().list_models())
+    with open("hf_models.pkl", "wb") as fp:
+        pickle.dump(_hf_models, fp)
 _hf_model_names = [m.modelId for m in _hf_models]
 _hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks}
 
@@ -280,6 +287,12 @@ def test(
         if invalid_test_msg:
             pytest.skip(invalid_test_msg)
 
+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
+        if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+            pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
+
         model, task = model_w_task
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
 
@@ -536,9 +549,8 @@ def test(
         if dtype not in get_accelerator().supported_dtypes():
             pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
 
-        # TODO: enable this test after torch 2.1 stable release
         if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono":
-            pytest.skip("Codegen model(bf16) need to use torch version > 2.0.")
+            pytest.skip("Disable Codegen model(bf16) due to slight result difference")
 
         model, task = model_w_task
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
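The hf_models.pkl cache above saves the Hugging Face Hub query across test runs, which also shields CI from transient network failures. The same trick generalizes to any expensive, picklable call; a small hypothetical helper (not part of the PR):

import pickle
from pathlib import Path
from typing import Callable, TypeVar

T = TypeVar('T')

def cached_call(cache_path: str, fn: Callable[[], T]) -> T:
    # Return a pickled result if one exists; otherwise compute and cache it.
    path = Path(cache_path)
    if path.exists():
        with path.open('rb') as fp:
            return pickle.load(fp)
    result = fn()
    with path.open('wb') as fp:
        pickle.dump(result, fp)
    return result

# Usage, mirroring the diff:
#   _hf_models = cached_call('hf_models.pkl', lambda: list(HfApi().list_models()))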

tests/unit/inference/test_inference_config.py

Lines changed: 2 additions & 2 deletions

@@ -15,7 +15,7 @@ class TestInferenceConfig(DistributedTest):
     world_size = 1
 
     def test_overlap_kwargs(self):
-        config = {"replace_with_kernel_inject": True}
+        config = {"replace_with_kernel_inject": True, "dtype": torch.float32}
         kwargs = {"replace_with_kernel_inject": True}
 
         engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs)
@@ -37,7 +37,7 @@ def test_kwargs_and_config(self):
         assert engine._config.dtype == kwargs["dtype"]
 
     def test_json_config(self, tmpdir):
-        config = {"replace_with_kernel_inject": True}
+        config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"}
         config_json = create_config_from_dict(tmpdir, config)
 
         engine = deepspeed.init_inference(torch.nn.Module(), config=config_json)
