diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 85959682d..db8a1d6d2 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -20,7 +20,7 @@ jobs: platform: [linux-x64, linux-aarch64, macos, windows] # default runners don't have AVX-512 support, but icelake does cpu_type: ["", icelake] - torch_version: ["2.4.1", "2.10.0", "2.11.0"] + torch_version: ["2.4.1", "2.12.0", "nightly"] exclude: # aarch64 minimum torch version is 2.5.1 diff --git a/.github/workflows/tests-pr.yml b/.github/workflows/tests-pr.yml index e75cccfcc..f04631a87 100644 --- a/.github/workflows/tests-pr.yml +++ b/.github/workflows/tests-pr.yml @@ -31,7 +31,7 @@ jobs: platform: [linux-x64, linux-aarch64, macos] # default runners don't have AVX-512 support, but icelake does cpu_type: ["", icelake] - torch_version: ["2.4.1", "2.11.0"] + torch_version: ["2.4.1", "2.12.0"] exclude: # aarch64 minimum torch version is 2.5.1 diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py index 6b82c2421..a6277e5cf 100755 --- a/bitsandbytes/backends/cpu/ops.py +++ b/bitsandbytes/backends/cpu/ops.py @@ -153,9 +153,9 @@ def _( lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}", ) - # Fallback as AVX512 implementation has accuracy issues with fp16/fp32 and blocksize >= 2048 + # Fallback as AVX512 implementation has accuracy issues with blocksize >= 2048. # Note: this is not a common use case. - avx512_fallback = _has_avx512 and blocksize >= 2048 and dtype != torch.bfloat16 + avx512_fallback = _has_avx512 and blocksize >= 2048 # Odd shape is not supported by this kernel; fallback to generic implementation shape_fallback = shape[-1] % 2 != 0 diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 8be220139..ee1433641 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -365,8 +365,13 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st if fullgraph and torch.__version__ < (2, 8, 0, "dev"): pytest.skip("fullgraph mode requires torch 2.8 or higher") - if device == "cuda" and platform.system() == "Windows": - pytest.skip("Triton is not officially supported on Windows") + if platform.system() == "Windows": + if device == "cuda": + pytest.skip("Triton is not officially supported on Windows") + if device == "cpu" and torch.__version__ < (2, 7): + # torch.compile inductor on Windows CPU has include path bugs fixed in torch 2.7 + # https://github.com/pytorch/pytorch/pull/148271 + pytest.skip("torch.compile inductor on Windows CPU requires torch >= 2.7") # Has a strange regression on Linux aarch64 CPU in torch==2.6.0 when fullgraph=False. if ( diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 43d4a4942..b078e82b7 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -261,8 +261,13 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode): if fullgraph and torch.__version__ < (2, 5): pytest.skip("fullgraph tracing of MatmulLtState requires torch >= 2.5") - if device == "cuda" and platform.system() == "Windows": - pytest.skip("Triton is not officially supported on Windows") + if platform.system() == "Windows": + if device == "cuda": + pytest.skip("Triton is not officially supported on Windows") + if device == "cpu" and torch.__version__ < (2, 7): + # torch.compile inductor on Windows CPU has include path bugs fixed in torch 2.7 + # https://github.com/pytorch/pytorch/pull/148271 + pytest.skip("torch.compile inductor on Windows CPU requires torch >= 2.7") if device == "cuda" and mode == "reduce-overhead" and fullgraph and threshold > 0 and torch.__version__ >= (2, 10): pytest.xfail("Failure due to regression in torch 2.10 related to reduced overhead mode and CUDA.")