2 changes: 1 addition & 1 deletion .github/workflows/tests-nightly.yml
@@ -20,7 +20,7 @@ jobs:
         platform: [linux-x64, linux-aarch64, macos, windows]
         # default runners don't have AVX-512 support, but icelake does
         cpu_type: ["", icelake]
-        torch_version: ["2.4.1", "2.10.0", "2.11.0"]
+        torch_version: ["2.4.1", "2.12.0", "nightly"]
 
         exclude:
           # aarch64 minimum torch version is 2.5.1
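The matrix above expands to the cross product of `platform`, `cpu_type`, and `torch_version`, and the `exclude` list prunes invalid combinations such as aarch64 paired with torch 2.4.1. A hypothetical Python sketch of that expansion (the excluded pair is an assumption inferred from the comment, not copied from the workflow; `cpu_type` is omitted for brevity):

```python
from itertools import product

platforms = ["linux-x64", "linux-aarch64", "macos", "windows"]
torch_versions = ["2.4.1", "2.12.0", "nightly"]

# Assumption: mirrors the "aarch64 minimum torch version is 2.5.1" comment.
excluded = {("linux-aarch64", "2.4.1")}

# Expand the matrix, then drop excluded combinations.
jobs = [(p, v) for p, v in product(platforms, torch_versions) if (p, v) not in excluded]
print(len(jobs))  # 4 platforms x 3 versions = 12, minus 1 excluded -> 11
```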
2 changes: 1 addition & 1 deletion .github/workflows/tests-pr.yml
@@ -31,7 +31,7 @@ jobs:
         platform: [linux-x64, linux-aarch64, macos]
         # default runners don't have AVX-512 support, but icelake does
         cpu_type: ["", icelake]
-        torch_version: ["2.4.1", "2.11.0"]
+        torch_version: ["2.4.1", "2.12.0"]
 
         exclude:
           # aarch64 minimum torch version is 2.5.1
4 changes: 2 additions & 2 deletions bitsandbytes/backends/cpu/ops.py
@@ -153,9 +153,9 @@ def _(
         lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
     )
 
-    # Fallback as AVX512 implementation has accuracy issues with fp16/fp32 and blocksize >= 2048
+    # Fallback as AVX512 implementation has accuracy issues with blocksize >= 2048.
     # Note: this is not a common use case.
-    avx512_fallback = _has_avx512 and blocksize >= 2048 and dtype != torch.bfloat16
+    avx512_fallback = _has_avx512 and blocksize >= 2048
 
     # Odd shape is not supported by this kernel; fallback to generic implementation
     shape_fallback = shape[-1] % 2 != 0
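Read in isolation, the dispatch predicate after this change is: the AVX-512 kernel is bypassed for any dtype once blocksize >= 2048, whereas the old code bypassed it only for fp16/fp32. A minimal sketch of that predicate (not the library's actual dispatcher, just the condition the diff leaves behind):

```python
def should_use_generic_path(has_avx512: bool, blocksize: int, last_dim: int) -> bool:
    # AVX-512 path has accuracy issues at large block sizes, now for every dtype.
    avx512_fallback = has_avx512 and blocksize >= 2048
    # The vectorized kernel also cannot handle an odd last dimension.
    shape_fallback = last_dim % 2 != 0
    return avx512_fallback or shape_fallback
```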
9 changes: 7 additions & 2 deletions tests/test_linear4bit.py
@@ -365,8 +365,13 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
 
-    if device == "cuda" and platform.system() == "Windows":
-        pytest.skip("Triton is not officially supported on Windows")
+    if platform.system() == "Windows":
+        if device == "cuda":
+            pytest.skip("Triton is not officially supported on Windows")
+        if device == "cpu" and torch.__version__ < (2, 7):
+            # torch.compile inductor on Windows CPU has include path bugs fixed in torch 2.7
+            # https://github.com/pytorch/pytorch/pull/148271
+            pytest.skip("torch.compile inductor on Windows CPU requires torch >= 2.7")
 
     # Has a strange regression on Linux aarch64 CPU in torch==2.6.0 when fullgraph=False.
     if (
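Comparisons like `torch.__version__ < (2, 7)` in these tests work because `torch.__version__` is a `TorchVersion`, a `str` subclass that supports rich comparison against version strings and int tuples. A quick standalone check, assuming only that torch is installed:

```python
import torch

# TorchVersion compares against tuples of ints as well as version strings.
print(type(torch.__version__))       # typically torch.torch_version.TorchVersion
print(torch.__version__ >= (2, 7))   # True on torch 2.7 and later
print(torch.__version__ >= "2.7.0")  # equivalent string form
```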
9 changes: 7 additions & 2 deletions tests/test_linear8bitlt.py
@@ -261,8 +261,13 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
     if fullgraph and torch.__version__ < (2, 5):
         pytest.skip("fullgraph tracing of MatmulLtState requires torch >= 2.5")
 
-    if device == "cuda" and platform.system() == "Windows":
-        pytest.skip("Triton is not officially supported on Windows")
+    if platform.system() == "Windows":
+        if device == "cuda":
+            pytest.skip("Triton is not officially supported on Windows")
+        if device == "cpu" and torch.__version__ < (2, 7):
+            # torch.compile inductor on Windows CPU has include path bugs fixed in torch 2.7
+            # https://github.com/pytorch/pytorch/pull/148271
+            pytest.skip("torch.compile inductor on Windows CPU requires torch >= 2.7")
 
     if device == "cuda" and mode == "reduce-overhead" and fullgraph and threshold > 0 and torch.__version__ >= (2, 10):
         pytest.xfail("Failure due to regression in torch 2.10 related to reduced overhead mode and CUDA.")
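The mix of `pytest.skip` and `pytest.xfail` in the hunks above is deliberate: `skip` aborts the test and reports it as SKIPPED, while imperative `xfail` also stops the test but records it as an expected failure (XFAIL). A minimal hypothetical example, not taken from this suite:

```python
import platform

import pytest
import torch

def test_example():
    if platform.system() == "Windows":
        pytest.skip("environment cannot run this test")  # reported as SKIPPED
    if torch.__version__ >= (2, 10):
        pytest.xfail("known upstream regression")        # reported as XFAIL
    assert True
```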