From 683c10dbb1a6bb0affc1cfbf92bcc462d352fabf Mon Sep 17 00:00:00 2001 From: Adam Sellers Date: Mon, 30 Mar 2026 22:48:01 +1100 Subject: [PATCH 1/3] Fix metal::vec namespace and metal_stdlib include for Metal Toolchain 32023 (macOS 26) Metal Toolchain 32023 (Xcode 26, macOS 26 Tahoe) introduced two breaking changes: 1. vec is no longer in the global namespace - it must be qualified as metal::vec 2. <metal_math> no longer transitively includes <metal_stdlib>, so bfloat (aliased as bfloat16_t in bf16.h) is not in scope without an explicit include Without this fix, MLX silently falls back to CPU dispatch on macOS 26, resulting in ~50% of expected throughput despite mx.default_device() reporting Device(gpu, 0). Tested on M3 Ultra, macOS 26.4 (25E246), Metal Toolchain 32023.883: - Before: ~17 tok/sec generation, GPU 2% active - After: ~35 tok/sec generation, GPU 100% active @ 1380MHz --- mlx/backend/metal/kernels/utils.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mlx/backend/metal/kernels/utils.h b/mlx/backend/metal/kernels/utils.h index d356450101..cd1dca2af8 100644 --- a/mlx/backend/metal/kernels/utils.h +++ b/mlx/backend/metal/kernels/utils.h @@ -2,6 +2,7 @@ #pragma once +#include #include #include "mlx/backend/metal/kernels/bf16.h" @@ -147,13 +148,13 @@ METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const int64_t strides[3]) { // Multiple Arrays with generic dims template -METAL_FUNC vec elem_to_loc_2_nd( +METAL_FUNC metal::vec elem_to_loc_2_nd( uint3 elem, constant const int* shape, constant const int64_t* a_strides, constant const int64_t* b_strides, int ndim) { - vec loc = { + metal::vec loc = { IdxT( elem.x * IdxT(a_strides[ndim - 1]) + IdxT(elem.y) * IdxT(a_strides[ndim - 2])), @@ -170,14 +171,14 @@ METAL_FUNC vec elem_to_loc_2_nd( } template -METAL_FUNC vec elem_to_loc_3_nd( +METAL_FUNC metal::vec elem_to_loc_3_nd( uint3 elem, constant const int* shape, constant const int64_t* a_strides, constant const int64_t* b_strides, constant const int64_t* 
c_strides, int ndim) { - vec loc = { + metal::vec loc = { IdxT(elem.x * IdxT(a_strides[ndim - 1])) + IdxT(elem.y * IdxT(a_strides[ndim - 2])), IdxT(elem.x * IdxT(b_strides[ndim - 1])) + From b3bd55505fb912c712b57132e311aba03c4b34d8 Mon Sep 17 00:00:00 2001 From: Adam Sellers Date: Mon, 30 Mar 2026 22:53:28 +1100 Subject: [PATCH 2/3] Apply clang-format --- mlx/backend/metal/kernels/utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlx/backend/metal/kernels/utils.h b/mlx/backend/metal/kernels/utils.h index cd1dca2af8..50b7170d28 100644 --- a/mlx/backend/metal/kernels/utils.h +++ b/mlx/backend/metal/kernels/utils.h @@ -2,8 +2,8 @@ #pragma once -#include #include +#include #include "mlx/backend/metal/kernels/bf16.h" #include "mlx/backend/metal/kernels/bf16_math.h" From bcfb117f2007a4e2fb016e2fa1f6aa1d2c77ea41 Mon Sep 17 00:00:00 2001 From: Adam Sellers Date: Tue, 31 Mar 2026 13:40:29 +1100 Subject: [PATCH 3/3] Trigger CI