ml-explore
diff --git a/docs/src/dev/extensions.rst b/docs/src/dev/extensions.rst
Lines changed: 2 additions & 2 deletions
diff --git a/examples/extensions/axpby/axpby.cpp b/examples/extensions/axpby/axpby.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/mlx/backend/cuda/quantized/quantized.cpp b/mlx/backend/cuda/quantized/quantized.cpp
Lines changed: 1 addition & 2 deletions
diff --git a/mlx/backend/metal/allocator.cpp b/mlx/backend/metal/allocator.cpp
Lines changed: 6 additions & 7 deletions
diff --git a/mlx/backend/metal/allocator.h b/mlx/backend/metal/allocator.h
Lines changed: 2 additions & 3 deletions
diff --git a/mlx/backend/metal/binary.cpp b/mlx/backend/metal/binary.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/mlx/backend/metal/compiled.cpp b/mlx/backend/metal/compiled.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/mlx/backend/metal/conv.cpp b/mlx/backend/metal/conv.cpp
Lines changed: 13 additions & 13 deletions
diff --git a/mlx/backend/metal/copy.cpp b/mlx/backend/metal/copy.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/mlx/backend/metal/custom_kernel.cpp b/mlx/backend/metal/custom_kernel.cpp
Lines changed: 2 additions & 2 deletions
@@ -404,7 +404,7 @@ below.
         auto kernel = d.get_kernel(kname, lib);
 
         // Prepare to encode kernel
-        auto& compute_encoder = d.get_command_encoder(s.index);
+        auto& compute_encoder = metal::get_command_encoder(s);
         compute_encoder.set_compute_pipeline_state(kernel);
 
         // Kernel parameters are registered with buffer indices corresponding to
@@ -448,7 +448,7 @@ We can now call the :meth:`axpby` operation on both the CPU and the GPU!
 
 A few things to note about MLX and Metal before moving on. MLX keeps track of
 the active ``command_buffer`` and the ``MTLCommandBuffer`` to which it is
-associated. We rely on :meth:`d.get_command_encoder` to give us the active
+associated. We rely on :meth:`metal::get_command_encoder` to give us the active
 metal compute command encoder instead of building a new one and calling
 :meth:`compute_encoder->end_encoding` at the end. MLX adds kernels (compute
 pipelines) to the active command buffer until some specified limit is hit or
 
@@ -192,7 +192,7 @@ void Axpby::eval_gpu(
   auto kernel = d.get_kernel(kname, lib);
 
   // Prepare to encode kernel
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
 
   // Kernel parameters are registered with buffer indices corresponding to
 
@@ -109,8 +109,7 @@ void fast::Quantize::eval_gpu(
     std::vector<array>& outputs) {
   nvtx3::scoped_range r("Quantize::eval_gpu");
   auto& s = stream();
-  auto& d = cu::device(s.device);
-  auto& enc = d.get_command_encoder(s);
+  auto& enc = cu::get_command_encoder(s);
   if (dequantize_) {
     auto wq = ensure_row_contiguous(inputs[0], enc, s);
     auto scales = ensure_row_contiguous(inputs[1], enc, s);
 
@@ -31,8 +31,9 @@ void* Buffer::raw_ptr() {
 
 namespace metal {
 
-MetalAllocator::MetalAllocator()
-    : device_(device(mlx::core::Device::gpu).mtl_device()),
+MetalAllocator::MetalAllocator(Device& d)
+    : device_(d.mtl_device()),
+      residency_set_(d.residency_set()),
       buffer_cache_(
           vm_page_size,
           [](MTL::Buffer* buf) { return buf->length(); },
@@ -42,8 +43,7 @@ MetalAllocator::MetalAllocator()
             }
             auto pool = metal::new_scoped_memory_pool();
             buf->release();
-          }),
-      residency_set_(device_) {
+          }) {
   const auto& info = gpu::device_info(0);
   auto memsize = std::get<size_t>(info.at("memory_size"));
   auto max_rec_size =
@@ -52,8 +52,6 @@ MetalAllocator::MetalAllocator()
   block_limit_ = std::min(1.5 * max_rec_size, 0.95 * memsize);
   gc_limit_ = std::min(static_cast<size_t>(0.95 * max_rec_size), block_limit_);
   max_pool_size_ = block_limit_;
-  device(mlx::core::Device::gpu)
-      .set_residency_set(residency_set_.mtl_residency_set());
   bool is_vm = std::get<std::string>(info.at("device_name")) ==
       "Apple Paravirtual device";
   if (is_vm) {
@@ -226,7 +224,8 @@ MetalAllocator& allocator() {
   // By creating the |allocator_| on heap, the destructor of MetalAllocator
   // will not be called on exit and buffers in the cache will be leaked. This
   // can save some time at program exit.
-  static MetalAllocator* allocator_ = new MetalAllocator;
+  static MetalAllocator* allocator_ =
+      new MetalAllocator(device(mlx::core::Device::gpu));
   return *allocator_;
 }
 
 
@@ -9,7 +9,6 @@
 #include "mlx/allocator.h"
 #include "mlx/backend/common/buffer_cache.h"
 #include "mlx/backend/metal/device.h"
-#include "mlx/backend/metal/resident.h"
 
 namespace mlx::core::metal {
 
@@ -52,13 +51,13 @@ class MetalAllocator : public allocator::Allocator {
   static constexpr int small_size_ = 256;
   static constexpr int heap_size_ = 1 << 20;
 
-  MetalAllocator();
+  MetalAllocator(Device& d);
   ~MetalAllocator();
 
   friend MetalAllocator& allocator();
 
   NS::SharedPtr<MTL::Heap> heap_;
-  ResidencySet residency_set_;
+  ResidencySet& residency_set_;
 
   // Caching allocator
   BufferCache<MTL::Buffer> buffer_cache_;
 
@@ -106,7 +106,7 @@ void binary_op_gpu_inplace(
   auto kernel = outputs.size() == 2
       ? get_binary_two_kernel(d, kernel_name, a.dtype(), out.dtype(), op)
       : get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
 
   int arg_idx = 0;
 
@@ -389,7 +389,7 @@ void Compiled::eval_gpu(
     kernel_name += "_large";
   }
   auto kernel = d.get_kernel(kernel_name, lib);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
 
   // Put the inputs in
 
@@ -26,7 +26,7 @@ ensure_row_contiguous(const array& x, metal::Device& d, const Stream& s) {
     return x;
   }
   auto result = contiguous_copy_gpu(x, s);
-  d.add_temporary(result, s.index);
+  metal::get_command_encoder(s).add_temporary(result);
   return result;
 }
 
@@ -52,7 +52,7 @@ void explicit_gemm_conv_ND_gpu(
   std::string kname;
   kname.reserve(32);
   concatenate(kname, "naive_unfold_nd_", type_to_name(in_unfolded), "_", N);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = d.get_kernel(kname);
   compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -132,7 +132,7 @@ void explicit_gemm_conv_group_ND_gpu(
   kname.reserve(32);
   concatenate(
       kname, "naive_unfold_transpose_nd_", type_to_name(in_unfolded), "_", N);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = d.get_kernel(kname);
   compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -286,7 +286,7 @@ void implicit_gemm_conv_2D_gpu(
       small_filter ? 's' : 'l');
 
   // Encode and dispatch kernel
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = get_steel_conv_kernel(
       d,
       kname,
@@ -469,7 +469,7 @@ void implicit_gemm_conv_2D_general_gpu(
   };
 
   // Encode and dispatch kernel
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = get_steel_conv_general_kernel(
       d, kname, hash_name, func_consts, out, bm, bn, bk, wm, wn);
   compute_encoder.set_compute_pipeline_state(kernel);
@@ -595,7 +595,7 @@ void implicit_gemm_conv_3D_gpu(
       small_filter ? 's' : 'l');
 
   // Encode and dispatch kernel
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel =
       get_steel_conv_3d_kernel(d, kname, out, bm, bn, bk, wm, wn, small_filter);
   compute_encoder.set_compute_pipeline_state(kernel);
@@ -644,7 +644,7 @@ void pad_and_slice_conv_3D_gpu(
     array x_copy(xshape, x.dtype(), nullptr, {});
     array zero(0, x.dtype());
     pad_gpu(x, zero, x_copy, {0, -1}, {0, 0}, s);
-    d.add_temporary(x_copy, s.index);
+    metal::get_command_encoder(s).add_temporary(x_copy);
 
     return x_copy;
   };
@@ -804,7 +804,7 @@ void winograd_conv_2D_gpu(
         type_to_name(out),
         "_bc",
         bc);
-    auto& compute_encoder = d.get_command_encoder(s.index);
+    auto& compute_encoder = metal::get_command_encoder(s);
     auto kernel = d.get_kernel(kname);
     compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -837,7 +837,7 @@ void winograd_conv_2D_gpu(
         type_to_name(out),
         "_bc",
         bc);
-    auto& compute_encoder = d.get_command_encoder(s.index);
+    auto& compute_encoder = metal::get_command_encoder(s);
     auto kernel = d.get_kernel(kname);
     compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -889,7 +889,7 @@ void winograd_conv_2D_gpu(
         type_to_name(out),
         "_bo",
         bc);
-    auto& compute_encoder = d.get_command_encoder(s.index);
+    auto& compute_encoder = metal::get_command_encoder(s);
     auto kernel = d.get_kernel(kname);
     compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -950,7 +950,7 @@ void depthwise_conv_2D_gpu(
   "_tgp_w_", tw,
   "_do_flip_", do_flip ? 't' : 'n'); // clang-format on
 
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = d.get_kernel(base_name, hash_name, func_consts);
   compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -1044,7 +1044,7 @@ void depthwise_conv_1D_gpu(
       type_to_name(out),
       large ? "_large" : "");
 
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   auto kernel = d.get_kernel(base_name);
   compute_encoder.set_compute_pipeline_state(kernel);
 
@@ -1348,7 +1348,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
 
   // Record copies
   if (!copies.empty()) {
-    d.add_temporaries(std::move(copies), s.index);
+    metal::get_command_encoder(s).add_temporaries(std::move(copies));
   }
 }
 
 
@@ -107,7 +107,7 @@ void copy_gpu_inplace(
   auto kernel = dynamic ? get_dynamic_copy_kernel(d, kernel_name, in, out)
                         : get_copy_kernel(d, kernel_name, in, out);
 
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
 
   inp_offset *= size_of(in.dtype());
@@ -190,7 +190,7 @@ void fill_gpu(const array& val, array& out, const Stream& s) {
   std::string kernel_name = large ? "s2" : (work_per_thread > 1 ? "sn" : "s");
   concatenate(kernel_name, "_copy", type_to_name(val), type_to_name(out));
   auto kernel = get_copy_kernel(d, kernel_name, val, out);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
 
   compute_encoder.set_input_array(val, 0);
 
@@ -378,7 +378,7 @@ void CustomKernel::eval_gpu(
 
   auto lib = d.get_library(name_, [this] { return metal::utils() + source_; });
   auto kernel = d.get_kernel(name_, lib);
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = metal::get_command_encoder(s);
   compute_encoder.set_compute_pipeline_state(kernel);
   int index = 0;
   for (int i = 0; i < checked_inputs.size(); i++) {
@@ -424,7 +424,7 @@ void CustomKernel::eval_gpu(
   MTL::Size grid_dims = MTL::Size(gx, gy, gz);
   compute_encoder.dispatch_threads(grid_dims, group_dims);
 
-  d.add_temporaries(std::move(copies), s.index);
+  compute_encoder.add_temporaries(std::move(copies));
 }
 
 } // namespace mlx::core::fast
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -389,7 +389,7 @@ void Compiled::eval_gpu(` |
| `389` | `389` | `kernel_name += "_large";` |
| `390` | `390` | `}` |
| `391` | `391` | `auto kernel = d.get_kernel(kernel_name, lib);` |
| `392` |  | `- auto& compute_encoder = d.get_command_encoder(s.index);` |
|  | `392` | `+ auto& compute_encoder = metal::get_command_encoder(s);` |
| `393` | `393` | `compute_encoder.set_compute_pipeline_state(kernel);` |
| `394` | `394` |  |
| `395` | `395` | `// Put the inputs in` |