NVIDIA · jamxia155 · Mar 31, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 13, 2026
@@ -105,6 +105,7 @@ add_library(
   src/preprocessing/quantize/pq.cpp
   src/preprocessing/quantize/scalar.cpp
   src/distance/pairwise_distance.cpp
+  src/selection/select_k.cpp
 )
 add_library(cuvs::c_api ALIAS cuvs_c)
 set_target_properties(

@@ -131,6 +131,23 @@ CUVS_EXPORT cuvsError_t cuvsStreamSync(cuvsResources_t res);
  */
 CUVS_EXPORT cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id);
 
+/**
+ * @brief Configure the temporary workspace on this resources object as an uncapped pool, backed
+ *        by the current device memory resource. After the initial reservation is allocated on
+ *        first use, subsequent calls to cuvsRMMAlloc / cuvsRMMFree on the same resources handle
+ *        hit the pool cache rather than calling cudaMallocAsync / cudaFreeAsync, reducing CUDA
+ *        context lock contention under concurrent query threads. The pool grows without shrinking:
+ *        freed allocations are returned to the pool rather than to the device, so the pool's
+ *        high-water mark only increases until the resources object is destroyed.
+ *
+ * @param[in] res                cuvsResources_t opaque C handle
+ * @param[in] initial_size_bytes initial pool reservation in bytes; size to cover the
+ *                               steady-state working set to avoid growth after warmup
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res,
+                                                      size_t initial_size_bytes);
+
 /**
  * @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg`
  *        for multi-GPU operations
@@ -212,6 +229,19 @@ CUVS_EXPORT cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes
 CUVS_EXPORT cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
                                             int max_pool_size_percent,
                                             bool managed);
+/**
+ * @brief Switches the working memory resource to use stream-ordered asynchronous allocation
+ * (cudaMallocAsync / cudaFreeAsync). Unlike the pool resource, this resource returns memory to
+ * the stream immediately without blocking the CPU, eliminating device-wide synchronization on
+ * deallocation. This is especially beneficial when multiple CAGRA searches run concurrently on
+ * separate CUDA streams, because the internal workspace allocations no longer serialize kernel
+ * launches. Be aware that this function will change the memory resource for the whole process
+ * and the new memory resource will be used until explicitly changed.
+ *
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsRMMAsyncMemoryResourceEnable();
+
 /**
  * @brief Resets the memory resource to use the default memory resource (cuda_memory_resource)
  * @return cuvsError_t

@@ -714,6 +714,44 @@ CUVS_EXPORT cuvsError_t cuvsCagraSearch(cuvsResources_t res,
                             DLManagedTensor* distances,
                             cuvsFilter filter);
 
+/**
+ * @brief Search multiple CAGRA index partitions concurrently and return the global top-k per
+ * query.
+ *
+ * For each query row, the function searches all partitions in parallel into an internal
+ * intermediate buffer, applies per-partition distance post-processing, runs a batched top-k
+ * merge across partitions, and writes the final outputs to the caller-supplied device tensors.
+ * All work is submitted to the CUDA stream associated with @p res; use @c cuvsStreamSync to
+ * wait for completion.
+ *
+ * Only float32 datasets are currently supported.
+ *
+ * @param[in]  res            cuvsResources_t opaque C handle
+ * @param[in]  params         search parameters (shared across partitions)
+ * @param[in]  num_partitions number of index partitions
+ * @param[in]  indices        array of num_partitions cuvsCagraIndex_t pointers
+ * @param[in]  queries        DLManagedTensor* (device, float32, [n_queries, dim]); the queries
+ *                            matrix is searched against every partition
+ * @param[out] partition_ids  DLManagedTensor* (device, uint32, [n_queries, k]); which partition
+ *                            each returned neighbor came from
+ * @param[out] neighbors      DLManagedTensor* (device, uint32 or int64, [n_queries, k]); ordinal
+ *                            in the corresponding partition's dataset
+ * @param[out] distances      DLManagedTensor* (device, float32, [n_queries, k]); post-processed
+ *                            distance for each (query, neighbor)
+ * @param[in]  filter         filter to apply during search; use {.type=NO_FILTER, .addr=0} for
+ *                            unfiltered search, or {.type=MULTI_PARTITION_BITSET, .addr=ptr} where
+ *                            ptr is a uintptr_t-cast cuvsMultiPartitionBitsetFilter*
+ */
+CUVS_EXPORT cuvsError_t cuvsCagraSearchMultiPartition(cuvsResources_t res,
+                                                      cuvsCagraSearchParams_t params,
+                                                      uint32_t num_partitions,
+                                                      cuvsCagraIndex_t* indices,
+                                                      DLManagedTensor* queries,
+                                                      DLManagedTensor* partition_ids,
+                                                      DLManagedTensor* neighbors,
+                                                      DLManagedTensor* distances,
+                                                      cuvsFilter filter);
+
 /**
  * @}
  */

@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include <dlpack/dlpack.h>
 #include <stdint.h>
 
 #include <cuvs/core/export.h>
@@ -28,9 +29,28 @@ enum cuvsFilterType {
   /* Filter an index with a bitset */
   BITSET = 1,
   /* Filter an index with a bitmap */
-  BITMAP = 2
+  BITMAP = 2,
+  /* Filter multiple index partitions with a single concatenated bitset plus per-partition offsets */
+  MULTI_PARTITION_BITSET = 3
 };
 
+/**
+ * @brief Filter parameters for multi-partition search.
+ *
+ * Holds a single device bitset that is the concatenation of per-partition bitsets,
+ * together with a device array of per-partition bit offsets.  Pass a pointer to
+ * this struct (cast to uintptr_t) in cuvsFilter::addr with
+ * cuvsFilter::type == MULTI_PARTITION_BITSET.
+ */
+typedef struct {
+  /** Device tensor (uint32, flat) of packed bitset words for all partitions concatenated. */
+  DLManagedTensor* combined_bitset;
+  /** Total number of logical bits in combined_bitset. */
+  int64_t total_bitset_bits;
+  /** Device tensor (int64, [num_partitions]) of per-partition bit offsets into combined_bitset. */
+  DLManagedTensor* partition_offsets;
+} cuvsMultiPartitionBitsetFilter;
+
 /**
  * @brief Struct to hold address of cuvs::neighbors::prefilter and its type
  *

@@ -0,0 +1,37 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <cuvs/core/c_api.h>
+#include <dlpack/dlpack.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Select the k smallest values from a flat device array of n candidates.
+ *
+ * Treats `in_val` as a matrix of shape [1, n] and selects the `k` smallest
+ * float values.  `out_idx` receives the int64 column positions of the selected
+ * values in [0, n), so the caller can recover per-segment identity as:
+ *
+ *   segment_index        = out_idx[j] / segment_k
+ *   position_in_segment  = out_idx[j] % segment_k
+ *
+ * @param[in]  res      cuvsResources_t handle
+ * @param[in]  in_val   DLManagedTensor* shape [1, n], float32, device memory
+ * @param[out] out_val  DLManagedTensor* shape [1, k], float32, device memory
+ * @param[out] out_idx  DLManagedTensor* shape [1, k], int64,   device memory
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsSelectK(cuvsResources_t res,
+                                    DLManagedTensor* in_val,
+                                    DLManagedTensor* out_val,
+                                    DLManagedTensor* out_idx);
+
+#ifdef __cplusplus
+}
+#endif
@@ -9,11 +9,13 @@
 #include <raft/core/device_resources_snmg.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_id.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resource/resource_types.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rapids_logger/logger.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/cuda_async_memory_resource.hpp>
 #include <rmm/mr/cuda_memory_resource.hpp>
 #include <rmm/mr/managed_memory_resource.hpp>
 #include <rmm/mr/per_device_resource.hpp>
@@ -35,6 +37,19 @@ extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res)
   });
 }
 
+extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto res_ptr = reinterpret_cast<raft::resources*>(res);
+    // Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every
+    // query, but can grow beyond that if an allocation exceeds the initial reservation.
+    raft::resource::set_workspace_resource(
+      *res_ptr,
+      rmm::mr::pool_memory_resource{rmm::mr::get_current_device_resource_ref(),
+                                    initial_size_bytes});
+  });
+}
+
 extern "C" cuvsError_t cuvsResourcesDestroy(cuvsResources_t res)
 {
   return cuvs::core::translate_exceptions([=] {
@@ -132,20 +147,22 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte
 {
   return cuvs::core::translate_exceptions([=] {
     auto res_ptr = reinterpret_cast<raft::resources*>(res);
-    auto mr      = rmm::mr::get_current_device_resource_ref();
-    *ptr         = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes);
+    auto stream  = raft::resource::get_cuda_stream(*res_ptr);
+    *ptr         = raft::resource::get_workspace_resource_ref(*res_ptr).allocate(stream, bytes);
   });
 }
 
 extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes)
 {
   return cuvs::core::translate_exceptions([=] {
     auto res_ptr = reinterpret_cast<raft::resources*>(res);
-    auto mr      = rmm::mr::get_current_device_resource_ref();
-    mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes);
+    auto stream  = raft::resource::get_cuda_stream(*res_ptr);
+    raft::resource::get_workspace_resource_ref(*res_ptr).deallocate(stream, ptr, bytes);
   });
 }
 
+thread_local std::shared_ptr<rmm::mr::cuda_async_memory_resource> async_mr;
+
 extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
                                                        int max_pool_size_percent,
                                                        bool managed)
@@ -164,9 +181,20 @@ extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_per
   });
 }
 
+extern "C" cuvsError_t cuvsRMMAsyncMemoryResourceEnable()
+{
+  return cuvs::core::translate_exceptions([=] {
+    async_mr = std::make_shared<rmm::mr::cuda_async_memory_resource>();
+    rmm::mr::set_current_device_resource(*async_mr);
+  });
+}
+
 extern "C" cuvsError_t cuvsRMMMemoryResourceReset()
 {
-  return cuvs::core::translate_exceptions([=] { rmm::mr::reset_current_device_resource(); });
+  return cuvs::core::translate_exceptions([=] {
+    rmm::mr::reset_current_device_resource();
+    async_mr.reset();
+  });
 }
 
 thread_local std::unique_ptr<rmm::mr::pinned_host_memory_resource> pinned_mr;

@@ -689,6 +689,90 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res,
   });
 }
 
+extern "C" cuvsError_t cuvsCagraSearchMultiPartition(cuvsResources_t res,
+                                                     cuvsCagraSearchParams_t params,
+                                                     uint32_t num_partitions,
+                                                     cuvsCagraIndex_t* indices,
+                                                     DLManagedTensor* queries,
+                                                     DLManagedTensor* partition_ids,
+                                                     DLManagedTensor* neighbors,
+                                                     DLManagedTensor* distances,
+                                                     cuvsFilter filter)
+{
+  return cuvs::core::translate_exceptions([=] {
+    RAFT_EXPECTS(num_partitions > 0, "num_partitions must be > 0");
+    RAFT_EXPECTS(indices != nullptr && queries != nullptr && partition_ids != nullptr &&
+                   neighbors != nullptr && distances != nullptr,
+                 "All pointer arguments must be non-null");
+
+    auto res_ptr = reinterpret_cast<raft::resources*>(res);
+    auto search_params = cuvs::neighbors::cagra::search_params();
+    convert_c_search_params(*params, &search_params);
+
+    // Only float32 is supported for multi-partition search.
+    RAFT_EXPECTS(
+      indices[0]->dtype.code == kDLFloat && indices[0]->dtype.bits == 32,
+      "Multi-partition search only supports float32 indices");
+
+    using T         = float;
+    using IdxT      = uint32_t;
+    using OutIdxT   = uint32_t;
+    using DistanceT = float;
+    using IndexT    = cuvs::neighbors::cagra::index<T, IdxT>;
+
+    std::vector<const IndexT*> idx_vec(num_partitions);
+    for (uint32_t i = 0; i < num_partitions; i++) {
+      RAFT_EXPECTS(indices[i] != nullptr && indices[i]->addr != 0,
+                   "Index at position %u is null or not built", i);
+      idx_vec[i] = reinterpret_cast<const IndexT*>(indices[i]->addr);
+    }
+
+    using queries_view_t = raft::device_matrix_view<const T, int64_t, raft::row_major>;
+    using pid_view_t     = raft::device_matrix_view<uint32_t, int64_t, raft::row_major>;
+    using nbrs_view_t    = raft::device_matrix_view<OutIdxT, int64_t, raft::row_major>;
+    using dist_view_t    = raft::device_matrix_view<DistanceT, int64_t, raft::row_major>;
+
+    auto queries_view       = cuvs::core::from_dlpack<queries_view_t>(queries);
+    auto partition_ids_view = cuvs::core::from_dlpack<pid_view_t>(partition_ids);
+    auto neighbors_view     = cuvs::core::from_dlpack<nbrs_view_t>(neighbors);
+    auto distances_view     = cuvs::core::from_dlpack<dist_view_t>(distances);
+
+    if (filter.type == NO_FILTER) {
+      cuvs::neighbors::cagra::search_multi_partition(*res_ptr,
+                                                     search_params,
+                                                     idx_vec,
+                                                     queries_view,
+                                                     partition_ids_view,
+                                                     neighbors_view,
+                                                     distances_view);
+    } else if (filter.type == MULTI_PARTITION_BITSET) {
+      auto* f = reinterpret_cast<cuvsMultiPartitionBitsetFilter*>(filter.addr);
+      RAFT_EXPECTS(f != nullptr, "MULTI_PARTITION_BITSET filter addr must be non-null");
+
+      using bitset_mdspan_t  = raft::device_vector_view<std::uint32_t, int64_t, raft::row_major>;
+      using offsets_mdspan_t = raft::device_vector_view<int64_t, int64_t, raft::row_major>;
+      auto bitset_mds  = cuvs::core::from_dlpack<bitset_mdspan_t>(f->combined_bitset);
+      auto offsets_mds = cuvs::core::from_dlpack<offsets_mdspan_t>(f->partition_offsets);
+
+      cuvs::core::bitset_view<std::uint32_t, int64_t> combined_bitset_view(
+        bitset_mds, f->total_bitset_bits);
+      cuvs::neighbors::filtering::multi_partition_bitset_filter<uint32_t, int64_t> mp_filter(
+        combined_bitset_view, offsets_mds.data_handle());
+
+      cuvs::neighbors::cagra::search_multi_partition(*res_ptr,
+                                                     search_params,
+                                                     idx_vec,
+                                                     queries_view,
+                                                     partition_ids_view,
+                                                     neighbors_view,
+                                                     distances_view,
+                                                     mp_filter);
+    } else {
+      RAFT_FAIL("Unsupported filter type for multi-partition search: %d", (int)filter.type);
+    }
+  });
+}
+
 extern "C" cuvsError_t cuvsCagraMerge(cuvsResources_t res,
                                       cuvsCagraIndexParams_t params,
                                       cuvsCagraIndex_t* indices,

@@ -0,0 +1,42 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/core/c_api.h>
+#include "../core/exceptions.hpp"
+#include <cuvs/selection/select_k.hpp>
+#include <dlpack/dlpack.h>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+extern "C" cuvsError_t cuvsSelectK(cuvsResources_t res,
+                                   DLManagedTensor* in_val,
+                                   DLManagedTensor* out_val,
+                                   DLManagedTensor* out_idx)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto* res_ptr = reinterpret_cast<raft::resources*>(res);
+
+    int64_t n = in_val->dl_tensor.shape[1];
+    int64_t k = out_val->dl_tensor.shape[1];
+
+    auto in_view = raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
+      static_cast<const float*>(in_val->dl_tensor.data), 1, n);
+
+    auto out_val_view = raft::make_device_matrix_view<float, int64_t, raft::row_major>(
+      static_cast<float*>(out_val->dl_tensor.data), 1, k);
+
+    auto out_idx_view = raft::make_device_matrix_view<int64_t, int64_t, raft::row_major>(
+      static_cast<int64_t*>(out_idx->dl_tensor.data), 1, k);
+
+    cuvs::selection::select_k(
+      *res_ptr,
+      in_view,
+      std::nullopt,  // implicit positions [0, n) as in_idx
+      out_val_view,
+      out_idx_view,
+      true);  // select_min = true (smallest distance = nearest neighbor)
+  });
+}