NVIDIA · sleeepyjack · Jun 26, 2024 · Jun 14, 2024 · Jun 15, 2024 · Jun 15, 2024
@@ -962,6 +962,106 @@ class open_addressing_ref_impl {
     }
   }
 
+  /**
+   * @brief Executes a callback on every element in the container with key equivalent to the probe
+   * key.
+   *
+   * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
+   * `key` to the callback.
+   *
+   * @note Probing stops at the first empty slot encountered. If the container does not allow
+   * duplicates, probing also stops right after the first matching element.
+   *
+   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam Callback Callback functor or lambda
+   *
+   * @param key The key to search for
+   * @param callback Function to call on every element found
+   */
+  template <class ProbeKey, class Callback>
+  __device__ void for_each(ProbeKey const& key, Callback&& callback) const noexcept
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+    auto probing_iter = this->probing_scheme_(key, this->storage_ref_.window_extent());
+
+    while (true) {
+      // TODO atomic_ref::load if insert operator is present
+      auto const window_slots = this->storage_ref_[*probing_iter];
+
+      for (auto i = 0; i < window_size; ++i) {
+        switch (
+          this->predicate_.operator()<is_insert::NO>(key, this->extract_key(window_slots[i]))) {
+          case detail::equal_result::EMPTY: {
+            // An empty slot terminates the probing sequence
+            return;
+          }
+          case detail::equal_result::EQUAL: {
+            // `callback` may be invoked several times, hence it is deliberately not forwarded
+            callback(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]});
+            if constexpr (allows_duplicates) {
+              // Keep scanning: further equivalent keys may follow
+              continue;
+            } else {
+              return;
+            }
+          }
+          default: continue;
+        }
+      }
+      ++probing_iter;
+    }
+  }
+
+  /**
+   * @brief Executes a callback on every element in the container with key equivalent to the probe
+   * key.
+   *
+   * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
+   * `key` to the callback.
+   *
+   * @note This function uses cooperative group semantics, meaning that any thread may call the
+   * callback if it finds a matching element. Each thread calls the callback once for every
+   * matching slot it finds in the window it inspects, so several threads of the same group (and a
+   * single thread, if its window holds several matches) may invoke the callback.
+   *
+   * @note Probing stops as soon as any thread of the group encounters an empty slot. If the
+   * container does not allow duplicates, it also stops as soon as any thread finds a match.
+   *
+   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam Callback Callback functor or lambda
+   *
+   * @param group The Cooperative Group used to perform this operation
+   * @param key The key to search for
+   * @param callback Function to call on every element found
+   */
+  template <class ProbeKey, class Callback>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+                           ProbeKey const& key,
+                           Callback callback) const noexcept
+  {
+    auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());
+
+    while (true) {
+      auto const window_slots = storage_ref_[*probing_iter];
+
+      // Per-thread results of scanning this thread's window; combined across the group below
+      [[maybe_unused]] bool found_equal = false;
+      bool found_empty                  = false;
+
+      // Visit every slot of the window: stopping at the first match would silently drop
+      // additional equivalent keys stored in the same window
+      for (auto i = 0; i < window_size; ++i) {
+        auto const res =
+          this->predicate_.operator()<is_insert::NO>(key, this->extract_key(window_slots[i]));
+
+        if (res == detail::equal_result::EMPTY) {
+          // An empty slot terminates the probing sequence
+          found_empty = true;
+          break;
+        }
+
+        if (res == detail::equal_result::EQUAL) {
+          // Found a match for the probe key, thus call the callback with an iterator to the entry
+          callback(const_iterator{&(*(storage_ref_.data() + *probing_iter))[i]});
+          found_equal = true;
+          if constexpr (not allows_duplicates) { break; }
+        }
+      }
+
+      // The group votes below are reached by every thread, outside the divergent scan
+      if constexpr (not allows_duplicates) {
+        if (group.any(found_equal)) { return; }
+      }
+
+      // Found an empty slot, meaning that no further matches can follow in the probing sequence
+      if (group.any(found_empty)) { return; }
+
+      ++probing_iter;
+    }
+  }
+
   /**
    * @brief Compares the content of the address `address` (old value) with the `expected` value and,
    * only if they are the same, sets the content of `address` to `desired`.

@@ -446,6 +446,77 @@ class operator_impl<
   }
 };
 
+template <typename Key,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::for_each_tag,
+  static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type = static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type =
+    static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type       = typename base_type::key_type;
+  using value_type     = typename base_type::value_type;
+  using iterator       = typename base_type::iterator;
+  using const_iterator = typename base_type::const_iterator;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+ public:
+  /**
+   * @brief Invokes `callback` for each element of the container whose key is equivalent to the
+   * probe key.
+   *
+   * @note The callback receives an un-incrementable input iterator referring to the matching
+   * element.
+   *
+   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam Callback Callback functor or lambda
+   *
+   * @param key The key to search for
+   * @param callback Function to call on every element found
+   */
+  template <class ProbeKey, class Callback>
+  __device__ void for_each(ProbeKey const& key, Callback callback) const noexcept
+  {
+    // CRTP: view `this` as the full ref type and delegate to its implementation object
+    static_cast<ref_type const&>(*this).impl_.for_each(key, callback);
+  }
+
+  /**
+   * @brief Invokes `callback` for each element of the container whose key is equivalent to the
+   * probe key, using a Cooperative Group to perform the lookup.
+   *
+   * @note The callback receives an un-incrementable input iterator referring to the matching
+   * element.
+   *
+   * @note This function uses cooperative group semantics, meaning that any thread may call the
+   * callback if it finds a matching element. If multiple elements are found within the same group,
+   * each thread with a match will call the callback with its associated element.
+   *
+   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam Callback Callback functor or lambda
+   *
+   * @param group The Cooperative Group used to perform this operation
+   * @param key The key to search for
+   * @param callback Function to call on every element found
+   */
+  template <class ProbeKey, class Callback>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+                           ProbeKey const& key,
+                           Callback callback) const noexcept
+  {
+    // CRTP: view `this` as the full ref type and delegate to its implementation object
+    static_cast<ref_type const&>(*this).impl_.for_each(group, key, callback);
+  }
+};
+
 template <typename Key,
           cuda::thread_scope Scope,
           typename KeyEqual,

@@ -62,6 +62,12 @@ struct count_tag {
 struct find_tag {
 } inline constexpr find;  ///< `cuco::find` operator
 
+/**
+ * @brief `for_each` operator tag
+ *
+ * Empty tag type; the `inline constexpr` instance `for_each` declared together with it is the
+ * value passed to select this operator (e.g. `set.ref(cuco::for_each)`).
+ */
+struct for_each_tag {
+} inline constexpr for_each;  ///< `cuco::for_each` operator
+
 }  // namespace op
 }  // namespace cuco
 

@@ -98,7 +98,8 @@ ConfigureTest(STATIC_MULTISET_TEST
     static_multiset/count_test.cu
     static_multiset/custom_count_test.cu
     static_multiset/find_test.cu
-    static_multiset/insert_test.cu)
+    static_multiset/insert_test.cu
+    static_multiset/for_each_test.cu)
 
 ###################################################################################################
 # - static_multimap tests -------------------------------------------------------------------------

@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <test_utils.hpp>
+
+#include <cuco/detail/utility/cuda.hpp>
+#include <cuco/static_multiset.cuh>
+
+#include <cuda/atomic>
+#include <cuda/functional>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+#include <cstddef>
+
+/**
+ * @brief Checks the scalar (non-CG) `for_each` of `ref` against an expected multiplicity.
+ *
+ * For each of the `n` keys starting at `first`, counts how often the callback passed to
+ * `ref.for_each` is invoked with an element equal to the key (per `ref.key_eq()`), and increments
+ * `error_counter` by one whenever that count differs from `multiplicity`.
+ *
+ * @param ref Container ref providing `for_each` and `key_eq`; must have `cg_size == 1`
+ * @param first Iterator to the first key to check
+ * @param n Number of keys to check
+ * @param multiplicity Expected number of matches per key
+ * @param error_counter Atomic counter receiving the number of mismatching keys
+ */
+template <class Ref, class InputIt, class AtomicErrorCounter>
+CUCO_KERNEL void for_each_check_scalar(Ref ref,
+                                       InputIt first,
+                                       std::size_t n,
+                                       std::size_t multiplicity,
+                                       AtomicErrorCounter* error_counter)
+{
+  static_assert(Ref::cg_size == 1, "Scalar test must have cg_size==1");
+
+  auto const stride = cuco::detail::grid_stride();
+
+  for (auto tid = cuco::detail::global_thread_id(); tid < n; tid += stride) {
+    auto const& probe_key = *(first + tid);
+    std::size_t num_found = 0;
+
+    // Count every visited element that compares equal to the probe key
+    ref.for_each(probe_key, [&] __device__(auto const slot_it) {
+      if (ref.key_eq()(probe_key, *slot_it)) { ++num_found; }
+    });
+
+    if (num_found != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); }
+  }
+}
+
+/**
+ * @brief Checks the cooperative-group `for_each` of `ref` against an expected multiplicity.
+ *
+ * Each tile of `Ref::cg_size` threads processes one key at a time: every thread counts the
+ * matches its own callback invocations observe, the per-thread counts are summed across the tile,
+ * and thread rank 0 of the tile increments `error_counter` by one if the sum differs from
+ * `multiplicity`.
+ *
+ * @note Assumes the block size is a multiple of `Ref::cg_size`, so that all threads of a tile
+ * compute the same key index — TODO confirm against the launch configuration.
+ *
+ * @param ref Container ref providing the CG `for_each` and `key_eq`
+ * @param first Iterator to the first key to check
+ * @param n Number of keys to check
+ * @param multiplicity Expected number of matches per key
+ * @param error_counter Atomic counter receiving the number of mismatching keys
+ */
+template <class Ref, class InputIt, class AtomicErrorCounter>
+CUCO_KERNEL void for_each_check_cooperative(Ref ref,
+                                            InputIt first,
+                                            std::size_t n,
+                                            std::size_t multiplicity,
+                                            AtomicErrorCounter* error_counter)
+{
+  // The tile does not depend on the loop variable, so it is created once up front
+  auto const tile =
+    cooperative_groups::tiled_partition<Ref::cg_size>(cooperative_groups::this_thread_block());
+
+  auto const loop_stride = cuco::detail::grid_stride() / Ref::cg_size;
+  auto idx               = cuco::detail::global_thread_id() / Ref::cg_size;
+
+  while (idx < n) {
+    auto const& key            = *(first + idx);
+    std::size_t thread_matches = 0;
+    ref.for_each(tile, key, [&] __device__(auto const it) {
+      if (ref.key_eq()(key, *it)) { thread_matches++; }
+    });
+    // Sum the per-thread match counts over the whole tile
+    auto const tile_matches =
+      cooperative_groups::reduce(tile, thread_matches, cooperative_groups::plus<std::size_t>());
+    // Report each mismatching key exactly once per tile
+    if (tile_matches != multiplicity and tile.thread_rank() == 0) {
+      error_counter->fetch_add(1, cuda::memory_order_relaxed);
+    }
+    idx += loop_stride;
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_multiset for_each tests",
+  "",
+  ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
+  (int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_t num_unique_keys{400};  // number of distinct keys that are queried below
+  constexpr size_t key_multiplicity{5};   // expected number of matches per distinct key
+  constexpr size_t num_keys{num_unique_keys * key_multiplicity};  // total number of inserted keys
+
+  using probe = std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
+                                   cuco::linear_probing<CGSize, cuco::default_hash_function<Key>>,
+                                   cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>>;
+
+  auto set =  // sized by num_keys, with -1 as the empty-key sentinel
+    cuco::static_multiset{num_keys, cuco::empty_key<Key>{-1}, {}, probe{}, {}, cuco::storage<2>{}};
+
+  auto unique_keys_begin  = thrust::counting_iterator<Key>(0);
+  auto gen_duplicate_keys = cuda::proclaim_return_type<Key>(  // maps k to k % num_unique_keys
+    [] __device__(auto const& k) { return static_cast<Key>(k % num_unique_keys); });
+  auto keys_begin = thrust::make_transform_iterator(unique_keys_begin, gen_duplicate_keys);
+
+  set.insert(keys_begin, keys_begin + num_keys);  // the generated sequence has num_keys entries
+
+  using error_counter_type = cuda::atomic<std::size_t, cuda::thread_scope_system>;
+  error_counter_type* error_counter;
+  CUCO_CUDA_TRY(cudaMallocHost(&error_counter, sizeof(error_counter_type)));
+  new (error_counter) error_counter_type{0};  // construct the counter in the pinned buffer, at 0
+
+  auto const grid_size  = cuco::detail::grid_size(num_unique_keys, CGSize);
+  auto const block_size = cuco::detail::default_block_size();
+
+  // test scalar for_each (only instantiated when CGSize == 1; the kernel static_asserts on it)
+  if constexpr (CGSize == 1) {
+    for_each_check_scalar<<<grid_size, block_size>>>(
+      set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());  // wait for the kernel before reading the counter
+    REQUIRE(error_counter->load() == 0);
+    error_counter->store(0);  // reset before the CG check reuses the same counter
+  }
+
+  // test CG for_each (runs for every CGSize, including tiles of size 1)
+  for_each_check_cooperative<<<grid_size, block_size>>>(
+    set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());  // wait for the kernel before reading the counter
+  REQUIRE(error_counter->load() == 0);
+
+  CUCO_CUDA_TRY(cudaFreeHost(error_counter));
+}