Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,106 @@ class open_addressing_ref_impl {
}
}

/**
* @brief Executes a callback on every element in the container with key equivalent to the probe
* key.
*
* @note Passes an un-incrementable input iterator to the element whose key is equivalent to
* `key` to the callback.
*
* @tparam ProbeKey Input type which is convertible to 'key_type'
+ @tparam Callback Callback functor or lambda
*
* @param key The key to search for
* @param callback Function to call on every element found
*/
template <class ProbeKey, class Callback>
__device__ void for_each(ProbeKey const& key, Callback callback) const noexcept
Copy link
Copy Markdown
Collaborator Author

@sleeepyjack sleeepyjack Jun 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__device__ void for_each(ProbeKey const& key, Callback callback) const noexcept
__device__ void for_each(ProbeKey const& key, Callback&& callback) const noexcept

Unsure if this needs to be a mutable or even universal reference instead. Let's say we define a count functor as such:

struct count_functor {
  std::size_t thread_count = 0;

  // counts the number of matching elements for this thread
  template <class InputIt>
  __device__ void operator()(InputIt) { thread_count++; }
};

And then call

//...
auto thread_counter = count_functor{};
set.for_each(key, thread_counter);
auto const key_count = thread_counter.count;
//...

Then we want the functor to be taken as a mutable reference, right?

Copy link
Copy Markdown
Member

@PointKernel PointKernel Jun 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pass by value is preferred.

The above example is a good example of a bad callback, especially in a parallel context

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it best practice to pass a callback by-value? I'd have to skim some stackoverflow/cppreference pages to get familiar with the topic. With pass-by-value we lose the ability of giving the callback an internal state that can hold the result of the operation. How would we solve the above example with a callback passed by-value? Pass a pointer to thread_count to the callback?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pass a pointer to thread_count to the callback?

Yes

I see a callable defining the operations to be performed on the output instead of being the output itself.

{
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
auto probing_iter = this->probing_scheme_(key, this->storage_ref_.window_extent());

while (true) {
// TODO atomic_ref::load if insert operator is present
auto const window_slots = this->storage_ref_[*probing_iter];

for (auto i = 0; i < window_size; ++i) {
switch (
this->predicate_.operator()<is_insert::NO>(key, this->extract_key(window_slots[i]))) {
case detail::equal_result::EMPTY: {
return;
}
case detail::equal_result::EQUAL: {
callback(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]});
if constexpr (allows_duplicates) {
continue;
} else {
return;
}
}
default: continue;
}
}
++probing_iter;
}
}

/**
* @brief Executes a callback on every element in the container with key equivalent to the probe
* key.
*
* @note Passes an un-incrementable input iterator to the element whose key is equivalent to
* `key` to the callback.
*
* @note This function uses cooperative group semantics, meaning that any thread may call the
* callback if it finds a matching element. If multiple elements are found within the same group,
* each thread with a match will call the callback with its associated element.
*
* @tparam ProbeKey Input type which is convertible to 'key_type'
+ @tparam Callback Callback functor or lambda
*
* @param group The Cooperative Group used to perform this operation
* @param key The key to search for
* @param callback Function to call on every element found
*/
template <class ProbeKey, class Callback>
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why the unit test is failing. Seems like the logic in this function is flawed.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I found the problem. #509 should fix the issue.

ProbeKey const& key,
Callback callback) const noexcept
{
auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent());

while (true) {
auto const window_slots = storage_ref_[*probing_iter];

auto const [state, intra_window_index] = [&]() {
auto res = detail::equal_result::UNEQUAL;
for (auto i = 0; i < window_size; ++i) {
res = this->predicate_.operator()<is_insert::NO>(key, this->extract_key(window_slots[i]));
if (res != detail::equal_result::UNEQUAL) { return window_probing_results{res, i}; }
}
// returns dummy index `-1` for UNEQUAL
return window_probing_results{res, -1};
}();

// Find a match for the probe key, thus call the callback with an iterator to the entry
auto const equal = state == detail::equal_result::EQUAL;
if (equal) {
callback(const_iterator{&(*(storage_ref_.data() + *probing_iter))[intra_window_index]});
}

if constexpr (not allows_duplicates) {
if (group.any(equal)) { return; }
}

// Find an empty slot, meaning that the probe key isn't present in the container
auto const empty = state == detail::equal_result::EMPTY;
if (group.any(empty)) { return; }

++probing_iter;
}
}

/**
* @brief Compares the content of the address `address` (old value) with the `expected` value and,
* only if they are the same, sets the content of `address` to `desired`.
Expand Down
71 changes: 71 additions & 0 deletions include/cuco/detail/static_multiset/static_multiset_ref.inl
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,77 @@ class operator_impl<
}
};

template <typename Key,
cuda::thread_scope Scope,
typename KeyEqual,
typename ProbingScheme,
typename StorageRef,
typename... Operators>
class operator_impl<
op::for_each_tag,
static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
using base_type = static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef>;
using ref_type =
static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
using key_type = typename base_type::key_type;
using value_type = typename base_type::value_type;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;

static constexpr auto cg_size = base_type::cg_size;
static constexpr auto window_size = base_type::window_size;

public:
/**
* @brief Executes a callback on every element in the container with key equivalent to the probe
* key.
*
* @note Passes an un-incrementable input iterator to the element whose key is equivalent to
* `key` to the callback.
*
* @tparam ProbeKey Input type which is convertible to 'key_type'
+ @tparam Callback Callback functor or lambda
*
* @param key The key to search for
* @param callback Function to call on every element found
*/
template <class ProbeKey, class Callback>
__device__ void for_each(ProbeKey const& key, Callback callback) const noexcept
{
// CRTP: cast `this` to the actual ref type
auto const& ref_ = static_cast<ref_type const&>(*this);
ref_.impl_.for_each(key, callback);
}

/**
* @brief Executes a callback on every element in the container with key equivalent to the probe
* key.
*
* @note Passes an un-incrementable input iterator to the element whose key is equivalent to
* `key` to the callback.
*
* @note This function uses cooperative group semantics, meaning that any thread may call the
* callback if it finds a matching element. If multiple elements are found within the same group,
* each thread with a match will call the callback with its associated element.
*
* @tparam ProbeKey Input type which is convertible to 'key_type'
+ @tparam Callback Callback functor or lambda
*
* @param group The Cooperative Group used to perform this operation
* @param key The key to search for
* @param callback Function to call on every element found
*/
template <class ProbeKey, class Callback>
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
ProbeKey const& key,
Callback callback) const noexcept
{
// CRTP: cast `this` to the actual ref type
auto const& ref_ = static_cast<ref_type const&>(*this);
ref_.impl_.for_each(group, key, callback);
}
};

template <typename Key,
cuda::thread_scope Scope,
typename KeyEqual,
Expand Down
6 changes: 6 additions & 0 deletions include/cuco/operator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ struct count_tag {
struct find_tag {
} inline constexpr find; ///< `cuco::find` operator

/**
* @brief `for_each` operator tag
*/
struct for_each_tag {
} inline constexpr for_each; ///< `cuco::for_each` operator
Comment on lines +68 to +69
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see for_each as an internal utility as opposed to an actual hash table operator. Need to think more on this.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my standpoint I would treat it as an extension to the STL API that is more suitable for the GPU. Having a "cooperative iterator" instead, which would be closer to the spirit of modern C++ has its drawbacks. For example, how do we ensure users only increment the iterator with the same CG? for_each solves this problem by making the probing part internal. We should even be able to redefine any lookup function (find, count, retrieve) that relies on probing with for_each, giving us a proper abstraction layer for probing.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On a side note I personally find this funtional approach, i.e., "for each found key do X" very appealing. Historic evidence that it is indeed useful comes from warpcore, where many downstream applications (mostly genomics stuff) implemented their custom lookup operations through for_each functors.


} // namespace op
} // namespace cuco

Expand Down
3 changes: 2 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ ConfigureTest(STATIC_MULTISET_TEST
static_multiset/count_test.cu
static_multiset/custom_count_test.cu
static_multiset/find_test.cu
static_multiset/insert_test.cu)
static_multiset/insert_test.cu
static_multiset/for_each_test.cu)

###################################################################################################
# - static_multimap tests -------------------------------------------------------------------------
Expand Down
139 changes: 139 additions & 0 deletions tests/static_multiset/for_each_test.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <test_utils.hpp>

#include <cuco/detail/utility/cuda.hpp>
#include <cuco/static_multiset.cuh>

#include <cuda/atomic>
#include <cuda/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

#include <catch2/catch_template_test_macros.hpp>

#include <cstddef>

template <class Ref, class InputIt, class AtomicErrorCounter>
CUCO_KERNEL void for_each_check_scalar(Ref ref,
InputIt first,
std::size_t n,
std::size_t multiplicity,
AtomicErrorCounter* error_counter)
{
static_assert(Ref::cg_size == 1, "Scalar test must have cg_size==1");
auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();

while (idx < n) {
auto const& key = *(first + idx);
std::size_t matches = 0;
ref.for_each(key, [&] __device__(auto const it) {
if (ref.key_eq()(key, *it)) { matches++; }
});
if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); }
idx += loop_stride;
}
}

template <class Ref, class InputIt, class AtomicErrorCounter>
CUCO_KERNEL void for_each_check_cooperative(Ref ref,
InputIt first,
std::size_t n,
std::size_t multiplicity,
AtomicErrorCounter* error_counter)
{
auto const loop_stride = cuco::detail::grid_stride() / Ref::cg_size;
auto idx = cuco::detail::global_thread_id() / Ref::cg_size;
;

while (idx < n) {
auto const tile =
cooperative_groups::tiled_partition<Ref::cg_size>(cooperative_groups::this_thread_block());
auto const& key = *(first + idx);
std::size_t thread_matches = 0;
ref.for_each(tile, key, [&] __device__(auto const it) {
if (ref.key_eq()(key, *it)) { thread_matches++; }
});
auto const tile_matches =
cooperative_groups::reduce(tile, thread_matches, cooperative_groups::plus<std::size_t>());
if (tile_matches != multiplicity and tile.thread_rank() == 0) {
error_counter->fetch_add(1, cuda::memory_order_relaxed);
}
idx += loop_stride;
}
}

TEMPLATE_TEST_CASE_SIG(
"static_multiset for_each tests",
"",
((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
(int32_t, cuco::test::probe_sequence::double_hashing, 1),
(int32_t, cuco::test::probe_sequence::double_hashing, 2),
(int64_t, cuco::test::probe_sequence::double_hashing, 1),
(int64_t, cuco::test::probe_sequence::double_hashing, 2),
(int32_t, cuco::test::probe_sequence::linear_probing, 1),
(int32_t, cuco::test::probe_sequence::linear_probing, 2),
(int64_t, cuco::test::probe_sequence::linear_probing, 1),
(int64_t, cuco::test::probe_sequence::linear_probing, 2))
{
constexpr size_t num_unique_keys{400};
constexpr size_t key_multiplicity{5};
constexpr size_t num_keys{num_unique_keys * key_multiplicity};

using probe = std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
cuco::linear_probing<CGSize, cuco::default_hash_function<Key>>,
cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>>;

auto set =
cuco::static_multiset{num_keys, cuco::empty_key<Key>{-1}, {}, probe{}, {}, cuco::storage<2>{}};

auto unique_keys_begin = thrust::counting_iterator<Key>(0);
auto gen_duplicate_keys = cuda::proclaim_return_type<Key>(
[] __device__(auto const& k) { return static_cast<Key>(k % num_unique_keys); });
auto keys_begin = thrust::make_transform_iterator(unique_keys_begin, gen_duplicate_keys);

set.insert(keys_begin, keys_begin + num_keys);

using error_counter_type = cuda::atomic<std::size_t, cuda::thread_scope_system>;
error_counter_type* error_counter;
CUCO_CUDA_TRY(cudaMallocHost(&error_counter, sizeof(error_counter_type)));
new (error_counter) error_counter_type{0};

auto const grid_size = cuco::detail::grid_size(num_unique_keys, CGSize);
auto const block_size = cuco::detail::default_block_size();

// test scalar for_each
if constexpr (CGSize == 1) {
for_each_check_scalar<<<grid_size, block_size>>>(
set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
CUCO_CUDA_TRY(cudaDeviceSynchronize());
REQUIRE(error_counter->load() == 0);
error_counter->store(0);
}

// test CG for_each
for_each_check_cooperative<<<grid_size, block_size>>>(
set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
CUCO_CUDA_TRY(cudaDeviceSynchronize());
REQUIRE(error_counter->load() == 0);

CUCO_CUDA_TRY(cudaFreeHost(error_counter));
}