Skip to content

Commit

Permalink
Make thrust::transform use cub::DeviceTransform (NVIDIA#2389)
Browse files Browse the repository at this point in the history
* Add transform benchmark requiring a stable address
* Make thrust::transform use cub::DeviceTransform
* Introduces address stability detection and opt-in in libcu++
* Mark lambdas in Thrust BabelStream benchmark address oblivious
* Optimize prefetch cub::DeviceTransform for small problems

Fixes: NVIDIA#2263
  • Loading branch information
bernhardmgruber authored Nov 6, 2024
1 parent c358bde commit c97f2e3
Show file tree
Hide file tree
Showing 9 changed files with 326 additions and 32 deletions.
7 changes: 4 additions & 3 deletions cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -950,14 +950,15 @@ struct dispatch_t<RequiresStableAddress,
return config.error;
}

// choose items per thread to reach minimum bytes in flight
const int items_per_thread =
loaded_bytes_per_iter == 0
? +policy_t::items_per_thread_no_input
: ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);

// Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
const int items_per_thread_evenly_spread =
static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));
// but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16 or 2^20 elements
const int items_per_thread_evenly_spread = static_cast<int>(
::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy)));

const int items_per_thread_clamped = ::cuda::std::clamp(
items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
Expand Down
4 changes: 4 additions & 0 deletions docs/libcudacxx/extended_api/functional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Function wrapper
- Creates a forwarding call wrapper that proclaims return type
- libcu++ 1.9.0 / CCCL 2.0.0 / CUDA 11.8

* - ``cuda::proclaim_copyable_arguments``
- Creates a forwarding call wrapper that proclaims that arguments can be freely copied before an invocation of the wrapped callable
- CCCL 2.8.0

* - :ref:`cuda::get_device_address <libcudacxx-extended-api-functional-get-device-address>`
- Returns a valid address to a device object
- CCCL 2.8.0
65 changes: 65 additions & 0 deletions libcudacxx/include/cuda/__functional/address_stability.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
#define _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__utility/move.h>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

//! Trait telling whether a function object type F does not rely on the memory addresses of its arguments. The nested
//! value is true when the addresses of the arguments do not matter and arguments can be provided from arbitrary copies
//! of the respective sources. This trait can be specialized for custom function objects types.
//! @see proclaim_copyable_arguments
template <typename F, typename SFINAE = void>
struct proclaims_copyable_arguments : _CUDA_VSTD::false_type
{};

#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <typename F, typename... Args>
_CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copyable_arguments<F, Args...>::value;
#endif // !_CCCL_NO_VARIABLE_TEMPLATES

// Wrapper for a callable to mark it as permitting copied arguments
template <typename F>
struct __callable_permitting_copied_arguments : F
{
using F::operator();
};

template <typename F>
struct proclaims_copyable_arguments<__callable_permitting_copied_arguments<F>> : _CUDA_VSTD::true_type
{};

//! Creates a new function object from an existing one, which is marked as permitting its arguments to be copies of
//! whatever source they come from. This implies that the addresses of the arguments are irrelevant to the function
//! object.
//! @see proclaims_copyable_arguments
template <typename F>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto
proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments<F>
{
return __callable_permitting_copied_arguments<F>{_CUDA_VSTD::move(f)};
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___FUNCTIONAL_ADDRESS_STABILITY_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/functional
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__functional/address_stability.h>
#include <cuda/__functional/get_device_address.h>
#include <cuda/__functional/maximum.h>
#include <cuda/__functional/minimum.h>
Expand Down
72 changes: 59 additions & 13 deletions thrust/benchmarks/bench/transform/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include <thrust/transform.h>
#include <thrust/zip_function.h>

#include <cuda/__functional/address_stability.h>

#include <nvbench_helper.cuh>

template <class InT, class OutT>
Expand Down Expand Up @@ -106,7 +108,7 @@ constexpr auto startC = 3; // BabelStream: 0.1
constexpr auto startScalar = 4; // BabelStream: 0.4

using element_types = nvbench::type_list<std::int8_t, std::int16_t, float, double, __int128>;
auto array_size_powers = std::vector<std::int64_t>{25};
auto array_size_powers = std::vector<std::int64_t>{25}; // BabelStream uses 2^25, H200 can fit 2^31

template <typename T>
static void mul(nvbench::state& state, nvbench::type_list<T>)
Expand All @@ -121,9 +123,10 @@ static void mul(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(c.begin(), c.end(), b.begin(), [=] __device__ __host__(const T& ci) {
return ci * scalar;
});
thrust::transform(
c.begin(), c.end(), b.begin(), cuda::proclaim_copyable_arguments([=] __device__ __host__(const T& ci) {
return ci * scalar;
}));
});
}

Expand All @@ -145,9 +148,14 @@ static void add(nvbench::state& state, nvbench::type_list<T>)
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), [] __device__ __host__(const T& ai, const T& bi) {
return ai + bi;
});
thrust::transform(
a.begin(),
a.end(),
b.begin(),
c.begin(),
cuda::proclaim_copyable_arguments([] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
}));
});
}

Expand All @@ -170,9 +178,14 @@ static void triad(nvbench::state& state, nvbench::type_list<T>)

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(b.begin(), b.end(), c.begin(), a.begin(), [=] __device__ __host__(const T& bi, const T& ci) {
return bi + scalar * ci;
});
thrust::transform(
b.begin(),
b.end(),
c.begin(),
a.begin(),
cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
}));
});
}

Expand All @@ -199,14 +212,47 @@ static void nstream(nvbench::state& state, nvbench::type_list<T>)
thrust::make_zip_iterator(a.begin(), b.begin(), c.begin()),
thrust::make_zip_iterator(a.end(), b.end(), c.end()),
a.begin(),
thrust::make_zip_function([=] __device__ __host__(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
}));

thrust::make_zip_function(
cuda::proclaim_copyable_arguments([=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
})));
});
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
.set_name("nstream")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", array_size_powers);

// variation of nstream requiring a stable parameter address because it recovers the element index
template <typename T>
static void nstream_stable(nvbench::state& state, nvbench::type_list<T>)
{
const auto n = static_cast<std::size_t>(state.get_int64("Elements"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

const T* a_start = thrust::raw_pointer_cast(a.data());
const T* b_start = thrust::raw_pointer_cast(b.data());
const T* c_start = thrust::raw_pointer_cast(c.data());

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch&) {
const T scalar = startScalar;
thrust::transform(a.begin(), a.end(), a.begin(), [=] _CCCL_DEVICE(const T& ai) {
const auto i = &ai - a_start;
return ai + b_start[i] + scalar * c_start[i];
});
});
}

NVBENCH_BENCH_TYPES(nstream_stable, NVBENCH_TYPE_AXES(element_types))
.set_name("nstream_stable")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", array_size_powers);
} // namespace babelstream
24 changes: 24 additions & 0 deletions thrust/testing/address_stability.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include <cuda/__functional/address_stability.h>

#include <unittest/unittest.h>

struct my_plus
{
_CCCL_HOST_DEVICE auto operator()(int a, int b) const -> int
{
return a + b;
}
};

void TestAddressStability()
{
using ::cuda::proclaim_copyable_arguments;
using ::cuda::proclaims_copyable_arguments;

static_assert(!proclaims_copyable_arguments<thrust::plus<int>>::value, "");
static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(thrust::plus<int>{}))>::value, "");

static_assert(!proclaims_copyable_arguments<my_plus>::value, "");
static_assert(proclaims_copyable_arguments<decltype(proclaim_copyable_arguments(my_plus{}))>::value, "");
}
DECLARE_UNITTEST(TestAddressStability);
77 changes: 77 additions & 0 deletions thrust/testing/cuda/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,80 @@ void TestTransformBinaryCudaStreams()
cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestTransformBinaryCudaStreams);

struct sum_five
{
_CCCL_HOST_DEVICE auto
operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double
{
return a + b + c + d + e;
}
};

// The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017
#ifndef _CCCL_COMPILER_MSVC_2017
// we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the
// zip_function is actually called (and not unwrapped)
THRUST_NAMESPACE_BEGIN
template <>
class zip_function<sum_five>
{
public:
_CCCL_HOST_DEVICE zip_function(sum_five func)
: func(func)
{}

_CCCL_HOST_DEVICE sum_five& underlying_function() const
{
return func;
}

template <typename Tuple>
_CCCL_HOST_DEVICE auto
operator()(Tuple&& t) const -> decltype(detail::zip_detail::apply(std::declval<sum_five>(), THRUST_FWD(t)))
{
// not calling func, so we would get a wrong result if we were called
return {};
}

private:
mutable sum_five func;
};
THRUST_NAMESPACE_END

// test that the cuda_cub backend of Thrust unwraps zip_iterators/zip_functions into their input streams
void TestTransformZipIteratorUnwrapping()
{
constexpr int num_items = 100;
thrust::device_vector<std::int8_t> a(num_items, 1);
thrust::device_vector<std::int16_t> b(num_items, 2);
thrust::device_vector<std::int32_t> c(num_items, 3);
thrust::device_vector<std::int64_t> d(num_items, 4);
thrust::device_vector<float> e(num_items, 5);

thrust::device_vector<double> result(num_items);
// SECTION("once") // TODO(bgruber): enable sections when we migrate to Catch2
{
const auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin());
thrust::transform(z, z + num_items, result.begin(), thrust::make_zip_function(sum_five{}));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
// SECTION("trice")
{
const auto z = thrust::make_zip_iterator(
thrust::make_zip_iterator(thrust::make_zip_iterator(a.begin(), b.begin(), c.begin(), d.begin(), e.begin())));
thrust::transform(z,
z + num_items,
result.begin(),
thrust::make_zip_function(thrust::make_zip_function(thrust::make_zip_function(sum_five{}))));

// compute reference and verify
thrust::device_vector<double> reference(num_items, 1 + 2 + 3 + 4 + 5);
ASSERT_EQUAL(reference, result);
}
}
DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping);
#endif // !_CCCL_COMPILER_MSVC_2017
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{
// Need a forward declaration here to work around a cyclic include, since "cuda/detail/transform.h" includes this header
template <class Derived, class InputIt, class OutputIt, class TransformOp>
OutputIt THRUST_FUNCTION
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

namespace __copy
{
Expand Down
Loading

0 comments on commit c97f2e3

Please sign in to comment.