Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make CUDA-compiler specific CUB headers do nothing in other compilers #3378

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions cub/cub/detail/launcher/cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
# pragma system_header
#endif // no system header

#include <cub/util_device.cuh>
#if _CCCL_HAS_CUDA_COMPILER
caugonnet marked this conversation as resolved.
Show resolved Hide resolved

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
# include <cub/util_device.cuh>

# include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -56,3 +58,5 @@ struct TripleChevronFactory
} // namespace detail

CUB_NAMESPACE_END

#endif // _CCCL_HAS_CUDA_COMPILER
86 changes: 45 additions & 41 deletions cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,27 @@
# pragma system_header
#endif // no system header

#include <cub/detail/device_synchronize.cuh> // IWYU pragma: export
#include <cub/util_debug.cuh>
#include <cub/util_type.cuh>
#if _CCCL_HAS_CUDA_COMPILER

# include <cub/detail/device_synchronize.cuh> // IWYU pragma: export
# include <cub/util_debug.cuh>
# include <cub/util_type.cuh>
// for backward compatibility
#include <cub/util_temporary_storage.cuh>
# include <cub/util_temporary_storage.cuh>

#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
#include <cuda/std/type_traits>
#include <cuda/std/utility>
# include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
# include <cuda/std/type_traits>
# include <cuda/std/utility>

#include <array>
#include <atomic>
#include <cassert>
# include <array>
# include <atomic>
# include <cassert>

#include <nv/target>
# include <nv/target>

CUB_NAMESPACE_BEGIN

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# ifndef _CCCL_DOXYGEN_INVOKED // Do not document

namespace detail
{
Expand All @@ -90,7 +92,7 @@ template <typename T>
CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel()
{}

#endif // _CCCL_DOXYGEN_INVOKED
# endif // _CCCL_DOXYGEN_INVOKED

/**
* \brief Returns the current device or -1 if an error occurred.
Expand All @@ -105,13 +107,13 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
return device;
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# ifndef _CCCL_DOXYGEN_INVOKED // Do not document

//! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
//! to the saved device on destruction.
using SwitchDevice = ::cuda::__ensure_current_device;

#endif // _CCCL_DOXYGEN_INVOKED
# endif // _CCCL_DOXYGEN_INVOKED

/**
* \brief Returns the number of CUDA devices available or -1 if an error
Expand Down Expand Up @@ -153,7 +155,7 @@ CUB_RUNTIME_FUNCTION inline int DeviceCount()
return result;
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# ifndef _CCCL_DOXYGEN_INVOKED // Do not document
/**
* \brief Per-device cache for a CUDA attribute value; the attribute is queried
* and stored for each device upon construction.
Expand Down Expand Up @@ -268,7 +270,7 @@ public:
return entry.payload;
}
};
#endif // _CCCL_DOXYGEN_INVOKED
# endif // _CCCL_DOXYGEN_INVOKED

/**
* \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
Expand All @@ -288,11 +290,11 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
// in device code.
// <nv/target> may provide an abstraction for this eventually. For now,
// we have to keep this usage of __CUDA_ARCH__.
#if defined(_NVHPC_CUDA)
# define CUB_TEMP_GET_PTX __builtin_current_device_sm()
#else
# define CUB_TEMP_GET_PTX __CUDA_ARCH__
#endif
# if defined(_NVHPC_CUDA)
# define CUB_TEMP_GET_PTX __builtin_current_device_sm()
# else
# define CUB_TEMP_GET_PTX __CUDA_ARCH__
# endif

cudaError_t result = cudaSuccess;
NV_IF_TARGET(
Expand All @@ -312,7 +314,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)

ptx_version = CUB_TEMP_GET_PTX;));

#undef CUB_TEMP_GET_PTX
# undef CUB_TEMP_GET_PTX

return result;
}
Expand Down Expand Up @@ -470,29 +472,29 @@ namespace detail
*/
CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
{
#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC
# ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC

(void) stream;
return cudaSuccess;

#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:
# else // CUB_DETAIL_DEBUG_ENABLE_SYNC:

# define CUB_TMP_SYNC_AVAILABLE \
_CubLog("%s\n", "Synchronizing..."); \
return SyncStream(stream)
# define CUB_TMP_SYNC_AVAILABLE \
_CubLog("%s\n", "Synchronizing..."); \
return SyncStream(stream)

# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \
(void) stream; \
_CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
"device-side sync requires <sm_90, RDC, and CDPv1"); \
return cudaSuccess
# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \
(void) stream; \
_CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
"device-side sync requires <sm_90, RDC, and CDPv1"); \
return cudaSuccess

NV_IF_TARGET(NV_IS_HOST, (CUB_TMP_SYNC_AVAILABLE;), (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));

# undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
# undef CUB_TMP_SYNC_AVAILABLE
# undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
# undef CUB_TMP_SYNC_AVAILABLE

#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
# endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
}

/** \brief Gets whether the current device supports unified addressing */
Expand Down Expand Up @@ -652,19 +654,19 @@ struct ChainedPolicy
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int device_ptx_version, FunctorT& op)
{
// __CUDA_ARCH_LIST__ is only available from CTK 11.5 onwards
#ifdef __CUDA_ARCH_LIST__
# ifdef __CUDA_ARCH_LIST__
return runtime_to_compiletime<1, __CUDA_ARCH_LIST__>(device_ptx_version, op);
// NV_TARGET_SM_INTEGER_LIST is defined by NVHPC. The values need to be multiplied by 10 to match
// __CUDA_ARCH_LIST__. E.g. arch 860 from __CUDA_ARCH_LIST__ corresponds to arch 86 from NV_TARGET_SM_INTEGER_LIST.
#elif defined(NV_TARGET_SM_INTEGER_LIST)
# elif defined(NV_TARGET_SM_INTEGER_LIST)
return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
#else
# else
if (device_ptx_version < PolicyPtxVersion)
{
return PrevPolicyT::Invoke(device_ptx_version, op);
}
return op.template Invoke<PolicyT>();
#endif
# endif
}

private:
Expand Down Expand Up @@ -759,4 +761,6 @@ private:

CUB_NAMESPACE_END

#include <cub/detail/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory
# include <cub/detail/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory

#endif // _CCCL_HAS_CUDA_COMPILER
Loading