From e961e07842e1df8b23c87e5f4ed5d60e340d739b Mon Sep 17 00:00:00 2001
From: fbusato
Date: Fri, 10 Jan 2025 20:24:47 +0000
Subject: [PATCH 01/14] add deprecation warning

---
 cub/cub/util_ptx.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index aa522d9576e..33983d59abd 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -359,6 +359,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y,
 /**
  * \brief Returns the warp lane ID of the calling thread
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId()
 {
   unsigned int ret;
@@ -370,6 +371,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId()
 /**
  * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not
  * correspond to a zero-based ranking within the thread block.
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId()
 {
   unsigned int ret;
@@ -409,6 +411,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id)
 /**
  * \brief Returns the warp lane mask of all lanes less than the calling thread
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt()
 {
   unsigned int ret;
@@ -419,6 +422,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt()
 /**
  * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe()
 {
   unsigned int ret;
@@ -429,6 +433,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe()
 /**
  * \brief Returns the warp lane mask of all lanes greater than the calling thread
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt()
 {
   unsigned int ret;
@@ -439,6 +444,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt()
 /**
  * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
  */
+CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead")
 _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe()
 {
   unsigned int ret;

From b3d23248e5ff57ea5948ebfefa007ee166a4e093 Mon Sep 17 00:00:00 2001
From: fbusato
Date: Fri, 10 Jan 2025 20:33:31 +0000
Subject: [PATCH 02/14] try to remove LogicShiftLeft and LogicShiftRight

---
 cub/cub/agent/agent_batch_memcpy.cuh | 18 ++++++++----------
 cub/cub/util_ptx.cuh                 | 25 -------------------------
 2 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh
index f9d5e8b16a1..813c0dae6c8 100644
--- a/cub/cub/agent/agent_batch_memcpy.cuh
+++ b/cub/cub/agent/agent_batch_memcpy.cuh
@@ -424,12 +424,11 @@ public:
     for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
     {
       // In case the bit-offset of the counter at <target_offset> is larger than the bit range of the
-      // current unit, the bit_shift amount will be larger than the bits provided by this unit. As
-      // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
-      // we use the PTX instruction `shr` to make sure behaviour is well-defined.
-      // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
+      // current unit, the bit_shift amount will be larger than the bits provided by this unit.
+      // C++'s bit-shift has undefined behaviour if the shift amount exceeds the operand width.
+      // Because bit_shift is a run-time value, it is translated into SASS `shr`, whose behaviour is well-defined.
       const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
-      val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK;
+      val |= (data[i] >> bit_shift) & ITEM_MASK;
     }
     return val;
   }
@@ -442,12 +441,11 @@ public:
     for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
     {
       // In case the bit-offset of the counter at <target_offset> is larger than the bit range of the
-      // current unit, the bit_shift amount will be larger than the bits provided by this unit. As
-      // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
-      // we use the PTX instruction `shl` to make sure behaviour is well-defined.
-      // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
+      // current unit, the bit_shift amount will be larger than the bits provided by this unit.
+      // C++'s bit-shift has undefined behaviour if the shift amount exceeds the operand width.
+      // Because bit_shift is a run-time value, it is translated into SASS `shl`, whose behaviour is well-defined.
       const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
-      data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK;
+      data[i] += (value << bit_shift) & UNIT_MASK;
     }
   }

diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index 33983d59abd..9e9af137c31 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -52,31 +52,6 @@ CUB_NAMESPACE_BEGIN
 * Inlined PTX intrinsics
 ******************************************************************************/

-namespace detail
-{
-/**
- * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p
- * num_bits is larger than 32 bits, @p num_bits is clamped to 32.
- */
-_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits)
-{
-  uint32_t ret{};
-  asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
-  return ret;
-}
-
-/**
- * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p
- * num_bits is larger than 32 bits, @p num_bits is clamped to 32.
- */
-_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits)
-{
-  uint32_t ret{};
-  asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
-  return ret;
-}
-} // namespace detail
-
 /**
  * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend.
*/ From 284ff7e3d97bb5615d6ee2bd4b19fd8ff1f5e45a Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 10 Jan 2025 23:16:04 +0000 Subject: [PATCH 03/14] replace cub special registers usage --- cub/cub/agent/agent_radix_sort_histogram.cuh | 4 +++- cub/cub/agent/agent_radix_sort_onesweep.cuh | 3 ++- cub/cub/agent/agent_radix_sort_upsweep.cuh | 8 +++++--- cub/cub/agent/agent_rle.cuh | 7 ++++--- cub/cub/block/block_exchange.cuh | 4 +++- cub/cub/block/block_radix_rank.cuh | 9 +++++---- .../specializations/block_reduce_warp_reductions.cuh | 4 +++- .../block/specializations/block_scan_warp_scans.cuh | 4 +++- cub/cub/warp/specializations/warp_exchange_shfl.cuh | 6 ++++-- cub/cub/warp/specializations/warp_exchange_smem.cuh | 6 ++++-- cub/cub/warp/specializations/warp_reduce_shfl.cuh | 5 +++-- cub/cub/warp/specializations/warp_reduce_smem.cuh | 10 ++++++---- cub/cub/warp/specializations/warp_scan_shfl.cuh | 6 ++++-- cub/cub/warp/specializations/warp_scan_smem.cuh | 6 ++++-- cub/cub/warp/warp_load.cuh | 8 ++++++-- cub/cub/warp/warp_merge_sort.cuh | 7 +++++-- cub/cub/warp/warp_scan.cuh | 3 ++- cub/cub/warp/warp_store.cuh | 8 ++++++-- 18 files changed, 72 insertions(+), 36 deletions(-) diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 2785f732450..f7709f2f799 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -50,6 +50,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN template @@ -199,7 +201,7 @@ struct AgentRadixSortHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { - int part = LaneId() % NUM_PARTS; + int part = ::cuda::ptx::get_sreg_tid_x() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index a78ee66c7b2..cef96f50c57 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -669,7 +670,7 @@ struct AgentRadixSortOnesweep , current_bit(current_bit) , num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_tid_x()) , decomposer(decomposer) { // initialization diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index e91e32c5bd3..4f1841eca99 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -52,6 +52,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -298,7 +300,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) @@ -419,7 +421,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); // Place unpacked digit counters in shared memory #pragma unroll @@ -499,7 +501,7 @@ 
struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); // Place unpacked digit counters in shared memory #pragma unroll diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 2495d2f5f7a..63e4ccd2306 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -465,7 +466,7 @@ struct AgentRle { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_tid_x(); LengthOffsetPair identity; identity.key = 0; @@ -551,7 +552,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_tid_x(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -608,7 +609,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_tid_x(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index bdc2a3dc932..fc601898f6f 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -179,7 +181,7 @@ private: // TODO(bgruber): can we use signed int here? Only these variables are unsigned: unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); - unsigned int lane_id = LaneId(); + unsigned int lane_id = ::cuda::ptx::get_sreg_tid_x(); unsigned int warp_id = WARPS == 1 ? 
0 : linear_tid / WARP_THREADS; unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 92605b5168d..1961ec3ec81 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -716,7 +717,7 @@ public: volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); + uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) @@ -1070,7 +1071,7 @@ struct BlockRadixRankMatchEarlyCounts int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster @@ -1099,7 +1100,7 @@ struct BlockRadixRankMatchEarlyCounts detail::warp_in_block_matcher_t::match_any(bin, warp); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster @@ -1135,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_tid_x()) {} }; diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 4ee2b307bcf..4e43519c00d 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,6 +48,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -121,7 +123,7 @@ struct BlockReduceWarpReductions : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_tid_x()) {} /** diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index 851a71cbe7b..e69acb23ddd 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -127,7 +129,7 @@ struct BlockScanWarpScans : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_tid_x()) {} //--------------------------------------------------------------------- diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index 5abfa7cdd2f..26fe531a07f 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -273,8 +275,8 @@ public: WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) - : lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 
0 : (LaneId() / LOGICAL_WARP_THREADS)) + : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index aabb9e291e9..2a906e130bb 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -88,8 +90,8 @@ public: explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 3e0db152123..adde777ed70 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -155,7 +156,7 @@ struct WarpReduceShfl /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) - : lane_id(static_cast(LaneId())) + : lane_id(static_cast(::cuda::ptx::get_sreg_tid_x())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -708,7 +709,7 @@ struct WarpReduceShfl } // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 87b38db2aa3..fb3885b2793 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -123,8 +125,8 @@ struct WarpReduceSmem /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) - , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) + , member_mask(WarpMask(::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -230,12 +232,12 @@ struct WarpReduceSmem } // Keep bits above the current thread. 
- warp_flags &= LaneMaskGt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + warp_flags >>= (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index c3952b96b4f..e323836e837 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,6 +48,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -116,7 +118,7 @@ struct WarpScanShfl /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) - : lane_id(LaneId()) + : lane_id(::cuda::ptx::get_sreg_tid_x()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -543,7 +545,7 @@ struct WarpScanShfl unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); + ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 90bdfbf361a..2093c1cdc43 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -113,10 +115,10 @@ struct WarpScanSmem : temp_storage(temp_storage.Alias()) , - lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) , - member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + member_mask(WarpMask(::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index ac5c700b958..e368d266428 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -438,14 +440,16 @@ public: //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) {} //! 
@} end member group diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index 40e29322c1f..c54d328b5d7 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -41,6 +41,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -151,8 +152,10 @@ public: WarpMergeSort() = delete; _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) - : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : BlockMergeSortStrategyT( + temp_storage, + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 0e0668709b0..75dac108321 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -212,7 +213,7 @@ public: //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) {} //! @} end member group diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index bb99bc5965e..80cde3e66a8 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -378,14 +380,16 @@ public: //! memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) {} //! @} end member group From e7af3851936ea3bc34d04e06ce88fcafd8f0aea2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 10 Jan 2025 23:34:10 +0000 Subject: [PATCH 04/14] deprecate unused functions --- cub/cub/util_ptx.cuh | 8 ++++++++ cub/test/test_util.h | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 9e9af137c31..328a7181b6e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -65,6 +65,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -125,6 +126,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in /** * \brief Bitfield insert. 
Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { @@ -134,6 +136,7 @@ BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, u /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); @@ -167,6 +170,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y * \endcode * */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; @@ -179,6 +183,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned /** * Sync-threads barrier. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); @@ -287,6 +292,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) { float d; @@ -297,6 +303,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
*/ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) { float d; @@ -317,6 +324,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() /** * \brief Abort execution and generate an interrupt to the host CPU */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); diff --git a/cub/test/test_util.h b/cub/test/test_util.h index e61cd7cd6e2..9dac8616bce 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -614,7 +614,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + asm volatile("trap;"); break; case UNIFORM: value = 2; @@ -656,7 +656,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool& value case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + asm volatile("trap;"); break; case UNIFORM: value = true; @@ -697,7 +697,7 @@ InitValue(GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + asm volatile("trap;"); )); // clang-format on } From c1084d43831976391f1a2b0db583088a6d441e63 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 10 Jan 2025 23:34:20 +0000 Subject: [PATCH 05/14] fix laneid() --- cub/cub/agent/agent_radix_sort_histogram.cuh | 2 +- cub/cub/agent/agent_radix_sort_onesweep.cuh | 2 +- cub/cub/agent/agent_radix_sort_upsweep.cuh | 6 +++--- cub/cub/agent/agent_rle.cuh | 6 +++--- cub/cub/block/block_exchange.cuh | 2 +- cub/cub/block/block_radix_rank.cuh | 2 +- .../block/specializations/block_reduce_warp_reductions.cuh | 2 +- cub/cub/block/specializations/block_scan_warp_scans.cuh | 2 +- cub/cub/warp/specializations/warp_exchange_shfl.cuh | 4 ++-- cub/cub/warp/specializations/warp_exchange_smem.cuh | 4 ++-- cub/cub/warp/specializations/warp_reduce_shfl.cuh | 2 +- cub/cub/warp/specializations/warp_reduce_smem.cuh | 6 +++--- cub/cub/warp/specializations/warp_scan_shfl.cuh | 2 +- cub/cub/warp/specializations/warp_scan_smem.cuh | 4 ++-- cub/cub/warp/warp_load.cuh | 4 ++-- cub/cub/warp/warp_merge_sort.cuh | 4 ++-- cub/cub/warp/warp_scan.cuh | 2 +- cub/cub/warp/warp_store.cuh | 4 ++-- 18 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index f7709f2f799..853a56f7fbc 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -201,7 +201,7 @@ struct AgentRadixSortHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { - int part = ::cuda::ptx::get_sreg_tid_x() % NUM_PARTS; + int part = ::cuda::ptx::get_sreg_laneid() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index cef96f50c57..fe10d50515f 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -670,7 +670,7 @@ struct 
AgentRadixSortOnesweep , current_bit(current_bit) , num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) - , lane(::cuda::ptx::get_sreg_tid_x()) + , lane(::cuda::ptx::get_sreg_laneid()) , decomposer(decomposer) { // initialization diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index 4f1841eca99..4dcb6232ce3 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -300,7 +300,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) @@ -421,7 +421,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll @@ -501,7 +501,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = ::cuda::ptx::get_sreg_tid_x(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 63e4ccd2306..5721094f1a9 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -466,7 +466,7 @@ struct AgentRle { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = ::cuda::ptx::get_sreg_tid_x(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); LengthOffsetPair identity; identity.key = 0; @@ -552,7 +552,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = ::cuda::ptx::get_sreg_tid_x(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -609,7 +609,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = ::cuda::ptx::get_sreg_tid_x(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index fc601898f6f..c22918ce9d5 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -181,7 +181,7 @@ private: // TODO(bgruber): can we use signed int here? Only these variables are unsigned: unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); - unsigned int lane_id = ::cuda::ptx::get_sreg_tid_x(); + unsigned int lane_id = ::cuda::ptx::get_sreg_laneid(); unsigned int warp_id = WARPS == 1 ? 
0 : linear_tid / WARP_THREADS; unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 1961ec3ec81..d62ff160585 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -1136,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) - , lane(::cuda::ptx::get_sreg_tid_x()) + , lane(::cuda::ptx::get_sreg_laneid()) {} }; diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 4e43519c00d..623dc43f027 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -123,7 +123,7 @@ struct BlockReduceWarpReductions : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(::cuda::ptx::get_sreg_tid_x()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} /** diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index e69acb23ddd..c9c23e43f05 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -129,7 +129,7 @@ struct BlockScanWarpScans : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(::cuda::ptx::get_sreg_tid_x()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} //--------------------------------------------------------------------- diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index 26fe531a07f..f874f961caa 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -275,8 +275,8 @@ public: WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) - : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) + : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index 2a906e130bb..01f1d2c873e 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -90,8 +90,8 @@ public: explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 
0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index adde777ed70..eb99c84019f 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -156,7 +156,7 @@ struct WarpReduceShfl /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) - : lane_id(static_cast(::cuda::ptx::get_sreg_tid_x())) + : lane_id(static_cast(::cuda::ptx::get_sreg_laneid())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index fb3885b2793..6913556ca2a 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -125,8 +125,8 @@ struct WarpReduceSmem /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) - , member_mask(WarpMask(::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) + , member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -237,7 +237,7 @@ struct WarpReduceSmem // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { - warp_flags >>= (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + warp_flags >>= (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index e323836e837..81998642811 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -118,7 +118,7 @@ struct WarpScanShfl /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) - : lane_id(::cuda::ptx::get_sreg_tid_x()) + : lane_id(::cuda::ptx::get_sreg_laneid()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 2093c1cdc43..b25b2a28952 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -115,10 +115,10 @@ struct WarpScanSmem : temp_storage(temp_storage.Alias()) , - lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) + lane_id(IS_ARCH_WARP ? 
::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) , - member_mask(WarpMask(::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) + member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index e368d266428..3f11129c35a 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -441,7 +441,7 @@ public: _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) , linear_tid( - IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as @@ -449,7 +449,7 @@ public: _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid( - IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index c54d328b5d7..cda00d253f2 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -154,8 +154,8 @@ public: _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) : BlockMergeSortStrategyT( temp_storage, - IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_tid_x() / LOGICAL_WARP_THREADS)) + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 75dac108321..cec992e699c 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -213,7 +213,7 @@ public: //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : ::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) {} //! @} end member group diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index 80cde3e66a8..f0a9929e24f 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -381,7 +381,7 @@ public: _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) , linear_tid( - IS_ARCH_WARP ? ::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as @@ -389,7 +389,7 @@ public: _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid( - IS_ARCH_WARP ? 
::cuda::ptx::get_sreg_tid_x() : (::cuda::ptx::get_sreg_tid_x() % LOGICAL_WARP_THREADS)) + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group From a88ef76043b4184f19d77feb8dd8a9cf11c22514 Mon Sep 17 00:00:00 2001 From: fbusato Date: Sat, 11 Jan 2025 00:37:22 +0000 Subject: [PATCH 06/14] fix lanemask_gt() --- cub/cub/warp/specializations/warp_reduce_shfl.cuh | 2 +- cub/cub/warp/specializations/warp_reduce_smem.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index eb99c84019f..c018f891cde 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -709,7 +709,7 @@ struct WarpReduceShfl } // Mask out the bits below the current thread - warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 6913556ca2a..230dfcea3cc 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -232,7 +232,7 @@ struct WarpReduceSmem } // Keep bits above the current thread. - warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) From 68a115fe3befafd1e43130a558010ae72b604b0f Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 20:32:06 +0000 Subject: [PATCH 07/14] use cuda::std::terminate() --- cub/test/test_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 9dac8616bce..c06d803ecb1 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -614,7 +614,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - asm volatile("trap;"); + cuda::std::terminate(); break; case UNIFORM: value = 2; @@ -656,7 +656,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool& value case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - asm volatile("trap;"); + cuda::std::terminate(); break; case UNIFORM: value = true; @@ -697,7 +697,7 @@ InitValue(GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - asm volatile("trap;"); + cuda::std::terminate(); )); // clang-format on } From 43b2b588986e3bec251cbe6ff235e0d83f59d237 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 20:32:23 +0000 Subject: [PATCH 08/14] typo fix --- cub/cub/warp/specializations/warp_reduce_smem.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 230dfcea3cc..6913556ca2a 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -232,7 +232,7 @@ struct WarpReduceSmem } // Keep bits above the current 
thread. - warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) From 0af70fdbe1e8992d38c517bcf3a00bb9d5b2e243 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 20:32:44 +0000 Subject: [PATCH 09/14] replace SHR_ADD with shift-add --- cub/cub/block/block_exchange.cuh | 22 +++++++++---------- cub/cub/util_ptx.cuh | 3 ++- .../specializations/warp_exchange_smem.cuh | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index c22918ce9d5..2d4029c3b08 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -628,7 +628,7 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } @@ -641,7 +641,7 @@ private: int item_offset = linear_tid * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -681,7 +681,7 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } @@ -697,7 +697,7 @@ private: int item_offset = lane_id * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } temp_items[i] = temp_storage.buff[item_offset]; } @@ -735,7 +735,7 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } @@ -748,7 +748,7 @@ private: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -789,7 +789,7 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } @@ -1136,7 +1136,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (ranks[i] >= 0) { @@ -1152,7 +1152,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -1195,7 +1195,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = 
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (is_valid[i]) { @@ -1211,7 +1211,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 328a7181b6e..6ca57622cde 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -55,6 +55,7 @@ CUB_NAMESPACE_BEGIN /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -324,7 +325,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() /** * \brief Abort execution and generate an interrupt to the host CPU */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") +CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index 01f1d2c873e..d9435c4bc5d 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -149,7 +149,7 @@ public: { if (INSERT_PADDING) { - ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + ranks[ITEM] = (ranks[ITEM] >> LOG_SMEM_BANKS) + ranks[ITEM]; } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; @@ -164,7 +164,7 @@ public: if (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[ITEM] = temp_storage.items_shared[item_offset]; From a2fff971cea1ad0b7c5757ceeac70dd9ca37f7f8 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 23:29:19 +0000 Subject: [PATCH 10/14] replace voting instructions --- cub/cub/agent/agent_radix_sort_onesweep.cuh | 6 ++--- cub/cub/agent/agent_rle.cuh | 2 +- cub/cub/agent/agent_sub_warp_merge_sort.cuh | 8 +++--- cub/cub/agent/single_pass_scan_operators.cuh | 8 +++--- cub/cub/block/block_exchange.cuh | 10 +++---- cub/cub/block/block_radix_rank.cuh | 12 ++++----- .../specializations/block_reduce_raking.cuh | 2 +- cub/cub/util_ptx.cuh | 4 +++ .../specializations/warp_exchange_smem.cuh | 6 ++--- .../warp/specializations/warp_reduce_shfl.cuh | 2 +- .../warp/specializations/warp_reduce_smem.cuh | 14 +++++----- .../warp/specializations/warp_scan_shfl.cuh | 2 +- .../warp/specializations/warp_scan_smem.cuh | 26 +++++++++---------- cub/cub/warp/warp_merge_sort.cuh | 2 +- cub/test/catch2_test_warp_merge_sort.cu | 4 +-- 15 files changed, 56 insertions(+), 52 deletions(-) diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index fe10d50515f..a4ad4b852ff 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -280,7 +280,7 @@ struct AgentRadixSortOnesweep } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; - want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); + want_mask = __ballot_sync(want_mask, (value_j & LOOKBACK_GLOBAL_MASK) == 0); if (value_j & LOOKBACK_GLOBAL_MASK) { break; @@ 
-484,7 +484,7 @@ struct AgentRadixSortOnesweep { d_keys_out[global_idx] = Twiddle::Out(key, decomposer); } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -502,7 +502,7 @@ struct AgentRadixSortOnesweep { d_values_out[global_idx] = value; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 5721094f1a9..151940d18fa 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -625,7 +625,7 @@ struct AgentRle WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]) .ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp); - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]) .ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp); diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index f07b2173cdc..e3e7334fe1b 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -233,23 +233,23 @@ public: KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size); } } diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 71469a0055a..bd6551b8f8d 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -733,7 +733,7 @@ public: tile_descriptor = reinterpret_cast(alias); } - while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) + while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))) { delay_or_prevent_hoisting(); TxnWord alias = LoadStatus(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); @@ -918,7 +918,7 @@ struct ScanTileState delay(); status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); - } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID))); if (status == StatusWord(SCAN_TILE_PARTIAL)) { @@ -1145,7 +1145,7 @@ struct ReduceByKeyScanTileState TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))); status = tile_descriptor.status; value.value = tile_descriptor.value; @@ -1268,7 +1268,7 @@ struct TilePrefixCallbackOp exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is 
known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))) { predecessor_idx -= CUB_PTX_WARP_THREADS; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 2d4029c3b08..a41048eb94d 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -324,7 +324,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -363,7 +363,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -395,7 +395,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -545,7 +545,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -589,7 +589,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index d62ff160585..22fe117a1f9 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -741,7 +741,7 @@ public: DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); @@ -756,7 +756,7 @@ public: } // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); @@ -978,7 +978,7 @@ struct BlockRadixRankMatchEarlyCounts match_masks[bin] = 0; } } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; @@ -992,7 +992,7 @@ struct BlockRadixRankMatchEarlyCounts // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; @@ -1067,7 +1067,7 @@ struct BlockRadixRankMatchEarlyCounts ::cuda::std::uint32_t bin = Digit(keys[u]); int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; @@ -1082,7 +1082,7 @@ struct BlockRadixRankMatchEarlyCounts { *p_match_mask = 0; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 7c1db2c9050..3d1f7b72707 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -228,7 +228,7 @@ struct 
BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= CUB_PTX_WARP_THREADS, "RAKING_THREADS must be <= warp size."); unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); - WARP_SYNC(mask); + __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) .template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op); diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 6ca57622cde..0d0efec864b 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -217,6 +217,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) /** * Warp barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) { __syncwarp(member_mask); @@ -225,6 +226,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __any_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) { return __any_sync(member_mask, predicate); @@ -233,6 +235,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_m /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __all_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) { return __all_sync(member_mask, predicate); @@ -241,6 +244,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_m /** * Warp ballot */ +CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) { return __ballot_sync(member_mask, predicate); diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index d9435c4bc5d..35b688f813c 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -104,7 +104,7 @@ public: const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -122,7 +122,7 @@ public: const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -155,7 +155,7 @@ public: temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index c018f891cde..55df1f3beee 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -700,7 +700,7 @@ struct WarpReduceShfl _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. 
- int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); // Convert to tail-segmented if (HEAD_SEGMENTED) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 6913556ca2a..d7884e26753 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -161,7 +161,7 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) @@ -170,7 +170,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } @@ -224,7 +224,7 @@ struct WarpReduceSmem SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); if (!HEAD_SEGMENTED) { @@ -257,7 +257,7 @@ struct WarpReduceSmem // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) @@ -266,7 +266,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); } return input; @@ -313,12 +313,12 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index 81998642811..22d6b4b6f0a 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -542,7 +542,7 @@ struct WarpScanShfl KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + unsigned int ballot = __ballot_sync(member_mask, (pred_key != inclusive_output.key)); // Mask away all lanes greater than ours ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index b25b2a28952..336416b69d9 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -134,7 +134,7 @@ struct WarpScanSmem // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) @@ -142,7 +142,7 @@ struct WarpScanSmem T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); ScanStep(partial, scan_op, Int2Type()); } @@ -173,7 +173,7 @@ struct WarpScanSmem T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) 
identity); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Iterate scan steps output = input; @@ -228,7 +228,7 @@ struct WarpScanSmem ThreadStore(temp_storage, (CellT) input); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return (T) ThreadLoad(temp_storage); } @@ -278,11 +278,11 @@ struct WarpScanSmem // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); } //--------------------------------------------------------------------- @@ -309,7 +309,7 @@ struct WarpScanSmem // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } @@ -336,7 +336,7 @@ struct WarpScanSmem inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) @@ -366,7 +366,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); @@ -387,7 +387,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; @@ -410,11 +410,11 @@ struct WarpScanSmem // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); @@ -422,7 +422,7 @@ struct WarpScanSmem // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index cda00d253f2..de3d311ae59 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -167,7 +167,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - WARP_SYNC(member_mask); + __syncwarp(member_mask); } friend BlockMergeSortStrategyT; diff --git a/cub/test/catch2_test_warp_merge_sort.cu b/cub/test/catch2_test_warp_merge_sort.cu index 7b245ebba33..fa4f986ad64 100644 --- a/cub/test/catch2_test_warp_merge_sort.cu +++ b/cub/test/catch2_test_warp_merge_sort.cu @@ -88,7 +88,7 @@ __global__ void warp_merge_sort_kernel(T* in, T* out, SegmentSizeItT segment_siz const int idx = thread_offset + item; thread_data[item] = in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + 
__syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, thread_data, valid_items, oob_default); @@ -153,7 +153,7 @@ __global__ void warp_merge_sort_kernel( keys[item] = keys_in[idx]; values[item] = values_in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, keys, values, valid_items, oob_default); From 73dabf6320e4369b31a4c9d18487d6314dbc4e05 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 23:47:31 +0000 Subject: [PATCH 11/14] replace cta sync instructions --- cub/cub/agent/agent_adjacent_difference.cuh | 4 +-- cub/cub/agent/agent_batch_memcpy.cuh | 14 ++++---- cub/cub/agent/agent_histogram.cuh | 8 ++--- cub/cub/agent/agent_merge.cuh | 11 +++--- cub/cub/agent/agent_merge_sort.cuh | 20 +++++------ cub/cub/agent/agent_radix_sort_downsweep.cuh | 34 +++++++++---------- cub/cub/agent/agent_radix_sort_histogram.cuh | 8 ++--- cub/cub/agent/agent_radix_sort_onesweep.cuh | 14 ++++---- cub/cub/agent/agent_radix_sort_upsweep.cuh | 12 +++---- cub/cub/agent/agent_reduce_by_key.cuh | 8 ++--- cub/cub/agent/agent_rle.cuh | 12 +++---- cub/cub/agent/agent_scan.cuh | 8 ++--- cub/cub/agent/agent_scan_by_key.cuh | 6 ++-- cub/cub/agent/agent_segment_fixup.cuh | 2 +- cub/cub/agent/agent_segmented_radix_sort.cuh | 14 ++++---- cub/cub/agent/agent_select_if.cuh | 20 +++++------ cub/cub/agent/agent_spmv_orig.cuh | 18 +++++----- cub/cub/agent/agent_three_way_partition.cuh | 10 +++--- cub/cub/agent/agent_unique_by_key.cuh | 24 ++++++------- cub/cub/block/block_adjacent_difference.cuh | 14 ++++---- cub/cub/block/block_discontinuity.cuh | 16 ++++----- cub/cub/block/block_exchange.cuh | 32 ++++++++--------- cub/cub/block/block_histogram.cuh | 2 +- cub/cub/block/block_merge_sort.cuh | 2 +- cub/cub/block/block_radix_rank.cuh | 16 ++++----- cub/cub/block/block_radix_sort.cuh | 12 +++---- cub/cub/block/block_run_length_decode.cuh | 4 +-- cub/cub/block/block_scan.cuh | 32 ++++++++--------- cub/cub/block/block_shuffle.cuh | 8 ++--- cub/cub/block/block_store.cuh | 6 ++-- .../specializations/block_histogram_sort.cuh | 6 ++-- .../specializations/block_reduce_raking.cuh | 2 +- .../block_reduce_raking_commutative_only.cuh | 4 +-- .../block_reduce_warp_reductions.cuh | 2 +- .../specializations/block_scan_raking.cuh | 32 ++++++++--------- .../specializations/block_scan_warp_scans.cuh | 6 ++-- .../device/dispatch/dispatch_batch_memcpy.cuh | 4 +-- .../device/dispatch/dispatch_radix_sort.cuh | 16 ++++----- .../dispatch/dispatch_segmented_sort.cuh | 4 +-- cub/cub/grid/grid_barrier.cuh | 8 ++--- cub/cub/util_ptx.cuh | 3 ++ cub/cub/util_vsmem.cuh | 2 +- .../catch2_test_block_run_length_decode.cu | 6 ++-- thrust/thrust/system/cuda/detail/core/util.h | 2 +- 44 files changed, 246 insertions(+), 242 deletions(-) diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index 37e1a013193..b023de3753a 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -138,7 +138,7 @@ struct AgentDifference BlockLoad(temp_storage.load).Load(load_it + tile_base, input); } - CTA_SYNC(); + __syncthreads(); if (ReadLeft) { @@ -186,7 +186,7 @@ struct AgentDifference } } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index 813c0dae6c8..b20b734c389 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ 
-832,7 +832,7 @@ private: BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); } - CTA_SYNC(); + __syncthreads(); // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; @@ -994,7 +994,7 @@ private: // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused // with next iteration - CTA_SYNC(); + __syncthreads(); } } @@ -1024,7 +1024,7 @@ public: } // Ensure we can repurpose the BlockLoad's temporary storage - CTA_SYNC(); + __syncthreads(); // Count how many buffers fall into each size-class VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); @@ -1035,7 +1035,7 @@ public: .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids - CTA_SYNC(); + __syncthreads(); // Factor in the per-size-class counts / offsets // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset @@ -1075,7 +1075,7 @@ public: // Ensure the prefix callback has finished using its temporary storage and that it can be reused // in the next stage - CTA_SYNC(); + __syncthreads(); // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their // size @@ -1083,7 +1083,7 @@ public: // Ensure all buffers have been partitioned by their size class AND // ensure that blev_buffer_offset has been written to shared memory - CTA_SYNC(); + __syncthreads(); // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; @@ -1102,7 +1102,7 @@ public: tile_id); // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers - CTA_SYNC(); + __syncthreads(); // Copy warp-level buffers BatchMemcpyWLEVBuffers( diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index e454dc837b1..98a16b36c68 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -320,7 +320,7 @@ struct AgentHistogram } // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters @@ -350,7 +350,7 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); // Apply privatized bin counts to output bin counts #pragma unroll @@ -690,7 +690,7 @@ struct AgentHistogram ConsumeTile(tile_offset, TILE_SAMPLES); } - CTA_SYNC(); + __syncthreads(); // Get next tile if (threadIdx.x == 0) @@ -698,7 +698,7 @@ struct AgentHistogram temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; } - CTA_SYNC(); + __syncthreads(); tile_idx = temp_storage.tile_idx; } diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index ae457bb954d..e1649b812ff 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -132,7 +132,7 @@ struct agent_t gmem_to_reg( keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); reg_to_shared(&storage.keys_shared[0], keys_loc); - CTA_SYNC(); + __syncthreads(); // use binary search in shared memory to find merge path for each thread.
// we can use int type here, because the number of items in shared memory is limited @@ -158,7 +158,7 @@ struct agent_t keys_loc, indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (IsFullTile) @@ -182,9 +182,10 @@ struct agent_t item_type items_loc[items_per_thread]; gmem_to_reg( items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); - CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it + __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write + // to it reg_to_shared(&storage.items_shared[0], items_loc); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem #pragma unroll @@ -192,7 +193,7 @@ struct agent_t { items_loc[i] = storage.items_shared[indices[i]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem if (IsFullTile) diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index dd8b559f2c4..9c0c54fbeb2 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -187,7 +187,7 @@ struct AgentBlockSort BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } - CTA_SYNC(); + __syncthreads(); } KeyT keys_local[ITEMS_PER_THREAD]; @@ -200,7 +200,7 @@ struct AgentBlockSort BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local); } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) @@ -212,7 +212,7 @@ struct AgentBlockSort BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } - CTA_SYNC(); + __syncthreads(); if (ping) { @@ -227,7 +227,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -252,7 +252,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -583,7 +583,7 @@ struct AgentMerge } } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // use binary search in shared memory @@ -616,7 +616,7 @@ struct AgentMerge indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (ping) @@ -650,11 +650,11 @@ struct AgentMerge _CCCL_IF_CONSTEXPR (!KEYS_ONLY) #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { - CTA_SYNC(); + __syncthreads(); detail::reg_to_shared(&storage.items_shared[0], items_local); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem // @@ -664,7 +664,7 @@ struct AgentMerge items_local[item] = storage.items_shared[indices[item]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem // diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index 43562c9a2b5..23fcd653fe1 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -277,7 +277,7 @@ struct AgentRadixSortDownsweep temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -305,7 +305,7 @@ struct AgentRadixSortDownsweep int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { - CTA_SYNC(); + __syncthreads(); ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias(); @@ -315,7 +315,7 @@ struct AgentRadixSortDownsweep exchange_values[ranks[ITEM]] = values[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < 
ITEMS_PER_THREAD; ++ITEM) @@ -342,7 +342,7 @@ struct AgentRadixSortDownsweep { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - CTA_SYNC(); + __syncthreads(); } /** @@ -362,7 +362,7 @@ struct AgentRadixSortDownsweep BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item); - CTA_SYNC(); + __syncthreads(); } /** @@ -409,7 +409,7 @@ struct AgentRadixSortDownsweep { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values); - CTA_SYNC(); + __syncthreads(); } /** @@ -428,7 +428,7 @@ struct AgentRadixSortDownsweep BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); - CTA_SYNC(); + __syncthreads(); } /** @@ -474,7 +474,7 @@ struct AgentRadixSortDownsweep { ValueT values[ITEMS_PER_THREAD]; - CTA_SYNC(); + __syncthreads(); LoadValues(values, block_offset, valid_items, Int2Type(), Int2Type()); @@ -520,7 +520,7 @@ struct AgentRadixSortDownsweep int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); // Share exclusive digit prefix #pragma unroll @@ -534,7 +534,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; @@ -562,7 +562,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Update global scatter base offsets for each digit #pragma unroll @@ -577,7 +577,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); @@ -602,7 +602,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; @@ -616,7 +616,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } @@ -670,7 +670,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -719,7 +719,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -744,7 +744,7 @@ struct AgentRadixSortDownsweep ProcessTile(block_offset); block_offset += TILE_ITEMS; - CTA_SYNC(); + __syncthreads(); } // Clean up last partial tile with guarded-I/O diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 853a56f7fbc..0eb5a9415ea 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -174,7 +174,7 @@ struct AgentRadixSortHistogram } } } - CTA_SYNC(); + __syncthreads(); } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) @@ -249,7 +249,7 @@ struct AgentRadixSortHistogram { // Reset the counters. Init(); - CTA_SYNC(); + __syncthreads(); // Process the tiles. 
OffsetT portion_offset = portion * MAX_PORTION_SIZE; @@ -261,11 +261,11 @@ struct AgentRadixSortHistogram LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); // Accumulate the result in global memory. AccumulateGlobalHistograms(); - CTA_SYNC(); + __syncthreads(); } } diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index a4ad4b852ff..c93ce277e67 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -350,7 +350,7 @@ struct AgentRadixSortOnesweep short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } - short_circuit = CTA_SYNC_OR(short_circuit); + short_circuit = __syncthreads_or(short_circuit); if (!short_circuit) { return; @@ -378,7 +378,7 @@ struct AgentRadixSortOnesweep LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); - CTA_SYNC(); + __syncthreads(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; @@ -601,10 +601,10 @@ struct AgentRadixSortOnesweep LoadValues(block_idx * TILE_ITEMS, values); // scatter values - CTA_SYNC(); + __syncthreads(); ScatterValuesShared(values, ranks); - CTA_SYNC(); + __syncthreads(); ScatterValuesGlobal(digits); } @@ -626,7 +626,7 @@ struct AgentRadixSortOnesweep .RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory - CTA_SYNC(); + __syncthreads(); ScatterKeysShared(keys, ranks); // compute global offsets @@ -635,7 +635,7 @@ struct AgentRadixSortOnesweep UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory - CTA_SYNC(); + __syncthreads(); ScatterKeysGlobal(); // scatter values if necessary @@ -678,7 +678,7 @@ struct AgentRadixSortOnesweep { s.block_idx = atomicAdd(d_ctrs, 1); } - CTA_SYNC(); + __syncthreads(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index 4dcb6232ce3..d9418d2bc56 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -333,7 +333,7 @@ struct AgentRadixSortUpsweep LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting - CTA_SYNC(); + __syncthreads(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); @@ -387,12 +387,12 @@ struct AgentRadixSortUpsweep block_offset += TILE_ITEMS; } - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); - CTA_SYNC(); + __syncthreads(); // Reset composite counters in lanes ResetDigitCounters(); @@ -408,7 +408,7 @@ struct AgentRadixSortUpsweep // Process partial tile if necessary ProcessPartialTile(block_offset, block_end); - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers UnpackDigitCounts(); @@ -442,7 +442,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions @@ -522,7 +522,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions #pragma unroll diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 735993723d8..f48c9567a24 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -426,7 +426,7 @@ struct AgentReduceByKey OffsetT num_tile_segments, 
OffsetT num_tile_segments_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter pairs #pragma unroll @@ -438,7 +438,7 @@ struct AgentReduceByKey } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -539,7 +539,7 @@ struct AgentReduceByKey tile_predecessor = (tile_idx == 0) ? keys[0] : d_keys_in[tile_offset - 1]; } - CTA_SYNC(); + __syncthreads(); // Load values if (IS_LAST_TILE) @@ -551,7 +551,7 @@ struct AgentReduceByKey BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 151940d18fa..266fbfcae88 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -502,7 +502,7 @@ struct AgentRle temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; } - CTA_SYNC(); + __syncthreads(); // Accumulate total selected and the warp-wide prefix @@ -532,7 +532,7 @@ struct AgentRle // Ensure all threads have read warp aggregates before temp_storage is repurposed in the // subsequent scatter stage - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -565,7 +565,7 @@ struct AgentRle #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { - CTA_SYNC(); + __syncthreads(); if (warp_id == SLICE) { @@ -763,7 +763,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -849,7 +849,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -879,7 +879,7 @@ struct AgentRle } } - CTA_SYNC(); + __syncthreads(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index 7021531d0cc..f4e76de5329 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -376,7 +376,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Perform tile scan if (tile_idx == 0) @@ -397,7 +397,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -482,7 +482,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Block scan if (IS_FIRST_TILE) @@ -496,7 +496,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 6e79ca18d8c..fc2a710837a 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -333,7 +333,7 @@ struct AgentScanByKey BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { @@ -347,7 +347,7 @@ struct AgentScanByKey BlockLoadValuesT(storage.load_values).Load(d_values_in + tile_base, values); } - CTA_SYNC(); + __syncthreads(); // first tile if (tile_idx == 0) @@ -386,7 +386,7 @@ struct AgentScanByKey ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); UnzipValues(values, scan_items); diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 1cf5eff5008..fb518da6a03 
100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -376,7 +376,7 @@ struct AgentSegmentFixup BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } - CTA_SYNC(); + __syncthreads(); KeyValuePairT tile_aggregate; if (tile_idx == 0) diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh index fe687fa9f51..0eb03d8d507 100644 --- a/cub/cub/agent/agent_segmented_radix_sort.cuh +++ b/cub/cub/agent/agent_segmented_radix_sort.cuh @@ -154,13 +154,13 @@ struct AgentSegmentedRadixSort { BlockValueLoadT(temp_storage.values_load).Load(d_values_in, thread_values, num_items); - CTA_SYNC(); + __syncthreads(); } { BlockKeyLoadT(temp_storage.keys_load).Load(d_keys_in, thread_keys, num_items, oob_default); - CTA_SYNC(); + __syncthreads(); } BlockRadixSortT(temp_storage.sort) @@ -187,13 +187,13 @@ struct AgentSegmentedRadixSort BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(OffsetT{}, num_items); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -209,7 +209,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -243,7 +243,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -257,7 +257,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 4f16992b276..8f68625b1fc 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -408,7 +408,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -450,7 +450,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; @@ -486,7 +486,7 @@ struct AgentSelectIf { if (IS_FIRST_TILE && streaming_context.is_first_partition()) { - CTA_SYNC(); + __syncthreads(); // Set head selection_flags. 
First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); @@ -499,7 +499,7 @@ struct AgentSelectIf tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1]; } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, items, inequality_op, tile_predecessor); @@ -571,7 +571,7 @@ struct AgentSelectIf int num_tile_selections, OffsetT num_selections_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter items #pragma unroll @@ -584,7 +584,7 @@ struct AgentSelectIf } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { @@ -667,7 +667,7 @@ struct AgentSelectIf OffsetT num_selections, Int2Type /*is_keep_rejects*/) { - CTA_SYNC(); + __syncthreads(); int tile_num_rejections = num_tile_items - num_tile_selections; @@ -685,7 +685,7 @@ struct AgentSelectIf } // Ensure all threads finished scattering to shared memory - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global ScatterPartitionsToGlobal( @@ -814,7 +814,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags OffsetT num_tile_selections; @@ -894,7 +894,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op( diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 41c40bee28e..12ba663770f 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -372,7 +372,7 @@ struct AgentSpmv s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -386,7 +386,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -425,7 +425,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -553,7 +553,7 @@ struct AgentSpmv s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -567,7 +567,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -600,7 +600,7 @@ struct AgentSpmv scan_segment[ITEM].key = thread_current_coord.x; } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -620,7 +620,7 @@ 
struct AgentSpmv if (tile_num_rows > 0) { - CTA_SYNC(); + __syncthreads(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; @@ -647,7 +647,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) @@ -709,7 +709,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index eec24057163..65b72b04372 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -313,7 +313,7 @@ struct AgentThreeWayPartition AccumPackT num_tile_selected_prefix, OffsetT num_rejected_prefix) { - CTA_SYNC(); + __syncthreads(); const OffsetT num_first_selections_prefix = AccumPackHelperT::first(num_tile_selected_prefix); const OffsetT num_second_selections_prefix = AccumPackHelperT::second(num_tile_selected_prefix); @@ -353,7 +353,7 @@ struct AgentThreeWayPartition } } - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global auto first_base = @@ -421,7 +421,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selection_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) @@ -486,7 +486,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selected_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx); @@ -497,7 +497,7 @@ struct AgentThreeWayPartition AccumPackT num_items_in_tile_selected = prefix_op.GetBlockAggregate(); AccumPackT num_items_selected_prefix = prefix_op.GetExclusivePrefix(); - CTA_SYNC(); + __syncthreads(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - AccumPackHelperT::sum(num_items_selected_prefix); diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 30f5d4f50e4..c388f0e744d 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -286,7 +286,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types @@ -296,7 +296,7 @@ struct AgentUniqueByKey items_out[num_selections_prefix + item] = GetShared(tag)[item]; } - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -337,7 +337,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -352,7 +352,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, keys, inequality_op); #pragma unroll @@ -365,7 +365,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -390,7 +390,7 @@ struct AgentUniqueByKey } num_selections = num_tile_selections; - 
CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -402,7 +402,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -454,7 +454,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -469,7 +469,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) @@ -485,7 +485,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -505,7 +505,7 @@ struct AgentUniqueByKey num_selections -= num_discount; } - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -517,7 +517,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 5bc3bae3219..38636571e80 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -309,7 +309,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -408,7 +408,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -499,7 +499,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -622,7 +622,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -736,7 +736,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) @@ -837,7 +837,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) @@ -926,7 +926,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index fb88dfac07f..e4998f32510 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -292,7 +292,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if (linear_tid == 0) { @@ -337,7 +337,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? 
tile_predecessor_item : // First thread @@ -586,7 +586,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = @@ -686,7 +686,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread @@ -790,7 +790,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -920,7 +920,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1052,7 +1052,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1189,7 +1189,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index a41048eb94d..d1ae91c223d 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -217,7 +217,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -251,7 +251,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -267,7 +267,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -380,7 +380,7 @@ private: #pragma unroll for (int slice = 1; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -436,7 +436,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); // No timeslicing #pragma unroll @@ -472,7 +472,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -495,7 +495,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -574,7 +574,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -633,7 +633,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -669,7 +669,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; slice++) { - CTA_SYNC(); + __syncthreads(); const int slice_offset = TIME_SLICED_ITEMS * slice; @@ -687,7 +687,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -740,7 +740,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -779,7 +779,7 @@ private: 
const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -795,7 +795,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1144,7 +1144,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1203,7 +1203,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index d5726f240f6..8caf6a5bf59 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -358,7 +358,7 @@ public: // Initialize histogram bin counts to zeros InitHistogram(histogram); - CTA_SYNC(); + __syncthreads(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index b6d0c8a33b1..7f931b0d42d 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -760,7 +760,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - CTA_SYNC(); + __syncthreads(); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 22fe117a1f9..fcb63031687 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -478,12 +478,12 @@ public: *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } - CTA_SYNC(); + __syncthreads(); // Scan shared memory counters ScanCounters(); - CTA_SYNC(); + __syncthreads(); // Extract the local ranks of each key #pragma unroll @@ -711,7 +711,7 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; } - CTA_SYNC(); + __syncthreads(); // Each warp will strip-mine its section of input, one strip at a time @@ -762,7 +762,7 @@ public: ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } - CTA_SYNC(); + __syncthreads(); // Scan warp counters @@ -782,7 +782,7 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; } - CTA_SYNC(); + __syncthreads(); if (!::cuda::std::is_same>::value) { CallBack(callback); @@ -1002,7 +1002,7 @@ struct BlockRadixRankMatchEarlyCounts int bin = lane + u * WARP_THREADS; bins[u] = cub::ThreadReduce(warp_histograms[bin], ::cuda::std::plus<>{}); } - CTA_SYNC(); + __syncthreads(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; @@ -1118,7 +1118,7 @@ struct BlockRadixRankMatchEarlyCounts { ComputeHistogramsWarp(keys); - CTA_SYNC(); + __syncthreads(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); @@ -1126,7 +1126,7 @@ struct BlockRadixRankMatchEarlyCounts BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); ComputeRanksItem(keys, ranks, Int2Type()); } diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 3223b920b13..080053348d7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -364,7 +364,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement 
BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); @@ -377,7 +377,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); @@ -443,7 +443,7 @@ private: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); @@ -457,7 +457,7 @@ private: break; } - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary @@ -522,7 +522,7 @@ public: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Check if this is the last pass if (begin_bit >= end_bit) @@ -543,7 +543,7 @@ public: // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 0dca0a5d838..2138ed31d7e 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -314,7 +314,7 @@ private: } // Ensure run offsets and run values have been written to shared memory - CTA_SYNC(); + __syncthreads(); } template @@ -335,7 +335,7 @@ private: total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) - CTA_SYNC(); + __syncthreads(); InitWithRunOffsets(run_values, run_offsets); } diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index c49eb36a52e..44b2342c7d1 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -477,7 +477,7 @@ public: //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -714,17 +714,17 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -957,7 +957,7 @@ public: //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1230,16 +1230,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! 
BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Collectively compute the block-wide exclusive prefix max scan
//! BlockScan(temp_storage.scan).ExclusiveScan(
//! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Store scanned items to output segment
//! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//! }
//!
//! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
@@ -1618,7 +1618,7 @@ public:
//! // Collectively compute the block-wide inclusive prefix sum
//! BlockScan(temp_storage).InclusiveSum(
//! thread_data, thread_data, prefix_op);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Store scanned items to output segment
//! d_data[block_offset] = thread_data;
@@ -1874,16 +1874,16 @@ public:
//! // Load a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Collectively compute the block-wide inclusive prefix sum
//! BlockScan(temp_storage.scan).InclusiveSum(
//! thread_data, thread_data, prefix_op);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Store scanned items to output segment
//! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//! }
//!
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
@@ -2123,7 +2123,7 @@ public:
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage).InclusiveScan(
//! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Store scanned items to output segment
//! d_data[block_offset] = thread_data;
@@ -2516,16 +2516,16 @@ public:
//! // Load a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Collectively compute the block-wide inclusive prefix max scan
//! BlockScan(temp_storage.scan).InclusiveScan(
//! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
- //! CTA_SYNC();
+ //! __syncthreads();
//!
//! // Store scanned items to output segment
//! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- //! CTA_SYNC();
+ //! __syncthreads();
//! }
//!
//! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
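For reference, a minimal, self-contained sketch of the idiom the doc hunks above now
show: shared memory repurposed between collectives with a plain __syncthreads() in place
of cub::CTA_SYNC(). The kernel name, the 128-thread block size, and the 4 items per
thread are illustrative assumptions, not identifiers from the CUB sources.

    #include <cub/block/block_load.cuh>
    #include <cub/block/block_scan.cuh>
    #include <cub/block/block_store.cuh>

    __global__ void example_inclusive_sum(int* d_data)
    {
      using BlockLoadT  = cub::BlockLoad<int, 128, 4>;
      using BlockScanT  = cub::BlockScan<int, 128>;
      using BlockStoreT = cub::BlockStore<int, 128, 4>;

      // One shared-memory allocation, reused in turn by all three collectives
      __shared__ union
      {
        typename BlockLoadT::TempStorage load;
        typename BlockScanT::TempStorage scan;
        typename BlockStoreT::TempStorage store;
      } temp_storage;

      int block_offset = blockIdx.x * 128 * 4;
      int thread_data[4];
      BlockLoadT(temp_storage.load).Load(d_data + block_offset, thread_data);
      __syncthreads(); // was cub::CTA_SYNC(); barrier before the union is reused

      BlockScanT(temp_storage.scan).InclusiveSum(thread_data, thread_data);
      __syncthreads(); // barrier before the union is repurposed for the store

      BlockStoreT(temp_storage.store).Store(d_data + block_offset, thread_data);
    }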
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index a3dedcc3c70..93d8715c63b 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -164,7 +164,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) @@ -196,7 +196,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) @@ -230,7 +230,7 @@ public: { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) @@ -298,7 +298,7 @@ public: { temp_storage[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 443f7a7f93b..e207a1d76c1 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -897,7 +897,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -980,7 +980,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -1063,7 +1063,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 7ef3c1264a5..38d49a3b8e6 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -187,7 +187,7 @@ struct BlockHistogramSort // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); - CTA_SYNC(); + __syncthreads(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; @@ -205,7 +205,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } - CTA_SYNC(); + __syncthreads(); int flags[ITEMS_PER_THREAD]; // unused @@ -219,7 +219,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_begin[items[0]] = 0; } - CTA_SYNC(); + __syncthreads(); // Composite into histogram histo_offset = 0; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 3d1f7b72707..7382732960b 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -212,7 +212,7 @@ struct BlockReduceRaking // Place partial into shared memory grid. 
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 49401e87fb4..9bfd94f425d 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -167,7 +167,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -214,7 +214,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 623dc43f027..efb47d6101e 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -186,7 +186,7 @@ struct BlockReduceWarpReductions detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index f0fe7a5ca2a..7f1b1887bc3 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -302,7 +302,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -318,7 +318,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; @@ -355,7 +355,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -371,7 +371,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial); } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -410,7 +410,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -433,7 +433,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -478,7 +478,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -501,7 +501,7 @@ struct BlockScanRaking } } - CTA_SYNC(); 
+ __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -559,7 +559,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -588,7 +588,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -626,7 +626,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -642,7 +642,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -680,7 +680,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -703,7 +703,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -758,7 +758,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -787,7 +787,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index c9c23e43f05..b71855132c1 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -199,7 +199,7 @@ struct BlockScanWarpScans detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; @@ -425,7 +425,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; @@ -530,7 +530,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 46ff7cbced6..287f702b095 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -131,7 +131,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO // Make sure thread 0 does not overwrite the buffer id before other threads have finished with // the prior iteration of the loop - CTA_SYNC(); + __syncthreads(); // Binary search the buffer that this tile belongs to if (threadIdx.x == 
0) @@ -140,7 +140,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO } // Make sure thread 0 has written the buffer this thread block is assigned to - CTA_SYNC(); + __syncthreads(); const BufferOffsetT buffer_id = block_buffer_id; diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 0d4d9bf1ea9..c533afa1243 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -161,7 +161,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); - CTA_SYNC(); + __syncthreads(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); @@ -432,7 +432,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - CTA_SYNC(); + __syncthreads(); // Load values if (!KEYS_ONLY) @@ -443,7 +443,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - CTA_SYNC(); + __syncthreads(); } // Sort tile @@ -616,13 +616,13 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(segment_begin, segment_end); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -638,7 +638,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -677,7 +677,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -691,7 +691,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9d011d414ba..71e127a78db 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -212,7 +212,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -486,7 +486,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index f2ae69fc091..7e134f7a63f 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -79,7 +79,7 @@ public: // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); - CTA_SYNC(); + __syncthreads(); if (blockIdx.x == 0) { @@ -89,7 +89,7 @@ public: d_vol_sync[blockIdx.x] = 1; } - CTA_SYNC(); + __syncthreads(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -100,7 +100,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -122,7 +122,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); } } }; diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 0d0efec864b..ab402528538 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -193,6 +193,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) /** * CTA barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() { __syncthreads(); @@ -201,6 +202,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) { return __syncthreads_and(p); @@ -209,6 +211,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) { return __syncthreads_or(p); diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index f5926ce11e5..baba489c0ae 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -168,7 +168,7 @@ public: static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { // Ensure all threads finished using temporary storage - CTA_SYNC(); + __syncthreads(); const std::size_t linear_tid = threadIdx.x; const std::size_t block_stride = line_size * blockDim.x; diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu index cf080e173d7..dc322e49f8a 100644 --- a/cub/test/catch2_test_block_run_length_decode.cu +++ b/cub/test/catch2_test_block_run_length_decode.cu @@ -104,7 +104,7 @@ private: BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, 
decoded_size); // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Construct BlockRunLengthDecode and initialize with the run offsets return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets); @@ -137,7 +137,7 @@ private: } // Ensure BlockLoad's temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Load this block's tile of run lengths if (num_valid_items < RUNS_PER_BLOCK) @@ -151,7 +151,7 @@ private: } // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); } public: diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 46681423790..186990f4a0b 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -681,7 +681,7 @@ THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream) inline void _CCCL_DEVICE sync_threadblock() { - cub::CTA_SYNC(); + __syncthreads(); } #define CUDA_CUB_RET_IF_FAIL(e) \ From 3cf7cb5333a6d0220b16db8271ce2e63109549d3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 13 Jan 2025 17:09:37 -0800 Subject: [PATCH 12/14] replace shuffle idx --- cub/cub/agent/agent_radix_sort_onesweep.cuh | 2 +- cub/cub/block/block_radix_rank.cuh | 4 ++-- cub/cub/util_ptx.cuh | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index c93ce277e67..9055d5052b2 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -528,7 +528,7 @@ struct AgentRadixSortOnesweep { num_writes -= int(global_idx + 1) % ALIGN; } - num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); + num_writes = __shfl_sync(WARP_MASK, num_writes, last_lane); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index fcb63031687..ad495e1db31 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -1077,7 +1077,7 @@ struct BlockRadixRankMatchEarlyCounts // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); if (lane == leader) { *p_match_mask = 0; @@ -1106,7 +1106,7 @@ struct BlockRadixRankMatchEarlyCounts // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); ranks[u] = warp_offset + popc - 1; } } diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index ab402528538..8b7d5f8acff 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -292,6 +292,7 @@ SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_ma /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); From e60f28801c30cd9d7891278d34c93a49cff60e47 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 14 Jan 2025 10:21:28 -0800 Subject: [PATCH 13/14] recover LogicShiftLeft and LogicShiftRight Signed-off-by: fbusato --- cub/cub/agent/agent_batch_memcpy.cuh | 18 ++++++++++-------- 
cub/cub/util_ptx.cuh | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index b20b734c389..27f19a17896 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -424,11 +424,12 @@ public: for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) { // In case the bit-offset of the counter at is larger than the bit range of the - // current unit, the bit_shift amount will be larger than the bits provided by this unit. - // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width. - // The bit_shift is a run-time value, it is translated into SASS `shr` and the result behavior is well-defined. + // current unit, the bit_shift amount will be larger than the bits provided by this unit. As + // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, + // we use the PTX instruction `shr` to make sure behaviour is well-defined. + // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; - val |= (data[i] >> bit_shift) & ITEM_MASK; + val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK; } return val; } @@ -441,11 +442,12 @@ public: for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) { // In case the bit-offset of the counter at is larger than the bit range of the - // current unit, the bit_shift amount will be larger than the bits provided by this unit. - // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width. - // The bit_shift is a run-time value, it is translated into SASS `shl` and the result behavior is well-defined. + // current unit, the bit_shift amount will be larger than the bits provided by this unit. As + // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, + // we use the PTX instruction `shl` to make sure behaviour is well-defined. + // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; - data[i] += (value << bit_shift) & UNIT_MASK; + data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK; } } diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 8b7d5f8acff..82893676ee2 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -748,6 +748,28 @@ struct warp_matcher_t } }; +/** + * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + +/** + * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. 
+ */
+_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits)
+{
+  uint32_t ret{};
+  asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
+  return ret;
+}
+
} // namespace detail

#endif // _CCCL_DOXYGEN_INVOKED

From ffe65ca4660be49afe718429009ee5895f1d22ba Mon Sep 17 00:00:00 2001
From: fbusato
Date: Tue, 14 Jan 2025 10:37:41 -0800
Subject: [PATCH 14/14] deprecate shfl functions

Signed-off-by: fbusato
---
 cub/cub/util_ptx.cuh | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index 82893676ee2..99beeed313e 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -280,6 +280,7 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
/**
 * Warp synchronous shfl_idx
 */
+CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead")
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int
SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
@@ -657,12 +657,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t
template <int LOGICAL_WARP_THREADS, typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
{
- /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
- enum
- {
- SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
- };
-
using ShuffleWord = typename UnitWord<T>::ShuffleWord;
constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord* output_alias = reinterpret_cast<ShuffleWord*>(&output);
ShuffleWord* input_alias = reinterpret_cast<ShuffleWord*>(&input);
unsigned int shuffle_word;
- shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[0], src_lane, SHFL_C, member_mask);
+ shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
- shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[WORD], src_lane, SHFL_C, member_mask);
+ shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS);
output_alias[WORD] = shuffle_word;
}
return output;
}
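Taken together, the series steers callers away from CUB's thin wrappers and toward the
raw CUDA intrinsics. A short migration sketch for callers of the deprecated helpers
follows; the function and variable names are illustrative, not part of the patch.

    // Before (now deprecated):           // After:
    //   cub::CTA_SYNC();                 //   __syncthreads();
    //   cub::SHFL_IDX_SYNC(x, lane, m);  //   __shfl_sync(m, x, lane);

    // Broadcasting one lane's value across a (logical) warp:
    __device__ unsigned int broadcast_value(unsigned int x, int src_lane, unsigned int member_mask)
    {
      // __shfl_sync takes an optional fourth width argument (default warpSize);
      // the rewritten ShuffleIndex passes LOGICAL_WARP_THREADS there to confine
      // the shuffle to logical sub-warps.
      return __shfl_sync(member_mask, x, src_lane);
    }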