diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index 37e1a013193..b023de3753a 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -138,7 +138,7 @@ struct AgentDifference BlockLoad(temp_storage.load).Load(load_it + tile_base, input); } - CTA_SYNC(); + __syncthreads(); if (ReadLeft) { @@ -186,7 +186,7 @@ struct AgentDifference } } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index f9d5e8b16a1..27f19a17896 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -834,7 +834,7 @@ private: BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); } - CTA_SYNC(); + __syncthreads(); // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; @@ -996,7 +996,7 @@ private: // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused // with next iteration - CTA_SYNC(); + __syncthreads(); } } @@ -1026,7 +1026,7 @@ public: } // Ensure we can repurpose the BlockLoad's temporary storage - CTA_SYNC(); + __syncthreads(); // Count how many buffers fall into each size-class VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); @@ -1037,7 +1037,7 @@ public: .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids - CTA_SYNC(); + __syncthreads(); // Factor in the per-size-class counts / offsets // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset @@ -1077,7 +1077,7 @@ public: // Ensure the prefix callback has finished using its temporary storage and that it can be reused // in the next stage - CTA_SYNC(); + __syncthreads(); // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their // size @@ -1085,7 +1085,7 @@ public: // Ensure all buffers have been partitioned by their size class AND // ensure that blev_buffer_offset has been written to shared memory - CTA_SYNC(); + __syncthreads(); // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; @@ -1104,7 +1104,7 @@ public: tile_id); // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers - CTA_SYNC(); + __syncthreads(); // Copy warp-level buffers BatchMemcpyWLEVBuffers( diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index e454dc837b1..98a16b36c68 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -320,7 +320,7 @@ struct AgentHistogram } // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters @@ -350,7 +350,7 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); // Apply privatized bin counts to output bin counts #pragma unroll @@ -690,7 +690,7 @@ struct AgentHistogram ConsumeTile(tile_offset, TILE_SAMPLES); } - CTA_SYNC(); + __syncthreads(); // Get next tile if (threadIdx.x == 0) @@ -698,7 +698,7 @@ struct AgentHistogram temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; } - CTA_SYNC(); + __syncthreads(); tile_idx = temp_storage.tile_idx; } diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index ae457bb954d..e1649b812ff 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -132,7 +132,7 @@ struct agent_t gmem_to_reg( keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); reg_to_shared(&storage.keys_shared[0], keys_loc); - CTA_SYNC(); + __syncthreads(); // use binary search in shared memory to find merge path for each of thread. // we can use int type here, because the number of items in shared memory is limited @@ -158,7 +158,7 @@ struct agent_t keys_loc, indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (IsFullTile) @@ -182,9 +182,10 @@ struct agent_t item_type items_loc[items_per_thread]; gmem_to_reg( items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); - CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it + __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write + // to it reg_to_shared(&storage.items_shared[0], items_loc); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem #pragma unroll @@ -192,7 +193,7 @@ struct agent_t { items_loc[i] = storage.items_shared[indices[i]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem if (IsFullTile) diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index dd8b559f2c4..9c0c54fbeb2 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -187,7 +187,7 @@ struct AgentBlockSort BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } - CTA_SYNC(); + __syncthreads(); } KeyT keys_local[ITEMS_PER_THREAD]; @@ -200,7 +200,7 @@ struct AgentBlockSort BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local); } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) @@ -212,7 +212,7 @@ struct AgentBlockSort BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } - CTA_SYNC(); + __syncthreads(); if (ping) { @@ -227,7 +227,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -252,7 +252,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -583,7 +583,7 @@ struct AgentMerge } } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // use binary search in shared memory @@ -616,7 +616,7 @@ struct AgentMerge indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (ping) @@ -650,11 +650,11 @@ struct AgentMerge _CCCL_IF_CONSTEXPR (!KEYS_ONLY) #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { - CTA_SYNC(); + __syncthreads(); detail::reg_to_shared(&storage.items_shared[0], items_local); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem // @@ -664,7 +664,7 @@ struct AgentMerge items_local[item] = storage.items_shared[indices[item]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem // diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index 43562c9a2b5..23fcd653fe1 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -277,7 +277,7 @@ struct AgentRadixSortDownsweep temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -305,7 +305,7 @@ struct AgentRadixSortDownsweep int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { - CTA_SYNC(); + __syncthreads(); ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias(); @@ -315,7 +315,7 @@ struct AgentRadixSortDownsweep exchange_values[ranks[ITEM]] = values[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -342,7 +342,7 @@ struct AgentRadixSortDownsweep { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - CTA_SYNC(); + __syncthreads(); } /** @@ -362,7 +362,7 @@ struct AgentRadixSortDownsweep BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item); - CTA_SYNC(); + __syncthreads(); } /** @@ -409,7 +409,7 @@ struct AgentRadixSortDownsweep { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values); - CTA_SYNC(); + __syncthreads(); } /** @@ -428,7 +428,7 @@ struct AgentRadixSortDownsweep BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); - CTA_SYNC(); + __syncthreads(); } /** @@ -474,7 +474,7 @@ struct AgentRadixSortDownsweep { ValueT values[ITEMS_PER_THREAD]; - CTA_SYNC(); + __syncthreads(); LoadValues(values, block_offset, valid_items, Int2Type(), Int2Type()); @@ -520,7 +520,7 @@ struct AgentRadixSortDownsweep int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); // Share exclusive digit prefix #pragma unroll @@ -534,7 +534,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; @@ -562,7 +562,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Update global scatter base offsets for each digit #pragma unroll @@ -577,7 +577,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); @@ -602,7 +602,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; @@ -616,7 +616,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } @@ -670,7 +670,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -719,7 +719,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -744,7 +744,7 @@ struct AgentRadixSortDownsweep ProcessTile(block_offset); block_offset += TILE_ITEMS; - CTA_SYNC(); + __syncthreads(); } // Clean up last partial tile with guarded-I/O diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 2785f732450..0eb5a9415ea 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -50,6 +50,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN template @@ -172,7 +174,7 @@ struct AgentRadixSortHistogram } } } - CTA_SYNC(); + __syncthreads(); } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) @@ -199,7 +201,7 @@ struct AgentRadixSortHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { - int part = LaneId() % NUM_PARTS; + int part = ::cuda::ptx::get_sreg_laneid() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { @@ -247,7 +249,7 @@ struct AgentRadixSortHistogram { // Reset the counters. Init(); - CTA_SYNC(); + __syncthreads(); // Process the tiles. OffsetT portion_offset = portion * MAX_PORTION_SIZE; @@ -259,11 +261,11 @@ struct AgentRadixSortHistogram LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); // Accumulate the result in global memory. AccumulateGlobalHistograms(); - CTA_SYNC(); + __syncthreads(); } } diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index a78ee66c7b2..9055d5052b2 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -279,7 +280,7 @@ struct AgentRadixSortOnesweep } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; - want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); + want_mask = __ballot_sync(want_mask, (value_j & LOOKBACK_GLOBAL_MASK) == 0); if (value_j & LOOKBACK_GLOBAL_MASK) { break; @@ -349,7 +350,7 @@ struct AgentRadixSortOnesweep short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } - short_circuit = CTA_SYNC_OR(short_circuit); + short_circuit = __syncthreads_or(short_circuit); if (!short_circuit) { return; @@ -377,7 +378,7 @@ struct AgentRadixSortOnesweep LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); - CTA_SYNC(); + __syncthreads(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; @@ -483,7 +484,7 @@ struct AgentRadixSortOnesweep { d_keys_out[global_idx] = Twiddle::Out(key, decomposer); } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -501,7 +502,7 @@ struct AgentRadixSortOnesweep { d_values_out[global_idx] = value; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -527,7 +528,7 @@ struct AgentRadixSortOnesweep { num_writes -= int(global_idx + 1) % ALIGN; } - num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); + num_writes = __shfl_sync(WARP_MASK, num_writes, last_lane); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); @@ -600,10 +601,10 @@ struct AgentRadixSortOnesweep LoadValues(block_idx * TILE_ITEMS, values); // scatter values - CTA_SYNC(); + __syncthreads(); ScatterValuesShared(values, ranks); - CTA_SYNC(); + __syncthreads(); ScatterValuesGlobal(digits); } @@ -625,7 +626,7 @@ struct AgentRadixSortOnesweep .RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory - CTA_SYNC(); + __syncthreads(); ScatterKeysShared(keys, ranks); // compute global offsets @@ -634,7 +635,7 @@ struct AgentRadixSortOnesweep UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory - CTA_SYNC(); + __syncthreads(); ScatterKeysGlobal(); // scatter values if necessary @@ -669,7 +670,7 @@ struct AgentRadixSortOnesweep , current_bit(current_bit) , num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) , decomposer(decomposer) { // initialization @@ -677,7 +678,7 @@ struct AgentRadixSortOnesweep { s.block_idx = atomicAdd(d_ctrs, 1); } - CTA_SYNC(); + __syncthreads(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index e91e32c5bd3..d9418d2bc56 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -52,6 +52,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -298,7 +300,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) @@ -331,7 +333,7 @@ struct AgentRadixSortUpsweep LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting - CTA_SYNC(); + __syncthreads(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); @@ -385,12 +387,12 @@ struct AgentRadixSortUpsweep block_offset += TILE_ITEMS; } - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); - CTA_SYNC(); + __syncthreads(); // Reset composite counters in lanes ResetDigitCounters(); @@ -406,7 +408,7 @@ struct AgentRadixSortUpsweep // Process partial tile if necessary ProcessPartialTile(block_offset, block_end); - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers UnpackDigitCounts(); @@ -419,7 +421,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll @@ -440,7 +442,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions @@ -499,7 +501,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll @@ -520,7 +522,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions #pragma unroll diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 735993723d8..f48c9567a24 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -426,7 +426,7 @@ struct AgentReduceByKey OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter pairs #pragma unroll @@ -438,7 +438,7 @@ struct AgentReduceByKey } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -539,7 +539,7 @@ struct AgentReduceByKey tile_predecessor = (tile_idx == 0) ? keys[0] : d_keys_in[tile_offset - 1]; } - CTA_SYNC(); + __syncthreads(); // Load values if (IS_LAST_TILE) @@ -551,7 +551,7 @@ struct AgentReduceByKey BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 2495d2f5f7a..266fbfcae88 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -465,7 +466,7 @@ struct AgentRle { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); LengthOffsetPair identity; identity.key = 0; @@ -501,7 +502,7 @@ struct AgentRle temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; } - CTA_SYNC(); + __syncthreads(); // Accumulate total selected and the warp-wide prefix @@ -531,7 +532,7 @@ struct AgentRle // Ensure all threads have read warp aggregates before temp_storage is repurposed in the // subsequent scatter stage - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -551,7 +552,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -564,7 +565,7 @@ struct AgentRle #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { - CTA_SYNC(); + __syncthreads(); if (warp_id == SLICE) { @@ -608,7 +609,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; @@ -624,7 +625,7 @@ struct AgentRle WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]) .ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp); - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]) .ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp); @@ -762,7 +763,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -848,7 +849,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -878,7 +879,7 @@ struct AgentRle } } - CTA_SYNC(); + __syncthreads(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index 7021531d0cc..f4e76de5329 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -376,7 +376,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Perform tile scan if (tile_idx == 0) @@ -397,7 +397,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -482,7 +482,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Block scan if (IS_FIRST_TILE) @@ -496,7 +496,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 6e79ca18d8c..fc2a710837a 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -333,7 +333,7 @@ struct AgentScanByKey BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { @@ -347,7 +347,7 @@ struct AgentScanByKey BlockLoadValuesT(storage.load_values).Load(d_values_in + tile_base, values); } - CTA_SYNC(); + __syncthreads(); // first tile if (tile_idx == 0) @@ -386,7 +386,7 @@ struct AgentScanByKey ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); UnzipValues(values, scan_items); diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 1cf5eff5008..fb518da6a03 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -376,7 +376,7 @@ struct AgentSegmentFixup BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } - CTA_SYNC(); + __syncthreads(); KeyValuePairT tile_aggregate; if (tile_idx == 0) diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh index fe687fa9f51..0eb03d8d507 100644 --- a/cub/cub/agent/agent_segmented_radix_sort.cuh +++ b/cub/cub/agent/agent_segmented_radix_sort.cuh @@ -154,13 +154,13 @@ struct AgentSegmentedRadixSort { BlockValueLoadT(temp_storage.values_load).Load(d_values_in, thread_values, num_items); - CTA_SYNC(); + __syncthreads(); } { BlockKeyLoadT(temp_storage.keys_load).Load(d_keys_in, thread_keys, num_items, oob_default); - CTA_SYNC(); + __syncthreads(); } BlockRadixSortT(temp_storage.sort) @@ -187,13 +187,13 @@ struct AgentSegmentedRadixSort BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(OffsetT{}, num_items); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -209,7 +209,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -243,7 +243,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -257,7 +257,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 4f16992b276..8f68625b1fc 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -408,7 +408,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -450,7 +450,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; @@ -486,7 +486,7 @@ struct AgentSelectIf { if (IS_FIRST_TILE && streaming_context.is_first_partition()) { - CTA_SYNC(); + __syncthreads(); // Set head selection_flags. First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); @@ -499,7 +499,7 @@ struct AgentSelectIf tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1]; } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, items, inequality_op, tile_predecessor); @@ -571,7 +571,7 @@ struct AgentSelectIf int num_tile_selections, OffsetT num_selections_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter items #pragma unroll @@ -584,7 +584,7 @@ struct AgentSelectIf } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { @@ -667,7 +667,7 @@ struct AgentSelectIf OffsetT num_selections, Int2Type /*is_keep_rejects*/) { - CTA_SYNC(); + __syncthreads(); int tile_num_rejections = num_tile_items - num_tile_selections; @@ -685,7 +685,7 @@ struct AgentSelectIf } // Ensure all threads finished scattering to shared memory - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global ScatterPartitionsToGlobal( @@ -814,7 +814,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags OffsetT num_tile_selections; @@ -894,7 +894,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op( diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index a3f985ad0fa..70aac32d0b8 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -381,7 +381,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -395,7 +395,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -434,7 +434,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv } } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -562,7 +562,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -576,7 +576,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -609,7 +609,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv scan_segment[ITEM].key = thread_current_coord.x; } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -629,7 +629,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv if (tile_num_rows > 0) { - CTA_SYNC(); + __syncthreads(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; @@ -656,7 +656,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv } } - CTA_SYNC(); + __syncthreads(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) @@ -718,7 +718,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv } } - CTA_SYNC(); + __syncthreads(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index f07b2173cdc..e3e7334fe1b 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -233,23 +233,23 @@ public: KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size); } } diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index eec24057163..65b72b04372 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -313,7 +313,7 @@ struct AgentThreeWayPartition AccumPackT num_tile_selected_prefix, OffsetT num_rejected_prefix) { - CTA_SYNC(); + __syncthreads(); const OffsetT num_first_selections_prefix = AccumPackHelperT::first(num_tile_selected_prefix); const OffsetT num_second_selections_prefix = AccumPackHelperT::second(num_tile_selected_prefix); @@ -353,7 +353,7 @@ struct AgentThreeWayPartition } } - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global auto first_base = @@ -421,7 +421,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selection_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) @@ -486,7 +486,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selected_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx); @@ -497,7 +497,7 @@ struct AgentThreeWayPartition AccumPackT num_items_in_tile_selected = prefix_op.GetBlockAggregate(); AccumPackT num_items_selected_prefix = prefix_op.GetExclusivePrefix(); - CTA_SYNC(); + __syncthreads(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - AccumPackHelperT::sum(num_items_selected_prefix); diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 30f5d4f50e4..c388f0e744d 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -286,7 +286,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types @@ -296,7 +296,7 @@ struct AgentUniqueByKey items_out[num_selections_prefix + item] = GetShared(tag)[item]; } - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -337,7 +337,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -352,7 +352,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, keys, inequality_op); #pragma unroll @@ -365,7 +365,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -390,7 +390,7 @@ struct AgentUniqueByKey } num_selections = num_tile_selections; - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -402,7 +402,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -454,7 +454,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -469,7 +469,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) @@ -485,7 +485,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -505,7 +505,7 @@ struct AgentUniqueByKey num_selections -= num_discount; } - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -517,7 +517,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 71469a0055a..bd6551b8f8d 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -733,7 +733,7 @@ public: tile_descriptor = reinterpret_cast(alias); } - while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) + while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))) { delay_or_prevent_hoisting(); TxnWord alias = LoadStatus(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); @@ -918,7 +918,7 @@ struct ScanTileState delay(); status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); - } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID))); if (status == StatusWord(SCAN_TILE_PARTIAL)) { @@ -1145,7 +1145,7 @@ struct ReduceByKeyScanTileState TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))); status = tile_descriptor.status; value.value = tile_descriptor.value; @@ -1268,7 +1268,7 @@ struct TilePrefixCallbackOp exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))) { predecessor_idx -= CUB_PTX_WARP_THREADS; diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 5bc3bae3219..38636571e80 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -309,7 +309,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -408,7 +408,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -499,7 +499,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -622,7 +622,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -736,7 +736,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) @@ -837,7 +837,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) @@ -926,7 +926,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index fb88dfac07f..e4998f32510 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -292,7 +292,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if (linear_tid == 0) { @@ -337,7 +337,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread @@ -586,7 +586,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = @@ -686,7 +686,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread @@ -790,7 +790,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -920,7 +920,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1052,7 +1052,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1189,7 +1189,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index bdc2a3dc932..d1ae91c223d 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -179,7 +181,7 @@ private: // TODO(bgruber): can we use signed int here? Only these variables are unsigned: unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); - unsigned int lane_id = LaneId(); + unsigned int lane_id = ::cuda::ptx::get_sreg_laneid(); unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS; unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; @@ -215,7 +217,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -249,7 +251,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -265,7 +267,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -322,7 +324,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -361,7 +363,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -378,7 +380,7 @@ private: #pragma unroll for (int slice = 1; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -393,7 +395,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -434,7 +436,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); // No timeslicing #pragma unroll @@ -470,7 +472,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -493,7 +495,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -543,7 +545,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -572,7 +574,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -587,7 +589,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -626,12 +628,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -639,7 +641,7 @@ private: int item_offset = linear_tid * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -667,7 +669,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; slice++) { - CTA_SYNC(); + __syncthreads(); const int slice_offset = TIME_SLICED_ITEMS * slice; @@ -679,13 +681,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -695,7 +697,7 @@ private: int item_offset = lane_id * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } temp_items[i] = temp_storage.buff[item_offset]; } @@ -733,12 +735,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -746,7 +748,7 @@ private: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -777,7 +779,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -787,13 +789,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1134,7 +1136,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (ranks[i] >= 0) { @@ -1142,7 +1144,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1150,7 +1152,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -1193,7 +1195,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (is_valid[i]) { @@ -1201,7 +1203,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1209,7 +1211,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index d5726f240f6..8caf6a5bf59 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -358,7 +358,7 @@ public: // Initialize histogram bin counts to zeros InitHistogram(histogram); - CTA_SYNC(); + __syncthreads(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index b6d0c8a33b1..7f931b0d42d 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -760,7 +760,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - CTA_SYNC(); + __syncthreads(); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 92605b5168d..ad495e1db31 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -477,12 +478,12 @@ public: *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } - CTA_SYNC(); + __syncthreads(); // Scan shared memory counters ScanCounters(); - CTA_SYNC(); + __syncthreads(); // Extract the local ranks of each key #pragma unroll @@ -710,13 +711,13 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; } - CTA_SYNC(); + __syncthreads(); // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); + uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) @@ -740,7 +741,7 @@ public: DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); @@ -755,13 +756,13 @@ public: } // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } - CTA_SYNC(); + __syncthreads(); // Scan warp counters @@ -781,7 +782,7 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; } - CTA_SYNC(); + __syncthreads(); if (!::cuda::std::is_same>::value) { CallBack(callback); @@ -977,7 +978,7 @@ struct BlockRadixRankMatchEarlyCounts match_masks[bin] = 0; } } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; @@ -991,7 +992,7 @@ struct BlockRadixRankMatchEarlyCounts // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; @@ -1001,7 +1002,7 @@ struct BlockRadixRankMatchEarlyCounts int bin = lane + u * WARP_THREADS; bins[u] = cub::ThreadReduce(warp_histograms[bin], ::cuda::std::plus<>{}); } - CTA_SYNC(); + __syncthreads(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; @@ -1066,22 +1067,22 @@ struct BlockRadixRankMatchEarlyCounts ::cuda::std::uint32_t bin = Digit(keys[u]); int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); if (lane == leader) { *p_match_mask = 0; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } @@ -1099,13 +1100,13 @@ struct BlockRadixRankMatchEarlyCounts detail::warp_in_block_matcher_t::match_any(bin, warp); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); ranks[u] = warp_offset + popc - 1; } } @@ -1117,7 +1118,7 @@ struct BlockRadixRankMatchEarlyCounts { ComputeHistogramsWarp(keys); - CTA_SYNC(); + __syncthreads(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); @@ -1125,7 +1126,7 @@ struct BlockRadixRankMatchEarlyCounts BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); ComputeRanksItem(keys, ranks, Int2Type()); } @@ -1135,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) {} }; diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 3223b920b13..080053348d7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -364,7 +364,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); @@ -377,7 +377,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); @@ -443,7 +443,7 @@ private: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); @@ -457,7 +457,7 @@ private: break; } - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary @@ -522,7 +522,7 @@ public: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Check if this is the last pass if (begin_bit >= end_bit) @@ -543,7 +543,7 @@ public: // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 0dca0a5d838..2138ed31d7e 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -314,7 +314,7 @@ private: } // Ensure run offsets and run values have been written to shared memory - CTA_SYNC(); + __syncthreads(); } template @@ -335,7 +335,7 @@ private: total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) - CTA_SYNC(); + __syncthreads(); InitWithRunOffsets(run_values, run_offsets); } diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index c49eb36a52e..44b2342c7d1 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -477,7 +477,7 @@ public: //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -714,17 +714,17 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -957,7 +957,7 @@ public: //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1230,16 +1230,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage.scan).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. @@ -1618,7 +1618,7 @@ public: //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1874,16 +1874,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage.scan).IncluisveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -2123,7 +2123,7 @@ public: //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -2516,16 +2516,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage.scan).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index a3dedcc3c70..93d8715c63b 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -164,7 +164,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) @@ -196,7 +196,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) @@ -230,7 +230,7 @@ public: { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) @@ -298,7 +298,7 @@ public: { temp_storage[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 443f7a7f93b..e207a1d76c1 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -897,7 +897,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -980,7 +980,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -1063,7 +1063,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 7ef3c1264a5..38d49a3b8e6 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -187,7 +187,7 @@ struct BlockHistogramSort // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); - CTA_SYNC(); + __syncthreads(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; @@ -205,7 +205,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } - CTA_SYNC(); + __syncthreads(); int flags[ITEMS_PER_THREAD]; // unused @@ -219,7 +219,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_begin[items[0]] = 0; } - CTA_SYNC(); + __syncthreads(); // Composite into histogram histo_offset = 0; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 7c1db2c9050..7382732960b 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -212,7 +212,7 @@ struct BlockReduceRaking // Place partial into shared memory grid. *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -228,7 +228,7 @@ struct BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= CUB_PTX_WARP_THREADS, "RAKING_THREADS must be <= warp size."); unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); - WARP_SYNC(mask); + __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) .template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op); diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 49401e87fb4..9bfd94f425d 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -167,7 +167,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -214,7 +214,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 4ee2b307bcf..efb47d6101e 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,6 +48,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -121,7 +123,7 @@ struct BlockReduceWarpReductions : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} /** @@ -184,7 +186,7 @@ struct BlockReduceWarpReductions detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index f0fe7a5ca2a..7f1b1887bc3 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -302,7 +302,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -318,7 +318,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; @@ -355,7 +355,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -371,7 +371,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial); } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -410,7 +410,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -433,7 +433,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -478,7 +478,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -501,7 +501,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -559,7 +559,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -588,7 +588,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -626,7 +626,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -642,7 +642,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -680,7 +680,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -703,7 +703,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -758,7 +758,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -787,7 +787,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index 851a71cbe7b..b71855132c1 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -127,7 +129,7 @@ struct BlockScanWarpScans : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} //--------------------------------------------------------------------- @@ -197,7 +199,7 @@ struct BlockScanWarpScans detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; @@ -423,7 +425,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; @@ -528,7 +530,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 46ff7cbced6..287f702b095 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -131,7 +131,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO // Make sure thread 0 does not overwrite the buffer id before other threads have finished with // the prior iteration of the loop - CTA_SYNC(); + __syncthreads(); // Binary search the buffer that this tile belongs to if (threadIdx.x == 0) @@ -140,7 +140,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO } // Make sure thread 0 has written the buffer this thread block is assigned to - CTA_SYNC(); + __syncthreads(); const BufferOffsetT buffer_id = block_buffer_id; diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 0d4d9bf1ea9..c533afa1243 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -161,7 +161,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); - CTA_SYNC(); + __syncthreads(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); @@ -432,7 +432,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - CTA_SYNC(); + __syncthreads(); // Load values if (!KEYS_ONLY) @@ -443,7 +443,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - CTA_SYNC(); + __syncthreads(); } // Sort tile @@ -616,13 +616,13 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(segment_begin, segment_end); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -638,7 +638,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -677,7 +677,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -691,7 +691,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 713332165b0..81381e5dad8 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -257,7 +257,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -535,7 +535,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index f2ae69fc091..7e134f7a63f 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -79,7 +79,7 @@ public: // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); - CTA_SYNC(); + __syncthreads(); if (blockIdx.x == 0) { @@ -89,7 +89,7 @@ public: d_vol_sync[blockIdx.x] = 1; } - CTA_SYNC(); + __syncthreads(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -100,7 +100,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -122,7 +122,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); } } }; diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index aa522d9576e..99beeed313e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -52,34 +52,10 @@ CUB_NAMESPACE_BEGIN * Inlined PTX intrinsics ******************************************************************************/ -namespace detail -{ -/** - * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. - */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} - -/** - * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. - */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} -} // namespace detail - /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -90,6 +66,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -150,6 +127,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in /** * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { @@ -159,6 +137,7 @@ BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, u /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); @@ -192,6 +171,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y * \endcode * */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; @@ -204,6 +184,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned /** * Sync-threads barrier. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); @@ -212,6 +193,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) /** * CTA barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() { __syncthreads(); @@ -220,6 +202,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) { return __syncthreads_and(p); @@ -228,6 +211,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) { return __syncthreads_or(p); @@ -236,6 +220,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) /** * Warp barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) { __syncwarp(member_mask); @@ -244,6 +229,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __any_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) { return __any_sync(member_mask, predicate); @@ -252,6 +238,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_m /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __all_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) { return __all_sync(member_mask, predicate); @@ -260,6 +247,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_m /** * Warp ballot */ +CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) { return __ballot_sync(member_mask, predicate); @@ -292,6 +280,7 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { @@ -304,6 +293,7 @@ SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_ma /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); @@ -312,6 +302,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) { float d; @@ -322,6 +313,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) { float d; @@ -342,6 +334,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() /** * \brief Abort execution and generate an interrupt to the host CPU */ +CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); @@ -359,6 +352,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, /** * \brief Returns the warp lane ID of the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() { unsigned int ret; @@ -370,6 +364,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not * correspond to a zero-based ranking within the thread block. */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() { unsigned int ret; @@ -409,6 +404,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) /** * \brief Returns the warp lane mask of all lanes less than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() { unsigned int ret; @@ -419,6 +415,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() /** * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() { unsigned int ret; @@ -429,6 +426,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() /** * \brief Returns the warp lane mask of all lanes greater than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() { unsigned int ret; @@ -439,6 +437,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() /** * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe() { unsigned int ret; @@ -659,12 +658,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum - { - SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) - }; - using ShuffleWord = typename UnitWord::ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); @@ -674,18 +667,14 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[0], src_lane, SHFL_C, member_mask); - + shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS); output_alias[0] = shuffle_word; - #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[WORD], src_lane, SHFL_C, member_mask); - + shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS); output_alias[WORD] = shuffle_word; } - return output; } @@ -750,6 +739,28 @@ struct warp_matcher_t } }; +/** + * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + +/** + * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + } // namespace detail #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index f5926ce11e5..baba489c0ae 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -168,7 +168,7 @@ public: static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { // Ensure all threads finished using temporary storage - CTA_SYNC(); + __syncthreads(); const std::size_t linear_tid = threadIdx.x; const std::size_t block_stride = line_size * blockDim.x; diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index 5abfa7cdd2f..f874f961caa 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -273,8 +275,8 @@ public: WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) - : lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index aabb9e291e9..35b688f813c 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -88,8 +90,8 @@ public: explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -102,7 +104,7 @@ public: const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -120,7 +122,7 @@ public: const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -147,13 +149,13 @@ public: { if (INSERT_PADDING) { - ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + ranks[ITEM] = (ranks[ITEM] >> LOG_SMEM_BANKS) + ranks[ITEM]; } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -162,7 +164,7 @@ public: if (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[ITEM] = temp_storage.items_shared[item_offset]; diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 3e0db152123..55df1f3beee 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -155,7 +156,7 @@ struct WarpReduceShfl /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) - : lane_id(static_cast(LaneId())) + : lane_id(static_cast(::cuda::ptx::get_sreg_laneid())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -699,7 +700,7 @@ struct WarpReduceShfl _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); // Convert to tail-segmented if (HEAD_SEGMENTED) @@ -708,7 +709,7 @@ struct WarpReduceShfl } // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 87b38db2aa3..d7884e26753 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -123,8 +125,8 @@ struct WarpReduceSmem /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) - , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) + , member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -159,7 +161,7 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) @@ -168,7 +170,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } @@ -222,7 +224,7 @@ struct WarpReduceSmem SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); if (!HEAD_SEGMENTED) { @@ -230,12 +232,12 @@ struct WarpReduceSmem } // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + warp_flags >>= (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag @@ -255,7 +257,7 @@ struct WarpReduceSmem // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) @@ -264,7 +266,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); } return input; @@ -311,12 +313,12 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index c3952b96b4f..22d6b4b6f0a 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,6 +48,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -116,7 +118,7 @@ struct WarpScanShfl /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) - : lane_id(LaneId()) + : lane_id(::cuda::ptx::get_sreg_laneid()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -540,10 +542,10 @@ struct WarpScanShfl KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + unsigned int ballot = __ballot_sync(member_mask, (pred_key != inclusive_output.key)); // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); + ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 90bdfbf361a..336416b69d9 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -49,6 +49,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /** @@ -113,10 +115,10 @@ struct WarpScanSmem : temp_storage(temp_storage.Alias()) , - lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) , - member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -132,7 +134,7 @@ struct WarpScanSmem // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) @@ -140,7 +142,7 @@ struct WarpScanSmem T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); ScanStep(partial, scan_op, Int2Type()); } @@ -171,7 +173,7 @@ struct WarpScanSmem T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) identity); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Iterate scan steps output = input; @@ -226,7 +228,7 @@ struct WarpScanSmem ThreadStore(temp_storage, (CellT) input); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return (T) ThreadLoad(temp_storage); } @@ -276,11 +278,11 @@ struct WarpScanSmem // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); } //--------------------------------------------------------------------- @@ -307,7 +309,7 @@ struct WarpScanSmem // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } @@ -334,7 +336,7 @@ struct WarpScanSmem inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) @@ -364,7 +366,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); @@ -385,7 +387,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; @@ -408,11 +410,11 @@ struct WarpScanSmem // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); @@ -420,7 +422,7 @@ struct WarpScanSmem // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index ac5c700b958..3f11129c35a 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -438,14 +440,16 @@ public: //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index 40e29322c1f..de3d311ae59 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -41,6 +41,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -151,8 +152,10 @@ public: WarpMergeSort() = delete; _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) - : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : BlockMergeSortStrategyT( + temp_storage, + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -164,7 +167,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - WARP_SYNC(member_mask); + __syncwarp(member_mask); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 0e0668709b0..cec992e699c 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -212,7 +213,7 @@ public: //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) {} //! @} end member group diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index bb99bc5965e..f0a9929e24f 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -378,14 +380,16 @@ public: //! memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu index cf080e173d7..dc322e49f8a 100644 --- a/cub/test/catch2_test_block_run_length_decode.cu +++ b/cub/test/catch2_test_block_run_length_decode.cu @@ -104,7 +104,7 @@ private: BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size); // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Construct BlockRunLengthDecode and initialize with the run offsets return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets); @@ -137,7 +137,7 @@ private: } // Ensure BlockLoad's temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Load this block's tile of run lengths if (num_valid_items < RUNS_PER_BLOCK) @@ -151,7 +151,7 @@ private: } // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); } public: diff --git a/cub/test/catch2_test_warp_merge_sort.cu b/cub/test/catch2_test_warp_merge_sort.cu index 7b245ebba33..fa4f986ad64 100644 --- a/cub/test/catch2_test_warp_merge_sort.cu +++ b/cub/test/catch2_test_warp_merge_sort.cu @@ -88,7 +88,7 @@ __global__ void warp_merge_sort_kernel(T* in, T* out, SegmentSizeItT segment_siz const int idx = thread_offset + item; thread_data[item] = in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, thread_data, valid_items, oob_default); @@ -153,7 +153,7 @@ __global__ void warp_merge_sort_kernel( keys[item] = keys_in[idx]; values[item] = values_in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, keys, values, valid_items, oob_default); diff --git a/cub/test/test_util.h b/cub/test/test_util.h index e61cd7cd6e2..c06d803ecb1 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -614,7 +614,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = 2; @@ -656,7 +656,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool& value case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = true; @@ -697,7 +697,7 @@ InitValue(GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); )); // clang-format on } diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 46681423790..186990f4a0b 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -681,7 +681,7 @@ THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream) inline void _CCCL_DEVICE sync_threadblock() { - cub::CTA_SYNC(); + __syncthreads(); } #define CUDA_CUB_RET_IF_FAIL(e) \