From 8bd9984d0125a09a13c170a507a2deff7ae748cf Mon Sep 17 00:00:00 2001 From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:59:19 -0500 Subject: [PATCH] clean 2dInnerReductionHeuristic (#3331) Removed parameters and computations related to 3D inner reduction. --- csrc/scheduler/reduction.cpp | 153 ++++++----------------------------- 1 file changed, 26 insertions(+), 127 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index c7021706865..c18e8d31ab3 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -66,22 +66,20 @@ void reduceProductTo(int64_t& z, int64_t& y, int64_t& x, const int64_t max) { std::unique_ptr inner2dReductionHeuristic( const int64_t total_reduction_numel, const int64_t total_iteration_numel, - const int64_t inner_most_dimension_numel, const int64_t n_tensor_inputs, const int64_t max_input_dtype_size, const size_t vectorize_factor) { // Set some targets for parallelization - const int64_t n_elems = total_reduction_numel * total_iteration_numel; + auto dev_prop = at::cuda::getCurrentDeviceProperties(); // WARNING: At some point we may want to generate heuristics for another // device that is not the current device. 
const int64_t max_threads_per_sm = - (int64_t)at::cuda::getCurrentDeviceProperties() - ->maxThreadsPerMultiProcessor; + (int64_t)dev_prop->maxThreadsPerMultiProcessor; const int64_t device_multiprocessor_count = - (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + (int64_t)dev_prop->multiProcessorCount; auto const max_unroll = ceilDiv( // Available unrolling based on size of data type @@ -209,8 +207,7 @@ std::unique_ptr inner2dReductionHeuristic( // Cross grid inner reduction, number of blocks to cross-grid on int64_t gridim = 1; - // Cross grid outer reduction, number of blocks to cross-grid on - int64_t grodim = 1; + // Blocks for outputs int64_t godim = 1; @@ -218,12 +215,9 @@ std::unique_ptr inner2dReductionHeuristic( int64_t bdimx = 1; // Threads for outputs int64_t bdimy = 1; - // Threads for outer reduction dimension - int64_t bdimz = 1; // Unroll amount int64_t inner_reduction_unroll_factor = 1; - int64_t outer_reduction_unroll_factor = 1; int64_t iter_unroll_factor = 1; inner_reduction_unroll_factor = @@ -232,13 +226,13 @@ std::unique_ptr inner2dReductionHeuristic( // Grab what we can out of reduction domain, but don't go over a warp size yet bdimx = std::min( std::max( - ceilDiv(inner_most_dimension_numel, inner_reduction_unroll_factor), + ceilDiv(total_reduction_numel, inner_reduction_unroll_factor), (int64_t)min_warp_size), target_threads_in_block); // If we're not just barely covering the dimension, round to a more friendly // number - if (bdimx * inner_reduction_unroll_factor != inner_most_dimension_numel) { + if (bdimx * inner_reduction_unroll_factor != total_reduction_numel) { // Round bdimx down to multiple of warp size or power 2 if (bdimx < min_warp_size) { bdimx = scheduler_utils::lastPow2(bdimx); @@ -248,37 +242,17 @@ std::unique_ptr inner2dReductionHeuristic( } // Put everything else in bdimy for now - bdimy = std::max(min_warp_size / bdimx, (int64_t)1); - - // If 3D fill the rest of the threads into bdimz - bdimz = 
std::min( - std::min( - std::max(target_threads_in_block / (bdimx * bdimy), (int64_t)1), - ceilDiv(total_reduction_numel, inner_most_dimension_numel)), - scheduler_utils::z_block_limit); - - // If 3D doesn't fill out the threads, adjust to add to bdimy - bdimy = std::max(target_threads_in_block / (bdimx * bdimz), (int64_t)1); + bdimy = std::max(target_threads_in_block / bdimx, (int64_t)1); // If we don't have a full warp and have an unroll factor, move unroll into // bdimx - if (bdimx * bdimy * bdimz < min_warp_size && - inner_reduction_unroll_factor > 1) { + if (bdimx * bdimy < min_warp_size && inner_reduction_unroll_factor > 1) { bdimx = std::min( - std::max(inner_most_dimension_numel, min_warp_size), + std::max(total_reduction_numel, min_warp_size), target_threads_in_block); - inner_reduction_unroll_factor = - std::min(ceilDiv(inner_most_dimension_numel, bdimx), max_unroll); - - // Readjust bdimy and bdimz - bdimy = std::max(min_warp_size / bdimx, (int64_t)1); - - bdimz = std::min( - std::max(target_threads_in_block / (bdimx * bdimy), (int64_t)1), - ceilDiv(total_reduction_numel, inner_most_dimension_numel)); - - bdimy = std::max(target_threads_in_block / (bdimx * bdimz), (int64_t)1); + std::min(ceilDiv(total_reduction_numel, bdimx), max_unroll); + bdimy = std::max(target_threads_in_block / bdimx, (int64_t)1); } godim = ceilDiv(total_iteration_numel, bdimy); @@ -293,32 +267,15 @@ std::unique_ptr inner2dReductionHeuristic( (int64_t)vectorize_factor); } - // Attempt to put some unrolling into the outer reduction if inner hasn't - // taken the max unrolling - if (inner_reduction_unroll_factor < max_unroll) { - outer_reduction_unroll_factor = std::min( - ceilDiv(max_unroll, inner_reduction_unroll_factor), - ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), bdimz)); - } - int64_t remainder_in_reduction = ceilDiv( total_reduction_numel, - bdimx * inner_reduction_unroll_factor * bdimz * - outer_reduction_unroll_factor * target_iterations); - - 
int64_t remainder_in_inner_dim = ceilDiv( - inner_most_dimension_numel, bdimx * inner_reduction_unroll_factor * target_iterations); // If we haven't gotten to the max_unroll case, try to take it out of the // iteration domain - if (inner_reduction_unroll_factor * outer_reduction_unroll_factor < - max_unroll) { + if (inner_reduction_unroll_factor < max_unroll) { // Don't go over a combined inner/outer unroll of max_unroll - auto unroll_available = ceilDiv( - max_unroll, - inner_reduction_unroll_factor * outer_reduction_unroll_factor); + auto unroll_available = ceilDiv(max_unroll, inner_reduction_unroll_factor); if (unroll_available > 1 && godim > 2 * device_multiprocessor_count) { unroll_available = std::min( @@ -335,20 +292,11 @@ std::unique_ptr inner2dReductionHeuristic( // reduction elements. if ((godim < target_blocks && remainder_in_reduction >= 0) || (remainder_in_reduction >= kEight)) { - auto grdim = std::min(remainder_in_reduction, bdimx * bdimy * kEight); - - gridim = remainder_in_inner_dim; - grodim = std::max(grdim / gridim, (int64_t)1); - grodim = std::max( - std::min(remainder_in_reduction / remainder_in_inner_dim, grodim), - (int64_t)1); + gridim = remainder_in_reduction; } - // Try to do some cleanup of ragged waves on device, don't do this if we're - // trying to do a 3D schedule. godim is a remainder of a split, so can only - // control gridim - if (grodim == 1 && - // If we have less than 8 waves of blocks + // Try to do some cleanup of ragged waves on device + if ( // If we have less than 8 waves of blocks gridim * godim < device_multiprocessor_count * kEight && // And we don't have an even divisible number of blocks (gridim * godim) % device_multiprocessor_count != 0 && @@ -368,7 +316,7 @@ std::unique_ptr inner2dReductionHeuristic( } } - if (grodim > 1 || gridim > 1) { + if (gridim > 1) { // Grid reductions do not support unrolling iteration dimension, revert if // set. Recalculate godim. 
if (iter_unroll_factor) { @@ -380,6 +328,7 @@ std::unique_ptr inner2dReductionHeuristic( } auto rparams = std::make_unique(); + rparams->schedule_3D = false; rparams->fastest_dim = true; rparams->cross_block_inner_reduction = true; rparams->block_dim_inner_reduction = ParallelType::TIDx; @@ -408,20 +357,8 @@ std::unique_ptr inner2dReductionHeuristic( rparams->unroll_factor_iter_dom = iter_unroll_factor; - rparams->schedule_3D = total_reduction_numel != inner_most_dimension_numel; - // Outer reduction domain - if (rparams->schedule_3D) { - rparams->cross_grid_outer_reduction = grodim > 1; - if (bdimz > 1) { - rparams->block_dim_outer_reduction = ParallelType::TIDz; - rparams->cross_block_outer_reduction = true; - } - rparams->unroll_factor_outer_reduction = outer_reduction_unroll_factor; - } - int64_t gdimx = LaunchParams::UNINITIALIZED_VAL; int64_t gdimy = LaunchParams::UNINITIALIZED_VAL; - int64_t gdimz = LaunchParams::UNINITIALIZED_VAL; // If we have a cross grid case we want to have gdimy assigned to godim and // gdimx assigned to grdim. Otherwise it's helpful to pull godim into gdimx in @@ -446,62 +383,26 @@ std::unique_ptr inner2dReductionHeuristic( } } - if (rparams->cross_grid_outer_reduction) { - if (rparams->cross_block_inner_reduction) { - rparams->grid_dim_outer_reduction = ParallelType::BIDz; - gdimz = std::min(grodim, scheduler_utils::z_grid_limit); - rparams->split_grid_dim_outer_reduction = true; - } else { - rparams->grid_dim_outer_reduction = ParallelType::BIDy; - gdimy = std::min(grodim, scheduler_utils::y_grid_limit); - rparams->split_grid_dim_outer_reduction = true; - } - } - rparams->lparams = LaunchParams( gdimx, gdimy, - gdimz, + LaunchParams::UNINITIALIZED_VAL, bdimx, bdimy > 1 ? bdimy : LaunchParams::UNINITIALIZED_VAL, - bdimz > 1 ? 
bdimz : LaunchParams::UNINITIALIZED_VAL); + LaunchParams::UNINITIALIZED_VAL); if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - debug() << "\n===== Reduction Stats ========\n" - << "total_reduction_numel: " - << total_reduction_numel / inner_most_dimension_numel << " * " - << inner_most_dimension_numel << "\n" + debug() << "\n===== Inner 2D Reduction Stats ========\n" + << "total_reduction_numel: " << total_reduction_numel << "\n" << "total_iteration_numel: " << total_iteration_numel << "\n" << "vectorize_factor: " << vectorize_factor << "\n" << "n_tensor_inputs: " << n_tensor_inputs << "\n" << "max_input_dtype_size: " << max_input_dtype_size << "\n" - << "block(" << bdimx << ", " << bdimy << ", " << bdimz << ")" + << "block(" << bdimx << ", " << bdimy << ", " << 1 << ")" << std::endl; debug() << rparams->toString() << std::endl; } - // If 3d, check if it's supported by the scheduler, otherwise force 2D - // schedule - if (rparams->schedule_3D) { - if (rparams->multiple_reds_per_blk && - (rparams->cross_grid_inner_reduction || - rparams->cross_grid_outer_reduction)) { - if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - debug() << "\n===== UNSUPPORTED REDUCTION HEURISTIC ========\n"; - debug() << rparams->multiple_reds_per_blk << ", " - << (rparams->unroll_factor_inner_reduction > 1) << ", " - << rparams->cross_grid_inner_reduction << std::endl; - } - return inner2dReductionHeuristic( - total_reduction_numel, - total_iteration_numel, - total_reduction_numel, - n_tensor_inputs, - max_input_dtype_size, - vectorize_factor); - } - } - return rparams; } @@ -909,7 +810,7 @@ std::unique_ptr inner3dReductionHeuristic( bdimz > 1 ? 
bdimz : LaunchParams::UNINITIALIZED_VAL); if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - debug() << "\n===== Reduction Stats ========\n" + debug() << "\n===== Inner 3D Reduction Stats ========\n" << "total_reduction_numel: " << total_reduction_numel / inner_most_dimension_numel << " * " << inner_most_dimension_numel << "\n" @@ -937,9 +838,8 @@ std::unique_ptr inner3dReductionHeuristic( return inner2dReductionHeuristic( total_reduction_numel, total_iteration_numel, - total_reduction_numel, - n_tensor_inputs, - max_input_dtype_size, + (int64_t)n_tensor_inputs, + (int64_t)max_input_dtype_size, vectorize_factor); } } @@ -1498,7 +1398,6 @@ std::unique_ptr reductionHeuristic( return inner2dReductionHeuristic( total_reduction_numel, total_iteration_numel, - inner_most_dimension_numel, (int64_t)n_tensor_inputs, (int64_t)max_input_dtype_size, vectorize_factor);