From 2aacfd763b35ee473c16080ffef945b60ceee82b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com> Date: Fri, 8 Nov 2024 11:59:20 -0500 Subject: [PATCH] Inspect all IDs instead of just loop in ParallelDimensionMap (#3376) This is important for Hopper MMA (see #3278) in which we only parallelize TIDx on the allocation domain of the MmaOp output. Currently this leads to us generating a usable kernel but we are not able to launch it properly because we can't infer the x dimension of the block size. This PR fixes that by replacing `tv->getLoopDomain()` with `tv->domain()->allIDs()` which will inspect the root, logical, loop, allocation domains and even intermediate IterDomains to try and find parallelized dimensions. --- csrc/parallel_dimension_map.cpp | 2 +- tests/cpp/test_gpu3.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/csrc/parallel_dimension_map.cpp b/csrc/parallel_dimension_map.cpp index f0f33597aec..def2d5bd11e 100644 --- a/csrc/parallel_dimension_map.cpp +++ b/csrc/parallel_dimension_map.cpp @@ -41,7 +41,7 @@ void ParallelDimensionMap::build(Fusion* fusion) { VectorOfUniqueEntries all_concrete_ids; auto all_vals = fusion->usedMathVals(); for (auto tv : ir_utils::filterByType(all_vals)) { - for (auto id : tv->getLoopDomain()) { + for (auto id : tv->domain()->allIDs()) { auto ptype = id->getParallelType(); if (!isParallelTypeThread(ptype)) { continue; diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index f7deca99425..ae93bd8e515 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -54,6 +54,7 @@ #include #include #include +#include "parallel_dimension_map.h" namespace nvfuser { @@ -8991,6 +8992,32 @@ TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) { } } +// Test that we are able to infer parallel dimensions even if they are not +// provided in loop domains. 
This is important for Hopper MMA since we
+// parallelize TIDx on an allocation domain for the MmaOp output that is not in
+// its loop domain.
+TEST_F(NVFuserTest, ParallelDimensionsInAllocation) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  auto tv0 = makeConcreteTensor({4, 8});
+  fusion.addInput(tv0);
+  auto tv1 = neg(tv0);
+  auto tv2 = exp(tv1);
+  fusion.addOutput(tv2);
+
+  IterDomain* merged_id = IterDomain::merge(tv1->axis(0), tv1->axis(1));
+  tv1->setAllocationDomain({merged_id}, true);
+  merged_id->parallelize(ParallelType::TIDx);
+
+  GpuLower gpulw(&fusion);
+  gpulw.run();
+
+  Val* tidx_dim = gpulw.parallelDimensionMap().get(ParallelType::TIDx);
+  ASSERT_TRUE(tidx_dim != nullptr);
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser