From 2aacfd763b35ee473c16080ffef945b60ceee82b Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Fri, 8 Nov 2024 11:59:20 -0500
Subject: [PATCH] Inspect all IDs instead of just loop in ParallelDimensionMap
 (#3376)

This is important for Hopper MMA (see #3278), in which we only
parallelize TIDx on the allocation domain of the MmaOp output. Currently
this leads to us generating a usable kernel that we are not able to
launch properly, because we can't infer the x dimension of the block
size. This PR fixes that by replacing `tv->getLoopDomain()` with
`tv->domain()->allIDs()`, which will inspect the root, logical, loop,
and allocation domains, and even intermediate IterDomains, to try to
find parallelized dimensions.
---
 csrc/parallel_dimension_map.cpp |  2 +-
 tests/cpp/test_gpu3.cpp         | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/csrc/parallel_dimension_map.cpp b/csrc/parallel_dimension_map.cpp
index f0f33597aec..def2d5bd11e 100644
--- a/csrc/parallel_dimension_map.cpp
+++ b/csrc/parallel_dimension_map.cpp
@@ -41,7 +41,7 @@ void ParallelDimensionMap::build(Fusion* fusion) {
   VectorOfUniqueEntries<PAndID> all_concrete_ids;
   auto all_vals = fusion->usedMathVals();
   for (auto tv : ir_utils::filterByType<TensorView>(all_vals)) {
-    for (auto id : tv->getLoopDomain()) {
+    for (auto id : tv->domain()->allIDs()) {
       auto ptype = id->getParallelType();
       if (!isParallelTypeThread(ptype)) {
         continue;
diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index f7deca99425..ae93bd8e515 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -54,6 +54,7 @@
 #include <algorithm>
 #include <cmath>
 #include <sstream>
+#include "parallel_dimension_map.h"
 
 namespace nvfuser {
 
@@ -8991,6 +8992,32 @@ TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) {
   }
 }
 
+// Test that ParallelDimensionMap can infer a parallel dimension from an
+// IterDomain that is only in an allocation domain and not in any loop domain.
+// This is important for Hopper MMA since we parallelize TIDx on an allocation
+// domain for the MmaOp output that is not in its loop domain.
+TEST_F(NVFuserTest, ParallelDimensionsInAllocation) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  auto tv0 = makeConcreteTensor({4, 8});
+  fusion.addInput(tv0);
+  auto tv1 = neg(tv0);
+  auto tv2 = exp(tv1);
+  fusion.addOutput(tv2);
+
+  IterDomain* merged_id = IterDomain::merge(tv1->axis(0), tv1->axis(1));
+  tv1->setAllocationDomain({merged_id}, true); // loop domain is not changed
+  merged_id->parallelize(ParallelType::TIDx); // TIDx only on allocation ID
+
+  GpuLower gpulw(&fusion);
+  gpulw.run();
+
+  Val* tidx_dim = gpulw.parallelDimensionMap().get(ParallelType::TIDx);
+  ASSERT_TRUE(tidx_dim != nullptr);
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser