
Commit 02b839d: Merge branch 'main' into move_repro_point

kevinstephano authored Oct 31, 2024
2 parents a7bb26c + abdc3e1

Showing 19 changed files with 410 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nvfuser-ci-trigger.yml
@@ -16,7 +16,7 @@ jobs:

# This job only runs for pull request comments
if: |
- startsWith(github.event.comment.body, '!build') &&
+ ( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) &&
(github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka')
steps:
- name: Check if comment is issued by authorized person
25 changes: 25 additions & 0 deletions .github/workflows/pull.yml
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to send CI-related helpful information to PRs
name: pull
on:
pull_request:

run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }}
jobs:
status_hello:
name: send CI hello status
runs-on: ubuntu-latest
permissions:
statuses: write
steps:
- name: Set CI hello status
run: |
curl \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
-d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
23 changes: 18 additions & 5 deletions csrc/device_lower/pass/circular_buffer.cpp
@@ -700,6 +700,17 @@ class CloneTmaCircularBufferLoopAndInsertSync
return wait_exprs;
}

// If there is already an if-then-else with electSync() predicate, use it.
// Otherwise, create a new one.
kir::IfThenElse* getElectSyncIfThenElse() {
if (elect_sync_if_then_else_ == nullptr) {
elect_sync_if_then_else_ = IrBuilder::create<kir::IfThenElse>(
IrBuilder::create<kir::Predicate>(PredicateType::ElectSync));
for_loop_stack_.back()->body().push_back(elect_sync_if_then_else_);
}
return elect_sync_if_then_else_;
}

// This function selects a single thread to launch tma load and mbarrier
// arrive_expected_tx operations. The remaining threads will simply arrive
// at the mbarrier.
@@ -719,16 +730,14 @@ class CloneTmaCircularBufferLoopAndInsertSync
NVF_ERROR(mbarrier_arrive_tx_ != nullptr);
NVF_ERROR(expr != nullptr);

- // Create the if-then-else with electSync() predicate for the arrive expect
- // transaction.
- kir::IfThenElse* if_expr = IrBuilder::create<kir::IfThenElse>(
-     IrBuilder::create<kir::Predicate>(PredicateType::ElectSync));
+ // Use the if-then-else with electSync() predicate for the arrive expect
+ // and cpAsyncBulk operations.
+ kir::IfThenElse* if_expr = getElectSyncIfThenElse();

// A single thread issues arriveExpectTx with expected transactions and
// launches the TMA load.
if_expr->thenBody().push_back(mbarrier_arrive_tx_);
if_expr->thenBody().push_back(expr);
- for_loop_stack_.back()->body().push_back(if_expr);

mbarrier_arrive_tx_ = nullptr;
}
@@ -841,6 +850,10 @@ class CloneTmaCircularBufferLoopAndInsertSync
// Mbarrier_ArriveExpectTx to add to cloned_top_level_loop
kir::MBarrierArriveExpectTx* mbarrier_arrive_tx_ = nullptr;

// ElectSync if-then-else for the cloned loop. We put all the circular buffer
// load TMA operations under this if-then-else.
kir::IfThenElse* elect_sync_if_then_else_ = nullptr;

// The circular buffered TVs for the loop being cloned
std::unordered_set<const TensorView*> circular_buffer_load_tvs_;
};
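Taken together, the three hunks above replace a per-load electSync() branch with a single lazily created if-then-else that all circular-buffer TMA loads in the cloned loop share. A minimal standalone sketch of that create-once-then-reuse pattern follows; the types are simplified stand-ins, not the real kir API:

#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins for the kir nodes (hypothetical, not the nvFuser API).
struct IfThenElse {
  std::string predicate;
  std::vector<std::string> then_body;
};

struct LoopCloner {
  std::vector<std::unique_ptr<IfThenElse>> loop_body;
  IfThenElse* elect_sync_ite = nullptr;

  // Create the electSync() branch on first use and append it to the loop
  // exactly once; later calls return the cached branch, so every TMA load
  // lands under the same single-thread guard.
  IfThenElse* getElectSyncIfThenElse() {
    if (elect_sync_ite == nullptr) {
      auto ite = std::make_unique<IfThenElse>();
      ite->predicate = "electSync()";
      elect_sync_ite = ite.get();
      loop_body.push_back(std::move(ite));
    }
    return elect_sync_ite;
  }

  // Both the arriveExpectTx and the load itself go into the shared branch.
  void addTmaLoad(const std::string& load) {
    getElectSyncIfThenElse()->then_body.push_back(load);
  }
};

// Usage: two loads now yield one predicated branch holding two statements,
// instead of two separately predicated branches:
//   LoopCloner cloner;
//   cloner.addTmaLoad("arriveExpectTx; tma_load_A");
//   cloner.addTmaLoad("arriveExpectTx; tma_load_B");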
1 change: 1 addition & 0 deletions csrc/index_compute.cpp
@@ -2238,6 +2238,7 @@ kir::TensorIndex* Index::getConsumerIndex(
DataType as_type) {
Val* index = nullptr;
if (!ir_utils::hasRootToLoopLinearTransformations(consumer) ||
ir_utils::isCpAsyncBulkLoad(consumer->definition()) ||
(isIdModelOptionEnabled(IdModelEnableOption::ConsumerIndex) &&
GpuLower::current()->isTensorIndexerEnabled())) {
index = GpuLower::current()->tensorIndexer().getLinearIndex(
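The added disjunct routes consumers written by cpAsyncBulk (TMA) loads through the TensorIndexer's linear-index path. Condensed into a plain predicate (a hypothetical flattening of the GpuLower and IdModel queries into booleans):

// Hypothetical condensation of the condition guarding the TensorIndexer
// path in Index::getConsumerIndex after this change.
bool takesLinearIndexPath(
    bool has_root_to_loop_linear_transforms,
    bool consumer_defined_by_cp_async_bulk_load,
    bool id_model_consumer_index_enabled,
    bool tensor_indexer_enabled) {
  return !has_root_to_loop_linear_transforms ||
      consumer_defined_by_cp_async_bulk_load ||
      (id_model_consumer_index_enabled && tensor_indexer_enabled);
}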
2 changes: 1 addition & 1 deletion csrc/ir/nodes.cpp
@@ -2593,7 +2593,7 @@ IterDomain* IterDomain::merge(
} else {
expanded_extent = mul(outer->expandedExtent(), inner->extent());
}
- } else if (outer->hasExpandedExtent() && inner->hasExpandedExtent()) {
+ } else if (!outer->hasExpandedExtent() && inner->hasExpandedExtent()) {
if (outer->isBroadcast()) {
expanded_extent = inner->expandedExtent();
} else {
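The old condition tested outer->hasExpandedExtent() && inner->hasExpandedExtent(), duplicating the both-expanded case; the fix makes this branch cover merges where only the inner domain carries an expanded extent. A schematic of the full case split, with extents as plain integers; the else of the corrected branch and the both-expanded branch are assumptions inferred from the surrounding code, since the diff truncates before them:

#include <cstdint>

// Schematic of the expanded-extent case split in IterDomain::merge.
int64_t mergedExpandedExtent(
    int64_t outer_extent, int64_t outer_expanded, bool outer_has_expanded,
    bool outer_is_bcast,
    int64_t inner_extent, int64_t inner_expanded, bool inner_has_expanded,
    bool inner_is_bcast) {
  if (outer_has_expanded && !inner_has_expanded) {
    // Only outer is expanded; a broadcast inner contributes no extent.
    return inner_is_bcast ? outer_expanded : outer_expanded * inner_extent;
  }
  if (!outer_has_expanded && inner_has_expanded) {  // the corrected test
    // Only inner is expanded; a broadcast outer contributes no extent.
    return outer_is_bcast ? inner_expanded : outer_extent * inner_expanded;
  }
  if (outer_has_expanded && inner_has_expanded) {
    return outer_expanded * inner_expanded;  // assumed: truncated in the diff
  }
  return -1;  // neither side expanded: no expanded extent to track
}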
58 changes: 0 additions & 58 deletions csrc/ir/utils.cpp
@@ -1224,55 +1224,6 @@ TensorViewDetails getDetailsFor(const std::vector<IterDomain*>& dims) {
return details;
}

- MmaLayout getInputLayout(
-     const TensorViewDetails& in_a,
-     const TensorViewDetails& in_b,
-     const MmaOp::AxesData& m_axes,
-     const MmaOp::AxesData& n_axes,
-     const MmaOp::AxesData& k_axes) {
-   // TT layout (b - broadcast, r - reduction):
-   // A = [M, K, b]
-   // B = [b, K, N]
-   // C = [M, r, N] (root domain)
-   if ((m_axes.front() < in_a.bcasts.front()) &&
-       (k_axes.front() < in_a.bcasts.front()) &&
-       (in_b.bcasts.front() < k_axes.front()) &&
-       (in_b.bcasts.front() < n_axes.front())) {
-     return MmaLayout::TT;
-   }
-   // TN layout (b - broadcast, r - reduction):
-   // A = [M, b, K]
-   // B = [b, N, K]
-   // C = [M, N, r] (root domain)
-   if ((m_axes.front() < in_a.bcasts.front()) &&
-       (in_a.bcasts.front() < k_axes.front()) &&
-       (in_b.bcasts.front() < n_axes.front()) &&
-       (in_b.bcasts.front() < k_axes.front())) {
-     return MmaLayout::TN;
-   }
-   // NT layout (b - broadcast, r - reduction):
-   // A = [K, M, b]
-   // B = [K, b, N]
-   // C = [r, M, N] (root domain)
-   if ((k_axes.front() < in_a.bcasts.front()) &&
-       (m_axes.front() < in_a.bcasts.front()) &&
-       (k_axes.front() < in_b.bcasts.front()) &&
-       (in_b.bcasts.front() < n_axes.front())) {
-     return MmaLayout::NT;
-   }
-   // NN layout (b - broadcast, r - reduction):
-   // A = [b, K, M]
-   // B = [N, K, b]
-   // C = [N, r, M] (root domain)
-   if ((in_a.bcasts.front() < k_axes.front()) &&
-       (k_axes.front() < m_axes.front()) && (n_axes.front() < k_axes.front()) &&
-       (k_axes.front() < in_b.bcasts.front())) {
-     return MmaLayout::NN;
-   }
-
-   NVF_THROW("Unsupported input layout");
- }

MmaOpDetails getMmaOpDetails(
TensorView* out,
TensorView* in_a,
@@ -1405,15 +1356,6 @@ MmaOpDetails getMmaOpDetails(
!details.k_axes.empty(),
"MmaOp inputs must define at least a single K dimension");

- // TODO: for tensor contraction / split-k uses of MmaOp different input layout
- // rules may be needed
- details.input_layout = getInputLayout(
-     in_a_details,
-     in_b_details,
-     details.m_axes,
-     details.n_axes,
-     details.k_axes);

return details;
}

2 changes: 0 additions & 2 deletions csrc/ir/utils.h
@@ -38,8 +38,6 @@ struct MmaOpDetails {
// Concrete or broadcast axes that are present in all inputs
// and output
AxesData batch_axes;
- // A placeholder for mma input layout
- std::optional<MmaLayout> input_layout = std::nullopt;
};

// A helper structure with pieces of information about TensorView
12 changes: 10 additions & 2 deletions csrc/python_frontend/fusion_cache.cpp
@@ -781,15 +781,23 @@ void FusionCache::deserialize(std::string filename) {
NVF_CHECK(
trie_ptr->fusion_id == fb_trie_node->fusion_id(),
"The fusion id for this TrieNode should already be set.")
- Fusion* fusion =
-     queryFusionSchedules(fb_trie_node->fusion_id())->preschedFusion();
+ FusionSchedules* fs = queryFusionSchedules(fb_trie_node->fusion_id());
+ Fusion* fusion = fs->preschedFusion();
try {
// There could be bad fusion in the serialization.
state->buildFusionIr(fusion);
} catch (const std::exception& e) {
// catch exception and setException for the terminal node
trie_ptr->setException(e.what());
}
+ // The FusionState creates a mapping from CPP Fusion to its State objects.
+ // Since the CPP Fusion is cached in FusionCache and the FusionState is
+ // temporary, the information linking CPP Fusion and Python
+ // FusionDefinition is stored in FusionCache.
+ fs->inputs_fid_ = state->inputs();
+ fs->outputs_fid_ = state->outputs();
+ fs->extents_fid_ = state->extents();
+ fs->map_value_to_fid_ = state->getValueMap();
}

// Table TrieNode => Field: children: [ulong]
8 changes: 8 additions & 0 deletions csrc/python_frontend/fusion_cache.h
@@ -107,6 +107,14 @@ struct FusionSchedules {
std::mutex scheds_lock;
//! ID of fusion in python frontend fusion cache
int64_t fusion_id_ = -1;
//! Fusion IDs of input arguments for FusionState
std::vector<int64_t> inputs_fid_;
//! IDs for Extents for TensorView input arguments for FusionState
std::vector<int64_t> extents_fid_;
//! Fusion IDs of output arguments for FusionState
std::vector<int64_t> outputs_fid_;
//! Map Fusion Val to its corresponding FusionDefinition index
std::unordered_map<const Val*, int64_t> map_value_to_fid_;
};

//! \struct TrieNode
22 changes: 22 additions & 0 deletions csrc/python_frontend/fusion_definition.cpp
@@ -108,6 +108,17 @@ void FusionDefinition::finalizeDefinition() {
throw;
}

// The FusionState creates a mapping from CPP Fusion to its State objects.
// Since the CPP Fusion is cached in FusionCache and the FusionState is
// temporary, the information linking CPP Fusion and Python
// FusionDefinition is stored in FusionCache.
FusionSchedules* fs =
fusionCache()->queryFusionSchedules(fusion_id_.value());
fs->inputs_fid_ = inputs();
fs->outputs_fid_ = outputs();
fs->extents_fid_ = extents();
fs->map_value_to_fid_ = getValueMap();

if (isDebugDumpEnabled(DebugDumpOption::FusionIrOriginal)) {
printIr();
}
@@ -121,6 +132,17 @@
// build a proper fusion earlier.
NVF_CHECK(!opt_e.has_value(), opt_e.value());
fusion_id_ = std::optional<size_t>(trie_node_->fusion_id);

// A CPP fusion already exists in the FusionCache for this FusionDefinition.
// In this case, a new CPP Fusion is not created, so the mapping from CPP
// fusion to Python FusionDefinition is not initialized. This state is
// stored within FusionSchedules and is retrieved for this FusionDefinition.
FusionSchedules* fs =
fusionCache()->queryFusionSchedules(fusion_id_.value());
inputs_fid_ = fs->inputs_fid_;
outputs_fid_ = fs->outputs_fid_;
extents_fid_ = fs->extents_fid_;
map_value_to_fid_ = fs->map_value_to_fid_;
}

NVF_ERROR(
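These two hunks and the fusion_cache.cpp deserialization hunk are halves of one mechanism: the index maps built by a temporary FusionState are stored into the cached FusionSchedules when a fusion is first built, and copied back out on a cache hit, when no new CPP fusion is created. A minimal sketch of that round trip, with Val as a stand-in for nvfuser::Val and extents omitted for brevity:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Val {};  // stand-in for nvfuser::Val

// Cached per-fusion entry; lives in FusionCache across definitions.
struct FusionSchedulesSketch {
  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;
};

// Temporary trace state; exists only while a definition is being built.
struct FusionStateSketch {
  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;

  // First build (finalizeDefinition / deserialize): persist into the cache.
  void storeInto(FusionSchedulesSketch& fs) const {
    fs.inputs_fid = inputs_fid;
    fs.outputs_fid = outputs_fid;
    fs.map_value_to_fid = map_value_to_fid;
  }

  // Cache hit: no new CPP fusion is built, so restore the mapping instead.
  void restoreFrom(const FusionSchedulesSketch& fs) {
    inputs_fid = fs.inputs_fid;
    outputs_fid = fs.outputs_fid;
    map_value_to_fid = fs.map_value_to_fid;
  }
};

The cached copy is what the second hunk reads back when trie_node_->fusion_id is already populated and buildFusionIr is skipped.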
8 changes: 4 additions & 4 deletions csrc/python_frontend/fusion_record.h
@@ -1368,7 +1368,7 @@ struct TensorRecord : RecordFunctor {
}

fd.setFusionState(outputs_.at(0).index, tv);
- fd.addInput(tv);
+ fd.addInput(tv, outputs_.at(0).index);
}

void print(std::ostream& os, bool close_function = true) const final {
@@ -1545,12 +1545,12 @@
}
tv_output->setAllocationDomain(allocation_domain, true);
}
- fd.addOutput(tv_output);
+ fd.addOutput(tv_output, args_.at(0).index);
} else {
NVF_CHECK(
stride_order_.empty(),
"stride_order can't be dictated for scalar outputs.");
- fd.addOutput(output);
+ fd.addOutput(output, args_.at(0).index);
}
}
}
@@ -2015,7 +2015,7 @@ struct ScalarRecord : RecordFunctor {
void operator()(FusionState& fd) final {
Val* output = IrBuilder::create<nvfuser::Val>(value_, dtype_);
if (!value_.hasValue()) {
- fd.addInput(output);
+ fd.addInput(output, outputs_.at(0).index);
}
fd.setFusionState(outputs_.at(0).index, output);
}
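All four record changes follow one pattern: addInput and addOutput now receive the value's FusionDefinition index alongside the Val itself, which is what populates the maps described above at record-execution time. A hypothetical reduction of that extended interface:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Val {};  // stand-in for nvfuser::Val

// Sketch of the extended FusionState interface: each registration also
// records which FusionDefinition slot the value occupies on the python side.
struct FusionStateSketch {
  void addInput(Val* v, int64_t fd_index) {
    inputs_fid.push_back(fd_index);
    map_value_to_fid[v] = fd_index;
  }
  void addOutput(Val* v, int64_t fd_index) {
    outputs_fid.push_back(fd_index);
    map_value_to_fid[v] = fd_index;
  }

  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;
};

// Mirrors TensorRecord's updated call: pass the slot the record writes to.
//   FusionStateSketch fd;
//   Val tv;
//   fd.addInput(&tv, /*fd_index=*/0);  // was fd.addInput(&tv) before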
