
Commit 02b839d: Merge branch 'main' into move_repro_point

kevinstephano authored Oct 31, 2024
2 parents a7bb26c + abdc3e1

Showing 19 changed files with 410 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nvfuser-ci-trigger.yml
@@ -16,7 +16,7 @@ jobs:

# This job only runs for pull request comments
if: |
- startsWith(github.event.comment.body, '!build') &&
+ ( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) &&
(github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka')
steps:
- name: Check if comment is issued by authorized person
25 changes: 25 additions & 0 deletions .github/workflows/pull.yml
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to send CI-related helpful information to PRs
name: pull
on:
pull_request:

run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }}
jobs:
status_hello:
name: send CI hello status
runs-on: ubuntu-latest
permissions:
statuses: write
steps:
- name: Set CI hello status
run: |
curl \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
-d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
23 changes: 18 additions & 5 deletions csrc/device_lower/pass/circular_buffer.cpp
@@ -700,6 +700,17 @@ class CloneTmaCircularBufferLoopAndInsertSync
return wait_exprs;
}

// If there is already an if-then-else with electSync() predicate, use it.
// Otherwise, create a new one.
kir::IfThenElse* getElectSyncIfThenElse() {
if (elect_sync_if_then_else_ == nullptr) {
elect_sync_if_then_else_ = IrBuilder::create<kir::IfThenElse>(
IrBuilder::create<kir::Predicate>(PredicateType::ElectSync));
for_loop_stack_.back()->body().push_back(elect_sync_if_then_else_);
}
return elect_sync_if_then_else_;
}

// This function selects a single thread to launch tma load and mbarrier
// arrive_expected_tx operations. The remaining threads will simply arrive
// at the mbarrier.
@@ -719,16 +730,14 @@ class CloneTmaCircularBufferLoopAndInsertSync
NVF_ERROR(mbarrier_arrive_tx_ != nullptr);
NVF_ERROR(expr != nullptr);

- // Create the if-then-else with electSync() predicate for the arrive expect
- // transaction.
- kir::IfThenElse* if_expr = IrBuilder::create<kir::IfThenElse>(
-     IrBuilder::create<kir::Predicate>(PredicateType::ElectSync));
+ // Use the if-then-else with electSync() predicate for the arrive expect
+ // and cpAsyncBulk operations.
+ kir::IfThenElse* if_expr = getElectSyncIfThenElse();

// A single thread issues arriveExpectTx with expected transactions and
// launches the TMA load.
if_expr->thenBody().push_back(mbarrier_arrive_tx_);
if_expr->thenBody().push_back(expr);
- for_loop_stack_.back()->body().push_back(if_expr);

mbarrier_arrive_tx_ = nullptr;
}
@@ -841,6 +850,10 @@ class CloneTmaCircularBufferLoopAndInsertSync
// Mbarrier_ArriveExpectTx to add to cloned_top_level_loop
kir::MBarrierArriveExpectTx* mbarrier_arrive_tx_ = nullptr;

// ElectSync if-then-else for the cloned loop. We put all the circular buffer
// load TMA operations under this if-then-else.
kir::IfThenElse* elect_sync_if_then_else_ = nullptr;

// The circular buffered TVs for the loop being cloned
std::unordered_set<const TensorView*> circular_buffer_load_tvs_;
};
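Taken together, the three hunks above replace a per-load electSync() branch with a single lazily created if-then-else that all circular-buffer TMA loads in the cloned loop share. A minimal standalone sketch of that create-once-then-reuse pattern follows; the types are simplified stand-ins, not the real kir API:

#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins for the kir nodes (hypothetical, not the nvFuser API).
struct IfThenElse {
  std::string predicate;
  std::vector<std::string> then_body;
};

struct LoopCloner {
  std::vector<std::unique_ptr<IfThenElse>> loop_body;
  IfThenElse* elect_sync_ite = nullptr;

  // Create the electSync() branch on first use and append it to the loop
  // exactly once; later calls return the cached branch, so every TMA load
  // lands under the same single-thread guard.
  IfThenElse* getElectSyncIfThenElse() {
    if (elect_sync_ite == nullptr) {
      auto ite = std::make_unique<IfThenElse>();
      ite->predicate = "electSync()";
      elect_sync_ite = ite.get();
      loop_body.push_back(std::move(ite));
    }
    return elect_sync_ite;
  }

  // Both the arriveExpectTx and the load itself go into the shared branch.
  void addTmaLoad(const std::string& load) {
    getElectSyncIfThenElse()->then_body.push_back(load);
  }
};

// Usage: two loads now yield one predicated branch holding two statements,
// instead of two separately predicated branches:
//   LoopCloner cloner;
//   cloner.addTmaLoad("arriveExpectTx; tma_load_A");
//   cloner.addTmaLoad("arriveExpectTx; tma_load_B");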
1 change: 1 addition & 0 deletions csrc/index_compute.cpp
@@ -2238,6 +2238,7 @@ kir::TensorIndex* Index::getConsumerIndex(
DataType as_type) {
Val* index = nullptr;
if (!ir_utils::hasRootToLoopLinearTransformations(consumer) ||
ir_utils::isCpAsyncBulkLoad(consumer->definition()) ||
(isIdModelOptionEnabled(IdModelEnableOption::ConsumerIndex) &&
GpuLower::current()->isTensorIndexerEnabled())) {
index = GpuLower::current()->tensorIndexer().getLinearIndex(
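The added disjunct routes consumers written by cpAsyncBulk (TMA) loads through the TensorIndexer's linear-index path. Condensed into a plain predicate (a hypothetical flattening of the GpuLower and IdModel queries into booleans):

// Hypothetical condensation of the condition guarding the TensorIndexer
// path in Index::getConsumerIndex after this change.
bool takesLinearIndexPath(
    bool has_root_to_loop_linear_transforms,
    bool consumer_defined_by_cp_async_bulk_load,
    bool id_model_consumer_index_enabled,
    bool tensor_indexer_enabled) {
  return !has_root_to_loop_linear_transforms ||
      consumer_defined_by_cp_async_bulk_load ||
      (id_model_consumer_index_enabled && tensor_indexer_enabled);
}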
2 changes: 1 addition & 1 deletion csrc/ir/nodes.cpp
@@ -2593,7 +2593,7 @@ IterDomain* IterDomain::merge(
} else {
expanded_extent = mul(outer->expandedExtent(), inner->extent());
}
- } else if (outer->hasExpandedExtent() && inner->hasExpandedExtent()) {
+ } else if (!outer->hasExpandedExtent() && inner->hasExpandedExtent()) {
if (outer->isBroadcast()) {
expanded_extent = inner->expandedExtent();
} else {
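The old condition tested outer->hasExpandedExtent() && inner->hasExpandedExtent(), duplicating the both-expanded case; the fix makes this branch cover merges where only the inner domain carries an expanded extent. A schematic of the full case split, with extents as plain integers; the else of the corrected branch and the both-expanded branch are assumptions inferred from the surrounding code, since the diff truncates before them:

#include <cstdint>

// Schematic of the expanded-extent case split in IterDomain::merge.
int64_t mergedExpandedExtent(
    int64_t outer_extent, int64_t outer_expanded, bool outer_has_expanded,
    bool outer_is_bcast,
    int64_t inner_extent, int64_t inner_expanded, bool inner_has_expanded,
    bool inner_is_bcast) {
  if (outer_has_expanded && !inner_has_expanded) {
    // Only outer is expanded; a broadcast inner contributes no extent.
    return inner_is_bcast ? outer_expanded : outer_expanded * inner_extent;
  }
  if (!outer_has_expanded && inner_has_expanded) {  // the corrected test
    // Only inner is expanded; a broadcast outer contributes no extent.
    return outer_is_bcast ? inner_expanded : outer_extent * inner_expanded;
  }
  if (outer_has_expanded && inner_has_expanded) {
    return outer_expanded * inner_expanded;  // assumed: truncated in the diff
  }
  return -1;  // neither side expanded: no expanded extent to track
}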
58 changes: 0 additions & 58 deletions csrc/ir/utils.cpp
@@ -1224,55 +1224,6 @@ TensorViewDetails getDetailsFor(const std::vector<IterDomain*>& dims) {
return details;
}

- MmaLayout getInputLayout(
-     const TensorViewDetails& in_a,
-     const TensorViewDetails& in_b,
-     const MmaOp::AxesData& m_axes,
-     const MmaOp::AxesData& n_axes,
-     const MmaOp::AxesData& k_axes) {
-   // TT layout (b - broadcast, r - reduction):
-   // A = [M, K, b]
-   // B = [b, K, N]
-   // C = [M, r, N] (root domain)
-   if ((m_axes.front() < in_a.bcasts.front()) &&
-       (k_axes.front() < in_a.bcasts.front()) &&
-       (in_b.bcasts.front() < k_axes.front()) &&
-       (in_b.bcasts.front() < n_axes.front())) {
-     return MmaLayout::TT;
-   }
-   // TN layout (b - broadcast, r - reduction):
-   // A = [M, b, K]
-   // B = [b, N, K]
-   // C = [M, N, r] (root domain)
-   if ((m_axes.front() < in_a.bcasts.front()) &&
-       (in_a.bcasts.front() < k_axes.front()) &&
-       (in_b.bcasts.front() < n_axes.front()) &&
-       (in_b.bcasts.front() < k_axes.front())) {
-     return MmaLayout::TN;
-   }
-   // NT layout (b - broadcast, r - reduction):
-   // A = [K, M, b]
-   // B = [K, b, N]
-   // C = [r, M, N] (root domain)
-   if ((k_axes.front() < in_a.bcasts.front()) &&
-       (m_axes.front() < in_a.bcasts.front()) &&
-       (k_axes.front() < in_b.bcasts.front()) &&
-       (in_b.bcasts.front() < n_axes.front())) {
-     return MmaLayout::NT;
-   }
-   // NN layout (b - broadcast, r - reduction):
-   // A = [b, K, M]
-   // B = [N, K, b]
-   // C = [N, r, M] (root domain)
-   if ((in_a.bcasts.front() < k_axes.front()) &&
-       (k_axes.front() < m_axes.front()) && (n_axes.front() < k_axes.front()) &&
-       (k_axes.front() < in_b.bcasts.front())) {
-     return MmaLayout::NN;
-   }
-
-   NVF_THROW("Unsupported input layout");
- }

MmaOpDetails getMmaOpDetails(
TensorView* out,
TensorView* in_a,
@@ -1405,15 +1356,6 @@ MmaOpDetails getMmaOpDetails(
!details.k_axes.empty(),
"MmaOp inputs must define at least a single K dimension");

- // TODO: for tensor contraction / split-k uses of MmaOp different input layout
- // rules may be needed
- details.input_layout = getInputLayout(
-     in_a_details,
-     in_b_details,
-     details.m_axes,
-     details.n_axes,
-     details.k_axes);

return details;
}

2 changes: 0 additions & 2 deletions csrc/ir/utils.h
@@ -38,8 +38,6 @@ struct MmaOpDetails {
// Concrete or broadcast axes that are present in all inputs
// and output
AxesData batch_axes;
- // A placeholder for mma input layout
- std::optional<MmaLayout> input_layout = std::nullopt;
};

// A helper structure with pieces of information about TensorView
12 changes: 10 additions & 2 deletions csrc/python_frontend/fusion_cache.cpp
@@ -781,15 +781,23 @@ void FusionCache::deserialize(std::string filename) {
NVF_CHECK(
trie_ptr->fusion_id == fb_trie_node->fusion_id(),
"The fusion id for this TrieNode should already be set.")
- Fusion* fusion =
-     queryFusionSchedules(fb_trie_node->fusion_id())->preschedFusion();
+ FusionSchedules* fs = queryFusionSchedules(fb_trie_node->fusion_id());
+ Fusion* fusion = fs->preschedFusion();
try {
// There could be bad fusion in the serialization.
state->buildFusionIr(fusion);
} catch (const std::exception& e) {
// catch exception and setException for the terminal node
trie_ptr->setException(e.what());
}
+ // The FusionState creates a mapping from CPP Fusion to its State objects.
+ // Since the CPP Fusion is cached in FusionCache and the FusionState is
+ // temporary, the information linking CPP Fusion and Python
+ // FusionDefinition is stored in FusionCache.
+ fs->inputs_fid_ = state->inputs();
+ fs->outputs_fid_ = state->outputs();
+ fs->extents_fid_ = state->extents();
+ fs->map_value_to_fid_ = state->getValueMap();
}

// Table TrieNode => Field: children: [ulong]
8 changes: 8 additions & 0 deletions csrc/python_frontend/fusion_cache.h
@@ -107,6 +107,14 @@ struct FusionSchedules {
std::mutex scheds_lock;
//! ID of fusion in python frontend fusion cache
int64_t fusion_id_ = -1;
//! Fusion IDs of input arguments for FusionState
std::vector<int64_t> inputs_fid_;
//! IDs for Extents for TensorView input arguments for FusionState
std::vector<int64_t> extents_fid_;
//! Fusion IDs of output arguments for FusionState
std::vector<int64_t> outputs_fid_;
//! Map Fusion Val to its corresponding FusionDefinition index
std::unordered_map<const Val*, int64_t> map_value_to_fid_;
};

//! \struct TrieNode
22 changes: 22 additions & 0 deletions csrc/python_frontend/fusion_definition.cpp
@@ -108,6 +108,17 @@ void FusionDefinition::finalizeDefinition() {
throw;
}

// The FusionState creates a mapping from CPP Fusion to its State objects.
// Since the CPP Fusion is cached in FusionCache and the FusionState is
// temporary, the information linking CPP Fusion and Python
// FusionDefinition is stored in FusionCache.
FusionSchedules* fs =
fusionCache()->queryFusionSchedules(fusion_id_.value());
fs->inputs_fid_ = inputs();
fs->outputs_fid_ = outputs();
fs->extents_fid_ = extents();
fs->map_value_to_fid_ = getValueMap();

if (isDebugDumpEnabled(DebugDumpOption::FusionIrOriginal)) {
printIr();
}
@@ -121,6 +132,17 @@
// build a proper fusion earlier.
NVF_CHECK(!opt_e.has_value(), opt_e.value());
fusion_id_ = std::optional<size_t>(trie_node_->fusion_id);

// A CPP fusion already exists in the FusionCache for this FusionDefinition.
// In this case, a new CPP Fusion is not created, so the mapping from CPP
// fusion to Python FusionDefinition is not initialized. This state is
// stored within FusionSchedules and is retrieved for this FusionDefinition.
FusionSchedules* fs =
fusionCache()->queryFusionSchedules(fusion_id_.value());
inputs_fid_ = fs->inputs_fid_;
outputs_fid_ = fs->outputs_fid_;
extents_fid_ = fs->extents_fid_;
map_value_to_fid_ = fs->map_value_to_fid_;
}

NVF_ERROR(
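These two hunks and the fusion_cache.cpp deserialization hunk are halves of one mechanism: the index maps built by a temporary FusionState are stored into the cached FusionSchedules when a fusion is first built, and copied back out on a cache hit, when no new CPP fusion is created. A minimal sketch of that round trip, with Val as a stand-in for nvfuser::Val and extents omitted for brevity:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Val {};  // stand-in for nvfuser::Val

// Cached per-fusion entry; lives in FusionCache across definitions.
struct FusionSchedulesSketch {
  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;
};

// Temporary trace state; exists only while a definition is being built.
struct FusionStateSketch {
  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;

  // First build (finalizeDefinition / deserialize): persist into the cache.
  void storeInto(FusionSchedulesSketch& fs) const {
    fs.inputs_fid = inputs_fid;
    fs.outputs_fid = outputs_fid;
    fs.map_value_to_fid = map_value_to_fid;
  }

  // Cache hit: no new CPP fusion is built, so restore the mapping instead.
  void restoreFrom(const FusionSchedulesSketch& fs) {
    inputs_fid = fs.inputs_fid;
    outputs_fid = fs.outputs_fid;
    map_value_to_fid = fs.map_value_to_fid;
  }
};

The cached copy is what the second hunk reads back when trie_node_->fusion_id is already populated and buildFusionIr is skipped.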
8 changes: 4 additions & 4 deletions csrc/python_frontend/fusion_record.h
@@ -1368,7 +1368,7 @@ struct TensorRecord : RecordFunctor {
}

fd.setFusionState(outputs_.at(0).index, tv);
- fd.addInput(tv);
+ fd.addInput(tv, outputs_.at(0).index);
}

void print(std::ostream& os, bool close_function = true) const final {
@@ -1545,12 +1545,12 @@
}
tv_output->setAllocationDomain(allocation_domain, true);
}
- fd.addOutput(tv_output);
+ fd.addOutput(tv_output, args_.at(0).index);
} else {
NVF_CHECK(
stride_order_.empty(),
"stride_order can't be dictated for scalar outputs.");
- fd.addOutput(output);
+ fd.addOutput(output, args_.at(0).index);
}
}
}
@@ -2015,7 +2015,7 @@ struct ScalarRecord : RecordFunctor {
void operator()(FusionState& fd) final {
Val* output = IrBuilder::create<nvfuser::Val>(value_, dtype_);
if (!value_.hasValue()) {
- fd.addInput(output);
+ fd.addInput(output, outputs_.at(0).index);
}
fd.setFusionState(outputs_.at(0).index, output);
}
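All four record changes follow one pattern: addInput and addOutput now receive the value's FusionDefinition index alongside the Val itself, which is what populates the maps described above at record-execution time. A hypothetical reduction of that extended interface:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Val {};  // stand-in for nvfuser::Val

// Sketch of the extended FusionState interface: each registration also
// records which FusionDefinition slot the value occupies on the python side.
struct FusionStateSketch {
  void addInput(Val* v, int64_t fd_index) {
    inputs_fid.push_back(fd_index);
    map_value_to_fid[v] = fd_index;
  }
  void addOutput(Val* v, int64_t fd_index) {
    outputs_fid.push_back(fd_index);
    map_value_to_fid[v] = fd_index;
  }

  std::vector<int64_t> inputs_fid;
  std::vector<int64_t> outputs_fid;
  std::unordered_map<const Val*, int64_t> map_value_to_fid;
};

// Mirrors TensorRecord's updated call: pass the slot the record writes to.
//   FusionStateSketch fd;
//   Val tv;
//   fd.addInput(&tv, /*fd_index=*/0);  // was fd.addInput(&tv) before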
