From 6ff775aefda74705d099f8d415605502e7b82b3f Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 11:59:32 -0800 Subject: [PATCH 01/19] Add transformer benchmark skeleton based off of a bert test --- CMakeLists.txt | 1 + benchmarks/cpp/transformer.cpp | 111 +++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 benchmarks/cpp/transformer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c3076d4ba..7733fa6a5aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -755,6 +755,7 @@ if(BUILD_NVFUSER_BENCHMARK) ${NVFUSER_ROOT}/benchmarks/cpp/softmax_backward.cpp ${NVFUSER_ROOT}/benchmarks/cpp/softmax_dropout.cpp ${NVFUSER_ROOT}/benchmarks/cpp/timm.cpp + ${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp ${NVFUSER_ROOT}/benchmarks/cpp/transpose.cpp ${NVFUSER_ROOT}/benchmarks/cpp/utils.cpp ${NVFUSER_ROOT}/tests/cpp/utils.cpp diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp new file mode 100644 index 00000000000..f17bb20a3fc --- /dev/null +++ b/benchmarks/cpp/transformer.cpp @@ -0,0 +1,111 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include + +using namespace nvfuser; + +// Return reduction tensor view and output of reduction +static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); + + bool is_fp16 = dtype == DataType::Half; + + TensorView* tv0 = TensorViewBuilder() + .ndims(4) + .dtype(dtype) + .contiguity({true, std::nullopt, std::nullopt, true}) + .shape({-1, 1, 1, -1}) + .build(); + TensorView* tv1 = makeContigTensor(4, dtype); + + fusion->addInput(tv0); + fusion->addInput(tv1); + + // TODO: should be input + auto d16 = IrBuilder::create(1.0); + + if (is_fp16) { + tv0 = castOp(DataType::Float, tv0); + tv1 = castOp(DataType::Float, tv1); + } + + auto tv2 = div(tv1, d16); + auto tv3 = add(tv2, tv0); + + auto tv10 = softmax(tv3, 3); + auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); + auto tv12 = dropout_tvs.mask; + auto tv14 = dropout_tvs.output; + + if (is_fp16) { + tv14 = castOp(DataType::Half, tv14); + tv10 = castOp(DataType::Half, tv10); + tv3 = castOp(DataType::Half, tv3); + } + + fusion->addOutput(tv14); + fusion->addOutput(tv12); + fusion->addOutput(tv10); + fusion->addOutput(tv3); +} + +static void NvFuserScheduler_DivMaxSoftDropFwd( + benchmark::State& benchmark_state, + FusionExecutorCache* executor_cache, + DataType dtype) { + auto w = benchmark_state.range(0); + auto x = benchmark_state.range(1); + auto y = benchmark_state.range(2); + auto z = benchmark_state.range(3); + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({w, 1, 1, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + std::vector at_inputs = {t0, t1}; + + auto bytes = + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); + + benchmark_state.SetBytesProcessed( + bytes * int64_t(benchmark_state.iterations())); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + nick_transformer, + setupDivMaxSoftmaxDropoutForward, + NvFuserScheduler_DivMaxSoftDropFwd, + DataType::Float); + 
+NVFUSER_BENCHMARK_RUN(nick_transformer) + // ->RangeMultiplier(2) + ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); From c66336be6cf80b13b2ee5ae9667fc7ac0d48a0f8 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 13:10:45 -0800 Subject: [PATCH 02/19] savE --- benchmarks/cpp/transformer.cpp | 73 +++++++++------------- tests/cpp/test_multidevice_transformer.cpp | 1 + 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index f17bb20a3fc..4854178f15f 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -23,55 +23,42 @@ #include #include +#include using namespace nvfuser; +namespace { +// Note: We test on smaller model and input sizes to avoid high error +// accumulation for validation. +static constexpr int64_t B = 2, E = 768, H = 16, S = 128; +// Note: Dropout probabilities are set to 0. Since the dropout mask is sharded +// it throws off the seed offset between the sharded nvFuser program and the +// unsharded reference. +static constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +// Note parameters scaled by kParamScale following weight initialization +// recommendations: +// https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range +static constexpr double kParamScale = 0.02; +} // namespace + // Return reduction tensor view and output of reduction -static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { +static void setupTransformerForward(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); - bool is_fp16 = dtype == DataType::Half; - - TensorView* tv0 = TensorViewBuilder() - .ndims(4) - .dtype(dtype) - .contiguity({true, std::nullopt, std::nullopt, true}) - .shape({-1, 1, 1, -1}) - .build(); - TensorView* tv1 = makeContigTensor(4, dtype); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // TODO: should be input - auto d16 = IrBuilder::create(1.0); - - if (is_fp16) { - tv0 = castOp(DataType::Float, tv0); - tv1 = castOp(DataType::Float, tv1); - } - - auto tv2 = div(tv1, d16); - auto tv3 = add(tv2, tv0); - - auto tv10 = softmax(tv3, 3); - auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); - auto tv12 = dropout_tvs.mask; - auto tv14 = dropout_tvs.output; - - if (is_fp16) { - tv14 = castOp(DataType::Half, tv14); - tv10 = castOp(DataType::Half, tv10); - tv3 = castOp(DataType::Half, tv3); - } - - fusion->addOutput(tv14); - fusion->addOutput(tv12); - fusion->addOutput(tv10); - fusion->addOutput(tv3); + auto* communicator_ = Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + + const int64_t D = communicator_->size(); // number of devices + + NVF_ERROR((4 * E) % D == 0, "Requires number of devices ", D, "evenly divide 4*E=", 4*E); + + std::unique_ptr model = std::make_unique( + D, B, E, H, S, kDropoutProb, kSdpaProb); + + const auto mesh = DeviceMesh::createForNumDevices(D); } -static void NvFuserScheduler_DivMaxSoftDropFwd( +static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { @@ -100,8 +87,8 @@ static void NvFuserScheduler_DivMaxSoftDropFwd( NVFUSER_BENCHMARK_DEFINE( nick_transformer, - setupDivMaxSoftmaxDropoutForward, - NvFuserScheduler_DivMaxSoftDropFwd, + setupTransformerForward, + NvFuserScheduler_TransformerFwd, DataType::Float); 
NVFUSER_BENCHMARK_RUN(nick_transformer) diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index 6ccb217137f..8db0db5915a 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From 4005cc379798ceefbaa554a4ee938fc6a65ad045 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 20 Dec 2024 14:04:15 -0800 Subject: [PATCH 03/19] fusion definition --- CMakeLists.txt | 1 + benchmarks/cpp/transformer.cpp | 17 ++++++----------- tests/cpp/multidevice_transformer.cpp | 14 +++++++++++--- tests/cpp/multidevice_transformer.h | 4 ++++ 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7733fa6a5aa..47e5c019399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -759,6 +759,7 @@ if(BUILD_NVFUSER_BENCHMARK) ${NVFUSER_ROOT}/benchmarks/cpp/transpose.cpp ${NVFUSER_ROOT}/benchmarks/cpp/utils.cpp ${NVFUSER_ROOT}/tests/cpp/utils.cpp + ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp ) add_executable(nvfuser_bench ${BENCHMARK_SRCS}) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 4854178f15f..e9e3b463e53 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -43,25 +43,20 @@ static constexpr double kParamScale = 0.02; // Return reduction tensor view and output of reduction static void setupTransformerForward(Fusion* fusion, DataType dtype) { - - FusionGuard fg(fusion); - - auto* communicator_ = Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices - NVF_ERROR((4 * E) % D == 0, "Requires number of devices ", D, "evenly divide 4*E=", 4*E); - std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); - const auto mesh = DeviceMesh::createForNumDevices(D); + model->setupForward(fusion, dtype, /*sequence_parallel*/false); } static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, - DataType dtype) { + DataType dtype) { /* auto w = benchmark_state.range(0); auto x = benchmark_state.range(1); auto y = benchmark_state.range(2); @@ -80,18 +75,18 @@ static void NvFuserScheduler_TransformerFwd( runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); benchmark_state.SetBytesProcessed( - bytes * int64_t(benchmark_state.iterations())); + bytes * int64_t(benchmark_state.iterations()));*/ } //------------------------------------------------------------------------------ NVFUSER_BENCHMARK_DEFINE( - nick_transformer, + TransformerForward, setupTransformerForward, NvFuserScheduler_TransformerFwd, DataType::Float); -NVFUSER_BENCHMARK_RUN(nick_transformer) +NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) ->Unit(benchmark::kMicrosecond) diff --git a/tests/cpp/multidevice_transformer.cpp b/tests/cpp/multidevice_transformer.cpp index fe552b6606a..67b3e48aedc 100644 --- a/tests/cpp/multidevice_transformer.cpp +++ b/tests/cpp/multidevice_transformer.cpp @@ -392,11 +392,13 @@ std::vector DistributedTransformer::mha_backwards( linear0_grads.grad_x}; } -std::unique_ptr DistributedTransformer::forward( +/* NVFuser 
benchmark manages the unique_ptr for Fusion and FusionExecutorCache, + so update the raw pointer with this setupForward function */ +void DistributedTransformer::setupForward( + Fusion *fusion, DataType dtype, bool sequence_parallel) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + FusionGuard fg(fusion); const auto mesh = DeviceMesh::createForNumDevices(D); TensorView* x = sequence_parallel @@ -478,7 +480,13 @@ std::unique_ptr DistributedTransformer::forward( shardBetween({mha_in}, {mha_tvs.output}, mha_w0); shardBetween({mlp_in}, {mlp_tvs.output}, mlp_w0); } +} +std::unique_ptr DistributedTransformer::forward( + DataType dtype, + bool sequence_parallel) { + auto fusion = std::make_unique(); + setupForward(fusion.get(), dtype, sequence_parallel); return std::make_unique(std::move(fusion)); } diff --git a/tests/cpp/multidevice_transformer.h b/tests/cpp/multidevice_transformer.h index 33a3f759926..43c0b06bf80 100644 --- a/tests/cpp/multidevice_transformer.h +++ b/tests/cpp/multidevice_transformer.h @@ -45,6 +45,10 @@ class DistributedTransformer { kDropoutProb(dropout_prob), kSdpaProb(sdpa_dropout_prob) {} + void setupForward( + Fusion *fusion, + DataType dtype, + bool sequence_parallel = false); std::unique_ptr forward( DataType dtype, bool sequence_parallel = false); From 83b51bc74b3c3907a990b718f00409d8e1191118 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:21:19 -0800 Subject: [PATCH 04/19] add at_inputs, compiling --- benchmarks/cpp/transformer.cpp | 117 +++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index e9e3b463e53..65672bcfd43 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -53,6 +54,61 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel*/false); } +static std::vector reference_mlp( + at::Tensor x, + at::Tensor w0, + at::Tensor b0, + at::Tensor w1, + at::Tensor b1) { + auto at_dtype = w0.dtype(); + auto linear0 = at::linear(x, w0, b0); + auto gelu = at::gelu(linear0.to(at::kFloat), "tanh").to(at_dtype); + auto linear1 = at::linear(gelu, w1, b1).to(at::kFloat); + auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); + return {linear0, gelu, linear1, dropout, mask}; +} + +static std::vector reference_mha( + at::Tensor x, + at::Tensor w0, + at::Tensor b0, + at::Tensor w1, + at::Tensor b1) { + auto linear0 = at::linear(x, w0, b0); + auto qkv = linear0.view({B, S, 3 * E}).split(E, 2); + for (auto i = 0; i < 3; i++) { + qkv[i] = qkv[i].reshape({B, S, H, E / H}).transpose(1, 2); + } + auto sdpa_out = at::_scaled_dot_product_flash_attention( + qkv[0], qkv[1], qkv[2], kSdpaProb, true, false, kSdpaScale); + auto sdpa = std::get<0>(sdpa_out); + // Reassemble heads (B, H, S, E/H) to (B, S, H, E/H) to (B, S, E) + auto y = sdpa.transpose(1, 2).reshape({B * S, E}); + auto linear1 = at::linear(y, w1, b1).to(at::kFloat); + auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); + return {linear0, sdpa, linear1, dropout, mask}; +} + +static at::Tensor transformerShardTensor_Mesh( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator_) { + const auto device_id = communicator_->deviceId(); + return nvfuser::shardTensor(tensor, axis, mesh, device_id); +} + +static at::Tensor 
transformerShardTensor(at::Tensor tensor, TensorView* tv, Communicator* communicator_) { + if (!isSharded(tv)) { + return tensor; + } + NVF_ERROR(tv->hasDeviceMesh(), "`tv` has no DeviceMesh: ", tv); + return transformerShardTensor_Mesh( + tensor, + getShardedLogicalAxis(tv, ParallelType::DIDx), + tv->getDeviceMesh(), communicator_); +} + static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, @@ -76,6 +132,67 @@ static void NvFuserScheduler_TransformerFwd( benchmark_state.SetBytesProcessed( bytes * int64_t(benchmark_state.iterations()));*/ + + Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + const int64_t D = communicator_->size(); // number of devices + + at::ScalarType at_dtype = data_type_to_aten(dtype); + const auto mesh = DeviceMesh::createForNumDevices(D); + constexpr float kEps = 1e-5; + std::vector norm_shape{E}; + + const auto options = + at::TensorOptions().dtype(at_dtype).device(communicator_->device()); + auto x_ = at::randn({B * S, E}, options); + auto ln0_w_ = at::randn(E, options).to(at::kFloat); + auto ln0_b_ = at::randn(E, options).to(at::kFloat); + auto mha_w0_ = at::randn({3 * E, E}, options) * kParamScale; + auto mha_b0_ = at::randn({3 * E}, options) * kParamScale; + auto mha_w1_ = at::randn({E, E}, options) * kParamScale; + auto mha_b1_ = at::randn({E}, options) * kParamScale; + auto ln1_w_ = at::randn(E, options).to(at::kFloat); + auto ln1_b_ = at::randn(E, options).to(at::kFloat); + auto mlp_w0_ = at::randn({4 * E, E}, options) * kParamScale; + auto mlp_b0_ = at::randn({4 * E}, options) * kParamScale; + auto mlp_w1_ = at::randn({E, 4 * E}, options) * kParamScale; + auto mlp_b1_ = at::randn({E}, options) * kParamScale; + + at::manual_seed(getATenRandomSeed()); + auto x_float_ = x_.to(at::kFloat); + auto ln0_ = at::native_layer_norm(x_float_, norm_shape, ln0_w_, ln0_b_, kEps); + auto ln0_out_ = std::get<0>(ln0_); + + auto mha_out_ = reference_mha( + ln0_out_.to(at_dtype), mha_w0_, mha_b0_, mha_w1_, mha_b1_)[3]; + + auto resid0_ = mha_out_ + x_float_; + auto ln1_ = at::native_layer_norm(resid0_, norm_shape, ln1_w_, ln1_b_, kEps); + auto ln1_out_ = std::get<0>(ln1_); + + auto mlp_out_ = reference_mlp( + ln1_out_.to(at_dtype), mlp_w0_, mlp_b0_, mlp_w1_, mlp_b1_)[3]; + auto at_out = (resid0_ + mlp_out_).to(at_dtype); + + std::vector at_inputs = { + x_, + ln0_w_, + ln0_b_, + transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}), + transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}), + transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), + mha_b1_, + ln1_w_, + ln1_b_, + transformerShardTensor_Mesh(mlp_w0_, 0, mesh, communicator_).unsqueeze(0), + transformerShardTensor_Mesh(mlp_b0_, 0, mesh, communicator_).unsqueeze(0), + transformerShardTensor_Mesh(mlp_w1_, 1, mesh, communicator_).unsqueeze(0), + mlp_b1_}; + + auto bytes = + runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); + + benchmark_state.SetBytesProcessed( + bytes * int64_t(benchmark_state.iterations())); } //------------------------------------------------------------------------------ From 528caa024dfb798f8136b8327726658b018e56b0 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:30:07 -0800 Subject: [PATCH 05/19] forward working --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 65672bcfd43..9eb2843d2f5 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -201,7 +201,7 @@ NVFUSER_BENCHMARK_DEFINE( TransformerForward, setupTransformerForward, NvFuserScheduler_TransformerFwd, - DataType::Float); + DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) From aad41ebd4cffcd897fe4b72c7089d33f444a3901 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Mon, 23 Dec 2024 15:44:52 -0800 Subject: [PATCH 06/19] remove unused code --- benchmarks/cpp/transformer.cpp | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 9eb2843d2f5..d74c07fcd80 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -112,27 +112,7 @@ static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Comm static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, - DataType dtype) { /* - auto w = benchmark_state.range(0); - auto x = benchmark_state.range(1); - auto y = benchmark_state.range(2); - auto z = benchmark_state.range(3); - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({w, 1, 1, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - std::vector at_inputs = {t0, t1}; - - auto bytes = - runBenchmarkIterations(benchmark_state, executor_cache, at_inputs); - - benchmark_state.SetBytesProcessed( - bytes * int64_t(benchmark_state.iterations()));*/ - + DataType dtype) { Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices From 8b46e4d3358b903c88227d65ed3871139cf1220f Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Wed, 8 Jan 2025 14:05:57 -0800 Subject: [PATCH 07/19] add debug prints and comm cleanup --- benchmarks/cpp/main.cpp | 8 ++++++++ benchmarks/cpp/transformer.cpp | 4 ++++ csrc/multidevice/communicator.cpp | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/benchmarks/cpp/main.cpp b/benchmarks/cpp/main.cpp index 30a159bc661..69cbfad271c 100644 --- a/benchmarks/cpp/main.cpp +++ b/benchmarks/cpp/main.cpp @@ -69,6 +69,7 @@ void addGPUBenchmarkContext() { // Copied from BENCHMARK_MAIN with extra custom settings int main(int argc, char** argv) { + Communicator* communicator_ = &Communicator::getInstance(); ::benchmark::Initialize(&argc, argv); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) { return 1; @@ -90,6 +91,13 @@ int main(int argc, char** argv) { ::benchmark::RunSpecifiedBenchmarks(); + printf("calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); + Communicator::getInstance().cleanup(); + printf("done calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); + ::benchmark::Shutdown(); + + + return 0; } diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index d74c07fcd80..66384fb7e0c 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -116,6 +116,10 @@ static void NvFuserScheduler_TransformerFwd( Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call 
Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices + printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); + communicator_->barrier(); + printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); constexpr float kEps = 1e-5; diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 6cf1a499bb9..f4e4eb9f136 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -193,6 +193,8 @@ Communicator::Communicator( is_available_ = parseEnv( rank_, size_, local_rank_, local_size_, master_addr_, master_port_); + printf("rank=%ld, size=%ld, local_rank_=%ld, local_size_=%ld\n", rank_, size_, local_rank_, local_size_); + if (!is_available_) { return; } @@ -232,11 +234,15 @@ void Communicator::cleanup() { "likely because Communicator::cleanup was called more than once"); cleaned_up = true; + printf("entered cleanup on rank %ld\n", rank_); + // Without this, the TCPStore server can be cleaned up before TCPStore // clients are created, causing an hang. This happened with // test_multidevice.py::test_sizes_and_ranks. if (is_available()) { + printf("calling barrier on rank %ld\n", rank_); barrier(); + printf("done calling barrier on rank %ld\n", rank_); } store_ = nullptr; @@ -251,7 +257,9 @@ void Communicator::cleanup() { // Call shutdown before destructing a ProcessGroupNCCL as instructed by // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. if (auto* pg_nccl = dynamic_cast(backend.get())) { + printf("pg shutdown on rank %ld\n", rank_); pg_nccl->shutdown(); + printf("done calling pg shutdown on rank %ld\n", rank_); } } #endif From 569c24c0dad7107bc13a6cf4702c62884f872e6e Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 08:39:46 -0500 Subject: [PATCH 08/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 66384fb7e0c..855215513dd 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -51,7 +51,7 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); - model->setupForward(fusion, dtype, /*sequence_parallel*/false); + model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } static std::vector reference_mlp( From 403367eed5fdc712dd14d83741d0fbae27efeabc Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:02:53 -0800 Subject: [PATCH 09/19] working with multiple ranks, disabled cupti profiling and set iters to 1 --- benchmarks/cpp/transformer.cpp | 64 +++++----------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 855215513dd..6c512968063 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,7 +31,7 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. 
-static constexpr int64_t B = 2, E = 768, H = 16, S = 128; +static constexpr int64_t B = 2, E = 32/*768*/, H = 2/*16*/, S = 32/*128*/; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. @@ -48,47 +48,14 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices + ProfilerOptionsGuard::getCurOptions().set(ProfilerOption::EnableNocupti); + std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -static std::vector reference_mlp( - at::Tensor x, - at::Tensor w0, - at::Tensor b0, - at::Tensor w1, - at::Tensor b1) { - auto at_dtype = w0.dtype(); - auto linear0 = at::linear(x, w0, b0); - auto gelu = at::gelu(linear0.to(at::kFloat), "tanh").to(at_dtype); - auto linear1 = at::linear(gelu, w1, b1).to(at::kFloat); - auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); - return {linear0, gelu, linear1, dropout, mask}; -} - -static std::vector reference_mha( - at::Tensor x, - at::Tensor w0, - at::Tensor b0, - at::Tensor w1, - at::Tensor b1) { - auto linear0 = at::linear(x, w0, b0); - auto qkv = linear0.view({B, S, 3 * E}).split(E, 2); - for (auto i = 0; i < 3; i++) { - qkv[i] = qkv[i].reshape({B, S, H, E / H}).transpose(1, 2); - } - auto sdpa_out = at::_scaled_dot_product_flash_attention( - qkv[0], qkv[1], qkv[2], kSdpaProb, true, false, kSdpaScale); - auto sdpa = std::get<0>(sdpa_out); - // Reassemble heads (B, H, S, E/H) to (B, S, H, E/H) to (B, S, E) - auto y = sdpa.transpose(1, 2).reshape({B * S, E}); - auto linear1 = at::linear(y, w1, b1).to(at::kFloat); - auto [dropout, mask] = at::native_dropout(linear1, kDropoutProb, true); - return {linear0, sdpa, linear1, dropout, mask}; -} - static at::Tensor transformerShardTensor_Mesh( at::Tensor tensor, const int64_t axis, @@ -116,13 +83,13 @@ static void NvFuserScheduler_TransformerFwd( Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit const int64_t D = communicator_->size(); // number of devices - printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); - communicator_->barrier(); - printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + // printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); + // communicator_->barrier(); + // printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); + printf("did=%ld in fwd\n", communicator_->deviceId());fflush(0); at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); - constexpr float kEps = 1e-5; std::vector norm_shape{E}; const auto options = @@ -142,20 +109,6 @@ static void NvFuserScheduler_TransformerFwd( auto mlp_b1_ = at::randn({E}, options) * kParamScale; at::manual_seed(getATenRandomSeed()); - auto x_float_ = x_.to(at::kFloat); - auto ln0_ = at::native_layer_norm(x_float_, norm_shape, ln0_w_, ln0_b_, kEps); - auto ln0_out_ = std::get<0>(ln0_); - - auto mha_out_ = reference_mha( - ln0_out_.to(at_dtype), mha_w0_, mha_b0_, mha_w1_, mha_b1_)[3]; - - auto resid0_ = mha_out_ + x_float_; - auto ln1_ = at::native_layer_norm(resid0_, norm_shape, ln1_w_, ln1_b_, kEps); - auto ln1_out_ = std::get<0>(ln1_); - - auto mlp_out_ = reference_mlp( - ln1_out_.to(at_dtype), 
mlp_w0_, mlp_b0_, mlp_w1_, mlp_b1_)[3]; - auto at_out = (resid0_ + mlp_out_).to(at_dtype); std::vector at_inputs = { x_, @@ -189,6 +142,7 @@ NVFUSER_BENCHMARK_DEFINE( NVFUSER_BENCHMARK_RUN(TransformerForward) // ->RangeMultiplier(2) - ->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}}) + ->Ranges({{8, 8}}) + ->Iterations(1) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); From 68cf0dbebb0499de50b9978b615e23cf9a3a911a Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:29:42 -0800 Subject: [PATCH 10/19] remove --- benchmarks/cpp/transformer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 6c512968063..7b7f77de852 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -48,8 +48,6 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices - ProfilerOptionsGuard::getCurOptions().set(ProfilerOption::EnableNocupti); - std::unique_ptr model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); From dadeecc4bbc504aae67348b011d2707050d2da41 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 09:39:40 -0800 Subject: [PATCH 11/19] remove debug prints --- benchmarks/cpp/main.cpp | 5 ----- benchmarks/cpp/transformer.cpp | 13 +++---------- csrc/multidevice/communicator.cpp | 8 -------- 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/benchmarks/cpp/main.cpp b/benchmarks/cpp/main.cpp index 69cbfad271c..3d78e6cac67 100644 --- a/benchmarks/cpp/main.cpp +++ b/benchmarks/cpp/main.cpp @@ -69,7 +69,6 @@ void addGPUBenchmarkContext() { // Copied from BENCHMARK_MAIN with extra custom settings int main(int argc, char** argv) { - Communicator* communicator_ = &Communicator::getInstance(); ::benchmark::Initialize(&argc, argv); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) { return 1; @@ -91,13 +90,9 @@ int main(int argc, char** argv) { ::benchmark::RunSpecifiedBenchmarks(); - printf("calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); Communicator::getInstance().cleanup(); - printf("done calling comm cleanup, size=%ld, did=%ld\n", communicator_->size(), communicator_->deviceId()); ::benchmark::Shutdown(); - - return 0; } diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 7b7f77de852..b223f265099 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,7 +31,7 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. -static constexpr int64_t B = 2, E = 32/*768*/, H = 2/*16*/, S = 32/*128*/; +static constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. 
@@ -78,14 +78,9 @@ static void NvFuserScheduler_TransformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit + Communicator* communicator_ = &Communicator::getInstance(); const int64_t D = communicator_->size(); // number of devices - // printf("did=%ld in fwd before barrier\n", communicator_->deviceId());fflush(0); - // communicator_->barrier(); - // printf("did=%ld in fwd after barrier\n", communicator_->deviceId());fflush(0); - printf("did=%ld in fwd\n", communicator_->deviceId());fflush(0); - at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); std::vector norm_shape{E}; @@ -139,8 +134,6 @@ NVFUSER_BENCHMARK_DEFINE( DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) - // ->RangeMultiplier(2) - ->Ranges({{8, 8}}) - ->Iterations(1) + ->Iterations(10) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index f4e4eb9f136..6cf1a499bb9 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -193,8 +193,6 @@ Communicator::Communicator( is_available_ = parseEnv( rank_, size_, local_rank_, local_size_, master_addr_, master_port_); - printf("rank=%ld, size=%ld, local_rank_=%ld, local_size_=%ld\n", rank_, size_, local_rank_, local_size_); - if (!is_available_) { return; } @@ -234,15 +232,11 @@ void Communicator::cleanup() { "likely because Communicator::cleanup was called more than once"); cleaned_up = true; - printf("entered cleanup on rank %ld\n", rank_); - // Without this, the TCPStore server can be cleaned up before TCPStore // clients are created, causing an hang. This happened with // test_multidevice.py::test_sizes_and_ranks. if (is_available()) { - printf("calling barrier on rank %ld\n", rank_); barrier(); - printf("done calling barrier on rank %ld\n", rank_); } store_ = nullptr; @@ -257,9 +251,7 @@ void Communicator::cleanup() { // Call shutdown before destructing a ProcessGroupNCCL as instructed by // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. 
if (auto* pg_nccl = dynamic_cast(backend.get())) { - printf("pg shutdown on rank %ld\n", rank_); pg_nccl->shutdown(); - printf("done calling pg shutdown on rank %ld\n", rank_); } } #endif From 86e55691b47383f3d40c43900516a5ad5e4c1f60 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 13:11:36 -0500 Subject: [PATCH 12/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index b223f265099..cf2516804bc 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -48,7 +48,7 @@ static void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices - std::unique_ptr model = std::make_unique( + auto model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); From 8604cce7c9a826eac4dd8cd7df674f404fc954f3 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 9 Jan 2025 13:12:10 -0500 Subject: [PATCH 13/19] Update benchmarks/cpp/transformer.cpp Co-authored-by: Jingyue Wu --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index cf2516804bc..3926e1e5429 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -74,7 +74,7 @@ static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Comm tv->getDeviceMesh(), communicator_); } -static void NvFuserScheduler_TransformerFwd( +static void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { From a3c228c868ed1690f4fd78e81c1adeb81df8d855 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:16:11 -0800 Subject: [PATCH 14/19] review feedback --- benchmarks/cpp/transformer.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 3926e1e5429..330f6eeb566 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -31,20 +31,20 @@ using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. -static constexpr int64_t B = 2, E = 768, H = 16, S = 128; + constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. 
-static constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; + constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range -static constexpr double kParamScale = 0.02; + constexpr double kParamScale = 0.02; } // namespace // Return reduction tensor view and output of reduction -static void setupTransformerForward(Fusion* fusion, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); // nick TODO call Communicator::getInstance().cleanup() somewhere before program exit +void setupTransformerForward(Fusion* fusion, DataType dtype) { + Communicator* communicator_ = &Communicator::getInstance(); const int64_t D = communicator_->size(); // number of devices @@ -63,17 +63,6 @@ static at::Tensor transformerShardTensor_Mesh( return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -static at::Tensor transformerShardTensor(at::Tensor tensor, TensorView* tv, Communicator* communicator_) { - if (!isSharded(tv)) { - return tensor; - } - NVF_ERROR(tv->hasDeviceMesh(), "`tv` has no DeviceMesh: ", tv); - return transformerShardTensor_Mesh( - tensor, - getShardedLogicalAxis(tv, ParallelType::DIDx), - tv->getDeviceMesh(), communicator_); -} - static void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, From b0c65bc8dc8af0e60742c404237bf37e94ac248c Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:19:04 -0800 Subject: [PATCH 15/19] update --- benchmarks/cpp/transformer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 330f6eeb566..2e6ef6d9ff0 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -54,7 +54,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -static at::Tensor transformerShardTensor_Mesh( +at::Tensor transformerShardTensor_Mesh( at::Tensor tensor, const int64_t axis, const DeviceMesh& mesh, @@ -63,7 +63,7 @@ static at::Tensor transformerShardTensor_Mesh( return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -static void transformerFwd( +void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { @@ -119,7 +119,7 @@ static void transformerFwd( NVFUSER_BENCHMARK_DEFINE( TransformerForward, setupTransformerForward, - NvFuserScheduler_TransformerFwd, + transformerFwd, DataType::BFloat16); NVFUSER_BENCHMARK_RUN(TransformerForward) From d34a9023fd181c88a01ca4a32aa0711804ad1e74 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 10:33:49 -0800 Subject: [PATCH 16/19] linter --- benchmarks/cpp/transformer.cpp | 17 ++++++++++------- tests/cpp/multidevice_transformer.cpp | 2 +- tests/cpp/multidevice_transformer.h | 2 +- tests/cpp/test_multidevice_transformer.cpp | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 2e6ef6d9ff0..4728d6c114f 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -23,23 +23,23 @@ #include #include -#include #include +#include using namespace nvfuser; namespace { // Note: We test on smaller model and input sizes to avoid high error // accumulation for 
validation. - constexpr int64_t B = 2, E = 768, H = 16, S = 128; +constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. - constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range - constexpr double kParamScale = 0.02; +constexpr double kParamScale = 0.02; } // namespace // Return reduction tensor view and output of reduction @@ -49,7 +49,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { const int64_t D = communicator_->size(); // number of devices auto model = std::make_unique( - D, B, E, H, S, kDropoutProb, kSdpaProb); + D, B, E, H, S, kDropoutProb, kSdpaProb); model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } @@ -96,8 +96,11 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}), - transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}), + transformerShardTensor_Mesh( + mha_w0_.view({3, E, E}), 1, mesh, communicator_) + .view({1, 3 * E / D, E}), + transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_) + .view({1, 3 * E / D}), transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), mha_b1_, ln1_w_, diff --git a/tests/cpp/multidevice_transformer.cpp b/tests/cpp/multidevice_transformer.cpp index 67b3e48aedc..38917151004 100644 --- a/tests/cpp/multidevice_transformer.cpp +++ b/tests/cpp/multidevice_transformer.cpp @@ -395,7 +395,7 @@ std::vector DistributedTransformer::mha_backwards( /* NVFuser benchmark manages the unique_ptr for Fusion and FusionExecutorCache, so update the raw pointer with this setupForward function */ void DistributedTransformer::setupForward( - Fusion *fusion, + Fusion* fusion, DataType dtype, bool sequence_parallel) { FusionGuard fg(fusion); diff --git a/tests/cpp/multidevice_transformer.h b/tests/cpp/multidevice_transformer.h index 43c0b06bf80..caf2a250ced 100644 --- a/tests/cpp/multidevice_transformer.h +++ b/tests/cpp/multidevice_transformer.h @@ -46,7 +46,7 @@ class DistributedTransformer { kSdpaProb(sdpa_dropout_prob) {} void setupForward( - Fusion *fusion, + Fusion* fusion, DataType dtype, bool sequence_parallel = false); std::unique_ptr forward( diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index 8db0db5915a..1b90f1d0b65 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include #include #include From 437fdd1c2f376ed7f9e92d0b493a662aba63a152 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 12:15:52 -0800 Subject: [PATCH 17/19] review feedback --- benchmarks/cpp/transformer.cpp | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 4728d6c114f..fab2bd4cdcd 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -28,7 +28,6 @@ using namespace nvfuser; -namespace { // 
Note: We test on smaller model and input sizes to avoid high error // accumulation for validation. constexpr int64_t B = 2, E = 768, H = 16, S = 128; @@ -40,13 +39,12 @@ constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range constexpr double kParamScale = 0.02; -} // namespace // Return reduction tensor view and output of reduction void setupTransformerForward(Fusion* fusion, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); + Communicator* communicator = &Communicator::getInstance(); - const int64_t D = communicator_->size(); // number of devices + const int64_t D = communicator->size(); // number of devices auto model = std::make_unique( D, B, E, H, S, kDropoutProb, kSdpaProb); @@ -54,28 +52,30 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { model->setupForward(fusion, dtype, /*sequence_parallel=*/false); } -at::Tensor transformerShardTensor_Mesh( - at::Tensor tensor, - const int64_t axis, - const DeviceMesh& mesh, - Communicator* communicator_) { - const auto device_id = communicator_->deviceId(); +namespace { + at::Tensor shardTensor( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator) { + const auto device_id = communicator->deviceId(); return nvfuser::shardTensor(tensor, axis, mesh, device_id); } +} void transformerFwd( benchmark::State& benchmark_state, FusionExecutorCache* executor_cache, DataType dtype) { - Communicator* communicator_ = &Communicator::getInstance(); - const int64_t D = communicator_->size(); // number of devices + Communicator* communicator = &Communicator::getInstance(); + const int64_t D = communicator->size(); // number of devices at::ScalarType at_dtype = data_type_to_aten(dtype); const auto mesh = DeviceMesh::createForNumDevices(D); std::vector norm_shape{E}; const auto options = - at::TensorOptions().dtype(at_dtype).device(communicator_->device()); + at::TensorOptions().dtype(at_dtype).device(communicator->device()); auto x_ = at::randn({B * S, E}, options); auto ln0_w_ = at::randn(E, options).to(at::kFloat); auto ln0_b_ = at::randn(E, options).to(at::kFloat); @@ -96,18 +96,18 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - transformerShardTensor_Mesh( - mha_w0_.view({3, E, E}), 1, mesh, communicator_) + shardTensor( + mha_w0_.view({3, E, E}), 1, mesh, communicator) .view({1, 3 * E / D, E}), - transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_) + shardTensor(mha_b0_.view({3, E}), 1, mesh, communicator) .view({1, 3 * E / D}), - transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0), + shardTensor(mha_w1_, 1, mesh, communicator).unsqueeze(0), mha_b1_, ln1_w_, ln1_b_, - transformerShardTensor_Mesh(mlp_w0_, 0, mesh, communicator_).unsqueeze(0), - transformerShardTensor_Mesh(mlp_b0_, 0, mesh, communicator_).unsqueeze(0), - transformerShardTensor_Mesh(mlp_w1_, 1, mesh, communicator_).unsqueeze(0), + shardTensor(mlp_w0_, 0, mesh, communicator).unsqueeze(0), + shardTensor(mlp_b0_, 0, mesh, communicator).unsqueeze(0), + shardTensor(mlp_w1_, 1, mesh, communicator).unsqueeze(0), mlp_b1_}; auto bytes = From 9742a9015a4ed604521438183a6136316898b3ee Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Thu, 9 Jan 2025 12:20:18 -0800 Subject: [PATCH 18/19] lint --- benchmarks/cpp/transformer.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git 
a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index fab2bd4cdcd..66bb1759b13 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -53,15 +53,15 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) { } namespace { - at::Tensor shardTensor( - at::Tensor tensor, - const int64_t axis, - const DeviceMesh& mesh, - Communicator* communicator) { +at::Tensor shardTensor( + at::Tensor tensor, + const int64_t axis, + const DeviceMesh& mesh, + Communicator* communicator) { const auto device_id = communicator->deviceId(); return nvfuser::shardTensor(tensor, axis, mesh, device_id); } -} +} // namespace void transformerFwd( benchmark::State& benchmark_state, @@ -96,8 +96,7 @@ void transformerFwd( x_, ln0_w_, ln0_b_, - shardTensor( - mha_w0_.view({3, E, E}), 1, mesh, communicator) + shardTensor(mha_w0_.view({3, E, E}), 1, mesh, communicator) .view({1, 3 * E / D, E}), shardTensor(mha_b0_.view({3, E}), 1, mesh, communicator) .view({1, 3 * E / D}), From 450596e27bbb9640b4f392c3f79ffa000afb98c5 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Fri, 10 Jan 2025 10:15:20 -0500 Subject: [PATCH 19/19] remove unused variable --- benchmarks/cpp/transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp index 66bb1759b13..e21e81be987 100644 --- a/benchmarks/cpp/transformer.cpp +++ b/benchmarks/cpp/transformer.cpp @@ -34,7 +34,7 @@ constexpr int64_t B = 2, E = 768, H = 16, S = 128; // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded // it throws off the seed offset between the sharded nvFuser program and the // unsharded reference. -constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3; +constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0; // Note parameters scaled by kParamScale following weight initialization // recommendations: // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range
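
Note on running the new benchmark (not part of the patches above): a minimal sketch of a launch command, assuming the usual multi-GPU setup in which nvFuser's Communicator reads its rank/size from the launcher's environment; the launcher, process count, and binary path are assumptions rather than something stated in this series:

    mpirun -np 8 ./nvfuser_bench --benchmark_filter=TransformerForward

--benchmark_filter is the standard Google Benchmark selection flag, and TransformerForward is the name registered by NVFUSER_BENCHMARK_RUN in benchmarks/cpp/transformer.cpp; each rank runs the same binary, shards its own inputs with the shardTensor helper, and then times the fusion via runBenchmarkIterations.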