From f87c1375cd69def940a7ac1f27d28a4caca66fec Mon Sep 17 00:00:00 2001
From: Nicholas Sarkauskas
Date: Thu, 9 Jan 2025 10:33:49 -0800
Subject: [PATCH] linter

---
 benchmarks/cpp/transformer.cpp             | 17 ++++++++++-------
 tests/cpp/multidevice_transformer.cpp      |  2 +-
 tests/cpp/multidevice_transformer.h        |  2 +-
 tests/cpp/test_multidevice_transformer.cpp |  2 +-
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/benchmarks/cpp/transformer.cpp b/benchmarks/cpp/transformer.cpp
index 2e6ef6d9ff0..4728d6c114f 100644
--- a/benchmarks/cpp/transformer.cpp
+++ b/benchmarks/cpp/transformer.cpp
@@ -23,23 +23,23 @@
 #include
 #include
-#include
 #include
+#include
 
 using namespace nvfuser;
 
 namespace {
 // Note: We test on smaller model and input sizes to avoid high error
 // accumulation for validation.
-  constexpr int64_t B = 2, E = 768, H = 16, S = 128;
+constexpr int64_t B = 2, E = 768, H = 16, S = 128;
 // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded
 // it throws off the seed offset between the sharded nvFuser program and the
 // unsharded reference.
-  constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3;
+constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3;
 // Note parameters scaled by kParamScale following weight initialization
 // recommendations:
 // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range
-  constexpr double kParamScale = 0.02;
+constexpr double kParamScale = 0.02;
 } // namespace
 
 // Return reduction tensor view and output of reduction
@@ -49,7 +49,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) {
   const int64_t D = communicator_->size(); // number of devices
 
   auto model = std::make_unique<DistributedTransformer>(
-    D, B, E, H, S, kDropoutProb, kSdpaProb);
+      D, B, E, H, S, kDropoutProb, kSdpaProb);
 
   model->setupForward(fusion, dtype, /*sequence_parallel=*/false);
 }
@@ -96,8 +96,11 @@ void transformerFwd(
       x_,
       ln0_w_,
       ln0_b_,
-      transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}),
-      transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}),
+      transformerShardTensor_Mesh(
+          mha_w0_.view({3, E, E}), 1, mesh, communicator_)
+          .view({1, 3 * E / D, E}),
+      transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_)
+          .view({1, 3 * E / D}),
       transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0),
       mha_b1_,
       ln1_w_,
diff --git a/tests/cpp/multidevice_transformer.cpp b/tests/cpp/multidevice_transformer.cpp
index 67b3e48aedc..38917151004 100644
--- a/tests/cpp/multidevice_transformer.cpp
+++ b/tests/cpp/multidevice_transformer.cpp
@@ -395,7 +395,7 @@ std::vector DistributedTransformer::mha_backwards(
 /* NVFuser benchmark manages the unique_ptr for Fusion and FusionExecutorCache,
 so update the raw pointer with this setupForward function */
 void DistributedTransformer::setupForward(
-    Fusion *fusion,
+    Fusion* fusion,
     DataType dtype,
     bool sequence_parallel) {
   FusionGuard fg(fusion);
diff --git a/tests/cpp/multidevice_transformer.h b/tests/cpp/multidevice_transformer.h
index 43c0b06bf80..caf2a250ced 100644
--- a/tests/cpp/multidevice_transformer.h
+++ b/tests/cpp/multidevice_transformer.h
@@ -46,7 +46,7 @@ class DistributedTransformer {
         kSdpaProb(sdpa_dropout_prob) {}
 
   void setupForward(
-      Fusion *fusion,
+      Fusion* fusion,
       DataType dtype,
       bool sequence_parallel = false);
   std::unique_ptr forward(
diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp
index 8db0db5915a..1b90f1d0b65 100644
--- a/tests/cpp/test_multidevice_transformer.cpp
+++ b/tests/cpp/test_multidevice_transformer.cpp
@@ -11,8 +11,8 @@
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
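
The hunks above are pure formatting: the pointer `*` moves to bind to the type (`Fusion *fusion` becomes `Fusion* fusion`), namespace-scope `constexpr` declarations lose their stray two-space indent, include directives are reordered, and over-long call chains are re-wrapped with the trailing member call on its own continuation line. Below is a minimal self-contained C++ sketch of those same conventions; the identifier names are illustrative only (not nvFuser API), and the exact clang-format configuration behind the linter is an assumption.

#include <cstdint>

namespace {
// Namespace-scope declarations sit flush left, as in the constexpr hunk
// above (no leading indent inside the anonymous namespace).
constexpr int64_t kExampleDim = 768;
} // namespace

// The '*' binds to the type ("int64_t* value"), matching the
// "Fusion *fusion" -> "Fusion* fusion" change in setupForward.
void setupForwardDemo(int64_t* value) {
  // A call that overflows the column limit wraps after the open paren,
  // and a trailing member call drops to a continuation line, mirroring
  // the transformerShardTensor_Mesh(...).view(...) hunk.
  *value = kExampleDim;
}

int main() {
  int64_t v = 0;
  setupForwardDemo(&v);
  return static_cast<int>(v - kExampleDim); // returns 0
}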