
Commit

linter
Nicholas Sarkauskas authored and nsarka committed Jan 9, 2025
1 parent 172e873 commit f87c137
Showing 4 changed files with 13 additions and 10 deletions.
17 changes: 10 additions & 7 deletions benchmarks/cpp/transformer.cpp
@@ -23,23 +23,23 @@
 
 #include <benchmarks/cpp/utils.h>
 #include <csrc/multidevice/utils.h>
-#include <tests/cpp/utils.h>
 #include <tests/cpp/multidevice_transformer.h>
+#include <tests/cpp/utils.h>
 
 using namespace nvfuser;
 
 namespace {
 // Note: We test on smaller model and input sizes to avoid high error
 // accumulation for validation.
-constexpr int64_t B = 2, E = 768, H = 16, S = 128;
+constexpr int64_t B = 2, E = 768, H = 16, S = 128;
 // Note: Dropout probabilities are set to 0. Since the dropout mask is sharded
 // it throws off the seed offset between the sharded nvFuser program and the
 // unsharded reference.
-constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3;
+constexpr double kDropoutProb = 0.0, kSdpaProb = 0.0, kSdpaScale = 1e-3;
 // Note parameters scaled by kParamScale following weight initialization
 // recommendations:
 // https://huggingface.co/docs/transformers/en/model_doc/gpt2#transformers.GPT2Config.initializer_range
-constexpr double kParamScale = 0.02;
+constexpr double kParamScale = 0.02;
 } // namespace
 
 // Return reduction tensor view and output of reduction
@@ -49,7 +49,7 @@ void setupTransformerForward(Fusion* fusion, DataType dtype) {
   const int64_t D = communicator_->size(); // number of devices
 
   auto model = std::make_unique<DistributedTransformer>(
-      D, B, E, H, S, kDropoutProb, kSdpaProb);
+      D, B, E, H, S, kDropoutProb, kSdpaProb);
 
   model->setupForward(fusion, dtype, /*sequence_parallel=*/false);
 }
@@ -96,8 +96,11 @@ void transformerFwd(
       x_,
       ln0_w_,
       ln0_b_,
-      transformerShardTensor_Mesh(mha_w0_.view({3, E, E}), 1, mesh, communicator_).view({1, 3 * E / D, E}),
-      transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_).view({1, 3 * E / D}),
+      transformerShardTensor_Mesh(
+          mha_w0_.view({3, E, E}), 1, mesh, communicator_)
+          .view({1, 3 * E / D, E}),
+      transformerShardTensor_Mesh(mha_b0_.view({3, E}), 1, mesh, communicator_)
+          .view({1, 3 * E / D}),
       transformerShardTensor_Mesh(mha_w1_, 1, mesh, communicator_).unsqueeze(0),
       mha_b1_,
       ln1_w_,
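The two reformatted arguments above shard the fused QKV weight and bias over the device mesh and then reshape the local slice for the fusion input. Below is a minimal standalone sketch of the resulting shape arithmetic; the device count D = 4 is assumed purely for illustration, the snippet assumes transformerShardTensor_Mesh yields the local {3, E / D, E} slice, and it does not call nvFuser itself.

#include <cstdint>
#include <iostream>

int main() {
  constexpr int64_t E = 768; // embedding size, matching the benchmark constants
  constexpr int64_t D = 4;   // device count, assumed here for illustration
  // mha_w0_ is viewed as {3, E, E} and sharded on axis 1, leaving a local
  // {3, E / D, E} slice that is then flattened to {1, 3 * E / D, E}.
  std::cout << "per-device QKV weight: {1, " << 3 * E / D << ", " << E << "}\n";
  // The bias follows the same pattern: {3, E} -> {3, E / D} -> {1, 3 * E / D}.
  std::cout << "per-device QKV bias:   {1, " << 3 * E / D << "}\n";
  return 0;
}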
2 changes: 1 addition & 1 deletion tests/cpp/multidevice_transformer.cpp
@@ -395,7 +395,7 @@ std::vector<TensorView*> DistributedTransformer::mha_backwards(
 /* NVFuser benchmark manages the unique_ptr for Fusion and FusionExecutorCache,
 so update the raw pointer with this setupForward function */
 void DistributedTransformer::setupForward(
-    Fusion *fusion,
+    Fusion* fusion,
     DataType dtype,
     bool sequence_parallel) {
   FusionGuard fg(fusion);
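The block comment in this hunk notes that the NVFuser benchmark keeps the Fusion and FusionExecutorCache in unique_ptrs and only passes the raw pointer into setupForward. A minimal sketch of that calling pattern follows; the wrapper function, the dtype choice, and the main entry point are assumptions for illustration, while the constructor arguments mirror the benchmark values above.

#include <memory>

#include <fusion.h>
#include <tests/cpp/multidevice_transformer.h>

using namespace nvfuser;

// Hypothetical helper: builds the forward graph into a Fusion owned by the caller.
void buildTransformerForward(Fusion* fusion, int64_t num_devices) {
  auto model = std::make_unique<DistributedTransformer>(
      num_devices, /*B=*/2, /*E=*/768, /*H=*/16, /*S=*/128,
      /*dropout_prob=*/0.0, /*sdpa_dropout_prob=*/0.0);
  // setupForward mutates the Fusion in place; ownership stays with the caller.
  model->setupForward(fusion, DataType::BFloat16, /*sequence_parallel=*/false);
}

int main() {
  // A benchmark harness would own this unique_ptr; only the raw pointer is
  // handed to the model, as the comment in the diff describes.
  auto fusion = std::make_unique<Fusion>();
  buildTransformerForward(fusion.get(), /*num_devices=*/1);
  return 0;
}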
2 changes: 1 addition & 1 deletion tests/cpp/multidevice_transformer.h
@@ -46,7 +46,7 @@ class DistributedTransformer {
       kSdpaProb(sdpa_dropout_prob) {}
 
   void setupForward(
-      Fusion *fusion,
+      Fusion* fusion,
       DataType dtype,
       bool sequence_parallel = false);
   std::unique_ptr<FusionExecutorCache> forward(
2 changes: 1 addition & 1 deletion tests/cpp/test_multidevice_transformer.cpp
@@ -11,8 +11,8 @@
 #include <gtest/gtest.h>
 
 #include <fusion.h>
-#include <ops/all_ops.h>
 #include <multidevice/communicator.h>
+#include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
 #include <tests/cpp/multidevice_transformer.h>
 #include <tests/cpp/validator.h>
