Skip to content

Commit

Permalink
Enable register sharing only for hopper (#3696)
Browse files Browse the repository at this point in the history
Register sharing is only supported on Hopper. This PR updates the fusion checks and tests to enforce this constraint.
  • Loading branch information
rdspring1 authored Jan 12, 2025
1 parent 05ec62b commit 28ae834
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 5 deletions.
9 changes: 6 additions & 3 deletions csrc/device_lower/analysis/device_version.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,14 @@ void MinimumDeviceVersion::handle(LoadStoreOp* ls_op) {
}

void MinimumDeviceVersion::handle(TensorView* tv) {
if (std::holds_alternative<WarpSpecialized>(
tv->circularBufferOptions().type)) {
bool enable_register_sharing = std::holds_alternative<WarpSpecialized>(
tv->circularBufferOptions().type) &&
std::get<WarpSpecialized>(tv->circularBufferOptions().type)
.num_registers.has_value();
if (enable_register_sharing) {
ensureVersion(
{9, 0},
"Warp Specialized Circular Buffering uses the setmaxnreg ptx instruction, which requires Hopper (9.0) or newer");
"Warp Specialized Circular Buffering uses the setmaxnreg ptx instruction, which requires Hopper (9.0)");
}
}

Expand Down
55 changes: 53 additions & 2 deletions tests/cpp/test_circular_buffering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
namespace nvfuser {

TEST_F(NVFuserTest, RegisterSharingCircularBufferingPointwiseCustom) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0);
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -87,7 +87,7 @@ TEST_F(NVFuserTest, RegisterSharingCircularBufferingPointwiseCustom) {
}

TEST_F(NVFuserTest, RegisterSharingCircularBufferingPointwiseNested) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0);
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -1000,6 +1000,12 @@ class TmaCircularBufferingTest
NVFuserTest::SetUp();
}

bool testEnablesRegisterSharing() {
return std::holds_alternative<WarpSpecialized>(circular_buffer_type) &&
std::get<WarpSpecialized>(circular_buffer_type)
.num_registers.has_value();
}

template <typename data_type>
void compare(int64_t tensor_dim, at::Tensor result, at::Tensor reference) {
at::Tensor reference_cpu_data = reference.cpu();
Expand Down Expand Up @@ -1137,6 +1143,10 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) {

TEST_P(TmaCircularBufferingTest, SingleDim) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1187,6 +1197,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) {

TEST_P(TmaCircularBufferingTest, SingleDimUnroll) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1248,6 +1262,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) {

TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1309,6 +1327,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) {

TEST_P(TmaCircularBufferingTest, MultiDim) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1373,6 +1395,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) {

TEST_P(TmaCircularBufferingTest, Pointwise) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -1445,6 +1471,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) {
<< "Needs shared memory predicate, but current needsSharedMemoryPredicate() returns false";

NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -1510,6 +1540,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) {

TEST_P(TmaCircularBufferingTest, InnerReduction) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1571,6 +1605,10 @@ TEST_P(TmaCircularBufferingTest, InnerReduction) {

TEST_P(TmaCircularBufferingTest, OuterReduction) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1624,6 +1662,10 @@ TEST_P(TmaCircularBufferingTest, OuterReduction) {

TEST_P(TmaCircularBufferingTest, Persistent) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

constexpr at::ScalarType dtype = at::ScalarType::Float;
constexpr int64_t correction = 0;
Expand Down Expand Up @@ -1757,6 +1799,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) {

TEST_P(TmaCircularBufferingTest, Matmul) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1878,6 +1924,11 @@ TEST_P(TmaCircularBufferingTest, Matmul) {
TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);

if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down

0 comments on commit 28ae834

Please sign in to comment.