Prefer Array class over register arrays. #3737

Open · wants to merge 1 commit into base: main

This PR switches the CUDA code generator to always declare Local (register) buffers with the Array wrapper class instead of as raw C arrays, and updates the expected kernel strings in the affected tests to match.
13 changes: 6 additions & 7 deletions csrc/codegen.cpp
@@ -265,7 +265,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
     } else if (v->isA<TensorView>()) {
       tv = v->as<TensorView>();
     }
-    if (tv && aligned_array_of_regs_.count(tv)) {
+    if (tv &&
+        (aligned_array_of_regs_.count(tv) ||
+         tv->getMemoryType() == MemoryType::Local)) {
       return genVariableName(tv).append(".array");
     } else {
       return genVariableName(v);
@@ -3169,14 +3171,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
       break;
     case MemoryType::Local: {
       auto va = kernel_->summary().vectorized_accesses;
+      indent() << "Array<" << buffer_dtype << ", " << genInline(size)
+               << ", " << (va.find(tv) != va.end() ? va.at(tv) : 1) << "> "
+               << genVariableName(tv) << ";\n";
       if (va.find(tv) != va.end()) {
-        indent() << "Array<" << buffer_dtype << ", " << genInline(size)
-                 << ", " << va.at(tv) << "> " << genVariableName(tv)
-                 << ";\n";
         aligned_array_of_regs_.insert(tv);
-      } else {
-        indent() << buffer_dtype << " " << genVariableName(tv) << "["
-                 << genInline(size) << "];\n";
       }
     } break;
     default:
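For context, the Array type referenced in the generated code is nvFuser's runtime register-array wrapper. A minimal sketch of its shape, inferred from how the generated kernels use it (this is an assumption about the runtime header, not its verbatim contents):

template <typename scalar_t, int size, int align_size = 1>
struct alignas(sizeof(scalar_t) * align_size) Array {
  // Raw storage; this member is what the ".array" suffix above refers to.
  scalar_t array[size];

  __device__ scalar_t& operator[](const int i) {
    return array[i];
  }
  __device__ const scalar_t& operator[](const int i) const {
    return array[i];
  }
};

The third template argument acts as an alignment multiplier: the codegen change above passes va.at(tv) for buffers with vectorized accesses (so, for example, a float buffer vectorized by 4 becomes 16-byte aligned) and falls back to 1 for everything else, which matches the natural alignment of a plain array.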
32 changes: 16 additions & 16 deletions tests/cpp/test_loop_rotation.cpp
@@ -41,8 +41,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i1 = T0.alloc_stride[0LL] * i0;
   nvfuser_index_t i2;
   i2 = 3LL * i0;
-  float T1[1LL];
-  float T2[1LL];
+  Array<float, 1LL, 1> T1;
+  Array<float, 1LL, 1> T2;
   T1[0LL] = 0LL;
   T1[0LL]
     = T0[i1];
@@ -53,7 +53,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     nvfuser_index_t i4;
     i4 = (1LL + i3) + nvfuser_zero;
-    float T3[1LL];
+    Array<float, 1LL, 1> T3;
     T3[0LL]
       = T2[0LL];
     T4[(i2 + (i3 + nvfuser_zero))]
@@ -101,8 +101,8 @@ TEST_F(LoopRotationTest, RotateOuter) {
 const std::string expected_kernel = R"(
 __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T4) {
   NVFUSER_DEFINE_MAGIC_ZERO;
-  float T1[3LL];
-  float T2[3LL];
+  Array<float, 3LL, 1> T1;
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i0 = 0LL; i0 < 3LL; ++i0) {
     T1[i0] = 0LL;
@@ -202,8 +202,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i0 = T0.logical_size[0LL] * T0.logical_size[1LL];
   nvfuser_index_t i1;
   i1 = ceilDiv(i0, 5LL);
-  float T1[5LL];
-  float T2[5LL];
+  Array<float, 5LL, 1> T1;
+  Array<float, 5LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i2 = 0LL; i2 < 5LL; ++i2) {
     T1[i2] = 0LL;
@@ -306,7 +306,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   NVFUSER_DEFINE_MAGIC_ZERO;
   nvfuser_index_t i0;
   i0 = 4LL * T0.alloc_stride[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll 4
   for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
     nvfuser_index_t i2;
@@ -328,7 +328,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i6 = 0LL; i6 < 3LL; ++i6) {
     T2[i6]
@@ -362,7 +362,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T3[3LL];
+  Array<float, 3LL, 1> T3;
   #pragma unroll
   for(nvfuser_index_t i14 = 0LL; i14 < 3LL; ++i14) {
     T3[i14]
@@ -421,7 +421,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i1 = 5LL * T0.alloc_stride[0LL];
   bool b2;
   b2 = 4LL < T0.logical_size[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[i3] = 0LL;
@@ -454,7 +454,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[(12LL + i3)] = 0LL;
@@ -486,7 +486,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i13 = 3LL * ((1LL + i9) % 5LL);
   bool b14;
   b14 = (5LL + i9) < T0.logical_size[0LL];
-  float T3[3LL];
+  Array<float, 3LL, 1> T3;
   #pragma unroll
   for(nvfuser_index_t i15 = 0LL; i15 < 3LL; ++i15) {
     T3[i15]
@@ -599,7 +599,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
   asm volatile("cp.async.wait_group %0;\n"::"n"(3LL));
-  float T1[2LL];
+  Array<float, 2LL, 1> T1;
   T1[0LL]
     = T4[0LL];
   #pragma unroll 4
@@ -637,14 +637,14 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
     T1[((1LL + i14) % 2LL)]
       = T4[(i11 + i14)];
-    float T2[1LL];
+    Array<float, 1LL, 1> T2;
     T2[0LL]
       = T1[i14];
     T3[(i12 + (i14 + nvfuser_zero))]
       = T2[0LL];
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[1LL];
+  Array<float, 1LL, 1> T2;
   T2[0LL]
     = T1[0LL];
   T3[(2LL + i12)]
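Note that the test updates are purely mechanical: only the declaration line of each local buffer changes, because the wrapper's operator[] keeps every index expression the same. Schematically, using declarations taken from the diff above:

// Before this PR: raw register array (alignment left to the compiler).
float T1[3LL];
// After this PR: wrapper with explicit element count and alignment multiplier.
Array<float, 3LL, 1> T1;
// Uses are identical in both versions:
T1[0LL] = 0LL;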
2 changes: 1 addition & 1 deletion tests/cpp/test_scalar_hoisting.cpp
@@ -316,7 +316,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   b7 = i0 < i6;
   float f8;
   f8 = (float)(i6);
-  float T1[1LL];
+  Array<float, 1LL, 1> T1;
   if (b7) {
     T1[0LL]
       = sinf(T0[i0]);
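Tying this back to the first codegen.cpp hunk: once every Local buffer is an Array, contexts that need the underlying raw storage can no longer use the variable name directly (the wrapper does not decay to a pointer), which is presumably why gen() now appends ".array" for all Local tensors rather than only those recorded in aligned_array_of_regs_. A hypothetical fragment to illustrate, not taken from the diff:

// Hypothetical generated code, for illustration only.
Array<float, 4LL, 4> T1;
float* p = T1.array;   // .array is the raw float[4], which decays to float*
T1[0LL] = p[1];        // indexed accesses still go through operator[]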