Skip to content

Commit

Permalink
Prefer the Array class over register arrays, e.g. Array<float, 2> rather …
Browse files Browse the repository at this point in the history
…than float[2].
  • Loading branch information
csarofeen committed Jan 20, 2025
1 parent 062dd50 commit 3158d84
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 24 deletions.
13 changes: 6 additions & 7 deletions csrc/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
} else if (v->isA<TensorView>()) {
tv = v->as<TensorView>();
}
if (tv && aligned_array_of_regs_.count(tv)) {
if (tv &&
(aligned_array_of_regs_.count(tv) ||
tv->getMemoryType() == MemoryType::Local)) {
return genVariableName(tv).append(".array");
} else {
return genVariableName(v);
Expand Down Expand Up @@ -3169,14 +3171,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
break;
case MemoryType::Local: {
auto va = kernel_->summary().vectorized_accesses;
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << (va.find(tv) != va.end() ? va.at(tv) : 1) << "> "
<< genVariableName(tv) << ";\n";
if (va.find(tv) != va.end()) {
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << va.at(tv) << "> " << genVariableName(tv)
<< ";\n";
aligned_array_of_regs_.insert(tv);
} else {
indent() << buffer_dtype << " " << genVariableName(tv) << "["
<< genInline(size) << "];\n";
}
} break;
default:
Expand Down
32 changes: 16 additions & 16 deletions tests/cpp/test_loop_rotation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
i1 = T0.alloc_stride[0LL] * i0;
nvfuser_index_t i2;
i2 = 3LL * i0;
float T1[1LL];
float T2[1LL];
Array<float, 1LL, 1> T1;
Array<float, 1LL, 1> T2;
T1[0LL] = 0LL;
T1[0LL]
= T0[i1];
Expand All @@ -53,7 +53,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
nvfuser_index_t i4;
i4 = (1LL + i3) + nvfuser_zero;
float T3[1LL];
Array<float, 1LL, 1> T3;
T3[0LL]
= T2[0LL];
T4[(i2 + (i3 + nvfuser_zero))]
Expand Down Expand Up @@ -101,8 +101,8 @@ TEST_F(LoopRotationTest, RotateOuter) {
const std::string expected_kernel = R"(
__global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T4) {
NVFUSER_DEFINE_MAGIC_ZERO;
float T1[3LL];
float T2[3LL];
Array<float, 3LL, 1> T1;
Array<float, 3LL, 1> T2;
#pragma unroll
for(nvfuser_index_t i0 = 0LL; i0 < 3LL; ++i0) {
T1[i0] = 0LL;
Expand Down Expand Up @@ -202,8 +202,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
i0 = T0.logical_size[0LL] * T0.logical_size[1LL];
nvfuser_index_t i1;
i1 = ceilDiv(i0, 5LL);
float T1[5LL];
float T2[5LL];
Array<float, 5LL, 1> T1;
Array<float, 5LL, 1> T2;
#pragma unroll
for(nvfuser_index_t i2 = 0LL; i2 < 5LL; ++i2) {
T1[i2] = 0LL;
Expand Down Expand Up @@ -306,7 +306,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
NVFUSER_DEFINE_MAGIC_ZERO;
nvfuser_index_t i0;
i0 = 4LL * T0.alloc_stride[0LL];
float T1[15LL];
Array<float, 15LL, 1> T1;
#pragma unroll 4
for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
nvfuser_index_t i2;
Expand All @@ -328,7 +328,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
float T2[3LL];
Array<float, 3LL, 1> T2;
#pragma unroll
for(nvfuser_index_t i6 = 0LL; i6 < 3LL; ++i6) {
T2[i6]
Expand Down Expand Up @@ -362,7 +362,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
float T3[3LL];
Array<float, 3LL, 1> T3;
#pragma unroll
for(nvfuser_index_t i14 = 0LL; i14 < 3LL; ++i14) {
T3[i14]
Expand Down Expand Up @@ -421,7 +421,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
i1 = 5LL * T0.alloc_stride[0LL];
bool b2;
b2 = 4LL < T0.logical_size[0LL];
float T1[15LL];
Array<float, 15LL, 1> T1;
#pragma unroll
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
T1[i3] = 0LL;
Expand Down Expand Up @@ -454,7 +454,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
float T2[3LL];
Array<float, 3LL, 1> T2;
#pragma unroll
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
T1[(12LL + i3)] = 0LL;
Expand Down Expand Up @@ -486,7 +486,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
i13 = 3LL * ((1LL + i9) % 5LL);
bool b14;
b14 = (5LL + i9) < T0.logical_size[0LL];
float T3[3LL];
Array<float, 3LL, 1> T3;
#pragma unroll
for(nvfuser_index_t i15 = 0LL; i15 < 3LL; ++i15) {
T3[i15]
Expand Down Expand Up @@ -599,7 +599,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
}
NVFUSER_UPDATE_MAGIC_ZERO;
asm volatile("cp.async.wait_group %0;\n"::"n"(3LL));
float T1[2LL];
Array<float, 2LL, 1> T1;
T1[0LL]
= T4[0LL];
#pragma unroll 4
Expand Down Expand Up @@ -637,14 +637,14 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
T1[((1LL + i14) % 2LL)]
= T4[(i11 + i14)];
float T2[1LL];
Array<float, 1LL, 1> T2;
T2[0LL]
= T1[i14];
T3[(i12 + (i14 + nvfuser_zero))]
= T2[0LL];
}
NVFUSER_UPDATE_MAGIC_ZERO;
float T2[1LL];
Array<float, 1LL, 1> T2;
T2[0LL]
= T1[0LL];
T3[(2LL + i12)]
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/test_scalar_hoisting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
b7 = i0 < i6;
float f8;
f8 = (float)(i6);
float T1[1LL];
Array<float, 1LL, 1> T1;
if (b7) {
T1[0LL]
= sinf(T0[i0]);
Expand Down

0 comments on commit 3158d84

Please sign in to comment.