-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathopt_32_6.cuh
153 lines (116 loc) · 4.67 KB
/
opt_32_6.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include "opt32.cuh"
#pragma once
template <int kernel>
inline __device__ void opt6ds79_compute(float *p_in, int stride) {
if (kernel == 0)
opt5ds79_compute2(p_in, stride);
else
us79_compute<32>(p_in, stride);
}
template <int kernel, int block_y>
__launch_bounds__(32 * block_y, 1)
__global__ void opt6wl79_32x32x32(float *in) {
int idx = threadIdx.x;
int idy = threadIdx.y;
const int planes = 12;
const int block_size = 32 * 32 * 32;
const int snx = 33;
const int sny = 32;
const int snxy = snx * sny;
extern __shared__ float smem[];
size_t block_idx =
block_size *
(blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z));
// Load one plane per warp into registers
float values[32];
const int ld = 32;
const int plane_size = 32 * 32;
// Put the warps in groups and make each group wait for an estimated amount of time that
// corresponds to how long it takes for one group to complete all of its global memory load
// instructions
{
int group_idx = idy / 4;
clock_t start = clock64();
clock_t stop = start + group_idx * 1200;
while (start < stop) {
start = clock64();
}
}
#pragma unroll
for (int i = 0; i < ld; ++i) {
values[i] = in[idx + i * ld + idy * plane_size + block_idx];
}
// Store data in shared memory in groups of 12, 12, 8, warps
const int num_groups = 3;
int group_size = planes;
int group = idy / group_size;
int group_idx = idy % group_size;
// Transform in y
opt6ds79_compute<kernel>(values, 1);
for (int g = 0; g < num_groups; ++g) {
// store in shared memory
if (g == group) {
#pragma unroll
for (int i = 0; i < ld; ++i) {
int smem_idx = idx + snx * i + group_idx * snxy;
smem[smem_idx] = values[i];
}
__syncwarp();
// Transform in x
opt6ds79_compute<kernel>(&smem[snx * idx + group_idx * snxy], 1);
__syncwarp();
// Write result to registers
#pragma unroll
for (int i = 0; i < ld; ++i) {
int smem_idx = idx + snx * i + group_idx * snxy;
values[i] = smem[smem_idx];
}
}
__syncthreads();
}
// Processes along the y-direction with all warps at once, but in three batches due to
// shared memory limitations
int offset = 0;
#pragma unroll
for (int g = 0; g < num_groups; ++g) {
#pragma unroll
for (int i = 0; i < group_size; ++i) {
int smem_idx = idy + snx * idx + snxy * i;
if (offset + i > 31) break;
smem[smem_idx] = values[i + offset];
offset += 1;
}
__syncthreads();
offset -= group_size;
// Transform in z
if (g == group) {
opt6ds79_compute<kernel>(&smem[snx * idx + snxy * group_idx], 1);
}
__syncthreads();
// Write result to registers
#pragma unroll
for (int i = 0; i < group_size; ++i) {
int smem_idx = idy + snx * idx + snxy * i;
values[i + offset] = smem[smem_idx];
offset += 1;
}
__syncthreads();
}
// Write final result to global memory
#pragma unroll
for (int i = 0; i < ld; ++i) {
in[idx + i * ld + idy * plane_size + block_idx] = values[i];
}
}
template <int mode>
void opt6wl79_32x32x32_h(float *in, const int bx, const int by, const int bz) {
const int block_y = 32;
dim3 threads(32, block_y, 1);
dim3 blocks(bx, by, bz);
const size_t smem = 50688;
cudaErrCheck(cudaFuncSetAttribute(
opt6wl79_32x32x32<mode, block_y>,
cudaFuncAttributeMaxDynamicSharedMemorySize, smem));
opt6wl79_32x32x32<mode, block_y><<<blocks, threads, smem>>>(in);
cudaErrCheck(cudaPeekAtLastError());
}