-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvector_fft_floats.cl
61 lines (51 loc) · 1.62 KB
/
vector_fft_floats.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include "cplx.c"
/*
// The fft "butterfly" stage merges two contiguous halves.
//
// Here, 'i' is the size of a half.
//
// For each 'i', we need to compute the index at which
// our thread will operate.
//
// for i = 1:
// -+-+-+-+
// 0 1 2 3
//
// for i = 2:
// --++--++
// 01 23
//
// for i = 4:
// ----++++
// 0123
//
// ... and in the general case:
//
// ---- (i) ----++++ (i) ++++---- (i) ----++++ (i) ++++ ...
// 0123 .....i-1 i ... 2i-1
//
// so from the diagram above we can see that: idx = 2*i*(k/i) + (k % i)
//
// and by definition of the modulo: k%i = k - i*(k/i)
// so we can write: idx = i*(k/i) + k
*/
// This algorithm has the nice property that there is no "branch divergence":
// all threads have the same control flow. So it seems it's the perfect candidate
// for running on a GPU, because the parallelization is massive.
__kernel void kernel_func(__global const float *input, __global const struct cplx *twiddle, __global struct cplx *output) {
// Get the index of the current element to be processed
int const k = get_global_id(0); // we use a single dimension hence we pass 0
output[2*k] = complexFromReal(input[2*k]);
output[2*k+1] = complexFromReal(input[2*k+1]);
barrier(CLK_GLOBAL_MEM_FENCE);
int const n_global_butterflies = get_global_size(0);
for(int i=1; i<=n_global_butterflies; i <<= 1) {
int const tmp = i*(k/i);
int const idx = tmp + k;
//assert(idx+i < Sz);
int const ri = k - tmp;
int const tIdx = ri*(n_global_butterflies/i);
butterfly_global(output+idx, i, twiddle[tIdx]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
}