-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCLRadixSort.cl
256 lines (188 loc) · 5.95 KB
/
CLRadixSort.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
// OpenCL kernel sources for the CLRadixSort class
// the #include does not exist in OpenCL
// thus we simulate the #include "CLRadixSortParam.hpp" by
// string manipulations
// compute the histogram for each radix and each virtual processor for the pass
__kernel void histogram(const __global int* d_Keys,
__global int* d_Histograms,
const int pass,
__local int* loc_histo,
const int n){
int it = get_local_id(0); // i local number of the processor
int ig = get_global_id(0); // global number = i + g I
int gr = get_group_id(0); // g group number
int groups=get_num_groups(0);
int items=get_local_size(0);
// set the local histograms to zero
for(int ir=0;ir<_RADIX;ir++){
loc_histo[ir * items + it] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
// range of keys that are analyzed by the work item
int size= n/groups/items; // size of the sub-list
int start= ig * size; // beginning of the sub-list
int key,shortkey,k;
// compute the index
// the computation depends on the transposition
for(int j= 0; j< size;j++){
#ifdef TRANSPOSE
k= groups * items * j + ig;
#else
k=j+start;
#endif
key=d_Keys[k];
// extract the group of _BITS bits of the pass
// the result is in the range 0.._RADIX-1
shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
// increment the local histogram
loc_histo[shortkey * items + it ]++;
}
barrier(CLK_LOCAL_MEM_FENCE);
// copy the local histogram to the global one
for(int ir=0;ir<_RADIX;ir++){
d_Histograms[items * (ir * groups + gr) + it]=loc_histo[ir * items + it];
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
// initial transpose of the list for improving
// coalescent memory access
__kernel void transpose(const __global int* invect,
__global int* outvect,
const int nbcol,
const int nbrow,
const __global int* inperm,
__global int* outperm,
__local int* blockmat,
__local int* blockperm,
const int tilesize){
int i0 = get_global_id(0)*tilesize; // first row index
int j = get_global_id(1); // column index
int jloc = get_local_id(1); // local column index
// fill the cache
for(int iloc=0;iloc<tilesize;iloc++){
int k=(i0+iloc)*nbcol+j; // position in the matrix
blockmat[iloc*tilesize+jloc]=invect[k];
#ifdef PERMUT
blockperm[iloc*tilesize+jloc]=inperm[k];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
// first row index in the transpose
int j0=get_group_id(1)*tilesize;
// put the cache at the good place
for(int iloc=0;iloc<tilesize;iloc++){
int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
outvect[kt]=blockmat[jloc*tilesize+iloc];
#ifdef PERMUT
outperm[kt]=blockperm[jloc*tilesize+iloc];
#endif
}
}
// each virtual processor reorders its data using the scanned histogram
__kernel void reorder(const __global int* d_inKeys,
__global int* d_outKeys,
__global int* d_Histograms,
const int pass,
__global int* d_inPermut,
__global int* d_outPermut,
__local int* loc_histo,
const int n){
int it = get_local_id(0);
int ig = get_global_id(0);
int gr = get_group_id(0);
int groups=get_num_groups(0);
int items=get_local_size(0);
int start= ig *(n/groups/items);
int size= n/groups/items;
// take the histogram in the cache
for(int ir=0;ir<_RADIX;ir++){
loc_histo[ir * items + it]=
d_Histograms[items * (ir * groups + gr) + it];
}
barrier(CLK_LOCAL_MEM_FENCE);
int newpos,key,shortkey,k,newpost;
for(int j= 0; j< size;j++){
#ifdef TRANSPOSE
k= groups * items * j + ig;
#else
k=j+start;
#endif
key = d_inKeys[k];
shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
newpos=loc_histo[shortkey * items + it];
#ifdef TRANSPOSE
int ignew,jnew;
ignew= newpos/(n/groups/items);
jnew = newpos%(n/groups/items);
newpost = jnew * (groups*items) + ignew;
#else
newpost=newpos;
#endif
d_outKeys[newpost]= key; // killing line !!!
#ifdef PERMUT
d_outPermut[newpost]=d_inPermut[k];
#endif
newpos++;
loc_histo[shortkey * items + it]=newpos;
}
}
// perform a parallel prefix sum (a scan) on the local histograms
// (see Blelloch 1990) each workitem worries about two memories
// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
__kernel void scanhistograms( __global int* histo,__local int* temp,__global int* globsum){
int it = get_local_id(0);
int ig = get_global_id(0);
int decale = 1;
int n=get_local_size(0) * 2 ;
int gr=get_group_id(0);
// load input into local memory
// up sweep phase
temp[2*it] = histo[2*ig];
temp[2*it+1] = histo[2*ig+1];
// parallel prefix sum (algorithm of Blelloch 1990)
for (int d = n>>1; d > 0; d >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d){
int ai = decale*(2*it+1)-1;
int bi = decale*(2*it+2)-1;
temp[bi] += temp[ai];
}
decale *= 2;
}
// store the last element in the global sum vector
// (maybe used in the next step for constructing the global scan)
// clear the last element
if (it == 0) {
globsum[gr]=temp[n-1];
temp[n - 1] = 0;
}
// down sweep phase
for (int d = 1; d < n; d *= 2){
decale >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d){
int ai = decale*(2*it+1)-1;
int bi = decale*(2*it+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// write results to device memory
histo[2*ig] = temp[2*it];
histo[2*ig+1] = temp[2*it+1];
barrier(CLK_GLOBAL_MEM_FENCE);
}
// use the global sum for updating the local histograms
// each work item updates two values
__kernel void pastehistograms( __global int* histo,__global int* globsum){
int ig = get_global_id(0);
int gr=get_group_id(0);
int s;
s=globsum[gr];
// write results to device memory
histo[2*ig] += s;
histo[2*ig+1] += s;
barrier(CLK_GLOBAL_MEM_FENCE);
}