// test_all_blocks.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NVCC
#include "printing.cuh"
#include "cuda_err_check.h"
#include "read_volume.h"
#include "write_volume.h"
#include "wavelet_slow.h"
#include "wavelet_slow.cuh"
#include "compare.h"
#include "diff.h"
#include "norms.h"
#include "init_x.h"
#include "init_random.h"
// Template arguments handed to wl79_h in test_kernel. Judging by the names they
// presumably select the forward and the inverse transform; wl79_h is declared in
// another header, so confirm the exact meaning there.
const int FORWARD = 0;
const int INVERSE = 1;
// Copy the device buffer back to the host and compare it against a host buffer.
//
// Parameters:
//   x_gpu          host buffer that receives the contents of d_x
//   d_x            device buffer to be checked
//   x              host buffer the copied data is compared against
//   nx, ny, nz,
//   bx, by, bz     the buffers hold nx*ny*nz*bx*by*bz floats
//   verbose        when true, print every computed error norm
//   l2_tol, l1_tol, linf_tol
//                  upper bounds for the errors labelled "rel."
//
// Returns 0 when all "rel." errors are within tolerance, and 1 when a tolerance
// is exceeded, an error norm is NaN, or the device-to-host copy fails.
int err_check(float *x_gpu, float *d_x, float *x, const int nx, const int ny, const int nz,
              const int bx, const int by, const int bz, const bool verbose=false,
              const double l2_tol=1e-5, const double l1_tol=1e-5, const double linf_tol=1e-5) {
    // Widen before multiplying so the products are evaluated in size_t rather
    // than overflowing in int.
    const size_t b = (size_t)bx * by * bz;
    const size_t n = (size_t)nx * ny * nz;
    const size_t count = b * n;
    const size_t num_bytes = count * sizeof(float);

    // A blocking copy also reports faults raised by earlier asynchronous work,
    // so an unchecked failure here would compare stale or garbage data.
    const cudaError_t copy_status = cudaMemcpy(x_gpu, d_x, num_bytes, cudaMemcpyDeviceToHost);
    if (copy_status != cudaSuccess) {
        fprintf(stderr, "err_check: device-to-host copy failed: %s\n",
                cudaGetErrorString(copy_status));
        return 1;
    }

    // Index 0 and 1 of this table label the two modes passed to the norm helpers.
    const char *errtype[] = {"abs.", "rel."};
    for (int a = 0; a < 2; ++a) {
        const double l2err = l2norm(x, x_gpu, count, a);
        const double l1err = l1norm(x, x_gpu, count, a);
        const double linferr = linfnorm(x, x_gpu, count, a);
        if (verbose) printf("%s l2 error = %g l1 error = %g linf error = %g \n",
                            errtype[a], l2err, l1err, linferr);
        // Only the "rel." pass decides the outcome. The negated <= form makes a
        // NaN norm count as a failure instead of slipping through a > test.
        if (a == 1 &&
            (!(l2err <= l2_tol) || !(l1err <= l1_tol) || !(linferr <= linf_tol))) {
            return 1;
        }
    }
    return 0;
}
// Write the outcome of a test to stdout: "OK" for a zero error code,
// "FAILED" for anything else.
void print_status(int err) {
    const char *verdict = (err != 0) ? "FAILED\n" : "OK\n";
    fputs(verdict, stdout);
}
// Run one kernel variant through wl79_h<FORWARD> followed by wl79_h<INVERSE>
// on a buffer filled by init_random, then compare the result with the
// original host data via err_check.
//
// Parameters:
//   k              kernel variant to exercise
//   nx, ny, nz,
//   bx, by, bz     the test buffer holds nx*ny*nz*bx*by*bz floats
//   verbose        forwarded to err_check
//
// Prints the kernel name and sizes followed by "OK" or "FAILED".
// Returns 0 on success and non-zero when the comparison fails, a host
// allocation fails, or any CUDA call reports an error.
int test_kernel(enum kernel k, const int nx, const int ny, const int nz, const int bx, const int by, const int bz, const int verbose) {
    // Start from NULL so the pointer is never read or freed uninitialized.
    // NOTE(review): init_random is assumed to allocate and fill x (it is
    // released with free() below) - confirm against init_random.h.
    float *x = NULL;
    init_random(x, nx, ny, nz, bx, by, bz);

    const size_t num_bytes = sizeof(float) * nx * ny * nz * bx * by * bz;
    float *x_gpu = (float*)malloc(num_bytes);
    float *d_x = NULL;

    printf("%s \t [%d, %d, %d] [%d, %d, %d] \n", get_kernel_name(k), nx, ny, nz, bx, by, bz);

    int err = 1;  // treated as failed until the comparison has actually run
    if (x == NULL || x_gpu == NULL) {
        fprintf(stderr, "test_kernel: host allocation failed\n");
    } else {
        cudaError_t status = cudaMalloc((void**)&d_x, num_bytes);
        if (status == cudaSuccess) {
            status = cudaMemcpy(d_x, x, num_bytes, cudaMemcpyHostToDevice);
        }
        if (status == cudaSuccess) {
            wl79_h<FORWARD>(k, d_x, bx, by, bz);
            wl79_h<INVERSE>(k, d_x, bx, by, bz);
            // Launch failures are not returned by the calls themselves.
            status = cudaGetLastError();
        }
        if (status == cudaSuccess) {
            // Faults raised while the device work executes surface here.
            status = cudaDeviceSynchronize();
        }
        if (status == cudaSuccess) {
            err = err_check(x_gpu, d_x, x, nx, ny, nz, bx, by, bz, verbose);
        } else {
            fprintf(stderr, "test_kernel: CUDA error: %s\n", cudaGetErrorString(status));
        }
    }

    print_status(err);

    // free(NULL) and cudaFree(NULL) are no-ops, so cleanup is unconditional.
    free(x);
    free(x_gpu);
    cudaFree(d_x);
    return err;
}
// Run every kernel variant through test_kernel and report the overall result
// through the process exit code: EXIT_SUCCESS when all variants pass,
// EXIT_FAILURE when at least one fails. Command-line arguments are not used.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int verbose = 0;
    // Sizes forwarded as the bx/by/bz arguments of test_kernel.
    const int bx = 11;
    const int by = 9;
    const int bz = 8;

    int num_failed = 0;

    // The 8x8x8 variant gets literal extents; the original passed bz here,
    // which only matched because bz happens to equal 8.
    num_failed += (test_kernel(WL79_8x8x8, 8, 8, 8, bx, by, bz, verbose) != 0);

    // All remaining variants take 32x32x32 as their nx/ny/nz arguments.
    const enum kernel kernels_32[] = {
        WL79_32x32x32,
        OPT1WL79_32x32x32,
        OPT2WL79_32x32x32,
        OPT3WL79_32x32x32,
        OPT4WL79_32x32x32,
        OPT5WL79_32x32x32,
        OPT6WL79_32x32x32,
        OPT7WL79_32x32x32,
    };
    const int num_kernels_32 = (int)(sizeof(kernels_32) / sizeof(kernels_32[0]));
    for (int i = 0; i < num_kernels_32; ++i) {
        num_failed += (test_kernel(kernels_32[i], 32, 32, 32, bx, by, bz, verbose) != 0);
    }

    return num_failed == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}