-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.h
163 lines (137 loc) · 6.96 KB
/
main.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#ifndef _MAIN_H
#define _MAIN_H value
// #ifdef MNIST
// #define NUM_POINTS 70000
// #define DIMENSION 784
// #define ROUNDED_DIMENSION 1024
// #define NUM_CLUSTER 10
// #define ROUNDED_CLUSTER 16
// #define DATA "mnist"
// #else
// #ifdef THREE_D
// #define NUM_POINTS 434874
// #define DIMENSION 3
// #define ROUNDED_DIMENSION 4
// #define NUM_CLUSTER 5
// #define ROUNDED_CLUSTER 8
// #define DATA "3d_pro_5"
// #else
// #ifdef CIFAR
// #define NUM_POINTS 6000
// #define DIMENSION 3072
// #define ROUNDED_DIMENSION 4096
// #define NUM_CLUSTER 10
// #define ROUNDED_CLUSTER 16
// #define DATA "cifar"
// #else
// #ifdef BIRCH1
// #define NUM_POINTS 100000
// #define DIMENSION 2
// #define ROUNDED_DIMENSION 2
// #define NUM_CLUSTER 100
// #define ROUNDED_CLUSTER 128
// #define DATA "birch1"
// #else
// #ifdef BIRCH2
// #define NUM_POINTS 100000
// #define DIMENSION 2
// #define ROUNDED_DIMENSION 2
// #define NUM_CLUSTER 100
// #define ROUNDED_CLUSTER 128
// #define DATA "birch2"
// #else
// #ifdef BIRCH3
// #define NUM_POINTS 100000
// #define DIMENSION 2
// #define ROUNDED_DIMENSION 2
// #define NUM_CLUSTER 100
// #define ROUNDED_CLUSTER 128
// #define DATA "birch3"
// #endif
// #endif
// #endif
// #endif
// #endif
// #endif
// #define NUM_POINTS 434874
// #define DIMENSION 3
// #define NUM_CLUSTER 5
// #define DATA "3d_pro_5"
// #define NUM_POINTS 581012
// #define DIMENSION 54
// #define NUM_CLUSTER 5
// #define DATA "3d_spatial"
#define NUM_POINTS 100000
#define DIMENSION 2
#define NUM_CLUSTER 100
#define DATA "birch1"
// Should be power of 2 and Max value is 1024 which is max number of threads_per_block on my GPU
#define SCAN_BLOCK_SIZE 32
// Implementation of step efficient scan is correct is WARP_SIZE for the GPU on which this code is running is 32.
// If warp size if different, then we need to change the implementation accordingly
// WARP_SIZE is the size of each partition/segement of the distance array for which scan operation is performed
#define WARP_SIZE 32
// #define BLOCK_SIZE 32
#define ceil(a,b) (a + b - 1)/b
#define roundUp(a,b) ((a + b - 1)/b)*b
#define ROUNDED_NUM_POINTS roundUp(NUM_POINTS,WARP_SIZE)
#define NUM_PARTITIONS ceil(NUM_POINTS,WARP_SIZE)
#define ROUNDED_NUM_PARTITIONS roundUp( NUM_PARTITIONS , WARP_SIZE )
#define NUM_META_PARTITIONS ceil(NUM_PARTITIONS,WARP_SIZE)
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#include "sys/time.h"
#include "omp.h"
#include "float.h"
#include <helper_cuda.h>
#include "cuda_profiler_api.h"
static inline float sd(float* a, int n);
static inline float mean(float* a, int n);
static inline float get_time_diff(struct timeval, struct timeval);
__global__ void kernelAddConstant(int *g_a, const int b);
int correctResult(int *data, const int n, const int b);
int main_check();
__global__ void comp_dist(float* dev_data,float* dev_distances,float* dev_partition_sums, float* dev_centers,int centerIter,int numPoints,int dev_dimension,int numGPUThreads);
__global__ void comp_dist_2(float* dev_data,float* dev_distances,float* dev_partition_sums, float* dev_centers,int centerIter,int numPoints,int dev_dimension,int numGPUThreads);
__global__ void comp_dist_glbl(float* dev_data,float* dev_distances,float* dev_partition_sums,int centerIter,int numPoints,int dev_dimension,int numGPUThreads);
__global__ void comp_dist_glbl_strided(float* dev_data,float* dev_distances,int centerIter,int numPoints, int rndedNumPoints);
__global__ void comp_dist_strided(float* dev_data,float* dev_distances,float* dev_centers,int centerIter,int numPoints,int dev_dimension,int rndedNumPoints);
__global__ void comp_dist_package(float* dev_data,float* dev_distances,float* dev_partition_sums, float* dev_centers,int centerIter,int numPoints,int dev_dimension,int numGPUThreads,float* dev_rnd);
__global__ void comp_dist_package_with_loop(float* dev_data, float* dev_centers,int numPoints,float *dev_rnd);
__global__ void comp_dist_package_with_loop_original(float* dev_data,float* dev_distances,float* dev_partition_sums, float* dev_centers,int numPoints,float *dev_rnd);
__global__ void mean_heuristic_GPU(float* dev_data, float* dev_centers,int numPoints,float *dev_rnd,float* dev_centers_temp);
int sample_from_distribution (float* probabilities, int startIndex, int endIndex, float prob);
__global__ void sample_from_distribution_gpu(float* dev_partition_sums, float* dev_distances, int* dev_sampled_indices, float* dev_rnd,int per_thread, int dev_NUM_POINTS, int dev_num_samples);
__global__ void sample_from_distribution_gpu_copy(float* dev_partition_sums, float* dev_distances, float* dev_multiset, float* dev_rnd,int per_thread, int dev_NUM_POINTS, int dev_num_samples,float* dev_data);
__global__ void sample_from_distribution_gpu_strided(float* dev_distances, int* dev_sampled_indices, float* dev_rnd, int dev_NUM_POINTS, int dev_num_samples);
__global__ void sample_from_distribution_gpu_strided_copy(float* dev_distances, float* dev_multiset, float* dev_rnd, int dev_NUM_POINTS, int dev_num_samples, float* dev_data);
__global__ void exc_scan_2(float* inData,float* outData,int n);
__global__ void inc_scan_1(float* inData,float* outData,int n);
__global__ void inc_scan_1_block(float* inData,float* outData,int n,float* block_sums);
__global__ void inc_scan_1_rev(float* inData,float* outData,int n);
__global__ void inc_scan_1_add(float* block,float* block_sums,int n);
__global__ void inc_scan_1_block_SE(float* inData,float* outData,float* block_sums);
void testScan();
__global__ void copy_to_multiset(float* dev_multiset,float* dev_data,int* dev_sampled_indices);
float distance(float* p1, float* p2);
float* mean_heuristic(float* multiset,int multisetSize);
float* mean_heuristic_assign(float* multiset,int multisetSize,float* level_2_sample);
__global__ void mean_heuristic_assign_gpu(float* dev_multiset,int multisetSize, float* dev_l2_samples,float* dev_centers_temp);
float* d2_sample(float* data,float* centers,int numPts, int numSamples, int size);
float* d2_sample_2(float* data,float* centers,int numPts, int numSamples, int size, float* distances);
float* d2_sample_3(float* data,float* centers,int numPts, int size, float* distances);
// This version of d2_sample has optimized cost calculation by using cost computed in last iteration
// This one does not use the cumulative distance array to get costs computed in last iteration,
// instead it makes another copy of distance array which is cumulative, for sampling purposes
float* d2_sample_4(float* data,float* centers,int numPts, int numSamples, int size, float* distances);
// This version of d2_sample has optimized cost calculation by using cost computed in last iteration
// This is specially optimized for mean_heurisitic
// This one does not make distance array cumulative in-place, instead a separate array is used to to get cumulative distances
float* d2_sample_5(float* data,float* centers,int numPts, int size, float* distances);
void write_centers_to_file(float* centers);
int roundToPowerOf2(int num);
#endif