-
Notifications
You must be signed in to change notification settings - Fork 11
virtual functions on GPUs
rhaas80 edited this page Nov 30, 2022
·
1 revision
I played around a bit with how one could use virtual functions and function pointers on GPUs. It is supported by nvcc but one must call the constructor on the GPU to make sure that function pointers point to GPU code.
I could not find a really "nice" way to make this work, but did not try very hard either.
Here are three flavors:
// compile using:
// nvcc virt.cu
#include <stdio.h>
#include <stdlib.h>
#include <new>
#include <cuda.h>
// Minimal class with a virtual member function. The vtable entry must point
// at device code, which is why the object has to be constructed by a kernel
// (see make_vtest below) rather than on the host.
class vtest
{
public:
    // Writes the answer into *a; callable from device code only.
    // Virtual so a derived class could override it on the GPU.
    virtual __device__ void func(float* a) {
        *a = 42.0f;  // float literal avoids an implicit double->float conversion
    }
};
// Placement-construct a vtest in pre-allocated device memory so that its
// vtable pointer refers to device code. Intended launch: <<<1, 1>>>; the
// guard makes a wider launch harmless (only thread 0 constructs).
__global__
void make_vtest(vtest* p)
{
    if(threadIdx.x != 0)
        return;
    new (p) vtest;
}
// Invoke the virtual function on the device-constructed object, one array
// element per thread. Expects a 1D launch covering at least n threads.
__global__
void use_vtest(vtest* foo, int n, float* y) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(idx >= n)
        return;  // grid tail: threads past the end do nothing
    foo->func(&y[idx]);
}
// Abort with a readable message if a CUDA runtime call failed; without this,
// an early sticky error makes every later call fail mysteriously.
static void checkCuda(cudaError_t err, const char* what)
{
    if(err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Driver: construct a vtest on the GPU, call its virtual function on an
// array of N floats, and print the results on the host.
int main(void)
{
    const size_t N = 10;
    float y[N];

    vtest* d_foo;
    checkCuda(cudaMalloc(&d_foo, sizeof(vtest)), "cudaMalloc d_foo");
    float* d_y;
    checkCuda(cudaMalloc(&d_y, sizeof(y)), "cudaMalloc d_y");

    for(size_t i = 0 ; i < N ; i++) {
        y[i] = 1.0f;
    }
    checkCuda(cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice),
              "cudaMemcpy H2D");

    // Run the constructor in device code so the vtable points at device code.
    make_vtest<<<1, 1>>>(d_foo);
    checkCuda(cudaGetLastError(), "make_vtest launch");
    checkCuda(cudaDeviceSynchronize(), "make_vtest sync");

    use_vtest<<<1, 256>>>(d_foo, N, d_y);
    checkCuda(cudaGetLastError(), "use_vtest launch");

    // A blocking cudaMemcpy also synchronizes with the kernel above.
    checkCuda(cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost),
              "cudaMemcpy D2H");
    for (int i = 0; i < (int)N; i++) {
        printf("y[%d]: %g\n", i, y[i]);
    }

    checkCuda(cudaFree(d_y), "cudaFree d_y");
    // NOTE(review): the vtest destructor is never run. If the class ever owns
    // resources, a small kernel calling p->~vtest() must run before this free.
    checkCuda(cudaFree(d_foo), "cudaFree d_foo");
    return 0;
}
// compile using:
// nvcc -std=c++11 --expt-extended-lambda virtlambda.cu
#include <cstdio>   // printf / fprintf used by this example
#include <iostream>
// Same idea as the first example, but func is const so it can be called from
// a capturing __device__ lambda without restriction. The object must still be
// constructed on the GPU so its vtable points at device code.
class vtest
{
public:
    // Writes the answer into *a; device-only virtual call.
    virtual __device__ void func(float* a) const {
        *a = 42.0f;  // float literal avoids an implicit double->float conversion
    }
};
// Generic launcher: every thread applies the callable to the device pointer.
// Used here to run an extended __device__ lambda that captures a device
// object by value.
template <typename Callable>
__global__ void call(Callable fn, float *y) {
    fn(y);
}
// Run the vtest constructor in device memory via placement-new so the vtable
// is filled with device function addresses. Only thread 0 does the work, so
// any 1D launch is safe; <<<1, 1>>> is the intended configuration.
__global__
void make_vtest(vtest* p)
{
    if(threadIdx.x != 0)
        return;
    new (p) vtest;
}
// Host-side convenience wrapper: allocate device memory for a vtest and run
// its constructor on the GPU so the vtable points at device code.
// Returns the device pointer on success, nullptr on failure. The caller owns
// the pointer and must cudaFree() it (note: no destructor is ever run).
vtest* make_vtest_gpu() {
    vtest* d_vtest = nullptr;
    if(cudaMalloc(&d_vtest, sizeof(vtest)) != cudaSuccess) {
        fprintf(stderr, "make_vtest_gpu: cudaMalloc failed\n");
        return nullptr;
    }
    make_vtest<<<1, 1>>>(d_vtest);
    // cudaGetLastError catches launch-config errors; the synchronize catches
    // asynchronous execution errors from the constructor kernel.
    if(cudaGetLastError() != cudaSuccess ||
       cudaDeviceSynchronize() != cudaSuccess) {
        fprintf(stderr, "make_vtest_gpu: constructor kernel failed\n");
        cudaFree(d_vtest);
        return nullptr;
    }
    return d_vtest;
}
// Driver: capture the device-constructed object in an extended __device__
// lambda and launch it through the generic `call` kernel.
// Requires: nvcc -std=c++11 --expt-extended-lambda
int main(void)
{
    vtest *d_obj = make_vtest_gpu();
    if(d_obj == nullptr) {  // defensive: construction may fail
        fprintf(stderr, "failed to construct vtest on the GPU\n");
        return 1;
    }

    // Extended device lambda capturing the device pointer by value; the
    // virtual call dispatches through the GPU-built vtable.
    auto fun = [=] __device__ (float *a){
        d_obj->func(a);
    };

    const size_t N = 1;
    float y[N];
    float* d_y;
    if(cudaMalloc(&d_y, sizeof(y)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc d_y failed\n");
        return 1;
    }
    for(size_t i = 0 ; i < N ; i++) {
        y[i] = 1.0f;
    }
    cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

    call<<<1, 1>>>(fun, d_y);
    if(cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "call kernel launch failed\n");
        return 1;
    }

    // The blocking copy also synchronizes with the kernel above.
    cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);
    for (int i = 0; i < (int)N; i++) {
        printf("y[%d]: %g\n", i, y[i]);
    }

    cudaFree(d_y);
    cudaFree(d_obj);  // was leaked in the original; destructor still never runs
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <new>
#include <cuda.h>
// Third flavor of the test class: the object itself is created with
// device-side operator new (see make_vtest below), so the vtable entry
// again points at device code.
class vtest
{
public:
    // Writes the answer into *a; device-only virtual call.
    virtual __device__ void func(float *a) {
        *a = 42.0f;  // same stored value as the original's integer literal
    }
};
// Allocate AND construct the vtest with device-side operator new, publishing
// its address through *p. The allocation comes from the device heap
// (sized via cudaLimitMallocHeapSize); device new returns nullptr on
// exhaustion, which is not checked here. Intended launch: <<<1, 1>>>.
__global__
void make_vtest(vtest** p)
{
    if(threadIdx.x != 0)
        return;
    *p = new vtest();
}
// Call the virtual function on the device-heap object, one array element per
// thread. Takes the pointer-to-pointer that make_vtest filled in; expects a
// 1D launch covering at least n threads.
__global__
void use_vtest(vtest** foo, int n, float* y) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(idx >= n)
        return;  // grid tail: threads past the end do nothing
    (*foo)->func(&y[idx]);
}
// Abort with a readable message if a CUDA runtime call failed; without this,
// an early sticky error makes every later call fail mysteriously.
static void cuda_check(cudaError_t err, const char* what)
{
    if(err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Driver: the object lives on the device heap; the host only ever handles a
// vtest** slot pointing at it.
int main(void)
{
    // on AMReX one would use the arena allocator to get shared memory for the
    // pointer storage
    vtest** d_foo = 0;
    cuda_check(cudaMalloc(&d_foo, sizeof(vtest*)), "cudaMalloc d_foo");
    make_vtest<<<1, 1>>>(d_foo);
    cuda_check(cudaGetLastError(), "make_vtest launch");

    // some storage to test things, AMReX would do this for us
    const size_t N = 10;
    float y[N];
    float* d_y;
    cuda_check(cudaMalloc(&d_y, sizeof(y)), "cudaMalloc d_y");
    for(size_t i = 0 ; i < N ; i++) {
        y[i] = 1.0f;
    }
    cuda_check(cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice),
               "cudaMemcpy H2D");
    // Ensure the constructor kernel finished before the object is used.
    cuda_check(cudaDeviceSynchronize(), "sync after make_vtest");

    // use our GPU constructed object with virtual member functions
    use_vtest<<<1, 256>>>(d_foo, N, d_y);
    cuda_check(cudaGetLastError(), "use_vtest launch");
    cuda_check(cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost),
               "cudaMemcpy D2H");
    for (int i = 0; i < (int)N; i++) {
        printf("y[%d]: %g\n", i, y[i]);
    }

    // NOTE(review): the device-new'd vtest is never deleted; releasing it
    // would need a kernel that does `delete *d_foo` (device-heap memory
    // cannot be freed with cudaFree).
    cuda_check(cudaFree(d_y), "cudaFree d_y");    // was leaked in the original
    cuda_check(cudaFree(d_foo), "cudaFree d_foo"); // was leaked in the original
    return 0;
}