Skip to content

Commit

Permalink
add ginkgo dpcpp to cv_heat2D example
Browse files Browse the repository at this point in the history
Signed-off-by: Yu-Hsiang M. Tsai <[email protected]>
  • Loading branch information
yhmtsai committed Oct 18, 2023
1 parent a3ca695 commit 3f378d5
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 33 deletions.
5 changes: 4 additions & 1 deletion examples/cvode/ginkgo/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ set(cpu_gpu_examples

sundials_add_examples_ginkgo(cpu_gpu_examples
TARGETS sundials_cvode
BACKENDS REF OMP CUDA HIP)
BACKENDS REF OMP CUDA HIP DPCPP)

# Examples that only support CPU Ginkgo backends
set(cpu_examples
Expand All @@ -39,6 +39,9 @@ if(EXAMPLES_INSTALL)
if(SUNDIALS_GINKGO_BACKENDS MATCHES "HIP")
list(APPEND vectors nvechip)
endif()
if(SUNDIALS_GINKGO_BACKENDS MATCHES "DPCPP")
list(APPEND vectors nvecsycl)
endif()
if((SUNDIALS_GINKGO_BACKENDS MATCHES "OMP") OR
(SUNDIALS_GINKGO_BACKENDS MATCHES "REF"))
list(APPEND vectors nvecserial)
Expand Down
185 changes: 160 additions & 25 deletions examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,23 @@

#if defined(USE_CUDA)
#include <nvector/nvector_cuda.h>
#define HIP_OR_CUDA(a, b) b
#define HIP_OR_CUDA_SYCL(a, b, c) b
constexpr auto N_VNew = N_VNew_Cuda;
#elif defined(USE_HIP)
#include <nvector/nvector_hip.h>
#define HIP_OR_CUDA(a, b) a
#define HIP_OR_CUDA_SYCL(a, b, c) a
constexpr auto N_VNew = N_VNew_Hip;
#elif defined(USE_DPCPP)
#include <nvector/nvector_sycl.h>
#define HIP_OR_CUDA_SYCL(a, b, c) c
constexpr auto N_VNew = N_VNew_Sycl;
#elif defined(USE_OMP)
#include <nvector/nvector_serial.h>
#define HIP_OR_CUDA(a, b)
#define HIP_OR_CUDA_SYCL(a, b, c)
constexpr auto N_VNew = N_VNew_Serial;
#else
#include <nvector/nvector_serial.h>
#define HIP_OR_CUDA(a, b)
#define HIP_OR_CUDA_SYCL(a, b, c)
constexpr auto N_VNew = N_VNew_Serial;
#endif

Expand All @@ -85,6 +89,12 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data);
// Jacobian of RHS function
int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3);


// -----------------------------------------------------------------------------
// Ginkgo Global Executor for passing queue
// -----------------------------------------------------------------------------
std::shared_ptr<const gko::Executor> global_exec;

// -----------------------------------------------------------------------------
// Main Program
// -----------------------------------------------------------------------------
Expand All @@ -104,36 +114,44 @@ int main(int argc, char* argv[])
if (ReadInputs(args, udata)) return 1;
PrintUserData(udata);

// ---------------------------------------
// Create Ginkgo matrix and linear solver
// ---------------------------------------

#if defined(USE_CUDA)
auto gko_exec{gko::CudaExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
#elif defined(USE_HIP)
auto gko_exec{gko::HipExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
#elif defined(USE_DPCPP)
auto gko_exec{gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create())};
#elif defined(USE_OMP)
auto gko_exec{gko::OmpExecutor::create()};
#else
auto gko_exec{gko::ReferenceExecutor::create()};
#endif

global_exec = gko_exec;

// ---------------
// Create vectors
// ---------------

// Create solution vector
#if defined(USE_DPCPP)
N_Vector u = N_VNew(udata.nodes, gko_exec->get_queue(), sunctx);
#else
N_Vector u = N_VNew(udata.nodes, sunctx);
#endif
if (check_ptr(u, "N_VNew")) return 1;

// Set initial condition
int flag = Solution(ZERO, u, udata);
int flag = Solution(ZERO, u, udata, gko_exec);
if (check_flag(flag, "Solution")) return 1;

// Create error vector
N_Vector e = N_VClone(u);
if (check_ptr(e, "N_VClone")) return 1;

// ---------------------------------------
// Create Ginkgo matrix and linear solver
// ---------------------------------------

#if defined(USE_CUDA)
auto gko_exec{gko::CudaExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
#elif defined(USE_HIP)
auto gko_exec{gko::HipExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
#elif defined(USE_OMP)
auto gko_exec{gko::OmpExecutor::create()};
#else
auto gko_exec{gko::ReferenceExecutor::create()};
#endif

auto gko_matrix_dim = gko::dim<2>(udata.nodes, udata.nodes);
auto gko_matrix_nnz{(5 * (udata.nx - 2) + 2) * (udata.ny - 2) + 2 * udata.nx};
auto gko_matrix = gko::share(GkoMatrixType::create(gko_exec, gko_matrix_dim, gko_matrix_nnz));
Expand Down Expand Up @@ -203,7 +221,7 @@ int main(int argc, char* argv[])
flag = OpenOutput(udata);
if (check_flag(flag, "OpenOutput")) return 1;

flag = WriteOutput(t, u, e, udata);
flag = WriteOutput(t, u, e, udata, gko_exec);
if (check_flag(flag, "WriteOutput")) return 1;

for (int iout = 0; iout < udata.nout; iout++) {
Expand All @@ -212,7 +230,7 @@ int main(int argc, char* argv[])
if (check_flag(flag, "CVode")) break;

// Output solution and error
flag = WriteOutput(t, u, e, udata);
flag = WriteOutput(t, u, e, udata, gko_exec);
if (check_flag(flag, "WriteOutput")) return 1;

// Update output time
Expand All @@ -234,7 +252,7 @@ int main(int argc, char* argv[])
if (check_flag(flag, "CVodePrintAllStats")) return 1;

// Output final error
flag = SolutionError(t, u, e, udata);
flag = SolutionError(t, u, e, udata, gko_exec);
if (check_flag(flag, "SolutionError")) return 1;

sunrealtype maxerr = N_VMaxNorm(e);
Expand All @@ -246,6 +264,7 @@ int main(int argc, char* argv[])
// Clean up and return
// --------------------

global_exec = nullptr;
CVodeFree(&cvode_mem); // Free integrator memory
N_VDestroy(u); // Free vectors
N_VDestroy(e);
Expand Down Expand Up @@ -334,8 +353,45 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data)

f_kernel<<<num_blocks, threads_per_block>>>(nx, ny, dx, dy, cx, cy, cc, bx, by, sin_t_cos_t, cos_sqr_t, uarray, farray);

HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize(););
HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), global_exec->synchronize());

#elif defined(USE_DPCPP)
// Access device data arrays
sunrealtype* uarray = N_VGetDeviceArrayPointer(u);
if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1;

sunrealtype* farray = N_VGetDeviceArrayPointer(f);
if (check_ptr(farray, "N_VGetDeviceArrayPointer")) return -1;

std::dynamic_pointer_cast<const gko::DpcppExecutor>(global_exec)->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
const sunindextype i = id[1];
const sunindextype j = id[0];
if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) {
auto x = i * dx;
auto y = j * dy;

auto sin_sqr_x = sin(PI * x) * sin(PI * x);
auto sin_sqr_y = sin(PI * y) * sin(PI * y);

auto cos_sqr_x = cos(PI * x) * cos(PI * x);
auto cos_sqr_y = cos(PI * y) * cos(PI * y);

// center, north, south, east, and west indices
auto idx_c = i + j * nx;
auto idx_n = i + (j + 1) * nx;
auto idx_s = i + (j - 1) * nx;
auto idx_e = (i + 1) + j * nx;
auto idx_w = (i - 1) + j * nx;

farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + cy * (uarray[idx_s] + uarray[idx_n]) -
TWO * PI * sin_sqr_x * sin_sqr_y * sin_t_cos_t -
bx * (cos_sqr_x - sin_sqr_x) * sin_sqr_y * cos_sqr_t -
by * (cos_sqr_y - sin_sqr_y) * sin_sqr_x * cos_sqr_t;
}
});
});
global_exec->synchronize();
#else

// Access host data arrays
Expand Down Expand Up @@ -508,8 +564,87 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve

J_kernel<<<num_blocks_i, threads_per_block_i>>>(nx, ny, cx, cy, cc, row_ptrs, col_idxs, mat_data);

HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize(););

HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), global_exec->synchronize());
#elif defined(USE_DPCPP)
auto queue = std::dynamic_pointer_cast<const gko::DpcppExecutor>(global_exec)->get_queue();
// J_sn_kernel
queue->submit([&](sycl::handler& cgh) {
cgh.parallel_for(nx, [=](sycl::id<1> id) {
const sunindextype i = id[0];

// Southern face
mat_data[i] = ZERO;
col_idxs[i] = i;
row_ptrs[i] = i;

// Northern face
auto col = i + (ny - 1) * nx;
auto idx = (5 * (nx - 2) + 2) * (ny - 2) + nx + i;
mat_data[idx] = ZERO;
col_idxs[idx] = col;
row_ptrs[col] = idx;

if (i == nx - 1) row_ptrs[nx * ny] = (5 * (nx - 2) + 2) * (ny - 2) + 2 * nx;
});
});
// J_we_kernel
queue->submit([&](sycl::handler& cgh) {
cgh.parallel_for(ny, [=](sycl::id<1> id) {
const sunindextype j = id[0];
if (j > 0 && j < ny - 1) {
// Western face
auto col = j * nx;
auto idx = (5 * (nx - 2) + 2) * (j - 1) + nx;
mat_data[idx] = ZERO;
col_idxs[idx] = col;
row_ptrs[col] = idx;

// Eastern face
col = (nx - 1) + j * nx;
idx = (5 * (nx - 2) + 2) * (j - 1) + nx + 1 + 5 * (nx - 2);
mat_data[idx] = ZERO;
col_idxs[idx] = col;
row_ptrs[col] = idx;
}
});
});
// J_kernel
queue->submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
const sunindextype i = id[1];
const sunindextype j = id[0];

if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) {
auto row = i + j * nx;
auto col_s = row - nx;
auto col_w = row - 1;
auto col_c = row;
auto col_e = row + 1;
auto col_n = row + nx;

// Number of non-zero entries from preceding rows
auto prior_nnz = (5 * (nx - 2) + 2) * (j - 1) + nx;

// Starting index for this row
auto idx = prior_nnz + 1 + 5 * (i - 1);

mat_data[idx] = cy;
mat_data[idx + 1] = cx;
mat_data[idx + 2] = cc;
mat_data[idx + 3] = cx;
mat_data[idx + 4] = cy;

col_idxs[idx] = col_s;
col_idxs[idx + 1] = col_w;
col_idxs[idx + 2] = col_c;
col_idxs[idx + 3] = col_e;
col_idxs[idx + 4] = col_n;

row_ptrs[row] = idx;
}
});
});
global_exec->synchronize();
#else

// Fill southern boundary entries (j = 0)
Expand Down
41 changes: 34 additions & 7 deletions examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@
#include <nvector/nvector_cuda.h>
#elif defined(USE_HIP)
#include <nvector/nvector_hip.h>
#elif defined(USE_DPCPP)
#include <nvector/nvector_sycl.h>
#endif

#include <ginkgo/ginkgo.hpp>

// Common utility functions
#include <example_utilities.hpp>

Expand Down Expand Up @@ -117,7 +121,7 @@ void solution_kernel(const sunindextype nx, const sunindextype ny,
#endif

// Compute the exact solution
int Solution(sunrealtype t, N_Vector u, UserData &udata)
int Solution(sunrealtype t, N_Vector u, UserData &udata, std::shared_ptr<const gko::Executor> exec)
{
// Access problem data and set shortcuts
const auto nx = udata.nx;
Expand Down Expand Up @@ -145,7 +149,27 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata)

solution_kernel<<<num_blocks, threads_per_block>>>
(nx, ny, dx, dy, cos_sqr_t, uarray);

#elif defined(USE_DPCPP)
sunrealtype *uarray = N_VGetDeviceArrayPointer(u);
if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1;
std::dynamic_pointer_cast<const gko::DpcppExecutor>(exec)->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
const sunindextype i = id[1];
const sunindextype j = id[0];

if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1)
{
auto x = i * dx;
auto y = j * dy;

auto sin_sqr_x = sin(PI * x) * sin(PI * x);
auto sin_sqr_y = sin(PI * y) * sin(PI * y);

auto idx = i + j * nx;
uarray[idx] = sin_sqr_x * sin_sqr_y * cos_sqr_t + ONE;
}
});
});
#else

sunrealtype *uarray = N_VGetArrayPointer(u);
Expand All @@ -167,15 +191,15 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata)
}

#endif

exec->synchronize();
return 0;
}

// Compute the solution error
int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata, std::shared_ptr<const gko::Executor> exec)
{
// Compute true solution
int flag = Solution(t, e, udata);
int flag = Solution(t, e, udata, exec);
if (flag != 0) return -1;

// Compute absolute error
Expand Down Expand Up @@ -310,10 +334,10 @@ int OpenOutput(UserData &udata)
}

// Write output
int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata, std::shared_ptr<const gko::Executor> exec)
{
// Compute the error
int flag = SolutionError(t, u, e, udata);
int flag = SolutionError(t, u, e, udata, exec);
if (check_flag(flag, "SolutionError")) return 1;

// Compute max error
Expand All @@ -335,6 +359,9 @@ int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
#elif defined(USE_HIP)
N_VCopyFromDevice_Hip(u);
N_VCopyFromDevice_Hip(e);
#elif defined(USE_DPCPP)
N_VCopyFromDevice_Sycl(u);
N_VCopyFromDevice_Sycl(e);
#endif

// Access host data array
Expand Down

0 comments on commit 3f378d5

Please sign in to comment.