add ginkgo dpcpp to cv_heat2D example

Signed-off-by: Yu-Hsiang M. Tsai <[email protected]>
LLNL · Oct 18, 2023 · 3f378d5 · 3f378d5
1 parent a3ca695
commit 3f378d5
Show file tree

Hide file tree

Showing 3 changed files with 198 additions and 33 deletions.
diff --git a/examples/cvode/ginkgo/CMakeLists.txt b/examples/cvode/ginkgo/CMakeLists.txt
@@ -20,7 +20,7 @@ set(cpu_gpu_examples
 
 sundials_add_examples_ginkgo(cpu_gpu_examples
   TARGETS sundials_cvode
-  BACKENDS REF OMP CUDA HIP)
+  BACKENDS REF OMP CUDA HIP DPCPP)
 
 # Examples that only support CPU Ginkgo backends
 set(cpu_examples
@@ -39,6 +39,9 @@ if(EXAMPLES_INSTALL)
   if(SUNDIALS_GINKGO_BACKENDS MATCHES "HIP")
     list(APPEND vectors nvechip)
   endif()
+  if(SUNDIALS_GINKGO_BACKENDS MATCHES "DPCPP")
+    list(APPEND vectors nvecsycl)
+  endif()
   if((SUNDIALS_GINKGO_BACKENDS MATCHES "OMP") OR
       (SUNDIALS_GINKGO_BACKENDS MATCHES "REF"))
     list(APPEND vectors nvecserial)

diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp b/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp
@@ -53,19 +53,23 @@
 
 #if defined(USE_CUDA)
 #include <nvector/nvector_cuda.h>
-#define HIP_OR_CUDA(a, b) b
+#define HIP_OR_CUDA_SYCL(a, b, c) b
 constexpr auto N_VNew = N_VNew_Cuda;
 #elif defined(USE_HIP)
 #include <nvector/nvector_hip.h>
-#define HIP_OR_CUDA(a, b) a
+#define HIP_OR_CUDA_SYCL(a, b, c) a
 constexpr auto N_VNew = N_VNew_Hip;
+#elif defined(USE_DPCPP)
+#include <nvector/nvector_sycl.h>
+#define HIP_OR_CUDA_SYCL(a, b, c) c
+constexpr auto N_VNew = N_VNew_Sycl;
 #elif defined(USE_OMP)
 #include <nvector/nvector_serial.h>
-#define HIP_OR_CUDA(a, b)
+#define HIP_OR_CUDA_SYCL(a, b, c)
 constexpr auto N_VNew = N_VNew_Serial;
 #else
 #include <nvector/nvector_serial.h>
-#define HIP_OR_CUDA(a, b)
+#define HIP_OR_CUDA_SYCL(a, b, c)
 constexpr auto N_VNew = N_VNew_Serial;
 #endif
 
@@ -85,6 +89,12 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data);
 // Jacobian of RHS function
 int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3);
 
+
+// -----------------------------------------------------------------------------
+// Ginkgo Global Executor for passing queue
+// -----------------------------------------------------------------------------
+std::shared_ptr<const gko::Executor> global_exec;
+
 // -----------------------------------------------------------------------------
 // Main Program
 // -----------------------------------------------------------------------------
@@ -104,36 +114,44 @@ int main(int argc, char* argv[])
   if (ReadInputs(args, udata)) return 1;
   PrintUserData(udata);
 
+  // ---------------------------------------
+  // Create Ginkgo matrix and linear solver
+  // ---------------------------------------
+
+#if defined(USE_CUDA)
+  auto gko_exec{gko::CudaExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
+#elif defined(USE_HIP)
+  auto gko_exec{gko::HipExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
+#elif defined(USE_DPCPP)
+  auto gko_exec{gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create())};
+#elif defined(USE_OMP)
+  auto gko_exec{gko::OmpExecutor::create()};
+#else
+  auto gko_exec{gko::ReferenceExecutor::create()};
+#endif
+
+  global_exec = gko_exec;
+
   // ---------------
   // Create vectors
   // ---------------
 
   // Create solution vector
+#if defined(USE_DPCPP)
+  N_Vector u = N_VNew(udata.nodes, gko_exec->get_queue(), sunctx);
+#else
   N_Vector u = N_VNew(udata.nodes, sunctx);
+#endif
   if (check_ptr(u, "N_VNew")) return 1;
 
   // Set initial condition
-  int flag = Solution(ZERO, u, udata);
+  int flag = Solution(ZERO, u, udata, gko_exec);
   if (check_flag(flag, "Solution")) return 1;
 
   // Create error vector
   N_Vector e = N_VClone(u);
   if (check_ptr(e, "N_VClone")) return 1;
 
-    // ---------------------------------------
-    // Create Ginkgo matrix and linear solver
-    // ---------------------------------------
-
-#if defined(USE_CUDA)
-  auto gko_exec{gko::CudaExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
-#elif defined(USE_HIP)
-  auto gko_exec{gko::HipExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)};
-#elif defined(USE_OMP)
-  auto gko_exec{gko::OmpExecutor::create()};
-#else
-  auto gko_exec{gko::ReferenceExecutor::create()};
-#endif
-
   auto gko_matrix_dim = gko::dim<2>(udata.nodes, udata.nodes);
   auto gko_matrix_nnz{(5 * (udata.nx - 2) + 2) * (udata.ny - 2) + 2 * udata.nx};
   auto gko_matrix = gko::share(GkoMatrixType::create(gko_exec, gko_matrix_dim, gko_matrix_nnz));
@@ -203,7 +221,7 @@ int main(int argc, char* argv[])
   flag = OpenOutput(udata);
   if (check_flag(flag, "OpenOutput")) return 1;
 
-  flag = WriteOutput(t, u, e, udata);
+  flag = WriteOutput(t, u, e, udata, gko_exec);
   if (check_flag(flag, "WriteOutput")) return 1;
 
   for (int iout = 0; iout < udata.nout; iout++) {
@@ -212,7 +230,7 @@ int main(int argc, char* argv[])
     if (check_flag(flag, "CVode")) break;
 
     // Output solution and error
-    flag = WriteOutput(t, u, e, udata);
+    flag = WriteOutput(t, u, e, udata, gko_exec);
     if (check_flag(flag, "WriteOutput")) return 1;
 
     // Update output time
@@ -234,7 +252,7 @@ int main(int argc, char* argv[])
   if (check_flag(flag, "CVodePrintAllStats")) return 1;
 
   // Output final error
-  flag = SolutionError(t, u, e, udata);
+  flag = SolutionError(t, u, e, udata, gko_exec);
   if (check_flag(flag, "SolutionError")) return 1;
 
   sunrealtype maxerr = N_VMaxNorm(e);
@@ -246,6 +264,7 @@ int main(int argc, char* argv[])
   // Clean up and return
   // --------------------
 
+  global_exec = nullptr;
   CVodeFree(&cvode_mem); // Free integrator memory
   N_VDestroy(u);         // Free vectors
   N_VDestroy(e);
@@ -334,8 +353,45 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data)
 
   f_kernel<<<num_blocks, threads_per_block>>>(nx, ny, dx, dy, cx, cy, cc, bx, by, sin_t_cos_t, cos_sqr_t, uarray, farray);
 
-  HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize(););
+  HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), global_exec->synchronize());
+
+#elif defined(USE_DPCPP)
+  // Access device data arrays
+  sunrealtype* uarray = N_VGetDeviceArrayPointer(u);
+  if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1;
 
+  sunrealtype* farray = N_VGetDeviceArrayPointer(f);
+  if (check_ptr(farray, "N_VGetDeviceArrayPointer")) return -1;
+
+  std::dynamic_pointer_cast<const gko::DpcppExecutor>(global_exec)->get_queue()->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
+      const sunindextype i = id[1];
+      const sunindextype j = id[0];
+      if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) {
+        auto x = i * dx;
+        auto y = j * dy;
+
+        auto sin_sqr_x = sin(PI * x) * sin(PI * x);
+        auto sin_sqr_y = sin(PI * y) * sin(PI * y);
+
+        auto cos_sqr_x = cos(PI * x) * cos(PI * x);
+        auto cos_sqr_y = cos(PI * y) * cos(PI * y);
+
+        // center, north, south, east, and west indices
+        auto idx_c = i + j * nx;
+        auto idx_n = i + (j + 1) * nx;
+        auto idx_s = i + (j - 1) * nx;
+        auto idx_e = (i + 1) + j * nx;
+        auto idx_w = (i - 1) + j * nx;
+
+        farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + cy * (uarray[idx_s] + uarray[idx_n]) -
+                        TWO * PI * sin_sqr_x * sin_sqr_y * sin_t_cos_t -
+                        bx * (cos_sqr_x - sin_sqr_x) * sin_sqr_y * cos_sqr_t -
+                        by * (cos_sqr_y - sin_sqr_y) * sin_sqr_x * cos_sqr_t;
+      }
+    });
+  });
+  global_exec->synchronize();
 #else
 
   // Access host data arrays
@@ -508,8 +564,87 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve
 
   J_kernel<<<num_blocks_i, threads_per_block_i>>>(nx, ny, cx, cy, cc, row_ptrs, col_idxs, mat_data);
 
-  HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize(););
-
+  HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), global_exec->synchronize());
+#elif defined(USE_DPCPP)
+  auto queue = std::dynamic_pointer_cast<const gko::DpcppExecutor>(global_exec)->get_queue();
+  // J_sn_kernel
+  queue->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(nx, [=](sycl::id<1> id) {
+      const sunindextype i = id[0];
+
+      // Southern face
+      mat_data[i] = ZERO;
+      col_idxs[i] = i;
+      row_ptrs[i] = i;
+
+      // Northern face
+      auto col      = i + (ny - 1) * nx;
+      auto idx      = (5 * (nx - 2) + 2) * (ny - 2) + nx + i;
+      mat_data[idx] = ZERO;
+      col_idxs[idx] = col;
+      row_ptrs[col] = idx;
+
+      if (i == nx - 1) row_ptrs[nx * ny] = (5 * (nx - 2) + 2) * (ny - 2) + 2 * nx;
+    });
+  });
+  // J_we_kernel
+  queue->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(ny, [=](sycl::id<1> id) {
+      const sunindextype j = id[0];
+      if (j > 0 && j < ny - 1) {
+        // Western face
+        auto col      = j * nx;
+        auto idx      = (5 * (nx - 2) + 2) * (j - 1) + nx;
+        mat_data[idx] = ZERO;
+        col_idxs[idx] = col;
+        row_ptrs[col] = idx;
+
+        // Eastern face
+        col           = (nx - 1) + j * nx;
+        idx           = (5 * (nx - 2) + 2) * (j - 1) + nx + 1 + 5 * (nx - 2);
+        mat_data[idx] = ZERO;
+        col_idxs[idx] = col;
+        row_ptrs[col] = idx;
+      }
+    });
+  });
+  // J_kernel
+  queue->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
+      const sunindextype i = id[1];
+      const sunindextype j = id[0];
+
+      if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) {
+        auto row   = i + j * nx;
+        auto col_s = row - nx;
+        auto col_w = row - 1;
+        auto col_c = row;
+        auto col_e = row + 1;
+        auto col_n = row + nx;
+
+        // Number of non-zero entries from preceding rows
+        auto prior_nnz = (5 * (nx - 2) + 2) * (j - 1) + nx;
+
+        // Starting index for this row
+        auto idx = prior_nnz + 1 + 5 * (i - 1);
+
+        mat_data[idx]     = cy;
+        mat_data[idx + 1] = cx;
+        mat_data[idx + 2] = cc;
+        mat_data[idx + 3] = cx;
+        mat_data[idx + 4] = cy;
+
+        col_idxs[idx]     = col_s;
+        col_idxs[idx + 1] = col_w;
+        col_idxs[idx + 2] = col_c;
+        col_idxs[idx + 3] = col_e;
+        col_idxs[idx + 4] = col_n;
+
+        row_ptrs[row] = idx;
+      }
+    });
+  });
+  global_exec->synchronize();
 #else
 
   // Fill southern boundary entries (j = 0)

diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp b/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp
@@ -31,8 +31,12 @@
 #include <nvector/nvector_cuda.h>
 #elif defined(USE_HIP)
 #include <nvector/nvector_hip.h>
+#elif defined(USE_DPCPP)
+#include <nvector/nvector_sycl.h>
 #endif
 
+#include <ginkgo/ginkgo.hpp>
+
 // Common utility functions
 #include <example_utilities.hpp>
 
@@ -117,7 +121,7 @@ void solution_kernel(const sunindextype nx, const sunindextype ny,
 #endif
 
 // Compute the exact solution
-int Solution(sunrealtype t, N_Vector u, UserData &udata)
+int Solution(sunrealtype t, N_Vector u, UserData &udata, std::shared_ptr<const gko::Executor> exec)
 {
   // Access problem data and set shortcuts
   const auto nx = udata.nx;
@@ -145,7 +149,27 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata)
 
   solution_kernel<<<num_blocks, threads_per_block>>>
     (nx, ny, dx, dy, cos_sqr_t, uarray);
-
+#elif defined(USE_DPCPP)
+  sunrealtype *uarray = N_VGetDeviceArrayPointer(u);
+  if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1;
+  std::dynamic_pointer_cast<const gko::DpcppExecutor>(exec)->get_queue()->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<2>(ny, nx), [=](sycl::id<2> id) {
+      const sunindextype i = id[1];
+      const sunindextype j = id[0];
+
+      if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1)
+      {
+        auto x = i * dx;
+        auto y = j * dy;
+
+        auto sin_sqr_x = sin(PI * x) * sin(PI * x);
+        auto sin_sqr_y = sin(PI * y) * sin(PI * y);
+
+        auto idx = i + j * nx;
+        uarray[idx] = sin_sqr_x * sin_sqr_y * cos_sqr_t + ONE;
+      } 
+    });
+  });
 #else
 
   sunrealtype *uarray = N_VGetArrayPointer(u);
@@ -167,15 +191,15 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata)
   }
 
 #endif
-
+  exec->synchronize();
   return 0;
 }
 
 // Compute the solution error
-int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
+int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata, std::shared_ptr<const gko::Executor> exec)
 {
   // Compute true solution
-  int flag = Solution(t, e, udata);
+  int flag = Solution(t, e, udata, exec);
   if (flag != 0) return -1;
 
   // Compute absolute error
@@ -310,10 +334,10 @@ int OpenOutput(UserData &udata)
 }
 
 // Write output
-int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
+int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata, std::shared_ptr<const gko::Executor> exec)
 {
   // Compute the error
-  int flag = SolutionError(t, u, e, udata);
+  int flag = SolutionError(t, u, e, udata, exec);
   if (check_flag(flag, "SolutionError")) return 1;
 
   // Compute max error
@@ -335,6 +359,9 @@ int WriteOutput(sunrealtype t, N_Vector u, N_Vector e, UserData &udata)
 #elif defined(USE_HIP)
     N_VCopyFromDevice_Hip(u);
     N_VCopyFromDevice_Hip(e);
+#elif defined(USE_DPCPP)
+    N_VCopyFromDevice_Sycl(u);
+    N_VCopyFromDevice_Sycl(e);
 #endif
 
     // Access host data array