Skip to content

Commit

Permalink
fix some warnings
Browse files Browse the repository at this point in the history
update SYCL CMake handling, added QUDA_SYCL_TARGETS
fix shared memory handling in some kernels
update SYCL API
  • Loading branch information
jcosborn committed Nov 17, 2023
1 parent 0106b98 commit 1348d90
Show file tree
Hide file tree
Showing 15 changed files with 119 additions and 99 deletions.
4 changes: 4 additions & 0 deletions cmake/CMakeSYCLInformation.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ if(NOT CMAKE_INCLUDE_FLAG_SYCL)
set(CMAKE_INCLUDE_FLAG_SYCL ${CMAKE_INCLUDE_FLAG_CXX})
endif()

# If the SYCL toolchain did not define its own explicit-language compile option,
# inherit the C++ one — same fallback pattern as the other CMAKE_SYCL_* settings
# in this file (e.g. CMAKE_INCLUDE_FLAG_SYCL above).
if(NOT CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE)
set(CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE ${CMAKE_CXX_COMPILE_OPTIONS_EXPLICIT_LANGUAGE})
endif()

# Fall back to the C++ setting for whether the compiler itself is used to
# generate dependency information for SYCL sources.
if(NOT CMAKE_SYCL_DEPENDS_USE_COMPILER)
set(CMAKE_SYCL_DEPENDS_USE_COMPILER ${CMAKE_CXX_DEPENDS_USE_COMPILER})
endif()
Expand Down
2 changes: 1 addition & 1 deletion include/kernels/block_transpose.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace quda
The transpose uses shared memory to avoid strided memory accesses.
*/
template <bool allthreads = false>
__device__ __host__ inline void operator()(int x_cb, int, bool active = true)
__device__ __host__ inline void operator()(int x_cb, int, bool = true)
{
int parity_color = target::block_idx().z;
int color = parity_color % Arg::nColor;
Expand Down
2 changes: 1 addition & 1 deletion include/kernels/staggered_quark_smearing.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ namespace quda
struct staggered_qsmear : dslash_default, NoSpecialOps {

const Arg &arg;
constexpr staggered_qsmear(const Arg &arg) : arg(arg) { }
template <typename Ftor> constexpr staggered_qsmear(const Ftor &ftor) : arg(ftor.arg) {}
static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation

template <KernelType mykernel_type = kernel_type>
Expand Down
4 changes: 2 additions & 2 deletions include/targets/sycl/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ namespace quda
//using dependencies = op_Sequential<op_blockSync>;
//using dependentOps = SpecialOps<op_blockSync>;
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
template <typename S> inline block_reduceG(S &ops) {};
template <typename S> inline block_reduceG(S &) {};
/**
@brief Perform a block-wide reduction
@param[in] value_ thread-local value to be reduced
Expand All @@ -110,7 +110,7 @@ namespace quda
@return The block-wide reduced value
*/
template <typename reducer_t>
inline T apply(const T &value_, bool async, int batch, bool all, const reducer_t &r)
inline T apply(const T &value_, bool async, int batch, bool, const reducer_t &r)
{
if (!async) __syncthreads(); // only synchronize if we are not pipelining
const int nbatch = batch_size;
Expand Down
2 changes: 1 addition & 1 deletion include/targets/sycl/block_reduction_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace quda
@return Swizzled block index
*/
template <typename Arg>
int virtual_block_idx(const Arg &arg, const sycl::nd_item<3> &ndi)
int virtual_block_idx(const Arg &arg, const sycl::nd_item<3> &)
{
int block_idx = groupIdX;
if (arg.swizzle) {
Expand Down
8 changes: 4 additions & 4 deletions include/targets/sycl/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace quda {
// Kernel1D

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel1DImpl(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel1DImpl(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto i = globalIdX;
Expand All @@ -22,7 +22,7 @@ namespace quda {
}
}
template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel1DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel1DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto tid = globalIdX;
Expand Down Expand Up @@ -160,7 +160,7 @@ namespace quda {
}

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel2DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel2DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto j = globalIdY;
Expand Down Expand Up @@ -310,7 +310,7 @@ namespace quda {
}

template <template <typename> class Functor, typename Arg, bool grid_stride>
void Kernel3DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel3DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);

Expand Down
12 changes: 6 additions & 6 deletions include/targets/sycl/math_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ namespace quda {
template<typename T>
inline void sincos(const T a, T* s, T* c)
{
*s = sycl::sincos(a, c);
//*s = sycl::sin(a);
//*c = sycl::cos(a);
//*s = sycl::sincos(a, c);
*s = sycl::sin(a);
*c = sycl::cos(a);
}

/**
Expand All @@ -58,9 +58,9 @@ namespace quda {
template<typename T>
inline void sincospi(const T& a, T *s, T *c)
{
*s = sycl::sincos(static_cast<T>(M_PI)*a, c);
//*s = sycl::sinpi(a);
//*c = sycl::cospi(a);
//*s = sycl::sincos(static_cast<T>(M_PI)*a, c);
*s = sycl::sinpi(a);
*c = sycl::cospi(a);
}

/**
Expand Down
12 changes: 6 additions & 6 deletions include/targets/sycl/quda_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ static inline auto getNdItem()
return sycl::ext::oneapi::experimental::this_nd_item<3>();
}

static inline int globalRange(int d) { return getNdItem().get_global_range(d); }
static inline int globalId(int d) { return getNdItem().get_global_id(d); }
static inline int groupRange(int d) { return getNdItem().get_group_range(d); }
static inline int groupId(int d) { return getNdItem().get_group(d); }
static inline int localRange(int d) { return getNdItem().get_local_range(d); }
static inline int localId(int d) { return getNdItem().get_local_id(d); }
static inline unsigned int globalRange(int d) { return getNdItem().get_global_range(d); }
static inline unsigned int globalId(int d) { return getNdItem().get_global_id(d); }
static inline unsigned int groupRange(int d) { return getNdItem().get_group_range(d); }
static inline unsigned int groupId(int d) { return getNdItem().get_group(d); }
static inline unsigned int localRange(int d) { return getNdItem().get_local_range(d); }
static inline unsigned int localId(int d) { return getNdItem().get_local_id(d); }

#define globalRangeX ::globalRange(RANGE_X)
#define globalRangeY ::globalRange(RANGE_Y)
Expand Down
6 changes: 3 additions & 3 deletions include/targets/sycl/reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace quda
using reduce_t = T;

template <typename Arg, typename Reducer, typename I, typename BR>
friend void reduce(Arg &, const Reducer &, const I &, const int, BR &br);
friend void reduce(Arg &, const Reducer &, const I &, const unsigned int, BR &br);
qudaError_t launch_error; /** only do complete if no launch error to avoid hang */
static constexpr unsigned int max_n_batch_block
= 1; /** by default reductions do not support batching withing the block */
Expand Down Expand Up @@ -138,7 +138,7 @@ namespace quda
will be constant along constant block_idx().y and block_idx().z.
*/
template <typename Arg, typename Reducer, typename T, typename O>
inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx, O &ops)
inline void reduce(Arg &arg, const Reducer &r, const T &in, const unsigned int idx, O &ops)
{
using BlockReduce_t = typename reduceParams<Arg, Reducer, T>::BlockReduce_t;
BlockReduce_t br(ops, target::thread_idx().z);
Expand Down Expand Up @@ -173,7 +173,7 @@ namespace quda
sycl::atomic_fence(sycl::memory_order::release, sycl::memory_scope::device); // flush result

// increment global block counter
auto value = atomicAdd(&arg.count[idx], 1);
count_t value = atomicAdd(&arg.count[idx], 1);

// determine if last block
isLastBlockDone[target::thread_idx().z] = (value == (target::grid_dim().x - 1));
Expand Down
4 changes: 2 additions & 2 deletions include/targets/sycl/reduction_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
namespace quda {

#ifndef HIGH_LEVEL_REDUCTIONS
template <template <typename> class Functor, typename Arg, bool grid_stride = true>
void Reduction2DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, char *smem)
template <template <typename> class Functor, typename Arg, bool grid_stride = true, typename S>
void Reduction2DImpl(const Arg &arg, const sycl::nd_item<3> &, S smem)
{
Functor<Arg> f(arg);
#if 0
Expand Down
21 changes: 14 additions & 7 deletions include/targets/sycl/special_ops_target.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,15 @@ namespace quda {
inline SpecialOps() {
static_assert(!needsSharedMem<SpecialOps<T...>>);
}
inline SpecialOps(char *s) {
inline SpecialOps(char *s) { // for host
static_assert(needsSharedMem<SpecialOps<T...>>);
smem = s;
}
//template <typename S>
//inline SpecialOps(S s) {
// static_assert(needsSharedMem<SpecialOps<T...>>);
// smem = s.get();
//}
template <typename ...U>
inline SpecialOps(const SpecialOps<U...> &ops) {
checkSpecialOps<T...>(ops);
Expand All @@ -65,6 +70,7 @@ namespace quda {
}
}

#if 0
//inline void setNdItem(const sycl::nd_item<3> &i) { ndi = &i; }
inline void setNdItem(const sycl::nd_item<3> &i) {}
inline void setSharedMem(char *s) { smem = s; }
Expand All @@ -73,6 +79,7 @@ namespace quda {
//ndi = ops.ndi;
smem = ops.smem;
}
#endif
#if 0
SpecialOpsElemType *getSharedMemPtr() {
static_assert(!std::is_same_v<SpecialOpsElemType,void>);
Expand All @@ -83,7 +90,7 @@ namespace quda {

// blockSync
template <typename ...T>
inline void blockSync(SpecialOps<T...> *ops) {
inline void blockSync(const SpecialOps<T...> &) {
//static_assert(hasBlockSync<T...>);
checkSpecialOp<op_blockSync,T...>();
//if (ops->ndi == nullptr) {
Expand All @@ -94,7 +101,7 @@ namespace quda {
sycl::group_barrier(getGroup());
#endif
}
template <typename ...T> inline void blockSync(SpecialOps<T...> ops) { blockSync(&ops); }
//template <typename ...T> inline void blockSync(SpecialOps<T...> ops) { blockSync(&ops); }

//template <typename ...T> static constexpr bool isOpConcurrent = false;
//template <typename ...T> static constexpr bool isOpConcurrent<op_Concurrent<T...>> = true;
Expand Down Expand Up @@ -241,13 +248,13 @@ namespace quda {
struct depNone {};
template <> struct sharedMemSizeS<depNone> {
template <typename ...Arg>
static constexpr unsigned int size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int size(dim3, Arg &...) { return 0; }
};

struct depFullBlock {};
template <> struct sharedMemSizeS<depFullBlock> {
template <typename ...Arg>
static constexpr unsigned int size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int size(dim3, Arg &...) { return 0; }
};

template <typename T, typename S>
Expand All @@ -262,7 +269,7 @@ namespace quda {
struct op_blockSync {
//using dependencies = depFullBlock;
template <typename ...Arg>
static constexpr unsigned int shared_mem_size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int shared_mem_size(dim3, Arg &...) { return 0; }
};

template <typename T>
Expand All @@ -271,7 +278,7 @@ namespace quda {
//using dependencies = depNone;
//using dependencies = depFullBlock;
template <typename ...Arg>
static constexpr unsigned int shared_mem_size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int shared_mem_size(dim3, Arg &...) { return 0; }
};
template <typename T> static constexpr bool needsFullBlockImpl<op_warp_combine<T>> = false;

Expand Down
19 changes: 11 additions & 8 deletions include/targets/sycl/tunable_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,10 @@ namespace quda {
(ndRange,
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg.lmem = smem;
F f(arg, ndi, smem);
F f(arg, ndi, smem.get());
});
});
} else { // no shared mem
Expand Down Expand Up @@ -187,9 +188,10 @@ namespace quda {
(ndRange,
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg.lmem = smem;
F f(arg, ndi, smem);
F f(arg, ndi, smem.get());
});
});
} else { // no shared mem
Expand Down Expand Up @@ -244,9 +246,10 @@ namespace quda {
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
Arg *arg2 = reinterpret_cast<Arg*>(p);
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg2->lmem = smem;
F f(*arg2, ndi, smem);
F f(*arg2, ndi, smem.get());
});
});
} else {
Expand Down Expand Up @@ -388,11 +391,11 @@ namespace quda {
template <typename F, bool = hasSpecialOps<F>, bool = needsSharedMem<F>>
struct Ftor : F {
template <typename Arg, typename S>
Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S smem) : F{arg,smem} {}
Ftor(const Arg &arg, const sycl::nd_item<3> &, S smem) : F{arg,smem} {}
};
template <typename F, bool ns> struct Ftor<F,ns,false> : F {
template <typename Arg, typename ...S>
Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem) : F{arg} {}
Ftor(const Arg &arg, const sycl::nd_item<3> &, S ...) : F{arg} {}
};

}
10 changes: 5 additions & 5 deletions lib/targets/sycl/comm_target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,21 @@ namespace quda
return std::max(accessRank[0], accessRank[1]);
}

void comm_create_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote, void *local)
// No-op for this (SYCL) target: no per-neighbor memory is allocated here.
// Parameters are intentionally unnamed to avoid unused-parameter warnings.
void comm_create_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &, void *)
{
}

void comm_destroy_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote)
// No-op for this (SYCL) target: nothing was created by
// comm_create_neighbor_memory, so there is nothing to release.
void comm_destroy_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &)
{
}

void comm_create_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &remote,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local)
// No-op for this (SYCL) target: neighbor events are not created here.
// Unnamed parameters silence unused-parameter warnings.
void comm_create_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &)
{
}

void comm_destroy_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local)
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &)
{
}

Expand Down
2 changes: 1 addition & 1 deletion lib/targets/sycl/quda_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ namespace quda
if (ptr.is_device()) {
auto q = device::get_target_stream(stream);
char *p = static_cast<char*>(ptr.data());
for(int i=0; i<height; i++) {
for(size_t i=0; i<height; i++) {
q.memset(p, value, width);
p += pitch;
}
Expand Down
Loading

0 comments on commit 1348d90

Please sign in to comment.