Skip to content

Commit

Permalink
fix some warnings
Browse files Browse the repository at this point in the history
update SYCL CMake handling, added QUDA_SYCL_TARGETS
fix shared memory handling in some kernels
update SYCL API
  • Loading branch information
jcosborn committed Nov 17, 2023
1 parent 0106b98 commit 1348d90
Show file tree
Hide file tree
Showing 15 changed files with 119 additions and 99 deletions.
4 changes: 4 additions & 0 deletions cmake/CMakeSYCLInformation.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ if(NOT CMAKE_INCLUDE_FLAG_SYCL)
set(CMAKE_INCLUDE_FLAG_SYCL ${CMAKE_INCLUDE_FLAG_CXX})
endif()

# If the SYCL toolchain did not define its own explicit-language compile option,
# inherit the C++ one — same fallback pattern as the other CMAKE_SYCL_* settings
# in this file (e.g. CMAKE_INCLUDE_FLAG_SYCL above).
if(NOT CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE)
set(CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE ${CMAKE_CXX_COMPILE_OPTIONS_EXPLICIT_LANGUAGE})
endif()

# Fall back to the C++ setting for whether the compiler itself is used to
# generate dependency information for SYCL sources.
if(NOT CMAKE_SYCL_DEPENDS_USE_COMPILER)
set(CMAKE_SYCL_DEPENDS_USE_COMPILER ${CMAKE_CXX_DEPENDS_USE_COMPILER})
endif()
Expand Down
2 changes: 1 addition & 1 deletion include/kernels/block_transpose.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace quda
The transpose uses shared memory to avoid strided memory accesses.
*/
template <bool allthreads = false>
__device__ __host__ inline void operator()(int x_cb, int, bool active = true)
__device__ __host__ inline void operator()(int x_cb, int, bool = true)
{
int parity_color = target::block_idx().z;
int color = parity_color % Arg::nColor;
Expand Down
2 changes: 1 addition & 1 deletion include/kernels/staggered_quark_smearing.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ namespace quda
struct staggered_qsmear : dslash_default, NoSpecialOps {

const Arg &arg;
constexpr staggered_qsmear(const Arg &arg) : arg(arg) { }
template <typename Ftor> constexpr staggered_qsmear(const Ftor &ftor) : arg(ftor.arg) {}
static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation

template <KernelType mykernel_type = kernel_type>
Expand Down
4 changes: 2 additions & 2 deletions include/targets/sycl/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ namespace quda
//using dependencies = op_Sequential<op_blockSync>;
//using dependentOps = SpecialOps<op_blockSync>;
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
template <typename S> inline block_reduceG(S &ops) {};
template <typename S> inline block_reduceG(S &) {};
/**
@brief Perform a block-wide reduction
@param[in] value_ thread-local value to be reduced
Expand All @@ -110,7 +110,7 @@ namespace quda
@return The block-wide reduced value
*/
template <typename reducer_t>
inline T apply(const T &value_, bool async, int batch, bool all, const reducer_t &r)
inline T apply(const T &value_, bool async, int batch, bool, const reducer_t &r)
{
if (!async) __syncthreads(); // only synchronize if we are not pipelining
const int nbatch = batch_size;
Expand Down
2 changes: 1 addition & 1 deletion include/targets/sycl/block_reduction_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace quda
@return Swizzled block index
*/
template <typename Arg>
int virtual_block_idx(const Arg &arg, const sycl::nd_item<3> &ndi)
int virtual_block_idx(const Arg &arg, const sycl::nd_item<3> &)
{
int block_idx = groupIdX;
if (arg.swizzle) {
Expand Down
8 changes: 4 additions & 4 deletions include/targets/sycl/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace quda {
// Kernel1D

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel1DImpl(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel1DImpl(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto i = globalIdX;
Expand All @@ -22,7 +22,7 @@ namespace quda {
}
}
template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel1DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel1DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto tid = globalIdX;
Expand Down Expand Up @@ -160,7 +160,7 @@ namespace quda {
}

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel2DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel2DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);
auto j = globalIdY;
Expand Down Expand Up @@ -310,7 +310,7 @@ namespace quda {
}

template <template <typename> class Functor, typename Arg, bool grid_stride>
void Kernel3DImplB(const Arg &arg, const sycl::nd_item<3> &ndi)
void Kernel3DImplB(const Arg &arg, const sycl::nd_item<3> &)
{
Functor<Arg> f(arg);

Expand Down
12 changes: 6 additions & 6 deletions include/targets/sycl/math_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ namespace quda {
template<typename T>
inline void sincos(const T a, T* s, T* c)
{
*s = sycl::sincos(a, c);
//*s = sycl::sin(a);
//*c = sycl::cos(a);
//*s = sycl::sincos(a, c);
*s = sycl::sin(a);
*c = sycl::cos(a);
}

/**
Expand All @@ -58,9 +58,9 @@ namespace quda {
template<typename T>
inline void sincospi(const T& a, T *s, T *c)
{
*s = sycl::sincos(static_cast<T>(M_PI)*a, c);
//*s = sycl::sinpi(a);
//*c = sycl::cospi(a);
//*s = sycl::sincos(static_cast<T>(M_PI)*a, c);
*s = sycl::sinpi(a);
*c = sycl::cospi(a);
}

/**
Expand Down
12 changes: 6 additions & 6 deletions include/targets/sycl/quda_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ static inline auto getNdItem()
return sycl::ext::oneapi::experimental::this_nd_item<3>();
}

static inline int globalRange(int d) { return getNdItem().get_global_range(d); }
static inline int globalId(int d) { return getNdItem().get_global_id(d); }
static inline int groupRange(int d) { return getNdItem().get_group_range(d); }
static inline int groupId(int d) { return getNdItem().get_group(d); }
static inline int localRange(int d) { return getNdItem().get_local_range(d); }
static inline int localId(int d) { return getNdItem().get_local_id(d); }
static inline unsigned int globalRange(int d) { return getNdItem().get_global_range(d); }
static inline unsigned int globalId(int d) { return getNdItem().get_global_id(d); }
static inline unsigned int groupRange(int d) { return getNdItem().get_group_range(d); }
static inline unsigned int groupId(int d) { return getNdItem().get_group(d); }
static inline unsigned int localRange(int d) { return getNdItem().get_local_range(d); }
static inline unsigned int localId(int d) { return getNdItem().get_local_id(d); }

#define globalRangeX ::globalRange(RANGE_X)
#define globalRangeY ::globalRange(RANGE_Y)
Expand Down
6 changes: 3 additions & 3 deletions include/targets/sycl/reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace quda
using reduce_t = T;

template <typename Arg, typename Reducer, typename I, typename BR>
friend void reduce(Arg &, const Reducer &, const I &, const int, BR &br);
friend void reduce(Arg &, const Reducer &, const I &, const unsigned int, BR &br);
qudaError_t launch_error; /** only do complete if no launch error to avoid hang */
static constexpr unsigned int max_n_batch_block
= 1; /** by default reductions do not support batching withing the block */
Expand Down Expand Up @@ -138,7 +138,7 @@ namespace quda
will be constant along constant block_idx().y and block_idx().z.
*/
template <typename Arg, typename Reducer, typename T, typename O>
inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx, O &ops)
inline void reduce(Arg &arg, const Reducer &r, const T &in, const unsigned int idx, O &ops)
{
using BlockReduce_t = typename reduceParams<Arg, Reducer, T>::BlockReduce_t;
BlockReduce_t br(ops, target::thread_idx().z);
Expand Down Expand Up @@ -173,7 +173,7 @@ namespace quda
sycl::atomic_fence(sycl::memory_order::release, sycl::memory_scope::device); // flush result

// increment global block counter
auto value = atomicAdd(&arg.count[idx], 1);
count_t value = atomicAdd(&arg.count[idx], 1);

// determine if last block
isLastBlockDone[target::thread_idx().z] = (value == (target::grid_dim().x - 1));
Expand Down
4 changes: 2 additions & 2 deletions include/targets/sycl/reduction_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
namespace quda {

#ifndef HIGH_LEVEL_REDUCTIONS
template <template <typename> class Functor, typename Arg, bool grid_stride = true>
void Reduction2DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, char *smem)
template <template <typename> class Functor, typename Arg, bool grid_stride = true, typename S>
void Reduction2DImpl(const Arg &arg, const sycl::nd_item<3> &, S smem)
{
Functor<Arg> f(arg);
#if 0
Expand Down
21 changes: 14 additions & 7 deletions include/targets/sycl/special_ops_target.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,15 @@ namespace quda {
inline SpecialOps() {
static_assert(!needsSharedMem<SpecialOps<T...>>);
}
inline SpecialOps(char *s) {
inline SpecialOps(char *s) { // for host
static_assert(needsSharedMem<SpecialOps<T...>>);
smem = s;
}
//template <typename S>
//inline SpecialOps(S s) {
// static_assert(needsSharedMem<SpecialOps<T...>>);
// smem = s.get();
//}
template <typename ...U>
inline SpecialOps(const SpecialOps<U...> &ops) {
checkSpecialOps<T...>(ops);
Expand All @@ -65,6 +70,7 @@ namespace quda {
}
}

#if 0
//inline void setNdItem(const sycl::nd_item<3> &i) { ndi = &i; }
inline void setNdItem(const sycl::nd_item<3> &i) {}
inline void setSharedMem(char *s) { smem = s; }
Expand All @@ -73,6 +79,7 @@ namespace quda {
//ndi = ops.ndi;
smem = ops.smem;
}
#endif
#if 0
SpecialOpsElemType *getSharedMemPtr() {
static_assert(!std::is_same_v<SpecialOpsElemType,void>);
Expand All @@ -83,7 +90,7 @@ namespace quda {

// blockSync
template <typename ...T>
inline void blockSync(SpecialOps<T...> *ops) {
inline void blockSync(const SpecialOps<T...> &) {
//static_assert(hasBlockSync<T...>);
checkSpecialOp<op_blockSync,T...>();
//if (ops->ndi == nullptr) {
Expand All @@ -94,7 +101,7 @@ namespace quda {
sycl::group_barrier(getGroup());
#endif
}
template <typename ...T> inline void blockSync(SpecialOps<T...> ops) { blockSync(&ops); }
//template <typename ...T> inline void blockSync(SpecialOps<T...> ops) { blockSync(&ops); }

//template <typename ...T> static constexpr bool isOpConcurrent = false;
//template <typename ...T> static constexpr bool isOpConcurrent<op_Concurrent<T...>> = true;
Expand Down Expand Up @@ -241,13 +248,13 @@ namespace quda {
struct depNone {};
template <> struct sharedMemSizeS<depNone> {
template <typename ...Arg>
static constexpr unsigned int size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int size(dim3, Arg &...) { return 0; }
};

struct depFullBlock {};
template <> struct sharedMemSizeS<depFullBlock> {
template <typename ...Arg>
static constexpr unsigned int size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int size(dim3, Arg &...) { return 0; }
};

template <typename T, typename S>
Expand All @@ -262,7 +269,7 @@ namespace quda {
struct op_blockSync {
//using dependencies = depFullBlock;
template <typename ...Arg>
static constexpr unsigned int shared_mem_size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int shared_mem_size(dim3, Arg &...) { return 0; }
};

template <typename T>
Expand All @@ -271,7 +278,7 @@ namespace quda {
//using dependencies = depNone;
//using dependencies = depFullBlock;
template <typename ...Arg>
static constexpr unsigned int shared_mem_size(dim3 block, Arg &...arg) { return 0; }
static constexpr unsigned int shared_mem_size(dim3, Arg &...) { return 0; }
};
template <typename T> static constexpr bool needsFullBlockImpl<op_warp_combine<T>> = false;

Expand Down
19 changes: 11 additions & 8 deletions include/targets/sycl/tunable_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,10 @@ namespace quda {
(ndRange,
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg.lmem = smem;
F f(arg, ndi, smem);
F f(arg, ndi, smem.get());
});
});
} else { // no shared mem
Expand Down Expand Up @@ -187,9 +188,10 @@ namespace quda {
(ndRange,
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg.lmem = smem;
F f(arg, ndi, smem);
F f(arg, ndi, smem.get());
});
});
} else { // no shared mem
Expand Down Expand Up @@ -244,9 +246,10 @@ namespace quda {
//[=](sycl::nd_item<3> ndi) {
[=](sycl::nd_item<3> ndi) [[sycl::reqd_sub_group_size(QUDA_WARP_SIZE)]] {
Arg *arg2 = reinterpret_cast<Arg*>(p);
auto smem = la.get_pointer();
//auto smem = la.get_pointer();
auto smem = la.get_multi_ptr<sycl::access::decorated::yes>();
//arg2->lmem = smem;
F f(*arg2, ndi, smem);
F f(*arg2, ndi, smem.get());
});
});
} else {
Expand Down Expand Up @@ -388,11 +391,11 @@ namespace quda {
template <typename F, bool = hasSpecialOps<F>, bool = needsSharedMem<F>>
struct Ftor : F {
template <typename Arg, typename S>
Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S smem) : F{arg,smem} {}
Ftor(const Arg &arg, const sycl::nd_item<3> &, S smem) : F{arg,smem} {}
};
template <typename F, bool ns> struct Ftor<F,ns,false> : F {
template <typename Arg, typename ...S>
Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem) : F{arg} {}
Ftor(const Arg &arg, const sycl::nd_item<3> &, S ...) : F{arg} {}
};

}
10 changes: 5 additions & 5 deletions lib/targets/sycl/comm_target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,21 @@ namespace quda
return std::max(accessRank[0], accessRank[1]);
}

void comm_create_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote, void *local)
// No-op for this (SYCL) target: no per-neighbor memory is allocated here.
// Parameters are intentionally unnamed to avoid unused-parameter warnings.
void comm_create_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &, void *)
{
}

void comm_destroy_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote)
// No-op for this (SYCL) target: nothing was created by
// comm_create_neighbor_memory, so there is nothing to release.
void comm_destroy_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &)
{
}

void comm_create_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &remote,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local)
// No-op for this (SYCL) target: neighbor events are not created here.
// Unnamed parameters silence unused-parameter warnings.
void comm_create_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &)
{
}

void comm_destroy_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local)
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &)
{
}

Expand Down
2 changes: 1 addition & 1 deletion lib/targets/sycl/quda_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ namespace quda
if (ptr.is_device()) {
auto q = device::get_target_stream(stream);
char *p = static_cast<char*>(ptr.data());
for(int i=0; i<height; i++) {
for(size_t i=0; i<height; i++) {
q.memset(p, value, width);
p += pitch;
}
Expand Down
Loading

0 comments on commit 1348d90

Please sign in to comment.